예제 #1
def parse_csv(src, security, security_max, year, quarter):    
#    Parse one quarter of DB1B Ticket data into a dictionary of column lists
#    and dump it to ..\temp\ticket_<year>_<quarter>.bin via safe_cPickle.
#
#    Parameters (as constrained by the asserts below):
#        src           -- str, path handed to zipfile.ZipFile
#        security      -- bool, test-mode flag; the parse loop tests
#                         "security and count >= security_max"
#        security_max  -- positive int, line limit used with security
#        year, quarter -- only used through str() to build file names
#    Returns None.
#
#    NOTE(review): this is Python 2 code (print statements, xrange,
#    dict.keys()[0]).  Several statements appear to be missing from this copy
#    of the function (block headers with no body, announced actions with no
#    visible call); each spot is flagged below -- confirm against the original.

#    NOTE(review): the message names "coupon_year_quarter.bin", but the file
#    written at the end of this function is ticket_<year>_<quarter>.bin
    print 'parse DB1B Ticket data from .zip to coupon_year_quarter.bin'
    assert isinstance(src, str), 'src must be a string'
    assert isinstance(security, bool), 'security must be a Boolean'
#    the comparison security_max > 0 is evaluated before the isinstance check
    assert ((security_max > 0) and isinstance(security_max, int)),\
        'security_max must be a positive integer'
    print '[source]\n\n\t' + src
    dst_folder = '..\\input\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '_FOLDER.csv'

#    presumably timer(True) returns a start mark and timer(False, mark) the
#    elapsed seconds (it is formatted with %0.3f below) -- TODO confirm
    t_unzip_csv_start = segment_timer.timer(True)

    print '\n' + 'unzipping folder to \\input'
    print '\n[destination]\n\n\t' + dst_folder
#    NOTE(review): "zip" shadows the builtin; the archive is opened but no
#    extract/close call is visible in this copy
    zip = zipfile.ZipFile(src)
    src_csv = dst_folder + '\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '.csv'    
    dst_csv = '..\\temp\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '.csv'
#    the message says "copying", the call below moves the file
    print '\n' + 'copying .csv from \\input (folder) to \\temp'
    shutil.move(src_csv, dst_csv)
#    NOTE(review): deletion is announced but no delete call is visible here
    print 'deleting redundant folder from \\input'

    print '\n' + '[warning]\n\n\t .csv \\input datafile is large'
    print '\n' + 'opening .csv file for line count'
#    this handle is never explicitly closed in the visible code
    data_reader = open(dst_csv, 'r')    
    print '%0.3f seconds to unzip and open file'%(segment_timer.timer(False, t_unzip_csv_start))

#    count number of lines in .csv file
#    NOTE(review): as written the count runs only when security is True, yet
#    max_count is read further down only when security is False (top_count and
#    the final "total number of lines" print), which would raise NameError.
#    Presumably an "else:" before the counting code is missing -- TODO confirm
    if security:
        print '\n[** running in reduced-lines mode **]'
        max_count = None
        print '\ncounting number of lines in dataset'    
        max_count = 0
        t_count_lines_start = segment_timer.timer(True)
        for line in data_reader:
            max_count += 1
        print '%0.3f seconds to count lines in file'%(segment_timer.timer(False, t_count_lines_start))
        print '\nnumber of lines of data (including header):', max_count
#        NOTE(review): a sleep is announced but no sleep call is visible here
        print 'sleeping for 5 seconds'
#    the first handle may have been consumed by the count above, so the file
#    is opened again to start reading from the top
    print '\nre-open .csv file for parse'
    t_reopen_csv_start = segment_timer.timer(True)
    data_reader = open(dst_csv, 'r')
    print '%0.3f seconds to re-open file'%(segment_timer.timer(False, t_reopen_csv_start))
    print '\nconstruct list of Ticket .csv variable names, in key_list'
    t_parse_timer = segment_timer.timer(True)
    key_list = []
#    split the line on double quotes, drop the first and last pieces, and
#    skip the ',' separators left between quoted names
#    NOTE(review): the body of the "if" below is missing in this copy;
#    presumably it appends variable_name to key_list and the loop stops after
#    the header line -- TODO confirm
    for line in data_reader:
        key_list_raw = line.split('"')[1:-1]
        for variable_name in key_list_raw:
            if variable_name != ',':
#    NOTE(review): the backslash continuation below runs into another print
#    statement; the continued line seems to be missing (compare the
#    "retaining following variables" print further down) -- TODO confirm
    print 'list of variables in dataset (.csv column order):',\
    print '\n[variables]\n\n\t',
    for variable_name in key_list:
        print variable_name,
#    columns to retain; each must be present in key_list
    retain_names_list = ['ItinID', 'ItinFare', 'DollarCred', 'BulkFare']
    print '\nretaining following variables (list order):',\
          len(retain_names_list), 'variables'
    print '\n[variables]\n\n\t',

#    NOTE(review): as written the "(not available)" print and the raise sit
#    inside the "found" branch; presumably an "else:" is missing.  The
#    exception text says retain_names_list although the test is against
#    key_list -- TODO confirm
    for variable_name in retain_names_list:
        if variable_name in key_list:
            print variable_name,
            print variable_name, '(not available)'
            raise Exception('variable_name missing from retain_names_list')
#    initialize data dictionary:
#    key/ variable name from retain_names_list, value/ list (empty by default)
#    the set of ith elements of the lists corresponds to one itinerary observation
    data_itin_dict = {}
    for variable_name in retain_names_list:
        data_itin_dict[variable_name] = []
    print '\nretaining all carriers'
#    count starts at 1 and is incremented once per line read by the loop below
    count = 1
    if not security:
        t_intermediate_parse = segment_timer.timer(True)
        print '\nparsing data, percentage completed:'
#        integer division under Python 2: top_count is the number of lines per
#        one percent.  NOTE(review): it is 0 when max_count < 100, which makes
#        the divisions by top_count below raise ZeroDivisionError
        top_count = max_count / 100
        print count / top_count
#    loop over all lines in .csv file
    for line in data_reader:
        count += 1
#        if in test mode, exit data parse
#        NOTE(review): the body of this "if" is missing in this copy
#        (presumably a break) -- TODO confirm
        if security and count >= security_max:

        if not security:
#            true only when count is an exact multiple of top_count, i.e. once
#            per percent of the file
            if float(count / top_count) == float(count) / top_count:
                print str(count / top_count) + '\t', '%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse))
                t_intermediate_parse = segment_timer.timer(True)
#        the last comma-separated piece is dropped (presumably rows end with a
#        trailing comma -- TODO confirm)
        line_split = line.split(',')[:-1]
#        NOTE(review): eval() executes whatever text is in each field; only
#        safe if the .csv is trusted
        line_data_list = [eval(element) for element in line_split]
#        build dictionary: append values to lists, for each retained variable
#        no error trap for multiple occurrences of the same ItinID

#        NOTE(review): the body of this loop is missing in this copy;
#        presumably it appends the value for each name to
#        data_itin_dict[name] -- TODO confirm
        for name in retain_names_list:

    print '%0.3f seconds to parse data'%(segment_timer.timer(False, t_parse_timer))
#    the length of an arbitrary column list is reported as the row count
#    (dict.keys()[0] relies on Python 2 returning a list)
    print '\nnumber of retained itineraries:', len(data_itin_dict[data_itin_dict.keys()[0]])
    print 'number of lines read:', count
    print '\nillustrative itineraries:\n'
#    show the first three retained rows, if there are at least three
    if len(data_itin_dict[data_itin_dict.keys()[0]]) >= 3:
        for itin_number in xrange(3):
            for variable in retain_names_list:
                print variable, data_itin_dict[variable][itin_number],
    if not security:
        print '\ntotal number of lines:', max_count
#    safe_cPickle Python dictionary ticket_year_quarter
    print 'save .bin to \\temp'
    dst = '..\\temp\\' + 'ticket_' + str(year) + '_' + str(quarter) + '.bin'

    print '\n[temp]\n\n\t' + dst
    safe_cPickle.safe_cPickle_dump(dst, data_itin_dict)
#    NOTE(review): the sleep and the .csv deletion announced below have no
#    visible call in this copy
    if not security:
        print 'sleeping for 15 seconds'
    print '\ndeleting .csv file from \\temp'
    return None
예제 #2
def parse_csv(src, security, security_max, year, quarter):
#    Parse one quarter of DB1B Coupon data, retain itineraries that satisfy
#    itinerary_condition (see the numbered list further down), and dump the
#    resulting dictionary of column lists to
#    ..\temp\coupon_<year>_<quarter>.bin via safe_cPickle.
#
#    Parameters (as constrained by the asserts below):
#        src           -- str, path handed to zipfile.ZipFile
#        security      -- bool, test-mode flag; the parse loops test
#                         "security and count >= security_max"
#        security_max  -- positive int, line limit used with security
#        year, quarter -- used in file names; year also selects the sort step
#    Returns None.  Raises Exception for year >= 2014 (sort not implemented).
#
#    NOTE(review): this is Python 2 code (print statements, xrange,
#    dict.keys()[0]).  Several statements appear to be missing from this copy
#    of the function (block headers with no body, an unclosed parenthesis,
#    announced actions with no visible call); each spot is flagged below --
#    confirm against the original.
    print 'parse DB1B Coupon data from .zip to coupon_year_quarter.bin'
    assert isinstance(src, str), 'src must be a string'
    assert isinstance(security, bool), 'security must be a Boolean'
#    the comparison security_max > 0 is evaluated before the isinstance check
    assert ((security_max > 0) and isinstance(security_max, int)),\
        'security_max must be a positive integer'
    print '[source]\n\n\t' + src
    dst_folder = '..\\input\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '_FOLDER.csv'

#    presumably timer(True) returns a start mark and timer(False, mark) the
#    elapsed seconds (it is formatted with %0.3f below) -- TODO confirm
    t_unzip_csv_start = segment_timer.timer(True)

    print '\n' + 'unzipping folder to \\input\n'
    print '[destination]\n\n\t' + dst_folder
#    NOTE(review): "zip" shadows the builtin; the archive is opened but no
#    extract/close call is visible in this copy
    zip = zipfile.ZipFile(src)
    src_csv = dst_folder + '\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '.csv'    
    dst_csv = '..\\temp\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '.csv'
    dst_csv_2011q1_to_2013q4_temp = '..\\temp\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '_UNSORTED.csv'

    if year >= 2014:
        raise Exception('sort not implemented', year)

#    for 2011-2013 the .csv is moved to an _UNSORTED name and passed to
#    sort_coupon_csv together with dst_csv
#    NOTE(review): as written the second shutil.move is also inside this
#    branch and would move src_csv after it has already been moved, while
#    years before 2011 never get a dst_csv.  Presumably an "else:" is missing
#    before the second "copying" print -- TODO confirm
    if 2011 <= year <= 2013:
        print '\n' + '[sort csv for 2011Q1 to 2013Q4 inclusive]'
        print '\n' + 'copying .csv from \\input (folder) to \\temp'
        shutil.move(src_csv, dst_csv_2011q1_to_2013q4_temp)
        sort_coupon_csv_2011_on.sort_coupon_csv(dst_csv_2011q1_to_2013q4_temp, dst_csv, year, quarter)
        print '\n' + 'copying .csv from \\temp (folder) to \\temp'
        shutil.move(src_csv, dst_csv)
#    NOTE(review): deletion is announced but no delete call is visible here
    print 'deleting redundant folder from \\input'

    print '\n[warning]\n\n\t .csv \\input datafile is large\n'
    print 'opening .csv file for line count'
#    this handle is never explicitly closed in the visible code
    data_reader = open(dst_csv, 'r')    
    print '%0.3f seconds to unzip and open file'%(segment_timer.timer(False, t_unzip_csv_start))

#    count number of lines in .csv file
#    NOTE(review): as written the count runs only when security is True, yet
#    max_count is read further down only when security is False (top_count and
#    the final "total number of lines" print), which would raise NameError.
#    Presumably an "else:" before the counting code is missing -- TODO confirm
    if security:
        print '\n[** running in reduced-lines mode **]'
        max_count = None
        print '\n' + 'counting number of lines in dataset'    
        max_count = 0
        t_count_lines_start = segment_timer.timer(True)
        for line in data_reader:
            max_count += 1
        print '%0.3f seconds to count lines in file'%(segment_timer.timer(False, t_count_lines_start))
        print 'number of lines of data (including header):', max_count
#        NOTE(review): a sleep is announced but no sleep call is visible here
        print '\n' + 'sleeping for 5 seconds'
#    the first handle may have been consumed by the count above, so the file
#    is opened again to start reading from the top
    print '\n' + 're-open .csv file for parse'
    t_reopen_csv_start = segment_timer.timer(True)
    data_reader = open(dst_csv, 'r')
    print '%0.3f seconds to re-open file'%(segment_timer.timer(False, t_reopen_csv_start))
#    carrier name -> two-character carrier code; only consulted when
#    large_carrier_condition is True, which is hard-coded to False here
    large_carriers_dict = {'Southwest':'WN', 'American':'AA', 'Continental':'CO',\
                    'Delta':'DL', 'Northwest':'NW', 'Skywest':'OO',\
                    'United':'UA', 'US Airways':'US', 'American Eagle':'MQ',\
                    'Airtran Airways':'FL', 'Express Jet':'EV',\
                    'Jetblue':'B6', 'Alaska Airlines':'AS', 'Endeavor Air':'9E'}
    large_carrier_condition = False    
    print '\n' + 'construct list of Coupon .csv variable names, in key_list'
    t_parse_timer = segment_timer.timer(True)
    key_list = []
#    split the line on double quotes, drop the first and last pieces, and
#    skip the ',' separators left between quoted names
#    NOTE(review): the body of the "if" below is missing in this copy;
#    presumably it appends variable_name to key_list and the loop stops after
#    the header line -- TODO confirm
    for line in data_reader:
        key_list_raw = line.split('"')[1:-1]
        for variable_name in key_list_raw:
            if variable_name != ',':
#    NOTE(review): the backslash continuation below is followed by a blank
#    line; the continued line seems to be missing (compare the "retaining
#    following variables" print further down) -- TODO confirm
    print 'list of variables in dataset (.csv column order):',\

    print '\n[variables]\n\n\t',
    for variable_name in key_list:
        print variable_name,
#    columns to retain; each must be present in key_list
    retain_names_list = ['ItinID', 'Year', 'Quarter', 'Origin', 'Dest',\
        'OpCarrier', 'Passengers', 'TkCarrier', 'Distance', 'FareClass']
    print '\n' + 'retaining following variables (list order):',\
          len(retain_names_list), 'variables'
    print '\n' + '[variables]\n\n\t',

#    NOTE(review): as written the "(not available)" print and the raise sit
#    inside the "found" branch; presumably an "else:" is missing.  The
#    exception text says retain_names_list although the test is against
#    key_list -- TODO confirm
    for variable_name in retain_names_list:
        if variable_name in key_list:
            print variable_name,
            print variable_name, '(not available)'
            raise Exception('variable_name missing from retain_names_list')
#    initialize data dictionary:
#    key/ variable name from retain_names_list, value/ list (empty by default)
#    the set of ith elements of the lists corresponds to one itinerary observation
    data_itin_dict = {}
    for variable_name in retain_names_list:
        data_itin_dict[variable_name] = []
#    NOTE(review): the "retaining all carriers" print is inside the
#    large-carrier branch as written; presumably an "else:" is missing
    if large_carrier_condition:
        print '\n' + 'retaining following large carriers:'
        for i in large_carriers_dict:
            print i+' : ',
        print '\n' + 'retaining all carriers\n'
#    count starts at 1 and is incremented once per line read by either loop
    count = 1
    if not security:
        t_intermediate_parse = segment_timer.timer(True)
        print 'parsing data, percentage completed:'
#        integer division under Python 2: top_count is the number of lines per
#        one percent.  NOTE(review): it is 0 when max_count < 100, which makes
#        the divisions by top_count below raise ZeroDivisionError
        top_count = max_count / 100
        print count / top_count
#    loop over all lines in .csv file
    for line in data_reader:
        count += 1
        if not security:
#            true only when count is an exact multiple of top_count, i.e. once
#            per percent of the file
            if float(count / top_count) == float(count) / top_count:

                print str(count / top_count)+'\t','%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse))

                t_intermediate_parse = segment_timer.timer(True)  
#        line_data_list is a list of evaluated data elements
#        the last comma-separated piece is dropped (presumably rows end with a
#        trailing comma -- TODO confirm)
        line_split = line.split(',')[:-1]
#        NOTE(review): eval() executes whatever text is in each field; only
#        safe if the .csv is trusted
        line_data_list = [eval(element) for element in line_split]
#        new itinerary for retained operating carrier if SeqNum == 1
#        deals with possible incomplete itinerary at start of data_reader (skipped)
        itin_test = line_data_list[key_list.index('ItinID')]   
#        if itin_test == eval('201122538709'):
#            print 'out', line_data_list
#        only first coupons (SeqNum == 1) of two-coupon itineraries enter this
#        branch, so the coupons_new == 1 test below can never be true here
        if line_data_list[key_list.index('SeqNum')] == 1 and line_data_list[key_list.index('Coupons')] == 2:

#            number of coupons in newly-identified itinerary
            coupons_new = line_data_list[key_list.index('Coupons')]
#            NOTE(review): the one-way branch has no statement in this copy
            if coupons_new == 1:
#                one-way
            elif coupons_new == 2:
#                list of data corresponding to newly-identified itinerary
                line_data_list_first = line_data_list[:]
#                carrier_list will contain OpCarrier for each coupon in itinerary
#                to enable constant OpCarrier itineraries to be retained
#                (also applies to fare_class_list, passengers_list, and tk_carrier_list if required)
#                NOTE(review): nothing is appended to these lists in the
#                visible code, so each holds only the first coupon's value;
#                the appends in the inner loop are presumably missing
                carrier_list = [line_data_list[key_list.index('OpCarrier')][:]]            
                fare_class_list = [line_data_list[key_list.index('FareClass')][:]]
                passengers_list = [line_data_list[key_list.index('Passengers')]]
                tk_carrier_list = [line_data_list[key_list.index('TkCarrier')][:]]
                coupon_type_list = [line_data_list[key_list.index('CouponType')][:]]
#                if segment has trip break at SeqNum==1, create candidate destination variable
#                NOTE(review): count_break is set to 1 and then straight back
#                to 0 as written; presumably an "else:" is missing before the
#                reset -- TODO confirm
                if line_data_list[key_list.index('Break')] == 'X':
                    count_break = 1
                    line_data_list_dest = line_data_list[key_list.index('Dest')][:]
                    count_break = 0            
#                once new itinerary identified, continue to loop over lines in .csv file
#                this inner loop reads from the same data_reader as the outer
#                loop, so lines consumed here are not seen again by the outer loop
                for line_ in data_reader:
                    count += 1
#                    if in test mode, exit data parse
#                    NOTE(review): the body of this "if" is missing in this
#                    copy (presumably a break) -- TODO confirm
                    if security and count >= security_max:
                    if not security:
                        if float(count / top_count) == float(count) / top_count:
                            print str(count / top_count) + '\t', '%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse))
                            t_intermediate_parse = segment_timer.timer(True)
                    line_split_ = line_.split(',')[:-1]
#                    NOTE(review): eval() on file contents, as above
                    line_data_list_ = [eval(element_) for element_ in line_split_]
                    itin_test_ = line_data_list_[key_list.index('ItinID')]   
#                    NOTE(review): the body of this ItinID-mismatch test is
#                    missing in this copy -- TODO confirm
                    if itin_test != itin_test_:
#                    if itin_test_ == eval('201122538709'):
#                        print 'return', line_data_list_
#                    if at end of itinerary, create candidate final destination variable
#                    if not at end of itinerary, but segment has trip break, 
#                    create candidate destination variable
                    if line_data_list_[key_list.index('SeqNum')] == coupons_new:
                        if line_data_list_[key_list.index('Break')] == 'X':
                            count_break += 1
                            line_data_list_final_dest = line_data_list_[key_list.index('Dest')][:]
#                        break out of inner (line_) for loop
#                        and check whether to retain itinerary
#                        NOTE(review): no break statement is visible here, and
#                        the increment below uses the name countBreak, which is
#                        not assigned anywhere else in this function
#                        (count_break is used elsewhere) -- TODO confirm
                        if line_data_list_[key_list.index('Break')] == 'X':
                            countBreak += 1
#                retain itinerary if the following conditions are satisfied:
#                1) 2 trip breaks
#                2) origin=destination (round-trip)
#                3) 2 coupons
#                4) constant OpCarrier
#                5) constant FareClass
#                6) constant Passengers
#                7) constant TkCarrier, same as OpCarrier
#                8) no 'E' in CouponType (no cabotage on itinerary)
#                9) legs in lower 48 states only
#                to do: need to check whether OpCarrier code (IATA?) is constant over time
#                or if there is an alternative unique carrier code
#                older versions of code retained all itineraries with <= 4 coupons
#                for large carrier restriction, add condition:
#                and carrier_list[0] in large_carriers_dict.values()
#                NOTE(review): the parenthesis opened on the next line is never
#                closed in this copy.  The ItinGeoType == 2 and FareClass
#                membership tests are read from line_data_list, the first
#                coupon's row.  line_data_list_final_dest is only assigned when
#                the last coupon has Break == 'X'
                itinerary_condition = (
                                    (count_break == 2)
                                    and (line_data_list_first[key_list.index('Origin')] == line_data_list_final_dest)
                                    and (line_data_list_first[key_list.index('Coupons')] == 2)
                                    and carrier_list.count(carrier_list[0]) == len(carrier_list)
                                    and fare_class_list.count(fare_class_list[0]) == len(fare_class_list)
                                    and passengers_list.count(passengers_list[0]) == len(passengers_list)
                                    and tk_carrier_list.count(tk_carrier_list[0]) == len(tk_carrier_list)
                                    and coupon_type_list.count('E') == 0
                                    and (line_data_list[key_list.index('ItinGeoType')] == 2)
                                    and ((line_data_list[key_list.index('FareClass')] in ['X','Y','C','D','F','G']))
                                    and tk_carrier_list[0] == carrier_list[0]
#                if itin_test_ == eval('201122538709'):
#                    print 'testing ItinID 201122538709'
#                    print itinerary_condition
#                    print count_break
#                    print line_data_list_first[key_list.index('Origin')]
#                    print line_data_list_final_dest
#                    print line_data_list_first[key_list.index('Coupons')]
#                    print carrier_list
#                    print fare_class_list
#                    print passengers_list
#                    print tk_carrier_list
#                    print coupon_type_list
#                    print line_data_list[key_list.index('ItinGeoType')]
#                    print line_data_list[key_list.index('FareClass')]
#                    print
#                itinerary_condition_wn_first_class=(
#                                    (count_break == 2)
#                                    and (line_data_list_first[key_list.index('Origin')] == line_data_list_final_dest)
#                                    and (line_data_list_first[key_list.index('Coupons')] == 2)
#                                    and carrier_list.count(carrier_list[0]) == len(carrier_list)
#                                    and fare_class_list.count(fare_class_list[0]) == len(fare_class_list)
#                                    and passengers_list.count(passengers_list[0]) == len(passengers_list)
#                                    and tk_carrier_list.count(tk_carrier_list[0]) == len(tk_carrier_list)
#                                    and coupon_type_list.count('E') == 0
#                                    and (line_data_list[key_list.index('ItinGeoType')] == 2)
#                                    and ((line_data_list[key_list.index('FareClass')] in ['F','G']))
#                                    and tk_carrier_list[0] == carrier_list[0]
#                                    and carrier_list[0] == 'WN'
#                                    )
#                optional extra restriction to the carrier codes listed in
#                large_carriers_dict (inactive while the flag is False)
                if large_carrier_condition:
                    itinerary_condition = (itinerary_condition and carrier_list[0] in large_carriers_dict.values())
#                    itinerary_condition_wn_first_class = (itinerary_condition_wn_first_class and carrier_list[0] in large_carriers_dict.values())
#                if (itinerary_condition or itinerary_condition_wn_first_class):
                if itinerary_condition:
#                    if itinerary_condition satisfied, retain all data for itinerary
#                    otherwise, continue searching for a new itinerary in data_reader
#                    NOTE(review): the body of the "if name != 'Dest'" test is
#                    missing in this copy; presumably values are appended to
#                    data_itin_dict[name], with 'Dest' handled separately --
#                    TODO confirm
                    for name in retain_names_list:
                        if name != 'Dest':
#                    debug output for one hard-coded ItinID: prints the most
#                    recently appended value of every retained column
                    if itin_test_ == eval('201122538709'):
                        for key in data_itin_dict.keys():
                            print key, data_itin_dict[key][-1]
#                    continue with outer (line) for loop
#                if itin_test_ == eval('201122538709'):
#                    print data_itin_dict['OpCarrier'].count('WN')
#        if in test mode, exit data parse
#        NOTE(review): the body of this "if" is missing in this copy
#        (presumably a break) -- TODO confirm
        if security and count >= security_max:

    print '\n' + '%0.3f seconds to parse data'%(segment_timer.timer(False, t_parse_timer))
#    the length of an arbitrary column list is reported as the row count
#    (dict.keys()[0] relies on Python 2 returning a list)
    print '\n' + 'number of retained itineraries:', len(data_itin_dict[data_itin_dict.keys()[0]])
    print 'number of lines read:', count
    print '\n' + '[illustrative itineraries]\n\n\t',
#    show the first three retained rows, if there are at least three
    if len(data_itin_dict[data_itin_dict.keys()[0]]) >= 3:
        for itin_number in xrange(3):
            for variable in retain_names_list:
                print variable, data_itin_dict[variable][itin_number],
            print '\n\t',
    if not security:
        print 'total number of lines:', max_count
#    safe_cPickle Python dictionary coupon_year_quarter
    print '\n' + 'save .bin to \\temp'
#    if large_carrier_condition == True, dst filename will not change
    dst = '..\\temp\\' + 'coupon_' + str(year) + '_' + str(quarter) + '.bin'

    print '\n[temp]\n\n\t' + dst
    safe_cPickle.safe_cPickle_dump(dst, data_itin_dict)
#    NOTE(review): the sleep and the .csv deletion announced below have no
#    visible call in this copy
    if not security:
        print 'sleeping for 15 seconds'
    print '\n' + 'deleting .csv file from \\temp\n'
#    descriptive statistics for 2013Q4 (any quarter can be called)
    if (year == 2013) and (quarter == 4):    
        coupon_descriptives.compute(year, quarter)
    return None