Пример #1
0
def safe_cPickle_dump(dst, object):

    import cPickle, segment_timer

    try:
        
        file=open(dst, 'r')
        file.close()
        
        rsp = ''
        
        while rsp != 'n' and rsp != 'y':
            rsp = raw_input('(safe_cPickle has detected object "' + dst + '") Overwrite? (y/n) ')
            
        if rsp == 'y':
            
            t_pickle_start = segment_timer.timer(True)
            
            file_ = open(dst, 'wb')
            cPickle.dump(object, file_)
            file_.close()
            
            print '%0.3f seconds to cPickle file'%(segment_timer.timer(False, t_pickle_start))
            print
            
        else:
            raise Exception('terminating')
        
    except IOError:
        
        print '\n' + '(safe_cPickle has not detected object "' + dst + '") Saving object.'
        
        t_pickle_start = segment_timer.timer(True)
        
        file_ = open(dst, 'wb')
        cPickle.dump(object, file_)
        file_.close()
        
        print '%0.3f seconds to cPickle file'%(segment_timer.timer(False, t_pickle_start))
        print
        
    return None
Пример #2
0
def merge(src_coupon, src_ticket, yyyy, q):
    
    print 'merge Coupon and Ticket .bin files to Itinerary'    
    
    t_start_total = segment_timer.timer(True)
    
    print
    print '[loading]\n\n\t' + src_ticket
    
    t_start = segment_timer.timer(True)
    
    f = open(src_ticket, 'r')
    ticket = cPickle.load(f)
    f.close()
    
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    print '\n[loading]\n\n\t' + src_coupon
    
    t_start = segment_timer.timer(True)
    
    f = open(src_coupon, 'r')
    coupon = cPickle.load(f)
    f.close()
    
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
      
    print '\ncopying coupon dictionary'
    
    t_start = segment_timer.timer(True)
    
    output = copy.deepcopy(coupon)
    output['ItinFare'] = []
    output['ItinFareReal'] = []
    
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    coupon_length = len(coupon[coupon.keys()[0]])
    ticket_length = len(ticket[ticket.keys()[0]])
    
    del coupon    
    
    ticket_dict = {}
    dollar_cred_dict = {}
    bulk_fare_dict = {}
    
    print '\nbuilding ticket, DollarCred and BulkFare dictionaries'
    
    t_start = segment_timer.timer(True)
    
    for i in range(ticket_length):
        
        ticket_dict[ticket['ItinID'][i]] = ticket['ItinFare'][i]
        dollar_cred_dict[ticket['ItinID'][i]] = ticket['DollarCred'][i]
        bulk_fare_dict[ticket['ItinID'][i]] = int(ticket['BulkFare'][i])
    
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))

    del ticket
    
    if len(ticket_dict) != ticket_length:
        raise Exception('duplicate key in ticketDict')
    
    print '\nparsing CPI data'

    CPI2013Q4_dict = cpi_parse.parse()
    
    print '\nadd ItinFare and ItinFareReal (2013Q4 prices) to output dictionary'
    
    t_start = segment_timer.timer(True)
    
    for i in range(coupon_length):
        
        fare_nominal = ticket_dict[output['ItinID'][i]]
        fare_real = fare_nominal * CPI2013Q4_dict[str(yyyy) + '_' + str(q)]
        output['ItinFare'].append(fare_nominal)
        output['ItinFareReal'].append(fare_real)
    
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    print '\nremove itineraries with DollarCred = 0'

    t_start = segment_timer.timer(True)
    
    output2 = {}
    
    for key in output.keys():
        
        output2[key] = []
    
    count_remove = 0
    
    for i in range(coupon_length):
        
        if dollar_cred_dict[output['ItinID'][i]] == 1:
            
            for key in output2.keys():
                
                output2[key].append(output[key][i])
                
        else:
            
#            fare not credible (DollarCred = 0)
            count_remove += 1

    del output

    output = copy.deepcopy(output2)
    
    del output2    
    
    print str(count_remove) + ' itineraries removed'
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))

    coupon_length_ = len(output[output.keys()[0]])
    
    print '\nremove itineraries with BulkFare = 1'

    t_start = segment_timer.timer(True)
    
    output2 = {}
    
    for key in output.keys():
        
        output2[key] = []
    
    count_remove = 0
    
    for i in range(coupon_length_):
        
        if bulk_fare_dict[output['ItinID'][i]] == 0:
            
            for key in output2.keys():
                
                output2[key].append(output[key][i])
                
        else:
            
#            bulk fare (BulkFare = 1)
            count_remove += 1

    del output

    output = copy.deepcopy(output2)
    
    del output2    
    
    print str(count_remove) + ' itineraries removed'
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    coupon_length_after_dollar_and_bulk_fare_cred = len(output[output.keys()[0]])

    output_explode_passengers = {}
    
    for key in output.keys():
        
        output_explode_passengers[key] = []

    for i in range(coupon_length_after_dollar_and_bulk_fare_cred):
        
        for key in output_explode_passengers.keys():
            
            for j in range(int(output['Passengers'][i])):
                
                output_explode_passengers[key].append(output[key][i])
    
    del output    
    
    del output_explode_passengers['Passengers']
    
    dst_itinerary = '..\\temp\\itinerary_' + str(yyyy) + '_' + str(q) + '.bin'

    f = open(dst_itinerary, 'wb')
    
    print '\nsave itinerary ' + dst_itinerary
    
    t_start = segment_timer.timer(True)
    
    cPickle.dump(output_explode_passengers, f)

    del output_explode_passengers

    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    f.close()
    
    print '\nTotal time: ' + ('%0.3f seconds '%(segment_timer.timer(False, t_start_total)))
    
    del ticket_dict
    del dollar_cred_dict
    del bulk_fare_dict
    del fare_nominal
    del fare_real    
    
    return None
Пример #3
0
def compress(src, yyyy, q):
    
    print 'aggregate itinerary*.bin to route-level'
    
    t_start_total = segment_timer.timer(True)
    
    print
    print 'loading: ' + src
        
    t_start = segment_timer.timer(True)
    
    f = open(src, 'r')
    itinerary = cPickle.load(f)
    f.close()
    
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    length_itinerary = len(itinerary[itinerary.keys()[0]])
    
    route_level_dict = {}
    
    print '\ncreating route-level dictionary'
    print '\n- remove all tickets with (nominal) ItinFare < $20'
    
    t_start = segment_timer.timer(True)
    
    for i in range(length_itinerary):
        
        origin = itinerary['Origin'][i]
        destination = itinerary['Dest'][i]
        opcarrier = itinerary['OpCarrier'][i]
        year = str(itinerary['Year'][i])
        quarter = str(itinerary['Quarter'][i])
        
#        non-directional route
        
        route = [origin, destination]
        route.sort()
    
        key = (route[0] + '_' + route[1] + '_' + opcarrier +
            '_' + year + '_' + quarter)
    
        distance = itinerary['Distance'][i]
        tkcarrier = itinerary['TkCarrier'][i]
        fareclass = itinerary['FareClass'][i]
        itinfare = itinerary['ItinFare'][i]
        itinfarereal = itinerary['ItinFareReal'][i]
        
#        no need to use ItinID from here onwards
        
        value = [distance, tkcarrier, fareclass, itinfare, itinfarereal]
        
        frequent_flyer = (itinfare < 20.0)
        
        if not frequent_flyer:    
        
            if key not in route_level_dict.keys():
                
                route_level_dict[key] = [value]
                
            else:
                
                route_level_dict[key].append(value)

    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    print '\n# route-carriers remaining', len(route_level_dict.keys())
    
    del itinerary
    
    route_level_dict_2 = copy.deepcopy(route_level_dict)
    
    del route_level_dict
    
    route_level_dict_3 = {}
    
    print '\n- remove all tickets with (nominal) ItinFare > 99th percentile of route-carrier-quarter fare distribution'
    
    t_start = segment_timer.timer(True)
            
    for k in route_level_dict_2.keys():
       
        nominal_fare_list = []
        
        for ticket in route_level_dict_2[k]:
            
            nominal_fare_list.append(ticket[3])
            
        p99 = scipy.percentile(scipy.array(nominal_fare_list), 99)
        
        for ticket in route_level_dict_2[k]:
            
            high_fare = (ticket[3] > p99)
            
            if not high_fare:
                
                if k not in route_level_dict_3.keys():
                    
                    route_level_dict_3[k] = [ticket]
                    
                else:
                    
                    route_level_dict_3[k].append(ticket)
    
    del route_level_dict_2
    
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    print '\n# route-carriers remaining', len(route_level_dict_3.keys())
    
    route_level_dict_4 = {}

    print '\n- remove all WN routes that involve DFW, from 1993Q1 to 1999Q4'

    t_start = segment_timer.timer(True)
    
    count_wn_dfw_tickets = 0
    
    for k in route_level_dict_3.keys():
        
        k__ = k.split('_')
        
        origin_ = k__[0]
        dest_ = k__[1]
        carrier_ = k__[2]
        year_ = int(k__[3])
        
        condition = (
                    (carrier_ == 'WN')
                    and ((origin_ == "DFW") or (dest_ == "DFW"))
                    and (year_ in range(1993, 2000))
                    )
        
        if condition:
            
            count_wn_dfw_tickets += 1
            
        else:
            
            route_level_dict_4[k] = route_level_dict_3[k]
        
    del route_level_dict_3

    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    print '\n# WN DFW routes removed:', count_wn_dfw_tickets
    print '\n# route-carriers remaining', len(route_level_dict_4.keys())
    
    d_class = {}
    
    for key in route_level_dict_4:
        
        for ticket in route_level_dict_4[key]:
            
            if ticket[1] not in d_class.keys():
                
                d_class[ticket[1]] = {'coach' : 0, 'other' : 0}
                
            if ticket[2] not in ['X','Y']:
                
                d_class[ticket[1]]['other'] += 1
                
            else:
                d_class[ticket[1]]['coach'] += 1
                
    d_class_coach_only = {}
    
    for key in d_class:
        
        num = float(d_class[key]['other'])
        den = float(d_class[key]['coach'] + d_class[key]['other'])
        d_class_coach_only[key] = num / den
        
    print
    
    for key in d_class_coach_only:
        
        print key, '%0.1f percent not coach'%(100 * d_class_coach_only[key])
    
    print '\ncompress dictionary, no error trap for Distance, TkCarrier, FareClass'
        
    t_start = segment_timer.timer(True)
    
    route_level_dict_5 = {}
    
    for key in route_level_dict_4.keys():
        
        distance = route_level_dict_4[key][0][0]
        nominal_fare_list = []
        real_fare_list = []
        fare_class_list = []
        
        for ticket in route_level_dict_4[key]:
            
#            note that ticket[1] is the ticketing, not operating carrier; even if ticketing=operating here
            
            if d_class_coach_only[ticket[1]] > 0.75:
                
                nominal_fare_list.append(ticket[3])
                real_fare_list.append(ticket[4])
                fare_class_list.append(ticket[2])

            else:
                
                if ticket[2] in ['X','Y']:
                    
                    nominal_fare_list.append(ticket[3])
                    real_fare_list.append(ticket[4])
                    fare_class_list.append(ticket[2])
                    
        nominal_fare_list.sort()
        real_fare_list.sort()
        route_level_dict_5[key] = [distance, nominal_fare_list, real_fare_list]
        
        condition = (
                    (key.split('_')[2] == 'WN')
                    and (('F' in fare_class_list) or ('G' in fare_class_list))
                    )
                    
        if condition:
            
            dst_wn = '..\\temp\\' + key + '.txt'
            
            output_string = ''
            output_string += 'Southwest reporting first class tickets:\n'
            output_string += ('# coach class on route-carrier-quarter ' + key + ':\n')
            output_string += (str((fare_class_list.count('X') + fare_class_list.count('Y'))) + '\n')
            output_string += ('# first class on route-carrier-quarter ' + key + ':\n')
            output_string += (str((fare_class_list.count('F') + fare_class_list.count('G'))) + '\n')
            
            print 'saving temporary file: ' + dst_wn
            
            f = open(dst_wn, 'w')
            f.write(output_string)
            f.close()
            
    del route_level_dict_4
    
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    route_carrier_list = route_level_dict_5.keys()[:]
    
    route_level_dict_6 = {}
    
    print '\n- remove all route-carriers with < 100 passengers in quarter'
    
    t_start = segment_timer.timer(True)
    
    for j in route_carrier_list:
        
        low_volume = (len(route_level_dict_5[j][2]) < 100)
            
        if not low_volume:
            
            route_level_dict_6[j] = route_level_dict_5[j]
        
    del route_level_dict_5
    
    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    print '\n# route-carriers remaining', len(route_level_dict_6.keys())
    
    dst_route_carrier = '..\\temp\\routecarrier_' + str(yyyy) + '_' + str(q) + '.bin'
    
    print '\nsaving: ' + dst_route_carrier
    
    t_start = segment_timer.timer(True)
    
    f = open(dst_route_carrier, 'wb')
    cPickle.dump(route_level_dict_6, f)
    f.close()

    print '%0.3f seconds '%(segment_timer.timer(False, t_start))
    
    print '\ntotal time:'
    print '%0.3f seconds '%(segment_timer.timer(False, t_start_total))

    del route_level_dict_6

    return None
Пример #4
0
def sort_coupon_csv(dst_csv_2011q1_to_2013q4_temp, dst_csv, year, quarter):
    """Write a copy of an unsorted coupon .csv with its data lines sorted.

    The input file is scanned once to build a list of ``[key, byte offset]``
    pairs, one per data line.  The key is the integer formed from the text
    after the last double quote in the first comma-separated field followed
    by the third field, zero-padded to two digits (presumably ItinID and
    SeqNum -- confirm against the DB1B coupon layout).  The list is sorted,
    entries with a repeated key are dropped, and the header line plus the
    data lines are written to ``dst_csv`` in key order.  The input file is
    deleted afterwards.

    Parameters:
        dst_csv_2011q1_to_2013q4_temp: path of the unsorted input .csv.
        dst_csv: path of the sorted .csv to create.
        year: integer compared with the first four characters of each data
            line to decide whether a byte offset is a true line start.
        quarter: not used by this function.

    Returns:
        None

    NOTE(review): ``eval`` is applied to text read from the file; only use
    this on trusted input.  Only SyntaxError is caught, so other errors from
    ``eval`` (e.g. NameError) propagate.
    NOTE(review): ``key_list[-1]`` raises IndexError when no data line
    produced a key.
    """
    
    data_reader = open(dst_csv_2011q1_to_2013q4_temp, 'r')
    
    print
    print '[sorting file]\n\n\t' + dst_csv_2011q1_to_2013q4_temp
    
    t_start = segment_timer.timer(True)
    
    key_list = []
    
    # any non-empty sentinel, so that the scan loop is entered
    h = 'X'
    
    while h != '':
        
        flag = True
        
        # candidate byte offset of the next line (0 for the header)
        bp = data_reader.tell()
        
        while flag:
            
            data_reader.seek(bp)
            h = data_reader.readline()
            check_itin_id = h.split(',')[0][:4]
            h_ = h.split(',')
            
            try:
                
                # If the text at this offset does not begin with the year,
                # step back one byte and re-read (presumably compensating for
                # tell() overshooting in text mode -- TODO confirm).
                if bp != 0 and eval(check_itin_id) != year:
                    
                    bp -= 1   
                    
                else:
                    
                    flag = False                   
                    
            except SyntaxError:
                
#                fiddle : may lose last line (or more) of file?
                
                # text that eval cannot parse (including the empty string at
                # end of file) ends the whole scan
                flag = False
                h = ''
                
        # bp == 0 is the header line, which gets no key
        if bp != 0 and h != '':
            
            key_itin_id = h_[0].split('"')[-1]
            seq_num = h_[2]
            
            # zero-pad single-digit sequence numbers to two digits
            if eval(seq_num) <= 9:
                
                key_a = eval(key_itin_id + '0' + seq_num)
                
            else:
                
                key_a = eval(key_itin_id + seq_num)
                
            key_list.append([key_a, bp])
                
        # progress report every 100000 keys (also fires while the list is
        # still empty)
        if len(key_list)%100000 == 0:
            
            print len(key_list)
    
    key_list.sort()
    
    print '\n' + 'sorted keys'
    print ('runtime : %0.3f seconds'%(segment_timer.timer(False, t_start)))
    
    t_start = segment_timer.timer(True)
    
#    remove repeated items from key_list
#    (possibly created by byte position (backwards) correction above)
    
    key_list_ = []
    
    count_duplicates = 0
    
    # an entry is kept only when the next entry has a different key, so the
    # last entry of each run of equal keys survives
    for idx in range(len(key_list) - 1):
        
        if key_list[idx][0] != key_list[idx + 1][0]:
            
            key_list_.append(key_list[idx])
            
        else:
            
            count_duplicates += 1
            
    # the final entry has no successor and is always kept
    key_list_.append(key_list[-1])

    print '\n' + 'removed duplicates :', count_duplicates
    print ('runtime : %0.3f seconds'%(segment_timer.timer(False, t_start)))
    
    t_start = segment_timer.timer(True)
    
    print '\n' + '[saving sorted file]\n\n\t' + dst_csv    
    
    f = open(dst_csv, 'w')
    
    # copy the header line first
    data_reader.seek(0)
    line_out = data_reader.readline()
    
    f.write(line_out)
    
    # then copy each data line by seeking to its recorded offset, in key order
    for key in key_list_:
        
        catch_byte = key[1]
        data_reader.seek(catch_byte)
        line_out=data_reader.readline()
        f.write(line_out)

    f.close()
    
    data_reader.close()
    
    print '\n' + 'end of sort'
    print ('runtime : %0.3f seconds'%(segment_timer.timer(False, t_start)))

    print '\n' + '[deleting unsorted .csv]\n\n\t' + dst_csv_2011q1_to_2013q4_temp
    os.remove(dst_csv_2011q1_to_2013q4_temp)
    
    print
    
    return None
Пример #5
0
def parse(src, year):

    print 'parse T-100 .csv files to .bin, save to \\temp'
    print '** note that raw T-100 .zip and .csv must be renamed as yyyy_*.* before use **'

    assert isinstance(src, str), 'src must be a string'
    assert isinstance(year, int), 'year must be an integer'

    error_string = ''
        
    print '\n[source]\n\n\t' + src
    
    dst_folder = '..\\input\\' + str(year) + '_T100D_SEGMENT_ALL_CARRIER_FOLDER.csv'

    t_unzip_csv_start = segment_timer.timer(True)

    print '\nunzipping folder to \\input'
    
    print '\n[destination]\n\n\t' + dst_folder
    
    zip = zipfile.ZipFile(src)
    zip.extractall(dst_folder)
    zip.close()
    
    src_csv = dst_folder + '\\' + str(year) + '_T100D_SEGMENT_ALL_CARRIER.csv'    
    dst_csv = '..\\temp\\' + str(year) + '_T100D_SEGMENT_ALL_CARRIER.csv'
        
    print '\ncopying .csv from \\input (folder) to \\temp'
        
    shutil.move(src_csv, dst_csv)
    
    print 'deleting redundant folder from \\input'

    shutil.rmtree(dst_folder)
    
    print '\n[warning]\n\n\t .csv \\input datafile is large'

    print '\nopening: ' + dst_csv
        
    f = open(dst_csv, 'r')
    
    header = f.readline().strip().split('"')
   
    header_list = []
    
    for variable in header:
        
        if variable != ',':
            
            header_list.append(variable)       
            
        else:
            
          pass
        
    header_list = header_list[1:]
    
    print '\nList of variables in dataset (.csv column order):',\
        len(header_list), 'variables'
        
    print '\n\t',
    
    for variable_name in header_list:
        print variable_name,
        
    print
    
    retain_name_list = ['YEAR', 'QUARTER', 'ORIGIN', 'DEST', 'CARRIER',\
                        'CARRIER_GROUP', 'PASSENGERS', 'SEATS', 'CLASS',\
                        'AIRCRAFT_GROUP', 'AIRCRAFT_TYPE', 'AIRCRAFT_CONFIG',\
                        'AIR_TIME', 'RAMP_TO_RAMP', 'DEPARTURES_PERFORMED']
        
    retain_list = ['DEPARTURES_PERFORMED', 'CARRIER_GROUP', 'PASSENGERS',\
                    'SEATS','AIRCRAFT_GROUP','AIRCRAFT_TYPE','AIRCRAFT_CONFIG',\
                    'AIR_TIME','RAMP_TO_RAMP']

    print '\nRetaining following variables (list order):',\
        len(retain_name_list), 'variables' 
    
    print '\n\t',
    
    for variable_name in retain_name_list:
        
        if variable_name in header_list:
            
            print variable_name,
            
        else:
            
            print variable_name, '(not available)'
            raise Exception ('variable_name missing from retain_name_list')
    
    print
    
    data_itin_dict = dict([x, []] for x in retain_name_list)
    
    t_open_csv_start = segment_timer.timer(True)

    intermediate_dict={}
    
    count = 0
    
    for line in f:
        
        count += 1
        
        next_line = line.strip().split('"')
        
        next_line_list = []
        
        for variable in next_line:
            
            if variable != ',':
                
               next_line_list.append(variable)       
               
            else:
                
              pass
          
        next_line_list = next_line_list[0:47]
        
        line_data = add_to_b(next_line_list)    
        
        if len(line_data) != len(header_list):
            
            print
            print 'line length problem in line', count + 1, 'for year', year
            error_msg = 'line length problem in line' + str(count + 1) + ' for year ' + str(year)
            error_string += error_msg
            error_string += '\n'
        
        if len(line_data)==len(header_list):
        
            key_list=[]
        
            key_list.append(line_data[header_list.index('ORIGIN')])
            key_list.append(line_data[header_list.index('DEST')])    
            key_list.sort()
            
            key_list.append(line_data[header_list.index('CARRIER')])
            key_list.append(line_data[header_list.index('YEAR')])
            key_list.append(line_data[header_list.index('QUARTER')])  
#            key_list.append(line_data[header_list.index('CLASS')])
            
            key = '_'.join(key_list)
            
            if line_data[header_list.index('CLASS')] in ['F','L']:
                            
                if key not in intermediate_dict:
                    
                    intermediate_dict[key] = {}
                    
                    for k in retain_list:
                        
                        if k in ['PASSENGERS', 'RAMP_TO_RAMP', 'AIR_TIME', 'SEATS', 'DEPARTURES_PERFORMED']:
                            
                            intermediate_dict[key][k] = [eval(line_data[header_list.index(k)])]
                        
                        else:
                            
                            intermediate_dict[key][k]=[line_data[header_list.index(k)]]                            
                    
                else:
            
                    for k in retain_list:
                        
                        if k in ['PASSENGERS', 'RAMP_TO_RAMP', 'AIR_TIME', 'SEATS', 'DEPARTURES_PERFORMED']:
                            
                            intermediate_dict[key][k].append(eval(line_data[header_list.index(k)]))
                            
                        else:
                            
                            intermediate_dict[key][k].append(line_data[header_list.index(k)])           
    
    f.close()
    
    data_dict = {}

    for key in intermediate_dict:
        
        data_dict[key] = {}
        
        for variable in intermediate_dict[key]:
            
            if variable in ['PASSENGERS', 'SEATS']:
                
                data_dict[key][variable] = sum(intermediate_dict[key][variable])
                
        if data_dict[key]['SEATS'] != 0.0:
            
            data_dict[key]['LOAD_FACTOR'] = 100.0 * float(data_dict[key]['PASSENGERS']) / data_dict[key]['SEATS']
            
        for variable in intermediate_dict[key]:
            
            if variable in ['AIR_TIME', 'DEPARTURES_PERFORMED']:
                
              data_dict[key][variable] = sum(intermediate_dict[key][variable])
              
        if data_dict[key]['DEPARTURES_PERFORMED'] != 0.0:
            
            data_dict[key]['MEAN_AIR_TIME'] = data_dict[key]['AIR_TIME'] / data_dict[key]['DEPARTURES_PERFORMED']
            
        for variable in intermediate_dict[key]:
            
            if variable in ['RAMP_TO_RAMP', 'DEPARTURES_PERFORMED']:
                
              data_dict[key][variable] = sum(intermediate_dict[key][variable])
              
        if data_dict[key]['DEPARTURES_PERFORMED'] != 0.0:
            
            data_dict[key]['MEAN_RAMP_TO_RAMP'] = data_dict[key]['RAMP_TO_RAMP'] / data_dict[key]['DEPARTURES_PERFORMED']
    
    print '\nnumber of lines', count
    print ('%0.3f seconds to parse data'%(segment_timer.timer(False, t_open_csv_start)))

    dst_temp = '..\\temp\\T100_merge_' + str(year) + '.bin'

    print '\nsave file: ' + dst_temp
    
    f = open (dst_temp, 'wb')
    cPickle.dump(data_dict, f)
    f.close()

    dst_error = '..\\temp\\error_string_' + str(year) + '.txt'
    
    if error_string != '':
        
        f = open(dst_error, 'wb')
        f.write(error_string)
        f.close()

    print 'deleting redundant file from \\temp: ' + dst_csv
    
    os.remove(dst_csv)

    return None
Пример #6
0
def parse_csv(src, security, security_max, year, quarter):    

    print 'parse DB1B Ticket data from .zip to coupon_year_quarter.bin'
    
    assert isinstance(src, str), 'src must be a string'
    assert isinstance(security, bool), 'security must be a Boolean'
    assert ((security_max > 0) and isinstance(security_max, int)),\
        'security_max must be a positive integer'
    
    print    
    print '[source]\n\n\t' + src
    
    dst_folder = '..\\input\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '_FOLDER.csv'

    t_unzip_csv_start = segment_timer.timer(True)

    print '\n' + 'unzipping folder to \\input'
    
    print '\n[destination]\n\n\t' + dst_folder
    
    zip = zipfile.ZipFile(src)
    zip.extractall(dst_folder)
    zip.close()
    
    src_csv = dst_folder + '\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '.csv'    
    dst_csv = '..\\temp\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '.csv'
        
    print '\n' + 'copying .csv from \\input (folder) to \\temp'
        
    shutil.move(src_csv, dst_csv)
    
    print 'deleting redundant folder from \\input'

    shutil.rmtree(dst_folder)    
    
    print '\n' + '[warning]\n\n\t .csv \\input datafile is large'
    
    print '\n' + 'opening .csv file for line count'
    
    data_reader = open(dst_csv, 'r')    
    
    print '%0.3f seconds to unzip and open file'%(segment_timer.timer(False, t_unzip_csv_start))

#    count number of lines in .csv file
    
    if security:
        
        print '\n[** running in reduced-lines mode **]'
        max_count = None
        
    else:
        
        print '\ncounting number of lines in dataset'    
        max_count = 0
    
        t_count_lines_start = segment_timer.timer(True)
    
        for line in data_reader:
            max_count += 1
    
        print '%0.3f seconds to count lines in file'%(segment_timer.timer(False, t_count_lines_start))
        print '\nnumber of lines of data (including header):', max_count
        print 'sleeping for 5 seconds'
        
        time.sleep(5)    
    
    data_reader.close()
    
    print '\nre-open .csv file for parse'
    
    t_reopen_csv_start = segment_timer.timer(True)
    
    data_reader = open(dst_csv, 'r')
    
    print '%0.3f seconds to re-open file'%(segment_timer.timer(False, t_reopen_csv_start))
     
    print '\nconstruct list of Ticket .csv variable names, in key_list'
    
    t_parse_timer = segment_timer.timer(True)
    
    key_list = []
    
    for line in data_reader:
        
        key_list_raw = line.split('"')[1:-1]
        
        for variable_name in key_list_raw:
            
            if variable_name != ',':
                key_list.append(variable_name)
            else:
                pass
        break               
    
    print 'list of variables in dataset (.csv column order):',\
          len(key_list),'variables'
    print '\n[variables]\n\n\t',
    
    for variable_name in key_list:
        print variable_name,
        
    retain_names_list = ['ItinID', 'ItinFare', 'DollarCred', 'BulkFare']
    
    print
    print '\nretaining following variables (list order):',\
          len(retain_names_list), 'variables'
    print '\n[variables]\n\n\t',

    for variable_name in retain_names_list:
        if variable_name in key_list:
            print variable_name,
        else:
            print variable_name, '(not available)'
            raise Exception('variable_name missing from retain_names_list')
    print
    
#    initialize data dictionary:
#    key/ variable name from retain_names_list, value/ list (empty by default)
#    the set of ith elements of the lists corresponds to one itinerary observation 
    
    data_itin_dict = {}
    
    for variable_name in retain_names_list:
        data_itin_dict[variable_name] = []
    
    print '\nretaining all carriers'
    
    count = 1
    
    if not security:
        
        t_intermediate_parse = segment_timer.timer(True)
        
        print '\nparsing data, percentage completed:'
        
        top_count = max_count / 100
        print count / top_count
    
#    loop over all lines in .csv file
    
    for line in data_reader:
        
        count += 1
        
#        if in test mode, exit data parse
        
        if security and count >= security_max:
            break      

        if not security:
            if float(count / top_count) == float(count) / top_count:
                
                print str(count / top_count) + '\t', '%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse))
                t_intermediate_parse = segment_timer.timer(True)
           
        line_split = line.split(',')[:-1]
        line_data_list = [eval(element) for element in line_split]
        
#        build dictionary: append values to lists, for each retained variable
#        no error trap for multiple occurrences of the same ItinID

        for name in retain_names_list:
                            
            data_itin_dict[name].append(line_data_list[key_list.index(name)])

    print '%0.3f seconds to parse data'%(segment_timer.timer(False, t_parse_timer))
    
    print '\nnumber of retained itineraries:', len(data_itin_dict[data_itin_dict.keys()[0]])
    print 'number of lines read:', count
    
    print '\nillustrative itineraries:\n'
    
    if len(data_itin_dict[data_itin_dict.keys()[0]]) >= 3:
        for itin_number in xrange(3):
            for variable in retain_names_list:
                print variable, data_itin_dict[variable][itin_number],
            print
        
    if not security:
        print '\ntotal number of lines:', max_count
    
#    safe_cPickle Python dictionary ticket_year_quarter
    
    print 'save .bin to \\temp'
        
    dst = '..\\temp\\' + 'ticket_' + str(year) + '_' + str(quarter) + '.bin'

    print '\n[temp]\n\n\t' + dst
                            
    safe_cPickle.safe_cPickle_dump(dst, data_itin_dict)
    
    if not security:
        
        print 'sleeping for 15 seconds'
        time.sleep(15)
    
    data_reader.close()
    
    print '\ndeleting .csv file from \\temp'
    
    os.remove(dst_csv)
    
    return None
Пример #7
0
def parse_csv(src, security, security_max, year, quarter):
    
    print 'parse DB1B Coupon data from .zip to coupon_year_quarter.bin'
    
    assert isinstance(src, str), 'src must be a string'
    assert isinstance(security, bool), 'security must be a Boolean'
    assert ((security_max > 0) and isinstance(security_max, int)),\
        'security_max must be a positive integer'
    
    print    
    print '[source]\n\n\t' + src
    
    dst_folder = '..\\input\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '_FOLDER.csv'

    t_unzip_csv_start = segment_timer.timer(True)

    print '\n' + 'unzipping folder to \\input\n'
    
    print '[destination]\n\n\t' + dst_folder
    
    zip = zipfile.ZipFile(src)
    zip.extractall(dst_folder)
    zip.close()
    
    src_csv = dst_folder + '\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '.csv'    
    dst_csv = '..\\temp\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '.csv'
    dst_csv_2011q1_to_2013q4_temp = '..\\temp\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '_UNSORTED.csv'

    if year >= 2014:
        
        raise Exception('sort not implemented', year)

    if 2011 <= year <= 2013:
                
        print '\n' + '[sort csv for 2011Q1 to 2013Q4 inclusive]'
        print '\n' + 'copying .csv from \\input (folder) to \\temp'
        
        shutil.move(src_csv, dst_csv_2011q1_to_2013q4_temp)
        
        sort_coupon_csv_2011_on.sort_coupon_csv(dst_csv_2011q1_to_2013q4_temp, dst_csv, year, quarter)
    
    else:
        
        print '\n' + 'copying .csv from \\temp (folder) to \\temp'
        
        shutil.move(src_csv, dst_csv)
    
    print 'deleting redundant folder from \\input'

    shutil.rmtree(dst_folder)
    
    print '\n[warning]\n\n\t .csv \\input datafile is large\n'
    
    print 'opening .csv file for line count'
    
    data_reader = open(dst_csv, 'r')    
    
    print '%0.3f seconds to unzip and open file'%(segment_timer.timer(False, t_unzip_csv_start))

#    count number of lines in .csv file
    
    if security:
        
        print '\n[** running in reduced-lines mode **]'
        max_count = None
        
    else:
        
        print '\n' + 'counting number of lines in dataset'    
        max_count = 0
    
        t_count_lines_start = segment_timer.timer(True)
    
        for line in data_reader:
            max_count += 1
    
        print '%0.3f seconds to count lines in file'%(segment_timer.timer(False, t_count_lines_start))
        print 'number of lines of data (including header):', max_count
        
        print '\n' + 'sleeping for 5 seconds'
        
        time.sleep(5)    
    
    data_reader.close()
    
    print '\n' + 're-open .csv file for parse'
    
    t_reopen_csv_start = segment_timer.timer(True)
    
    data_reader = open(dst_csv, 'r')
    
    print '%0.3f seconds to re-open file'%(segment_timer.timer(False, t_reopen_csv_start))
        
    large_carriers_dict = {'Southwest':'WN', 'American':'AA', 'Continental':'CO',\
                    'Delta':'DL', 'Northwest':'NW', 'Skywest':'OO',\
                    'United':'UA', 'US Airways':'US', 'American Eagle':'MQ',\
                    'Airtran Airways':'FL', 'Express Jet':'EV',\
                    'Jetblue':'B6', 'Alaska Airlines':'AS', 'Endeavor Air':'9E'}
    
    large_carrier_condition = False    
     
    print '\n' + 'construct list of Coupon .csv variable names, in key_list'
    
    t_parse_timer = segment_timer.timer(True)
    
    key_list = []
    
    for line in data_reader:
        
        key_list_raw = line.split('"')[1:-1]
        
        for variable_name in key_list_raw:
            
            if variable_name != ',':
                key_list.append(variable_name)
            else:
                pass
        break               
    
    print 'list of variables in dataset (.csv column order):',\
          len(key_list),'variables'

    print '\n[variables]\n\n\t',
    
    for variable_name in key_list:
        print variable_name,
        
    retain_names_list = ['ItinID', 'Year', 'Quarter', 'Origin', 'Dest',\
        'OpCarrier', 'Passengers', 'TkCarrier', 'Distance', 'FareClass']
    
    print
    print '\n' + 'retaining following variables (list order):',\
          len(retain_names_list), 'variables'
          
    print '\n' + '[variables]\n\n\t',

    for variable_name in retain_names_list:
        if variable_name in key_list:
            print variable_name,
        else:
            print variable_name, '(not available)'
            raise Exception('variable_name missing from retain_names_list')
    print
    
#    initialize data dictionary:
#    key/ variable name from retain_names_list, value/ list (empty by default)
#    the set of ith elements of the lists corresponds to one itinerary observation 
    
    data_itin_dict = {}
    
    for variable_name in retain_names_list:
        data_itin_dict[variable_name] = []
    
    if large_carrier_condition:
        
        print '\n' + 'retaining following large carriers:'
        for i in large_carriers_dict:
            print i+' : ',
        print
        
    else:
        
        print '\n' + 'retaining all carriers\n'
    
    count = 1
    
    if not security:
        
        t_intermediate_parse = segment_timer.timer(True)
        
        print 'parsing data, percentage completed:'
        
        top_count = max_count / 100
        print count / top_count
    
#    loop over all lines in .csv file
    
    for line in data_reader:
        
        count += 1
    
        if not security:
            
            if float(count / top_count) == float(count) / top_count:

                print str(count / top_count)+'\t','%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse))

                t_intermediate_parse = segment_timer.timer(True)  
    
#        line_data_list is a list of evaluated data elements
        
        line_split = line.split(',')[:-1]
        
        line_data_list = [eval(element) for element in line_split]
    
#        new itinerary for retained operating carrier if SeqNum == 1
#        deals with possible incomplete itinerary at start of data_reader (skipped)
        
        itin_test = line_data_list[key_list.index('ItinID')]   
        
#        if itin_test == eval('201122538709'):
#            print 'out', line_data_list
        
        if line_data_list[key_list.index('SeqNum')] == 1 and line_data_list[key_list.index('Coupons')] == 2:

#            number of coupons in newly-identified itinerary
            
            coupons_new = line_data_list[key_list.index('Coupons')]
                        
            if coupons_new == 1:
#                one-way
                pass
            
            elif coupons_new == 2:
                    
#                list of data corresponding to newly-identified itinerary
                
                line_data_list_first = line_data_list[:]
        
#                carrier_list will contain OpCarrier for each coupon in itinerary
#                to enable constant OpCarrier itineraries to be retained
#                (also applies to fare_class_list, passengers_list, and tk_carrier_list if required)
                
                carrier_list = [line_data_list[key_list.index('OpCarrier')][:]]            
                fare_class_list = [line_data_list[key_list.index('FareClass')][:]]
                passengers_list = [line_data_list[key_list.index('Passengers')]]
                tk_carrier_list = [line_data_list[key_list.index('TkCarrier')][:]]
                coupon_type_list = [line_data_list[key_list.index('CouponType')][:]]
                
#                if segment has trip break at SeqNum==1, create candidate destination variable
                
                if line_data_list[key_list.index('Break')] == 'X':
                    
                    count_break = 1
                    line_data_list_dest = line_data_list[key_list.index('Dest')][:]
                    
                else:
                    
                    count_break = 0            
                        
#                once new itinerary identified, continue to loop over lines in .csv file
                
                for line_ in data_reader:
                    
                    count += 1
        
#                    if in test mode, exit data parse
                    
                    if security and count >= security_max:
                        break
        
                    if not security:
                        if float(count / top_count) == float(count) / top_count:
                            
                            print str(count / top_count) + '\t', '%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse))
                            t_intermediate_parse = segment_timer.timer(True)
                       
                    line_split_ = line_.split(',')[:-1]
                    line_data_list_ = [eval(element_) for element_ in line_split_]
                    
                    itin_test_ = line_data_list_[key_list.index('ItinID')]   
                    
                    if itin_test != itin_test_:
                        break
        
#                    if itin_test_ == eval('201122538709'):
#                        print 'return', line_data_list_
                
                    carrier_list.append(line_data_list_[key_list.index('OpCarrier')][:])        
                    fare_class_list.append(line_data_list_[key_list.index('FareClass')][:])
                    passengers_list.append(line_data_list_[key_list.index('Passengers')])
                    tk_carrier_list.append(line_data_list_[key_list.index('TkCarrier')][:])
                    coupon_type_list.append(line_data_list_[key_list.index('CouponType')][:])
    
#                    if at end of itinerary, create candidate final destination variable
#                    if not at end of itinerary, but segment has trip break, 
#                    create candidate destination variable
                    
                    if line_data_list_[key_list.index('SeqNum')] == coupons_new:
                        
                        if line_data_list_[key_list.index('Break')] == 'X':
                            
                            count_break += 1
                            line_data_list_final_dest = line_data_list_[key_list.index('Dest')][:]
        
#                        break out of inner (line_) for loop
#                        and check whether to retain itinerary
                            
                        break
                    
                    else:
                        
                        if line_data_list_[key_list.index('Break')] == 'X':
                            
                            countBreak += 1
                            line_data_list_dest=line_data_list_[key_list.index('Dest')][:]
                            
                        else:
                            
                            pass
        
                    break
                
#                retain itinerary if the following conditions are satisfied:
#                1) 2 trip breaks
#                2) origin=destination (round-trip)
#                3) 2 coupons
#                4) constant OpCarrier
#                5) constant FareClass
#                6) constant Passengers
#                7) constant TkCarrier, same as OpCarrier
#                8) no 'E' in CouponType (no cabotage on itinerary)
#                9) legs in lower 48 states only
        
#                to do: need to check whether OpCarrier code (IATA?) is constant over time
#                or if there is an alternative unique carrier code
        
#                older versions of code retained all itineraries with <= 4 coupons
        
#                for large carrier restriction, add condition:
#                and carrier_list[0] in large_carriers_dict.values()
                
                itinerary_condition = (
                                    (count_break == 2)
                                    and (line_data_list_first[key_list.index('Origin')] == line_data_list_final_dest)
                                    and (line_data_list_first[key_list.index('Coupons')] == 2)
                                    and carrier_list.count(carrier_list[0]) == len(carrier_list)
                                    and fare_class_list.count(fare_class_list[0]) == len(fare_class_list)
                                    and passengers_list.count(passengers_list[0]) == len(passengers_list)
                                    and tk_carrier_list.count(tk_carrier_list[0]) == len(tk_carrier_list)
                                    and coupon_type_list.count('E') == 0
                                    and (line_data_list[key_list.index('ItinGeoType')] == 2)
                                    and ((line_data_list[key_list.index('FareClass')] in ['X','Y','C','D','F','G']))
                                    and tk_carrier_list[0] == carrier_list[0]
                                    )
                
#                if itin_test_ == eval('201122538709'):
#                    print 'testing ItinID 201122538709'
#                    print itinerary_condition
#                    print count_break
#                    print line_data_list_first[key_list.index('Origin')]
#                    print line_data_list_final_dest
#                    print line_data_list_first[key_list.index('Coupons')]
#                    print carrier_list
#                    print fare_class_list
#                    print passengers_list
#                    print tk_carrier_list
#                    print coupon_type_list
#                    print line_data_list[key_list.index('ItinGeoType')]
#                    print line_data_list[key_list.index('FareClass')]
#                    print
                    
#                itinerary_condition_wn_first_class=(
#                                    (count_break == 2)
#                                    and (line_data_list_first[key_list.index('Origin')] == line_data_list_final_dest)
#                                    and (line_data_list_first[key_list.index('Coupons')] == 2)
#                                    and carrier_list.count(carrier_list[0]) == len(carrier_list)
#                                    and fare_class_list.count(fare_class_list[0]) == len(fare_class_list)
#                                    and passengers_list.count(passengers_list[0]) == len(passengers_list)
#                                    and tk_carrier_list.count(tk_carrier_list[0]) == len(tk_carrier_list)
#                                    and coupon_type_list.count('E') == 0
#                                    and (line_data_list[key_list.index('ItinGeoType')] == 2)
#                                    and ((line_data_list[key_list.index('FareClass')] in ['F','G']))
#                                    and tk_carrier_list[0] == carrier_list[0]
#                                    and carrier_list[0] == 'WN'
#                                    )            
                
                if large_carrier_condition:
                    
                    itinerary_condition = (itinerary_condition and carrier_list[0] in large_carriers_dict.values())
#                    itinerary_condition_wn_first_class = (itinerary_condition_wn_first_class and carrier_list[0] in large_carriers_dict.values())
                
#                if (itinerary_condition or itinerary_condition_wn_first_class):
                
                if itinerary_condition:
                    
#                    if itinerary_condition satisfied, retain all data for itinerary
#                    otherwise, continue searching for a new itinerary in data_reader
                    
                    for name in retain_names_list:
                        
                        if name != 'Dest':
                            
                            data_itin_dict[name].append(line_data_list_first[key_list.index(name)])
                            
                        else:
                            
                            data_itin_dict['Dest'].append(line_data_list_dest)
                
                    if itin_test_ == eval('201122538709'):
                        
                        for key in data_itin_dict.keys():
                            print key, data_itin_dict[key][-1]
                
                else:
        
#                    continue with outer (line) for loop
                    
                    continue
    
#                if itin_test_ == eval('201122538709'):
#                    print data_itin_dict['OpCarrier'].count('WN')
    
#        if in test mode, exit data parse
    
        if security and count >= security_max:
            break

    print '\n' + '%0.3f seconds to parse data'%(segment_timer.timer(False, t_parse_timer))
    
    print '\n' + 'number of retained itineraries:', len(data_itin_dict[data_itin_dict.keys()[0]])
    print 'number of lines read:', count
    
    print '\n' + '[illustrative itineraries]\n\n\t',
    
    if len(data_itin_dict[data_itin_dict.keys()[0]]) >= 3:
        for itin_number in xrange(3):
            for variable in retain_names_list:
                print variable, data_itin_dict[variable][itin_number],
            print '\n\t',
        
    if not security:
        print 'total number of lines:', max_count
    
#    safe_cPickle Python dictionary coupon_year_quarter
    
    print '\n' + 'save .bin to \\temp'
    
#    if large_carrier_condition == True, dst filename will not change    
    
    dst = '..\\temp\\' + 'coupon_' + str(year) + '_' + str(quarter) + '.bin'

    print '\n[temp]\n\n\t' + dst
                            
    safe_cPickle.safe_cPickle_dump(dst, data_itin_dict)
    
    if not security:
        
        print 'sleeping for 15 seconds'
        time.sleep(15)
        
    data_reader.close()
    
    print '\n' + 'deleting .csv file from \\temp\n'
    
    os.remove(dst_csv)
    
#    descriptive statistics for 2013Q4 (any quarter can be called)    
    
    if (year == 2013) and (quarter == 4):    
        
        coupon_descriptives.compute(year, quarter)
    
    return None