def safe_cPickle_dump(dst, object): import cPickle, segment_timer try: file=open(dst, 'r') file.close() rsp = '' while rsp != 'n' and rsp != 'y': rsp = raw_input('(safe_cPickle has detected object "' + dst + '") Overwrite? (y/n) ') if rsp == 'y': t_pickle_start = segment_timer.timer(True) file_ = open(dst, 'wb') cPickle.dump(object, file_) file_.close() print '%0.3f seconds to cPickle file'%(segment_timer.timer(False, t_pickle_start)) print else: raise Exception('terminating') except IOError: print '\n' + '(safe_cPickle has not detected object "' + dst + '") Saving object.' t_pickle_start = segment_timer.timer(True) file_ = open(dst, 'wb') cPickle.dump(object, file_) file_.close() print '%0.3f seconds to cPickle file'%(segment_timer.timer(False, t_pickle_start)) print return None
def merge(src_coupon, src_ticket, yyyy, q): print 'merge Coupon and Ticket .bin files to Itinerary' t_start_total = segment_timer.timer(True) print print '[loading]\n\n\t' + src_ticket t_start = segment_timer.timer(True) f = open(src_ticket, 'r') ticket = cPickle.load(f) f.close() print '%0.3f seconds '%(segment_timer.timer(False, t_start)) print '\n[loading]\n\n\t' + src_coupon t_start = segment_timer.timer(True) f = open(src_coupon, 'r') coupon = cPickle.load(f) f.close() print '%0.3f seconds '%(segment_timer.timer(False, t_start)) print '\ncopying coupon dictionary' t_start = segment_timer.timer(True) output = copy.deepcopy(coupon) output['ItinFare'] = [] output['ItinFareReal'] = [] print '%0.3f seconds '%(segment_timer.timer(False, t_start)) coupon_length = len(coupon[coupon.keys()[0]]) ticket_length = len(ticket[ticket.keys()[0]]) del coupon ticket_dict = {} dollar_cred_dict = {} bulk_fare_dict = {} print '\nbuilding ticket, DollarCred and BulkFare dictionaries' t_start = segment_timer.timer(True) for i in range(ticket_length): ticket_dict[ticket['ItinID'][i]] = ticket['ItinFare'][i] dollar_cred_dict[ticket['ItinID'][i]] = ticket['DollarCred'][i] bulk_fare_dict[ticket['ItinID'][i]] = int(ticket['BulkFare'][i]) print '%0.3f seconds '%(segment_timer.timer(False, t_start)) del ticket if len(ticket_dict) != ticket_length: raise Exception('duplicate key in ticketDict') print '\nparsing CPI data' CPI2013Q4_dict = cpi_parse.parse() print '\nadd ItinFare and ItinFareReal (2013Q4 prices) to output dictionary' t_start = segment_timer.timer(True) for i in range(coupon_length): fare_nominal = ticket_dict[output['ItinID'][i]] fare_real = fare_nominal * CPI2013Q4_dict[str(yyyy) + '_' + str(q)] output['ItinFare'].append(fare_nominal) output['ItinFareReal'].append(fare_real) print '%0.3f seconds '%(segment_timer.timer(False, t_start)) print '\nremove itineraries with DollarCred = 0' t_start = segment_timer.timer(True) output2 = {} for key in output.keys(): output2[key] = [] 
count_remove = 0 for i in range(coupon_length): if dollar_cred_dict[output['ItinID'][i]] == 1: for key in output2.keys(): output2[key].append(output[key][i]) else: # fare not credible (DollarCred = 0) count_remove += 1 del output output = copy.deepcopy(output2) del output2 print str(count_remove) + ' itineraries removed' print '%0.3f seconds '%(segment_timer.timer(False, t_start)) coupon_length_ = len(output[output.keys()[0]]) print '\nremove itineraries with BulkFare = 1' t_start = segment_timer.timer(True) output2 = {} for key in output.keys(): output2[key] = [] count_remove = 0 for i in range(coupon_length_): if bulk_fare_dict[output['ItinID'][i]] == 0: for key in output2.keys(): output2[key].append(output[key][i]) else: # bulk fare (BulkFare = 1) count_remove += 1 del output output = copy.deepcopy(output2) del output2 print str(count_remove) + ' itineraries removed' print '%0.3f seconds '%(segment_timer.timer(False, t_start)) coupon_length_after_dollar_and_bulk_fare_cred = len(output[output.keys()[0]]) output_explode_passengers = {} for key in output.keys(): output_explode_passengers[key] = [] for i in range(coupon_length_after_dollar_and_bulk_fare_cred): for key in output_explode_passengers.keys(): for j in range(int(output['Passengers'][i])): output_explode_passengers[key].append(output[key][i]) del output del output_explode_passengers['Passengers'] dst_itinerary = '..\\temp\\itinerary_' + str(yyyy) + '_' + str(q) + '.bin' f = open(dst_itinerary, 'wb') print '\nsave itinerary ' + dst_itinerary t_start = segment_timer.timer(True) cPickle.dump(output_explode_passengers, f) del output_explode_passengers print '%0.3f seconds '%(segment_timer.timer(False, t_start)) f.close() print '\nTotal time: ' + ('%0.3f seconds '%(segment_timer.timer(False, t_start_total))) del ticket_dict del dollar_cred_dict del bulk_fare_dict del fare_nominal del fare_real return None
def compress(src, yyyy, q): print 'aggregate itinerary*.bin to route-level' t_start_total = segment_timer.timer(True) print print 'loading: ' + src t_start = segment_timer.timer(True) f = open(src, 'r') itinerary = cPickle.load(f) f.close() print '%0.3f seconds '%(segment_timer.timer(False, t_start)) length_itinerary = len(itinerary[itinerary.keys()[0]]) route_level_dict = {} print '\ncreating route-level dictionary' print '\n- remove all tickets with (nominal) ItinFare < $20' t_start = segment_timer.timer(True) for i in range(length_itinerary): origin = itinerary['Origin'][i] destination = itinerary['Dest'][i] opcarrier = itinerary['OpCarrier'][i] year = str(itinerary['Year'][i]) quarter = str(itinerary['Quarter'][i]) # non-directional route route = [origin, destination] route.sort() key = (route[0] + '_' + route[1] + '_' + opcarrier + '_' + year + '_' + quarter) distance = itinerary['Distance'][i] tkcarrier = itinerary['TkCarrier'][i] fareclass = itinerary['FareClass'][i] itinfare = itinerary['ItinFare'][i] itinfarereal = itinerary['ItinFareReal'][i] # no need to use ItinID from here onwards value = [distance, tkcarrier, fareclass, itinfare, itinfarereal] frequent_flyer = (itinfare < 20.0) if not frequent_flyer: if key not in route_level_dict.keys(): route_level_dict[key] = [value] else: route_level_dict[key].append(value) print '%0.3f seconds '%(segment_timer.timer(False, t_start)) print '\n# route-carriers remaining', len(route_level_dict.keys()) del itinerary route_level_dict_2 = copy.deepcopy(route_level_dict) del route_level_dict route_level_dict_3 = {} print '\n- remove all tickets with (nominal) ItinFare > 99th percentile of route-carrier-quarter fare distribution' t_start = segment_timer.timer(True) for k in route_level_dict_2.keys(): nominal_fare_list = [] for ticket in route_level_dict_2[k]: nominal_fare_list.append(ticket[3]) p99 = scipy.percentile(scipy.array(nominal_fare_list), 99) for ticket in route_level_dict_2[k]: high_fare = (ticket[3] > p99) if 
not high_fare: if k not in route_level_dict_3.keys(): route_level_dict_3[k] = [ticket] else: route_level_dict_3[k].append(ticket) del route_level_dict_2 print '%0.3f seconds '%(segment_timer.timer(False, t_start)) print '\n# route-carriers remaining', len(route_level_dict_3.keys()) route_level_dict_4 = {} print '\n- remove all WN routes that involve DFW, from 1993Q1 to 1999Q4' t_start = segment_timer.timer(True) count_wn_dfw_tickets = 0 for k in route_level_dict_3.keys(): k__ = k.split('_') origin_ = k__[0] dest_ = k__[1] carrier_ = k__[2] year_ = int(k__[3]) condition = ( (carrier_ == 'WN') and ((origin_ == "DFW") or (dest_ == "DFW")) and (year_ in range(1993, 2000)) ) if condition: count_wn_dfw_tickets += 1 else: route_level_dict_4[k] = route_level_dict_3[k] del route_level_dict_3 print '%0.3f seconds '%(segment_timer.timer(False, t_start)) print '\n# WN DFW routes removed:', count_wn_dfw_tickets print '\n# route-carriers remaining', len(route_level_dict_4.keys()) d_class = {} for key in route_level_dict_4: for ticket in route_level_dict_4[key]: if ticket[1] not in d_class.keys(): d_class[ticket[1]] = {'coach' : 0, 'other' : 0} if ticket[2] not in ['X','Y']: d_class[ticket[1]]['other'] += 1 else: d_class[ticket[1]]['coach'] += 1 d_class_coach_only = {} for key in d_class: num = float(d_class[key]['other']) den = float(d_class[key]['coach'] + d_class[key]['other']) d_class_coach_only[key] = num / den print for key in d_class_coach_only: print key, '%0.1f percent not coach'%(100 * d_class_coach_only[key]) print '\ncompress dictionary, no error trap for Distance, TkCarrier, FareClass' t_start = segment_timer.timer(True) route_level_dict_5 = {} for key in route_level_dict_4.keys(): distance = route_level_dict_4[key][0][0] nominal_fare_list = [] real_fare_list = [] fare_class_list = [] for ticket in route_level_dict_4[key]: # note that ticket[1] is the ticketing, not operating carrier; even if ticketing=operating here if d_class_coach_only[ticket[1]] > 0.75: 
nominal_fare_list.append(ticket[3]) real_fare_list.append(ticket[4]) fare_class_list.append(ticket[2]) else: if ticket[2] in ['X','Y']: nominal_fare_list.append(ticket[3]) real_fare_list.append(ticket[4]) fare_class_list.append(ticket[2]) nominal_fare_list.sort() real_fare_list.sort() route_level_dict_5[key] = [distance, nominal_fare_list, real_fare_list] condition = ( (key.split('_')[2] == 'WN') and (('F' in fare_class_list) or ('G' in fare_class_list)) ) if condition: dst_wn = '..\\temp\\' + key + '.txt' output_string = '' output_string += 'Southwest reporting first class tickets:\n' output_string += ('# coach class on route-carrier-quarter ' + key + ':\n') output_string += (str((fare_class_list.count('X') + fare_class_list.count('Y'))) + '\n') output_string += ('# first class on route-carrier-quarter ' + key + ':\n') output_string += (str((fare_class_list.count('F') + fare_class_list.count('G'))) + '\n') print 'saving temporary file: ' + dst_wn f = open(dst_wn, 'w') f.write(output_string) f.close() del route_level_dict_4 print '%0.3f seconds '%(segment_timer.timer(False, t_start)) route_carrier_list = route_level_dict_5.keys()[:] route_level_dict_6 = {} print '\n- remove all route-carriers with < 100 passengers in quarter' t_start = segment_timer.timer(True) for j in route_carrier_list: low_volume = (len(route_level_dict_5[j][2]) < 100) if not low_volume: route_level_dict_6[j] = route_level_dict_5[j] del route_level_dict_5 print '%0.3f seconds '%(segment_timer.timer(False, t_start)) print '\n# route-carriers remaining', len(route_level_dict_6.keys()) dst_route_carrier = '..\\temp\\routecarrier_' + str(yyyy) + '_' + str(q) + '.bin' print '\nsaving: ' + dst_route_carrier t_start = segment_timer.timer(True) f = open(dst_route_carrier, 'wb') cPickle.dump(route_level_dict_6, f) f.close() print '%0.3f seconds '%(segment_timer.timer(False, t_start)) print '\ntotal time:' print '%0.3f seconds '%(segment_timer.timer(False, t_start_total)) del route_level_dict_6 return 
None
def sort_coupon_csv(dst_csv_2011q1_to_2013q4_temp, dst_csv, year, quarter):
    """Write a copy of a coupon .csv with its data lines sorted, then delete the input.

    The file is scanned with readline(); for every data line a numeric key
    is built from the first field (text after the last double quote) and the
    third field, zero-padded to two digits, and stored with the byte offset
    at which the line starts. The keys are sorted, repeated keys are
    collapsed, and the lines are copied to `dst_csv` in key order after the
    header line.

    Parameters:
        dst_csv_2011q1_to_2013q4_temp: path of the unsorted .csv (removed at
            the end).
        dst_csv: path of the sorted .csv to create.
        year: value the first four characters of each line must evaluate to
            for the line start to be accepted.
        quarter: not used in this function.

    Returns:
        None

    NOTE(review): block nesting was reconstructed from a source whose
    indentation had been lost - confirm against the original file.
    NOTE(review): eval() is applied to text read from the file; only use on
    trusted input.
    NOTE(review): `key_list[-1]` raises IndexError when no data line was
    indexed (e.g. header-only file).
    """
    data_reader = open(dst_csv_2011q1_to_2013q4_temp, 'r')
    print
    print '[sorting file]\n\n\t' + dst_csv_2011q1_to_2013q4_temp
    t_start = segment_timer.timer(True)
    key_list = []
    h = 'X'
    # readline() returns '' at end of file, which ends the scan
    while h != '':
        flag = True
        # candidate byte offset of the next line
        bp = data_reader.tell()
        while flag:
            data_reader.seek(bp)
            h = data_reader.readline()
            check_itin_id = h.split(',')[0][:4]
            h_ = h.split(',')
            try:
                # if the text at bp does not start with the expected year,
                # step back one byte and re-read; offset 0 (the header) is
                # always accepted
                if bp != 0 and eval(check_itin_id) != year:
                    bp -= 1
                else:
                    flag = False
            except SyntaxError:
                # fiddle : may lose last line (or more) of file?
                # text that is not a valid expression (including the empty
                # string at end of file) terminates the whole scan
                flag = False
                h = ''
        if bp != 0 and h != '':
            key_itin_id = h_[0].split('"')[-1]
            seq_num = h_[2]
            # sort key: first field followed by the third field padded to
            # two digits
            if eval(seq_num) <= 9:
                key_a = eval(key_itin_id + '0' + seq_num)
            else:
                key_a = eval(key_itin_id + seq_num)
            key_list.append([key_a, bp])
            # progress report every 100000 indexed lines
            if len(key_list)%100000 == 0:
                print len(key_list)
    key_list.sort()
    print '\n' + 'sorted keys'
    print ('runtime : %0.3f seconds'%(segment_timer.timer(False, t_start)))
    t_start = segment_timer.timer(True)
    # remove repeated items from key_list
    # (possibly created by byte position (backwards) correction above)
    # of a run of equal keys only the last entry in sorted order is kept
    key_list_ = []
    count_duplicates = 0
    for idx in range(len(key_list) - 1):
        if key_list[idx][0] != key_list[idx + 1][0]:
            key_list_.append(key_list[idx])
        else:
            count_duplicates += 1
    key_list_.append(key_list[-1])
    print '\n' + 'removed duplicates :', count_duplicates
    print ('runtime : %0.3f seconds'%(segment_timer.timer(False, t_start)))
    t_start = segment_timer.timer(True)
    print '\n' + '[saving sorted file]\n\n\t' + dst_csv
    f = open(dst_csv, 'w')
    # header line first, then every data line in key order
    data_reader.seek(0)
    line_out = data_reader.readline()
    f.write(line_out)
    for key in key_list_:
        catch_byte = key[1]
        data_reader.seek(catch_byte)
        line_out=data_reader.readline()
        f.write(line_out)
    f.close()
    data_reader.close()
    print '\n' + 'end of sort'
    print ('runtime : %0.3f seconds'%(segment_timer.timer(False, t_start)))
    print '\n' + '[deleting unsorted .csv]\n\n\t' + dst_csv_2011q1_to_2013q4_temp
    os.remove(dst_csv_2011q1_to_2013q4_temp)
    print
    return None
def parse(src, year): print 'parse T-100 .csv files to .bin, save to \\temp' print '** note that raw T-100 .zip and .csv must be renamed as yyyy_*.* before use **' assert isinstance(src, str), 'src must be a string' assert isinstance(year, int), 'year must be an integer' error_string = '' print '\n[source]\n\n\t' + src dst_folder = '..\\input\\' + str(year) + '_T100D_SEGMENT_ALL_CARRIER_FOLDER.csv' t_unzip_csv_start = segment_timer.timer(True) print '\nunzipping folder to \\input' print '\n[destination]\n\n\t' + dst_folder zip = zipfile.ZipFile(src) zip.extractall(dst_folder) zip.close() src_csv = dst_folder + '\\' + str(year) + '_T100D_SEGMENT_ALL_CARRIER.csv' dst_csv = '..\\temp\\' + str(year) + '_T100D_SEGMENT_ALL_CARRIER.csv' print '\ncopying .csv from \\input (folder) to \\temp' shutil.move(src_csv, dst_csv) print 'deleting redundant folder from \\input' shutil.rmtree(dst_folder) print '\n[warning]\n\n\t .csv \\input datafile is large' print '\nopening: ' + dst_csv f = open(dst_csv, 'r') header = f.readline().strip().split('"') header_list = [] for variable in header: if variable != ',': header_list.append(variable) else: pass header_list = header_list[1:] print '\nList of variables in dataset (.csv column order):',\ len(header_list), 'variables' print '\n\t', for variable_name in header_list: print variable_name, print retain_name_list = ['YEAR', 'QUARTER', 'ORIGIN', 'DEST', 'CARRIER',\ 'CARRIER_GROUP', 'PASSENGERS', 'SEATS', 'CLASS',\ 'AIRCRAFT_GROUP', 'AIRCRAFT_TYPE', 'AIRCRAFT_CONFIG',\ 'AIR_TIME', 'RAMP_TO_RAMP', 'DEPARTURES_PERFORMED'] retain_list = ['DEPARTURES_PERFORMED', 'CARRIER_GROUP', 'PASSENGERS',\ 'SEATS','AIRCRAFT_GROUP','AIRCRAFT_TYPE','AIRCRAFT_CONFIG',\ 'AIR_TIME','RAMP_TO_RAMP'] print '\nRetaining following variables (list order):',\ len(retain_name_list), 'variables' print '\n\t', for variable_name in retain_name_list: if variable_name in header_list: print variable_name, else: print variable_name, '(not available)' raise Exception 
('variable_name missing from retain_name_list') print data_itin_dict = dict([x, []] for x in retain_name_list) t_open_csv_start = segment_timer.timer(True) intermediate_dict={} count = 0 for line in f: count += 1 next_line = line.strip().split('"') next_line_list = [] for variable in next_line: if variable != ',': next_line_list.append(variable) else: pass next_line_list = next_line_list[0:47] line_data = add_to_b(next_line_list) if len(line_data) != len(header_list): print print 'line length problem in line', count + 1, 'for year', year error_msg = 'line length problem in line' + str(count + 1) + ' for year ' + str(year) error_string += error_msg error_string += '\n' if len(line_data)==len(header_list): key_list=[] key_list.append(line_data[header_list.index('ORIGIN')]) key_list.append(line_data[header_list.index('DEST')]) key_list.sort() key_list.append(line_data[header_list.index('CARRIER')]) key_list.append(line_data[header_list.index('YEAR')]) key_list.append(line_data[header_list.index('QUARTER')]) # key_list.append(line_data[header_list.index('CLASS')]) key = '_'.join(key_list) if line_data[header_list.index('CLASS')] in ['F','L']: if key not in intermediate_dict: intermediate_dict[key] = {} for k in retain_list: if k in ['PASSENGERS', 'RAMP_TO_RAMP', 'AIR_TIME', 'SEATS', 'DEPARTURES_PERFORMED']: intermediate_dict[key][k] = [eval(line_data[header_list.index(k)])] else: intermediate_dict[key][k]=[line_data[header_list.index(k)]] else: for k in retain_list: if k in ['PASSENGERS', 'RAMP_TO_RAMP', 'AIR_TIME', 'SEATS', 'DEPARTURES_PERFORMED']: intermediate_dict[key][k].append(eval(line_data[header_list.index(k)])) else: intermediate_dict[key][k].append(line_data[header_list.index(k)]) f.close() data_dict = {} for key in intermediate_dict: data_dict[key] = {} for variable in intermediate_dict[key]: if variable in ['PASSENGERS', 'SEATS']: data_dict[key][variable] = sum(intermediate_dict[key][variable]) if data_dict[key]['SEATS'] != 0.0: 
data_dict[key]['LOAD_FACTOR'] = 100.0 * float(data_dict[key]['PASSENGERS']) / data_dict[key]['SEATS'] for variable in intermediate_dict[key]: if variable in ['AIR_TIME', 'DEPARTURES_PERFORMED']: data_dict[key][variable] = sum(intermediate_dict[key][variable]) if data_dict[key]['DEPARTURES_PERFORMED'] != 0.0: data_dict[key]['MEAN_AIR_TIME'] = data_dict[key]['AIR_TIME'] / data_dict[key]['DEPARTURES_PERFORMED'] for variable in intermediate_dict[key]: if variable in ['RAMP_TO_RAMP', 'DEPARTURES_PERFORMED']: data_dict[key][variable] = sum(intermediate_dict[key][variable]) if data_dict[key]['DEPARTURES_PERFORMED'] != 0.0: data_dict[key]['MEAN_RAMP_TO_RAMP'] = data_dict[key]['RAMP_TO_RAMP'] / data_dict[key]['DEPARTURES_PERFORMED'] print '\nnumber of lines', count print ('%0.3f seconds to parse data'%(segment_timer.timer(False, t_open_csv_start))) dst_temp = '..\\temp\\T100_merge_' + str(year) + '.bin' print '\nsave file: ' + dst_temp f = open (dst_temp, 'wb') cPickle.dump(data_dict, f) f.close() dst_error = '..\\temp\\error_string_' + str(year) + '.txt' if error_string != '': f = open(dst_error, 'wb') f.write(error_string) f.close() print 'deleting redundant file from \\temp: ' + dst_csv os.remove(dst_csv) return None
def parse_csv(src, security, security_max, year, quarter): print 'parse DB1B Ticket data from .zip to coupon_year_quarter.bin' assert isinstance(src, str), 'src must be a string' assert isinstance(security, bool), 'security must be a Boolean' assert ((security_max > 0) and isinstance(security_max, int)),\ 'security_max must be a positive integer' print print '[source]\n\n\t' + src dst_folder = '..\\input\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '_FOLDER.csv' t_unzip_csv_start = segment_timer.timer(True) print '\n' + 'unzipping folder to \\input' print '\n[destination]\n\n\t' + dst_folder zip = zipfile.ZipFile(src) zip.extractall(dst_folder) zip.close() src_csv = dst_folder + '\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '.csv' dst_csv = '..\\temp\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '.csv' print '\n' + 'copying .csv from \\input (folder) to \\temp' shutil.move(src_csv, dst_csv) print 'deleting redundant folder from \\input' shutil.rmtree(dst_folder) print '\n' + '[warning]\n\n\t .csv \\input datafile is large' print '\n' + 'opening .csv file for line count' data_reader = open(dst_csv, 'r') print '%0.3f seconds to unzip and open file'%(segment_timer.timer(False, t_unzip_csv_start)) # count number of lines in .csv file if security: print '\n[** running in reduced-lines mode **]' max_count = None else: print '\ncounting number of lines in dataset' max_count = 0 t_count_lines_start = segment_timer.timer(True) for line in data_reader: max_count += 1 print '%0.3f seconds to count lines in file'%(segment_timer.timer(False, t_count_lines_start)) print '\nnumber of lines of data (including header):', max_count print 'sleeping for 5 seconds' time.sleep(5) data_reader.close() print '\nre-open .csv file for parse' t_reopen_csv_start = segment_timer.timer(True) data_reader = open(dst_csv, 'r') print '%0.3f seconds to re-open file'%(segment_timer.timer(False, 
t_reopen_csv_start)) print '\nconstruct list of Ticket .csv variable names, in key_list' t_parse_timer = segment_timer.timer(True) key_list = [] for line in data_reader: key_list_raw = line.split('"')[1:-1] for variable_name in key_list_raw: if variable_name != ',': key_list.append(variable_name) else: pass break print 'list of variables in dataset (.csv column order):',\ len(key_list),'variables' print '\n[variables]\n\n\t', for variable_name in key_list: print variable_name, retain_names_list = ['ItinID', 'ItinFare', 'DollarCred', 'BulkFare'] print print '\nretaining following variables (list order):',\ len(retain_names_list), 'variables' print '\n[variables]\n\n\t', for variable_name in retain_names_list: if variable_name in key_list: print variable_name, else: print variable_name, '(not available)' raise Exception('variable_name missing from retain_names_list') print # initialize data dictionary: # key/ variable name from retain_names_list, value/ list (empty by default) # the set of ith elements of the lists corresponds to one itinerary observation data_itin_dict = {} for variable_name in retain_names_list: data_itin_dict[variable_name] = [] print '\nretaining all carriers' count = 1 if not security: t_intermediate_parse = segment_timer.timer(True) print '\nparsing data, percentage completed:' top_count = max_count / 100 print count / top_count # loop over all lines in .csv file for line in data_reader: count += 1 # if in test mode, exit data parse if security and count >= security_max: break if not security: if float(count / top_count) == float(count) / top_count: print str(count / top_count) + '\t', '%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse)) t_intermediate_parse = segment_timer.timer(True) line_split = line.split(',')[:-1] line_data_list = [eval(element) for element in line_split] # build dictionary: append values to lists, for each retained variable # no error trap for multiple occurrences of the same ItinID for name in 
retain_names_list: data_itin_dict[name].append(line_data_list[key_list.index(name)]) print '%0.3f seconds to parse data'%(segment_timer.timer(False, t_parse_timer)) print '\nnumber of retained itineraries:', len(data_itin_dict[data_itin_dict.keys()[0]]) print 'number of lines read:', count print '\nillustrative itineraries:\n' if len(data_itin_dict[data_itin_dict.keys()[0]]) >= 3: for itin_number in xrange(3): for variable in retain_names_list: print variable, data_itin_dict[variable][itin_number], print if not security: print '\ntotal number of lines:', max_count # safe_cPickle Python dictionary ticket_year_quarter print 'save .bin to \\temp' dst = '..\\temp\\' + 'ticket_' + str(year) + '_' + str(quarter) + '.bin' print '\n[temp]\n\n\t' + dst safe_cPickle.safe_cPickle_dump(dst, data_itin_dict) if not security: print 'sleeping for 15 seconds' time.sleep(15) data_reader.close() print '\ndeleting .csv file from \\temp' os.remove(dst_csv) return None
def parse_csv(src, security, security_max, year, quarter):
    """Parse one quarter of DB1B Coupon data from a .zip into a pickle.

    The archive `src` is extracted below ..\\input and the contained .csv
    is moved to ..\\temp (for 2011-2013 it is first passed through
    sort_coupon_csv_2011_on.sort_coupon_csv; years >= 2014 are rejected).
    The file is then read line by line. A line with SeqNum == 1 and
    Coupons == 2 starts a candidate itinerary; following lines with the same
    ItinID are read by an inner loop, and the itinerary is retained when
    `itinerary_condition` (see the numbered list in the body) holds. For a
    retained itinerary the values of the first coupon line are stored for
    every name in `retain_names_list`, except 'Dest', which is the
    destination recorded at the first trip break. The resulting dictionary
    of lists is saved to ..\\temp\\coupon_<year>_<quarter>.bin through
    safe_cPickle.safe_cPickle_dump and the temporary .csv is deleted.

    Parameters:
        src: path of the .zip archive (must be a str).
        security: when True, run in reduced-lines mode (no line count, no
            progress output, stop once `count` reaches `security_max`).
        security_max: positive int, line limit used in reduced-lines mode.
        year, quarter: used in file names, for the sort decision and for the
            final descriptive-statistics call.

    Returns:
        None

    NOTE(review): block nesting was reconstructed from a source whose
    indentation had been lost - confirm against the original file.
    NOTE(review): eval() is applied to every field read from the .csv; only
    use on trusted input.
    """
    print 'parse DB1B Coupon data from .zip to coupon_year_quarter.bin'
    assert isinstance(src, str), 'src must be a string'
    assert isinstance(security, bool), 'security must be a Boolean'
    assert ((security_max > 0) and isinstance(security_max, int)),\
        'security_max must be a positive integer'
    print
    print '[source]\n\n\t' + src
    dst_folder = '..\\input\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '_FOLDER.csv'
    t_unzip_csv_start = segment_timer.timer(True)
    print '\n' + 'unzipping folder to \\input\n'
    print '[destination]\n\n\t' + dst_folder
    zip = zipfile.ZipFile(src)
    zip.extractall(dst_folder)
    zip.close()
    src_csv = dst_folder + '\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '.csv'
    dst_csv = '..\\temp\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '.csv'
    dst_csv_2011q1_to_2013q4_temp = '..\\temp\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '_UNSORTED.csv'
    if year >= 2014:
        raise Exception('sort not implemented', year)
    if 2011 <= year <= 2013:
        # files for these years are sorted before parsing
        print '\n' + '[sort csv for 2011Q1 to 2013Q4 inclusive]'
        print '\n' + 'copying .csv from \\input (folder) to \\temp'
        shutil.move(src_csv, dst_csv_2011q1_to_2013q4_temp)
        sort_coupon_csv_2011_on.sort_coupon_csv(dst_csv_2011q1_to_2013q4_temp, dst_csv, year, quarter)
    else:
        print '\n' + 'copying .csv from \\temp (folder) to \\temp'
        shutil.move(src_csv, dst_csv)
    print 'deleting redundant folder from \\input'
    shutil.rmtree(dst_folder)
    print '\n[warning]\n\n\t .csv \\input datafile is large\n'
    print 'opening .csv file for line count'
    data_reader = open(dst_csv, 'r')
    print '%0.3f seconds to unzip and open file'%(segment_timer.timer(False, t_unzip_csv_start))
    # count number of lines in .csv file
    if security:
        print '\n[** running in reduced-lines mode **]'
        max_count = None
    else:
        print '\n' + 'counting number of lines in dataset'
        max_count = 0
        t_count_lines_start = segment_timer.timer(True)
        for line in data_reader:
            max_count += 1
        print '%0.3f seconds to count lines in file'%(segment_timer.timer(False, t_count_lines_start))
        print 'number of lines of data (including header):', max_count
        print '\n' + 'sleeping for 5 seconds'
        time.sleep(5)
    data_reader.close()
    print '\n' + 're-open .csv file for parse'
    t_reopen_csv_start = segment_timer.timer(True)
    data_reader = open(dst_csv, 'r')
    print '%0.3f seconds to re-open file'%(segment_timer.timer(False, t_reopen_csv_start))
    large_carriers_dict = {'Southwest':'WN', 'American':'AA', 'Continental':'CO',\
                           'Delta':'DL', 'Northwest':'NW', 'Skywest':'OO',\
                           'United':'UA', 'US Airways':'US', 'American Eagle':'MQ',\
                           'Airtran Airways':'FL', 'Express Jet':'EV',\
                           'Jetblue':'B6', 'Alaska Airlines':'AS', 'Endeavor Air':'9E'}
    # hard-coded switch: the large-carrier restriction is disabled
    large_carrier_condition = False
    print '\n' + 'construct list of Coupon .csv variable names, in key_list'
    t_parse_timer = segment_timer.timer(True)
    key_list = []
    # only the first line (the header) is consumed by this loop
    for line in data_reader:
        key_list_raw = line.split('"')[1:-1]
        for variable_name in key_list_raw:
            if variable_name != ',':
                key_list.append(variable_name)
            else:
                pass
        break
    print 'list of variables in dataset (.csv column order):',\
        len(key_list),'variables'
    print '\n[variables]\n\n\t',
    for variable_name in key_list:
        print variable_name,
    retain_names_list = ['ItinID', 'Year', 'Quarter', 'Origin', 'Dest',\
                         'OpCarrier', 'Passengers', 'TkCarrier', 'Distance', 'FareClass']
    print
    print '\n' + 'retaining following variables (list order):',\
        len(retain_names_list), 'variables'
    print '\n' + '[variables]\n\n\t',
    for variable_name in retain_names_list:
        if variable_name in key_list:
            print variable_name,
        else:
            print variable_name, '(not available)'
            raise Exception('variable_name missing from retain_names_list')
    print
    # initialize data dictionary:
    # key/ variable name from retain_names_list, value/ list (empty by default)
    # the set of ith elements of the lists corresponds to one itinerary observation
    data_itin_dict = {}
    for variable_name in retain_names_list:
        data_itin_dict[variable_name] = []
    if large_carrier_condition:
        print '\n' + 'retaining following large carriers:'
        for i in large_carriers_dict:
            print i+' : ',
        print
    else:
        print '\n' + 'retaining all carriers\n'
    count = 1
    if not security:
        t_intermediate_parse = segment_timer.timer(True)
        print 'parsing data, percentage completed:'
        # NOTE(review): integer division - top_count is 0 when the file has
        # fewer than 100 lines, and the divisions below then raise
        # ZeroDivisionError
        top_count = max_count / 100
        print count / top_count
    # loop over all lines in .csv file
    for line in data_reader:
        count += 1
        if not security:
            # true when count is an exact multiple of top_count
            if float(count / top_count) == float(count) / top_count:
                print str(count / top_count)+'\t','%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse))
                t_intermediate_parse = segment_timer.timer(True)
        # line_data_list is a list of evaluated data elements
        # (the last comma-separated piece of the line is discarded)
        line_split = line.split(',')[:-1]
        line_data_list = [eval(element) for element in line_split]
        # new itinerary for retained operating carrier if SeqNum == 1
        # deals with possible incomplete itinerary at start of data_reader (skipped)
        itin_test = line_data_list[key_list.index('ItinID')]
        # if itin_test == eval('201122538709'):
        #     print 'out', line_data_list
        if line_data_list[key_list.index('SeqNum')] == 1 and line_data_list[key_list.index('Coupons')] == 2:
            # number of coupons in newly-identified itinerary
            coupons_new = line_data_list[key_list.index('Coupons')]
            if coupons_new == 1:
                # one-way
                # (cannot be reached: the enclosing test requires Coupons == 2)
                pass
            elif coupons_new == 2:
                # list of data corresponding to newly-identified itinerary
                line_data_list_first = line_data_list[:]
                # carrier_list will contain OpCarrier for each coupon in itinerary
                # to enable constant OpCarrier itineraries to be retained
                # (also applies to fare_class_list, passengers_list, and tk_carrier_list if required)
                carrier_list = [line_data_list[key_list.index('OpCarrier')][:]]
                fare_class_list = [line_data_list[key_list.index('FareClass')][:]]
                passengers_list = [line_data_list[key_list.index('Passengers')]]
                tk_carrier_list = [line_data_list[key_list.index('TkCarrier')][:]]
                coupon_type_list = [line_data_list[key_list.index('CouponType')][:]]
                # if segment has trip break at SeqNum==1, create candidate destination variable
                if line_data_list[key_list.index('Break')] == 'X':
                    count_break = 1
                    line_data_list_dest = line_data_list[key_list.index('Dest')][:]
                else:
                    count_break = 0
                # once new itinerary identified, continue to loop over lines in .csv file
                # (same file object: lines read here are not seen again by
                # the outer loop)
                for line_ in data_reader:
                    count += 1
                    # if in test mode, exit data parse
                    if security and count >= security_max:
                        break
                    if not security:
                        if float(count / top_count) == float(count) / top_count:
                            print str(count / top_count) + '\t', '%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse))
                            t_intermediate_parse = segment_timer.timer(True)
                    line_split_ = line_.split(',')[:-1]
                    line_data_list_ = [eval(element_) for element_ in line_split_]
                    itin_test_ = line_data_list_[key_list.index('ItinID')]
                    # NOTE(review): the line that triggers this break belongs
                    # to a different itinerary and has already been consumed;
                    # the outer loop never examines it
                    if itin_test != itin_test_:
                        break
                    # if itin_test_ == eval('201122538709'):
                    #     print 'return', line_data_list_
                    carrier_list.append(line_data_list_[key_list.index('OpCarrier')][:])
                    fare_class_list.append(line_data_list_[key_list.index('FareClass')][:])
                    passengers_list.append(line_data_list_[key_list.index('Passengers')])
                    tk_carrier_list.append(line_data_list_[key_list.index('TkCarrier')][:])
                    coupon_type_list.append(line_data_list_[key_list.index('CouponType')][:])
                    # if at end of itinerary, create candidate final destination variable
                    # if not at end of itinerary, but segment has trip break,
                    # create candidate destination variable
                    if line_data_list_[key_list.index('SeqNum')] == coupons_new:
                        if line_data_list_[key_list.index('Break')] == 'X':
                            count_break += 1
                            line_data_list_final_dest = line_data_list_[key_list.index('Dest')][:]
                        # break out of inner (line_) for loop
                        # and check whether to retain itinerary
                        break
                    else:
                        if line_data_list_[key_list.index('Break')] == 'X':
                            # NOTE(review): `countBreak` is never assigned
                            # anywhere in this function (the counter is
                            # `count_break`); reaching this line raises
                            # NameError
                            countBreak += 1
                            line_data_list_dest=line_data_list_[key_list.index('Dest')][:]
                        else:
                            pass
                        # NOTE(review): exact nesting of this break could not
                        # be recovered; each candidate placement leaves the
                        # inner loop after this line
                        break
                # retain itinerary if the following conditions are satisfied:
                # 1) 2 trip breaks
                # 2) origin=destination (round-trip)
                # 3) 2 coupons
                # 4) constant OpCarrier
                # 5) constant FareClass
                # 6) constant Passengers
                # 7) constant TkCarrier, same as OpCarrier
                # 8) no 'E' in CouponType (no cabotage on itinerary)
                # 9) legs in lower 48 states only
                # to do: need to check whether OpCarrier code (IATA?) is constant over time
                # or if there is an alternative unique carrier code
                # older versions of code retained all itineraries with <= 4 coupons
                # for large carrier restriction, add condition:
                # and carrier_list[0] in large_carriers_dict.values()
                # NOTE(review): line_data_list_final_dest, line_data_list_dest
                # and itin_test_ are only assigned on some paths above; on the
                # other paths they are unbound or still hold the value from an
                # earlier itinerary
                itinerary_condition = (
                    (count_break == 2)
                    and (line_data_list_first[key_list.index('Origin')] == line_data_list_final_dest)
                    and (line_data_list_first[key_list.index('Coupons')] == 2)
                    and carrier_list.count(carrier_list[0]) == len(carrier_list)
                    and fare_class_list.count(fare_class_list[0]) == len(fare_class_list)
                    and passengers_list.count(passengers_list[0]) == len(passengers_list)
                    and tk_carrier_list.count(tk_carrier_list[0]) == len(tk_carrier_list)
                    and coupon_type_list.count('E') == 0
                    and (line_data_list[key_list.index('ItinGeoType')] == 2)
                    and ((line_data_list[key_list.index('FareClass')] in ['X','Y','C','D','F','G']))
                    and tk_carrier_list[0] == carrier_list[0]
                )
                # if itin_test_ == eval('201122538709'):
                #     print 'testing ItinID 201122538709'
                #     print itinerary_condition
                #     print count_break
                #     print line_data_list_first[key_list.index('Origin')]
                #     print line_data_list_final_dest
                #     print line_data_list_first[key_list.index('Coupons')]
                #     print carrier_list
                #     print fare_class_list
                #     print passengers_list
                #     print tk_carrier_list
                #     print coupon_type_list
                #     print line_data_list[key_list.index('ItinGeoType')]
                #     print line_data_list[key_list.index('FareClass')]
                #     print
                # itinerary_condition_wn_first_class=(
                #     (count_break == 2)
                #     and (line_data_list_first[key_list.index('Origin')] == line_data_list_final_dest)
                #     and (line_data_list_first[key_list.index('Coupons')] == 2)
                #     and carrier_list.count(carrier_list[0]) == len(carrier_list)
                #     and fare_class_list.count(fare_class_list[0]) == len(fare_class_list)
                #     and passengers_list.count(passengers_list[0]) == len(passengers_list)
                #     and tk_carrier_list.count(tk_carrier_list[0]) == len(tk_carrier_list)
                #     and coupon_type_list.count('E') == 0
                #     and (line_data_list[key_list.index('ItinGeoType')] == 2)
                #     and ((line_data_list[key_list.index('FareClass')] in ['F','G']))
                #     and tk_carrier_list[0] == carrier_list[0]
                #     and carrier_list[0] == 'WN'
                #     )
                if large_carrier_condition:
                    itinerary_condition = (itinerary_condition and carrier_list[0] in large_carriers_dict.values())
                    # itinerary_condition_wn_first_class = (itinerary_condition_wn_first_class and carrier_list[0] in large_carriers_dict.values())
                # if (itinerary_condition or itinerary_condition_wn_first_class):
                if itinerary_condition:
                    # if itinerary_condition satisfied, retain all data for itinerary
                    # otherwise, continue searching for a new itinerary in data_reader
                    for name in retain_names_list:
                        if name != 'Dest':
                            data_itin_dict[name].append(line_data_list_first[key_list.index(name)])
                        else:
                            # 'Dest' is the destination recorded at the first
                            # trip break, not the Dest of the first coupon line
                            data_itin_dict['Dest'].append(line_data_list_dest)
                    # debugging output for one hard-coded ItinID
                    if itin_test_ == eval('201122538709'):
                        for key in data_itin_dict.keys():
                            print key, data_itin_dict[key][-1]
                else:
                    # continue with outer (line) for loop
                    continue
                # if itin_test_ == eval('201122538709'):
                #     print data_itin_dict['OpCarrier'].count('WN')
        # if in test mode, exit data parse
        if security and count >= security_max:
            break
    print '\n' + '%0.3f seconds to parse data'%(segment_timer.timer(False, t_parse_timer))
    print '\n' + 'number of retained itineraries:', len(data_itin_dict[data_itin_dict.keys()[0]])
    print 'number of lines read:', count
    print '\n' + '[illustrative itineraries]\n\n\t',
    if len(data_itin_dict[data_itin_dict.keys()[0]]) >= 3:
        for itin_number in xrange(3):
            for variable in retain_names_list:
                print variable, data_itin_dict[variable][itin_number],
            print '\n\t',
    if not security:
        print 'total number of lines:', max_count
    # safe_cPickle Python dictionary coupon_year_quarter
    print '\n' + 'save .bin to \\temp'
    # if large_carrier_condition == True, dst filename will not change
    dst = '..\\temp\\' + 'coupon_' + str(year) + '_' + str(quarter) + '.bin'
    print '\n[temp]\n\n\t' + dst
    safe_cPickle.safe_cPickle_dump(dst, data_itin_dict)
    if not security:
        print 'sleeping for 15 seconds'
        time.sleep(15)
    data_reader.close()
    print '\n' + 'deleting .csv file from \\temp\n'
    os.remove(dst_csv)
    # descriptive statistics for 2013Q4 (any quarter can be called)
    if (year == 2013) and (quarter == 4):
        coupon_descriptives.compute(year, quarter)
    return None