def parse_csv(src, security, security_max, year, quarter): print 'parse DB1B Ticket data from .zip to coupon_year_quarter.bin' assert isinstance(src, str), 'src must be a string' assert isinstance(security, bool), 'security must be a Boolean' assert ((security_max > 0) and isinstance(security_max, int)),\ 'security_max must be a positive integer' print print '[source]\n\n\t' + src dst_folder = '..\\input\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '_FOLDER.csv' t_unzip_csv_start = segment_timer.timer(True) print '\n' + 'unzipping folder to \\input' print '\n[destination]\n\n\t' + dst_folder zip = zipfile.ZipFile(src) zip.extractall(dst_folder) zip.close() src_csv = dst_folder + '\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '.csv' dst_csv = '..\\temp\\Origin_and_Destination_Survey_DB1BTicket_' + str(year) + '_' + str(quarter) + '.csv' print '\n' + 'copying .csv from \\input (folder) to \\temp' shutil.move(src_csv, dst_csv) print 'deleting redundant folder from \\input' shutil.rmtree(dst_folder) print '\n' + '[warning]\n\n\t .csv \\input datafile is large' print '\n' + 'opening .csv file for line count' data_reader = open(dst_csv, 'r') print '%0.3f seconds to unzip and open file'%(segment_timer.timer(False, t_unzip_csv_start)) # count number of lines in .csv file if security: print '\n[** running in reduced-lines mode **]' max_count = None else: print '\ncounting number of lines in dataset' max_count = 0 t_count_lines_start = segment_timer.timer(True) for line in data_reader: max_count += 1 print '%0.3f seconds to count lines in file'%(segment_timer.timer(False, t_count_lines_start)) print '\nnumber of lines of data (including header):', max_count print 'sleeping for 5 seconds' time.sleep(5) data_reader.close() print '\nre-open .csv file for parse' t_reopen_csv_start = segment_timer.timer(True) data_reader = open(dst_csv, 'r') print '%0.3f seconds to re-open file'%(segment_timer.timer(False, t_reopen_csv_start)) print '\nconstruct list of Ticket .csv variable names, in key_list' t_parse_timer = segment_timer.timer(True) key_list = [] for line in data_reader: key_list_raw = line.split('"')[1:-1] for variable_name in key_list_raw: if variable_name != ',': key_list.append(variable_name) else: pass break print 'list of variables in dataset (.csv column order):',\ len(key_list),'variables' print '\n[variables]\n\n\t', for variable_name in key_list: print variable_name, retain_names_list = ['ItinID', 'ItinFare', 'DollarCred', 'BulkFare'] print print '\nretaining following variables (list order):',\ len(retain_names_list), 'variables' print '\n[variables]\n\n\t', for variable_name in retain_names_list: if variable_name in key_list: print variable_name, else: print variable_name, '(not available)' raise Exception('variable_name missing from retain_names_list') print # initialize data dictionary: # key/ variable name from retain_names_list, value/ list (empty by default) # the set of ith elements of the lists corresponds to one itinerary observation data_itin_dict = {} for variable_name in retain_names_list: data_itin_dict[variable_name] = [] print '\nretaining all carriers' count = 1 if not security: t_intermediate_parse = segment_timer.timer(True) print '\nparsing data, percentage completed:' top_count = max_count / 100 print count / top_count # loop over all lines in .csv file for line in data_reader: count += 1 # if in test mode, exit data parse if security and count >= security_max: break if not security: if float(count / top_count) == float(count) / top_count: print str(count / top_count) + '\t', '%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse)) t_intermediate_parse = segment_timer.timer(True) line_split = line.split(',')[:-1] line_data_list = [eval(element) for element in line_split] # build dictionary: append values to lists, for each retained variable # no error trap for multiple occurrences of the same ItinID for name in retain_names_list: data_itin_dict[name].append(line_data_list[key_list.index(name)]) print '%0.3f seconds to parse data'%(segment_timer.timer(False, t_parse_timer)) print '\nnumber of retained itineraries:', len(data_itin_dict[data_itin_dict.keys()[0]]) print 'number of lines read:', count print '\nillustrative itineraries:\n' if len(data_itin_dict[data_itin_dict.keys()[0]]) >= 3: for itin_number in xrange(3): for variable in retain_names_list: print variable, data_itin_dict[variable][itin_number], print if not security: print '\ntotal number of lines:', max_count # safe_cPickle Python dictionary ticket_year_quarter print 'save .bin to \\temp' dst = '..\\temp\\' + 'ticket_' + str(year) + '_' + str(quarter) + '.bin' print '\n[temp]\n\n\t' + dst safe_cPickle.safe_cPickle_dump(dst, data_itin_dict) if not security: print 'sleeping for 15 seconds' time.sleep(15) data_reader.close() print '\ndeleting .csv file from \\temp' os.remove(dst_csv) return None
def parse_csv(src, security, security_max, year, quarter): print 'parse DB1B Coupon data from .zip to coupon_year_quarter.bin' assert isinstance(src, str), 'src must be a string' assert isinstance(security, bool), 'security must be a Boolean' assert ((security_max > 0) and isinstance(security_max, int)),\ 'security_max must be a positive integer' print print '[source]\n\n\t' + src dst_folder = '..\\input\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '_FOLDER.csv' t_unzip_csv_start = segment_timer.timer(True) print '\n' + 'unzipping folder to \\input\n' print '[destination]\n\n\t' + dst_folder zip = zipfile.ZipFile(src) zip.extractall(dst_folder) zip.close() src_csv = dst_folder + '\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '.csv' dst_csv = '..\\temp\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '.csv' dst_csv_2011q1_to_2013q4_temp = '..\\temp\\Origin_and_Destination_Survey_DB1BCoupon_' + str(year) + '_' + str(quarter) + '_UNSORTED.csv' if year >= 2014: raise Exception('sort not implemented', year) if 2011 <= year <= 2013: print '\n' + '[sort csv for 2011Q1 to 2013Q4 inclusive]' print '\n' + 'copying .csv from \\input (folder) to \\temp' shutil.move(src_csv, dst_csv_2011q1_to_2013q4_temp) sort_coupon_csv_2011_on.sort_coupon_csv(dst_csv_2011q1_to_2013q4_temp, dst_csv, year, quarter) else: print '\n' + 'copying .csv from \\temp (folder) to \\temp' shutil.move(src_csv, dst_csv) print 'deleting redundant folder from \\input' shutil.rmtree(dst_folder) print '\n[warning]\n\n\t .csv \\input datafile is large\n' print 'opening .csv file for line count' data_reader = open(dst_csv, 'r') print '%0.3f seconds to unzip and open file'%(segment_timer.timer(False, t_unzip_csv_start)) # count number of lines in .csv file if security: print '\n[** running in reduced-lines mode **]' max_count = None else: print '\n' + 'counting number of lines in dataset' max_count = 0 t_count_lines_start = segment_timer.timer(True) for line in data_reader: max_count += 1 print '%0.3f seconds to count lines in file'%(segment_timer.timer(False, t_count_lines_start)) print 'number of lines of data (including header):', max_count print '\n' + 'sleeping for 5 seconds' time.sleep(5) data_reader.close() print '\n' + 're-open .csv file for parse' t_reopen_csv_start = segment_timer.timer(True) data_reader = open(dst_csv, 'r') print '%0.3f seconds to re-open file'%(segment_timer.timer(False, t_reopen_csv_start)) large_carriers_dict = {'Southwest':'WN', 'American':'AA', 'Continental':'CO',\ 'Delta':'DL', 'Northwest':'NW', 'Skywest':'OO',\ 'United':'UA', 'US Airways':'US', 'American Eagle':'MQ',\ 'Airtran Airways':'FL', 'Express Jet':'EV',\ 'Jetblue':'B6', 'Alaska Airlines':'AS', 'Endeavor Air':'9E'} large_carrier_condition = False print '\n' + 'construct list of Coupon .csv variable names, in key_list' t_parse_timer = segment_timer.timer(True) key_list = [] for line in data_reader: key_list_raw = line.split('"')[1:-1] for variable_name in key_list_raw: if variable_name != ',': key_list.append(variable_name) else: pass break print 'list of variables in dataset (.csv column order):',\ len(key_list),'variables' print '\n[variables]\n\n\t', for variable_name in key_list: print variable_name, retain_names_list = ['ItinID', 'Year', 'Quarter', 'Origin', 'Dest',\ 'OpCarrier', 'Passengers', 'TkCarrier', 'Distance', 'FareClass'] print print '\n' + 'retaining following variables (list order):',\ len(retain_names_list), 'variables' print '\n' + '[variables]\n\n\t', for variable_name in retain_names_list: if variable_name in key_list: print variable_name, else: print variable_name, '(not available)' raise Exception('variable_name missing from retain_names_list') print # initialize data dictionary: # key/ variable name from retain_names_list, value/ list (empty by default) # the set of ith elements of the lists corresponds to one itinerary observation data_itin_dict = {} for variable_name in retain_names_list: data_itin_dict[variable_name] = [] if large_carrier_condition: print '\n' + 'retaining following large carriers:' for i in large_carriers_dict: print i+' : ', print else: print '\n' + 'retaining all carriers\n' count = 1 if not security: t_intermediate_parse = segment_timer.timer(True) print 'parsing data, percentage completed:' top_count = max_count / 100 print count / top_count # loop over all lines in .csv file for line in data_reader: count += 1 if not security: if float(count / top_count) == float(count) / top_count: print str(count / top_count)+'\t','%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse)) t_intermediate_parse = segment_timer.timer(True) # line_data_list is a list of evaluated data elements line_split = line.split(',')[:-1] line_data_list = [eval(element) for element in line_split] # new itinerary for retained operating carrier if SeqNum == 1 # deals with possible incomplete itinerary at start of data_reader (skipped) itin_test = line_data_list[key_list.index('ItinID')] # if itin_test == eval('201122538709'): # print 'out', line_data_list if line_data_list[key_list.index('SeqNum')] == 1 and line_data_list[key_list.index('Coupons')] == 2: # number of coupons in newly-identified itinerary coupons_new = line_data_list[key_list.index('Coupons')] if coupons_new == 1: # one-way pass elif coupons_new == 2: # list of data corresponding to newly-identified itinerary line_data_list_first = line_data_list[:] # carrier_list will contain OpCarrier for each coupon in itinerary # to enable constant OpCarrier itineraries to be retained # (also applies to fare_class_list, passengers_list, and tk_carrier_list if required) carrier_list = [line_data_list[key_list.index('OpCarrier')][:]] fare_class_list = [line_data_list[key_list.index('FareClass')][:]] passengers_list = [line_data_list[key_list.index('Passengers')]] tk_carrier_list = [line_data_list[key_list.index('TkCarrier')][:]] coupon_type_list = [line_data_list[key_list.index('CouponType')][:]] # if segment has trip break at SeqNum==1, create candidate destination variable if line_data_list[key_list.index('Break')] == 'X': count_break = 1 line_data_list_dest = line_data_list[key_list.index('Dest')][:] else: count_break = 0 # once new itinerary identified, continue to loop over lines in .csv file for line_ in data_reader: count += 1 # if in test mode, exit data parse if security and count >= security_max: break if not security: if float(count / top_count) == float(count) / top_count: print str(count / top_count) + '\t', '%0.3f seconds'%(segment_timer.timer(False, t_intermediate_parse)) t_intermediate_parse = segment_timer.timer(True) line_split_ = line_.split(',')[:-1] line_data_list_ = [eval(element_) for element_ in line_split_] itin_test_ = line_data_list_[key_list.index('ItinID')] if itin_test != itin_test_: break # if itin_test_ == eval('201122538709'): # print 'return', line_data_list_ carrier_list.append(line_data_list_[key_list.index('OpCarrier')][:]) fare_class_list.append(line_data_list_[key_list.index('FareClass')][:]) passengers_list.append(line_data_list_[key_list.index('Passengers')]) tk_carrier_list.append(line_data_list_[key_list.index('TkCarrier')][:]) coupon_type_list.append(line_data_list_[key_list.index('CouponType')][:]) # if at end of itinerary, create candidate final destination variable # if not at end of itinerary, but segment has trip break, # create candidate destination variable if line_data_list_[key_list.index('SeqNum')] == coupons_new: if line_data_list_[key_list.index('Break')] == 'X': count_break += 1 line_data_list_final_dest = line_data_list_[key_list.index('Dest')][:] # break out of inner (line_) for loop # and check whether to retain itinerary break else: if line_data_list_[key_list.index('Break')] == 'X': countBreak += 1 line_data_list_dest=line_data_list_[key_list.index('Dest')][:] else: pass break # retain itinerary if the following conditions are satisfied: # 1) 2 trip breaks # 2) origin=destination (round-trip) # 3) 2 coupons # 4) constant OpCarrier # 5) constant FareClass # 6) constant Passengers # 7) constant TkCarrier, same as OpCarrier # 8) no 'E' in CouponType (no cabotage on itinerary) # 9) legs in lower 48 states only # to do: need to check whether OpCarrier code (IATA?) is constant over time # or if there is an alternative unique carrier code # older versions of code retained all itineraries with <= 4 coupons # for large carrier restriction, add condition: # and carrier_list[0] in large_carriers_dict.values() itinerary_condition = ( (count_break == 2) and (line_data_list_first[key_list.index('Origin')] == line_data_list_final_dest) and (line_data_list_first[key_list.index('Coupons')] == 2) and carrier_list.count(carrier_list[0]) == len(carrier_list) and fare_class_list.count(fare_class_list[0]) == len(fare_class_list) and passengers_list.count(passengers_list[0]) == len(passengers_list) and tk_carrier_list.count(tk_carrier_list[0]) == len(tk_carrier_list) and coupon_type_list.count('E') == 0 and (line_data_list[key_list.index('ItinGeoType')] == 2) and ((line_data_list[key_list.index('FareClass')] in ['X','Y','C','D','F','G'])) and tk_carrier_list[0] == carrier_list[0] ) # if itin_test_ == eval('201122538709'): # print 'testing ItinID 201122538709' # print itinerary_condition # print count_break # print line_data_list_first[key_list.index('Origin')] # print line_data_list_final_dest # print line_data_list_first[key_list.index('Coupons')] # print carrier_list # print fare_class_list # print passengers_list # print tk_carrier_list # print coupon_type_list # print line_data_list[key_list.index('ItinGeoType')] # print line_data_list[key_list.index('FareClass')] # print # itinerary_condition_wn_first_class=( # (count_break == 2) # and (line_data_list_first[key_list.index('Origin')] == line_data_list_final_dest) # and (line_data_list_first[key_list.index('Coupons')] == 2) # and carrier_list.count(carrier_list[0]) == len(carrier_list) # and fare_class_list.count(fare_class_list[0]) == len(fare_class_list) # and passengers_list.count(passengers_list[0]) == len(passengers_list) # and tk_carrier_list.count(tk_carrier_list[0]) == len(tk_carrier_list) # and coupon_type_list.count('E') == 0 # and (line_data_list[key_list.index('ItinGeoType')] == 2) # and ((line_data_list[key_list.index('FareClass')] in ['F','G'])) # and tk_carrier_list[0] == carrier_list[0] # and carrier_list[0] == 'WN' # ) if large_carrier_condition: itinerary_condition = (itinerary_condition and carrier_list[0] in large_carriers_dict.values()) # itinerary_condition_wn_first_class = (itinerary_condition_wn_first_class and carrier_list[0] in large_carriers_dict.values()) # if (itinerary_condition or itinerary_condition_wn_first_class): if itinerary_condition: # if itinerary_condition satisfied, retain all data for itinerary # otherwise, continue searching for a new itinerary in data_reader for name in retain_names_list: if name != 'Dest': data_itin_dict[name].append(line_data_list_first[key_list.index(name)]) else: data_itin_dict['Dest'].append(line_data_list_dest) if itin_test_ == eval('201122538709'): for key in data_itin_dict.keys(): print key, data_itin_dict[key][-1] else: # continue with outer (line) for loop continue # if itin_test_ == eval('201122538709'): # print data_itin_dict['OpCarrier'].count('WN') # if in test mode, exit data parse if security and count >= security_max: break print '\n' + '%0.3f seconds to parse data'%(segment_timer.timer(False, t_parse_timer)) print '\n' + 'number of retained itineraries:', len(data_itin_dict[data_itin_dict.keys()[0]]) print 'number of lines read:', count print '\n' + '[illustrative itineraries]\n\n\t', if len(data_itin_dict[data_itin_dict.keys()[0]]) >= 3: for itin_number in xrange(3): for variable in retain_names_list: print variable, data_itin_dict[variable][itin_number], print '\n\t', if not security: print 'total number of lines:', max_count # safe_cPickle Python dictionary coupon_year_quarter print '\n' + 'save .bin to \\temp' # if large_carrier_condition == True, dst filename will not change dst = '..\\temp\\' + 'coupon_' + str(year) + '_' + str(quarter) + '.bin' print '\n[temp]\n\n\t' + dst safe_cPickle.safe_cPickle_dump(dst, data_itin_dict) if not security: print 'sleeping for 15 seconds' time.sleep(15) data_reader.close() print '\n' + 'deleting .csv file from \\temp\n' os.remove(dst_csv) # descriptive statistics for 2013Q4 (any quarter can be called) if (year == 2013) and (quarter == 4): coupon_descriptives.compute(year, quarter) return None