def process_file(yymm):
    """Build and pickle the AP/NS crossing-time tables for one month of logs.

    yymm: 4-char month key, e.g. '0902' (Feb 2009).  Skips the month entirely
    when both output pickles already exist.
    """
    ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
        return None
    print 'handle the file; %s' % yymm
    veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
    veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
    # Months that open a data period have no previous month to seed from;
    # all other months are seeded with the last day of the prior month so
    # that crossings spanning midnight on the month boundary are counted.
    if yymm not in ['0901', '1001', '1011']:
        path_to_last_day_csv_file = None
        temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix, '.csv')
        prev_fn = None
        y, m = int(yymm[:2]), int(yymm[2:])
        prev_m = m - 1
        prev_yymm = '%02d%02d' % (y, prev_m)
        for temp_fn in temp_csv_files:
            if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)):
                prev_fn = temp_fn
                break
        # a missing previous-month file is a data-integrity problem; stop here
        assert prev_fn, yymm
        path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn)
        # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1:
        #     return None
        veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
            record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
    # process the month's own log file on top of the seeded state
    path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
        record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                             veh_ns_crossing_time, veh_last_log_ns_or_not)
    #
    save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
    save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
    print 'end the file; %s' % yymm
def run():
    """Create the 2009 baseline countGraph directory, then feed every
    2009 tfZ_TP csv to process_file.

    Fix: `yyyy` was defined only in a commented-out line, so the loop below
    raised NameError; the assignment is restored.
    """
    check_dir_create(dpaths['baseline', '2009', 'countGraph'])
    #
    yyyy = '20%02d' % 9  # restored: was commented out although used below
    for tfZ_TP_fn in get_all_files(tfZ_TP_dpath, '%s%s*.csv' % (tfZ_TP_prefix, yyyy)):
        tfZ_TP_fpath = '%s/%s' % (tfZ_TP_dpath, tfZ_TP_fn)
        process_file(tfZ_TP_fpath)
def run():
    """Write a per-driver group-evolution table: one row per ss-driver with
    the spendingTime group name (or 'X') for each year 2009-2012."""
    YEARS = ['2009', '2010', '2011', '2012']
    tm = 'spendingTime'
    yearDriver_gn = {}          # (year, driver id) -> group name
    whole_ss_drivers = set()    # union of ss drivers over all years
    for year in YEARS:
        gp_dpath = dpaths[tm, year, 'groupPartition']
        gp_prefix = prefixs[tm, year, 'groupPartition']
        membership = load_pickle_file('%s/%sdrivers.pkl' % (gp_dpath, gp_prefix))
        for gname, members in membership.iteritems():
            for driver_id in members:
                yearDriver_gn[year, driver_id] = gname
        short_year = year[2:]
        for fname in get_all_files(ss_drivers_dpath, '%s%s*.pkl' % (ss_drivers_prefix, short_year)):
            for driver_id in load_pickle_file('%s/%s' % (ss_drivers_dpath, fname)):
                whole_ss_drivers.add(driver_id)
    with open(groupEvolution_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['did', 'G2009', 'G2010', 'G2011', 'G2012'])
        for driver_id in whole_ss_drivers:
            # 'X' marks a year in which the driver was not assigned to any group
            row = [driver_id]
            for year in YEARS:
                row.append(yearDriver_gn.get((year, driver_id), 'X'))
            writer.writerow(row)
def run(processorNum):
    """Worker entry point: handle every input csv whose position modulo
    numWorker equals this worker's id."""
    all_fns = get_all_files(if_dpath, '%s%s*.csv' % (if_prefix, year))
    for idx, fname in enumerate(all_fns):
        if idx % numWorker != processorNum:
            continue  # belongs to a different worker
        process_file('%s/%s' % (if_dpath, fname))
def get_driver_trajectory(did):
    """Return the (datetime, x, y, state) trajectory for one driver,
    caching the result as a pickle keyed by driver id.

    Scans every per-day csv (filename: <prefix><yymmdd>-<did>.csv), collects
    the dates belonging to `did`, then reads those days in order.
    """
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        dt_xy_state = load_pickle_file(ofpath)
    else:
        dates = []
        for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
            _, _date, _did = fn[:-len('.csv')].split('-')
            if int(_did) != did:
                continue
            year = 2000 + int(_date[:2])
            month, day = map(int, [_date[2:4], _date[4:6]])
            dt = datetime.datetime(year, month, day)
            dates += [dt]
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    # NOTE(review): eval() is used to parse numeric fields; on
                    # untrusted csv data this is unsafe — float()/int() would do.
                    dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
def process_files(yyyy, reducerID, driver_subset, pickUp_drivers):
    """Reducer: build the tfZ_TP presence matrix csv for one driver subset.

    Writes one row per qualifying trip of a driver in `driver_subset`, with an
    O/X presence flag column for every driver in `pickUp_drivers`.  On failure
    the traceback is dumped to <script>_<yyyy>.txt and the exception re-raised.

    Fix: the `try:` line had been commented out while its `except` clause
    remained, which is a SyntaxError; the try is restored.
    """
    from traceback import format_exc
    try:
        logger.info('Handle arrange %s(%d)' % (yyyy, reducerID))
        tfZ_TP_fpath = '%s/%s%s-%d.csv' % (tfZ_TP_dpath, tfZ_TP_prefix, yyyy, reducerID)
        with open(tfZ_TP_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did',
                      'spendingTime', 'roamingTime']
            for did0 in pickUp_drivers:
                header.append(did0)
            writer.writerow(header)
        yy = yyyy[2:]
        for fn in get_all_files(prevDriversDefined_dpath,
                                'Filtered-%s%s*.csv' % (prevDriversDefined_prefix, yy)):
            prevDriverDefined_fpath = '%s/%s' % (prevDriversDefined_dpath, fn)
            logger.info('Handling %s(%d); %s' % (yyyy, reducerID, fn))
            with open(prevDriverDefined_fpath, 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                header = reader.next()
                hid = {h: i for i, h in enumerate(header)}
                handling_day = 0
                for row in reader:
                    cur_dtT = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    if handling_day != cur_dtT.day:
                        # progress log once per calendar day of data
                        handling_day = cur_dtT.day
                        logger.info('Processing %s %dth day; reducer %d' % (fn, cur_dtT.day, reducerID))
                    did1 = int(row[hid['did']])
                    if did1 not in driver_subset:
                        continue
                    _prevDrivers = row[hid['prevDrivers']].split('&')
                    if len(_prevDrivers) == 1 and _prevDrivers[0] == '':
                        continue  # no preceding drivers recorded
                    prevDrivers = map(int, _prevDrivers)
                    tf = row[hid['timeFrame']]
                    zi, zj = row[hid['zi']], row[hid['zj']]
                    tfZ = '(%s,%s,%s)' % (tf, zi, zj)
                    with open(tfZ_TP_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [row[hid['month']], row[hid['day']], tf, zi, zj, tfZ, did1,
                                   row[hid['spendingTime']], row[hid['roamingTime']]]
                        for did0 in pickUp_drivers:
                            new_row.append(O_PRESENCE if did0 in prevDrivers else X_PRESENCE)
                        writer.writerow(new_row)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yyyy), 'w') as f:
            f.write(format_exc())
        raise
def run():
    """Write groupEvolution csv: per ss-driver, the spendingTime group name
    for each year 2009-2012 ('X' when the driver had no group that year)."""
    yearDriver_gn = {}        # (year, driver id) -> group name
    whole_ss_drivers = set()  # union of ss drivers across all years
    tm = 'spendingTime'
    for year in ['2009', '2010', '2011', '2012']:
        gp_dpath = dpaths[tm, year, 'groupPartition']
        gp_prefix = prefixs[tm, year, 'groupPartition']
        gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
        gp_drivers = load_pickle_file(gp_drivers_fpath)
        for gn, drivers in gp_drivers.iteritems():
            for did in drivers:
                yearDriver_gn[year, did] = gn
        yy = year[2:]
        for fn in get_all_files(ss_drivers_dpath, '%s%s*.pkl' % (ss_drivers_prefix, yy)):
            ss_drivers_fpath = '%s/%s' % (ss_drivers_dpath, fn)
            ss_drivers = load_pickle_file(ss_drivers_fpath)
            for did in ss_drivers:
                whole_ss_drivers.add(did)
    with open(groupEvolution_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['did', 'G2009', 'G2010', 'G2011', 'G2012']
        writer.writerow(header)
        for did in whole_ss_drivers:
            new_row = [did]
            for year in ['2009', '2010', '2011', '2012']:
                k = (year, did)
                if yearDriver_gn.has_key(k):
                    gn = yearDriver_gn[k]
                else:
                    gn = 'X'  # not grouped that year
                new_row += [gn]
            writer.writerow(new_row)
def run():
    """Aggregate trip records into yearly airport dropoff->pickup flow counts
    (2009 and 2010), one output csv per year."""
    def summary(write_fpath, read_fpath):
        # Count trips per (date, day-of-week, hour, pickup-AP, prev-dropoff-AP)
        # and append the counts to the yearly flow file.
        logger.info('start the file; %s' % read_fpath.split('/')[-1])
        num_statistics = {}
        with open(read_fpath, 'rt') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                year, month, day, hour = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day', 'hour']])
                cur_dt = datetime.datetime(year, month, day, hour)
                dow = cur_dt.strftime("%a")
                pickUpTerminalAP, prevEndTerminalAP = row[hid['pickUpTerminalAP']], row[hid['prevEndTerminalAP']]
                k = (year, month, day, dow, hour, pickUpTerminalAP, prevEndTerminalAP)
                if not num_statistics.has_key(k):
                    num_statistics[k] = 0
                num_statistics[k] += 1
        for (year, month, day, dow, hour, pickUpTerminalAP, prevEndTerminalAP), num in num_statistics.iteritems():
            with open(write_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                # note column order: prevEnd before pickUp, matching the header
                writer.writerow([year, month, day, dow, hour, prevEndTerminalAP, pickUpTerminalAP, num])
        logger.info('end the file; %s' % read_fpath.split('/')[-1])
    #
    for y in xrange(9, 11):
        yyyy = str(2000 + y)
        yy = '%02d' % y
        logger.info('Start; %s' % yyyy)
        write_fpath = '%s/%s%s.csv' % (trip_dpath, trip_ap_dp_flow_prefix, yyyy)
        with open(write_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'dayOfWeek', 'hour', 'prevEndTerminalAP', 'pickUpTerminalAP', 'totalNum']
            writer.writerow(header)
        for fn in get_all_files(trip_dpath, '%s%s*' % (trip_prefix, yy)):
            read_fpath = '%s/%s' % (trip_dpath, fn)
            summary(write_fpath, read_fpath)
def process_file(yymm): ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm) ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm) if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath): return None print 'handle the file; %s' % yymm veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {} veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {} if yymm not in ['0901', '1001', '1011']: path_to_last_day_csv_file = None temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix, '.csv') prev_fn = None y, m = int(yymm[:2]), int(yymm[2:]) prev_m = m - 1 prev_yymm = '%02d%02d' %(y, prev_m) for temp_fn in temp_csv_files: if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)): prev_fn = temp_fn break assert prev_fn, yymm path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn) # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1: # return None veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \ record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not) path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm) veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \ record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not) # save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time) save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time) print 'end the file; %s' % yymm
def process_tripBased():
    """Build per-trip airport statistics csv for each year 2009-2010 with
    one-hot drop/pick location columns.

    Fix: when a year's output already existed the loop used `return`, which
    also skipped every later year; `continue` processes the remaining years.
    """
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        #
        statistics_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversTrip_ap_prefix, yyyy)
        if check_path_exist(statistics_fpath):
            logger.info('The file had already been processed; %s' % yyyy)
            continue  # was `return`: aborted the remaining years
        yy = yyyy[2:]
        holidays = HOLIDAYS2009 if yyyy == '2009' else HOLIDAYS2010
        with open(statistics_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'hour', 'weekEnd', 'driverID',
                      'locQTime', 'locEP', 'locDuration', 'locFare', 'locProductivity', 'locIn']
            # one indicator column per (drop location, pick location) pair
            drop_pick_cns = []
            for l0 in locations:
                for l1 in locations:
                    cn = 'D%s#P%s' % (l0, l1)
                    drop_pick_cns.append(cn)
                    header.append(cn)
            writer.writerow(header)
            for fn in get_all_files(economicProfit_ap_dpath, '%s%s*' % (economicProfit_ap_prefix, yy)):
                with open('%s/%s' % (economicProfit_ap_dpath, fn), 'rt') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    headers = reader.next()
                    hid = {h: i for i, h in enumerate(headers)}
                    for row in reader:
                        year, month, day, hour = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day', 'hour']])
                        did = int(row[hid['did']])
                        locQTime = float(row[hid['queueingTime']]) / SEC60
                        locEP = float(row[hid['economicProfit']]) / CENT
                        locDuration = float(row[hid['duration']]) / SEC60
                        locFare = float(row[hid['fare']]) / CENT
                        locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                        locIn = 1 if int(row[hid['tripMode']]) == DIn_PIn else 0
                        # weekend flag covers public holidays as well
                        weekEnd = 0
                        if (year, month, day) in holidays:
                            weekEnd = 1
                        if datetime.datetime(year, month, day).weekday() in WEEKENDS:
                            weekEnd = 1
                        l0, l1 = row[hid['prevEndTerminalAP']], row[hid['pickUpTerminalAP']]
                        drop_pick = 'D%s#P%s' % (l0, l1)
                        new_row = [year, month, day, hour, weekEnd, did,
                                   locQTime, locEP, locDuration, locFare, locProductivity, locIn]
                        for dp_candidate in drop_pick_cns:
                            if dp_candidate == drop_pick:
                                new_row.append(1)
                            else:
                                new_row.append(0)
                        writer.writerow(new_row)
def run():
    """Create the 2009 baseline countGraph directory and process every
    2009 tfZ_TP csv.

    Fix: `yyyy` was only defined in a commented-out line, so the loop raised
    NameError; the assignment is restored.
    """
    check_dir_create(dpaths['baseline', '2009', 'countGraph'])
    #
    yyyy = '20%02d' % 9  # restored: was commented out although used below
    for tfZ_TP_fn in get_all_files(tfZ_TP_dpath, '%s%s*.csv' % (tfZ_TP_prefix, yyyy)):
        tfZ_TP_fpath = '%s/%s' % (tfZ_TP_dpath, tfZ_TP_fn)
        process_file(tfZ_TP_fpath)
def filtering(year):
    """Write a 'Filtered-' copy of every prevDriversDefined csv for `year`,
    dropping rows whose spendingTime exceeds MINUTES40."""
    short_year = year[2:]
    name_pattern = '%s%s*' % (prevDriversDefined_prefix, short_year)
    for fname in get_all_files(prevDriversDefined_dpath, name_pattern):
        frame = pd.read_csv('%s/%s' % (prevDriversDefined_dpath, fname))
        column = 'spendingTime'
        outlier_rows = set(np.where(frame[column] > MINUTES40)[0].tolist())
        kept = frame.drop(frame.index[list(outlier_rows)])
        kept.to_csv('%s/Filtered-%s' % (prevDriversDefined_dpath, fname), index=False)
def process_file(tm, year):
    """Write a one-row-per-group summary csv of day-level driver averages,
    including splits by prior-driver presence (priorO / priorX)."""
    logger.info('handle the file; %s-%s' % (tm, year))
    gds_dpath = dpaths[tm, year, 'groupDayStats']
    gds_prefix = prefixs[tm, year, 'groupDayStats']
    gds_fpath = '%s/%s%s.csv' % (gds_dpath, gds_prefix, 'summary')
    with open(gds_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['groupName', 'numDrivers', 'numTrips', 'proDur', 'fare',
                  'fare/Trip', 'distance/Trip', 'duration/Trip',
                  'spendingTime', 'spendingTime/Trip',
                  'priorOnumTrips', 'priorO_ST', 'priorO_ST/Trip',
                  'priorXnumTrips', 'priorX_ST', 'priorX_ST/Trip']
        writer.writerow(header)
    gt_dpath = dpaths[tm, year, 'groupTrips']
    gt_prefix = prefixs[tm, year, 'groupTrips']
    gs_dpath = dpaths[tm, year, 'groupShifts']
    gs_prefix = prefixs[tm, year, 'groupShifts']
    for fn in get_all_files(gt_dpath, '%s*.csv' % gt_prefix):
        # only names with exactly 4 dash-separated parts are per-group files
        if len(fn[:-len('.csv')].split('-')) != 4:
            continue
        _, _, _, gn = fn[:-len('.csv')].split('-')
        gt_fpath = '%s/%s' % (gt_dpath, fn)
        gs_fpath = '%s/%s%s.csv' % (gs_dpath, gs_prefix, gn)
        gt_df = pd.read_csv(gt_fpath)
        gs_df = pd.read_csv(gs_fpath)
        numDrivers = len(set(gt_df['did']))
        # mean over (driver, day) cells; 'groupName' is a non-null column
        # used purely for row counting
        numTrips = gt_df.groupby(['year', 'month', 'day', 'did']).count().reset_index()['groupName'].mean()
        proDur = gs_df.groupby(['year', 'month', 'day', 'did']).sum().reset_index()['pro-dur'].mean()
        distance = gt_df.groupby(['year', 'month', 'day', 'did']).sum().reset_index()['distance'].mean()
        duration = gt_df.groupby(['year', 'month', 'day', 'did']).sum().reset_index()['duration'].mean()
        fare = gt_df.groupby(['year', 'month', 'day', 'did']).sum().reset_index()['fare'].mean()
        distance_trip = distance / float(numTrips)
        duration_trip = duration / float(numTrips)
        fare_trip = fare / float(numTrips)
        spendingTime = gt_df.groupby(['year', 'month', 'day', 'did']).sum().reset_index()['spendingTime'].mean()
        spendingTime_trip = spendingTime / float(numTrips)
        #
        # trips where a prior driver was present
        priorO_gt_df = gt_df[(gt_df['priorPresence'] == 1)]
        priorOnumTrips = priorO_gt_df.groupby(['year', 'month', 'day',
                                               'did']).count().reset_index()['groupName'].mean()
        priorO_ST = priorO_gt_df.groupby(['year', 'month', 'day', 'did']).sum().reset_index()['spendingTime'].mean()
        priorO_ST_trip = priorO_ST / float(priorOnumTrips)
        #
        # trips with no prior driver present
        priorX_gt_df = gt_df[(gt_df['priorPresence'] == 0)]
        priorXnumTrips = priorX_gt_df.groupby(['year', 'month', 'day', 'did']).count().reset_index()['groupName'].mean()
        priorX_ST = priorX_gt_df.groupby(['year', 'month', 'day', 'did']).sum().reset_index()['spendingTime'].mean()
        priorX_ST_trip = priorX_ST / float(priorXnumTrips)
        with open(gds_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [gn, numDrivers, numTrips, proDur, fare,
                       fare_trip, distance_trip, duration_trip,
                       spendingTime, spendingTime_trip,
                       priorOnumTrips, priorO_ST, priorO_ST_trip,
                       priorXnumTrips, priorX_ST, priorX_ST_trip]
            writer.writerow(new_row)
def run():
    """Create influenceGraph output dirs for 2009-2012, then feed every 2009
    tfZ_TP csv to process_file.

    Fix: both `tm` and `yyyy` were defined only in commented-out lines while
    still being used below (NameError); the assignments are restored, matching
    the working sibling run() in this file.
    """
    ir = 'influenceGraph'
    tm = 'spendingTime'  # restored: was commented out although used below
    for year in ['2009', '2010', '2011', '2012']:
        check_dir_create(dpaths[tm, year, ir])
    #
    yyyy = '20%02d' % 9  # restored: was commented out although used below
    for tfZ_TP_fn in get_all_files(tfZ_TP_dpath, '%s%s*.csv' % (tfZ_TP_prefix, yyyy)):
        tfZ_TP_fpath = '%s/%s' % (tfZ_TP_dpath, tfZ_TP_fn)
        process_file(tfZ_TP_fpath)
def process_tripBased():
    """Build per-trip night-shift statistics csvs (15-17h and 20-23h bands)
    for 2009-2010; each trip row is routed to the band matching its hour.

    Fix: the 'handle the file' log line was duplicated; one copy removed.
    """
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        #
        statistics1517_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversTrip_ns1517_prefix, yyyy)
        statistics2023_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversTrip_ns2023_prefix, yyyy)
        #
        yy = yyyy[2:]
        holidays = HOLIDAYS2009 if yyyy == '2009' else HOLIDAYS2010
        # (re)create both band files with just the header
        for statistics_fpath in [statistics1517_fpath, statistics2023_fpath]:
            with open(statistics_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                header = ['year', 'month', 'day', 'hour', 'driverID',
                          'locQTime', 'locEP', 'locDuration', 'locFare',
                          'locProductivity', 'locIn', 'weekEnd']
                writer.writerow(header)
        for fn in get_all_files(economicProfit_ns_dpath, '%s%s*' % (economicProfit_ns_prefix, yy)):
            with open('%s/%s' % (economicProfit_ns_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day, hour = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day', 'hour']])
                    did = int(row[hid['did']])
                    locQTime = float(row[hid['queueingTime']]) / SEC60
                    locEP = float(row[hid['economicProfit']]) / CENT
                    locDuration = float(row[hid['duration']]) / SEC60
                    locFare = float(row[hid['fare']]) / CENT
                    locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                    locIn = 1 if int(row[hid['tripMode']]) == DIn_PIn else 0
                    # weekend flag covers public holidays too
                    weekEnd = 0
                    if (year, month, day) in holidays:
                        weekEnd = 1
                    if datetime.datetime(year, month, day).weekday() in WEEKENDS:
                        weekEnd = 1
                    # route the row to the matching time band, or skip it
                    if hour in tf_ns1517:
                        statistics_fpath = statistics1517_fpath
                    elif hour in tf_ns2023:
                        statistics_fpath = statistics2023_fpath
                    else:
                        continue
                    with open(statistics_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [year, month, day, hour, did,
                                   locQTime, locEP, locDuration, locFare,
                                   locProductivity, locIn, weekEnd]
                        writer.writerow(new_row)
def run():
    """Create influenceGraph output dirs for 2009-2012, then hand each
    2009 tfZ_TP csv to process_file."""
    graph_kind = 'influenceGraph'
    # for tm in ['spendingTime', 'roamingTime']:
    for tm in ['spendingTime']:
        for year in ['2009', '2010', '2011', '2012']:
            check_dir_create(dpaths[tm, year, graph_kind])
    yyyy = '20%02d' % 9
    name_pattern = '%s%s*.csv' % (tfZ_TP_prefix, yyyy)
    for fname in get_all_files(tfZ_TP_dpath, name_pattern):
        process_file('%s/%s' % (tfZ_TP_dpath, fname))
def process_files(yyyy, reducerID, driver_subset, pickUp_drivers):
    """Reducer: build the tfZ_TP presence matrix csv for one driver subset;
    on failure, dump the traceback to <script>_<yyyy>.txt and re-raise.

    Fix: the `try:` line had been commented out while its `except` clause
    remained, which is a SyntaxError; the try is restored.
    """
    from traceback import format_exc
    try:
        logger.info('Handle arrange %s(%d)' % (yyyy, reducerID))
        tfZ_TP_fpath = '%s/%s%s-%d.csv' % (tfZ_TP_dpath, tfZ_TP_prefix, yyyy, reducerID)
        with open(tfZ_TP_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did',
                      'spendingTime', 'roamingTime']
            for did0 in pickUp_drivers:
                header.append(did0)
            writer.writerow(header)
        yy = yyyy[2:]
        for fn in get_all_files(prevDriversDefined_dpath,
                                'Filtered-%s%s*.csv' % (prevDriversDefined_prefix, yy)):
            prevDriverDefined_fpath = '%s/%s' % (prevDriversDefined_dpath, fn)
            logger.info('Handling %s(%d); %s' % (yyyy, reducerID, fn))
            with open(prevDriverDefined_fpath, 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                header = reader.next()
                hid = {h: i for i, h in enumerate(header)}
                handling_day = 0
                for row in reader:
                    cur_dtT = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    if handling_day != cur_dtT.day:
                        # progress log once per calendar day of data
                        handling_day = cur_dtT.day
                        logger.info('Processing %s %dth day; reducer %d' % (fn, cur_dtT.day, reducerID))
                    did1 = int(row[hid['did']])
                    if did1 not in driver_subset:
                        continue
                    _prevDrivers = row[hid['prevDrivers']].split('&')
                    if len(_prevDrivers) == 1 and _prevDrivers[0] == '':
                        continue  # no preceding drivers recorded
                    prevDrivers = map(int, _prevDrivers)
                    tf = row[hid['timeFrame']]
                    zi, zj = row[hid['zi']], row[hid['zj']]
                    tfZ = '(%s,%s,%s)' % (tf, zi, zj)
                    with open(tfZ_TP_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [row[hid['month']], row[hid['day']], tf, zi, zj, tfZ, did1,
                                   row[hid['spendingTime']], row[hid['roamingTime']]]
                        for did0 in pickUp_drivers:
                            new_row.append(O_PRESENCE if did0 in prevDrivers else X_PRESENCE)
                        writer.writerow(new_row)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yyyy), 'w') as f:
            f.write(format_exc())
        raise
def process_file(tm, year):
    """Write a one-row-per-group summary csv of day-level driver averages
    (no prior-presence split in this variant)."""
    logger.info('handle the file; %s-%s' % (tm, year))
    gds_dpath = dpaths[tm, year, 'groupDayStats']
    gds_prefix = prefixs[tm, year, 'groupDayStats']
    gds_fpath = '%s/%s%s.csv' % (gds_dpath, gds_prefix, 'summary')
    with open(gds_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = [
            'groupName', 'numDrivers', 'numTrips', 'proDur', 'fare',
            'fare/Trip', 'distance/Trip', 'duration/Trip', 'spendingTime',
            'spendingTime/Trip'
        ]
        writer.writerow(header)
    gt_dpath = dpaths[tm, year, 'groupTrips']
    gt_prefix = prefixs[tm, year, 'groupTrips']
    gs_dpath = dpaths[tm, year, 'groupShifts']
    gs_prefix = prefixs[tm, year, 'groupShifts']
    #
    for fn in get_all_files(gt_dpath, '%s*.csv' % gt_prefix):
        # only names with exactly 4 dash-separated parts are per-group files
        if len(fn[:-len('.csv')].split('-')) != 4:
            continue
        _, _, _, gn = fn[:-len('.csv')].split('-')
        gt_fpath = '%s/%s' % (gt_dpath, fn)
        gs_fpath = '%s/%s%s.csv' % (gs_dpath, gs_prefix, gn)
        gt_df = pd.read_csv(gt_fpath)
        gs_df = pd.read_csv(gs_fpath)
        numDrivers = len(set(gt_df['did']))
        # mean over (driver, day) cells; 'groupName' is a non-null column
        # used purely for row counting
        numTrips = gt_df.groupby(['year', 'month', 'day', 'did'
                                  ]).count().reset_index()['groupName'].mean()
        proDur = gs_df.groupby(['year', 'month', 'day',
                                'did']).sum().reset_index()['pro-dur'].mean()
        distance = gt_df.groupby(['year', 'month', 'day', 'did'
                                  ]).sum().reset_index()['distance'].mean()
        duration = gt_df.groupby(['year', 'month', 'day', 'did'
                                  ]).sum().reset_index()['duration'].mean()
        fare = gt_df.groupby(['year', 'month', 'day',
                              'did']).sum().reset_index()['fare'].mean()
        distance_trip = distance / float(numTrips)
        duration_trip = duration / float(numTrips)
        fare_trip = fare / float(numTrips)
        spendingTime = gt_df.groupby(
            ['year', 'month', 'day',
             'did']).sum().reset_index()['spendingTime'].mean()
        spendingTime_trip = spendingTime / float(numTrips)
        with open(gds_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [
                gn, numDrivers, numTrips, proDur, fare, fare_trip,
                distance_trip, duration_trip, spendingTime, spendingTime_trip
            ]
            writer.writerow(new_row)
def run():
    """Project the all-drivers statistics csvs onto the ss-driver subset.

    Builds the union of monthly ss-driver pickles for 2009-2010, then copies
    each all-drivers statistics file keeping only rows whose driverID is in
    that set.  The ap-related entries are deliberately commented out; only
    the ns pipelines are active here.
    """
    for dpath in [
        # statisticsSsDrivers_ap_dpath,
        statisticsSsDrivers_ns_dpath
    ]:
        check_dir_create(dpath)
    #
    # union of the monthly ss-driver pickles across 2009-2010
    ssDrivers = set()
    for y in xrange(9, 11):
        for m in xrange(1, 13):
            yymm = '%02d%02d' % (y, m)
            if yymm in ['0912', '1010']:
                # both years data are corrupted
                continue
            ssDrivers = ssDrivers.union(load_pickle_file('%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)))
    #
    for all_dpath, ss_dpath in [
        # (statisticsAllDrivers_ap_dpath, statisticsSsDrivers_ap_dpath),
        (statisticsAllDrivers_ns_dpath, statisticsSsDrivers_ns_dpath)
    ]:
        for all_prefix, ss_prefix in [
            # (statisticsAllDriversDay_ap_prefix, statisticsSsDriversDay_ap_prefix),
            (statisticsAllDriversDay_ns1517_prefix, statisticsSsDriversDay_ns1517_prefix),
            (statisticsAllDriversDay_ns2023_prefix, statisticsSsDriversDay_ns2023_prefix),
            # (statisticsAllDriversMonth_ap_prefix, statisticsSsDriversMonth_ap_prefix),
            (statisticsAllDriversMonth_ns1517_prefix, statisticsSsDriversMonth_ns1517_prefix),
            (statisticsAllDriversMonth_ns2023_prefix, statisticsSsDriversMonth_ns2023_prefix),
            # (statisticsAllDriversTrip_ap_prefix, statisticsSsDriversTrip_ap_prefix),
            (statisticsAllDriversTrip_ns1517_prefix, statisticsSsDriversTrip_ns1517_prefix),
            (statisticsAllDriversTrip_ns2023_prefix, statisticsSsDriversTrip_ns2023_prefix),
        ]:
            for fn in get_all_files(all_dpath, '%s*' % all_prefix):
                # filename shape assumed: <a>-<b>-<period>....csv — TODO confirm
                period = fn[:-len('.csv')].split('-')[2]
                with open('%s/%s' % (all_dpath, fn), 'rt') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    header = reader.next()
                    hid = {h: i for i, h in enumerate(header)}
                    with open('%s/%s%s.csv' % (ss_dpath, ss_prefix, period), 'wt') as w_csvfile:
                        writer = csv.writer(w_csvfile)
                        writer.writerow(header)
                        for row in reader:
                            did = int(row[hid['driverID']])
                            if did not in ssDrivers:
                                continue
                            writer.writerow(row)
def run():
    """Queue one groupDriverStats job per groupMarginal csv (2009 only)
    on a 6-worker pool."""
    init_multiprocessor(6)
    jobs_queued = 0
    tm = 'spendingTime'
    # for year in ['2009', '2010', '2011', '2012']:
    for year in ['2009']:
        check_dir_create(dpaths[tm, year, 'groupDriverStats'])
        #
        gm_dpath = dpaths[tm, year, 'groupMarginal']
        gm_prefix = prefixs[tm, year, 'groupMarginal']
        for fname in get_all_files(gm_dpath, '%s*.csv' % gm_prefix):
            _, _, _, group_name = fname[:-len('.csv')].split('-')
            # process_file(tm, year, group_name)
            put_task(process_file, [tm, year, group_name])
            jobs_queued += 1
    end_multiprocessor(jobs_queued)
def run():
    """Queue one groupMarginal job per groupPartition pickle (2009 only)
    on a 6-worker pool, skipping the bookkeeping pickles."""
    init_multiprocessor(6)
    jobs_queued = 0
    for tm in ['spendingTime']:
        # for year in ['2009', '2010', '2011', '2012']:
        for year in ['2009']:
            check_dir_create(dpaths[tm, year, 'groupMarginal'])
            #
            gp_dpath = dpaths[tm, year, 'groupPartition']
            gp_prefix = prefixs[tm, year, 'groupPartition']
            for fname in get_all_files(gp_dpath, '%s*.pkl' % gp_prefix):
                _, _, _, group_name = fname[:-len('.pkl')].split('-')
                if group_name in ('drivers', 'original'):
                    continue  # bookkeeping pickles, not real groups
                # process_file(tm, year, group_name)
                put_task(process_file, [tm, year, group_name])
                jobs_queued += 1
    end_multiprocessor(jobs_queued)
def process_file(tm, year, gn, groupDrivers):
    """Collect trips made by drivers OUTSIDE `groupDrivers` into the 'X'
    group-trips csv for (tm, year)."""
    logger.info('handle the file; %s-%s-%s' % (tm, year, gn))
    gt_dpath = dpaths[tm, year, 'groupTrips']
    gt_prefix = prefixs[tm, year, 'groupTrips']
    gt_fpath = '%s/%s%s.csv' % (gt_dpath, gt_prefix, gn)
    #
    # gs_dpath = dpaths[tm, year, 'groupShifts']
    # gs_prefix = prefixs[tm, year, 'groupShifts']
    # gs_fpath = '%s/%s%s.csv' % (gs_dpath, gs_prefix, gn)
    xgt_fpath = '%s/%s%s.csv' % (gt_dpath, gt_prefix, 'X')
    # NOTE(review): this assert only holds when gn == 'X'; for any other gn it
    # always fires.  Looks like it may have been meant as `!=` (guarding against
    # clobbering a real group's file) — confirm intent before changing.
    assert xgt_fpath == gt_fpath, (gt_fpath)
    with open(xgt_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['time', 'year', 'month', 'day', 'hour', 'did',
                  'groupName', 'zi', 'zj', 'zizj', tm, 'priorPresence',
                  'start-long', 'start-lat', 'distance', 'duration', 'fare']
        writer.writerow(header)
    yy = year[2:]
    for fn in get_all_files(prevDriversDefined_dpath, 'Filtered-%s%s*' % (prevDriversDefined_prefix, yy)):
        fpath = '%s/%s' % (prevDriversDefined_dpath, fn)
        logger.info('handle the file %s; %s-%s-%s' % (fn, tm, year, gn))
        with open(fpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            header = reader.next()
            hid = {h: i for i, h in enumerate(header)}
            for row in reader:
                did1 = int(row[hid['did']])
                # only NON-members of the group are written to the X file
                if did1 not in groupDrivers:
                    with open(xgt_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [row[hid['time']], year]
                        # 'timeFrame' from the source fills the 'hour' column
                        new_row += [row[hid[cn]] for cn in ['month', 'day', 'timeFrame']]
                        new_row += [did1, 'X']
                        zi, zj = row[hid['zi']], row[hid['zj']]
                        zizj = '%s#%s' % (zi, zj)
                        new_row += [zi, zj, zizj]
                        new_row += [row[hid[tm]], 'X']
                        for cn in ['start-long', 'start-lat', 'distance', 'duration', 'fare']:
                            new_row.append(row[hid[cn]])
                        writer.writerow(new_row)
def find_driversRelations(year):
    """Build and pickle {driver id -> set of drivers seen immediately before
    that driver's pickups} from the filtered prevDriversDefined csvs."""
    short_year = year[2:]
    driversRelations = {}
    pattern = 'Filtered-%s%s*' % (prevDriversDefined_prefix, short_year)
    for fname in get_all_files(prevDriversDefined_dpath, pattern):
        logger.info('handle the file; %s' % fname)
        with open('%s/%s' % (prevDriversDefined_dpath, fname), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            col = {h: i for i, h in enumerate(reader.next())}
            for row in reader:
                cur_did = int(row[col['did']])
                prior = row[col['prevDrivers']].split('&')
                if len(prior) == 1 and prior[0] == '':
                    continue  # no preceding drivers recorded for this trip
                bucket = driversRelations.setdefault(cur_did, set())
                for prev_did in map(int, prior):
                    bucket.add(prev_did)
    save_pickle_file(driversRelations_fpaths[year], driversRelations)
def summary():
    """Concatenate every per-driver result csv for `year` (skipping each
    file's header) into one summary csv.

    Fix: removed the dead local `hid` — it was built from the summary header
    (not the file being read) and never used.
    """
    summary_fpath = '%s/%s%s.csv' % (of_dpath, of_prefix, year)
    with open(summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['did', 'numObservations', 'numPrevDrivers', 'numSigRelationship',
                  'numPosCoef', 'numNegCoef', 'sigPosRelation', 'sigNegRelation']
        writer.writerow(header)
    for fn in get_all_files(of_dpath, '%s%s-*.csv' % (of_prefix, year)):
        # unpack validates the expected <a>-<b>-<c>-<did> filename shape
        _, _, _, _did1 = fn[:-len('.csv')].split('-')
        fpath = '%s/%s' % (of_dpath, fn)
        with open(fpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            reader.next()  # skip the per-file header row
            for row in reader:
                with open(summary_fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    writer.writerow(row)
def summary():
    """Concatenate every per-driver result csv for `year` (skipping each
    file's header) into one summary csv.

    Fix: removed the dead local `hid` — it was built from the summary header
    (not the file being read) and never used.
    """
    summary_fpath = '%s/%s%s.csv' % (of_dpath, of_prefix, year)
    with open(summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['did', 'numObservations', 'numPrevDrivers', 'numSigRelationship',
                  'numPosCoef', 'numNegCoef', 'sigPosRelation', 'sigNegRelation']
        writer.writerow(header)
    for fn in get_all_files(of_dpath, '%s%s-*.csv' % (of_prefix, year)):
        # unpack validates the expected <a>-<b>-<c>-<did> filename shape
        _, _, _, _did1 = fn[:-len('.csv')].split('-')
        fpath = '%s/%s' % (of_dpath, fn)
        with open(fpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            reader.next()  # skip the per-file header row
            for row in reader:
                with open(summary_fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    writer.writerow(row)
def run():
    """Queue one groupZones job per baseline group-trips csv (2009 only)
    on a 6-worker pool, skipping the catch-all 'X' group."""
    init_multiprocessor(6)
    jobs_queued = 0
    tm = 'baseline'
    # for tm in ['spendingTime', 'roamingTime']:
    # for year in ['2009', '2010', '2011', '2012']:
    for year in ['2009']:
        check_dir_create(dpaths[tm, year, 'groupZones'])
        #
        gt_dpath = dpaths[tm, year, 'groupTrips']
        gt_prefix = prefixs[tm, year, 'groupTrips']
        for fname in get_all_files(gt_dpath, '%s*' % gt_prefix):
            name_parts = fname[:-len('.csv')].split('-')
            if len(name_parts) != 4:
                continue  # not a per-group file
            if name_parts[3] == 'X':
                continue  # catch-all group is excluded
            # process_file(tm, year, gt_fpath)
            put_task(process_file, [tm, year, '%s/%s' % (gt_dpath, fname)])
            jobs_queued += 1
    end_multiprocessor(jobs_queued)
def run():
    """Queue one groupZones job per spendingTime group-trips csv (2009 only)
    on a 6-worker pool, skipping the catch-all 'X' group."""
    init_multiprocessor(6)
    jobs_queued = 0
    tm = 'spendingTime'
    # for tm in ['spendingTime', 'roamingTime']:
    # for year in ['2009', '2010', '2011', '2012']:
    for year in ['2009']:
        check_dir_create(dpaths[tm, year, 'groupZones'])
        #
        gt_dpath = dpaths[tm, year, 'groupTrips']
        gt_prefix = prefixs[tm, year, 'groupTrips']
        for fname in get_all_files(gt_dpath, '%s*' % gt_prefix):
            name_parts = fname[:-len('.csv')].split('-')
            if len(name_parts) != 4:
                continue  # not a per-group file
            if name_parts[3] == 'X':
                continue  # catch-all group is excluded
            # process_file(tm, year, gt_fpath)
            put_task(process_file, [tm, year, '%s/%s' % (gt_dpath, fname)])
            jobs_queued += 1
    end_multiprocessor(jobs_queued)
def run():
    """Build one trajectory pickle per driver from the per-day csvs.

    Fix: the trajectory list was extended with four loose scalars per log row
    (`+= [dt, x, y, state]`), flattening the data; it now appends one
    (dt, x, y, state) tuple per row, matching the format produced and consumed
    by get_driver_trajectory.
    """
    drivers_dates = {}
    for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
        _, _date, _did = fn[:-len('.csv')].split('-')
        year = 2000 + int(_date[:2])
        month, day = map(int, [_date[2:4], _date[4:6]])
        dt = datetime.datetime(year, month, day)
        k = int(_did)
        if not drivers_dates.has_key(k):
            drivers_dates[k] = []
        drivers_dates[k] += [dt]
    #
    for did, dates in drivers_dates.iteritems():
        ofpath = '%s%d.pkl' % (if_prefix, did)
        if check_path_exist(ofpath):
            continue  # already built
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    # FIX: append one 4-tuple, not four loose values
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
def run():
    """Partition the baseline-2009 count graph into driver groups via Louvain.

    Merges the countGraph pickles into one weighted edge dict (cached at
    gp_original_fpath), converts it to a directed igraph, runs Louvain
    modularity partitioning, and writes per-group pickles, coefficient CSVs,
    plots, and an aggregate summary CSV.
    """
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    # start a fresh summary file; one row per group is appended below
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity',
                         'tieStrength', 'contribution', 'benCon'])
    #
    logger.info('Start handling SP_group_dpath')
    if not check_path_exist(gp_original_fpath):
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            # max(1, ...): with fewer than 10 edges the original divisor was 0,
            # making `i % moduloNumber` raise ZeroDivisionError
            moduloNumber = max(1, numEdges / 10)
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    moduloNumber = max(1, numEdges / 10)
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            # BUG FIX: '%' binds tighter than '/', so the original evaluated
            # ('Handling; %.2f' % i) / float(numEdges) — dividing a *string*
            # by a float (TypeError). Parenthesize the ratio.
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        # weight is |w|: Louvain needs non-negative weights
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity,
                             tie_strength, contribution, benCon])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            # small groups: label vertices with driver ids
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def summary():
    """Aggregate the hourly productivity CSVs into one summary CSV.

    Sums per-hour durations, fares, queueing times and trip counts for all
    trips, airport (ap) trips and night-safari (ns) trips over 2009-2010
    (skipping known bad months, the AM2-AM5 window and listed error hours),
    then writes one row per (year, month, day, hour) with averages and
    productivity (= fare / duration); -1 marks undefined values.
    On any failure the traceback is dumped to '<script>_summary.txt'.
    """
    from traceback import format_exc
    try:
        logger.info('Start summary')
        # hours with corrupted source logs; their keys are skipped below
        ignoring_periods = []
        for ys, ms, ds, hs in error_hours:
            yyyy = 2000 + int(ys)
            mm, dd, hh = map(int, [ms, ds, hs])
            k = (yyyy, mm, dd, hh)
            ignoring_periods.append(k)
        cur_timestamp = datetime.datetime(2008, 12, 31, 23)
        last_timestamp = datetime.datetime(2011, 1, 1, 0)
        hp_summary, time_period_order = {}, []
        while cur_timestamp < last_timestamp:
            cur_timestamp += datetime.timedelta(hours=1)
            year, month, day, hour = (cur_timestamp.year, cur_timestamp.month,
                                      cur_timestamp.day, cur_timestamp.hour)
            if year == 2009 and month == 12:
                continue
            if year == 2010 and month == 10:
                continue
            if year == 2011:
                continue
            if AM2 <= hour and hour <= AM5:
                continue
            # error hours were collected as exactly these tuples above, so a
            # membership test replaces the original per-hour rescan of error_hours
            if (year, month, day, hour) in ignoring_periods:
                continue
            #
            k = year, month, day, hour
            hp_summary[k] = [0 for _ in range(len([ALL_DUR, ALL_FARE, ALL_NUM, \
                                AP_DUR, AP_FARE, AP_QUEUE, AP_NUM, \
                                NS_DUR, NS_FARE, NS_QUEUE, NS_NUM]))]
            time_period_order.append(k)
        #
        year_l, month_l, day_l, hour_l = 'year', 'month', 'day', 'hour'
        for fn in get_all_files(productivity_dpath, '%s*.csv' % productivity_prefix):
            with open('%s/%s' % (productivity_dpath, fn), 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month = int(row[hid[year_l]]), int(row[hid[month_l]])
                    day, hour = int(row[hid[day_l]]), int(row[hid[hour_l]])
                    k = (year, month, day, hour)
                    if not hp_summary.has_key(k):
                        continue
                    hp_summary[k][ALL_DUR] += eval(row[hid['allDuration']])
                    hp_summary[k][ALL_FARE] += eval(row[hid['allFare']])
                    hp_summary[k][ALL_NUM] += eval(row[hid['allNum']])
                    #
                    hp_summary[k][AP_DUR] += eval(row[hid['apDuration']])
                    hp_summary[k][AP_FARE] += eval(row[hid['apFare']])
                    hp_summary[k][AP_QUEUE] += eval(row[hid['apQueueingTime']])
                    hp_summary[k][AP_NUM] += eval(row[hid['apNum']])
                    #
                    hp_summary[k][NS_DUR] += eval(row[hid['nsDuration']])
                    hp_summary[k][NS_FARE] += eval(row[hid['nsFare']])
                    hp_summary[k][NS_QUEUE] += eval(row[hid['nsQueueingTime']])
                    hp_summary[k][NS_NUM] += eval(row[hid['nsNum']])
        #
        with open(productivity_summary_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            header = [
                'year', 'month', 'day', 'hour',
                'allNum', 'allTotalDuration', 'allAvgDuration', 'allTotalFare',
                'allAvgFare', 'allProductivity',
                'apNum', 'apTotalDuration', 'apAvgDuration', 'apTotalFare',
                'apAvgFare', 'apTotalQueueing', 'apAvgQueueing', 'apProductivity',
                'apGenNum', 'apGenTotalDuration', 'apGenAvgDuration',
                'apGenTotalFare', 'apGenAvgFare', 'apGenProductivity',
                'nsNum', 'nsTotalDuration', 'nsAvgDuration', 'nsTotalFare',
                'nsAvgFare', 'nsTotalQueueing', 'nsAvgQueueing', 'nsProductivity',
                'nsGenNum', 'nsGenTotalDuration', 'nsGenAvgDuration',
                'nsGenTotalFare', 'nsGenAvgFare', 'nsGenProductivity', 'key'
            ]
            writer.writerow(header)
            for k in time_period_order:
                all_total_dur, all_total_fare, all_num, \
                ap_total_dur, ap_total_fare, ap_total_queue, ap_num, \
                ns_total_dur, ns_total_fare, ns_total_queue, ns_num = hp_summary[k]
                year, month, day, hour = k
                #
                if all_num == 0:
                    all_avg_dur, all_avg_fare = -1, -1
                    all_prod = -1
                else:
                    all_avg_dur, all_avg_fare = all_total_dur / float(
                        all_num), all_total_fare / float(all_num)
                    if all_total_dur == 0:
                        all_prod = -1
                    else:
                        all_prod = all_total_fare / float(all_total_dur)
                if ap_num == 0:
                    ap_avg_dur, ap_avg_fare, ap_avg_queue = -1, -1, -1
                    ap_prod = -1
                else:
                    ap_avg_dur, ap_avg_fare, ap_avg_queue = \
                        ap_total_dur / float(ap_num), ap_total_fare / float(ap_num), ap_total_queue / float(ap_num)
                    if ap_total_dur == 0:
                        ap_prod = -1
                    else:
                        ap_prod = ap_total_fare / float(ap_total_dur)
                # "general" trips = all trips that are not airport trips
                ap_gen_num = all_num - ap_num
                ap_gen_total_dur = all_total_dur - (ap_total_dur + ap_total_queue)
                ap_gen_total_fare = all_total_fare - ap_total_fare
                if ap_gen_num == 0:
                    ap_gen_avg_dur, ap_gen_avg_fare = -1, -1
                    ap_gen_prod = -1
                else:
                    ap_gen_avg_dur, ap_gen_avg_fare = \
                        ap_gen_total_dur / float(ap_gen_num), ap_gen_total_fare / float(ap_gen_num)
                    if ap_gen_total_dur == 0:
                        ap_gen_prod = -1
                    else:
                        ap_gen_prod = ap_gen_total_fare / float(ap_gen_total_dur)
                #
                if ns_num == 0:
                    ns_avg_dur, ns_avg_fare, ns_avg_queue = -1, -1, -1
                    ns_prod = -1
                else:
                    ns_avg_dur, ns_avg_fare, ns_avg_queue = \
                        ns_total_dur / float(ns_num), ns_total_fare / float(ns_num), ns_total_queue / float(ns_num)
                    if ns_total_dur == 0:
                        ns_prod = -1
                    else:
                        ns_prod = ns_total_fare / float(ns_total_dur)
                ns_gen_num = all_num - ns_num
                ns_gen_total_dur = all_total_dur - (ns_total_dur + ns_total_queue)
                ns_gen_total_fare = all_total_fare - ns_total_fare
                if ns_gen_num == 0:
                    ns_gen_avg_dur, ns_gen_avg_fare = -1, -1
                    ns_gen_prod = -1
                else:
                    ns_gen_avg_dur, ns_gen_avg_fare = \
                        ns_gen_total_dur / float(ns_gen_num), ns_gen_total_fare / float(ns_gen_num)
                    if ns_gen_total_dur == 0:
                        ns_gen_prod = -1
                    else:
                        ns_gen_prod = ns_gen_total_fare / float(ns_gen_total_dur)
                #
                # BUG FIX: the nsAvgDuration and nsGenAvgDuration columns used to
                # write ap_avg_dur / ap_gen_avg_dur (copy-paste from the ap block).
                writer.writerow([
                    year, month, day, hour,
                    all_num, all_total_dur, all_avg_dur, all_total_fare,
                    all_avg_fare, all_prod,
                    ap_num, ap_total_dur, ap_avg_dur, ap_total_fare, ap_avg_fare,
                    ap_total_queue, ap_avg_queue, ap_prod,
                    ap_gen_num, ap_gen_total_dur, ap_gen_avg_dur,
                    ap_gen_total_fare, ap_gen_avg_fare, ap_gen_prod,
                    ns_num, ns_total_dur, ns_avg_dur, ns_total_fare, ns_avg_fare,
                    ns_total_queue, ns_avg_queue, ns_prod,
                    ns_gen_num, ns_gen_total_dur, ns_gen_avg_dur,
                    ns_gen_total_fare, ns_gen_avg_fare, ns_gen_prod, k
                ])
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], 'summary'), 'w') as f:
            f.write(format_exc())
        raise
def process_dayBased():
    """Build per-(date, driver) day-based statistics CSVs for the NS time frames.

    For each year (2009, 2010) it accumulates, separately for the 15-17h and
    20-23h night-safari time frames:
      pass 1 (locTrip): trip counts, in/out counts, queueing, EP, duration, fare
              from the Filtered trip CSVs;
      pass 2 (shift):   operating hours from the shift pro-dur files;
      pass 3 (trip):    whole-day trip counts and fares;
    then writes one CSV row per (year, month, day, driverID) with derived
    productivity values.
    """
    logger.info('handle dayBased')
    #
    for y in range(9, 11):  # 2009 and 2010
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        statistics1517_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ns_dpath,
                                                statisticsAllDriversDay_ns1517_prefix, yyyy)
        statistics2023_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ns_dpath,
                                                statisticsAllDriversDay_ns2023_prefix, yyyy)
        #
        # (year, month, day, did) -> accumulator list indexed by WTN..LF
        dateDid_statistics1517, dateDid_statistics2023 = {}, {}
        logger.info('process locTrip')
        for ns_prefix, dateDid_statistics in [(statisticsAllDriversTrip_ns1517_prefix, dateDid_statistics1517),
                                              (statisticsAllDriversTrip_ns2023_prefix, dateDid_statistics2023)]:
            tripBased_fpath = '%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ns_dpath, ns_prefix, yyyy)
            logger.info('process locTrip')
            with open(tripBased_fpath, 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                    did = int(row[hid['driverID']])
                    k = (year, month, day, did)
                    if not dateDid_statistics.has_key(k):
                        # one zeroed slot per statistic index
                        dateDid_statistics[k] = [0.0 for _ in [WTN, WOH, WF, LTN, LIN, LON, LQ, LEP, LD, LF]]
                    dateDid_statistics[k][LTN] += 1
                    if int(row[hid['locIn']]) == 1:
                        dateDid_statistics[k][LIN] += 1
                    else:
                        # locIn is expected to be strictly 0/1
                        assert int(row[hid['locIn']]) == 0
                        dateDid_statistics[k][LON] += 1
                    dateDid_statistics[k][LQ] += float(row[hid['locQTime']])
                    dateDid_statistics[k][LEP] += float(row[hid['locEP']])
                    dateDid_statistics[k][LD] += float(row[hid['locDuration']])
                    dateDid_statistics[k][LF] += float(row[hid['locFare']])
        yy = yyyy[2:]
        logger.info('process shift')
        for fn in get_all_files(shiftProDur_dpath, '%s%s*' % (shiftProDur_prefix, yy)):
            logger.info('shift; %s' % fn)
            with open('%s/%s' % (shiftProDur_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day, hour = 2000 + int(row[hid['yy']]), int(row[hid['mm']]), \
                        int(row[hid['dd']]), int(row[hid['hh']])
                    # route the row to the matching time-frame accumulator
                    if hour in tf_ns1517:
                        dateDid_statistics = dateDid_statistics1517
                    elif hour in tf_ns2023:
                        dateDid_statistics = dateDid_statistics2023
                    else:
                        continue
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if not dateDid_statistics.has_key(k):
                        # only days already seen in the locTrip pass are counted
                        continue
                    # pro-dur is in minutes; convert to hours
                    dateDid_statistics[k][WOH] += (float(row[hid['pro-dur']]) * SEC60) / SEC3600
        logger.info('process trip')
        for fn in get_all_files(trip_dpath, '%s%s*' % (trip_prefix, yy)):
            logger.info('Trip; %s' % fn)
            _, yymm = fn[:-len('.csv')].split('-')
            yy, mm = yymm[:2], yymm[-2:]
            year, month = 2000 + int(yy), int(mm)
            with open('%s/%s' % (trip_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    day, hour = int(row[hid['day']]), int(row[hid['hour']])
                    if hour in tf_ns1517:
                        dateDid_statistics = dateDid_statistics1517
                    elif hour in tf_ns2023:
                        dateDid_statistics = dateDid_statistics2023
                    else:
                        continue
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if not dateDid_statistics.has_key(k):
                        continue
                    dateDid_statistics[k][WTN] += 1
                    dateDid_statistics[k][WF] += float(row[hid['fare']]) / CENT
        #
        # NOTE(review): yymm leaks out of the trip-file loop above; if no trip
        # file matched, this raises NameError — confirm inputs always exist.
        logger.info('write statistics; %s' % yymm)
        for statistics_fpath, dateDid_statistics in [(statistics1517_fpath, dateDid_statistics1517),
                                                     (statistics2023_fpath, dateDid_statistics2023)]:
            with open(statistics_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                header = ['year', 'month', 'day', 'driverID',
                          'wleTripNumber', 'wleOperatingHour', 'wleFare', 'wleProductivity',
                          'locTripNumber', 'locInNumber', 'locOutNumber',
                          'locQTime', 'locEP', 'locDuration', 'locFare',
                          'QTime/locTrip', 'EP/locTrip', 'locProductivity']
                writer.writerow(header)
                for (year, month, day, did), statistics in dateDid_statistics.iteritems():
                    wleTripNumber, wleOperatingHour, wleFare = int(statistics[WTN]), statistics[WOH], statistics[WF],
                    if wleOperatingHour == 0.0:
                        continue  # cannot derive productivity without hours
                    wleProductivity = wleFare / wleOperatingHour
                    #
                    locTripNumber, locInNumber, locOutNumber = map(int, [statistics[LTN], statistics[LIN], statistics[LON]])
                    if locTripNumber == 0.0:
                        continue
                    locQTime, locEP, locDuration, locFare = statistics[LQ], statistics[LEP], statistics[LD], statistics[LF]
                    if (locQTime + locDuration) == 0.0:
                        continue  # avoid division by zero below
                    QTime_locTrip, EP_locTrip = locQTime / float(locTripNumber), locEP / float(locTripNumber)
                    locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                    new_row = [
                        year, month, day, did,
                        wleTripNumber, wleOperatingHour, wleFare, wleProductivity,
                        locTripNumber, locInNumber, locOutNumber,
                        locQTime, locEP, locDuration, locFare,
                        QTime_locTrip, EP_locTrip, locProductivity]
                    writer.writerow(new_row)
def process_file(yymm): def record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not): with open(path_to_csv_file, 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() hid = {h: i for i, h in enumerate(headers)} for row in reader: t, vid = eval(row[hid['time']]), row[hid['vid']] ap_or_not, ns_or_not = eval(row[hid['ap-or-not']]), eval(row[hid['ns-or-not']]) # if not veh_last_log_ap_or_not.has_key(vid): if ap_or_not == IN: # the first log's position was occurred in the AP zone assert not veh_ap_crossing_time.has_key(vid) veh_ap_crossing_time[vid] = [t] else: assert veh_last_log_ap_or_not.has_key(vid) if veh_last_log_ap_or_not[vid] == OUT and ap_or_not == IN: veh_ap_crossing_time.setdefault(vid, [t]).append(t) # if not veh_last_log_ns_or_not.has_key(vid): if ns_or_not == IN: # the first log's position was occurred in the NS zone assert not veh_ns_crossing_time.has_key(vid) veh_ns_crossing_time[vid] = [t] else: assert veh_last_log_ns_or_not.has_key(vid) if veh_last_log_ns_or_not[vid] == OUT and ns_or_not == IN: veh_ns_crossing_time.setdefault(vid, [t]).append(t) # veh_last_log_ap_or_not[vid] = ap_or_not veh_last_log_ns_or_not[vid] = ns_or_not return veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not # from traceback import format_exc try: logger.info('handle the file; %s' % yymm) ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm) ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm) if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath): return None print 'handle the file; %s' % yymm veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {} veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {} if yymm not in ['0901', '1001', '1011']: y, m = int(yymm[:2]), int(yymm[2:]) prev_m = m - 1 prev_yymm = '%02d%02d' %(y, prev_m) prev_fn = get_all_files(log_last_day_dpath, 
'%s%s*.csv' % (log_last_day_prefix, prev_yymm))[0] path_to_last_day_csv_file = '%s/%s' % (log_last_day_dpath, prev_fn) veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \ record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not) path_to_csv_file = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm) veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \ record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not) # save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time) save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time) logger.info('end the file; %s' % yymm) except Exception as _: import sys with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f: f.write(format_exc()) raise
def run():
    """Partition the year's significant positive influence relations into groups.

    Merges the per-driver sigRelation pickles into one weighted edge dict,
    Louvain-partitions the resulting directed igraph, and writes per-group
    pickles, coefficient CSVs and a summary CSV. Plotting is disabled.
    """
    gp_summary_fpath = '%s/%ssummary.csv' % (of_dpath, of_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (of_dpath, of_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (of_dpath, of_prefix)
    #
    # fresh summary file; one row per group appended below
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity',
                         'tieStrength', 'contribution', 'benCon'])
    logger.info('Start handling SP_group_dpath')
    original_graph = {}
    for fn in get_all_files(of_dpath, '%ssigRelation-%s-*.pkl' % (of_prefix, year)):
        # BUG FIX: the stem was stripped with len('.csv') although the matched
        # files are pickles; it only worked because '.csv' and '.pkl' happen to
        # be the same length.
        _, _, _, _, _did1 = fn[:-len('.pkl')].split('-')
        sig_relation = load_pickle_file('%s/%s' % (of_dpath, fn))
        # only the significant *positive* relations form edges
        for _did0, coef in sig_relation['pos']:
            did0, did1 = map(int, [_did0, _did1])
            original_graph[did0, did1] = coef
    save_pickle_file(gp_original_fpath, original_graph)
    #
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for (did0, did1), w in original_graph.iteritems():
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        # Louvain requires non-negative weights
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (of_dpath, of_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity,
                             tie_strength, contribution, benCon])
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (of_dpath, of_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
import __init__
'''
'''
from community_analysis import dpaths, prefixs
#
from taxi_common.file_handling_functions import get_all_files, load_pickle_file

# Target year and dependent variable selecting which influence-graph
# directory/prefix is scanned below.
year = '20%02d' % 9
# depVar = 'roamingTime'
depVar = 'interTravelTime'
#
#
of_dpath = dpaths[depVar, 'influenceGraph']
of_prefixs = prefixs[depVar, 'influenceGraph']

# Sum the four relation counters over every 'count-*' pickle in the
# influence-graph directory and print the grand total.
countRelationWhole = {k: 0 for k in ['sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']}
for fn in get_all_files(of_dpath, '%scount-*' % of_prefixs):
    print fn
    fpath = '%s/%s' % (of_dpath, fn)
    countRelation = load_pickle_file(fpath)
    for n in ['sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']:
        countRelationWhole[n] += countRelation[n]
print countRelationWhole
def run(): ignoring_periods = [] for ys, ms, ds, hs in error_hours: yyyy = 2000 + int(ys) mm, dd, hh = map(int, [ms, ds, hs]) k = (yyyy, mm, dd, hh) ignoring_periods.append(k) cur_timestamp = datetime.datetime(2008, 12, 31, 23) last_timestamp = datetime.datetime(2011, 1, 1, 0) hp_summary, time_period_order = {}, [] while cur_timestamp < last_timestamp: cur_timestamp += datetime.timedelta(hours=1) yyyy, mm, dd, hh = cur_timestamp.year, cur_timestamp.month, cur_timestamp.day, cur_timestamp.hour if yyyy == 2009 and mm == 12: continue if yyyy == 2010 and mm == 10: continue if yyyy == 2011: continue if AM2 <= hh and hh <= AM5: continue need2skip = False for ys, ms, ds, hs in error_hours: yyyy0 = 2000 + int(ys) mm0, dd0, hh0 = map(int, [ms, ds, hs]) if (yyyy == yyyy0) and (mm == mm0) and (dd == dd0) and (hh == hh0): need2skip = True if need2skip: continue # k = (str(yyyy - 2000), str(mm), str(dd), str(hh)) hp_summary[k] = [0 for _ in range(len([ALL_DUR, ALL_FARE, ALL_NUM, \ AP_DUR, AP_FARE, AP_QUEUE, AP_NUM, \ NS_DUR, NS_FARE, NS_QUEUE, NS_NUM]))] time_period_order.append(k) # yy_l, mm_l, dd_l, hh_l = 'yy', 'mm', 'dd', 'hh' for fn in get_all_files(productivity_dir, productivity_prefix, '.csv'): with open('%s/%s' % (productivity_dir, fn), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() hid = {h : i for i, h in enumerate(headers)} for row in reader: yy, mm, dd, hh = row[hid[yy_l]], row[hid[mm_l]], row[hid[dd_l]], row[hid[hh_l]] k = (yy, mm, dd, hh) if not hp_summary.has_key(k): continue hp_summary[k][ALL_DUR] += eval(row[hid['all-duration']]) hp_summary[k][ALL_FARE] += eval(row[hid['all-fare']]) hp_summary[k][ALL_NUM] += eval(row[hid['all-num']]) hp_summary[k][AP_DUR] += eval(row[hid['ap-duration']]) hp_summary[k][AP_FARE] += eval(row[hid['ap-fare']]) hp_summary[k][AP_QUEUE] += eval(row[hid['ap-queueing-time']]) hp_summary[k][AP_NUM] += eval(row[hid['ap-num']]) hp_summary[k][NS_DUR] += eval(row[hid['ns-duration']]) hp_summary[k][NS_FARE] += 
eval(row[hid['ns-fare']]) hp_summary[k][NS_QUEUE] += eval(row[hid['ns-queueing-time']]) hp_summary[k][NS_NUM] += eval(row[hid['ns-num']]) # Summary print 'summary' zero_dur = [] with open(hourly_stats_fpath, 'wt') as w_csvfile: writer = csv.writer(w_csvfile) header = ['yy', 'mm', 'dd', 'hh', 'all-num', 'all-total-duration', 'all-avg-duration', 'all-total-fare', 'all-avg-fare', 'all-productivity', 'ap-num', 'atotal-duration', 'aavg-duration', 'atotal-fare', 'aavg-fare', 'atotal-queueing', 'aavg-queueing', 'ap-productivity', 'ap-gen-num', 'ap-gtotal-duration', 'ap-gavg-duration', 'ap-gtotal-fare', 'ap-gavg-fare', 'ap-gen-productivity', 'ns-num', 'ntotal-duration', 'navg-duration', 'ntotal-fare', 'navg-fare', 'ntotal-queueing', 'navg-queueing', 'ns-productivity', 'ns-gen-num', 'ns-gtotal-duration', 'ns-gavg-duration', 'ns-gtotal-fare', 'ns-gavg-fare', 'ns-gen-productivity', 'key'] writer.writerow(header) for k in time_period_order: all_total_dur, all_total_fare, all_num, \ ap_total_dur, ap_total_fare, ap_total_queue, ap_num, \ ns_total_dur, ns_total_fare, ns_total_queue, ns_num = hp_summary[k] # if all_num == 0: all_avg_dur, all_avg_fare = -1, -1 all_prod = -1 else: all_avg_dur, all_avg_fare = all_total_dur / float(all_num), all_total_fare / float(all_num) if all_total_dur == 0: zero_dur.append([ALL, k]) all_prod = -1 else: all_prod = all_total_fare / float(all_total_dur) # yy, mm, dd, hh = k if ap_num == 0: ap_avg_dur, ap_avg_fare, ap_avg_queue = -1, -1, -1 ap_prod = -1 else: ap_avg_dur, ap_avg_fare, ap_avg_queue = \ ap_total_dur / float(ap_num), ap_total_fare / float(ap_num), ap_total_queue / float(ap_num) if ap_total_dur == 0: zero_dur.append([AP, k]) ap_prod = -1 else: ap_prod = ap_total_fare / float(ap_total_dur) ap_gen_num = all_num - ap_num ap_gen_total_dur = all_total_dur - (ap_total_dur + ap_total_queue) ap_gen_total_fare = all_total_fare - ap_total_fare if ap_gen_num == 0: ap_gen_avg_dur, ap_gen_avg_fare = -1, -1 ap_gen_prod = -1 else: ap_gen_avg_dur, 
ap_gen_avg_fare = \ ap_gen_total_dur / float(ap_gen_num), ap_gen_total_fare / float(ap_gen_num) if ap_gen_total_dur == 0: zero_dur.append([AP_GEN, k]) ap_gen_prod = -1 else: ap_gen_prod = ap_gen_total_fare / float(ap_gen_total_dur) # if ns_num == 0: ns_avg_dur, ns_avg_fare, ns_avg_queue = -1, -1, -1 ns_prod = -1 else: ns_avg_dur, ns_avg_fare, ns_avg_queue = \ ns_total_dur / float(ns_num), ns_total_fare / float(ns_num), ns_total_queue / float(ns_num) if ns_total_dur == 0: zero_dur.append([NS, k]) ns_prod = -1 else: ns_prod = ns_total_fare / float(ns_total_dur) ns_gen_num = all_num - ns_num ns_gen_total_dur = all_total_dur - (ns_total_dur + ns_total_queue) ns_gen_total_fare = all_total_fare - ns_total_fare if ns_gen_num == 0: ns_gen_avg_dur, ns_gen_avg_fare = -1, -1 ns_gen_prod = -1 else: ns_gen_avg_dur, ns_gen_avg_fare = \ ns_gen_total_dur / float(ns_gen_num), ns_gen_total_fare / float(ns_gen_num) if ns_gen_total_dur == 0: zero_dur.append([NS_GEN, k]) ns_gen_prod = -1 else: ns_gen_prod = ns_gen_total_fare / float(ns_gen_total_dur) # writer.writerow([yy, mm, dd, hh, all_num, all_total_dur, all_avg_dur, all_total_fare, all_avg_fare, all_prod, ap_num, ap_total_dur, ap_avg_dur, ap_total_fare, ap_avg_fare, ap_total_queue, ap_avg_queue, ap_prod, ap_gen_num, ap_gen_total_dur, ap_gen_avg_dur, ap_gen_total_fare, ap_gen_avg_fare, ap_gen_prod, ns_num, ns_total_dur, ap_avg_dur, ns_total_fare, ns_avg_fare, ns_total_queue, ns_avg_queue, ns_prod, ns_gen_num, ns_gen_total_dur, ap_gen_avg_dur, ns_gen_total_fare, ns_gen_avg_fare, ns_gen_prod, k])
def run(processorNum):
    """Process this worker's share of the year's influence files (round-robin by index)."""
    pattern = '%s%s*.csv' % (if_prefix, year)
    for idx, fname in enumerate(get_all_files(if_dpath, pattern)):
        # file idx belongs to worker (idx mod numWorker)
        if idx % numWorker == processorNum:
            process_file('%s/%s' % (if_dpath, fname))
def summary():
    """Aggregate the hourly productivity CSVs into one summary CSV.

    Same aggregation as the day-keyed variant: per-hour totals for all /
    airport (ap) / night-safari (ns) trips over 2009-2010 (skipping bad
    months, AM2-AM5 and listed error hours), then averages and productivity
    (= fare / duration) per (year, month, day, hour); -1 marks undefined
    values. On failure the traceback is dumped to '<script>_summary.txt'.
    """
    from traceback import format_exc
    try:
        logger.info('Start summary')
        # hours with corrupted source logs; their keys are skipped below
        ignoring_periods = []
        for ys, ms, ds, hs in error_hours:
            yyyy = 2000 + int(ys)
            mm, dd, hh = map(int, [ms, ds, hs])
            k = (yyyy, mm, dd, hh)
            ignoring_periods.append(k)
        cur_timestamp = datetime.datetime(2008, 12, 31, 23)
        last_timestamp = datetime.datetime(2011, 1, 1, 0)
        hp_summary, time_period_order = {}, []
        while cur_timestamp < last_timestamp:
            cur_timestamp += datetime.timedelta(hours=1)
            year, month, day, hour = (cur_timestamp.year, cur_timestamp.month,
                                      cur_timestamp.day, cur_timestamp.hour)
            if year == 2009 and month == 12:
                continue
            if year == 2010 and month == 10:
                continue
            if year == 2011:
                continue
            if AM2 <= hour and hour <= AM5:
                continue
            # error hours were collected as exactly these tuples above, so a
            # membership test replaces the original per-hour rescan of error_hours
            if (year, month, day, hour) in ignoring_periods:
                continue
            #
            k = year, month, day, hour
            hp_summary[k] = [0 for _ in range(len([ALL_DUR, ALL_FARE, ALL_NUM, \
                                AP_DUR, AP_FARE, AP_QUEUE, AP_NUM, \
                                NS_DUR, NS_FARE, NS_QUEUE, NS_NUM]))]
            time_period_order.append(k)
        #
        year_l, month_l, day_l, hour_l = 'year', 'month', 'day', 'hour'
        for fn in get_all_files(productivity_dpath, '%s*.csv' % productivity_prefix):
            with open('%s/%s' % (productivity_dpath, fn), 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month = int(row[hid[year_l]]), int(row[hid[month_l]])
                    day, hour = int(row[hid[day_l]]), int(row[hid[hour_l]])
                    k = (year, month, day, hour)
                    if not hp_summary.has_key(k):
                        continue
                    hp_summary[k][ALL_DUR] += eval(row[hid['allDuration']])
                    hp_summary[k][ALL_FARE] += eval(row[hid['allFare']])
                    hp_summary[k][ALL_NUM] += eval(row[hid['allNum']])
                    #
                    hp_summary[k][AP_DUR] += eval(row[hid['apDuration']])
                    hp_summary[k][AP_FARE] += eval(row[hid['apFare']])
                    hp_summary[k][AP_QUEUE] += eval(row[hid['apQueueingTime']])
                    hp_summary[k][AP_NUM] += eval(row[hid['apNum']])
                    #
                    hp_summary[k][NS_DUR] += eval(row[hid['nsDuration']])
                    hp_summary[k][NS_FARE] += eval(row[hid['nsFare']])
                    hp_summary[k][NS_QUEUE] += eval(row[hid['nsQueueingTime']])
                    hp_summary[k][NS_NUM] += eval(row[hid['nsNum']])
        #
        with open(productivity_summary_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            header = ['year', 'month', 'day', 'hour',
                      'allNum', 'allTotalDuration', 'allAvgDuration',
                      'allTotalFare', 'allAvgFare', 'allProductivity',
                      'apNum', 'apTotalDuration', 'apAvgDuration', 'apTotalFare', 'apAvgFare',
                      'apTotalQueueing', 'apAvgQueueing', 'apProductivity',
                      'apGenNum', 'apGenTotalDuration', 'apGenAvgDuration',
                      'apGenTotalFare', 'apGenAvgFare', 'apGenProductivity',
                      'nsNum', 'nsTotalDuration', 'nsAvgDuration', 'nsTotalFare', 'nsAvgFare',
                      'nsTotalQueueing', 'nsAvgQueueing', 'nsProductivity',
                      'nsGenNum', 'nsGenTotalDuration', 'nsGenAvgDuration',
                      'nsGenTotalFare', 'nsGenAvgFare', 'nsGenProductivity',
                      'key']
            writer.writerow(header)
            for k in time_period_order:
                all_total_dur, all_total_fare, all_num, \
                ap_total_dur, ap_total_fare, ap_total_queue, ap_num, \
                ns_total_dur, ns_total_fare, ns_total_queue, ns_num = hp_summary[k]
                year, month, day, hour = k
                #
                if all_num == 0:
                    all_avg_dur, all_avg_fare = -1, -1
                    all_prod = -1
                else:
                    all_avg_dur, all_avg_fare = all_total_dur / float(all_num), all_total_fare / float(all_num)
                    if all_total_dur == 0:
                        all_prod = -1
                    else:
                        all_prod = all_total_fare / float(all_total_dur)
                if ap_num == 0:
                    ap_avg_dur, ap_avg_fare, ap_avg_queue = -1, -1, -1
                    ap_prod = -1
                else:
                    ap_avg_dur, ap_avg_fare, ap_avg_queue = \
                        ap_total_dur / float(ap_num), ap_total_fare / float(ap_num), ap_total_queue / float(ap_num)
                    if ap_total_dur == 0:
                        ap_prod = -1
                    else:
                        ap_prod = ap_total_fare / float(ap_total_dur)
                # "general" trips = all trips minus airport trips
                ap_gen_num = all_num - ap_num
                ap_gen_total_dur = all_total_dur - (ap_total_dur + ap_total_queue)
                ap_gen_total_fare = all_total_fare - ap_total_fare
                if ap_gen_num == 0:
                    ap_gen_avg_dur, ap_gen_avg_fare = -1, -1
                    ap_gen_prod = -1
                else:
                    ap_gen_avg_dur, ap_gen_avg_fare = \
                        ap_gen_total_dur / float(ap_gen_num), ap_gen_total_fare / float(ap_gen_num)
                    if ap_gen_total_dur == 0:
                        ap_gen_prod = -1
                    else:
                        ap_gen_prod = ap_gen_total_fare / float(ap_gen_total_dur)
                #
                if ns_num == 0:
                    ns_avg_dur, ns_avg_fare, ns_avg_queue = -1, -1, -1
                    ns_prod = -1
                else:
                    ns_avg_dur, ns_avg_fare, ns_avg_queue = \
                        ns_total_dur / float(ns_num), ns_total_fare / float(ns_num), ns_total_queue / float(ns_num)
                    if ns_total_dur == 0:
                        ns_prod = -1
                    else:
                        ns_prod = ns_total_fare / float(ns_total_dur)
                ns_gen_num = all_num - ns_num
                ns_gen_total_dur = all_total_dur - (ns_total_dur + ns_total_queue)
                ns_gen_total_fare = all_total_fare - ns_total_fare
                if ns_gen_num == 0:
                    ns_gen_avg_dur, ns_gen_avg_fare = -1, -1
                    ns_gen_prod = -1
                else:
                    ns_gen_avg_dur, ns_gen_avg_fare = \
                        ns_gen_total_dur / float(ns_gen_num), ns_gen_total_fare / float(ns_gen_num)
                    if ns_gen_total_dur == 0:
                        ns_gen_prod = -1
                    else:
                        ns_gen_prod = ns_gen_total_fare / float(ns_gen_total_dur)
                #
                # BUG FIX: the nsAvgDuration and nsGenAvgDuration columns used to
                # write ap_avg_dur / ap_gen_avg_dur (copy-paste from the ap block).
                writer.writerow([year, month, day, hour,
                                 all_num, all_total_dur, all_avg_dur, all_total_fare, all_avg_fare, all_prod,
                                 ap_num, ap_total_dur, ap_avg_dur, ap_total_fare, ap_avg_fare,
                                 ap_total_queue, ap_avg_queue, ap_prod,
                                 ap_gen_num, ap_gen_total_dur, ap_gen_avg_dur,
                                 ap_gen_total_fare, ap_gen_avg_fare, ap_gen_prod,
                                 ns_num, ns_total_dur, ns_avg_dur, ns_total_fare, ns_avg_fare,
                                 ns_total_queue, ns_avg_queue, ns_prod,
                                 ns_gen_num, ns_gen_total_dur, ns_gen_avg_dur,
                                 ns_gen_total_fare, ns_gen_avg_fare, ns_gen_prod,
                                 k])
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], 'summary'), 'w') as f:
            f.write(format_exc())
        raise
def process_file(tm, year):
    """Partition the influence graph for (tm, year) into driver groups.

    Loads all pickled regression graphs under the 'influenceGraph' directory,
    merges them into one edge-weight dict, runs Louvain community detection on
    the corresponding directed igraph, and writes per-group artifacts into the
    'groupPartition' directory: a summary CSV row per group, a pickled subgraph,
    a PDF plot, a per-edge coefficient CSV, and a pickle mapping group name ->
    member driver ids.

    tm   -- measure name key into dpaths/prefixs (e.g. 'spendingTime')
    year -- year string key into dpaths/prefixs
    """
    ig_dpath = dpaths[tm, year, 'influenceGraph']
    ig_prefix = prefixs[tm, year, 'influenceGraph']
    gp_dpath = dpaths[tm, year, 'groupPartition']
    gp_prefix = prefixs[tm, year, 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    # Write the summary header now; per-group rows are appended ('a') below.
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity',
                         'tieStrength', 'contribution', 'benCon'])
    #
    logger.info('Start handling SP_group_dpath')
    # Merge every pickled regression graph into one {(did0, did1): weight} dict.
    # Later files overwrite earlier ones on duplicate driver pairs.
    orignal_graph = {}
    for fn in get_all_files(ig_dpath, '%s*' % ig_prefix):
        regression_graph = load_pickle_file('%s/%s' % (ig_dpath, fn))
        for i, ((did0, did1), w) in enumerate(regression_graph.iteritems()):
            orignal_graph[did0, did1] = w
    save_pickle_file(gp_original_fpath, orignal_graph)
    #
    # Build a directed igraph; vertices are added lazily the first time a
    # driver id appears, and did_igid maps driver id -> igraph vertex index.
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for i, ((did0, did1), w) in enumerate(orignal_graph.iteritems()):
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        # abs(w): edge weights must be non-negative for the partitioning below.
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    # Louvain modularity-based community detection on the weighted graph.
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        # Group-level metrics; names follow the summary CSV header above.
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity,
                             tie_strength, contribution, benCon])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")  # Kamada-Kawai layout
        # Vertex labels only for small groups; large plots would be unreadable.
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        # Per-edge influence coefficients of this group.
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def process_tripBased():
    """Build per-trip ns statistics CSVs for 2009 and 2010.

    For each year, reads every economic-profit file for that year and splits
    its rows into two output CSVs by hour: one for the tf_ns1517 time frames
    and one for tf_ns2023. Rows outside both frames are dropped. Monetary
    values are converted from cents (CENT) and times from seconds (SEC60).
    """
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)  # was logged twice; duplicate removed
        #
        statistics1517_fpath = '%s/%s%s.csv' % (
            statisticsAllDrivers_ns_dpath, statisticsAllDriversTrip_ns1517_prefix, yyyy)
        statistics2023_fpath = '%s/%s%s.csv' % (
            statisticsAllDrivers_ns_dpath, statisticsAllDriversTrip_ns2023_prefix, yyyy)
        #
        yy = yyyy[2:]
        # NOTE(review): assumes only 2009/2010 are processed (range(9, 11)),
        # so a non-2009 year implies 2010 holidays.
        holidays = HOLIDAYS2009 if yyyy == '2009' else HOLIDAYS2010
        # Write both headers up front; rows are appended ('a') per record below.
        for statistics_fpath in [statistics1517_fpath, statistics2023_fpath]:
            with open(statistics_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                header = [
                    'year', 'month', 'day', 'hour', 'driverID',
                    'locQTime', 'locEP', 'locDuration', 'locFare', 'locProductivity',
                    'locIn', 'weekEnd'
                ]
                writer.writerow(header)
        for fn in get_all_files(economicProfit_ns_dpath,
                                '%s%s*' % (economicProfit_ns_prefix, yy)):
            with open('%s/%s' % (economicProfit_ns_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day, hour = map(int, [
                        row[hid[cn]] for cn in ['year', 'month', 'day', 'hour']
                    ])
                    did = int(row[hid['did']])
                    # Unit conversions: seconds -> minutes, cents -> dollars.
                    locQTime = float(row[hid['queueingTime']]) / SEC60
                    locEP = float(row[hid['economicProfit']]) / CENT
                    locDuration = float(row[hid['duration']]) / SEC60
                    locFare = float(row[hid['fare']]) / CENT
                    locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                    locIn = 1 if int(row[hid['tripMode']]) == DIn_PIn else 0
                    # weekEnd flags both public holidays and weekend days.
                    weekEnd = 0
                    if (year, month, day) in holidays:
                        weekEnd = 1
                    if datetime.datetime(year, month, day).weekday() in WEEKENDS:
                        weekEnd = 1
                    # Route the row to the matching time-frame file (renamed from
                    # the shadowing reuse of statistics_fpath in the original).
                    if hour in tf_ns1517:
                        target_fpath = statistics1517_fpath
                    elif hour in tf_ns2023:
                        target_fpath = statistics2023_fpath
                    else:
                        continue
                    with open(target_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [
                            year, month, day, hour, did,
                            locQTime, locEP, locDuration, locFare, locProductivity,
                            locIn, weekEnd
                        ]
                        writer.writerow(new_row)
def process_dayBased():
    """Aggregate ns trip statistics to (year, month, day, driver) level.

    For each year (2009, 2010) and each of the two ns time frames, accumulates
    per-day per-driver counters from three sources: the filtered trip-based
    statistics (loc* fields), shift files (whole operating hours, WOH), and raw
    trip files (whole trip count WTN and fare WF). Writes one day-based CSV per
    time frame, skipping drivers with zero operating hours, zero local trips,
    or zero local time.
    """
    logger.info('handle dayBased')
    #
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        statistics1517_fpath = '%s/%s%s.csv' % (
            statisticsAllDrivers_ns_dpath, statisticsAllDriversDay_ns1517_prefix, yyyy)
        statistics2023_fpath = '%s/%s%s.csv' % (
            statisticsAllDrivers_ns_dpath, statisticsAllDriversDay_ns2023_prefix, yyyy)
        #
        # Accumulators keyed by (year, month, day, did); values indexed by the
        # module-level slot constants WTN..LF.
        dateDid_statistics1517, dateDid_statistics2023 = {}, {}
        logger.info('process locTrip')
        for ns_prefix, dateDid_statistics in [
            (statisticsAllDriversTrip_ns1517_prefix, dateDid_statistics1517),
            (statisticsAllDriversTrip_ns2023_prefix, dateDid_statistics2023)
        ]:
            tripBased_fpath = '%s/Filtered-%s%s.csv' % (
                statisticsAllDrivers_ns_dpath, ns_prefix, yyyy)
            # (duplicate 'process locTrip' log removed; already logged above)
            with open(tripBased_fpath, 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day = map(
                        int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                    did = int(row[hid['driverID']])
                    k = (year, month, day, did)
                    if not dateDid_statistics.has_key(k):
                        dateDid_statistics[k] = [
                            0.0 for _ in [WTN, WOH, WF, LTN, LIN, LON, LQ, LEP, LD, LF]
                        ]
                    dateDid_statistics[k][LTN] += 1
                    # locIn is expected to be 0 or 1 only.
                    if int(row[hid['locIn']]) == 1:
                        dateDid_statistics[k][LIN] += 1
                    else:
                        assert int(row[hid['locIn']]) == 0
                        dateDid_statistics[k][LON] += 1
                    dateDid_statistics[k][LQ] += float(row[hid['locQTime']])
                    dateDid_statistics[k][LEP] += float(row[hid['locEP']])
                    dateDid_statistics[k][LD] += float(row[hid['locDuration']])
                    dateDid_statistics[k][LF] += float(row[hid['locFare']])
        yy = yyyy[2:]
        logger.info('process shift')
        for fn in get_all_files(shiftProDur_dpath, '%s%s*' % (shiftProDur_prefix, yy)):
            logger.info('shift; %s' % fn)
            with open('%s/%s' % (shiftProDur_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day, hour = 2000 + int(row[hid['yy']]), int(
                        row[hid['mm']]), int(row[hid['dd']]), int(row[hid['hh']])
                    if hour in tf_ns1517:
                        dateDid_statistics = dateDid_statistics1517
                    elif hour in tf_ns2023:
                        dateDid_statistics = dateDid_statistics2023
                    else:
                        continue
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    # Only accumulate for drivers/days already seen in locTrip.
                    if not dateDid_statistics.has_key(k):
                        continue
                    dateDid_statistics[k][WOH] += (float(row[hid['pro-dur']]) * SEC60) / SEC3600
        logger.info('process trip')
        for fn in get_all_files(trip_dpath, '%s%s*' % (trip_prefix, yy)):
            logger.info('Trip; %s' % fn)
            # File names look like '<prefix>-<yymm>.csv'; take year/month from
            # the name. Renamed to f_yy to avoid clobbering the outer yy used
            # in the glob pattern above.
            _, yymm = fn[:-len('.csv')].split('-')
            f_yy, mm = yymm[:2], yymm[-2:]
            year, month = 2000 + int(f_yy), int(mm)
            with open('%s/%s' % (trip_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    day, hour = int(row[hid['day']]), int(row[hid['hour']])
                    if hour in tf_ns1517:
                        dateDid_statistics = dateDid_statistics1517
                    elif hour in tf_ns2023:
                        dateDid_statistics = dateDid_statistics2023
                    else:
                        continue
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if not dateDid_statistics.has_key(k):
                        continue
                    dateDid_statistics[k][WTN] += 1
                    dateDid_statistics[k][WF] += float(row[hid['fare']]) / CENT
        #
        # BUG FIX: original logged '%s' % yymm, which is only bound inside the
        # trip loop (NameError when no trip file matched) and is a month, not
        # the year being written.
        logger.info('write statistics; %s' % yyyy)
        for statistics_fpath, dateDid_statistics in [
            (statistics1517_fpath, dateDid_statistics1517),
            (statistics2023_fpath, dateDid_statistics2023)
        ]:
            with open(statistics_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                header = [
                    'year', 'month', 'day', 'driverID',
                    'wleTripNumber', 'wleOperatingHour', 'wleFare', 'wleProductivity',
                    'locTripNumber', 'locInNumber', 'locOutNumber',
                    'locQTime', 'locEP', 'locDuration', 'locFare',
                    'QTime/locTrip', 'EP/locTrip', 'locProductivity'
                ]
                writer.writerow(header)
                for (year, month, day, did), statistics in dateDid_statistics.iteritems():
                    wleTripNumber, wleOperatingHour, wleFare = int(
                        statistics[WTN]), statistics[WOH], statistics[WF]
                    # Skip degenerate records to avoid division by zero below.
                    if wleOperatingHour == 0.0:
                        continue
                    wleProductivity = wleFare / wleOperatingHour
                    #
                    locTripNumber, locInNumber, locOutNumber = map(
                        int, [statistics[LTN], statistics[LIN], statistics[LON]])
                    if locTripNumber == 0.0:
                        continue
                    locQTime, locEP, locDuration, locFare = statistics[
                        LQ], statistics[LEP], statistics[LD], statistics[LF]
                    if (locQTime + locDuration) == 0.0:
                        continue
                    QTime_locTrip, EP_locTrip = locQTime / float(
                        locTripNumber), locEP / float(locTripNumber)
                    locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                    new_row = [
                        year, month, day, did,
                        wleTripNumber, wleOperatingHour, wleFare, wleProductivity,
                        locTripNumber, locInNumber, locOutNumber,
                        locQTime, locEP, locDuration, locFare,
                        QTime_locTrip, EP_locTrip, locProductivity
                    ]
                    writer.writerow(new_row)
def process_dayBased():
    """Aggregate ap trip statistics to (year, month, day, driver) level.

    For each year (2009, 2010), accumulates per-day per-driver counters from
    three sources: the filtered trip-based statistics (loc* fields plus the
    drop#pick location-pair counts), shift files (whole operating hours, WOH),
    and raw trip files (whole trip count WTN and fare WF). Writes a single
    day-based CSV per year, skipping drivers with zero operating hours, zero
    local trips, or zero local time.
    """
    logger.info('handle dayBased')
    #
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        statistics_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath,
                                            statisticsAllDriversDay_ap_prefix, yyyy)
        #
        # dateDid_statistics: (year, month, day, did) -> list indexed by the
        # slot constants WTN..LF. dateDid_DP: same key -> counts per
        # drop/pick location pair, positioned by dp_index.
        dateDid_statistics = {}
        dateDid_DP = {}
        # dp_index maps a 'D<loc0>#P<loc1>' column name to a stable list index.
        dp_index, count = {}, 0
        for l0 in locations:
            for l1 in locations:
                drop_pick = 'D%s#P%s' % (l0, l1)
                dp_index[drop_pick] = count
                count += 1
        tripBased_fpath = '%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ap_dpath,
                                                    statisticsAllDriversTrip_ap_prefix, yyyy)
        logger.info('process locTrip')
        with open(tripBased_fpath, 'rt') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                year, month, day = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                did = int(row[hid['driverID']])
                k = (year, month, day, did)
                if not dateDid_statistics.has_key(k):
                    dateDid_statistics[k] = [0.0 for _ in [WTN, WOH, WF, LTN, LIN, LON, LQ, LEP, LD, LF]]
                    dateDid_DP[k] = [0 for _ in range(len(dp_index))]
                dateDid_statistics[k][LTN] += 1
                # locIn is expected to be 0 or 1 only.
                if int(row[hid['locIn']]) == 1:
                    dateDid_statistics[k][LIN] += 1
                else:
                    assert int(row[hid['locIn']]) == 0
                    dateDid_statistics[k][LON] += 1
                dateDid_statistics[k][LQ] += float(row[hid['locQTime']])
                dateDid_statistics[k][LEP] += float(row[hid['locEP']])
                dateDid_statistics[k][LD] += float(row[hid['locDuration']])
                dateDid_statistics[k][LF] += float(row[hid['locFare']])
                #
                # Accumulate the per-location-pair counts carried in the row.
                for l0 in locations:
                    for l1 in locations:
                        cn = 'D%s#P%s' % (l0, l1)
                        dateDid_DP[k][dp_index[cn]] += int(row[hid[cn]])
        #
        yy = yyyy[2:]
        logger.info('process shift')
        for fn in get_all_files(shiftProDur_dpath, '%s%s*' % (shiftProDur_prefix, yy)):
            logger.info('shift; %s' % fn)
            with open('%s/%s' % (shiftProDur_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    # Only accumulate for drivers/days already seen in locTrip.
                    if not dateDid_statistics.has_key(k):
                        continue
                    dateDid_statistics[k][WOH] += (float(row[hid['pro-dur']]) * SEC60) / SEC3600
        #
        logger.info('process trip')
        for fn in get_all_files(trip_dpath, '%s%s*' % (trip_prefix, yy)):
            logger.info('Trip; %s' % fn)
            with open('%s/%s' % (trip_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if not dateDid_statistics.has_key(k):
                        continue
                    dateDid_statistics[k][WTN] += 1
                    dateDid_statistics[k][WF] += float(row[hid['fare']]) / CENT
        #
        logger.info('write statistics; %s' % yyyy)
        with open(statistics_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'driverID',
                      'wleTripNumber', 'wleOperatingHour', 'wleFare', 'wleProductivity',
                      'locTripNumber', 'locInNumber', 'locOutNumber',
                      'locQTime', 'locEP', 'locDuration', 'locFare',
                      'QTime/locTrip', 'EP/locTrip', 'locProductivity']
            # Header and row extension both iterate dp_index.iterkeys(), so the
            # drop/pick columns line up between header and data rows.
            for dp in dp_index.iterkeys():
                header.append(dp)
            writer.writerow(header)
            for (year, month, day, did), statistics in dateDid_statistics.iteritems():
                wleTripNumber, wleOperatingHour, wleFare = int(statistics[WTN]), statistics[WOH], statistics[WF],
                # Skip degenerate records to avoid division by zero below.
                if wleOperatingHour == 0.0:
                    continue
                wleProductivity = wleFare / wleOperatingHour
                #
                locTripNumber, locInNumber, locOutNumber = map(int, [statistics[LTN], statistics[LIN], statistics[LON]])
                if locTripNumber == 0.0:
                    continue
                locQTime, locEP, locDuration, locFare = statistics[LQ], statistics[LEP], statistics[LD], statistics[LF]
                if (locQTime + locDuration) == 0.0:
                    continue
                QTime_locTrip, EP_locTrip = locQTime / float(locTripNumber), locEP / float(locTripNumber)
                locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                new_row = [
                    year, month, day, did,
                    wleTripNumber, wleOperatingHour, wleFare, wleProductivity,
                    locTripNumber, locInNumber, locOutNumber,
                    locQTime, locEP, locDuration, locFare,
                    QTime_locTrip, EP_locTrip, locProductivity]
                for dp in dp_index.iterkeys():
                    new_row.append(dateDid_DP[(year, month, day, did)][dp_index[dp]])
                writer.writerow(new_row)
def process_file(tm, year, gn, groupDrivers):
    """Extract the trips of one driver group and their prior-presence flags.

    Scans the filtered prevDrivers files for `year` and, for every trip made by
    a driver in `groupDrivers`, appends a row to the group-trips CSV with a
    priorPresence flag: O_PRESENCE when another group member appears among the
    trip's previous drivers, X_PRESENCE otherwise. Each such co-presence also
    gets its own row in a companion 'num' CSV.

    tm           -- measure name; also the column read for the trip's value
    year         -- year string, e.g. '2009'
    gn           -- group name used in output file names and rows
    groupDrivers -- set of driver ids in the group (set.difference is used)
    """
    logger.info('handle the file; %s-%s-%s' % (tm, year, gn))
    gt_dpath = dpaths[tm, year, 'groupTrips']
    gt_prefix = prefixs[tm, year, 'groupTrips']
    gt_fpath = '%s/%s%s.csv' % (gt_dpath, gt_prefix, gn)
    #
    # gs_dpath = dpaths[tm, year, 'groupShifts']
    # gs_prefix = prefixs[tm, year, 'groupShifts']
    # gs_fpath = '%s/%s%s.csv' % (gs_dpath, gs_prefix, gn)
    # Write headers up front; data rows are appended ('a') per record below.
    with open(gt_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['time', 'year', 'month', 'day', 'hour', 'did',
                  'groupName', 'zi', 'zj', 'zizj', tm, 'priorPresence',
                  'start-long', 'start-lat', 'distance', 'duration', 'fare']
        writer.writerow(header)
    num_gt_fpath = '%s/%snum-%s.csv' % (gt_dpath, gt_prefix, gn)
    with open(num_gt_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['groupName', 'did0', 'did1', 'zi', 'zj', 'zizj', 'time', 'year', 'month', 'day', 'hour']
        writer.writerow(header)
    # with open(gs_fpath, 'wb') as w_csvfile:
    #     writer = csv.writer(w_csvfile, lineterminator='\n')
    #     new_headers = ['year', 'month', 'day', 'hour', 'did', 'pro-dur']
    #     writer.writerow(new_headers)
    yy = year[2:]
    for fn in get_all_files(prevDriversDefined_dpath,
                            'Filtered-%s%s*' % (prevDriversDefined_prefix, yy)):
        fpath = '%s/%s' % (prevDriversDefined_dpath, fn)
        logger.info('handle the file %s; %s-%s-%s' % (fn, tm, year, gn))
        with open(fpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            header = reader.next()
            hid = {h: i for i, h in enumerate(header)}
            for row in reader:
                did1 = int(row[hid['did']])
                # Only trips by members of this group are of interest.
                if did1 not in groupDrivers:
                    continue
                tm_value = row[hid[tm]]
                t, month, day, hour = [row[hid[cn]] for cn in ['time', 'month', 'day', 'timeFrame']]
                zi, zj = row[hid['zi']], row[hid['zj']]
                zizj = '%s#%s' % (zi, zj)
                # prevDrivers is an '&'-separated id list; empty string -> no
                # previous drivers at all.
                _prevDrivers = row[hid['prevDrivers']].split('&')
                priorPresence = X_PRESENCE
                if len(_prevDrivers) == 1 and _prevDrivers[0] == '':
                    priorPresence = X_PRESENCE
                else:
                    prevDrivers = map(int, _prevDrivers)
                    # Check every other member of the group; each co-presence
                    # gets its own row in the 'num' CSV.
                    for did0 in groupDrivers.difference(set([did1])):
                        if did0 in prevDrivers:
                            with open(num_gt_fpath, 'a') as w_csvfile:
                                writer = csv.writer(w_csvfile, lineterminator='\n')
                                new_row = [gn, did0, did1, zi, zj, zizj, t, year, month, day, hour]
                                writer.writerow(new_row)
                            priorPresence = O_PRESENCE
                new_row = [t, year, month, day, hour, did1, gn, zi, zj, zizj, tm_value, priorPresence]
                for cn in ['start-long', 'start-lat', 'distance', 'duration', 'fare']:
                    new_row.append(row[hid[cn]])
                with open(gt_fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    writer.writerow(new_row)