Пример #1
0
def process_file(yymm):
    ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
        return None
    print 'handle the file; %s' % yymm
    veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
    veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
    if yymm not in ['0901', '1001', '1011']:
        path_to_last_day_csv_file = None
        temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix,
                                       '.csv')
        prev_fn = None
        y, m = int(yymm[:2]), int(yymm[2:])
        prev_m = m - 1
        prev_yymm = '%02d%02d' % (y, prev_m)
        for temp_fn in temp_csv_files:
            if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)):
                prev_fn = temp_fn
                break
        assert prev_fn, yymm
        path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn)
        # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1:
        #     return None
        veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                        record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                             veh_ns_crossing_time, veh_last_log_ns_or_not)
    path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
            record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
    #
    save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
    save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
    print 'end the file; %s' % yymm
Пример #2
0
def run():
    """Process every 2009 tfZ_TP CSV into the baseline count graph."""
    check_dir_create(dpaths['baseline', '2009', 'countGraph'])
    #
    target_year = '20%02d' % 9
    pattern = '%s%s*.csv' % (tfZ_TP_prefix, target_year)
    for fname in get_all_files(tfZ_TP_dpath, pattern):
        process_file('%s/%s' % (tfZ_TP_dpath, fname))
Пример #3
0
def run():
    """Write one CSV row per significant driver with his group each year."""
    year_driver_to_group = {}
    significant_drivers = set()
    measure = 'spendingTime'
    years = ['2009', '2010', '2011', '2012']
    for year in years:
        # Map every partitioned driver of this year to his group name.
        gp_dpath = dpaths[measure, year, 'groupPartition']
        gp_prefix = prefixs[measure, year, 'groupPartition']
        gp_drivers = load_pickle_file('%s/%sdrivers.pkl' % (gp_dpath, gp_prefix))
        for group_name, members in gp_drivers.iteritems():
            for driver_id in members:
                year_driver_to_group[year, driver_id] = group_name
        # Collect every significant driver observed in this year.
        short_year = year[2:]
        for fname in get_all_files(ss_drivers_dpath,
                                   '%s%s*.pkl' % (ss_drivers_prefix, short_year)):
            significant_drivers.update(
                load_pickle_file('%s/%s' % (ss_drivers_dpath, fname)))
    with open(groupEvolution_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['did', 'G2009', 'G2010', 'G2011', 'G2012'])
        for driver_id in significant_drivers:
            # 'X' marks a year in which the driver had no group.
            writer.writerow([driver_id] +
                            [year_driver_to_group.get((year, driver_id), 'X')
                             for year in years])
Пример #4
0
def run(processorNum):
    """Process the input files assigned to this worker (round-robin)."""
    pattern = '%s%s*.csv' % (if_prefix, year)
    for idx, fname in enumerate(get_all_files(if_dpath, pattern)):
        # Only handle files whose index maps to this processor.
        if idx % numWorker == processorNum:
            process_file('%s/%s' % (if_dpath, fname))
Пример #5
0
def get_driver_trajectory(did):
    """Return [(datetime, x, y, state), ...] for driver `did`, time-ordered.

    The trajectory is cached as a pickle; when the cache file exists it
    is loaded instead of re-parsing the daily CSV logs.
    """
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        dt_xy_state = load_pickle_file(ofpath)
    else:
        # Find every date on which this driver has a log file.
        dates = []
        for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
            _, _date, _did = fn[:-len('.csv')].split('-')
            if int(_did) != did:
                continue
            year = 2000 + int(_date[:2])
            month, day = map(int, [_date[2:4], _date[4:6]])
            dates += [datetime.datetime(year, month, day)]
        dates.sort()
        dt_xy_state = []
        for day_dt in dates:
            yy = '%02d' % (day_dt.year - 2000)
            mm, dd = '%02d' % day_dt.month, '%02d' % day_dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    # BUG FIX: float() instead of eval() -- same numeric
                    # parse, but no arbitrary-code execution on bad data.
                    dt = datetime.datetime.fromtimestamp(float(row[hid['time']]))
                    lon, lat = map(float, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
Пример #6
0
def process_files(yyyy, reducerID, driver_subset, pickUp_drivers):
    """Write the trip/prior-driver presence matrix for one reducer.

    For every filtered prev-driver record of year `yyyy` whose driver is
    in `driver_subset`, append a row marking which of `pickUp_drivers`
    appeared among that trip's previous drivers.  Any exception is
    dumped to a text file (so parallel workers leave a trace) and then
    re-raised.
    """
    from traceback import format_exc
    #
    try:
        logger.info('Handle arrange %s(%d)' % (yyyy, reducerID))
        tfZ_TP_fpath = '%s/%s%s-%d.csv' % (tfZ_TP_dpath, tfZ_TP_prefix, yyyy,
                                           reducerID)
        with open(tfZ_TP_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = [
                'month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did',
                'spendingTime', 'roamingTime'
            ]
            for did0 in pickUp_drivers:
                header.append(did0)
            writer.writerow(header)
        yy = yyyy[2:]
        for fn in get_all_files(
                prevDriversDefined_dpath,
                'Filtered-%s%s*.csv' % (prevDriversDefined_prefix, yy)):
            prevDriverDefined_fpath = '%s/%s' % (prevDriversDefined_dpath, fn)
            logger.info('Handling %s(%d); %s' % (yyyy, reducerID, fn))
            with open(prevDriverDefined_fpath, 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                header = reader.next()
                hid = {h: i for i, h in enumerate(header)}
                handling_day = 0
                for row in reader:
                    # BUG FIX: float() instead of eval() -- no code
                    # execution on malformed CSV data.
                    cur_dtT = datetime.datetime.fromtimestamp(
                        float(row[hid['time']]))
                    if handling_day != cur_dtT.day:
                        handling_day = cur_dtT.day
                        logger.info('Processing %s %dth day; reducer %d' %
                                    (fn, cur_dtT.day, reducerID))
                    did1 = int(row[hid['did']])
                    if did1 not in driver_subset:
                        continue
                    _prevDrivers = row[hid['prevDrivers']].split('&')
                    if len(_prevDrivers) == 1 and _prevDrivers[0] == '':
                        # No previous drivers recorded for this trip.
                        continue
                    # A set makes the per-column membership test O(1).
                    prevDrivers = set(map(int, _prevDrivers))
                    tf = row[hid['timeFrame']]
                    zi, zj = row[hid['zi']], row[hid['zj']]
                    tfZ = '(%s,%s,%s)' % (tf, zi, zj)
                    with open(tfZ_TP_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [
                            row[hid['month']], row[hid['day']], tf, zi, zj,
                            tfZ, did1, row[hid['spendingTime']],
                            row[hid['roamingTime']]
                        ]
                        for did0 in pickUp_drivers:
                            new_row.append(O_PRESENCE if did0 in
                                           prevDrivers else X_PRESENCE)
                        writer.writerow(new_row)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yyyy), 'w') as f:
            f.write(format_exc())
        raise
Пример #7
0
def run():
    """Tabulate each significant driver's group membership per year."""
    driver_group_by_year = {}
    all_ss_drivers = set()
    tm = 'spendingTime'
    years = ['2009', '2010', '2011', '2012']
    for year in years:
        # Record the group of every driver partitioned in this year.
        partition_dpath = dpaths[tm, year, 'groupPartition']
        partition_prefix = prefixs[tm, year, 'groupPartition']
        drivers_by_group = load_pickle_file(
            '%s/%sdrivers.pkl' % (partition_dpath, partition_prefix))
        for group_name, group_members in drivers_by_group.iteritems():
            for driver_id in group_members:
                driver_group_by_year[year, driver_id] = group_name
        # Accumulate the significant drivers observed in this year.
        short_year = year[2:]
        for fname in get_all_files(ss_drivers_dpath, '%s%s*.pkl' % (ss_drivers_prefix, short_year)):
            for driver_id in load_pickle_file('%s/%s' % (ss_drivers_dpath, fname)):
                all_ss_drivers.add(driver_id)
    with open(groupEvolution_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['did', 'G2009', 'G2010', 'G2011', 'G2012'])
        for driver_id in all_ss_drivers:
            row = [driver_id]
            for year in years:
                # 'X' means the driver had no group in that year.
                row.append(driver_group_by_year.get((year, driver_id), 'X'))
            writer.writerow(row)
def run():
    """Summarise airport pick-up/drop-off flows per year into one CSV."""
    def summary(write_fpath, read_fpath):
        # Count trips per (date, hour, pickup-AP, prev-end-AP) key and
        # append the aggregated rows to the yearly summary file.
        logger.info('start the file; %s' % read_fpath.split('/')[-1])
        num_statistics = {}
        with open(read_fpath, 'rt') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                year, month, day, hour = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day', 'hour']])
                cur_dt = datetime.datetime(year, month, day, hour)
                dow = cur_dt.strftime("%a")
                pickUpTerminalAP, prevEndTerminalAP = row[hid['pickUpTerminalAP']], row[hid['prevEndTerminalAP']]
                k = (year, month, day, dow, hour, pickUpTerminalAP, prevEndTerminalAP)
                num_statistics[k] = num_statistics.get(k, 0) + 1
        # PERF FIX: open the output once, not once per aggregated key;
        # the written rows are identical.
        with open(write_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            for (year, month, day, dow, hour, pickUpTerminalAP, prevEndTerminalAP), num in num_statistics.iteritems():
                writer.writerow([year, month, day, dow, hour, prevEndTerminalAP, pickUpTerminalAP, num])
        logger.info('end the file; %s' % read_fpath.split('/')[-1])
    #
    for y in xrange(9, 11):
        yyyy = str(2000 + y)
        yy = '%02d' % y
        logger.info('Start; %s' % yyyy)
        write_fpath = '%s/%s%s.csv' % (trip_dpath, trip_ap_dp_flow_prefix, yyyy)
        # Truncate any stale file and write the header before appending.
        with open(write_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'dayOfWeek', 'hour', 'prevEndTerminalAP', 'pickUpTerminalAP', 'totalNum']
            writer.writerow(header)
        for fn in get_all_files(trip_dpath, '%s%s*' %(trip_prefix, yy)):
            read_fpath = '%s/%s' % (trip_dpath, fn)
            summary(write_fpath, read_fpath)
def process_file(yymm):
    ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
        return None
    print 'handle the file; %s' % yymm
    veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
    veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
    if yymm not in ['0901', '1001', '1011']:
        path_to_last_day_csv_file = None
        temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix, '.csv')
        prev_fn = None
        y, m = int(yymm[:2]), int(yymm[2:])
        prev_m = m - 1
        prev_yymm = '%02d%02d' %(y, prev_m)
        for temp_fn in temp_csv_files:
            if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)):
                prev_fn = temp_fn
                break
        assert prev_fn, yymm
        path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn)
        # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1:
        #     return None
        veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                        record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                             veh_ns_crossing_time, veh_last_log_ns_or_not)
    path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
            record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
    #
    save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
    save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
    print 'end the file; %s' % yymm
def process_tripBased():
    """Write per-trip statistics (airport) for 2009 and 2010.

    One output CSV per year, with a 0/1 indicator column for every
    (drop-off location, pick-up location) pair.
    """
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        #
        statistics_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversTrip_ap_prefix, yyyy)
        if check_path_exist(statistics_fpath):
            logger.info('The file had already been processed; %s' % yyyy)
            # BUG FIX: was `return`, which aborted the whole function and
            # skipped later years whenever an earlier year was done.
            continue
        yy = yyyy[2:]
        holidays = HOLIDAYS2009 if yyyy == '2009' else HOLIDAYS2010
        with open(statistics_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'hour', 'weekEnd',
                      'driverID',
                      'locQTime', 'locEP', 'locDuration', 'locFare',
                      'locProductivity',
                      'locIn']
            # One indicator column per (drop-off, pick-up) location pair.
            drop_pick_cns = []
            for l0 in locations:
                for l1 in locations:
                    cn = 'D%s#P%s' % (l0, l1)
                    drop_pick_cns.append(cn)
                    header.append(cn)
            writer.writerow(header)
            for fn in get_all_files(economicProfit_ap_dpath, '%s%s*' % (economicProfit_ap_prefix, yy)):
                with open('%s/%s' % (economicProfit_ap_dpath, fn), 'rt') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    headers = reader.next()
                    hid = {h: i for i, h in enumerate(headers)}
                    for row in reader:
                        year, month, day, hour = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day', 'hour']])
                        did = int(row[hid['did']])
                        # Unit conversions: seconds -> minutes, cents -> dollars.
                        locQTime = float(row[hid['queueingTime']]) / SEC60
                        locEP = float(row[hid['economicProfit']]) / CENT
                        locDuration = float(row[hid['duration']]) / SEC60
                        # NOTE(review): divides by (queueing + duration) --
                        # confirm upstream guarantees this sum is never zero.
                        locFare = float(row[hid['fare']]) / CENT
                        locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                        locIn = 1 if int(row[hid['tripMode']]) == DIn_PIn else 0
                        # A day is 'weekEnd' if it is a holiday or Sat/Sun.
                        weekEnd = 0
                        if (year, month, day) in holidays:
                            weekEnd = 1
                        if datetime.datetime(year, month, day).weekday() in WEEKENDS:
                            weekEnd = 1
                        l0, l1 = row[hid['prevEndTerminalAP']], row[hid['pickUpTerminalAP']]
                        drop_pick = 'D%s#P%s' % (l0, l1)
                        new_row = [
                            year, month, day, hour, weekEnd,
                            did,
                            locQTime, locEP, locDuration, locFare,
                            locProductivity,
                            locIn
                        ]
                        for dp_candidate in drop_pick_cns:
                            new_row.append(1 if dp_candidate == drop_pick else 0)
                        writer.writerow(new_row)
Пример #11
0
def run():
    """Process each 2009 tfZ_TP file into the baseline count graph."""
    check_dir_create(dpaths['baseline', '2009', 'countGraph'])
    #
    target_year = '20%02d' % 9
    file_pattern = '%s%s*.csv' % (tfZ_TP_prefix, target_year)
    for fname in get_all_files(tfZ_TP_dpath, file_pattern):
        fpath = '%s/%s' % (tfZ_TP_dpath, fname)
        process_file(fpath)
Пример #12
0
def filtering(year):
    """Drop spendingTime outliers (> MINUTES40) and save as 'Filtered-*'."""
    yy = year[2:]
    for fname in get_all_files(prevDriversDefined_dpath,
                               '%s%s*' % (prevDriversDefined_prefix, yy)):
        df = pd.read_csv('%s/%s' % (prevDriversDefined_dpath, fname))
        # Keep every row that is NOT an outlier; like the original
        # positional np.where/drop, rows with NaN spendingTime survive
        # because the '>' comparison is False for NaN.
        df = df[~(df['spendingTime'] > MINUTES40)]
        df.to_csv('%s/Filtered-%s' % (prevDriversDefined_dpath, fname),
                  index=False)
Пример #13
0
def process_file(tm, year):
    """Summarise per-group day statistics (incl. prior-presence splits).

    Writes one 'summary' CSV row per group file found in the groupTrips
    directory, averaging per-(driver, day) aggregates.
    """
    logger.info('handle the file; %s-%s' % (tm, year))
    gds_dpath = dpaths[tm, year, 'groupDayStats']
    gds_prefix = prefixs[tm, year, 'groupDayStats']
    gds_fpath = '%s/%s%s.csv' % (gds_dpath, gds_prefix, 'summary')
    with open(gds_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['groupName', 'numDrivers',
                  'numTrips', 'proDur','fare', 'fare/Trip', 'distance/Trip', 'duration/Trip', 'spendingTime', 'spendingTime/Trip',
                  'priorOnumTrips', 'priorO_ST', 'priorO_ST/Trip',
                  'priorXnumTrips', 'priorX_ST', 'priorX_ST/Trip']
        writer.writerow(header)
    gt_dpath = dpaths[tm, year, 'groupTrips']
    gt_prefix = prefixs[tm, year, 'groupTrips']
    gs_dpath = dpaths[tm, year, 'groupShifts']
    gs_prefix = prefixs[tm, year, 'groupShifts']

    day_keys = ['year', 'month', 'day', 'did']
    for fn in get_all_files(gt_dpath, '%s*.csv' % gt_prefix):
        # Only files named <p1>-<p2>-<p3>-<groupName>.csv are group files.
        if len(fn[:-len('.csv')].split('-')) != 4:
            continue
        _, _, _, gn = fn[:-len('.csv')].split('-')
        gt_fpath = '%s/%s' % (gt_dpath, fn)
        gs_fpath = '%s/%s%s.csv' % (gs_dpath, gs_prefix, gn)
        gt_df = pd.read_csv(gt_fpath)
        gs_df = pd.read_csv(gs_fpath)
        numDrivers = len(set(gt_df['did']))
        # PERF FIX: aggregate per (driver, day) once and reuse, instead
        # of re-running the identical groupby for every column.
        gt_daily_sum = gt_df.groupby(day_keys).sum().reset_index()
        numTrips = gt_df.groupby(day_keys).count().reset_index()['groupName'].mean()
        proDur = gs_df.groupby(day_keys).sum().reset_index()['pro-dur'].mean()
        distance = gt_daily_sum['distance'].mean()
        duration = gt_daily_sum['duration'].mean()
        fare = gt_daily_sum['fare'].mean()
        distance_trip = distance / float(numTrips)
        duration_trip = duration / float(numTrips)
        fare_trip = fare / float(numTrips)
        spendingTime = gt_daily_sum['spendingTime'].mean()
        spendingTime_trip = spendingTime / float(numTrips)
        #
        # Trips whose prior driver was present (priorPresence == 1).
        priorO_gt_df = gt_df[(gt_df['priorPresence'] == 1)]
        priorOnumTrips = priorO_gt_df.groupby(day_keys).count().reset_index()['groupName'].mean()
        priorO_ST = priorO_gt_df.groupby(day_keys).sum().reset_index()['spendingTime'].mean()
        priorO_ST_trip = priorO_ST / float(priorOnumTrips)
        #
        # Trips whose prior driver was absent (priorPresence == 0).
        priorX_gt_df = gt_df[(gt_df['priorPresence'] == 0)]
        priorXnumTrips = priorX_gt_df.groupby(day_keys).count().reset_index()['groupName'].mean()
        priorX_ST = priorX_gt_df.groupby(day_keys).sum().reset_index()['spendingTime'].mean()
        priorX_ST_trip = priorX_ST / float(priorXnumTrips)

        with open(gds_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [gn, numDrivers,
                       numTrips, proDur, fare, fare_trip, distance_trip, duration_trip, spendingTime, spendingTime_trip,
                       priorOnumTrips, priorO_ST, priorO_ST_trip,
                       priorXnumTrips, priorX_ST, priorX_ST_trip]
            writer.writerow(new_row)
Пример #14
0
def run():
    """Create influence-graph directories, then process 2009 tfZ_TP files."""
    graph_kind = 'influenceGraph'
    #
    measure = 'spendingTime'
    for year in ['2009', '2010', '2011', '2012']:
        check_dir_create(dpaths[measure, year, graph_kind])
    #
    target_year = '20%02d' % 9
    for fname in get_all_files(tfZ_TP_dpath, '%s%s*.csv' % (tfZ_TP_prefix, target_year)):
        process_file('%s/%s' % (tfZ_TP_dpath, fname))
def process_tripBased():
    """Write per-trip statistics for the two night-shift windows.

    For each year (2009, 2010) rows are routed to either the 15-17h or
    the 20-23h output file depending on the trip's hour.
    """
    for y in range(9, 11):
        yyyy = '20%02d' % y
        # BUG FIX: the same log line was emitted twice in a row.
        logger.info('handle the file; %s' % yyyy)
        #
        statistics1517_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversTrip_ns1517_prefix, yyyy)
        statistics2023_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversTrip_ns2023_prefix, yyyy)
        #
        yy = yyyy[2:]
        holidays = HOLIDAYS2009 if yyyy == '2009' else HOLIDAYS2010
        # Truncate both outputs and write their headers.
        for statistics_fpath in [statistics1517_fpath, statistics2023_fpath]:
            with open(statistics_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                header = ['year', 'month', 'day', 'hour',
                          'driverID',
                          'locQTime', 'locEP', 'locDuration', 'locFare',
                          'locProductivity',
                          'locIn', 'weekEnd']
                writer.writerow(header)
        for fn in get_all_files(economicProfit_ns_dpath, '%s%s*' % (economicProfit_ns_prefix, yy)):
            with open('%s/%s' % (economicProfit_ns_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day, hour = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day', 'hour']])
                    did = int(row[hid['did']])
                    # Unit conversions: seconds -> minutes, cents -> dollars.
                    locQTime = float(row[hid['queueingTime']]) / SEC60
                    locEP = float(row[hid['economicProfit']]) / CENT
                    locDuration = float(row[hid['duration']]) / SEC60
                    locFare = float(row[hid['fare']]) / CENT
                    locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                    locIn = 1 if int(row[hid['tripMode']]) == DIn_PIn else 0
                    # A day is 'weekEnd' if it is a holiday or Sat/Sun.
                    weekEnd = 0
                    if (year, month, day) in holidays:
                        weekEnd = 1
                    if datetime.datetime(year, month, day).weekday() in WEEKENDS:
                        weekEnd = 1
                    # Route the row by its hour; other hours are dropped.
                    if hour in tf_ns1517:
                        statistics_fpath = statistics1517_fpath
                    elif hour in tf_ns2023:
                        statistics_fpath = statistics2023_fpath
                    else:
                        continue
                    with open(statistics_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [
                            year, month, day, hour,
                            did,
                            locQTime, locEP, locDuration, locFare,
                            locProductivity,
                            locIn, weekEnd]
                        writer.writerow(new_row)
Пример #16
0
def run():
    """Create influence-graph dirs (spendingTime only), process 2009 files."""
    graph_kind = 'influenceGraph'
    # for tm in ['spendingTime', 'roamingTime']:
    for measure in ['spendingTime']:
        for year in ['2009', '2010', '2011', '2012']:
            check_dir_create(dpaths[measure, year, graph_kind])

    target_year = '20%02d' % 9
    file_pattern = '%s%s*.csv' % (tfZ_TP_prefix, target_year)
    for fname in get_all_files(tfZ_TP_dpath, file_pattern):
        process_file('%s/%s' % (tfZ_TP_dpath, fname))
Пример #17
0
def process_files(yyyy, reducerID, driver_subset, pickUp_drivers):
    """Write the trip/prior-driver presence matrix for one reducer.

    Appends, for every filtered record of year `yyyy` whose driver is in
    `driver_subset`, a row marking which of `pickUp_drivers` were among
    the trip's previous drivers.  Exceptions are dumped to a text file
    (so parallel workers leave a trace) and re-raised.
    """
    from traceback import format_exc
    #
    try:
        logger.info('Handle arrange %s(%d)' % (yyyy, reducerID))
        tfZ_TP_fpath = '%s/%s%s-%d.csv' % (tfZ_TP_dpath, tfZ_TP_prefix, yyyy, reducerID)
        with open(tfZ_TP_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['month', 'day',
                      'timeFrame', 'zi', 'zj', 'tfZ',
                      'did', 'spendingTime', 'roamingTime']
            for did0 in pickUp_drivers:
                header.append(did0)
            writer.writerow(header)
        yy = yyyy[2:]
        for fn in get_all_files(prevDriversDefined_dpath, 'Filtered-%s%s*.csv' % (prevDriversDefined_prefix, yy)):
            prevDriverDefined_fpath = '%s/%s' % (prevDriversDefined_dpath, fn)
            logger.info('Handling %s(%d); %s' % (yyyy, reducerID, fn))
            with open(prevDriverDefined_fpath, 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                header = reader.next()
                hid = {h: i for i, h in enumerate(header)}
                handling_day = 0
                for row in reader:
                    # BUG FIX: float() instead of eval() -- same numeric
                    # parse, no arbitrary-code execution on bad data.
                    cur_dtT = datetime.datetime.fromtimestamp(float(row[hid['time']]))
                    if handling_day != cur_dtT.day:
                        handling_day = cur_dtT.day
                        logger.info('Processing %s %dth day; reducer %d' % (fn, cur_dtT.day, reducerID))
                    did1 = int(row[hid['did']])
                    if did1 not in driver_subset:
                        continue
                    _prevDrivers = row[hid['prevDrivers']].split('&')
                    if len(_prevDrivers) == 1 and _prevDrivers[0] == '':
                        # No previous drivers recorded for this trip.
                        continue
                    # A set makes the per-column membership test O(1).
                    prevDrivers = set(map(int, _prevDrivers))
                    tf = row[hid['timeFrame']]
                    zi, zj = row[hid['zi']], row[hid['zj']]
                    tfZ = '(%s,%s,%s)' % (tf, zi, zj)
                    with open(tfZ_TP_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [row[hid['month']], row[hid['day']],
                                   tf, zi, zj, tfZ,
                                   did1, row[hid['spendingTime']], row[hid['roamingTime']]
                        ]
                        for did0 in pickUp_drivers:
                            new_row.append(O_PRESENCE if did0 in prevDrivers else X_PRESENCE)
                        writer.writerow(new_row)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yyyy), 'w') as f:
            f.write(format_exc())
        raise
Пример #18
0
def process_file(tm, year):
    """Summarise per-group day statistics into a single 'summary' CSV.

    Writes one row per group file found in the groupTrips directory,
    averaging per-(driver, day) aggregates.
    """
    logger.info('handle the file; %s-%s' % (tm, year))
    gds_dpath = dpaths[tm, year, 'groupDayStats']
    gds_prefix = prefixs[tm, year, 'groupDayStats']
    gds_fpath = '%s/%s%s.csv' % (gds_dpath, gds_prefix, 'summary')
    with open(gds_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = [
            'groupName', 'numDrivers', 'numTrips', 'proDur', 'fare',
            'fare/Trip', 'distance/Trip', 'duration/Trip', 'spendingTime',
            'spendingTime/Trip'
        ]
        writer.writerow(header)
    gt_dpath = dpaths[tm, year, 'groupTrips']
    gt_prefix = prefixs[tm, year, 'groupTrips']
    gs_dpath = dpaths[tm, year, 'groupShifts']
    gs_prefix = prefixs[tm, year, 'groupShifts']
    #
    day_keys = ['year', 'month', 'day', 'did']
    for fn in get_all_files(gt_dpath, '%s*.csv' % gt_prefix):
        # Only files named <p1>-<p2>-<p3>-<groupName>.csv are group files.
        if len(fn[:-len('.csv')].split('-')) != 4:
            continue
        _, _, _, gn = fn[:-len('.csv')].split('-')
        gt_fpath = '%s/%s' % (gt_dpath, fn)
        gs_fpath = '%s/%s%s.csv' % (gs_dpath, gs_prefix, gn)
        gt_df = pd.read_csv(gt_fpath)
        gs_df = pd.read_csv(gs_fpath)
        numDrivers = len(set(gt_df['did']))
        # PERF FIX: aggregate per (driver, day) once and reuse, instead
        # of re-running the identical groupby-sum for every column.
        gt_daily_sum = gt_df.groupby(day_keys).sum().reset_index()
        numTrips = gt_df.groupby(day_keys).count().reset_index()['groupName'].mean()
        proDur = gs_df.groupby(day_keys).sum().reset_index()['pro-dur'].mean()
        distance = gt_daily_sum['distance'].mean()
        duration = gt_daily_sum['duration'].mean()
        fare = gt_daily_sum['fare'].mean()
        distance_trip = distance / float(numTrips)
        duration_trip = duration / float(numTrips)
        fare_trip = fare / float(numTrips)
        spendingTime = gt_daily_sum['spendingTime'].mean()
        spendingTime_trip = spendingTime / float(numTrips)
        with open(gds_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [
                gn, numDrivers, numTrips, proDur, fare, fare_trip,
                distance_trip, duration_trip, spendingTime, spendingTime_trip
            ]
            writer.writerow(new_row)
def run():
    """Filter the all-driver statistics down to significant-driver rows."""
    for dpath in [
                    # statisticsSsDrivers_ap_dpath,
                    statisticsSsDrivers_ns_dpath
                    ]:
        check_dir_create(dpath)
    #
    # Union of the significant drivers over every usable month.
    ssDrivers = set()
    for y in xrange(9, 11):
        for m in xrange(1, 13):
            yymm = '%02d%02d' % (y, m)
            if yymm in ['0912', '1010']:
                # both years data are corrupted
                continue
            monthly_pkl = '%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)
            ssDrivers.update(load_pickle_file(monthly_pkl))
    #
    dpath_pairs = [
        # (statisticsAllDrivers_ap_dpath, statisticsSsDrivers_ap_dpath),
        (statisticsAllDrivers_ns_dpath, statisticsSsDrivers_ns_dpath),
    ]
    prefix_pairs = [
        # (statisticsAllDriversDay_ap_prefix, statisticsSsDriversDay_ap_prefix),
        (statisticsAllDriversDay_ns1517_prefix, statisticsSsDriversDay_ns1517_prefix),
        (statisticsAllDriversDay_ns2023_prefix, statisticsSsDriversDay_ns2023_prefix),
        # (statisticsAllDriversMonth_ap_prefix, statisticsSsDriversMonth_ap_prefix),
        (statisticsAllDriversMonth_ns1517_prefix, statisticsSsDriversMonth_ns1517_prefix),
        (statisticsAllDriversMonth_ns2023_prefix, statisticsSsDriversMonth_ns2023_prefix),
        # (statisticsAllDriversTrip_ap_prefix, statisticsSsDriversTrip_ap_prefix),
        (statisticsAllDriversTrip_ns1517_prefix, statisticsSsDriversTrip_ns1517_prefix),
        (statisticsAllDriversTrip_ns2023_prefix, statisticsSsDriversTrip_ns2023_prefix),
    ]
    for all_dpath, ss_dpath in dpath_pairs:
        for all_prefix, ss_prefix in prefix_pairs:
            for fn in get_all_files(all_dpath, '%s*' % all_prefix):
                period = fn[:-len('.csv')].split('-')[2]
                with open('%s/%s' % (all_dpath, fn), 'rt') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    header = reader.next()
                    hid = {h: i for i, h in enumerate(header)}
                    with open('%s/%s%s.csv' % (ss_dpath, ss_prefix, period), 'wt') as w_csvfile:
                        writer = csv.writer(w_csvfile)
                        writer.writerow(header)
                        # Copy only rows belonging to significant drivers.
                        for row in reader:
                            if int(row[hid['driverID']]) in ssDrivers:
                                writer.writerow(row)
Пример #20
0
def run():
    """Queue one marginal-stats job per group-marginal CSV (spendingTime, 2009).

    Creates the output directory for group driver stats, then submits one
    multiprocessor task per group file found in the groupMarginal directory.
    """
    init_multiprocessor(6)
    job_count = 0
    tm = 'spendingTime'
    # extend the year list once later years' data has been prepared
    for year in ['2009']:
        stats_dpath = dpaths[tm, year, 'groupDriverStats']
        check_dir_create(stats_dpath)
        marginal_dpath = dpaths[tm, year, 'groupMarginal']
        marginal_prefix = prefixs[tm, year, 'groupMarginal']
        for marginal_fn in get_all_files(marginal_dpath, '%s*.csv' % marginal_prefix):
            # file names look like a-b-c-<groupName>.csv
            _, _, _, group_name = marginal_fn[:-len('.csv')].split('-')
            put_task(process_file, [tm, year, group_name])
            job_count += 1
    end_multiprocessor(job_count)
Пример #21
0
def run():
    """Queue a marginal-computation job for every group-partition pickle.

    Skips the bookkeeping pickles ('drivers', 'original'); one task is
    submitted per remaining group.
    """
    init_multiprocessor(6)
    job_count = 0
    for tm in ['spendingTime']:
        # extend the year list once later years' data has been prepared
        for year in ['2009']:
            check_dir_create(dpaths[tm, year, 'groupMarginal'])
            partition_dpath = dpaths[tm, year, 'groupPartition']
            partition_prefix = prefixs[tm, year, 'groupPartition']
            for pkl_fn in get_all_files(partition_dpath, '%s*.pkl' % partition_prefix):
                # file names look like a-b-c-<groupName>.pkl
                _, _, _, group_name = pkl_fn[:-len('.pkl')].split('-')
                if group_name in ('drivers', 'original'):
                    continue
                put_task(process_file, [tm, year, group_name])
                job_count += 1
    end_multiprocessor(job_count)
Пример #22
0
def process_file(tm, year, gn, groupDrivers):
    """Collect every trip NOT driven by a member of *groupDrivers* into the
    catch-all group-trip file 'X' for (tm, year).

    First (re)writes the output file with a header row, then scans each
    Filtered prev-driver log of *year* and appends one row per out-of-group
    trip.  The 'groupName' and 'priorPresence' columns are fixed to 'X'.
    """
    logger.info('handle the file; %s-%s-%s' % (tm, year, gn))
    gt_dpath = dpaths[tm, year, 'groupTrips']
    gt_prefix = prefixs[tm, year, 'groupTrips']
    gt_fpath = '%s/%s%s.csv' % (gt_dpath, gt_prefix, gn)
    #
    xgt_fpath = '%s/%s%s.csv' % (gt_dpath, gt_prefix, 'X')
    # this job only builds the 'X' (non-group) file, so gn must be 'X'
    assert xgt_fpath == gt_fpath, (gt_fpath)
    with open(xgt_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['time', 'year', 'month', 'day', 'hour',
                  'did', 'groupName',
                  'zi', 'zj', 'zizj',
                  tm, 'priorPresence',
                  'start-long', 'start-lat',
                  'distance', 'duration', 'fare']
        writer.writerow(header)
    yy = year[2:]
    for fn in get_all_files(prevDriversDefined_dpath, 'Filtered-%s%s*' % (prevDriversDefined_prefix, yy)):
        fpath = '%s/%s' % (prevDriversDefined_dpath, fn)
        logger.info('handle the file %s; %s-%s-%s' % (fn, tm, year, gn))
        # open the output once per input file (the original reopened it and
        # rebuilt a csv.writer for every matching row)
        with open(fpath, 'rb') as r_csvfile, open(xgt_fpath, 'a') as w_csvfile:
            reader = csv.reader(r_csvfile)
            header = reader.next()
            hid = {h: i for i, h in enumerate(header)}
            writer = csv.writer(w_csvfile, lineterminator='\n')
            for row in reader:
                did1 = int(row[hid['did']])
                if did1 not in groupDrivers:
                    new_row = [row[hid['time']], year]
                    new_row += [row[hid[cn]] for cn in ['month', 'day', 'timeFrame']]
                    new_row += [did1, 'X']
                    zi, zj = row[hid['zi']], row[hid['zj']]
                    zizj = '%s#%s' % (zi, zj)
                    new_row += [zi, zj, zizj]
                    new_row += [row[hid[tm]], 'X']
                    for cn in ['start-long', 'start-lat', 'distance', 'duration', 'fare']:
                        new_row.append(row[hid[cn]])
                    writer.writerow(new_row)
Пример #23
0
def find_driversRelations(year):
    """Build {driver_id: set of previous-driver ids} for *year* and pickle it.

    Scans every Filtered prev-driver CSV of the year; the 'prevDrivers'
    column is an '&'-joined list of driver ids (empty string means none).
    """
    yy = year[2:]
    driversRelations = {}
    for fn in get_all_files(prevDriversDefined_dpath, 'Filtered-%s%s*' %
                            (prevDriversDefined_prefix, yy)):
        logger.info('handle the file; %s' % fn)
        with open('%s/%s' % (prevDriversDefined_dpath, fn), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did1 = int(row[hid['did']])
                prevDrivers = row[hid['prevDrivers']].split('&')
                # an empty field splits to [''] -> no prior drivers
                if len(prevDrivers) == 1 and prevDrivers[0] == '':
                    continue
                # setdefault replaces the deprecated has_key() check
                relations = driversRelations.setdefault(did1, set())
                relations.update(int(did0) for did0 in prevDrivers)
    save_pickle_file(driversRelations_fpaths[year], driversRelations)
Пример #24
0
def summary():
    """Concatenate every per-driver result CSV for *year* into one summary CSV.

    Writes the summary header once (truncating any existing file), then
    appends the data rows of each matching per-driver file, skipping the
    per-file header line.
    """
    summary_fpath = '%s/%s%s.csv' % (of_dpath, of_prefix, year)
    with open(summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = [
            'did', 'numObservations', 'numPrevDrivers', 'numSigRelationship',
            'numPosCoef', 'numNegCoef', 'sigPosRelation', 'sigNegRelation'
        ]
        writer.writerow(header)

    for fn in get_all_files(of_dpath, '%s%s-*.csv' % (of_prefix, year)):
        fpath = '%s/%s' % (of_dpath, fn)
        # open the summary once per input file (the original reopened it per
        # row and also built an unused 'hid' map from the wrong header list)
        with open(fpath, 'rb') as r_csvfile, open(summary_fpath, 'a') as w_csvfile:
            reader = csv.reader(r_csvfile)
            reader.next()  # skip the per-file header row
            writer = csv.writer(w_csvfile, lineterminator='\n')
            for row in reader:
                writer.writerow(row)
Пример #25
0
def summary():
    """Merge all per-driver result CSVs of *year* into a single summary file.

    The header is written once in 'wt' mode; data rows from every matching
    per-driver CSV are then appended, each file's own header being skipped.
    """
    summary_fpath = '%s/%s%s.csv' % (of_dpath, of_prefix, year)
    with open(summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['did',
                  'numObservations', 'numPrevDrivers',
                  'numSigRelationship',
                  'numPosCoef', 'numNegCoef',
                  'sigPosRelation', 'sigNegRelation']
        writer.writerow(header)

    for fn in get_all_files(of_dpath, '%s%s-*.csv' % (of_prefix, year)):
        fpath = '%s/%s' % (of_dpath, fn)
        # hoisted: the original reopened the summary file for every row and
        # built an unused 'hid' index from the output header by mistake
        with open(fpath, 'rb') as r_csvfile, open(summary_fpath, 'a') as w_csvfile:
            reader = csv.reader(r_csvfile)
            reader.next()  # drop the per-file header line
            writer = csv.writer(w_csvfile, lineterminator='\n')
            for row in reader:
                writer.writerow(row)
Пример #26
0
def run():
    """Queue a zone-analysis job for every regular group-trip file (baseline).

    Files whose name does not have exactly four '-'-separated parts, and the
    catch-all 'X' group, are skipped.
    """
    init_multiprocessor(6)
    job_count = 0
    tm = 'baseline'
    # extend the year list once later years' data has been prepared
    for year in ['2009']:
        check_dir_create(dpaths[tm, year, 'groupZones'])
        trips_dpath = dpaths[tm, year, 'groupTrips']
        trips_prefix = prefixs[tm, year, 'groupTrips']
        for trips_fn in get_all_files(trips_dpath, '%s*' % trips_prefix):
            name_parts = trips_fn[:-len('.csv')].split('-')
            if len(name_parts) != 4:
                continue
            if name_parts[3] == 'X':
                continue
            put_task(process_file, [tm, year, '%s/%s' % (trips_dpath, trips_fn)])
            job_count += 1
    end_multiprocessor(job_count)
Пример #27
0
def run():
    """Queue a zone-analysis job per group-trip file (spendingTime, 2009).

    Only file names with exactly four '-'-separated parts are considered,
    and the catch-all 'X' group is excluded.
    """
    init_multiprocessor(6)
    job_count = 0
    tm = 'spendingTime'
    # extend the year list once later years' data has been prepared
    for year in ['2009']:
        check_dir_create(dpaths[tm, year, 'groupZones'])
        trips_dpath = dpaths[tm, year, 'groupTrips']
        trips_prefix = prefixs[tm, year, 'groupTrips']
        for trips_fn in get_all_files(trips_dpath, '%s*' % trips_prefix):
            name_parts = trips_fn[:-len('.csv')].split('-')
            if len(name_parts) != 4 or name_parts[3] == 'X':
                continue
            put_task(process_file, [tm, year, '%s/%s' % (trips_dpath, trips_fn)])
            job_count += 1
    end_multiprocessor(job_count)
Пример #28
0
def run():
    """Group per-day driver log files by driver id, then concatenate each
    driver's logs in date order into one flat pickled list.

    The pickled list layout is [dt, x, y, state, dt, x, y, state, ...] --
    flattened 4-field records, not tuples (presumably consumed positionally
    downstream; verify against the reader of these pickles).
    """
    # map: driver id -> list of dates for which a log file exists
    drivers_dates = {}
    for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
        # file names look like <prefix>-<yymmdd>-<driverID>.csv
        _, _date, _did = fn[:-len('.csv')].split('-')
        year = 2000 + int(_date[:2])
        month, day = map(int, [_date[2:4], _date[4:6]])
        dt = datetime.datetime(year, month, day)
        k = int(_did)
        if not drivers_dates.has_key(k):
            drivers_dates[k] = []
        drivers_dates[k] += [dt]
    #
    for did, dates in drivers_dates.iteritems():
        # NOTE(review): ofpath has no directory component, so the pickle is
        # written to the current working directory -- confirm this is intended.
        ofpath = '%s%d.pkl' % (if_prefix, did)
        if check_path_exist(ofpath):
            continue  # already built on a previous run
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            # rebuild the yymmdd string of the source file for this date
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    # HACK: eval() parses the numeric text but would execute
                    # arbitrary input; float()/int() would be safer here.
                    dt = datetime.datetime.fromtimestamp(eval(
                        row[hid['time']]))
                    lon, lat = map(
                        eval,
                        [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [dt, x, y, int(row[hid['state']])]
        save_pickle_file(ofpath, dt_xy_state)
Пример #29
0
def run():
    """Partition the baseline/2009 driver count-graph into driver groups.

    Merges all monthly count-graph pickles into one weighted directed graph,
    runs Louvain modularity partitioning (igraph + louvain), then writes a
    pickle, a plot and a coefficient CSV per group plus one summary CSV and
    a groupName->drivers pickle.
    """
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    #
    logger.info('Start handling SP_group_dpath')
    if not check_path_exist(gp_original_fpath):
        # merge every monthly count graph into one (did0, did1) -> weight map
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            # max(..., 1) keeps 'i % moduloNumber' valid for graphs of < 10 edges
            moduloNumber = max(numEdges / 10, 1)
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    moduloNumber = max(numEdges / 10, 1)
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            # parentheses are required: '%' binds tighter than '/', so the
            # original "'%.2f' % i / float(n)" raised a TypeError
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        # per-group aggregate measures written to the summary CSV
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn,
                len(drivers),
                len(weights), graphComplexity, tie_strength, contribution,
                benCon
            ])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        # vertex labels clutter the plot for big groups, so drop them there
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
Пример #30
0
def summary():
    """Aggregate the hourly productivity CSVs (2009-2010) into one summary.

    Builds a zero-filled per-hour accumulator for every hour in range that is
    not excluded (bad months, AM2-AM5, and the hours listed in error_hours),
    folds all productivity files into it, then writes per-hour counts, totals,
    averages and fare/duration productivity for all trips, airport (ap) and
    night-spot (ns) trips plus their complements ('Gen').  Averages and
    productivities are -1 when the denominator is zero.  Any exception is
    dumped to a traceback file before re-raising.
    """
    from traceback import format_exc
    try:
        logger.info('Start summary')
        cur_timestamp = datetime.datetime(2008, 12, 31, 23)
        last_timestamp = datetime.datetime(2011, 1, 1, 0)
        hp_summary, time_period_order = {}, []
        # enumerate every in-scope hour and pre-seed its accumulator row
        while cur_timestamp < last_timestamp:
            cur_timestamp += datetime.timedelta(hours=1)
            year, month, day, hour = cur_timestamp.year, cur_timestamp.month, cur_timestamp.day, cur_timestamp.hour
            if year == 2009 and month == 12: continue
            if year == 2010 and month == 10: continue
            if year == 2011: continue
            if AM2 <= hour <= AM5: continue
            # skip hours explicitly flagged as erroneous
            need2skip = False
            for ys, ms, ds, hs in error_hours:
                year0 = 2000 + int(ys)
                month0, day0, hour0 = map(int, [ms, ds, hs])
                if (year, month, day, hour) == (year0, month0, day0, hour0):
                    need2skip = True
            if need2skip: continue
            #
            k = year, month, day, hour
            hp_summary[k] = [0 for _ in range(len([ALL_DUR, ALL_FARE, ALL_NUM, \
                                                   AP_DUR, AP_FARE, AP_QUEUE, AP_NUM, \
                                                   NS_DUR, NS_FARE, NS_QUEUE, NS_NUM]))]
            time_period_order.append(k)
            #
        year_l, month_l, day_l, hour_l = 'year', 'month', 'day', 'hour'
        # fold every productivity CSV into the per-hour accumulators
        for fn in get_all_files(productivity_dpath,
                                '%s*.csv' % productivity_prefix):
            with open('%s/%s' % (productivity_dpath, fn), 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month = int(row[hid[year_l]]), int(row[hid[month_l]])
                    day, hour = int(row[hid[day_l]]), int(row[hid[hour_l]])
                    k = (year, month, day, hour)
                    if k not in hp_summary: continue  # excluded hour
                    hp_summary[k][ALL_DUR] += eval(row[hid['allDuration']])
                    hp_summary[k][ALL_FARE] += eval(row[hid['allFare']])
                    hp_summary[k][ALL_NUM] += eval(row[hid['allNum']])
                    #
                    hp_summary[k][AP_DUR] += eval(row[hid['apDuration']])
                    hp_summary[k][AP_FARE] += eval(row[hid['apFare']])
                    hp_summary[k][AP_QUEUE] += eval(row[hid['apQueueingTime']])
                    hp_summary[k][AP_NUM] += eval(row[hid['apNum']])
                    #
                    hp_summary[k][NS_DUR] += eval(row[hid['nsDuration']])
                    hp_summary[k][NS_FARE] += eval(row[hid['nsFare']])
                    hp_summary[k][NS_QUEUE] += eval(row[hid['nsQueueingTime']])
                    hp_summary[k][NS_NUM] += eval(row[hid['nsNum']])
        #
        with open(productivity_summary_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            header = [
                'year', 'month', 'day', 'hour', 'allNum', 'allTotalDuration',
                'allAvgDuration', 'allTotalFare', 'allAvgFare',
                'allProductivity', 'apNum', 'apTotalDuration', 'apAvgDuration',
                'apTotalFare', 'apAvgFare', 'apTotalQueueing', 'apAvgQueueing',
                'apProductivity', 'apGenNum', 'apGenTotalDuration',
                'apGenAvgDuration', 'apGenTotalFare', 'apGenAvgFare',
                'apGenProductivity', 'nsNum', 'nsTotalDuration',
                'nsAvgDuration', 'nsTotalFare', 'nsAvgFare', 'nsTotalQueueing',
                'nsAvgQueueing', 'nsProductivity', 'nsGenNum',
                'nsGenTotalDuration', 'nsGenAvgDuration', 'nsGenTotalFare',
                'nsGenAvgFare', 'nsGenProductivity', 'key'
            ]
            writer.writerow(header)
            for k in time_period_order:
                all_total_dur, all_total_fare, all_num, \
                ap_total_dur, ap_total_fare, ap_total_queue, ap_num, \
                ns_total_dur, ns_total_fare, ns_total_queue, ns_num = hp_summary[k]
                year, month, day, hour = k
                # all trips: averages and fare-per-duration productivity
                if all_num == 0:
                    all_avg_dur, all_avg_fare = -1, -1
                    all_prod = -1
                else:
                    all_avg_dur, all_avg_fare = all_total_dur / float(
                        all_num), all_total_fare / float(all_num)
                    if all_total_dur == 0:
                        all_prod = -1
                    else:
                        all_prod = all_total_fare / float(all_total_dur)
                # airport trips
                if ap_num == 0:
                    ap_avg_dur, ap_avg_fare, ap_avg_queue = -1, -1, -1
                    ap_prod = -1
                else:
                    ap_avg_dur, ap_avg_fare, ap_avg_queue = \
                        ap_total_dur / float(ap_num), ap_total_fare / float(ap_num), ap_total_queue / float(ap_num)
                    if ap_total_dur == 0:
                        ap_prod = -1
                    else:
                        ap_prod = ap_total_fare / float(ap_total_dur)
                # 'Gen' = complement of airport trips (queueing time counted
                # as part of the airport side of the split)
                ap_gen_num = all_num - ap_num
                ap_gen_total_dur = all_total_dur - (ap_total_dur +
                                                    ap_total_queue)
                ap_gen_total_fare = all_total_fare - ap_total_fare
                if ap_gen_num == 0:
                    ap_gen_avg_dur, ap_gen_avg_fare = -1, -1
                    ap_gen_prod = -1
                else:
                    ap_gen_avg_dur, ap_gen_avg_fare = \
                        ap_gen_total_dur / float(ap_gen_num), ap_gen_total_fare / float(ap_gen_num)
                    if ap_gen_total_dur == 0:
                        ap_gen_prod = -1
                    else:
                        ap_gen_prod = ap_gen_total_fare / float(
                            ap_gen_total_dur)
                # night-spot trips
                if ns_num == 0:
                    ns_avg_dur, ns_avg_fare, ns_avg_queue = -1, -1, -1
                    ns_prod = -1
                else:
                    ns_avg_dur, ns_avg_fare, ns_avg_queue = \
                        ns_total_dur / float(ns_num), ns_total_fare / float(ns_num), ns_total_queue / float(ns_num)
                    if ns_total_dur == 0:
                        ns_prod = -1
                    else:
                        ns_prod = ns_total_fare / float(ns_total_dur)
                ns_gen_num = all_num - ns_num
                ns_gen_total_dur = all_total_dur - (ns_total_dur +
                                                    ns_total_queue)
                ns_gen_total_fare = all_total_fare - ns_total_fare
                if ns_gen_num == 0:
                    ns_gen_avg_dur, ns_gen_avg_fare = -1, -1
                    ns_gen_prod = -1
                else:
                    ns_gen_avg_dur, ns_gen_avg_fare = \
                        ns_gen_total_dur / float(ns_gen_num), ns_gen_total_fare / float(ns_gen_num)
                    if ns_gen_total_dur == 0:
                        ns_gen_prod = -1
                    else:
                        ns_gen_prod = ns_gen_total_fare / float(
                            ns_gen_total_dur)
                # bug fix: the ns columns previously wrote ap_avg_dur and
                # ap_gen_avg_dur into the nsAvgDuration / nsGenAvgDuration
                # slots, contradicting the header
                writer.writerow([
                    year, month, day, hour, all_num, all_total_dur,
                    all_avg_dur, all_total_fare, all_avg_fare, all_prod,
                    ap_num, ap_total_dur, ap_avg_dur, ap_total_fare,
                    ap_avg_fare, ap_total_queue, ap_avg_queue, ap_prod,
                    ap_gen_num, ap_gen_total_dur, ap_gen_avg_dur,
                    ap_gen_total_fare, ap_gen_avg_fare, ap_gen_prod, ns_num,
                    ns_total_dur, ns_avg_dur, ns_total_fare, ns_avg_fare,
                    ns_total_queue, ns_avg_queue, ns_prod, ns_gen_num,
                    ns_gen_total_dur, ns_gen_avg_dur, ns_gen_total_fare,
                    ns_gen_avg_fare, ns_gen_prod, k
                ])
    except Exception as _:
        import sys
        # dump the traceback next to the script so batch runs leave evidence
        with open('%s_%s.txt' % (sys.argv[0], 'summary'), 'w') as f:
            f.write(format_exc())
        raise
def process_dayBased():
    """Build day-level driver statistics for the two ns time windows
    (15-17 and 20-23) for years 2009-2010.

    For each year, folds three sources into per-(year, month, day, driver)
    accumulators -- location trips, shift productive-duration logs, and
    whole-day trips -- then writes one day-based statistics CSV per window.
    Accumulator slots are indexed by the module-level constants
    WTN/WOH/WF (whole-day) and LTN/LIN/LON/LQ/LEP/LD/LF (location).
    """
    logger.info('handle dayBased')
    #
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        statistics1517_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversDay_ns1517_prefix, yyyy)
        statistics2023_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversDay_ns2023_prefix, yyyy)
        #
        # one accumulator dict per ns time window
        dateDid_statistics1517, dateDid_statistics2023 = {}, {}
        logger.info('process locTrip')
        for ns_prefix, dateDid_statistics in [(statisticsAllDriversTrip_ns1517_prefix, dateDid_statistics1517),
                                                   (statisticsAllDriversTrip_ns2023_prefix, dateDid_statistics2023)]:
            tripBased_fpath = '%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ns_dpath, ns_prefix, yyyy)
            logger.info('process locTrip')
            with open(tripBased_fpath, 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                    did = int(row[hid['driverID']])
                    k = (year, month, day, did)
                    if not dateDid_statistics.has_key(k):
                        dateDid_statistics[k] = [0.0 for _ in [WTN, WOH, WF, LTN, LIN, LON, LQ, LEP, LD, LF]]
                    dateDid_statistics[k][LTN] += 1
                    # 'locIn' is a 0/1 flag splitting in-location vs out trips
                    if int(row[hid['locIn']]) == 1:
                        dateDid_statistics[k][LIN] += 1
                    else:
                        assert int(row[hid['locIn']]) == 0
                        dateDid_statistics[k][LON] += 1
                    dateDid_statistics[k][LQ] += float(row[hid['locQTime']])
                    dateDid_statistics[k][LEP] += float(row[hid['locEP']])
                    dateDid_statistics[k][LD] += float(row[hid['locDuration']])
                    dateDid_statistics[k][LF] += float(row[hid['locFare']])
        yy = yyyy[2:]
        logger.info('process shift')
        # accumulate productive operating hours; only keys already created by
        # the locTrip pass above are updated
        for fn in get_all_files(shiftProDur_dpath, '%s%s*' % (shiftProDur_prefix, yy)):
            logger.info('shift; %s' % fn)
            with open('%s/%s' % (shiftProDur_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day, hour = 2000 + int(row[hid['yy']]), int(row[hid['mm']]), int(row[hid['dd']]), int(row[hid['hh']])
                    # route the row to the matching time-window accumulator
                    if hour in tf_ns1517:
                        dateDid_statistics = dateDid_statistics1517
                    elif hour in tf_ns2023:
                        dateDid_statistics = dateDid_statistics2023
                    else:
                        continue
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if not dateDid_statistics.has_key(k):
                        continue
                    # 'pro-dur' is in SEC60 units; convert to hours
                    dateDid_statistics[k][WOH] += (float(row[hid['pro-dur']]) * SEC60) / SEC3600

        logger.info('process trip')
        # accumulate whole-day trip counts and fares
        for fn in get_all_files(trip_dpath, '%s%s*' % (trip_prefix, yy)):
            logger.info('Trip; %s' % fn)
            _, yymm = fn[:-len('.csv')].split('-')
            yy, mm = yymm[:2], yymm[-2:]
            year, month = 2000 + int(yy), int(mm)
            with open('%s/%s' % (trip_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    day, hour = int(row[hid['day']]), int(row[hid['hour']])
                    if hour in tf_ns1517:
                        dateDid_statistics = dateDid_statistics1517
                    elif hour in tf_ns2023:
                        dateDid_statistics = dateDid_statistics2023
                    else:
                        continue
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if not dateDid_statistics.has_key(k):
                        continue
                    dateDid_statistics[k][WTN] += 1
                    dateDid_statistics[k][WF] += float(row[hid['fare']]) / CENT
        #
        # NOTE(review): 'yymm' here is whatever the last trip file set it to
        # (and raises NameError if no trip file matched) -- 'yyyy' was
        # probably intended; confirm before relying on this log line.
        logger.info('write statistics; %s' % yymm)
        for statistics_fpath, dateDid_statistics in [(statistics1517_fpath, dateDid_statistics1517),
                                                     (statistics2023_fpath, dateDid_statistics2023)]:
            with open(statistics_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                header = ['year', 'month', 'day', 'driverID',
                          'wleTripNumber', 'wleOperatingHour', 'wleFare',
                          'wleProductivity',
                          'locTripNumber', 'locInNumber', 'locOutNumber',
                          'locQTime', 'locEP', 'locDuration', 'locFare',
                          'QTime/locTrip', 'EP/locTrip',
                          'locProductivity']
                writer.writerow(header)
                for (year, month, day, did), statistics in dateDid_statistics.iteritems():
                    wleTripNumber, wleOperatingHour, wleFare = int(statistics[WTN]), statistics[WOH], statistics[WF],
                    # rows with zero denominators are dropped entirely
                    if wleOperatingHour == 0.0:
                        continue
                    wleProductivity = wleFare / wleOperatingHour
                    #
                    locTripNumber, locInNumber, locOutNumber = map(int, [statistics[LTN], statistics[LIN], statistics[LON]])
                    if locTripNumber == 0.0:
                        continue
                    locQTime, locEP, locDuration, locFare = statistics[LQ], statistics[LEP], statistics[LD], statistics[LF]
                    if (locQTime + locDuration) == 0.0:
                        continue
                    QTime_locTrip, EP_locTrip = locQTime / float(locTripNumber), locEP / float(locTripNumber)
                    locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                    new_row = [
                        year, month, day, did,
                        wleTripNumber, wleOperatingHour, wleFare,
                        wleProductivity,
                        locTripNumber, locInNumber, locOutNumber,
                        locQTime, locEP, locDuration, locFare,
                        QTime_locTrip, EP_locTrip,
                        locProductivity]
                    writer.writerow(new_row)
Пример #32
0
def process_file(yymm):
    """Record airport (ap) and night-spot (ns) zone-entry times per vehicle
    for month *yymm* ('yymm' string, e.g. '0902') and pickle the results.

    A crossing is an OUT -> IN transition of the per-log zone flag; the carry
    of state from the previous month's last-day log keeps transitions at the
    month boundary from being double-counted.  Skips months whose output
    pickles already exist.  Any exception is dumped to a traceback file
    before re-raising.
    """
    def record_crossing_time(path_to_csv_file,
                             veh_ap_crossing_time, veh_last_log_ap_or_not,
                             veh_ns_crossing_time, veh_last_log_ns_or_not):
        # Fold one log CSV into the running crossing-time / last-state maps
        # (vid -> [timestamps] and vid -> IN/OUT) and return all four.
        with open(path_to_csv_file, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                t, vid = eval(row[hid['time']]), row[hid['vid']]
                ap_or_not, ns_or_not = eval(row[hid['ap-or-not']]), eval(row[hid['ns-or-not']])
                #
                if not veh_last_log_ap_or_not.has_key(vid):
                    if ap_or_not == IN:
                        # the first log's position was occurred in the AP zone
                        assert not veh_ap_crossing_time.has_key(vid)
                        veh_ap_crossing_time[vid] = [t]
                else:
                    assert veh_last_log_ap_or_not.has_key(vid)
                    # OUT -> IN transition marks an airport-zone entry
                    if veh_last_log_ap_or_not[vid] == OUT and ap_or_not == IN:
                        veh_ap_crossing_time.setdefault(vid, [t]).append(t)
                #
                if not veh_last_log_ns_or_not.has_key(vid):
                    if ns_or_not == IN:
                        # the first log's position was occurred in the NS zone
                        assert not veh_ns_crossing_time.has_key(vid)
                        veh_ns_crossing_time[vid] = [t]
                else:
                    assert veh_last_log_ns_or_not.has_key(vid)
                    # OUT -> IN transition marks a night-spot-zone entry
                    if veh_last_log_ns_or_not[vid] == OUT and ns_or_not == IN:
                        veh_ns_crossing_time.setdefault(vid, [t]).append(t)
                #
                veh_last_log_ap_or_not[vid] = ap_or_not
                veh_last_log_ns_or_not[vid] = ns_or_not
        return veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not
    #
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm)
        # skip months already processed on a previous run
        if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
            return None
        print 'handle the file; %s' % yymm
        veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
        veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
        # months with no preceding month's data (first months of each log
        # span) start with empty state; otherwise seed from the previous
        # month's last-day log
        if yymm not in ['0901', '1001', '1011']:
            y, m = int(yymm[:2]), int(yymm[2:])
            prev_m = m - 1
            prev_yymm = '%02d%02d' %(y, prev_m)
            prev_fn = get_all_files(log_last_day_dpath, '%s%s*.csv' % (log_last_day_prefix, prev_yymm))[0]
            path_to_last_day_csv_file = '%s/%s' % (log_last_day_dpath, prev_fn)
            veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                            record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
        path_to_csv_file = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm)
        veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
                record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                     veh_ns_crossing_time, veh_last_log_ns_or_not)
        #
        save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
        save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        # dump the traceback next to the script so batch runs leave evidence
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
Пример #33
0
def run():
    """Group drivers by partitioning the significant-relation graph.

    Loads the per-driver 'sigRelation' pickles from ``if_dpath``, keeps only
    the positive significant relations, builds a directed graph weighted by
    |coef|, runs Louvain community detection, and writes one pickle plus one
    coefficient CSV per group and a summary CSV into ``of_dpath``.
    """
    gp_summary_fpath = '%s/%ssummary.csv' % (of_dpath, of_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (of_dpath, of_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (of_dpath, of_prefix)
    #
    # Create the summary file with its header; per-group rows are appended
    # later inside the subgraph loop.
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    logger.info('Start handling SP_group_dpath')
    orignal_graph = {}
    for fn in get_all_files(if_dpath,
                            '%ssigRelation-%s-*.pkl' % (if_prefix, year)):
        # Strip the 4-char extension; the originals sliced len('.csv') even
        # though the files are '.pkl' -- same length, now spelled correctly.
        _, _, _, _, _did1 = fn[:-len('.pkl')].split('-')
        sigRelatioin = load_pickle_file('%s/%s' % (if_dpath, fn))
        # Only positive significant relations contribute edges.
        for _did0, coef in sigRelatioin['pos']:
            did0, did1 = map(int, [_did0, _did1])
            orignal_graph[did0, did1] = coef
    save_pickle_file(gp_original_fpath, orignal_graph)
    #
    # Build the igraph graph; vertices are added lazily as drivers appear.
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for (did0, did1), w in orignal_graph.iteritems():
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (of_dpath, of_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn,
                len(drivers),
                len(weights), graphComplexity, tie_strength, contribution,
                benCon
            ])
        gn_drivers[gn] = drivers
        # Dump the per-group edge coefficients.
        gc_fpath = '%s/%scoef-%s.csv' % (of_dpath, of_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
Пример #34
0
import __init__
'''
'''

from community_analysis import dpaths, prefixs
#
from taxi_common.file_handling_functions import get_all_files, load_pickle_file

year = '20%02d' % 9
# depVar = 'roamingTime'
depVar = 'interTravelTime'
#
#
of_dpath = dpaths[depVar, 'influenceGraph']
of_prefixs = prefixs[depVar, 'influenceGraph']

countRelationWhole = {k: 0 for k in ['sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']}

for fn in get_all_files(of_dpath, '%scount-*' % of_prefixs):
    print fn
    fpath = '%s/%s' % (of_dpath, fn)
    countRelation = load_pickle_file(fpath)
    for n in ['sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']:
        countRelationWhole[n] += countRelation[n]

print countRelationWhole
def run():
    ignoring_periods = []
    for ys, ms, ds, hs in error_hours:
        yyyy = 2000 + int(ys)
        mm, dd, hh = map(int, [ms, ds, hs])
        k = (yyyy, mm, dd, hh)
        ignoring_periods.append(k)
    cur_timestamp = datetime.datetime(2008, 12, 31, 23)
    last_timestamp = datetime.datetime(2011, 1, 1, 0)
    hp_summary, time_period_order = {}, []
    while cur_timestamp < last_timestamp:
        cur_timestamp += datetime.timedelta(hours=1)
        yyyy, mm, dd, hh = cur_timestamp.year, cur_timestamp.month, cur_timestamp.day, cur_timestamp.hour
        if yyyy == 2009 and mm == 12: continue
        if yyyy == 2010 and mm == 10: continue
        if yyyy == 2011: continue
        if AM2 <= hh and hh <= AM5: continue
        need2skip = False
        for ys, ms, ds, hs in error_hours:
            yyyy0 = 2000 + int(ys)
            mm0, dd0, hh0 = map(int, [ms, ds, hs])
            if (yyyy == yyyy0) and (mm == mm0) and (dd == dd0) and (hh == hh0):
                need2skip = True
        if need2skip: continue
        #
        k = (str(yyyy - 2000), str(mm), str(dd), str(hh))
        hp_summary[k] = [0 for _ in range(len([ALL_DUR, ALL_FARE, ALL_NUM, \
                                               AP_DUR, AP_FARE, AP_QUEUE, AP_NUM, \
                                               NS_DUR, NS_FARE, NS_QUEUE, NS_NUM]))]
        time_period_order.append(k)
        #
    yy_l, mm_l, dd_l, hh_l = 'yy', 'mm', 'dd', 'hh'
    for fn in get_all_files(productivity_dir, productivity_prefix, '.csv'):
        with open('%s/%s' % (productivity_dir, fn), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h : i for i, h in enumerate(headers)}
            for row in reader:
                yy, mm, dd, hh = row[hid[yy_l]], row[hid[mm_l]], row[hid[dd_l]], row[hid[hh_l]]
                k = (yy, mm, dd, hh)
                if not hp_summary.has_key(k): continue
                hp_summary[k][ALL_DUR] += eval(row[hid['all-duration']])
                hp_summary[k][ALL_FARE] += eval(row[hid['all-fare']])
                hp_summary[k][ALL_NUM] += eval(row[hid['all-num']])

                hp_summary[k][AP_DUR] += eval(row[hid['ap-duration']])
                hp_summary[k][AP_FARE] += eval(row[hid['ap-fare']])
                hp_summary[k][AP_QUEUE] += eval(row[hid['ap-queueing-time']])
                hp_summary[k][AP_NUM] += eval(row[hid['ap-num']])

                hp_summary[k][NS_DUR] += eval(row[hid['ns-duration']])
                hp_summary[k][NS_FARE] += eval(row[hid['ns-fare']])
                hp_summary[k][NS_QUEUE] += eval(row[hid['ns-queueing-time']])
                hp_summary[k][NS_NUM] += eval(row[hid['ns-num']])

    # Summary
    print 'summary'
    zero_dur = []
    with open(hourly_stats_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile)
        header = ['yy', 'mm', 'dd', 'hh',
                    'all-num',
                        'all-total-duration', 'all-avg-duration',
                        'all-total-fare', 'all-avg-fare',
                        'all-productivity',
                    'ap-num',
                        'atotal-duration', 'aavg-duration',
                        'atotal-fare', 'aavg-fare',
                        'atotal-queueing', 'aavg-queueing',
                        'ap-productivity',
                    'ap-gen-num',
                        'ap-gtotal-duration', 'ap-gavg-duration',
                        'ap-gtotal-fare', 'ap-gavg-fare',
                        'ap-gen-productivity',
                    'ns-num',
                        'ntotal-duration', 'navg-duration',
                        'ntotal-fare', 'navg-fare',
                        'ntotal-queueing', 'navg-queueing',
                        'ns-productivity',
                    'ns-gen-num',
                        'ns-gtotal-duration', 'ns-gavg-duration',
                        'ns-gtotal-fare', 'ns-gavg-fare',
                        'ns-gen-productivity',
                     'key']
        writer.writerow(header)
        for k in time_period_order:
            all_total_dur, all_total_fare, all_num, \
            ap_total_dur, ap_total_fare, ap_total_queue, ap_num, \
            ns_total_dur, ns_total_fare, ns_total_queue, ns_num = hp_summary[k]
            #
            if all_num == 0:
                all_avg_dur, all_avg_fare = -1, -1
                all_prod = -1
            else:
                all_avg_dur, all_avg_fare = all_total_dur / float(all_num), all_total_fare / float(all_num)
                if all_total_dur == 0:
                    zero_dur.append([ALL, k])
                    all_prod = -1
                else:
                    all_prod = all_total_fare / float(all_total_dur)
            #
            yy, mm, dd, hh = k
            if ap_num == 0:
                ap_avg_dur, ap_avg_fare, ap_avg_queue = -1, -1, -1
                ap_prod = -1
            else:
                ap_avg_dur, ap_avg_fare, ap_avg_queue = \
                    ap_total_dur / float(ap_num), ap_total_fare / float(ap_num), ap_total_queue / float(ap_num)
                if ap_total_dur == 0:
                    zero_dur.append([AP, k])
                    ap_prod = -1
                else:
                    ap_prod = ap_total_fare / float(ap_total_dur)
            ap_gen_num = all_num - ap_num
            ap_gen_total_dur = all_total_dur - (ap_total_dur + ap_total_queue)
            ap_gen_total_fare = all_total_fare - ap_total_fare
            if ap_gen_num == 0:
                ap_gen_avg_dur, ap_gen_avg_fare = -1, -1
                ap_gen_prod = -1
            else:
                ap_gen_avg_dur, ap_gen_avg_fare = \
                    ap_gen_total_dur / float(ap_gen_num), ap_gen_total_fare / float(ap_gen_num)
                if ap_gen_total_dur == 0:
                    zero_dur.append([AP_GEN, k])
                    ap_gen_prod = -1
                else:
                    ap_gen_prod = ap_gen_total_fare / float(ap_gen_total_dur)
            #
            if ns_num == 0:
                ns_avg_dur, ns_avg_fare, ns_avg_queue = -1, -1, -1
                ns_prod = -1
            else:
                ns_avg_dur, ns_avg_fare, ns_avg_queue = \
                    ns_total_dur / float(ns_num), ns_total_fare / float(ns_num), ns_total_queue / float(ns_num)
                if ns_total_dur == 0:
                    zero_dur.append([NS, k])
                    ns_prod = -1
                else:
                    ns_prod = ns_total_fare / float(ns_total_dur)
            ns_gen_num = all_num - ns_num
            ns_gen_total_dur = all_total_dur - (ns_total_dur + ns_total_queue)
            ns_gen_total_fare = all_total_fare - ns_total_fare
            if ns_gen_num == 0:
                ns_gen_avg_dur, ns_gen_avg_fare = -1, -1
                ns_gen_prod = -1
            else:
                ns_gen_avg_dur, ns_gen_avg_fare = \
                    ns_gen_total_dur / float(ns_gen_num), ns_gen_total_fare / float(ns_gen_num)
                if ns_gen_total_dur == 0:
                    zero_dur.append([NS_GEN, k])
                    ns_gen_prod = -1
                else:
                    ns_gen_prod = ns_gen_total_fare / float(ns_gen_total_dur)
            #
            writer.writerow([yy, mm, dd, hh,
                             all_num,
                                all_total_dur, all_avg_dur,
                                all_total_fare, all_avg_fare,
                                all_prod,
                             ap_num,
                                ap_total_dur, ap_avg_dur,
                                ap_total_fare, ap_avg_fare,
                                ap_total_queue, ap_avg_queue,
                                ap_prod,
                             ap_gen_num,
                                ap_gen_total_dur, ap_gen_avg_dur,
                                ap_gen_total_fare, ap_gen_avg_fare,
                                ap_gen_prod,
                             ns_num,
                                ns_total_dur, ap_avg_dur,
                                ns_total_fare, ns_avg_fare,
                                ns_total_queue, ns_avg_queue,
                                ns_prod,
                             ns_gen_num,
                                ns_gen_total_dur, ap_gen_avg_dur,
                                ns_gen_total_fare, ns_gen_avg_fare,
                                ns_gen_prod,
                             k])
Пример #36
0
def run(processorNum):
    """Worker entry point: process this worker's share of influence files.

    Files are distributed round-robin over ``numWorker`` workers; this one
    handles every file whose index modulo ``numWorker`` equals
    ``processorNum``.
    """
    fnames = get_all_files(if_dpath, '%s%s*.csv' % (if_prefix, year))
    for idx, fname in enumerate(fnames):
        if idx % numWorker == processorNum:
            process_file('%s/%s' % (if_dpath, fname))
Пример #37
0
def summary():
    """Aggregate hourly productivity CSVs into ``productivity_summary_fpath``.

    Same aggregation as the hourly-stats run(): enumerates valid hourly
    buckets in 2009-2010 (skipping 2009-12, 2010-10, 2-5 AM and known error
    hours), sums duration / fare / queueing / trip counts per bucket from the
    productivity CSVs, and writes averages and fare-per-duration productivity
    for all / ap / ns trips and their complements.  -1 marks 'no data'.
    Any exception is dumped to '<script>_summary.txt' and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('Start summary')
        # Known-bad hours as a set for O(1) lookup; the original rebuilt and
        # rescanned error_hours once per hour of two years.
        ignoring_periods = set()
        for ys, ms, ds, hs in error_hours:
            yyyy = 2000 + int(ys)
            mm, dd, hh = map(int, [ms, ds, hs])
            ignoring_periods.add((yyyy, mm, dd, hh))
        cur_timestamp = datetime.datetime(2008, 12, 31, 23)
        last_timestamp = datetime.datetime(2011, 1, 1, 0)
        hp_summary, time_period_order = {}, []
        while cur_timestamp < last_timestamp:
            cur_timestamp += datetime.timedelta(hours=1)
            year, month, day, hour = cur_timestamp.year, cur_timestamp.month, cur_timestamp.day, cur_timestamp.hour
            if year == 2009 and month == 12: continue
            if year == 2010 and month == 10: continue
            if year == 2011: continue
            if AM2 <= hour and hour <= AM5: continue
            if (year, month, day, hour) in ignoring_periods: continue
            #
            k = year, month, day, hour
            hp_summary[k] = [0 for _ in range(len([ALL_DUR, ALL_FARE, ALL_NUM,
                                                   AP_DUR, AP_FARE, AP_QUEUE, AP_NUM,
                                                   NS_DUR, NS_FARE, NS_QUEUE, NS_NUM]))]
            time_period_order.append(k)
            #
        year_l, month_l, day_l, hour_l = 'year', 'month', 'day', 'hour'
        for fn in get_all_files(productivity_dpath, '%s*.csv' % productivity_prefix):
            with open('%s/%s' % (productivity_dpath, fn), 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month = int(row[hid[year_l]]), int(row[hid[month_l]])
                    day, hour = int(row[hid[day_l]]), int(row[hid[hour_l]])
                    k = (year, month, day, hour)
                    if k not in hp_summary: continue
                    # NOTE(review): eval() parses numeric fields; acceptable
                    # for trusted CSVs, unsafe on untrusted input.
                    hp_summary[k][ALL_DUR] += eval(row[hid['allDuration']])
                    hp_summary[k][ALL_FARE] += eval(row[hid['allFare']])
                    hp_summary[k][ALL_NUM] += eval(row[hid['allNum']])
                    #
                    hp_summary[k][AP_DUR] += eval(row[hid['apDuration']])
                    hp_summary[k][AP_FARE] += eval(row[hid['apFare']])
                    hp_summary[k][AP_QUEUE] += eval(row[hid['apQueueingTime']])
                    hp_summary[k][AP_NUM] += eval(row[hid['apNum']])
                    #
                    hp_summary[k][NS_DUR] += eval(row[hid['nsDuration']])
                    hp_summary[k][NS_FARE] += eval(row[hid['nsFare']])
                    hp_summary[k][NS_QUEUE] += eval(row[hid['nsQueueingTime']])
                    hp_summary[k][NS_NUM] += eval(row[hid['nsNum']])
        #
        with open(productivity_summary_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            header = ['year', 'month', 'day', 'hour',
                      'allNum',
                      'allTotalDuration', 'allAvgDuration',
                      'allTotalFare', 'allAvgFare',
                      'allProductivity',
                      'apNum',
                      'apTotalDuration', 'apAvgDuration',
                      'apTotalFare', 'apAvgFare',
                      'apTotalQueueing', 'apAvgQueueing',
                      'apProductivity',
                      'apGenNum',
                      'apGenTotalDuration', 'apGenAvgDuration',
                      'apGenTotalFare', 'apGenAvgFare',
                      'apGenProductivity',
                      'nsNum',
                      'nsTotalDuration', 'nsAvgDuration',
                      'nsTotalFare', 'nsAvgFare',
                      'nsTotalQueueing', 'nsAvgQueueing',
                      'nsProductivity',
                      'nsGenNum',
                      'nsGenTotalDuration', 'nsGenAvgDuration',
                      'nsGenTotalFare', 'nsGenAvgFare',
                      'nsGenProductivity',
                      'key']
            writer.writerow(header)
            for k in time_period_order:
                all_total_dur, all_total_fare, all_num, \
                ap_total_dur, ap_total_fare, ap_total_queue, ap_num, \
                ns_total_dur, ns_total_fare, ns_total_queue, ns_num = hp_summary[k]
                year, month, day, hour = k
                #
                # -1 marks 'no data' / 'undefined' in every derived column.
                if all_num == 0:
                    all_avg_dur, all_avg_fare = -1, -1
                    all_prod = -1
                else:
                    all_avg_dur, all_avg_fare = all_total_dur / float(all_num), all_total_fare / float(all_num)
                    if all_total_dur == 0:
                        all_prod = -1
                    else:
                        all_prod = all_total_fare / float(all_total_dur)
                if ap_num == 0:
                    ap_avg_dur, ap_avg_fare, ap_avg_queue = -1, -1, -1
                    ap_prod = -1
                else:
                    ap_avg_dur, ap_avg_fare, ap_avg_queue = \
                        ap_total_dur / float(ap_num), ap_total_fare / float(ap_num), ap_total_queue / float(ap_num)
                    if ap_total_dur == 0:
                        ap_prod = -1
                    else:
                        ap_prod = ap_total_fare / float(ap_total_dur)
                # 'General' trips: everything that is not an airport trip.
                ap_gen_num = all_num - ap_num
                ap_gen_total_dur = all_total_dur - (ap_total_dur + ap_total_queue)
                ap_gen_total_fare = all_total_fare - ap_total_fare
                if ap_gen_num == 0:
                    ap_gen_avg_dur, ap_gen_avg_fare = -1, -1
                    ap_gen_prod = -1
                else:
                    ap_gen_avg_dur, ap_gen_avg_fare = \
                        ap_gen_total_dur / float(ap_gen_num), ap_gen_total_fare / float(ap_gen_num)
                    if ap_gen_total_dur == 0:
                        ap_gen_prod = -1
                    else:
                        ap_gen_prod = ap_gen_total_fare / float(ap_gen_total_dur)
                #
                if ns_num == 0:
                    ns_avg_dur, ns_avg_fare, ns_avg_queue = -1, -1, -1
                    ns_prod = -1
                else:
                    ns_avg_dur, ns_avg_fare, ns_avg_queue = \
                        ns_total_dur / float(ns_num), ns_total_fare / float(ns_num), ns_total_queue / float(ns_num)
                    if ns_total_dur == 0:
                        ns_prod = -1
                    else:
                        ns_prod = ns_total_fare / float(ns_total_dur)
                ns_gen_num = all_num - ns_num
                ns_gen_total_dur = all_total_dur - (ns_total_dur + ns_total_queue)
                ns_gen_total_fare = all_total_fare - ns_total_fare
                if ns_gen_num == 0:
                    ns_gen_avg_dur, ns_gen_avg_fare = -1, -1
                    ns_gen_prod = -1
                else:
                    ns_gen_avg_dur, ns_gen_avg_fare = \
                        ns_gen_total_dur / float(ns_gen_num), ns_gen_total_fare / float(ns_gen_num)
                    if ns_gen_total_dur == 0:
                        ns_gen_prod = -1
                    else:
                        ns_gen_prod = ns_gen_total_fare / float(ns_gen_total_dur)
                #
                writer.writerow([year, month, day, hour,
                                 all_num,
                                 all_total_dur, all_avg_dur,
                                 all_total_fare, all_avg_fare,
                                 all_prod,
                                 ap_num,
                                 ap_total_dur, ap_avg_dur,
                                 ap_total_fare, ap_avg_fare,
                                 ap_total_queue, ap_avg_queue,
                                 ap_prod,
                                 ap_gen_num,
                                 # BUG FIX: ns columns wrote ap_avg_dur /
                                 # ap_gen_avg_dur (copy-paste error).
                                 ap_gen_total_dur, ap_gen_avg_dur,
                                 ap_gen_total_fare, ap_gen_avg_fare,
                                 ap_gen_prod,
                                 ns_num,
                                 ns_total_dur, ns_avg_dur,
                                 ns_total_fare, ns_avg_fare,
                                 ns_total_queue, ns_avg_queue,
                                 ns_prod,
                                 ns_gen_num,
                                 ns_gen_total_dur, ns_gen_avg_dur,
                                 ns_gen_total_fare, ns_gen_avg_fare,
                                 ns_gen_prod,
                                 k])
    except Exception:
        # Persist the traceback for the detached worker, then propagate.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], 'summary'), 'w') as f:
            f.write(format_exc())
        raise
Пример #38
0
def process_file(tm, year):
    """Partition one (tm, year) influence graph into driver groups.

    Merges every regression-graph pickle under the influenceGraph directory
    into one directed graph weighted by |coef|, runs Louvain partitioning,
    and writes per-group pickles, plots, coefficient CSVs and a summary CSV
    into the groupPartition directory.
    """
    ig_dpath = dpaths[tm, year, 'influenceGraph']
    ig_prefix = prefixs[tm, year, 'influenceGraph']
    gp_dpath = dpaths[tm, year, 'groupPartition']
    gp_prefix = prefixs[tm, year, 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    # Create the summary file with its header; group rows are appended later.
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity', 'tieStrength', 'contribution', 'benCon'])
    #
    logger.info('Start handling SP_group_dpath')
    orignal_graph = {}
    for fn in get_all_files(ig_dpath, '%s*' % ig_prefix):
        regression_graph = load_pickle_file('%s/%s' % (ig_dpath, fn))
        for (did0, did1), w in regression_graph.iteritems():
            orignal_graph[did0, did1] = w
    save_pickle_file(gp_original_fpath, orignal_graph)
    #
    # Build the igraph graph; vertices are added lazily as drivers appear.
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for (did0, did1), w in orignal_graph.iteritems():
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity, tie_strength, contribution, benCon])
        # Vertex labels make large plots unreadable, so only label small ones.
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        # Dump the per-group edge coefficients.
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
Пример #39
0
def process_tripBased():
    """Build per-trip night-safari statistics files for 2009 and 2010.

    For each year, reads the economic-profit CSVs and appends one row per
    trip to either the 15-17h or the 20-23h statistics file, chosen by the
    trip's hour; trips outside both time frames are dropped.  A trip is
    flagged weekEnd when its date is a public holiday or a weekend day.
    """
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)  # was logged twice
        #
        statistics1517_fpath = '%s/%s%s.csv' % (
            statisticsAllDrivers_ns_dpath,
            statisticsAllDriversTrip_ns1517_prefix, yyyy)
        statistics2023_fpath = '%s/%s%s.csv' % (
            statisticsAllDrivers_ns_dpath,
            statisticsAllDriversTrip_ns2023_prefix, yyyy)
        #
        yy = yyyy[2:]
        holidays = HOLIDAYS2009 if yyyy == '2009' else HOLIDAYS2010
        for statistics_fpath in [statistics1517_fpath, statistics2023_fpath]:
            with open(statistics_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                header = [
                    'year', 'month', 'day', 'hour', 'driverID', 'locQTime',
                    'locEP', 'locDuration', 'locFare', 'locProductivity',
                    'locIn', 'weekEnd'
                ]
                writer.writerow(header)
        # Keep one append handle per output file for the whole year instead
        # of reopening the file for every single row (was O(rows) opens).
        w1517 = open(statistics1517_fpath, 'a')
        w2023 = open(statistics2023_fpath, 'a')
        writer1517 = csv.writer(w1517, lineterminator='\n')
        writer2023 = csv.writer(w2023, lineterminator='\n')
        try:
            for fn in get_all_files(economicProfit_ns_dpath,
                                    '%s%s*' % (economicProfit_ns_prefix, yy)):
                with open('%s/%s' % (economicProfit_ns_dpath, fn),
                          'rt') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    headers = reader.next()
                    hid = {h: i for i, h in enumerate(headers)}
                    for row in reader:
                        year, month, day, hour = map(int, [
                            row[hid[cn]]
                            for cn in ['year', 'month', 'day', 'hour']
                        ])
                        # Route by time frame first; skip before doing any
                        # further per-row work.
                        if hour in tf_ns1517:
                            writer = writer1517
                        elif hour in tf_ns2023:
                            writer = writer2023
                        else:
                            continue
                        did = int(row[hid['did']])
                        locQTime = float(row[hid['queueingTime']]) / SEC60
                        locEP = float(row[hid['economicProfit']]) / CENT
                        locDuration = float(row[hid['duration']]) / SEC60
                        locFare = float(row[hid['fare']]) / CENT
                        locProductivity = (locFare /
                                           (locQTime + locDuration)) * SEC60
                        locIn = 1 if int(row[hid['tripMode']]) == DIn_PIn else 0
                        # Weekend flag also covers public holidays.
                        weekEnd = 0
                        if (year, month, day) in holidays:
                            weekEnd = 1
                        if datetime.datetime(year, month,
                                             day).weekday() in WEEKENDS:
                            weekEnd = 1
                        writer.writerow([
                            year, month, day, hour, did, locQTime, locEP,
                            locDuration, locFare, locProductivity, locIn,
                            weekEnd
                        ])
        finally:
            w1517.close()
            w2023.close()
Пример #40
0
def process_dayBased():
    """Aggregate night-safari per-trip statistics into per-day driver rows.

    For each year (2009, 2010) and time frame (15-17h, 20-23h): sums the
    filtered per-trip statistics per (year, month, day, driver), mixes in
    whole-day operating hours (shift files) and whole-day trip counts and
    fares (trip files), then writes one CSV row per driver-day with totals,
    per-trip averages and productivity.  Driver-days with zero operating
    hours, zero local trips, or zero total time are dropped.
    """
    logger.info('handle dayBased')
    #
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        statistics1517_fpath = '%s/%s%s.csv' % (
            statisticsAllDrivers_ns_dpath,
            statisticsAllDriversDay_ns1517_prefix, yyyy)
        statistics2023_fpath = '%s/%s%s.csv' % (
            statisticsAllDrivers_ns_dpath,
            statisticsAllDriversDay_ns2023_prefix, yyyy)
        #
        dateDid_statistics1517, dateDid_statistics2023 = {}, {}
        logger.info('process locTrip')
        for ns_prefix, dateDid_statistics in [
            (statisticsAllDriversTrip_ns1517_prefix, dateDid_statistics1517),
            (statisticsAllDriversTrip_ns2023_prefix, dateDid_statistics2023)
        ]:
            tripBased_fpath = '%s/Filtered-%s%s.csv' % (
                statisticsAllDrivers_ns_dpath, ns_prefix, yyyy)
            with open(tripBased_fpath, 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day = map(
                        int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                    did = int(row[hid['driverID']])
                    k = (year, month, day, did)
                    if k not in dateDid_statistics:
                        dateDid_statistics[k] = [
                            0.0 for _ in
                            [WTN, WOH, WF, LTN, LIN, LON, LQ, LEP, LD, LF]
                        ]
                    dateDid_statistics[k][LTN] += 1
                    if int(row[hid['locIn']]) == 1:
                        dateDid_statistics[k][LIN] += 1
                    else:
                        assert int(row[hid['locIn']]) == 0
                        dateDid_statistics[k][LON] += 1
                    dateDid_statistics[k][LQ] += float(row[hid['locQTime']])
                    dateDid_statistics[k][LEP] += float(row[hid['locEP']])
                    dateDid_statistics[k][LD] += float(row[hid['locDuration']])
                    dateDid_statistics[k][LF] += float(row[hid['locFare']])
        yy = yyyy[2:]
        logger.info('process shift')
        for fn in get_all_files(shiftProDur_dpath,
                                '%s%s*' % (shiftProDur_prefix, yy)):
            logger.info('shift; %s' % fn)
            with open('%s/%s' % (shiftProDur_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day, hour = 2000 + int(row[hid['yy']]), int(
                        row[hid['mm']]), int(row[hid['dd']]), int(
                            row[hid['hh']])
                    if hour in tf_ns1517:
                        dateDid_statistics = dateDid_statistics1517
                    elif hour in tf_ns2023:
                        dateDid_statistics = dateDid_statistics2023
                    else:
                        continue
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if k not in dateDid_statistics:
                        continue  # driver-day had no local trips
                    dateDid_statistics[k][WOH] += (float(row[hid['pro-dur']]) *
                                                   SEC60) / SEC3600

        logger.info('process trip')
        for fn in get_all_files(trip_dpath, '%s%s*' % (trip_prefix, yy)):
            logger.info('Trip; %s' % fn)
            _, yymm = fn[:-len('.csv')].split('-')
            # NOTE(review): this rebinds yy/mm inside the loop; harmless here
            # because the glob above was already evaluated, but fragile.
            yy, mm = yymm[:2], yymm[-2:]
            year, month = 2000 + int(yy), int(mm)
            with open('%s/%s' % (trip_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    day, hour = int(row[hid['day']]), int(row[hid['hour']])
                    if hour in tf_ns1517:
                        dateDid_statistics = dateDid_statistics1517
                    elif hour in tf_ns2023:
                        dateDid_statistics = dateDid_statistics2023
                    else:
                        continue
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if k not in dateDid_statistics:
                        continue
                    dateDid_statistics[k][WTN] += 1
                    dateDid_statistics[k][WF] += float(row[hid['fare']]) / CENT
        #
        # BUG FIX: was '%s' % yymm -- a NameError when no trip file matched
        # for this year, and the wrong label (last month) even when one did.
        logger.info('write statistics; %s' % yyyy)
        for statistics_fpath, dateDid_statistics in [
            (statistics1517_fpath, dateDid_statistics1517),
            (statistics2023_fpath, dateDid_statistics2023)
        ]:
            with open(statistics_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                header = [
                    'year', 'month', 'day', 'driverID', 'wleTripNumber',
                    'wleOperatingHour', 'wleFare', 'wleProductivity',
                    'locTripNumber', 'locInNumber', 'locOutNumber', 'locQTime',
                    'locEP', 'locDuration', 'locFare', 'QTime/locTrip',
                    'EP/locTrip', 'locProductivity'
                ]
                writer.writerow(header)
                for (year, month, day,
                     did), statistics in dateDid_statistics.iteritems():
                    wleTripNumber, wleOperatingHour, wleFare = int(
                        statistics[WTN]), statistics[WOH], statistics[WF]
                    if wleOperatingHour == 0.0:
                        continue
                    wleProductivity = wleFare / wleOperatingHour
                    #
                    locTripNumber, locInNumber, locOutNumber = map(
                        int,
                        [statistics[LTN], statistics[LIN], statistics[LON]])
                    if locTripNumber == 0.0:
                        continue
                    locQTime, locEP, locDuration, locFare = statistics[
                        LQ], statistics[LEP], statistics[LD], statistics[LF]
                    if (locQTime + locDuration) == 0.0:
                        continue
                    QTime_locTrip, EP_locTrip = locQTime / float(
                        locTripNumber), locEP / float(locTripNumber)
                    locProductivity = (locFare /
                                       (locQTime + locDuration)) * SEC60
                    new_row = [
                        year, month, day, did, wleTripNumber, wleOperatingHour,
                        wleFare, wleProductivity, locTripNumber, locInNumber,
                        locOutNumber, locQTime, locEP, locDuration, locFare,
                        QTime_locTrip, EP_locTrip, locProductivity
                    ]
                    writer.writerow(new_row)
def process_dayBased():
    """Aggregate per-trip records into one day/driver statistics CSV per year.

    For each year (2009, 2010) this reads three inputs:
      * the filtered trip-based statistics file (location trips, queue
        times, earnings, durations, fares, and drop#pick columns),
      * the shift productive-duration files (operating hours),
      * the raw trip files (whole-day trip counts and fares),
    accumulates totals keyed by (year, month, day, driverID), and writes
    one row per day/driver to the day-based statistics file, including one
    'D<l0>#P<l1>' count column per location pair.

    Day/driver keys are created only while reading the trip-based file;
    shift and trip rows for unseen keys are ignored.  Rows with zero
    operating hours, zero location trips, or zero (queue + duration) time
    are skipped because their ratio statistics would be undefined.
    """
    logger.info('handle dayBased')
    #
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        statistics_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversDay_ap_prefix, yyyy)
        #
        # (year, month, day, driverID) -> accumulated statistics / DP counts
        dateDid_statistics = {}
        dateDid_DP = {}
        # Deterministic ordering of the drop#pick columns; the same list is
        # used for the header and for each row, so columns always line up.
        dp_columns = ['D%s#P%s' % (l0, l1) for l0 in locations for l1 in locations]
        dp_index = {cn: i for i, cn in enumerate(dp_columns)}
        tripBased_fpath = '%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversTrip_ap_prefix, yyyy)
        logger.info('process locTrip')
        with open(tripBased_fpath, 'rt') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                year, month, day = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                did = int(row[hid['driverID']])
                k = (year, month, day, did)
                if k not in dateDid_statistics:
                    dateDid_statistics[k] = [0.0 for _ in [WTN, WOH, WF, LTN, LIN, LON, LQ, LEP, LD, LF]]
                    dateDid_DP[k] = [0 for _ in range(len(dp_index))]
                dateDid_statistics[k][LTN] += 1
                # locIn is a 0/1 flag separating in-bound from out-bound trips
                if int(row[hid['locIn']]) == 1:
                    dateDid_statistics[k][LIN] += 1
                else:
                    assert int(row[hid['locIn']]) == 0
                    dateDid_statistics[k][LON] += 1
                dateDid_statistics[k][LQ] += float(row[hid['locQTime']])
                dateDid_statistics[k][LEP] += float(row[hid['locEP']])
                dateDid_statistics[k][LD] += float(row[hid['locDuration']])
                dateDid_statistics[k][LF] += float(row[hid['locFare']])
                #
                dp_counts = dateDid_DP[k]
                for cn in dp_columns:
                    dp_counts[dp_index[cn]] += int(row[hid[cn]])
        #
        yy = yyyy[2:]
        logger.info('process shift')
        for fn in get_all_files(shiftProDur_dpath, '%s%s*' % (shiftProDur_prefix, yy)):
            logger.info('shift; %s' % fn)
            with open('%s/%s' % (shiftProDur_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if k not in dateDid_statistics:
                        # only day/driver keys seen in the trip-based file count
                        continue
                    # pro-dur scaled by SEC60/SEC3600 to obtain operating hours
                    dateDid_statistics[k][WOH] += (float(row[hid['pro-dur']]) * SEC60) / SEC3600
        #
        logger.info('process trip')
        for fn in get_all_files(trip_dpath, '%s%s*' % (trip_prefix, yy)):
            logger.info('Trip; %s' % fn)
            with open('%s/%s' % (trip_dpath, fn), 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    year, month, day = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day']])
                    did = int(row[hid['did']])
                    k = (year, month, day, did)
                    if k not in dateDid_statistics:
                        continue
                    dateDid_statistics[k][WTN] += 1
                    # fares are recorded in cents; convert to currency units
                    dateDid_statistics[k][WF] += float(row[hid['fare']]) / CENT
        #
        logger.info('write statistics; %s' % yyyy)
        with open(statistics_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'driverID',
                      'wleTripNumber', 'wleOperatingHour', 'wleFare',
                      'wleProductivity',
                      'locTripNumber', 'locInNumber', 'locOutNumber',
                      'locQTime', 'locEP', 'locDuration', 'locFare',
                      'QTime/locTrip', 'EP/locTrip',
                      'locProductivity'] + dp_columns
            writer.writerow(header)
            for (year, month, day, did), statistics in dateDid_statistics.iteritems():
                wleTripNumber = int(statistics[WTN])
                wleOperatingHour, wleFare = statistics[WOH], statistics[WF]
                if wleOperatingHour == 0.0:
                    # productivity undefined without operating hours
                    continue
                wleProductivity = wleFare / wleOperatingHour
                #
                locTripNumber, locInNumber, locOutNumber = map(int, [statistics[LTN], statistics[LIN], statistics[LON]])
                if locTripNumber == 0.0:
                    continue
                locQTime, locEP, locDuration, locFare = statistics[LQ], statistics[LEP], statistics[LD], statistics[LF]
                if (locQTime + locDuration) == 0.0:
                    continue
                QTime_locTrip = locQTime / float(locTripNumber)
                EP_locTrip = locEP / float(locTripNumber)
                locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                new_row = [year, month, day, did,
                           wleTripNumber, wleOperatingHour, wleFare,
                           wleProductivity,
                           locTripNumber, locInNumber, locOutNumber,
                           locQTime, locEP, locDuration, locFare,
                           QTime_locTrip, EP_locTrip,
                           locProductivity]
                new_row += [dateDid_DP[(year, month, day, did)][dp_index[cn]] for cn in dp_columns]
                writer.writerow(new_row)
Пример #42
0
def process_file(tm, year, gn, groupDrivers):
    """Collect the trips of one driver group into the group's CSV files.

    Scans the filtered prev-drivers files of the given year and, for every
    trip made by a driver in `groupDrivers`, writes one row (time, zone
    pair, the `tm` measure, prior-presence flag, trip attributes) to the
    group-trips file.  Whenever another group member appears among the
    trip's previous drivers, a pairing row is also written to the
    'num-' companion file and the trip is flagged with O_PRESENCE.

    tm -- measure name; selects output paths/prefixes and the input column
          copied into the output
    year -- four-digit year string, e.g. '2009'
    gn -- group name, used in the output file names
    groupDrivers -- set of driver ids belonging to the group
    """
    logger.info('handle the file; %s-%s-%s' % (tm, year, gn))
    gt_dpath = dpaths[tm, year, 'groupTrips']
    gt_prefix = prefixs[tm, year, 'groupTrips']
    gt_fpath = '%s/%s%s.csv' % (gt_dpath, gt_prefix, gn)
    num_gt_fpath = '%s/%snum-%s.csv' % (gt_dpath, gt_prefix, gn)
    yy = year[2:]
    # Keep both output files open for the whole run instead of reopening
    # them in append mode once per input row (the original per-row opens
    # were a large, accidental I/O cost); the written bytes are identical.
    with open(gt_fpath, 'wt') as gt_file, open(num_gt_fpath, 'wt') as num_file:
        gt_writer = csv.writer(gt_file, lineterminator='\n')
        gt_writer.writerow(['time', 'year', 'month', 'day', 'hour',
                            'did', 'groupName',
                            'zi', 'zj', 'zizj',
                            tm, 'priorPresence',
                            'start-long', 'start-lat',
                            'distance', 'duration', 'fare'])
        num_writer = csv.writer(num_file, lineterminator='\n')
        num_writer.writerow(['groupName', 'did0', 'did1',
                             'zi', 'zj', 'zizj',
                             'time', 'year', 'month', 'day', 'hour'])
        for fn in get_all_files(prevDriversDefined_dpath, 'Filtered-%s%s*' % (prevDriversDefined_prefix, yy)):
            fpath = '%s/%s' % (prevDriversDefined_dpath, fn)
            logger.info('handle the file %s; %s-%s-%s' % (fn, tm, year, gn))
            with open(fpath, 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                header = reader.next()
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    did1 = int(row[hid['did']])
                    if did1 not in groupDrivers:
                        continue
                    tm_value = row[hid[tm]]
                    t, month, day, hour = [row[hid[cn]] for cn in ['time', 'month', 'day', 'timeFrame']]
                    zi, zj = row[hid['zi']], row[hid['zj']]
                    zizj = '%s#%s' % (zi, zj)
                    _prevDrivers = row[hid['prevDrivers']].split('&')
                    priorPresence = X_PRESENCE
                    # an empty prevDrivers column splits to [''] -> no presence
                    if _prevDrivers != ['']:
                        prevDrivers = map(int, _prevDrivers)
                        # check every OTHER group member against the trip's
                        # previous drivers; each match gets its own pairing row
                        for did0 in groupDrivers.difference(set([did1])):
                            if did0 in prevDrivers:
                                num_writer.writerow([gn, did0, did1,
                                                     zi, zj, zizj,
                                                     t, year, month, day, hour])
                                priorPresence = O_PRESENCE
                    new_row = [t, year, month, day, hour,
                               did1, gn,
                               zi, zj, zizj,
                               tm_value, priorPresence]
                    for cn in ['start-long', 'start-lat', 'distance', 'duration', 'fare']:
                        new_row.append(row[hid[cn]])
                    gt_writer.writerow(new_row)