Example No. 1
def process_file(fpath):
    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        count_graph_dpath = dpaths['baseline', '2009', 'countGraph']
        count_graph_prefix = prefixs['baseline', '2009', 'countGraph']
        count_graph_fpath = '%s/%s%s.pkl' % (count_graph_dpath, count_graph_prefix, reducerID)
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        count_graph = {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' % (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)

            did1_df = did1_df.drop(['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did', 'spendingTime'], axis=1)
            if '%d' % did1 in did1_df.columns:
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            for _did0, numPriorPresence in did1_df.sum().iteritems():
                if numPriorPresence == 0:
                    continue
                count_graph[int(_did0), did1] = numPriorPresence
        #
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(count_graph_fpath, count_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)), 'w') as f:
            f.write(format_exc())
        raise
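A note on how workers like this are typically driven: process_file takes one reducer CSV and writes its own pickle, so a natural driver is a process pool over all matching files. A minimal sketch, assuming the get_all_files helper used throughout these examples; run_all_reducers and the pool size are hypothetical:

import multiprocessing

def run_all_reducers(csv_dpath, csv_prefix, num_workers=4):
    # collect every reducer CSV and map the worker over them in parallel;
    # each worker pickles its own count graph independently
    fpaths = ['%s/%s' % (csv_dpath, fn)
              for fn in get_all_files(csv_dpath, '%s*.csv' % csv_prefix)]
    pool = multiprocessing.Pool(num_workers)
    pool.map(process_file, fpaths)
    pool.close()
    pool.join()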
Example No. 2
def get_driver_trajectory(did):
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        dt_xy_state = load_pickle_file(ofpath)
    else:
        dates = []
        for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
            _, _date, _did = fn[:-len('.csv')].split('-')
            if int(_did) != did:
                continue
            year = 2000 + int(_date[:2])
            month, day = map(int, [_date[2:4], _date[4:6]])
            dt = datetime.datetime(year, month, day)
            dates += [dt]
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
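A hypothetical usage sketch: the returned trajectory is a list of (datetime, x, y, state) tuples, so callers can unpack records directly (the driver id below is made up):

traj = get_driver_trajectory(510)  # 510 is a made-up driver id
state_counts = {}
for _dt, _x, _y, state in traj:
    state_counts[state] = state_counts.get(state, 0) + 1
print state_counts  # logs per state code from the raw CSV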
Example No. 3
def process_file(yymm):
    ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
        return None
    print 'handle the file; %s' % yymm
    veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
    veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
    if yymm not in ['0901', '1001', '1011']:
        path_to_last_day_csv_file = None
        temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix,
                                       '.csv')
        prev_fn = None
        y, m = int(yymm[:2]), int(yymm[2:])
        prev_m = m - 1
        prev_yymm = '%02d%02d' % (y, prev_m)
        for temp_fn in temp_csv_files:
            if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)):
                prev_fn = temp_fn
                break
        assert prev_fn, yymm
        path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn)
        # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1:
        #     return None
        veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                        record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                             veh_ns_crossing_time, veh_last_log_ns_or_not)
    path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
            record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
    #
    save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
    save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
    print 'end the file; %s' % yymm
def only_1001():
    yymm = '1001'
    id_fpath = '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, yymm)
    trip_fpath = '%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversTrip_ap_prefix, '2010')
    df = pd.read_csv(trip_fpath)
    drivers = set(df['driverID'])
    intelDrivers = {}
    for did in drivers:
        didm_df = df[(df['driverID'] == did) & (df['month'] == 1)].copy(deep=True)
        hours = set(didm_df['hour'])
        dummiesH = []
        for h in hours:
            hour_str = 'H%02d' % h
            didm_df[hour_str] = np.where(didm_df['hour'] == h, 1, 0)
            dummiesH.append(hour_str)
        df_residual = len(didm_df) - (len(dummiesH) + 1)
        if df_residual / float(len(didm_df)) < min_df_residual_ratio:
            intelDrivers[did] = (len(didm_df), 'X')
            continue
        y = didm_df['locQTime']
        X = didm_df[dummiesH[:-1] + ['locIn']]
        X = sm.add_constant(X)
        res = sm.OLS(y, X, missing='drop').fit()
        if res.pvalues['locIn'] < sig_level:
            intelDrivers[did] = (len(didm_df), res.params['locIn'])
        else:
            intelDrivers[did] = (len(didm_df), 'X')
    save_pickle_file(id_fpath, intelDrivers)
Example No. 5
def find_intelligentDrivers():
    idb_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, 'both')
    with open(idb_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['', 'Y2009', 'Y2010',
                  'significance level %.2f' % sig_level, 'minDfResidualRatio %.2f' % min_df_residual_ratio]
        writer.writerow(header)
    regressionClassification = {}
    for y in range(9, 11):
        year = '20%02d' % y
        id_fpath = '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, year)
        trip_fpath = '%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversTrip_ap_prefix, year)
        df = pd.read_csv(trip_fpath)
        drivers = set(df['driverID'])
        intelDrivers = {}
        for mes in ['smallObs', 'sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']:
            regressionClassification[year, mes] = 0
        for did in drivers:
            did_df = df[(df['driverID'] == did)].copy(deep=True)
            months = set(did_df['month'])
            hours = set(did_df['hour'])
            dummiesM = []
            for m in months:
                month_str = 'M%02d' % m
                did_df[month_str] = np.where(did_df['month'] == m, 1, 0)
                dummiesM.append(month_str)
            dummiesH = []
            for h in hours:
                hour_str = 'H%02d' % h
                did_df[hour_str] = np.where(did_df['hour'] == h, 1, 0)
                dummiesH.append(hour_str)
            df_residual = len(did_df) - (len(dummiesM) + len(dummiesH) + 1)
            if df_residual / float(len(did_df)) < min_df_residual_ratio:
                intelDrivers[did] = (len(did_df), 'X')
                regressionClassification[year, 'smallObs'] += 1
                continue
            y = did_df['locQTime']
            X = did_df[dummiesM[:-1] + dummiesH[:-1] + ['locIn']]
            X = sm.add_constant(X)
            res = sm.OLS(y, X, missing='drop').fit()
            if res.pvalues['locIn'] < sig_level:
                intelDrivers[did] = (len(did_df), res.params['locIn'])
                if res.params['locIn'] > 0:
                    regressionClassification[year, 'sigPos'] += 1
                else:
                    regressionClassification[year, 'sigNeg'] += 1
            else:
                intelDrivers[did] = (len(did_df), 'X')
                if res.params['locIn'] > 0:
                    regressionClassification[year, 'XsigPos'] += 1
                else:
                    regressionClassification[year, 'XsigNeg'] += 1
        save_pickle_file(id_fpath, intelDrivers)
    #
    with open(idb_fpath, 'a') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        for mes in ['smallObs', 'sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']:
            new_row = [mes, regressionClassification['2009', mes], regressionClassification['2010', mes]]
            writer.writerow(new_row)
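What the per-driver regressions in only_1001 and find_intelligentDrivers estimate: locQTime on month/hour dummies plus the locIn indicator, keeping the locIn coefficient only when it is significant. A self-contained toy version on synthetic data, assuming only pandas, numpy, and statsmodels:

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.RandomState(0)
n = 200
toy = pd.DataFrame({'hour': rng.randint(0, 24, n),
                    'locIn': rng.randint(0, 2, n)})
toy['locQTime'] = 5.0 - 1.2 * toy['locIn'] + rng.randn(n)
dummies = pd.get_dummies(toy['hour'], prefix='H')
# drop the last dummy as the base category, mirroring dummiesH[:-1] above
X = sm.add_constant(pd.concat([dummies.iloc[:, :-1], toy['locIn']], axis=1))
res = sm.OLS(toy['locQTime'], X, missing='drop').fit()
print res.params['locIn'], res.pvalues['locIn']  # recovers a negative, significant effect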
Example No. 8
def process_file(fpath):
    logger.info('Start handling; %s' % fpath)
    _, _, _, _did1 = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        ofpath = '%s/%s%s-%s.csv' % (of_dpath, of_prefix, year, _did1)
        sig_fpath = '%s/%ssigRelation-%s-%s.pkl' % (of_dpath, of_prefix, year, _did1)
        if check_path_exist(ofpath):
            return None
        with open(ofpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['did',
                      'numObservations', 'numPrevDrivers',
                      'numSigRelationship',
                      'numPosCoef', 'numNegCoef',
                      'sigPosRelation', 'sigNegRelation']
            writer.writerow(header)
        #
        logger.info('Start loading; %s-%s' % (year, _did1))
        df = pd.read_csv(fpath)
        numObservations = len(df)
        did1_df = df.drop(['month', 'day', 'hour', 'zi', 'zj', 'did'], axis=1)
        if _did1 in did1_df.columns:
            did1_df = did1_df.drop([_did1], axis=1)
        prevDrivers = [cn for cn in did1_df.columns if cn != depVar]
        numPrevDrivers = len(prevDrivers)
        #
        sigRelatioin = {k: [] for k in ['pos', 'neg']}
        for _did0 in prevDrivers:
            num_encouters = sum(did1_df[_did0])
            if num_encouters < numObservations * MIN_PICKUP_RATIO:
                continue
            # if len(did1_df) - 1 == sum(did1_df[_did0]) or sum(did1_df[_did0]) == 0:
            #     continue
            y = did1_df[depVar]
            X = did1_df[[_did0]]
            X = sm.add_constant(X)
            res = sm.OLS(y, X, missing='drop').fit()
            pv = res.pvalues[_did0]
            coef = res.params[_did0]
            if pv < SIGINIFICANCE_LEVEL:
                if coef < 0:
                    sigRelatioin['neg'] += [(_did0, coef)]
                elif coef > 0:
                    sigRelatioin['pos'] += [(_did0, coef)]
        with open(ofpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [_did1,
                       numObservations, numPrevDrivers,
                       len(sigRelatioin['pos']) + len(sigRelatioin['neg']),
                       len(sigRelatioin['pos']), len(sigRelatioin['neg']),
                       '&'.join([_did0 for _did0, _ in sigRelatioin['pos']]), '&'.join([_did0 for _did0, _ in sigRelatioin['neg']])]
            writer.writerow(new_row)
        save_pickle_file(sig_fpath, sigRelatioin)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, _did1)), 'w') as f:
            f.write(format_exc())
        raise
    logger.info('End handling; %s' % fpath)
Example No. 9
def get_sgBoarder_xy():
    fpath = 'sgBorder_xy.pkl'
    if not check_path_exist(fpath):
        sgBorder_xy = []
        for lon, lat in sg_border:
            x, y = convert_GPS2xy(lon, lat)
            sgBorder_xy += [(x, y)]
        save_pickle_file(fpath, sgBorder_xy)
    else:
        sgBorder_xy = load_pickle_file(fpath)
    return sgBorder_xy
Example No. 10
def get_sgZones():
    ofpath = 'sgZone.pkl'
    if check_path_exist(ofpath):
        sgZones = load_pickle_file(ofpath)
    else:
        sgZones = get_sg_zones()
        for z in sgZones.values():
            z.cCoor_xy = convert_GPS2xy(*z.cCoor_gps)
            z.polyPoints_xy = [convert_GPS2xy(*gps_coord) for gps_coord in z.polyPoints_gps]
            z.marked = False
        save_pickle_file(ofpath, sgZones)
    return sgZones
Example No. 11
def get_sgRoards_xy():
    ofpath = 'sgRoards_xy.pkl'
    if check_path_exist(ofpath):
        sgRoards_xy = load_pickle_file(ofpath)
    else:
        sgRoards_xy = []
        for _, coords in get_SG_roads():
            road_fd = []
            for lon, lat in coords:
                road_fd += [convert_GPS2xy(lon, lat)]
            sgRoards_xy += [road_fd]
        save_pickle_file(ofpath, sgRoards_xy)
    return sgRoards_xy
Example No. 12
def ns_productivity_economical_profit():
    #
    # drivers who operate taxi in both years
    #
    df = dfs[Y09_PINS]
    df = df[((df['prod'] - df['prod'].mean()) / df['prod'].std()).abs() < 3]
    df = df[((df['eco-profit'] - df['eco-profit'].mean()) / df['eco-profit'].std()).abs() < 3]
    ns_full_drivers = set(df['did'])
    for i in [Y10_PINS, Y09_PONS, Y10_PONS]:
        df = dfs[i]
        df = df[((df['prod'] - df['prod'].mean()) / df['prod'].std()).abs() < 3]
        df = df[((df['eco-profit'] - df['eco-profit'].mean()) / df['eco-profit'].std()).abs() < 3]
        ns_full_drivers = ns_full_drivers.intersection(set(df['did']))
    #
    save_pickle_file(ftd_gen_prod_db_for_ns, general_productivities(ns_full_drivers))
    save_pickle_file(ftd_ns_prod_eco_prof_db, get_driver_average(ns_full_drivers, [Y09_PINS, Y10_PINS, Y09_PONS, Y10_PONS]))
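The filters above (and again in Example No. 17) trim 3-sigma outliers before intersecting driver sets. The z-score idiom in isolation, on a toy Series where the extreme point is dropped; note the original's ~(... > ...) form additionally keeps NaN rows:

import numpy as np
import pandas as pd

s = pd.Series(range(100) + [10000.0])  # Python 2 list concatenation
kept = s[np.abs(s - s.mean()) <= 3 * s.std()]  # drops the 10000.0 outlier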
Example No. 13
def run():
    Y09_monthly_fare, Y10_monthly_fare = [], []
    for y in xrange(9, 11):
        for m in xrange(1, 13):
            yymm = '%02d%02d' % (y, m) 
            if yymm in ['0912', '1010']:
                continue
            trip_df = pd.read_csv('%s/%s%s.csv' % (trips_dir, trip_prefix, yymm))
            trip_df = trip_df[(trip_df['did'] != -1)]
            #
            fares = [x / float(CENT) for x in list(trip_df.groupby(['did']).sum()['fare'])]
            if y == 9:
                Y09_monthly_fare += fares
            else:
                Y10_monthly_fare += fares
    save_pickle_file(driver_monthly_fare_fn, [Y09_monthly_fare, Y10_monthly_fare])
Example No. 14
def get_sgGrid_xy():
    ofpath = 'sgGrid_xy.pkl'
    if check_path_exist(ofpath):
        sgGrid_xy = load_pickle_file(ofpath)
    else:
        sgGrid_xy = []
        lons, lats = generate_sg_grid()
        for lon in lons:
            sx, sy = convert_GPS2xy(lon, lats[0])
            ex, ey = convert_GPS2xy(lon, lats[-1])
            sgGrid_xy += [[(sx, sy), (ex, ey)]]
        for lat in lats:
            sx, sy = convert_GPS2xy(lons[0], lat)
            ex, ey = convert_GPS2xy(lons[-1], lat)
            sgGrid_xy += [[(sx, sy), (ex, ey)]]
        save_pickle_file(ofpath, sgGrid_xy)
    return sgGrid_xy
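Examples No. 9, 10, 11, and 14 all follow the same compute-or-load caching idiom. A generic sketch of that pattern, assuming the same pickle helpers (check_path_exist, load_pickle_file, save_pickle_file) used throughout; cached and build_sgGrid_xy are hypothetical names:

def cached(ofpath, compute):
    # return the pickled result if present, otherwise build and pickle it
    if check_path_exist(ofpath):
        return load_pickle_file(ofpath)
    result = compute()
    save_pickle_file(ofpath, result)
    return result

# e.g. sgGrid_xy = cached('sgGrid_xy.pkl', build_sgGrid_xy)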
Example No. 15
def find_driversRelations(year):
    yy = year[2:]
    driversRelations = {}
    for fn in get_all_files(prevDriversDefined_dpath, 'Filtered-%s%s*' %
                            (prevDriversDefined_prefix, yy)):
        logger.info('handle the file; %s' % fn)
        with open('%s/%s' % (prevDriversDefined_dpath, fn), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did1 = int(row[hid['did']])
                prevDrivers = row[hid['prevDrivers']].split('&')
                if len(prevDrivers) == 1 and prevDrivers[0] == '':
                    continue
                if not driversRelations.has_key(did1):
                    driversRelations[did1] = set()
                for did0 in map(int, prevDrivers):
                    driversRelations[did1].add(did0)
    save_pickle_file(driversRelations_fpaths[year], driversRelations)
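The has_key-then-initialize dance above is what collections.defaultdict provides directly; an equivalent sketch of the adjacency build, on toy rows:

from collections import defaultdict

driversRelations = defaultdict(set)
for did1, prevDrivers in [(1, ['2', '3']), (2, ['3'])]:  # toy rows
    for did0 in map(int, prevDrivers):
        driversRelations[did1].add(did0)
# driversRelations == {1: set([2, 3]), 2: set([3])}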
Example No. 17
def process_file(tm, year, gt_fpath):
    gz_dpath = dpaths[tm, year, 'groupZones']
    gz_prefix = prefixs[tm, year, 'groupZones']
    df = pd.read_csv(gt_fpath)
    assert len(set(df['groupName'])) == 1
    gn = df['groupName'][0]
    gz_fpath = '%s/%s%s.pkl' % (gz_dpath, gz_prefix, gn)
    #
    df = df[~(np.abs(df[tm] - df[tm].mean()) > (3 * df[tm].std()))]
    groupZones = {}
    for zizj, pp_num in df.groupby(['zizj']).sum()['priorPresence'].iteritems():
        if pp_num < 2:
            continue
        zizj_df = df[(df['zizj'] == zizj)]
        y = zizj_df[tm]
        X = zizj_df['priorPresence']
        X = sm.add_constant(X)
        res = sm.OLS(y, X, missing='drop').fit()
        if res.params['priorPresence'] < 0 and res.pvalues['priorPresence'] < sig_level:
            groupZones[zizj] = res.params['priorPresence']
    save_pickle_file(gz_fpath, groupZones)
Example No. 20
def run():
    drivers_dates = {}
    for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
        _, _date, _did = fn[:-len('.csv')].split('-')
        year = 2000 + int(_date[:2])
        month, day = map(int, [_date[2:4], _date[4:6]])
        dt = datetime.datetime(year, month, day)
        k = int(_did)
        if not drivers_dates.has_key(k):
            drivers_dates[k] = []
        drivers_dates[k] += [dt]
    #
    for did, dates in drivers_dates.iteritems():
        ofpath = '%s%d.pkl' % (if_prefix, did)
        if check_path_exist(ofpath):
            continue
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    # append one 4-tuple per log, matching the record format
                    # read back by get_driver_trajectory in Example No. 2
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
Example No. 21
def run():
    for path in [
            ftd_general_prod_mb, ftd_ap_prod_eco_prof_mb,
            ftd_ns_prod_eco_prof_mb
    ]:
        remove_file(path)
    #
    save_pickle_file(ftd_general_prod_mb, general_productivity())
    save_pickle_file(ftd_ap_prod_eco_prof_mb,
                     ap_productivity_economical_profit())
    save_pickle_file(ftd_ns_prod_eco_prof_mb,
                     ns_productivity_economical_profit())
Example No. 22
def process_file(fpath):
    def regression(dv, df):
        oc_dv = 'roamingTime' if dv == 'spendingTime' else 'spendingTime'
        rdf = df.copy(deep=True).drop([oc_dv], axis=1)
        candi_dummies = []
        num_iter = 1
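        # Screen regressors: keep dummy columns that are nonzero in more than
        # len(rdf) * MIN_PICKUP_RATIO * num_iter rows; if that still leaves at
        # least as many regressors as observations, clear the list, raise the
        # threshold, and rescan.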
        while True:
            for i, vs in enumerate(zip(*rdf.values)):
                if rdf.columns[i] == dv:
                    continue
                if sum(vs) > len(rdf) * MIN_PICKUP_RATIO * num_iter:
                    candi_dummies.append(rdf.columns[i])
            if len(rdf) <= len(candi_dummies):
                candi_dummies = []
                num_iter += 1
            else:
                break
        y = rdf[dv]
        X = rdf[candi_dummies]
        X = sm.add_constant(X)
        return sm.OLS(y, X, missing='drop').fit()

    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        st_graph_dpath = dpaths['spendingTime', year, 'influenceGraph']
        st_graph_prefix = prefixs['spendingTime', year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix,
                                          reducerID)
        rt_graph_dpath = dpaths['roamingTime', year, 'influenceGraph']
        rt_graph_prefix = prefixs['roamingTime', year, 'influenceGraph']
        RP_graph_fpath = '%s/%s%s.pkl' % (rt_graph_dpath, rt_graph_prefix,
                                          reducerID)
        if check_path_exist(SP_graph_fpath):
            return None
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        SP_graph, RP_graph = {}, {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' %
                            (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            did1_df = did1_df.drop(['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did'], axis=1)
            if '%d' % did1 in did1_df.columns:
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            SP_res = regression('spendingTime', did1_df)
            if SP_res.f_pvalue < SIGINIFICANCE_LEVEL:
                significant_drivers = set()
                for _did0, pv in SP_res.pvalues.iteritems():
                    if _did0 == 'const':
                        continue
                    if pv < SIGINIFICANCE_LEVEL:
                        significant_drivers.add(_did0)
                positive_ef_drivers = set()
                for _did0, cof in SP_res.params.iteritems():
                    if _did0 == 'const':
                        continue
                    if cof > 0:
                        positive_ef_drivers.add(_did0)
                for _did0 in significant_drivers.difference(positive_ef_drivers):
                    SP_graph[int(_did0), did1] = SP_res.params[_did0]
            #
            # RP_res = regression('roamingTime', did1_df)
            # if RP_res.f_pvalue < SIGINIFICANCE_LEVEL:
            #     significant_drivers = set()
            #     for _did0, pv in RP_res.pvalues.iteritems():
            #         if _did0 == 'const':
            #             continue
            #         if pv < SIGINIFICANCE_LEVEL:
            #             significant_drivers.add(_did0)
            #     positive_ef_drivers = set()
            #     for _did0, cof in RP_res.params.iteritems():
            #         if _did0 == 'const':
            #             continue
            #         if cof > 0:
            #             positive_ef_drivers.add(_did0)
            #     for _did0 in significant_drivers.difference(positive_ef_drivers):
            #         RP_graph[int(_did0), did1] = RP_res.params[_did0]
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
        # save_pickle_file(RP_graph_fpath, RP_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)),
                  'w') as f:
            f.write(format_exc())
        raise
Example No. 23
def process_file(fpath):
    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        tm = 'spendingTime'
        st_graph_dpath = dpaths[tm, year, 'influenceGraph']
        st_graph_prefix = prefixs[tm, year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix,
                                          reducerID)
        if check_path_exist(SP_graph_fpath):
            return None
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        SP_graph, RP_graph = {}, {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' %
                            (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            numObservations = len(did1_df)
            minDFResiduals = numObservations * MIN_RATIO_RESIDUAL
            did1_df = did1_df.drop(['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did', 'roamingTime'], axis=1)
            if '%d' % did1 in did1_df.columns:
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            candi_dummies = []
            num_iter = 1
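            # Same screening loop as Example No. 22's regression(): keep
            # columns that are active in enough rows, and keep raising the
            # threshold until the regressor count leaves the required residual
            # degrees of freedom (or no candidates remain). The loop variable
            # is renamed ci so it does not shadow the outer driver counter i.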
            while True:
                for ci, vs in enumerate(zip(*did1_df.values)):
                    if did1_df.columns[ci] == tm:
                        continue
                    if sum(vs) > numObservations * MIN_PICKUP_RATIO * num_iter:
                        candi_dummies.append(did1_df.columns[ci])
                numIndepVariables = len(candi_dummies)
                if numIndepVariables == 0:
                    break
                if numObservations < numIndepVariables + minDFResiduals:
                    candi_dummies = []
                    num_iter += 1
                else:
                    break
            if not candi_dummies:
                continue
            y = did1_df[tm]
            X = did1_df[candi_dummies]
            X = sm.add_constant(X)
            SP_res = sm.OLS(y, X, missing='drop').fit()
            # if SP_res.f_pvalue < SIGINIFICANCE_LEVEL:
            significant_drivers = set()
            for _did0, pv in SP_res.pvalues.iteritems():
                if _did0 == 'const':
                    continue
                if pv < SIGINIFICANCE_LEVEL:
                    significant_drivers.add(_did0)
            positive_ef_drivers = set()
            for _did0, cof in SP_res.params.iteritems():
                if _did0 == 'const':
                    continue
                if cof > 0:
                    positive_ef_drivers.add(_did0)
            for _did0 in significant_drivers.difference(positive_ef_drivers):
                SP_graph[int(_did0), did1] = SP_res.params[_did0]
        #
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)),
                  'w') as f:
            f.write(format_exc())
        raise
Example No. 25
def process_file(yymm):
    def record_crossing_time(path_to_csv_file,
                             veh_ap_crossing_time, veh_last_log_ap_or_not,
                             veh_ns_crossing_time, veh_last_log_ns_or_not):
        with open(path_to_csv_file, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                t, vid = eval(row[hid['time']]), row[hid['vid']]
                ap_or_not, ns_or_not = eval(row[hid['ap-or-not']]), eval(row[hid['ns-or-not']])
                #
                if not veh_last_log_ap_or_not.has_key(vid):
                    if ap_or_not == IN:
                        # the first log's position occurred inside the AP zone
                        assert not veh_ap_crossing_time.has_key(vid)
                        veh_ap_crossing_time[vid] = [t]
                else:
                    assert veh_last_log_ap_or_not.has_key(vid)
                    if veh_last_log_ap_or_not[vid] == OUT and ap_or_not == IN:
                        veh_ap_crossing_time.setdefault(vid, [t]).append(t)
                #
                if not veh_last_log_ns_or_not.has_key(vid):
                    if ns_or_not == IN:
                        # the first log's position occurred inside the NS zone
                        assert not veh_ns_crossing_time.has_key(vid)
                        veh_ns_crossing_time[vid] = [t]
                else:
                    assert veh_last_log_ns_or_not.has_key(vid)
                    if veh_last_log_ns_or_not[vid] == OUT and ns_or_not == IN:
                        veh_ns_crossing_time.setdefault(vid, [t]).append(t)
                #
                veh_last_log_ap_or_not[vid] = ap_or_not
                veh_last_log_ns_or_not[vid] = ns_or_not
        return veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not
    #
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm)
        if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
            return None
        print 'handle the file; %s' % yymm
        veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
        veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
        if yymm not in ['0901', '1001', '1011']:
            y, m = int(yymm[:2]), int(yymm[2:])
            prev_m = m - 1
            prev_yymm = '%02d%02d' % (y, prev_m)
            prev_fn = get_all_files(log_last_day_dpath, '%s%s*.csv' % (log_last_day_prefix, prev_yymm))[0]
            path_to_last_day_csv_file = '%s/%s' % (log_last_day_dpath, prev_fn)
            veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                            record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
        path_to_csv_file = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm)
        veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
                record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                     veh_ns_crossing_time, veh_last_log_ns_or_not)
        #
        save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
        save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
Example No. 26
def run():
    gp_summary_fpath = '%s/%ssummary.csv' % (of_dpath, of_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (of_dpath, of_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (of_dpath, of_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    logger.info('Start handling SP_group_dpath')
    orignal_graph = {}
    for fn in get_all_files(if_dpath,
                            '%ssigRelation-%s-*.pkl' % (if_prefix, year)):
        _, _, _, _, _did1 = fn[:-len('.pkl')].split('-')
        sigRelatioin = load_pickle_file('%s/%s' % (if_dpath, fn))
        for _did0, coef in sigRelatioin['pos']:
            did0, did1 = map(int, [_did0, _did1])
            orignal_graph[did0, did1] = coef
    save_pickle_file(gp_original_fpath, orignal_graph)
    #
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for i, ((did0, did1), w) in enumerate(orignal_graph.iteritems()):
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (of_dpath, of_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
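        # Per-group summary metrics: edges per driver (graphComplexity),
        # total edge weight per driver (tie_strength), mean edge weight
        # (contribution), and tie strength normalized once more by group
        # size (benCon).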
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn,
                len(drivers),
                len(weights), graphComplexity, tie_strength, contribution,
                benCon
            ])
        gl_img_fpath = '%s/%simg-%s.pdf' % (of_dpath, of_prefix, gn)
        # layout = sg.layout("kk")
        # if len(drivers) < 100:
        #     ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        # else:
        #     ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (of_dpath, of_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
Example No. 28
def run():
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    #
    logger.info('Start handling SP_group_dpath')
    if not check_path_exist(gp_original_fpath):
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            moduloNumber = max(numEdges / 10, 1)  # avoid modulo-by-zero on small graphs
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    moduloNumber = max(numEdges / 10, 1)  # avoid modulo-by-zero on small graphs
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn,
                len(drivers),
                len(weights), graphComplexity, tie_strength, contribution,
                benCon
            ])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
Example No. 29
def run():
    print 'start'
    check_dir_create(com_dir)
    #
    yyyy = '2009'
    la_fn = '2009-CD(184)-N(7003)-E(5717371).pkl'
    la_fpath = '%s/%s' % (la_dir, la_fn)
    _, str_CD, _, _ = la_fn[:-len('.pkl')].split('-')
    CD = int(str_CD[len('CD('):-len(')')])
    print 'pick file loading...'
    pairs_day_counting = load_pickle_file(la_fpath)
    print 'finished'
    for thD in [18, 36, 55, 73, 82, 92]:
        thD_dpath = '%s/%s' % (com_dir, '2009-CD(%d)-thD(%d)' % (CD, thD))
        check_dir_create(thD_dpath)
        summary_fpath = '%s/%s-CD(%d)-thD(%d)-community-summary.csv' % (
            thD_dpath, yyyy, CD, thD)
        glayout_fpath = '%s/%s-CD(%d)-thD(%d)-glayout.pkl' % (thD_dpath, yyyy,
                                                              CD, thD)
        with open(summary_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'com-name', 'num-nodes', 'num-edges',
                'tie-strength(# of days encounter / # of drivers)'
            ]
            writer.writerow(new_headers)
        #
        nxG = nx.Graph()
        for (k0, k1), num_days in pairs_day_counting.iteritems():
            if num_days < thD:
                continue
            nxG.add_edge(k0, k1, weight=num_days)

        print 'Whole graph pickling ...', yyyy, CD, thD
        nx.write_gpickle(
            nxG, '%s/%s-CD(%d)-thD(%d)-whole-N(%d)-E(%d).pkl' %
            (thD_dpath, yyyy, CD, thD, len(nxG.nodes()), len(nxG.edges())))
        n_label, n_comId = [], []
        nxId_igId = {}
        ig_nid = 0
        print 'Partitioning ...'
        partition = community.best_partition(nxG)
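        # community.best_partition (python-louvain) returns a
        # {node: community_id} dict; the loop below materializes each
        # community as a subgraph.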
        for i, com in enumerate(set(partition.values())):
            list_nodes = [
                nodes for nodes in partition.keys() if partition[nodes] == com
            ]
            print i, 'Saving sub-graph ...'
            sub_nxG = nxG.subgraph(list_nodes)
            com_name = 'COM(%d)' % i
            com_fpath = '%s/%s-CD(%d)-thD(%d)-%s-N(%d)-E(%d).pkl' % (
                thD_dpath, yyyy, CD, thD, com_name,
                len(sub_nxG.nodes()), len(sub_nxG.edges()))
            nx.write_gpickle(sub_nxG, com_fpath)

            _, _, weight = zip(
                *list(sub_nxG.edges_iter(data='weight', default=1)))
            num_nodes, num_edges = len(sub_nxG), len(weight)
            with open(summary_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([
                    com_name, num_nodes, num_edges,
                    sum(weight) / float(num_nodes)
                ])
            #
            print i, 'labeling...'
            for n in sub_nxG.nodes():
                n_label.append(n)
                n_comId.append(i)
                nxId_igId[n] = ig_nid
                ig_nid += 1
        #
        if len(nxG.nodes()) < 1000:
            print 'Layout calculating...'
            print datetime.datetime.now()
            Edges = [(nxId_igId[n0], nxId_igId[n1])
                     for (n0, n1) in nxG.edges()]
            print 'finish edge converting', len(Edges)
            print datetime.datetime.now()
            igG = ig.Graph(Edges, directed=False)
            layt = igG.layout('kk', dim=3)
            print 'finish layout calculation'
            print datetime.datetime.now()
            #
            save_pickle_file(glayout_fpath, [n_label, n_comId, layt, Edges])
        else:
            save_pickle_file(glayout_fpath, [])
Exemplo n.º 30
0
def process_file(fpath):
    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        tm = 'spendingTime'
        st_graph_dpath = dpaths[tm, year, 'influenceGraph']
        st_graph_prefix = prefixs[tm, year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix, reducerID)
        if check_path_exist(SP_graph_fpath):
            return None
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        SP_graph, RP_graph = {}, {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' % (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            numObservations = len(did1_df)
            minDFResiduals = numObservations * MIN_RATIO_RESIDUAL
            did1_df = did1_df.drop(['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did', 'roamingTime'], axis=1)
            if '%d' % did1 in did1_df.columns:
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            candi_dummies = []
            num_iter = 1
            while True:
                for i, vs in enumerate(zip(*did1_df.values)):
                    if did1_df.columns[i] == tm:
                        continue
                    if sum(vs) > numObservations * MIN_PICKUP_RATIO * num_iter:
                        candi_dummies.append(did1_df.columns[i])
                numIndepVariables = len(candi_dummies)
                if numIndepVariables == 0:
                    break
                if numObservations < numIndepVariables + minDFResiduals:
                    candi_dummies = []
                    num_iter += 1
                else:
                    break
            if not candi_dummies:
                continue
            y = did1_df[tm]
            X = did1_df[candi_dummies]
            X = sm.add_constant(X)
            SP_res = sm.OLS(y, X, missing='drop').fit()
            # collect drivers whose presence has a statistically significant effect
            significant_drivers = set()
            for _did0, pv in SP_res.pvalues.iteritems():
                if _did0 == 'const':
                    continue
                if pv < SIGINIFICANCE_LEVEL:
                    significant_drivers.add(_did0)
            # collect drivers whose coefficient is positive (they increase spending time)
            positive_ef_drivers = set()
            for _did0, cof in SP_res.params.iteritems():
                if _did0 == 'const':
                    continue
                if cof > 0:
                    positive_ef_drivers.add(_did0)
            # keep an edge did0 -> did1 only when did0's effect is significant and
            # non-positive, i.e. did0's prior presence reduces did1's spending time
            for _did0 in significant_drivers.difference(positive_ef_drivers):
                SP_graph[int(_did0), did1] = SP_res.params[_did0]
        #
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)), 'w') as f:
            f.write(format_exc())
        raise
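The regression above treats each co-occurring driver as a 0/1 dummy column and regresses spending time on those dummies. The toy example below shows the same statsmodels pattern on made-up data; the column names and values are illustrative only, not taken from the dataset these examples process:

import pandas as pd
import statsmodels.api as sm

# two dummy drivers; '101' has a consistently negative effect on spendingTime
df = pd.DataFrame({'spendingTime': [30, 10, 28, 12, 31, 9],
                   '100': [0, 0, 1, 1, 0, 1],
                   '101': [0, 1, 0, 1, 0, 1]})
y, X = df['spendingTime'], sm.add_constant(df[['100', '101']])
res = sm.OLS(y, X, missing='drop').fit()
for name in ['100', '101']:
    print name, res.params[name], res.pvalues[name]  # coefficient and its p-value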
Exemplo n.º 31
def process_file(tm, year):
    ig_dpath = dpaths[tm, year, 'influenceGraph']
    ig_prefix = prefixs[tm, year, 'influenceGraph']
    gp_dpath = dpaths[tm, year, 'groupPartition']
    gp_prefix = prefixs[tm, year, 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity', 'tieStrength', 'contribution', 'benCon'])
    #
    logger.info('Start handling SP_group_dpath')
    # merge the per-reducer regression graphs into one edge dictionary
    original_graph = {}
    for fn in get_all_files(ig_dpath, '%s*' % ig_prefix):
        regression_graph = load_pickle_file('%s/%s' % (ig_dpath, fn))
        for (did0, did1), w in regression_graph.iteritems():
            original_graph[did0, did1] = w
    save_pickle_file(gp_original_fpath, original_graph)
    #
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    # add each driver as a vertex once, then add weighted directed edges
    for (did0, did1), w in original_graph.iteritems():
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))  # edges per driver
        tie_strength = sum(weights) / float(len(drivers))  # total weight per driver
        contribution = sum(weights) / float(len(weights))  # mean edge weight
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity, tie_strength, contribution, benCon])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
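The pickles written above can be consumed downstream without re-running the partition. The sketch below assumes the same load_pickle_file helper plus the gp_dpath, gp_prefix, and gp_drivers_fpath values produced by this function; the group name G(0) is hypothetical and only exists if the partition produced at least one group:

gn_drivers = load_pickle_file(gp_drivers_fpath)
for gn, drivers in gn_drivers.iteritems():
    print gn, len(drivers)
# reload a single group's graph, e.g. G(0), the way it was written with write_pickle()
sg = ig.Graph.Read_Pickle('%s/%sG(0).pkl' % (gp_dpath, gp_prefix))
print sg.summary()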
Exemplo n.º 32
def run():
    print 'start'
    check_dir_create(com_dir)
    #
    yyyy = '2009'
    la_fn = '2009-CD(184)-N(7003)-E(5717371).pkl'
    la_fpath = '%s/%s' % (la_dir, la_fn)
    # extract the CD value from the file name, e.g. 'CD(184)' -> 184
    _, str_CD, _, _ = la_fn[:-len('.pkl')].split('-')
    CD = int(str_CD[len('CD('):-len(')')])
    print 'pickle file loading...'
    pairs_day_counting = load_pickle_file(la_fpath)
    print 'finished'
    for thD in [18, 36, 55, 73, 82, 92]:
        thD_dpath = '%s/%s' % (com_dir, '2009-CD(%d)-thD(%d)' % (CD, thD))
        check_dir_create(thD_dpath)
        summary_fpath = '%s/%s-CD(%d)-thD(%d)-community-summary.csv' % (thD_dpath, yyyy, CD, thD)
        glayout_fpath = '%s/%s-CD(%d)-thD(%d)-glayout.pkl' % (thD_dpath, yyyy, CD, thD)
        with open(summary_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['com-name', 'num-nodes', 'num-edges', 'tie-strength(# of days encounter / # of drivers)']
            writer.writerow(new_headers)
        #
        nxG = nx.Graph()
        # keep only pairs that encountered each other on at least thD days
        for (k0, k1), num_days in pairs_day_counting.iteritems():
            if num_days < thD:
                continue
            nxG.add_edge(k0, k1, weight=num_days)

        print 'Whole graph pickling ...', yyyy, CD, thD
        nx.write_gpickle(nxG, '%s/%s-CD(%d)-thD(%d)-whole-N(%d)-E(%d).pkl' % (thD_dpath, yyyy, CD, thD,
                                                                              len(nxG.nodes()), len(nxG.edges())))
        n_label, n_comId = [], []
        nxId_igId = {}
        ig_nid = 0
        print 'Partitioning ...'
        # community.best_partition (python-louvain) maps each node to its community id
        partition = community.best_partition(nxG)
        for i, com in enumerate(set(partition.values())):
            list_nodes = [n for n in partition if partition[n] == com]
            print i, 'Saving sub-graph ...'
            sub_nxG = nxG.subgraph(list_nodes)
            com_name = 'COM(%d)' % i
            com_fpath = '%s/%s-CD(%d)-thD(%d)-%s-N(%d)-E(%d).pkl' % (thD_dpath, yyyy, CD, thD,
                                                               com_name, len(sub_nxG.nodes()), len(sub_nxG.edges()))
            nx.write_gpickle(sub_nxG, com_fpath)

            # collect edge weights (default 1) to summarize tie strength
            _, _, weight = zip(*list(sub_nxG.edges_iter(data='weight', default=1)))
            num_nodes, num_edges = len(sub_nxG), len(weight)
            with open(summary_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([com_name, num_nodes, num_edges, sum(weight) / float(num_nodes)])
            #
            print i, 'labeling...'
            for n in sub_nxG.nodes():
                n_label.append(n)
                n_comId.append(i)
                nxId_igId[n] = ig_nid
                ig_nid += 1
        #
        # compute a 3D layout only for small graphs; otherwise save an empty placeholder
        if len(nxG.nodes()) < 1000:
            print 'Layout calculating...'
            print datetime.datetime.now()
            Edges = [(nxId_igId[n0], nxId_igId[n1]) for (n0, n1) in nxG.edges()]
            print 'finish edge converting', len(Edges)
            print datetime.datetime.now()
            igG = ig.Graph(Edges, directed=False)
            layt = igG.layout('kk', dim=3)
            print 'finish layout calculation'
            print datetime.datetime.now()
            #
            save_pickle_file(glayout_fpath, [n_label, n_comId, layt, Edges])
        else:
            save_pickle_file(glayout_fpath, [])
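The saved layout is typically consumed to draw the network in 3D. The sketch below separates node coordinates from edge segments in the shape a 3D plotting library usually expects; it assumes a non-empty glayout pickle from the run above and the same load_pickle_file helper:

n_label, n_comId, layt, Edges = load_pickle_file(glayout_fpath)
# node coordinates from the Kamada-Kawai 3D layout
Xn = [layt[k][0] for k in range(len(n_label))]
Yn = [layt[k][1] for k in range(len(n_label))]
Zn = [layt[k][2] for k in range(len(n_label))]
# edge segments, with None separators so a line trace breaks between edges
Xe, Ye, Ze = [], [], []
for e0, e1 in Edges:
    Xe += [layt[e0][0], layt[e1][0], None]
    Ye += [layt[e0][1], layt[e1][1], None]
    Ze += [layt[e0][2], layt[e1][2], None]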