def process_file(yymm):
    """Build and pickle the per-vehicle airport/night-safari crossing-time tables for one month.

    yymm -- 4-char month key, e.g. '0901' (Sep 2009).

    Skips the month when both output pickles already exist.  For every month
    except '0901', '1001' and '1011' the previous month's last-day log is
    replayed first so crossings spanning the month boundary are captured.
    """
    ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
        # Both outputs already generated; nothing to do.
        return None
    print 'handle the file; %s' % yymm
    # State threaded through record_crossing_time: crossing timestamps and
    # the last-seen in/out flag per vehicle, for each of the two areas.
    veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
    veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
    if yymm not in ['0901', '1001', '1011']:
        # Seed the tables with the previous month's last-day log.
        # NOTE(review): prev_m = m - 1 would yield month 0 for a January
        # input, but January months are excluded by the guard above.
        path_to_last_day_csv_file = None
        temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix, '.csv')
        prev_fn = None
        y, m = int(yymm[:2]), int(yymm[2:])
        prev_m = m - 1
        prev_yymm = '%02d%02d' %(y, prev_m)
        for temp_fn in temp_csv_files:
            if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)):
                prev_fn = temp_fn
                break
        assert prev_fn, yymm
        path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn)
        # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1:
        #     return None
        veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                        record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                             veh_ns_crossing_time, veh_last_log_ns_or_not)
    # Process the month's own log on top of the seeded state.
    path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
            record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
    #
    save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
    save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
    print 'end the file; %s' % yymm
def process_files(yymm):
    print 'handle the file; %s' % yymm
    #
    for dn, fn_prefix, Y09, Y10, both in _package:
        target_file = Y09 if yymm.startswith('09') else Y10
        with open('%s/%s%s.csv' % (dn, fn_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            for row in reader:
                if not check_path_exist(both):
                    with open(both, 'wt') as csvFile:
                        writer = csv.writer(csvFile)
                        writer.writerow(headers)
                with open(both, 'a') as csvFile:
                    writer = csv.writer(csvFile)
                    writer.writerow(row)
                #
                if not check_path_exist(target_file):
                    with open(target_file, 'wt') as csvFile:
                        writer = csv.writer(csvFile)
                        writer.writerow(headers)
                with open(target_file, 'a') as csvFile:
                    writer = csv.writer(csvFile)
                    writer.writerow(row)


    print 'end the file; %s' % yymm
def process_file(yymm):
    ap_pkl_file_path = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_file_path = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if not (check_path_exist(ap_pkl_file_path) and check_path_exist(ns_pkl_file_path)):
        return None
    #
    # Load pickle files
    #
    ap_crossing_time, ns_crossing_time = load_pickle_file(ap_pkl_file_path), load_pickle_file(ns_pkl_file_path)
    #
    # Initiate csv files
    #
    ap_trip_fpath = '%s/%s%s.csv' % (ap_trips_dir, ap_trip_prefix, yymm)
    ns_trip_fpath = '%s/%s%s.csv' % (ns_trips_dir, ns_trip_prefix, yymm)
    if check_path_exist(ap_trip_fpath) and check_path_exist(ns_trip_fpath):
        return None
    print 'handle the file; %s' % yymm
    for fpath in [ap_trip_fpath, ns_trip_fpath]:
        with open(fpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = ['tid', 'vid', 'did',
                               'start-time', 'end-time', 'duration',
                               'fare', 'prev-trip-end-time',
                               'trip-mode', 'queue—join-time', 'queueing-time']
                writer.writerow(new_headers)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h : i for i, h in enumerate(headers)}
        for row in reader:
            tid, did = row[hid['tid']], row[hid['did']]
            et, duration = row[hid['end-time']], row[hid['duration']]
            fare = row[hid['fare']]
            #
            ap_tm, ns_tm = int(row[hid['ap-trip-mode']]), int(row[hid['ns-trip-mode']]) 
            vid, st, prev_tet = row[hid['vid']], eval(row[hid['start-time']]), eval(row[hid['prev-trip-end-time']])
            #
            for tm, crossing_time, fpath in [(ap_tm, ap_crossing_time, ap_trip_fpath),
                                                             (ns_tm, ns_crossing_time, ns_trip_fpath)]:
                if tm == DIn_POut or tm == DOut_POut:
                    continue
                if tm == DIn_PIn:
                    queue_join_time = prev_tet
                elif tm == DOut_PIn:
                    try:
                        i = bisect(crossing_time[vid], st)
                    except KeyError:
                        print '%s-tid-%s' % (yymm, row[hid['tid']])
                        continue
                    queue_join_time = crossing_time[vid][i - 1] if i != 0 else crossing_time[vid][0]
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    queueing_time = st - queue_join_time
                    if queueing_time < Q_LIMIT_MIN:
                        queueing_time = Q_LIMIT_MIN
                    new_row = [tid, vid, did, st, et, duration, fare, prev_tet,
                                tm, queue_join_time, queueing_time]
                    writer.writerow(new_row)
    print 'end the file; %s' % yymm 
예제 #4
0
def process_files(yymm):
    print 'handle the file; %s' % yymm
    #
    for dn, fn_prefix, Y09, Y10, both in _package:
        target_file = Y09 if yymm.startswith('09') else Y10
        with open('%s/%s%s.csv' % (dn, fn_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            for row in reader:
                if not check_path_exist(both):
                    with open(both, 'wt') as csvFile:
                        writer = csv.writer(csvFile)
                        writer.writerow(headers)
                with open(both, 'a') as csvFile:
                    writer = csv.writer(csvFile)
                    writer.writerow(row)
                #
                if not check_path_exist(target_file):
                    with open(target_file, 'wt') as csvFile:
                        writer = csv.writer(csvFile)
                        writer.writerow(headers)
                with open(target_file, 'a') as csvFile:
                    writer = csv.writer(csvFile)
                    writer.writerow(row)

    print 'end the file; %s' % yymm
def process_files(yymm):
    print 'handle the file; %s' % yymm
    #
    ap_target_file = Y09_ap_trips if yymm.startswith('09') else Y10_ap_trips
    with open('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        if not check_path_exist(ap_target_file):
            with open(ap_target_file, 'wt') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow(headers)
        with open(ap_target_file, 'a') as csvFile:
            writer = csv.writer(csvFile)
            for row in reader:
                writer.writerow(row)
    #
    ns_target_file = Y09_ns_trips if yymm.startswith('09') else Y10_ns_trips
    with open('%s/%s%s.csv' % (ns_ep_dir, ns_ep_prefix, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        if not check_path_exist(ns_target_file):
            with open(ns_target_file, 'wt') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow(headers)
        with open(ns_target_file, 'a') as csvFile:
            writer = csv.writer(csvFile)
            for row in reader:
                writer.writerow(row)
    #
    print 'end the file; %s' % yymm
예제 #6
0
def process_file(yymm):
    """Build and pickle the per-vehicle airport/night-safari crossing-time tables for one month.

    yymm -- 4-char month key, e.g. '0901' (Sep 2009).

    Skips the month when both output pickles already exist.  For every month
    except '0901', '1001' and '1011' the previous month's last-day log is
    replayed first so crossings spanning the month boundary are captured.
    """
    ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
        # Both outputs already generated; nothing to do.
        return None
    print 'handle the file; %s' % yymm
    # State threaded through record_crossing_time: crossing timestamps and
    # the last-seen in/out flag per vehicle, for each of the two areas.
    veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
    veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
    if yymm not in ['0901', '1001', '1011']:
        # Seed the tables with the previous month's last-day log.
        # NOTE(review): prev_m = m - 1 would yield month 0 for a January
        # input, but January months are excluded by the guard above.
        path_to_last_day_csv_file = None
        temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix,
                                       '.csv')
        prev_fn = None
        y, m = int(yymm[:2]), int(yymm[2:])
        prev_m = m - 1
        prev_yymm = '%02d%02d' % (y, prev_m)
        for temp_fn in temp_csv_files:
            if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)):
                prev_fn = temp_fn
                break
        assert prev_fn, yymm
        path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn)
        # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1:
        #     return None
        veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                        record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                             veh_ns_crossing_time, veh_last_log_ns_or_not)
    # Process the month's own log on top of the seeded state.
    path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
            record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
    #
    save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
    save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
    print 'end the file; %s' % yymm
def process_files(yymm):
    print 'handle the file; %s' % yymm
    #
    ap_target_file = Y09_ap_trips if yymm.startswith('09') else Y10_ap_trips
    with open('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm),
              'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        if not check_path_exist(ap_target_file):
            with open(ap_target_file, 'wt') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow(headers)
        with open(ap_target_file, 'a') as csvFile:
            writer = csv.writer(csvFile)
            for row in reader:
                writer.writerow(row)
    #
    ns_target_file = Y09_ns_trips if yymm.startswith('09') else Y10_ns_trips
    with open('%s/%s%s.csv' % (ns_ep_dir, ns_ep_prefix, yymm),
              'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        if not check_path_exist(ns_target_file):
            with open(ns_target_file, 'wt') as csvFile:
                writer = csv.writer(csvFile)
                writer.writerow(headers)
        with open(ns_target_file, 'a') as csvFile:
            writer = csv.writer(csvFile)
            for row in reader:
                writer.writerow(row)
    #
    print 'end the file; %s' % yymm
예제 #8
0
def process_month(yymm):
    """Annotate each trip of a month with the drivers previously seen in its zone.

    yymm -- 4-char month key.

    Reads the month's ss-trips CSV, looks up each trip's zone, asks the
    per-driver tracker for previous drivers at (time, zone), and writes the
    original row plus a '&'-joined prevDrivers column.  On any exception the
    traceback is dumped to '<script>_<yymm>.txt' and the exception re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix,
                                          yymm)
        if not check_path_exist(ss_trips_fpath):
            # Input missing: nothing to process for this month.
            logger.info('The file X exists; %s' % yymm)
            return None
        prevDriversDefined_fpath = '%s/%s%s.csv' % (
            prevDriversDefined_dpath, prevDriversDefined_prefix, yymm)
        if check_path_exist(prevDriversDefined_fpath):
            # Output already generated.
            logger.info('The processed; %s' % yymm)
            return None
        drivers = {}  # did -> ca_driver_withPrevDrivers tracker
        zones = generate_zones()
        handling_day = 0  # last day number logged, for progress reporting
        with open(prevDriversDefined_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                'did', 'timeFrame', 'zi', 'zj', 'time', 'day', 'month',
                'start-long', 'start-lat', 'distance', 'duration', 'fare',
                'spendingTime', 'prevDrivers'
            ])
            with open(ss_trips_fpath, 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    t = eval(row[hid['time']])
                    cur_dt = datetime.datetime.fromtimestamp(t)
                    if handling_day != cur_dt.day:
                        # Log once per new day to show progress.
                        logger.info('Processing %s %dth day (month %d)' %
                                    (yymm, cur_dt.day, cur_dt.month))
                        handling_day = cur_dt.day
                    did = int(row[hid['did']])
                    zi, zj = int(row[hid['zi']]), int(row[hid['zj']])
                    try:
                        z = zones[(zi, zj)]
                    except KeyError:
                        # Trip outside the zone grid: skip it.
                        continue
                    if not drivers.has_key(did):
                        drivers[did] = ca_driver_withPrevDrivers(did)
                    prevDrivers = drivers[did].find_prevDriver(t, z)
                    writer.writerow(row + ['&'.join(map(str, prevDrivers))])
    except Exception as _:
        # Persist the traceback for post-mortem, then propagate.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #9
0
def process_file(yymm):
    y, m = int('20' + yymm[:2]), int(yymm[2:])
    # find the next month's first day
    if m == 12:
        next_y, next_m = y + 1, 1
    else:
        next_y, next_m = y, m + 1
    next_m_first_day = datetime.datetime(next_y, next_m, 1, 0)
    cur_m_last_day = next_m_first_day - datetime.timedelta(days=1)
    dd = '%02d' % cur_m_last_day.day
    ll_fpath = '%s/%s%s%s.csv' % (logs_last_day_dir, log_last_day_prefix, yymm,
                                  dd)
    if check_path_exist(ll_fpath):
        return None
    print 'handle the file; %s' % yymm
    #
    last_day_timestamp = time.mktime(cur_m_last_day.timetuple())
    log_fpath = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    # if (time.time() - get_created_time(log_fpath)) < HOUR1:
    #     return None
    with open(log_fpath, 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open(ll_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(headers)
            for row in reader:
                t = eval(row[hid['time']])
                if t <= last_day_timestamp:
                    continue
                writer.writerow(row)
    print 'end the file; %s' % yymm
예제 #10
0
def process_file(yymm):
    fpath = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    if check_path_exist(fpath):
        return None

    print 'handle the file; %s' % yymm
    yy, mm = yymm[:2], yymm[-2:]
    #
    with open('%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm),
              'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open(fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['time', 'vid', 'did', 'ap-or-not', 'ns-or-not']
            writer.writerow(new_headers)
            #
            for row in reader:
                ap_or_not = ap_poly.is_including(
                    (eval(row[hid['longitude']]), eval(row[hid['latitude']])))
                np_or_not = ns_poly.is_including(
                    (eval(row[hid['longitude']]), eval(row[hid['latitude']])))
                new_row = [
                    row[hid['time']], row[hid['vehicle-id']],
                    row[hid['driver-id']], ap_or_not, np_or_not
                ]
                writer.writerow(new_row)
    print 'end the file; %s' % yymm
def process_file(yymm):
    """Split the last calendar day of month yymm out of its log CSV into its own file.

    yymm -- 4-char month key, e.g. '0901'.  Skips the month when the
    last-day output file already exists.
    """
    y, m = int('20' + yymm[:2]), int(yymm[2:])
    # find the next month's first day
    if m == 12:
        next_y, next_m = y + 1, 1 
    else:
        next_y, next_m = y, m + 1
    next_m_first_day = datetime.datetime(next_y, next_m, 1, 0)
    # Stepping back one day from the next month's first day gives the last
    # day of the current month (handles 28/29/30/31-day months uniformly).
    cur_m_last_day = next_m_first_day - datetime.timedelta(days=1)
    dd = '%02d' % cur_m_last_day.day
    ll_fpath = '%s/%s%s%s.csv' % (logs_last_day_dir, log_last_day_prefix, yymm, dd)
    if check_path_exist(ll_fpath):
        return None
    print 'handle the file; %s' % yymm
    #
    # Midnight at the start of the month's last day; rows at or before this
    # timestamp belong to earlier days and are dropped.
    last_day_timestamp = time.mktime(cur_m_last_day.timetuple())
    log_fpath = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    # if (time.time() - get_created_time(log_fpath)) < HOUR1:
    #     return None
    with open(log_fpath, 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open(ll_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(headers)
            for row in reader:        
                t = eval(row[hid['time']])
                if t <= last_day_timestamp:
                    continue
                writer.writerow(row)
    print 'end the file; %s' % yymm
예제 #12
0
def get_driver_trajectory(did):
    """Load (or build and cache) one driver's full trajectory.

    did -- integer driver id.

    Returns a list of (datetime, x, y, state) tuples covering every day for
    which a per-day log file of this driver exists, in date order.  The
    result is pickled next to the inputs and reused on later calls.
    """
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        dt_xy_state = load_pickle_file(ofpath)
    else:
        # Collect the dates of this driver's per-day log files.
        dates = []
        for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
            _, _date, _did = fn[:-len('.csv')].split('-')
            if int(_did) != did:
                continue
            year = 2000 + int(_date[:2])
            month, day = map(int, [_date[2:4], _date[4:6]])
            dates += [datetime.datetime(year, month, day)]
        dates.sort()
        dt_xy_state = []
        for date in dates:
            yymmdd = '%02d%02d%02d' % (date.year - 2000, date.month, date.day)
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    # FIX: the per-row timestamp used to rebind the outer
                    # date-loop variable ('dt'); use a distinct name so the
                    # two loops no longer shadow each other.
                    log_dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [(log_dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
예제 #13
0
def log_location_labeling(yymm):
    """Label every raw log row of a month with airport / night-safari flags.

    yymm -- 4-char month key.  Reads the raw normal-log CSV, tests each GPS
    point against the two area polygons, and writes a reduced CSV with
    boolean-ish membership columns.  On any exception the traceback is
    dumped to '<script>_<yymm>.txt' and the exception re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        log_fpath = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm)
        if check_path_exist(log_fpath):
            logger.info('The file had already been processed; %s' % log_fpath)
            return
        yy, mm = yymm[:2], yymm[-2:]
        #
        # Build the two area polygons once per month.
        ap_poly, ns_poly = read_generate_polygon(ap_poly_fn), read_generate_polygon(ns_poly_fn)
        with open('%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            with open(log_fpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = ['time', 'vid', 'did', 'ap-or-not', 'ns-or-not']
                writer.writerow(new_headers)
                #
                for row in reader:
                    # Point-in-polygon test per area for this GPS sample.
                    ap_or_not = ap_poly.is_including((eval(row[hid['longitude']]), eval(row[hid['latitude']])))
                    np_or_not = ns_poly.is_including((eval(row[hid['longitude']]), eval(row[hid['latitude']])))
                    new_row = [row[hid['time']], row[hid['vehicle-id']], row[hid['driver-id']], ap_or_not, np_or_not]
                    writer.writerow(new_row)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        # Persist the traceback for post-mortem, then propagate.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #14
0
    def InitUI(self):
        """Initialize the drawing surface: translation offsets, paint binding,
        stock pen/font, optional background bitmap, and the zone grid.
        """
        self.SetDoubleBuffered(True)
        #
        self.sgBorder_xy = GPS_xyDrawing.get_sgBoarder_xy()
        # Find the minimum x/y of the border so the whole map can be
        # translated into positive canvas coordinates (1e400 overflows to
        # float infinity, serving as the initial 'no minimum yet' value).
        min_x, min_y = 1e400, 1e400
        for x, y in self.sgBorder_xy:
            if x < min_x:
                min_x = x
            if y < min_y:
                min_y = y
        # 10-pixel margin from the canvas edge.
        self.translate_x, self.translate_y = -min_x + 10, -min_y + 10
        #
        self.Bind(wx.EVT_PAINT, self.OnPaint)
        # prepare stock objects.
        self.default_pen = self.create_pen(wx.BLACK, 1)
        self.default_font = self.create_font(8, wx.SWISS, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL)

        # Optional semi-transparent background image (alpha 0.4) if present.
        if check_path_exist(bg_img_fpath):
            bmp = wx.BitmapFromImage(wx.Image(bg_img_fpath).AdjustChannels(1.0, 1.0, 1.0, 0.4))
            self.bg_bmp = (bmp, bmp.GetWidth(), bmp.GetHeight())
        else:
            self.bg_bmp = None
        self.sgGrid_xy = GPS_xyDrawing.get_sgGrid_xy()
        self.encountered_zones = set()
        self.marked_zone = None
def process_tripBased():
    """Aggregate per-trip driver statistics for 2009 and 2010, one CSV per year.

    For each year, joins all matching economic-profit files into a single
    statistics CSV carrying queueing time, economic profit, duration, fare,
    productivity, a weekend/holiday flag, and one-hot drop#pick
    terminal-pair columns.
    """
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        #
        statistics_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversTrip_ap_prefix, yyyy)
        if check_path_exist(statistics_fpath):
            logger.info('The file had already been processed; %s' % yyyy)
            # BUG FIX: this was 'return', which aborted the whole function
            # when an earlier year's output existed, so the later year was
            # never processed.  'continue' moves on to the next year.
            continue
        yy = yyyy[2:]
        holidays = HOLIDAYS2009 if yyyy == '2009' else HOLIDAYS2010
        with open(statistics_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'hour', 'weekEnd',
                      'driverID',
                      'locQTime', 'locEP', 'locDuration', 'locFare',
                      'locProductivity',
                      'locIn']
            # One column per (drop location, pick location) pair.
            drop_pick_cns = []
            for l0 in locations:
                for l1 in locations:
                    cn = 'D%s#P%s' % (l0, l1)
                    drop_pick_cns.append(cn)
                    header.append(cn)
            writer.writerow(header)
            for fn in get_all_files(economicProfit_ap_dpath, '%s%s*' % (economicProfit_ap_prefix, yy)):
                with open('%s/%s' % (economicProfit_ap_dpath, fn), 'rt') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    headers = reader.next()
                    hid = {h: i for i, h in enumerate(headers)}
                    for row in reader:
                        year, month, day, hour = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day', 'hour']])
                        did = int(row[hid['did']])
                        # Normalize units: seconds -> minutes, cents -> dollars.
                        locQTime = float(row[hid['queueingTime']]) / SEC60
                        locEP = float(row[hid['economicProfit']]) / CENT
                        locDuration = float(row[hid['duration']]) / SEC60
                        locFare = float(row[hid['fare']]) / CENT
                        locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                        locIn = 1 if int(row[hid['tripMode']]) == DIn_PIn else 0
                        # Weekend flag also covers public holidays.
                        weekEnd = 0
                        if (year, month, day) in holidays:
                            weekEnd = 1
                        if datetime.datetime(year, month, day).weekday() in WEEKENDS:
                            weekEnd = 1
                        l0, l1 = row[hid['prevEndTerminalAP']], row[hid['pickUpTerminalAP']]
                        drop_pick = 'D%s#P%s' % (l0, l1)
                        new_row = [
                            year, month, day, hour, weekEnd,
                            did,
                            locQTime, locEP, locDuration, locFare,
                            locProductivity,
                            locIn
                        ]
                        # One-hot encode the drop#pick terminal pair.
                        for dp_candidate in drop_pick_cns:
                            new_row.append(1 if dp_candidate == drop_pick else 0)
                        writer.writerow(new_row)
예제 #16
0
def process_file(fpath):
    """Regress a driver's dependent variable on each previous driver's presence.

    fpath -- path to a per-driver yearly CSV whose name ends '-<did>.csv'.

    Runs one OLS per sufficiently-frequent previous driver and records the
    significantly positive/negative relations to a summary CSV and a pickle.
    On failure the traceback is written to '<script>_<year>-<did>.txt' and
    the exception re-raised.
    """
    logger.info('Start handling; %s' % fpath)
    # Filename layout: <a>-<b>-<c>-<did1>.csv; only the driver id is needed.
    _, _, _, _did1 = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        ofpath = '%s/%s%s-%s.csv' % (of_dpath, of_prefix, year, _did1)
        sig_fpath = '%s/%ssigRelation-%s-%s.pkl' % (of_dpath, of_prefix, year, _did1)
        if check_path_exist(ofpath):
            # Already processed.
            return None
        with open(ofpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['did',
                      'numObservations', 'numPrevDrivers',
                      'numSigRelationship',
                      'numPosCoef', 'numNegCoef',
                      'sigPosRelation', 'sigNegRelation']
            writer.writerow(header)
        #
        logger.info('Start loading; %s-%s' % (year, _did1))
        df = pd.read_csv(fpath)
        numObservations = len(df)
        # Keep only the previous-driver indicator columns (plus depVar).
        did1_df = df.drop(['month', 'day', 'hour', 'zi', 'zj', 'did'], axis=1)
        if _did1 in did1_df.columns:
            # A driver cannot be his own previous driver.
            did1_df = did1_df.drop([_did1], axis=1)
        prevDrivers = [cn for cn in did1_df.columns if cn != depVar]
        numPrevDrivers = len(prevDrivers)
        #
        sigRelatioin = {k: [] for k in ['pos', 'neg']}
        for _did0 in prevDrivers:
            num_encouters = sum(did1_df[_did0])
            if num_encouters < numObservations * MIN_PICKUP_RATIO:
                # Too few encounters for a meaningful regression.
                continue
            # if len(did1_df) - 1 == sum(did1_df[_did0]) or sum(did1_df[_did0]) == 0:
            #     continue
            # Simple OLS: depVar ~ const + prev-driver indicator.
            y = did1_df[depVar]
            X = did1_df[[_did0]]
            X = sm.add_constant(X)
            res = sm.OLS(y, X, missing='drop').fit()
            pv = res.pvalues[_did0]
            coef = res.params[_did0]
            if pv < SIGINIFICANCE_LEVEL:
                if coef < 0:
                    sigRelatioin['neg'] += [(_did0, coef)]
                elif coef > 0:
                    sigRelatioin['pos'] += [(_did0, coef)]
        with open(ofpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [_did1,
                       numObservations, numPrevDrivers,
                       len(sigRelatioin['pos']) + len(sigRelatioin['neg']),
                       len(sigRelatioin['pos']), len(sigRelatioin['neg']),
                       '&'.join([_did0 for _did0, _ in sigRelatioin['pos']]), '&'.join([_did0 for _did0, _ in sigRelatioin['neg']])]
            writer.writerow(new_row)
        save_pickle_file(sig_fpath, sigRelatioin)
    except Exception as _:
        # NOTE(review): format_exc is not imported in this function —
        # presumably a module-level 'from traceback import format_exc'
        # exists; confirm, otherwise this handler itself raises NameError.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, _did1)), 'w') as f:
            f.write(format_exc())
        raise
    logger.info('End handling; %s' % fpath)
예제 #17
0
def run():
    """Build the airport queueing-time sample and fit a per-driver sensitivity OLS.

    Stage 1 (only if the sample CSV is missing): collect (apQTime, apIn, did)
    rows for full-time drivers over the 2010 months (skipping '1010').
    Stage 2: drop >3-sigma outliers, then per driver with enough samples fit
    OLS of apQTime on apIn and record the fit statistics.
    """
    if not check_path_exist(ssd_apIn_fpath):
        with open(ssd_apIn_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            headers = ['apQTime', 'apIn', 'did']
            writer.writerow(headers)
            for m in xrange(1, 13):
                yymm = '10%02d' % m
                if yymm in ['1010']:
                    # Month excluded from the dataset.
                    continue
                logger.info('Start handling; %s' % yymm)
                ft_drivers = map(int, load_pickle_file('%s/%s%s.pkl' % (full_time_driver_dir, ft_drivers_prefix, yymm)))
                ap_ep_fpath = '%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm)
                with open(ap_ep_fpath, 'rb') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    headers = reader.next()
                    hid = {h: i for i, h in enumerate(headers)}
                    handling_day = 0
                    for row in reader:
                        did = int(row[hid['did']])
                        if did not in ft_drivers:
                            # Only full-time drivers are sampled.
                            continue
                        t = eval(row[hid['start-time']])
                        cur_dt = datetime.datetime.fromtimestamp(t)
                        if handling_day != cur_dt.day:
                            # NOTE(review): logs the PREVIOUS handling_day
                            # before updating it, so the day number shown
                            # lags by one message.
                            logger.info('...ing; %s(%dth)' % (yymm, handling_day))
                            handling_day = cur_dt.day
                        apIn = 1 if int(row[hid['trip-mode']]) == DIn_PIn else 0
                        apQTime = eval(row[hid['queueing-time']]) / float(SEC60)
                        new_row = [apQTime, apIn, did]
                        writer.writerow(new_row)
    #
    df = pd.read_csv(ssd_apIn_fpath)
    # Remove queueing-time outliers beyond three standard deviations.
    df = df[~(np.abs(df['apQTime'] - df['apQTime'].mean()) > (3 * df['apQTime'].std()))]
    minNumSample = 40
    with open(ssd_sensitivity_fpath, 'wb') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        headers = ['did', 'F_pValue', 'rSqure', 'rSqureAdj', 'coef_apIn', 'pValue_apIn', 'coef_const', 'pValue_const']
        writer.writerow(headers)
        for did in set(df['did']) :
            did_df = df[(df['did'] == did)]
            if len(did_df) < minNumSample:
                continue

            # Need some out-of-airport trips for the regressor to vary.
            if len(did_df[(did_df['apIn'] == 0)]) < 4:
                continue
            y = did_df['apQTime']
            X = did_df['apIn']
            X = sm.add_constant(X)
            res = sm.OLS(y, X).fit()
            if np.isnan(res.f_pvalue):
                continue
            try:
                writer.writerow([did, res.f_pvalue, res.rsquared, res.rsquared_adj,
                                 res.params['apIn'], res.pvalues['apIn'], res.params['const'], res.pvalues['const']])
            except Exception as _:
                # Best-effort: skip drivers whose fit lacks expected params.
                pass
예제 #18
0
def get_sgBoarder_xy():
    """Return the Singapore border polyline in x-y coordinates, cached as a pickle."""
    fpath = 'sgBorder_xy.pkl'
    if check_path_exist(fpath):
        return load_pickle_file(fpath)
    # First call: convert every GPS vertex and cache the result.
    sgBorder_xy = [convert_GPS2xy(lon, lat) for lon, lat in sg_border]
    save_pickle_file(fpath, sgBorder_xy)
    return sgBorder_xy
예제 #19
0
def get_sgZones():
    """Return Singapore zones augmented with x-y coordinates, cached as a pickle."""
    ofpath = 'sgZone.pkl'
    if not check_path_exist(ofpath):
        # First call: attach x-y geometry to each zone and cache everything.
        sgZones = get_sg_zones()
        for z in sgZones.values():
            z.cCoor_xy = convert_GPS2xy(*z.cCoor_gps)
            z.polyPoints_xy = [convert_GPS2xy(*gps) for gps in z.polyPoints_gps]
            z.marked = False
        save_pickle_file(ofpath, sgZones)
        return sgZones
    return load_pickle_file(ofpath)
예제 #20
0
def get_sgRoards_xy():
    """Return road polylines converted to x-y coordinates, cached as a pickle."""
    ofpath = 'sgRoards_xy.pkl'
    if check_path_exist(ofpath):
        sgRoards_xy = load_pickle_file(ofpath)
    else:
        # First call: convert each road's GPS vertices and cache the result.
        sgRoards_xy = [[convert_GPS2xy(lon, lat) for lon, lat in coords]
                       for _, coords in get_SG_roads()]
        save_pickle_file(ofpath, sgRoards_xy)
    return sgRoards_xy
예제 #21
0
def run(time_from, time_to):
    """Pipeline driver: split Singapore into zones, then preprocess logs
    and trips for the [time_from, time_to] window, skipping any step whose
    output file already exists."""
    # Step 1. Zone split (cached in grid_info_fn)
    if check_path_exist(grid_info_fn):
        hl_points, vl_points, zones = load_pickle_file(grid_info_fn)
    else:
        from taxi_common.sg_grid_zone import run as run_split_into_zones  # @UnresolvedImport
        hl_points, vl_points, zones = run_split_into_zones(rp_zone)
    # Step 2. Log preprocessing, only when its output is missing
    if not check_path_exist(get_processed_log_fn(time_from, time_to)):
        from preprocess_logs import run as run_preprocess_logs
        run_preprocess_logs(hl_points, vl_points, time_from, time_to)
    # Step 3. Trip preprocessing, only when its output is missing
    if not check_path_exist(get_processed_trip_fn(time_from, time_to)):
        from preprocess_trips import run as run_preprocess_trips
        run_preprocess_trips(hl_points, vl_points, time_from, time_to)
예제 #22
0
def run(time_from, time_to):
    """Pipeline driver: zone split, log preprocessing, trip preprocessing.
    Each step is skipped when its cached output already exists."""
    # Step 1. Load or compute the zone grid.
    if check_path_exist(grid_info_fn):
        grid_info = load_pickle_file(grid_info_fn)
    else:
        from taxi_common.sg_grid_zone import run as run_split_into_zones  # @UnresolvedImport
        grid_info = run_split_into_zones(rp_zone)
    hl_points, vl_points, zones = grid_info
    # Step 2. Preprocess logs unless the output already exists.
    if not check_path_exist(get_processed_log_fn(time_from, time_to)):
        from preprocess_logs import run as _preprocess_logs
        _preprocess_logs(hl_points, vl_points, time_from, time_to)
    # Step 3. Preprocess trips unless the output already exists.
    if not check_path_exist(get_processed_trip_fn(time_from, time_to)):
        from preprocess_trips import run as _preprocess_trips
        _preprocess_trips(hl_points, vl_points, time_from, time_to)
예제 #23
0
def process_month(yymm):
    """Append a 'prevDrivers' column to one month's log CSV.

    For each log row, looks up the drivers who previously visited the row's
    zone (ca_driver_withPrevDrivers.find_prevDriver) and writes the row plus
    an '&'-joined driver list to prevDrivers-<prefix><yymm>.csv.  On any
    failure the traceback is written to '<script>_<yymm>.txt' and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ifpath = '%s/%s%s.csv' % (if_dpath, if_prefix, yymm)
        if not check_path_exist(ifpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ofpath = '%s/prevDrivers-%s%s.csv' % (if_dpath, if_prefix, yymm)
        drivers = {}
        zones = generate_zones()
        handling_day = 0
        with open(ifpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            header = reader.next()
            hid = {h: i for i, h in enumerate(header)}
            with open(ofpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow(header + ['prevDrivers'])
                for row in reader:
                    # NOTE: eval() on CSV fields is unsafe for untrusted
                    # input; kept for compatibility with the codebase.
                    t = eval(row[hid['time']])
                    cur_dt = datetime.datetime.fromtimestamp(t)
                    if handling_day != cur_dt.day:
                        logger.info('Processing %s %dth day (month %d)' %
                                    (yymm, cur_dt.day, cur_dt.month))
                        handling_day = cur_dt.day
                    did = int(row[hid['did']])
                    zi, zj = int(row[hid['zi']]), int(row[hid['zj']])
                    try:
                        z = zones[(zi, zj)]
                    except KeyError:
                        # Position falls outside the zone grid; skip the row.
                        continue
                    if did not in drivers:
                        drivers[did] = ca_driver_withPrevDrivers(did)
                    # Removed leftover debug statement that printed 'hi' for
                    # did == 1 at a hard-coded timestamp.
                    prevDrivers = drivers[did].find_prevDriver(t, z)
                    writer.writerow(row + ['&'.join(map(str, prevDrivers))])
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_month(yymm):
    """Append a 'prevDrivers' column to one month's log CSV.

    For each log row, looks up the drivers who previously visited the row's
    zone (ca_driver_withPrevDrivers.find_prevDriver) and writes the row plus
    an '&'-joined driver list to prevDrivers-<prefix><yymm>.csv.  On any
    failure the traceback is written to '<script>_<yymm>.txt' and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ifpath = '%s/%s%s.csv' % (if_dpath, if_prefix, yymm)
        if not check_path_exist(ifpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ofpath = '%s/prevDrivers-%s%s.csv' % (if_dpath, if_prefix, yymm)
        drivers = {}
        zones = generate_zones()
        handling_day = 0
        with open(ifpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            header = reader.next()
            hid = {h: i for i, h in enumerate(header)}
            with open(ofpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow(header + ['prevDrivers'])
                for row in reader:
                    # NOTE: eval() on CSV fields is unsafe for untrusted
                    # input; kept for compatibility with the codebase.
                    t = eval(row[hid['time']])
                    cur_dt = datetime.datetime.fromtimestamp(t)
                    if handling_day != cur_dt.day:
                        logger.info('Processing %s %dth day (month %d)' % (yymm, cur_dt.day, cur_dt.month))
                        handling_day = cur_dt.day
                    did = int(row[hid['did']])
                    zi, zj = int(row[hid['zi']]), int(row[hid['zj']])
                    try:
                        z = zones[(zi, zj)]
                    except KeyError:
                        # Position falls outside the zone grid; skip the row.
                        continue
                    if did not in drivers:
                        drivers[did] = ca_driver_withPrevDrivers(did)
                    # Removed leftover debug statement that printed 'hi' for
                    # did == 1 at a hard-coded timestamp.
                    prevDrivers = drivers[did].find_prevDriver(t, z)
                    writer.writerow(row + ['&'.join(map(str, prevDrivers))])
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #25
0
def get_sgGrid_xy():
    # Return the Singapore grid as xy line segments (verticals for each
    # longitude, horizontals for each latitude), cached in a pickle file.
    cache_fpath = 'sgGrid_xy.pkl'
    if not check_path_exist(cache_fpath):
        grid_xy = []
        lons, lats = generate_sg_grid()
        lat0, lat1 = lats[0], lats[-1]
        for lon in lons:
            sx, sy = convert_GPS2xy(lon, lat0)
            ex, ey = convert_GPS2xy(lon, lat1)
            grid_xy.append([(sx, sy), (ex, ey)])
        lon0, lon1 = lons[0], lons[-1]
        for lat in lats:
            sx, sy = convert_GPS2xy(lon0, lat)
            ex, ey = convert_GPS2xy(lon1, lat)
            grid_xy.append([(sx, sy), (ex, ey)])
        save_pickle_file(cache_fpath, grid_xy)
        return grid_xy
    return load_pickle_file(cache_fpath)
예제 #26
0
def log_last_day(yymm):
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        y, m = int('20' + yymm[:2]), int(yymm[2:])
        # find the next month's first day
        if m == 12:
            next_y, next_m = y + 1, 1
        else:
            next_y, next_m = y, m + 1
        next_m_first_day = datetime.datetime(next_y, next_m, 1, 0)
        cur_m_last_day = next_m_first_day - datetime.timedelta(days=1)
        dd = '%02d' % cur_m_last_day.day
        ll_fpath = '%s/%s%s%s.csv' % (log_last_day_dpath, log_last_day_prefix, yymm, dd)
        if check_path_exist(ll_fpath):
            logger.info('The file had already been processed; %s' % ll_fpath)
            return
        #
        last_day_timestamp = time.mktime(cur_m_last_day.timetuple())
        log_fpath = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm)

        with open(log_fpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            with open(ll_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow(headers)
                for row in reader:
                    t = eval(row[hid['time']])
                    if t <= last_day_timestamp:
                        continue
                    writer.writerow(row)
        print 'end the file; %s' % yymm
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #27
0
def run():
    """Merge each driver's per-day log CSVs into one pickled trajectory list."""
    # driver id -> list of datetimes (one per day with a log file)
    drivers_dates = {}
    for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
        # File name presumably '<prefix>-<yymmdd>-<did>.csv' (3-way split);
        # TODO confirm if_prefix ends with '-'.
        _, _date, _did = fn[:-len('.csv')].split('-')
        year = 2000 + int(_date[:2])
        month, day = map(int, [_date[2:4], _date[4:6]])
        dt = datetime.datetime(year, month, day)
        k = int(_did)
        if not drivers_dates.has_key(k):
            drivers_dates[k] = []
        drivers_dates[k] += [dt]
    #
    for did, dates in drivers_dates.iteritems():
        # NOTE(review): output path has no directory component, so the pickle
        # is written to the current working directory even though inputs come
        # from if_dpath -- confirm this is intended.
        ofpath = '%s%d.pkl' % (if_prefix, did)
        if check_path_exist(ofpath):
            continue
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    # eval() parses numeric CSV fields; unsafe on untrusted input.
                    dt = datetime.datetime.fromtimestamp(eval(
                        row[hid['time']]))
                    lon, lat = map(
                        eval,
                        [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    # NOTE(review): extends the list with four FLAT elements
                    # rather than appending one (dt, x, y, state) tuple;
                    # sibling builders in this file store tuples -- confirm
                    # downstream consumers expect a flat list.
                    dt_xy_state += [dt, x, y, int(row[hid['state']])]
        save_pickle_file(ofpath, dt_xy_state)
예제 #28
0
def process_file(yymm):
    fpath = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    if check_path_exist(fpath):
        return None


    print 'handle the file; %s' % yymm
    yy, mm = yymm[:2], yymm[-2:]
    #
    with open('%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open(fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['time', 'vid', 'did', 'ap-or-not', 'ns-or-not']
            writer.writerow(new_headers)
            #
            for row in reader:
                ap_or_not = ap_poly.is_including((eval(row[hid['longitude']]), eval(row[hid['latitude']])))
                np_or_not = ns_poly.is_including((eval(row[hid['longitude']]), eval(row[hid['latitude']])))
                new_row = [row[hid['time']], row[hid['vehicle-id']], row[hid['driver-id']], ap_or_not, np_or_not]
                writer.writerow(new_row)
    print 'end the file; %s' % yymm
예제 #29
0
def process_file(fpath):
    """Regress depVar on each previous-driver indicator for one driver's file.

    For every previous driver encountered often enough (>= numObservations *
    MIN_PICKUP_RATIO), fits an OLS model and records relations significant at
    SIGINIFICANCE_LEVEL.  Writes one summary row to a per-driver CSV and
    pickles the positive/negative relation lists.  On failure, dumps the
    traceback to '<script>_<year>-<did>.txt' and re-raises.
    """
    # Sibling workers import format_exc locally; the except-handler below
    # relies on it, so bring it into scope here as well.
    from traceback import format_exc
    logger.info('Start handling; %s' % fpath)
    _, _, _, _did1 = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        ofpath = '%s/%s%s-%s.csv' % (of_dpath, of_prefix, year, _did1)
        sig_fpath = '%s/%ssigRelation-%s-%s.pkl' % (of_dpath, of_prefix, year,
                                                    _did1)
        if check_path_exist(ofpath):
            return None
        with open(ofpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = [
                'did', 'numObservations', 'numPrevDrivers',
                'numSigRelationship', 'numPosCoef', 'numNegCoef',
                'sigPosRelation', 'sigNegRelation'
            ]
            writer.writerow(header)
        #
        logger.info('Start loading; %s-%s' % (year, _did1))
        df = pd.read_csv(fpath)
        numObservations = len(df)
        # Drop bookkeeping columns; keep only depVar and driver indicators.
        did1_df = df.drop(['month', 'day', 'hour', 'zi', 'zj', 'did'], axis=1)
        if _did1 in did1_df.columns:
            did1_df = did1_df.drop([_did1], axis=1)
        prevDrivers = [cn for cn in did1_df.columns if cn != depVar]
        numPrevDrivers = len(prevDrivers)
        #
        sigRelation = {k: [] for k in ['pos', 'neg']}
        for _did0 in prevDrivers:
            # Skip drivers encountered too rarely for a meaningful fit.
            num_encounters = sum(did1_df[_did0])
            if num_encounters < numObservations * MIN_PICKUP_RATIO:
                continue
            y = did1_df[depVar]
            X = sm.add_constant(did1_df[[_did0]])
            res = sm.OLS(y, X, missing='drop').fit()
            pv = res.pvalues[_did0]
            coef = res.params[_did0]
            if pv < SIGINIFICANCE_LEVEL:
                if coef < 0:
                    sigRelation['neg'] += [(_did0, coef)]
                elif coef > 0:
                    sigRelation['pos'] += [(_did0, coef)]
        with open(ofpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [
                _did1, numObservations, numPrevDrivers,
                len(sigRelation['pos']) + len(sigRelation['neg']),
                len(sigRelation['pos']),
                len(sigRelation['neg']),
                '&'.join([_did0 for _did0, _ in sigRelation['pos']]),
                '&'.join([_did0 for _did0, _ in sigRelation['neg']])
            ]
            writer.writerow(new_row)
        save_pickle_file(sig_fpath, sigRelation)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, _did1)),
                  'w') as f:
            f.write(format_exc())
        raise
    logger.info('End handling; %s' % fpath)
예제 #30
0
def process_month(yymm):
    """Compute queueing times for selected (ss) drivers' weekday trips.

    Reads three per-month files in lockstep -- normal trips, extended trips
    (driver ids), and raw logs -- replays driver zone states from the logs up
    to each trip's start time, and writes one row per qualifying trip
    (weekday, AM10..PM8 window) with its queueing time.  On failure, dumps
    the traceback to '<script>_<yymm>.txt' and re-raises.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        yy, mm = yymm[:2], yymm[2:]
        trip_normal_fpath = '%s/20%s/%s/trips/trips-%s-normal.csv' % (
            taxi_home, yy, mm, yymm)
        trip_ext_fpath = '%s/20%s/%s/trips/trips-%s-normal-ext.csv' % (
            taxi_home, yy, mm, yymm)
        log_fpath = '%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm,
                                                            yymm)
        if not check_path_exist(trip_normal_fpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ss_drivers_fpath = '%s/%s%s.pkl' % (ss_drivers_dpath,
                                            ss_drivers_prefix, yymm)
        if not check_path_exist(ss_drivers_fpath):
            logger.info('The file X exists; %s' % ss_drivers_fpath)
            return None
        ss_drivers = load_pickle_file(ss_drivers_fpath)
        x_points, y_points = get_sg_grid_xy_points()
        #
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix,
                                          yymm)
        if check_path_exist(ss_trips_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return None
        # Write the output header once; rows are appended inside the loop.
        with open(ss_trips_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                'did', 'hour', 'zi', 'zj', 'time', 'day', 'month',
                'start-long', 'start-lat', 'distance', 'duration', 'fare',
                'queueingTime'
            ])
        with open(trip_normal_fpath, 'rb') as tripFileN:
            tripReaderN = csv.reader(tripFileN)
            tripHeaderN = tripReaderN.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hidN = {h: i for i, h in enumerate(tripHeaderN)}
            with open(trip_ext_fpath, 'rb') as tripFileE:
                tripReaderE = csv.reader(tripFileE)
                tripHeaderE = tripReaderE.next()
                #
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2, 'driver-id': 4, 'end-postal': 3}
                #
                hidE = {h: i for i, h in enumerate(tripHeaderE)}
                with open(log_fpath, 'rb') as logFile:
                    logReader = csv.reader(logFile)
                    logHeader = logReader.next()
                    hidL = {h: i for i, h in enumerate(logHeader)}
                    handling_day = 0
                    drivers = {}
                    for rowN in tripReaderN:
                        # NOTE(review): assumes the normal and ext trip files
                        # are aligned row-for-row -- confirm upstream writer.
                        rowE = tripReaderE.next()
                        didT = int(rowE[hidE['driver-id']])
                        if didT not in ss_drivers:
                            continue
                        # eval() parses numeric CSV fields; unsafe on
                        # untrusted input.
                        tripTime = eval(rowN[hidN['start-time']])
                        cur_dtT = datetime.datetime.fromtimestamp(tripTime)
                        if handling_day != cur_dtT.day:
                            handling_day = cur_dtT.day
                            logger.info('Processing %s %dth day' %
                                        (yymm, cur_dtT.day))
                        # Only weekday trips inside the AM10..PM8 window.
                        if cur_dtT.weekday() in [FRI, SAT, SUN]:
                            continue
                        if cur_dtT.hour < AM10:
                            continue
                        if PM8 <= cur_dtT.hour:
                            continue
                        # Replay log rows (driver zone/state updates) until
                        # the log catches up with this trip's start time.
                        # NOTE(review): logReader.next() raises StopIteration
                        # at EOF, which would escape this loop -- presumably
                        # logs always extend past the last trip; confirm.
                        while True:
                            rowL = logReader.next()
                            logTime = eval(rowL[hidL['time']])
                            didL = int(rowL[hidL['driver-id']])
                            if didL not in ss_drivers:
                                continue
                            t = eval(rowL[hidL['time']])
                            cur_dtL = datetime.datetime.fromtimestamp(t)
                            if cur_dtL.weekday() in [FRI, SAT, SUN]:
                                continue
                            if cur_dtL.hour < AM10:
                                continue
                            if PM8 <= cur_dtL.hour:
                                continue
                            longitude, latitude = eval(
                                rowL[hidL['longitude']]), eval(
                                    rowL[hidL['latitude']])
                            # Map GPS position to a grid cell; bisect - 1
                            # yields -1 for positions left of the grid.
                            zi, zj = bisect(x_points, longitude) - 1, bisect(
                                y_points, latitude) - 1
                            if zi < 0 or zj < 0:
                                continue
                            t, s = eval(rowL[hidL['time']]), eval(
                                rowL[hidL['state']])
                            z = (zi, zj)
                            cur_dt = datetime.datetime.fromtimestamp(t)
                            if handling_day != cur_dt.day:
                                handling_day = cur_dt.day
                                logger.info('Processing %s %dth day' %
                                            (yymm, cur_dt.day))
                            if not drivers.has_key(didL):
                                drivers[didL] = driver(didL, t, z, s)
                            else:
                                drivers[didL].update(t, z, s)
                            if tripTime <= logTime:
                                break
                        s_long, s_lat = eval(rowN[hidN['start-long']]), eval(
                            rowN[hidN['start-lat']])
                        zi, zj = bisect(x_points, s_long) - 1, bisect(
                            y_points, s_lat) - 1
                        if zi < 0 or zj < 0:
                            continue
                        if not drivers.has_key(didT):
                            continue
                        # -1 presumably means the driver was never observed
                        # in a free state -- TODO confirm driver() semantics.
                        if drivers[didT].firstFreeStateTime == -1:
                            continue
                        # Queueing time = trip start minus when the driver
                        # entered the pickup zone; negative values indicate
                        # inconsistent data and are skipped.
                        queueingTime = tripTime - drivers[didT].zoneEnteredTime
                        if queueingTime < 0:
                            continue
                        with open(ss_trips_fpath, 'a') as w_csvfile:
                            writer = csv.writer(w_csvfile, lineterminator='\n')
                            writer.writerow([
                                didT, cur_dtT.hour, zi, zj, tripTime,
                                cur_dtT.day, cur_dtT.month, s_long, s_lat,
                                rowN[hidN['distance']], rowN[hidN['duration']],
                                rowN[hidN['fare']], queueingTime
                            ])
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #31
0
def run():
    """Build the 2009 baseline count graph, partition it with Louvain, and
    write per-group pickles, summary rows, coefficient CSVs, and plots."""
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    #
    logger.info('Start handling SP_group_dpath')
    # Merge all count-graph pickles into one edge-weight dict (cached).
    if not check_path_exist(gp_original_fpath):
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            # max(..., 1) avoids modulo-by-zero on graphs with < 10 edges.
            moduloNumber = max(numEdges / 10, 1)
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    # max(..., 1) avoids modulo-by-zero on graphs with < 10 edges.
    moduloNumber = max(numEdges / 10, 1)
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            # BUG FIX: '%' binds tighter than '/'; the original formatted the
            # string first and then divided it by a float (TypeError).
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        # Per-group statistics for the summary CSV.
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn,
                len(drivers),
                len(weights), graphComplexity, tie_strength, contribution,
                benCon
            ])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            # Vertex labels only for small groups; they clutter big plots.
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def process_file(yymm):
    """Derive per-trip airport / night-safari trip modes for one month.

    Joins the normal and extended trip CSVs row-for-row, classifies each
    trip's drop-off/pick-up positions against the airport polygons and the
    night-safari polygon, tracks each vehicle's previous trip end position,
    and writes a full output CSV plus a time-filtered one.  On failure, the
    traceback is written to '<script>_<yymm>.txt' and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        trip_fpath = '%s/%s%s.csv' % (trip_dpath, trip_prefix, yymm)
        trip_filtered_fpath = '%s/Filtered-%s%s.csv' % (trip_dpath,
                                                        trip_prefix, yymm)
        if check_path_exist(trip_fpath):
            logger.info('The file had already been processed; %s' % trip_fpath)
            return
        # Write headers for both outputs; rows are appended in the loop.
        for fpath in [trip_fpath, trip_filtered_fpath]:
            with open(fpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = [
                    'vid', 'did', 'startTime', 'endTime', 'duration', 'fare',
                    'tripModeAP', 'tripModeNS', 'prevTripEndTime', 'year',
                    'month', 'day', 'hour', 'pickUpTerminalAP',
                    'prevEndTerminalAP'
                ]
                writer.writerow(new_headers)
        yy, mm = yymm[:2], yymm[-2:]
        yyyy = str(2000 + int(yy))
        normal_file = taxi_home + '/%s/%s/trips/trips-%s-normal.csv' % (
            yyyy, mm, yymm)
        ext_file = taxi_home + '/%s/%s/trips/trips-%s-normal-ext.csv' % (
            yyyy, mm, yymm)
        #
        year, month = int(yyyy), int(mm)
        ap_polygons, ns_polygon = get_ap_polygons(), get_ns_polygon()
        # vid -> (prev end terminal, prev end AP flag, prev end NS flag,
        #         prev trip end time)
        vehicle_prev_trip_position_time = {}
        #
        with open(normal_file, 'rb') as r_csvfile1:
            reader1 = csv.reader(r_csvfile1)
            headers1 = reader1.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hid1 = {h: i for i, h in enumerate(headers1)}
            with open(ext_file, 'rb') as r_csvfile2:
                reader2 = csv.reader(r_csvfile2)
                headers2 = reader2.next()
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2, 'driver-id': 4, 'end-postal': 3}
                hid2 = {h: i for i, h in enumerate(headers2)}
                for row1 in reader1:
                    # NOTE(review): assumes the normal and ext files are
                    # aligned row-for-row -- confirm upstream writer.
                    row2 = reader2.next()
                    #
                    vid = row1[hid1['vehicle-id']]
                    st_ts, et_ts = row1[hid1['start-time']], row1[
                        hid1['end-time']]
                    dur, fare = row1[hid1['duration']], row1[hid1['fare']]
                    day, hour = int(row1[hid1['start-day']]), int(
                        row1[hid1['start-hour']])
                    # eval() parses numeric CSV fields; unsafe on untrusted
                    # input.
                    s_long, s_lat = eval(row1[hid1['start-long']]), eval(
                        row1[hid1['start-lat']])
                    e_long, e_lat = eval(row1[hid1['end-long']]), eval(
                        row1[hid1['end-lat']])

                    # Classify start/end positions against each airport
                    # terminal polygon, remembering the terminal name.
                    # NOTE(review): IN/OUT appear to be truthy/falsy
                    # constants (c_sl_ap starts as False and is later
                    # compared to OUT) -- confirm their definitions.
                    c_sl_ap, c_el_ap = False, False
                    c_sl_ter, c_el_ter = 'X', 'X'
                    for ap_polygon in ap_polygons:
                        if not c_sl_ap:
                            res = ap_polygon.is_including((s_long, s_lat))
                            if res:
                                c_sl_ap = res
                                c_sl_ter = ap_polygon.name
                        if not c_el_ap:
                            res = ap_polygon.is_including((e_long, e_lat))
                            if res:
                                c_el_ap = res
                                c_el_ter = ap_polygon.name
                    c_sl_ns, c_el_ns = ns_polygon.is_including(
                        (s_long, s_lat)), ns_polygon.is_including(
                            (e_long, e_lat))
                    did = row2[hid2['driver-id']]
                    #
                    if not vehicle_prev_trip_position_time.has_key(vid):
                        # ASSUMPTION
                        # If this trip is the driver's first trip in a month,
                        # let's assume that the previous trip occurred at outside of the airport and Night safari
                        # and also assume that the previous trip's end time is the current trip's start time
                        # False means the trip occur at outside of the airport or Night safari
                        vehicle_prev_trip_position_time[vid] = ('X', OUT, OUT,
                                                                st_ts)
                    pt_el_ter, pt_el_ap, pt_el_ns, pt_time = vehicle_prev_trip_position_time[
                        vid]
                    ap_trip_mode, ns_trip_mode = None, None
                    #
                    # Trip mode: where did the previous trip end (D) and
                    # where does this one start (P), relative to the airport.
                    if pt_el_ap == IN and c_sl_ap == IN: ap_trip_mode = DIn_PIn
                    elif pt_el_ap == IN and c_sl_ap == OUT:
                        ap_trip_mode = DIn_POut
                    elif pt_el_ap == OUT and c_sl_ap == IN:
                        ap_trip_mode = DOut_PIn
                    elif pt_el_ap == OUT and c_sl_ap == OUT:
                        ap_trip_mode = DOut_POut
                    else:
                        assert False
                    #
                    # Same classification relative to the Night Safari.
                    if pt_el_ns == IN and c_sl_ns == IN: ns_trip_mode = DIn_PIn
                    elif pt_el_ns == IN and c_sl_ns == OUT:
                        ns_trip_mode = DIn_POut
                    elif pt_el_ns == OUT and c_sl_ns == IN:
                        ns_trip_mode = DOut_PIn
                    elif pt_el_ns == OUT and c_sl_ns == OUT:
                        ns_trip_mode = DOut_POut
                    else:
                        assert False
                    #
                    vehicle_prev_trip_position_time[vid] = (c_el_ter, c_el_ap,
                                                            c_el_ns, et_ts)
                    #
                    with open(trip_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [
                            vid, did, st_ts, et_ts, dur, fare, ap_trip_mode,
                            ns_trip_mode, pt_time, year, month, day, hour,
                            c_sl_ter, pt_el_ter
                        ]
                        writer.writerow(new_row)
                    #
                    # For filtered version
                    # Skips trips starting in [AM2, AM5].  NOTE(review): the
                    # original comment said "before 2 AM and after 6 AM" but
                    # the constant is AM5 -- confirm the intended window.
                    #
                    if AM2 <= hour and hour <= AM5:
                        continue
                    # Also skip trips falling in known-bad hours.
                    need2skip = False
                    for ys, ms, ds, hs in error_hours:
                        yyyy0 = 2000 + int(ys)
                        mm0, dd0, hh0 = map(int, [ms, ds, hs])
                        if (year == yyyy0) and (month == mm0) and (
                                day == dd0) and (hour == hh0):
                            need2skip = True
                    if need2skip: continue
                    #
                    with open(trip_filtered_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [
                            vid, did, st_ts, et_ts, dur, fare, ap_trip_mode,
                            ns_trip_mode, pt_time, year, month, day, hour,
                            c_sl_ter, pt_el_ter
                        ]
                        writer.writerow(new_row)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #33
0
def process_file(yymm):
    """Compute per-trip queueing times for the airport (AP) and Night Safari (NS)
    queues for one month and write them to per-month CSV files.

    yymm -- four-character month key, e.g. '0901' for 2009-01.

    Reads the filtered trip CSV plus the pickled zone-crossing times and, for
    each trip whose pick-up happened inside a queue zone, derives how long the
    driver waited before the pick-up.  Skips the month entirely if both output
    files already exist.  On any failure, dumps the traceback to a text file
    named after the script and month, then re-raises.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        queueingTime_ap_fpath = '%s/%s%s.csv' % (queueingTime_ap_dpath, queueingTime_ap_prefix, yymm)
        queueingTime_ns_fpath = '%s/%s%s.csv' % (queueingTime_ns_dpath, queueingTime_ns_prefix, yymm)
        if check_path_exist(queueingTime_ap_fpath) and check_path_exist(queueingTime_ns_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return
        #
        logger.info('load pickle files; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm)
        # vid -> sorted list of timestamps at which the vehicle entered the zone
        crossingTime_ap, crossingTime_ns = load_pickle_file(ap_pkl_fpath), load_pickle_file(ns_pkl_fpath)
        #
        logger.info('initiate csv files; %s' % yymm)
        with open(queueingTime_ap_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['did',
                           'startTime', 'endTime', 'duration', 'fare',
                           'tripMode', 'queueJoinTime', 'queueingTime',
                           'year', 'month', 'day', 'hour',
                           'pickUpTerminalAP', 'prevEndTerminalAP']
            writer.writerow(new_headers)
        with open(queueingTime_ns_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['did',
                           'startTime', 'endTime', 'duration', 'fare',
                           'tripMode', 'queueJoinTime', 'queueingTime',
                           'year', 'month', 'day', 'hour']
            writer.writerow(new_headers)
        #
        logger.info('start recording; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h : i for i, h in enumerate(headers)}
            for row in reader:
                did = row[hid['did']]
                et, duration = row[hid['endTime']], row[hid['duration']]
                fare = row[hid['fare']]
                year, month = row[hid['year']], row[hid['month']]
                day, hour = row[hid['day']], row[hid['hour']]
                pickUpTerminalAP, prevEndTerminalAP = row[hid['pickUpTerminalAP']], row[hid['prevEndTerminalAP']]
                #
                ap_tm, ns_tm = int(row[hid['tripModeAP']]), int(row[hid['tripModeNS']])
                # NOTE: eval() on trusted internal CSVs; parses numeric timestamps
                vid, st, prev_tet = row[hid['vid']], eval(row[hid['startTime']]), eval(row[hid['prevTripEndTime']])
                #
                # Airport trip
                #
                # BUG FIX: original tested `ap_tm != DIn_POut or ap_tm != DOut_POut`,
                # which is always True; `and` expresses the intent (skip trips that
                # only dropped off at the zone).  Output is unchanged because the
                # inner guards already rejected those rows.
                if ap_tm != DIn_POut and ap_tm != DOut_POut:
                    queueing_time = None
                    if ap_tm == DIn_PIn:
                        # previous trip ended inside the zone: queue started then
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ap_tm == DOut_PIn:
                        try:
                            # last recorded crossing into the zone before trip start
                            i = bisect(crossingTime_ap[vid], st)
                            queue_join_time = crossingTime_ap[vid][i - 1] if i != 0 else crossingTime_ap[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            # no crossing record for this vehicle; leave it as None
                            pass
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [did,
                                   st, et, duration, fare,
                                   ap_tm, queue_join_time, queueing_time,
                                   year, month, day, hour,
                                   pickUpTerminalAP, prevEndTerminalAP]
                        append_record(queueingTime_ap_fpath, new_row)
                #
                # Night Safari
                #
                # BUG FIX: same `or` -> `and` correction as for the airport case.
                if ns_tm != DIn_POut and ns_tm != DOut_POut:
                    queueing_time = None
                    if ns_tm == DIn_PIn:
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ns_tm == DOut_PIn:
                        try:
                            i = bisect(crossingTime_ns[vid], st)
                            queue_join_time = crossingTime_ns[vid][i - 1] if i != 0 else crossingTime_ns[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [did,
                                   st, et, duration, fare,
                                   ns_tm, queue_join_time, queueing_time,
                                   year, month, day, hour]
                        append_record(queueingTime_ns_fpath, new_row)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #34
0
def process_files(yymm):
    """Aggregate hourly productivity statistics for one month into a CSV.

    yymm -- four-character month key, e.g. '0901' for 2009-01.

    Builds a zero-filled stats slot for every hour from 2009-01-01 00:00 up to
    (but excluding) 2011-02-01 00:00, then folds in productive shift duration,
    total fares, and AP/NS queueing figures read from the month's intermediate
    CSVs, and finally writes one row per hour.  Skips the month if the output
    already exists; on failure, dumps the traceback to a text file and re-raises.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        productivity_fpath = '%s/%s%s.csv' % (productivity_dpath,
                                              productivity_prefix, yymm)
        if check_path_exist(productivity_fpath):
            logger.info('Already handled; %s' % yymm)
            return
        # One zeroed counter per statistic, per hour of the study period.
        stat_ids = (ALL_DUR, ALL_FARE, ALL_NUM,
                    AP_DUR, AP_FARE, AP_QUEUE, AP_NUM,
                    NS_DUR, NS_FARE, NS_QUEUE, NS_NUM)
        hourly_stats, time_period_order = {}, []
        cursor = datetime.datetime(2009, 1, 1, 0)
        period_end = datetime.datetime(2011, 2, 1, 0)
        one_hour = datetime.timedelta(hours=1)
        while cursor < period_end:
            k = (cursor.year, cursor.month, cursor.day, cursor.hour)
            hourly_stats[k] = [0 for _ in range(len(stat_ids))]
            time_period_order.append(k)
            cursor += one_hour
        st_label, et_label = 'startTime', 'endTime'
        dur_label, fare_label = 'duration', 'fare'
        qt_label = 'queueingTime'
        #
        logger.info('Productive duration; %s' % yymm)
        with open('%s/%s%s.csv' % (shiftProDur_dpath, shiftProDur_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            hid = {h: i for i, h in enumerate(reader.next())}
            for row in reader:
                k = (int(row[hid['year']]), int(row[hid['month']]),
                     int(row[hid['day']]), int(row[hid['hour']]))
                # unit change; Minute -> Second
                hourly_stats[k][ALL_DUR] += eval(row[hid['pro-dur']]) * SEC60
        #
        logger.info('Total fare; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            hid = {h: i for i, h in enumerate(reader.next())}
            for row in reader:
                st_ts = eval(row[hid[st_label]])
                et_ts = eval(row[hid[et_label]])
                dur = eval(row[hid[dur_label]])
                fare = eval(row[hid[fare_label]])
                sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare,
                                  ALL_FARE, ALL_NUM, None)
        #
        logger.info('Sum up fare, duration and queue time; %s' % yymm)
        queue_sources = [
            (queueingTime_ap_dpath, queueingTime_ap_prefix,
             AP_DUR, AP_FARE, AP_QUEUE, AP_NUM),
            (queueingTime_ns_dpath, queueingTime_ns_prefix,
             NS_DUR, NS_FARE, NS_QUEUE, NS_NUM),
        ]
        for dir_path, file_prefix, id_DUR, id_FARE, id_QUEUE, id_NUM in queue_sources:
            with open('%s/%s%s.csv' % (dir_path, file_prefix, yymm),
                      'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                hid = {h: i for i, h in enumerate(reader.next())}
                for row in reader:
                    st_ts = eval(row[hid[st_label]])
                    et_ts = eval(row[hid[et_label]])
                    dur = eval(row[hid[dur_label]])
                    fare = eval(row[hid[fare_label]])
                    qt = eval(row[hid[qt_label]])
                    #
                    sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare,
                                      id_FARE, id_NUM, id_DUR)
                    sum_queueing_time(hourly_stats, st_ts, qt, id_QUEUE)
        #
        logger.info('Generate .csv file; %s' % yymm)
        with open(productivity_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                'year', 'month', 'day', 'hour', 'allDuration', 'allFare',
                'allNum', 'apDuration', 'apFare', 'apQueueingTime', 'apNum',
                'nsDuration', 'nsFare', 'nsQueueingTime', 'nsNum'
            ])
            # Stats lists are stored in the same positional order as the header,
            # so one row is simply the hour key followed by its counters.
            for k in time_period_order:
                writer.writerow(list(k) + hourly_stats[k])
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_file(yymm):
    """Build the monthly trip table (and a filtered variant) for month `yymm`.

    Joins the raw per-trip CSV with its extension file row-by-row, classifies
    each trip's airport (AP) and Night Safari (NS) mode from the previous
    trip's end position and this trip's start position, and appends the result
    to two CSVs.  The filtered file additionally drops trips starting in the
    2 AM-5 AM window and trips falling in known error hours.  On failure the
    traceback is dumped to a text file and the exception re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        trip_fpath = '%s/%s%s.csv' % (trip_dpath, trip_prefix, yymm)
        trip_filtered_fpath = '%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm)
        # NOTE(review): only the unfiltered file is checked for existence;
        # confirm both are always produced together.
        if check_path_exist(trip_fpath):
            logger.info('The file had already been processed; %s' % trip_fpath)
            return
        # Write the (identical) header row into both output files.
        for fpath in [trip_fpath, trip_filtered_fpath]:
            with open(fpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = ['vid', 'did',
                               'startTime', 'endTime', 'duration', 'fare',
                               'tripModeAP', 'tripModeNS', 'prevTripEndTime',
                               'year', 'month', 'day', 'hour',
                               'pickUpTerminalAP', 'prevEndTerminalAP']
                writer.writerow(new_headers)
        yy, mm = yymm[:2], yymm[-2:]
        yyyy = str(2000 + int(yy))
        normal_file = taxi_home + '/%s/%s/trips/trips-%s-normal.csv' % (yyyy, mm, yymm)
        ext_file = taxi_home + '/%s/%s/trips/trips-%s-normal-ext.csv' % (yyyy, mm, yymm)
        #
        year, month = int(yyyy), int(mm)
        ap_polygons, ns_polygon = get_ap_polygons(), get_ns_polygon()
        # vid -> (end terminal, in-AP flag, in-NS flag, end time) of the
        # vehicle's previous trip; updated as rows are consumed in order.
        vehicle_prev_trip_position_time = {}
        #
        with open(normal_file, 'rb') as r_csvfile1:
            reader1 = csv.reader(r_csvfile1)
            headers1 = reader1.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hid1 = {h : i for i, h in enumerate(headers1)}
            with open(ext_file, 'rb') as r_csvfile2:
                reader2 = csv.reader(r_csvfile2)
                headers2 = reader2.next()
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2, 'driver-id': 4, 'end-postal': 3}
                hid2 = {h : i for i, h in enumerate(headers2)}
                for row1 in reader1:
                    # The ext file is read in lockstep: row N extends trip N.
                    row2 = reader2.next()
                    #
                    vid = row1[hid1['vehicle-id']]
                    st_ts, et_ts = row1[hid1['start-time']], row1[hid1['end-time']]
                    dur, fare = row1[hid1['duration']], row1[hid1['fare']]
                    day, hour = int(row1[hid1['start-day']]), int(row1[hid1['start-hour']])
                    s_long, s_lat = eval(row1[hid1['start-long']]), eval(row1[hid1['start-lat']])
                    e_long, e_lat = eval(row1[hid1['end-long']]), eval(row1[hid1['end-lat']])

                    # Test start and end positions against every airport
                    # terminal polygon; remember which terminal matched.
                    c_sl_ap, c_el_ap = False, False
                    c_sl_ter, c_el_ter = 'X', 'X'
                    for ap_polygon in ap_polygons:
                        if not c_sl_ap:
                            res = ap_polygon.is_including((s_long, s_lat))
                            if res:
                                c_sl_ap = res
                                c_sl_ter = ap_polygon.name
                        if not c_el_ap:
                            res = ap_polygon.is_including((e_long, e_lat))
                            if res:
                                c_el_ap = res
                                c_el_ter = ap_polygon.name
                    c_sl_ns, c_el_ns = ns_polygon.is_including((s_long, s_lat)), ns_polygon.is_including((e_long, e_lat))
                    did = row2[hid2['driver-id']]
                    #
                    if not vehicle_prev_trip_position_time.has_key(vid):
                        # ASSUMPTION
                        # If this trip is the driver's first trip in a month,
                        # let's assume that the previous trip occurred at outside of the airport and Night safari
                        # and also assume that the previous trip's end time is the current trip's start time
                        # False means the trip occur at outside of the airport or Night safari
                        vehicle_prev_trip_position_time[vid] = ('X', OUT, OUT, st_ts)
                    pt_el_ter, pt_el_ap, pt_el_ns, pt_time = vehicle_prev_trip_position_time[vid]
                    ap_trip_mode, ns_trip_mode = None, None
                    #
                    # Trip-mode classification: Drop-off-side x Pick-up-side
                    # (previous trip's end vs this trip's start, In/Out of zone).
                    if pt_el_ap == IN and c_sl_ap == IN: ap_trip_mode = DIn_PIn
                    elif pt_el_ap == IN and c_sl_ap == OUT: ap_trip_mode = DIn_POut
                    elif pt_el_ap == OUT and c_sl_ap == IN: ap_trip_mode = DOut_PIn
                    elif pt_el_ap == OUT and c_sl_ap == OUT: ap_trip_mode = DOut_POut
                    else: assert False
                    #
                    if pt_el_ns == IN and c_sl_ns == IN: ns_trip_mode = DIn_PIn
                    elif pt_el_ns == IN and c_sl_ns == OUT: ns_trip_mode = DIn_POut
                    elif pt_el_ns == OUT and c_sl_ns == IN: ns_trip_mode = DOut_PIn
                    elif pt_el_ns == OUT and c_sl_ns == OUT: ns_trip_mode = DOut_POut
                    else: assert False
                    #
                    # This trip's end state becomes the "previous trip" for the
                    # vehicle's next row.
                    vehicle_prev_trip_position_time[vid] = (c_el_ter, c_el_ap, c_el_ns, et_ts)
                    #
                    with open(trip_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [vid, did,
                                   st_ts, et_ts, dur, fare,
                                   ap_trip_mode, ns_trip_mode, pt_time,
                                   year, month, day, hour,
                                   c_sl_ter, pt_el_ter]
                        writer.writerow(new_row)
                    #
                    # For filtered version
                    # Skip trips starting in the 2 AM-5 AM window (inclusive)
                    # and trips that fall within known error hours.
                    #
                    if AM2 <= hour and hour <= AM5:
                        continue
                    need2skip = False
                    for ys, ms, ds, hs in error_hours:
                        yyyy0 = 2000 + int(ys)
                        mm0, dd0, hh0 = map(int, [ms, ds, hs])
                        if (year == yyyy0) and (month == mm0) and (day == dd0) and (hour == hh0):
                            need2skip = True
                    if need2skip: continue
                    #
                    with open(trip_filtered_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [vid, did,
                                   st_ts, et_ts, dur, fare,
                                   ap_trip_mode, ns_trip_mode, pt_time,
                                   year, month, day, hour,
                                   c_sl_ter, pt_el_ter]
                        writer.writerow(new_row)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #36
0
def process_file(fpath):
    """Fit per-driver OLS regressions of spending time on co-driver pick-up
    dummies and pickle the resulting influence graph.

    fpath -- CSV whose name ends '-<year>-<reducerID>.csv'.

    For each driver, regresses 'spendingTime' on candidate dummy columns that
    pass a minimum pick-up frequency threshold, then keeps edges that are
    statistically significant and NOT positive.  Skips work if the output
    pickle already exists; dumps the traceback to a text file on failure.
    """
    # FIX: format_exc was used in the except handler below but never imported
    # in this snippet, turning any failure into a NameError.
    from traceback import format_exc
    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        tm = 'spendingTime'
        st_graph_dpath = dpaths[tm, year, 'influenceGraph']
        st_graph_prefix = prefixs[tm, year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix,
                                          reducerID)
        if check_path_exist(SP_graph_fpath):
            return None
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        # (influencer_did, influenced_did) -> regression coefficient
        SP_graph = {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' %
                            (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            numObservations = len(did1_df)
            minDFResiduals = numObservations * MIN_RATIO_RESIDUAL
            did1_df = did1_df.drop([
                'month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did',
                'roamingTime'
            ],
                                   axis=1)
            if '%d' % did1 in did1_df.columns:
                # a driver does not influence himself
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            # Select dummy columns frequent enough to be regressors; if there
            # are too many for the sample size, raise the threshold and retry.
            candi_dummies = []
            num_iter = 1
            while True:
                # col_i, not i: do not shadow the outer progress counter
                for col_i, vs in enumerate(zip(*did1_df.values)):
                    if did1_df.columns[col_i] == tm:
                        continue
                    if sum(vs) > numObservations * MIN_PICKUP_RATIO * num_iter:
                        candi_dummies.append(did1_df.columns[col_i])
                numIndepVariables = len(candi_dummies)
                if numIndepVariables == 0:
                    break
                if numObservations < numIndepVariables + minDFResiduals:
                    candi_dummies = []
                    num_iter += 1
                else:
                    break
            if not candi_dummies:
                continue
            y = did1_df[tm]
            X = did1_df[candi_dummies]
            X = sm.add_constant(X)
            SP_res = sm.OLS(y, X, missing='drop').fit()
            significant_drivers = set()
            for _did0, pv in SP_res.pvalues.iteritems():
                if _did0 == 'const':
                    continue
                if pv < SIGINIFICANCE_LEVEL:
                    significant_drivers.add(_did0)
            positive_ef_drivers = set()
            for _did0, cof in SP_res.params.iteritems():
                if _did0 == 'const':
                    continue
                if cof > 0:
                    positive_ef_drivers.add(_did0)
            # keep significant effects that are not positive
            for _did0 in significant_drivers.difference(positive_ef_drivers):
                SP_graph[int(_did0), did1] = SP_res.params[_did0]
        #
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)),
                  'w') as f:
            f.write(format_exc())
        raise
예제 #37
0
def process_file(fpath):
    """Fit per-driver spending-time regressions and pickle the influence graph.

    fpath -- CSV whose name ends '-<year>-<reducerID>.csv'.

    Variant that delegates model fitting to a local `regression` helper.  For
    each driver, if the overall model is significant, keeps (other, driver)
    edges whose coefficients are significant and not positive.  Skips work if
    the output pickle exists; dumps the traceback to a text file on failure.
    (The roaming-time counterpart that used to live here was dead, commented-out
    code and has been removed.)
    """
    # FIX: format_exc was used in the except handler below but never imported
    # in this snippet, turning any failure into a NameError.
    from traceback import format_exc

    def regression(dv, df):
        # OLS of dependent variable `dv` on pick-up dummy columns; the other
        # time measure ('roamingTime' vs 'spendingTime') is dropped first.
        oc_dv = 'roamingTime' if dv == 'spendingTime' else 'spendingTime'
        rdf = df.copy(deep=True).drop([oc_dv], axis=1)
        candi_dummies = []
        num_iter = 1
        while True:
            for i, vs in enumerate(zip(*rdf.values)):
                if rdf.columns[i] == dv:
                    continue
                if sum(vs) > len(rdf) * MIN_PICKUP_RATIO * num_iter:
                    candi_dummies.append(rdf.columns[i])
            if len(rdf) <= len(candi_dummies):
                # more regressors than observations: raise threshold, retry
                candi_dummies = []
                num_iter += 1
            else:
                break
        y = rdf[dv]
        X = rdf[candi_dummies]
        X = sm.add_constant(X)
        return sm.OLS(y, X, missing='drop').fit()

    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        st_graph_dpath = dpaths['spendingTime', year, 'influenceGraph']
        st_graph_prefix = prefixs['spendingTime', year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix,
                                          reducerID)
        if check_path_exist(SP_graph_fpath):
            return None
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        # (influencer_did, influenced_did) -> regression coefficient
        SP_graph = {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' %
                            (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            did1_df = did1_df.drop(
                ['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did'],
                axis=1)
            if '%d' % did1 in did1_df.columns:
                # a driver does not influence himself
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            SP_res = regression('spendingTime', did1_df)
            if SP_res.f_pvalue < SIGINIFICANCE_LEVEL:
                significant_drivers = set()
                for _did0, pv in SP_res.pvalues.iteritems():
                    if _did0 == 'const':
                        continue
                    if pv < SIGINIFICANCE_LEVEL:
                        significant_drivers.add(_did0)
                positive_ef_drivers = set()
                for _did0, cof in SP_res.params.iteritems():
                    if _did0 == 'const':
                        continue
                    if cof > 0:
                        positive_ef_drivers.add(_did0)
                # keep significant effects that are not positive
                for _did0 in significant_drivers.difference(
                        positive_ef_drivers):
                    SP_graph[int(_did0), did1] = SP_res.params[_did0]
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)),
                  'w') as f:
            f.write(format_exc())
        raise
예제 #38
0
def process_file(yymm):
    """Record, per vehicle, the times at which it crossed INTO the airport (AP)
    and Night Safari (NS) zones during month `yymm`, and pickle the results.

    To catch crossings spanning the month boundary, the previous month's
    last-day log is replayed first (except for months with no predecessor).
    Skips the month if both output pickles exist; dumps the traceback to a
    text file on failure and re-raises.
    """
    def record_crossing_time(path_to_csv_file,
                             veh_ap_crossing_time, veh_last_log_ap_or_not,
                             veh_ns_crossing_time, veh_last_log_ns_or_not):
        # Scan one log CSV in time order and append a crossing timestamp each
        # time a vehicle's zone flag flips OUT -> IN (or its very first log is
        # already IN).  Mutates and returns the four state dicts.
        with open(path_to_csv_file, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                t, vid = eval(row[hid['time']]), row[hid['vid']]
                ap_or_not, ns_or_not = eval(row[hid['ap-or-not']]), eval(row[hid['ns-or-not']])
                #
                if not veh_last_log_ap_or_not.has_key(vid):
                    if ap_or_not == IN:
                        # the first log's position occurred inside the AP zone
                        assert not veh_ap_crossing_time.has_key(vid)
                        veh_ap_crossing_time[vid] = [t]
                else:
                    if veh_last_log_ap_or_not[vid] == OUT and ap_or_not == IN:
                        # BUG FIX: was setdefault(vid, [t]).append(t), which
                        # recorded the first crossing twice ([t, t]).
                        veh_ap_crossing_time.setdefault(vid, []).append(t)
                #
                if not veh_last_log_ns_or_not.has_key(vid):
                    if ns_or_not == IN:
                        # the first log's position occurred inside the NS zone
                        assert not veh_ns_crossing_time.has_key(vid)
                        veh_ns_crossing_time[vid] = [t]
                else:
                    if veh_last_log_ns_or_not[vid] == OUT and ns_or_not == IN:
                        # BUG FIX: same duplicated-first-timestamp issue as AP.
                        veh_ns_crossing_time.setdefault(vid, []).append(t)
                #
                veh_last_log_ap_or_not[vid] = ap_or_not
                veh_last_log_ns_or_not[vid] = ns_or_not
        return veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not
    #
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm)
        if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
            return None
        # (removed a py2 `print` here that duplicated the logger.info above)
        veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
        veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
        if yymm not in ['0901', '1001', '1011']:
            # Replay the previous month's last-day log to seed the state.
            # NOTE(review): prev_m becomes 0 for any January month not in the
            # exclusion list (e.g. '1101'), which would match no file; also
            # confirm whether '1011' is a typo for '1101'.
            y, m = int(yymm[:2]), int(yymm[2:])
            prev_m = m - 1
            prev_yymm = '%02d%02d' %(y, prev_m)
            prev_fn = get_all_files(log_last_day_dpath, '%s%s*.csv' % (log_last_day_prefix, prev_yymm))[0]
            path_to_last_day_csv_file = '%s/%s' % (log_last_day_dpath, prev_fn)
            veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                            record_crossing_time(path_to_last_day_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
        path_to_csv_file = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm)
        veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
                record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                                     veh_ns_crossing_time, veh_last_log_ns_or_not)
        #
        save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
        save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #39
0
def process_month(yymm):
    """Build single-shift (ss) trip records for one month.

    Joins three per-month inputs -- the normal trip file, its extension
    (driver ids, zones) and the raw driver logs -- restricted to the
    pre-selected drivers in the ss_drivers pickle.  For each qualifying
    trip (weekday excluding FRI/SAT/SUN, between AM10 and PM8, start point
    inside the Singapore grid) one row with the driver's queueing time at
    the pick-up zone is appended to the ss_trips .csv.

    yymm -- two-digit year + two-digit month string, e.g. '0901'.
    Returns None; on any exception the traceback is written to
    '<script>_<yymm>.txt' and the exception is re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        yy, mm = yymm[:2], yymm[2:]
        trip_normal_fpath = '%s/20%s/%s/trips/trips-%s-normal.csv' % (taxi_home, yy, mm, yymm)
        trip_ext_fpath = '%s/20%s/%s/trips/trips-%s-normal-ext.csv' % (taxi_home, yy, mm, yymm)
        log_fpath = '%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm)
        if not check_path_exist(trip_normal_fpath):
            # BUGFIX: this branch runs when the input is MISSING; the old
            # message claimed the opposite ('The file X exists').
            logger.info('The file does not exist; %s' % yymm)
            return None
        ss_drivers_fpath = '%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)
        if not check_path_exist(ss_drivers_fpath):
            # BUGFIX: same inverted message as above.
            logger.info('The file does not exist; %s' % ss_drivers_fpath)
            return None
        ss_drivers = load_pickle_file(ss_drivers_fpath)
        x_points, y_points = get_sg_grid_xy_points()
        #
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix, yymm)
        if check_path_exist(ss_trips_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return None
        with open(ss_trips_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['did',
                             'hour', 'zi', 'zj',
                             'time', 'day', 'month',
                             'start-long', 'start-lat',
                             'distance', 'duration', 'fare',
                             'queueingTime'])
        with open(trip_normal_fpath, 'rb') as tripFileN:
            tripReaderN = csv.reader(tripFileN)
            tripHeaderN = tripReaderN.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hidN = {h: i for i, h in enumerate(tripHeaderN)}
            with open(trip_ext_fpath, 'rb') as tripFileE:
                tripReaderE = csv.reader(tripFileE)
                tripHeaderE = tripReaderE.next()
                #
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2, 'driver-id': 4, 'end-postal': 3}
                #
                hidE = {h: i for i, h in enumerate(tripHeaderE)}
                with open(log_fpath, 'rb') as logFile:
                    logReader = csv.reader(logFile)
                    logHeader = logReader.next()
                    hidL = {h: i for i, h in enumerate(logHeader)}
                    handling_day = 0
                    drivers = {}
                    # The normal and extended trip files are row-aligned:
                    # rowE carries the extra attributes of the same trip as rowN.
                    for rowN in tripReaderN:
                        rowE = tripReaderE.next()
                        didT = int(rowE[hidE['driver-id']])
                        if didT not in ss_drivers:
                            continue
                        # NOTE: eval() parses numeric fields of an internal,
                        # trusted CSV; do not reuse this pattern on untrusted input.
                        tripTime = eval(rowN[hidN['start-time']])
                        cur_dtT = datetime.datetime.fromtimestamp(tripTime)
                        if handling_day != cur_dtT.day:
                            handling_day = cur_dtT.day
                            logger.info('Processing %s %dth day' % (yymm, cur_dtT.day))
                        if cur_dtT.weekday() in [FRI, SAT, SUN]:
                            continue
                        if cur_dtT.hour < AM10:
                            continue
                        if PM8 <= cur_dtT.hour:
                            continue
                        # Replay the log stream up to this trip's start time,
                        # tracking each selected driver's zone/state.
                        while True:
                            rowL = logReader.next()
                            logTime = eval(rowL[hidL['time']])
                            didL = int(rowL[hidL['driver-id']])
                            if didL not in ss_drivers:
                                continue
                            t = logTime  # already parsed above; avoid a second eval
                            cur_dtL = datetime.datetime.fromtimestamp(t)
                            if cur_dtL.weekday() in [FRI, SAT, SUN]:
                                continue
                            if cur_dtL.hour < AM10:
                                continue
                            if PM8 <= cur_dtL.hour:
                                continue
                            longitude, latitude = eval(rowL[hidL['longitude']]), eval(rowL[hidL['latitude']])
                            # grid-cell indices; a negative index means the
                            # point lies outside the grid
                            zi, zj = bisect(x_points, longitude) - 1, bisect(y_points, latitude) - 1
                            if zi < 0 or zj < 0:
                                continue
                            s = eval(rowL[hidL['state']])
                            z = (zi, zj)
                            cur_dt = datetime.datetime.fromtimestamp(t)
                            if handling_day != cur_dt.day:
                                handling_day = cur_dt.day
                                logger.info('Processing %s %dth day' % (yymm, cur_dt.day))
                            if didL not in drivers:
                                drivers[didL] = driver(didL, t, z, s)
                            else:
                                drivers[didL].update(t, z, s)
                            if tripTime <= logTime:
                                break
                        s_long, s_lat = eval(rowN[hidN['start-long']]), eval(rowN[hidN['start-lat']])
                        zi, zj = bisect(x_points, s_long) - 1, bisect(y_points, s_lat) - 1
                        if zi < 0 or zj < 0:
                            continue
                        if didT not in drivers:
                            continue
                        if drivers[didT].firstFreeStateTime == -1:
                            continue
                        # queueing time = trip start minus the moment the
                        # driver entered the pick-up zone
                        queueingTime = tripTime - drivers[didT].zoneEnteredTime
                        if queueingTime < 0:
                            continue
                        with open(ss_trips_fpath, 'a') as w_csvfile:
                            writer = csv.writer(w_csvfile, lineterminator='\n')
                            writer.writerow([didT,
                                             cur_dtT.hour, zi, zj,
                                             tripTime, cur_dtT.day, cur_dtT.month,
                                             s_long, s_lat,
                                             rowN[hidN['distance']], rowN[hidN['duration']], rowN[hidN['fare']],
                                             queueingTime])
    except Exception as _:
        import sys
        # Dump the traceback next to the script so a worker failure in a
        # batch run can be diagnosed afterwards, then propagate.
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #40
0
def run():
    """Partition the 2009 baseline count graph into driver groups.

    Loads (or builds and caches) the full edge->weight dict, converts it to
    a directed igraph with |weight| edge weights, runs Louvain modularity
    partitioning, and for each resulting group writes: a pickled subgraph,
    a layout plot (.pdf), an edge-coefficient .csv and one summary row;
    finally pickles the groupName -> driver list mapping.
    """
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity', 'tieStrength', 'contribution', 'benCon'])
    #
    logger.info('Start handling SP_group_dpath')
    if not check_path_exist(gp_original_fpath):
        # Merge every per-file count graph into one dict, cached as a pickle.
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            # guard: fewer than 10 edges would make the modulo divisor 0
            moduloNumber = max(numEdges / 10, 1)
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    moduloNumber = max(numEdges / 10, 1)  # same zero-divisor guard as above
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            # BUGFIX: '%' binds tighter than '/', so the original
            # "'%.2f' % i / float(numEdges)" formatted first and then tried
            # to divide a *string*, raising TypeError.  Parenthesize the ratio.
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        # Per-group statistics written to the summary .csv.
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity, tie_strength, contribution, benCon])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        # vertex labels are only legible on small groups
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
예제 #41
0
def process_file(yymm):
    """Compute per-trip queueing times for the airport (AP) and Night
    Safari (NS) areas of one month.

    Reads the filtered trip file and, for each trip whose mode indicates a
    pick-up inside the area, derives the queue-join time: the previous
    trip's end (driver stayed inside, DIn_PIn) or the vehicle's last
    recorded crossing into the area before the trip start (DOut_PIn).
    Rows with queueing time >= Q_LIMIT_MIN are appended to the AP/NS
    output .csv files.  On error the traceback is written to
    '<script>_<yymm>.txt' and the exception is re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        queueingTime_ap_fpath = '%s/%s%s.csv' % (queueingTime_ap_dpath,
                                                 queueingTime_ap_prefix, yymm)
        queueingTime_ns_fpath = '%s/%s%s.csv' % (queueingTime_ns_dpath,
                                                 queueingTime_ns_prefix, yymm)
        if check_path_exist(queueingTime_ap_fpath) and check_path_exist(
                queueingTime_ns_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return
        #
        logger.info('load pickle files; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath,
                                        crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath,
                                        crossingTime_ns_prefix, yymm)
        # vid -> sorted list of crossing timestamps, per area
        crossingTime_ap, crossingTime_ns = load_pickle_file(
            ap_pkl_fpath), load_pickle_file(ns_pkl_fpath)
        #
        logger.info('initiate csv files; %s' % yymm)
        with open(queueingTime_ap_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'did', 'startTime', 'endTime', 'duration', 'fare', 'tripMode',
                'queueJoinTime', 'queueingTime', 'year', 'month', 'day',
                'hour', 'pickUpTerminalAP', 'prevEndTerminalAP'
            ]
            writer.writerow(new_headers)
        with open(queueingTime_ns_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'did', 'startTime', 'endTime', 'duration', 'fare', 'tripMode',
                'queueJoinTime', 'queueingTime', 'year', 'month', 'day', 'hour'
            ]
            writer.writerow(new_headers)
        #
        logger.info('start recording; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did = row[hid['did']]
                et, duration = row[hid['endTime']], row[hid['duration']]
                fare = row[hid['fare']]
                year, month = row[hid['year']], row[hid['month']]
                day, hour = row[hid['day']], row[hid['hour']]
                pickUpTerminalAP, prevEndTerminalAP = row[
                    hid['pickUpTerminalAP']], row[hid['prevEndTerminalAP']]
                #
                ap_tm, ns_tm = int(row[hid['tripModeAP']]), int(
                    row[hid['tripModeNS']])
                # NOTE: eval() parses numeric fields of an internal, trusted CSV.
                vid, st, prev_tet = row[hid['vid']], eval(
                    row[hid['startTime']]), eval(row[hid['prevTripEndTime']])
                #
                # Airport trip
                #
                # BUGFIX: was 'ap_tm != DIn_POut or ap_tm != DOut_POut',
                # which is always true; the intent is to skip the pure
                # drop-off modes (behavior was shielded by the None guard).
                if ap_tm not in (DIn_POut, DOut_POut):
                    queueing_time = None
                    if ap_tm == DIn_PIn:
                        # driver stayed inside: queue began at prev trip's end
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ap_tm == DOut_PIn:
                        try:
                            # last crossing into the area before the trip start
                            i = bisect(crossingTime_ap[vid], st)
                            queue_join_time = crossingTime_ap[vid][
                                i - 1] if i != 0 else crossingTime_ap[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            # no crossing record for this vehicle; skip the row
                            pass
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [
                            did, st, et, duration, fare, ap_tm,
                            queue_join_time, queueing_time, year, month, day,
                            hour, pickUpTerminalAP, prevEndTerminalAP
                        ]
                        append_record(queueingTime_ap_fpath, new_row)
                #
                # Night Safari
                #
                # BUGFIX: same always-true condition as the AP branch above.
                if ns_tm not in (DIn_POut, DOut_POut):
                    queueing_time = None
                    if ns_tm == DIn_PIn:
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ns_tm == DOut_PIn:
                        try:
                            i = bisect(crossingTime_ns[vid], st)
                            queue_join_time = crossingTime_ns[vid][
                                i - 1] if i != 0 else crossingTime_ns[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [
                            did, st, et, duration, fare, ns_tm,
                            queue_join_time, queueing_time, year, month, day,
                            hour
                        ]
                        append_record(queueingTime_ns_fpath, new_row)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        # Persist the traceback for post-mortem of batch workers, then re-raise.
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #42
0
def process_files(yymm):
    """Aggregate hourly productivity statistics for one month.

    Builds an hour-indexed counter table covering 2009-01-01 .. 2011-01-31,
    accumulates productive shift duration, overall fares/trip counts and
    the AP/NS fare, duration and queueing-time sums, then writes one .csv
    row per hour.  On error the traceback is written to
    '<script>_<yymm>.txt' and the exception is re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        productivity_fpath = '%s/%s%s.csv' % (productivity_dpath, productivity_prefix, yymm)
        if check_path_exist(productivity_fpath):
            logger.info('Already handled; %s' % yymm)
            return
        begin_datetime = datetime.datetime(2009, 1, 1, 0)
        last_datetime = datetime.datetime(2011, 2, 1, 0)
        # number of counter slots per hour -- one per statistic index;
        # hoisted out of the loop (was rebuilt for every hour of two years)
        num_stats = len([ALL_DUR, ALL_FARE, ALL_NUM,
                         AP_DUR, AP_FARE, AP_QUEUE, AP_NUM,
                         NS_DUR, NS_FARE, NS_QUEUE, NS_NUM])
        hourly_stats, time_period_order = {}, []
        while begin_datetime < last_datetime:
            year, month, day, hour = begin_datetime.year, begin_datetime.month, begin_datetime.day, begin_datetime.hour
            k = (year, month, day, hour)
            hourly_stats[k] = [0] * num_stats
            time_period_order.append(k)
            begin_datetime += datetime.timedelta(hours=1)
        st_label, et_label, dur_label, fare_label = 'startTime', 'endTime', 'duration', 'fare'
        qt_label = 'queueingTime'
        #
        logger.info('Productive duration; %s' % yymm)
        with open('%s/%s%s.csv' % (shiftProDur_dpath, shiftProDur_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                year, month = int(row[hid['year']]), int(row[hid['month']])
                day, hour = int(row[hid['day']]), int(row[hid['hour']])
                hourly_stats[(year, month, day, hour)][ALL_DUR] += eval(row[hid['pro-dur']]) * SEC60  # unit change; Minute -> Second
        #
        logger.info('Total fare; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                st_ts, et_ts = eval(row[hid[st_label]]), eval(row[hid[et_label]])
                dur, fare = eval(row[hid[dur_label]]), eval(row[hid[fare_label]])
                # prorate fare/count across the hours the trip spans
                sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare, ALL_FARE, ALL_NUM, None)

        #
        logger.info('Sum up fare, duration and queue time; %s' % yymm)
        for dir_path, file_prefix, id_DUR, id_FARE, id_QUEUE, id_NUM in [(queueingTime_ap_dpath, queueingTime_ap_prefix,
                                                                          AP_DUR, AP_FARE, AP_QUEUE, AP_NUM),
                                                                         (queueingTime_ns_dpath, queueingTime_ns_prefix,
                                                                          NS_DUR, NS_FARE, NS_QUEUE, NS_NUM)]:
            with open('%s/%s%s.csv' % (dir_path, file_prefix, yymm), 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    st_ts, et_ts = eval(row[hid[st_label]]), eval(row[hid[et_label]])
                    dur, fare = eval(row[hid[dur_label]]), eval(row[hid[fare_label]])
                    qt = eval(row[hid[qt_label]])
                    #
                    sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare, id_FARE, id_NUM, id_DUR)
                    sum_queueing_time(hourly_stats, st_ts, qt, id_QUEUE)
        #
        logger.info('Generate .csv file; %s' % yymm)
        with open(productivity_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'hour',
                      'allDuration', 'allFare', 'allNum',
                      'apDuration', 'apFare', 'apQueueingTime', 'apNum',
                      'nsDuration', 'nsFare', 'nsQueueingTime', 'nsNum']
            writer.writerow(header)
            for year, month, day, hour in time_period_order:
                all_dur, all_fare, all_num, \
                ap_dur, ap_fare, ap_qt, ap_num, \
                ns_dur, ns_fare, ns_qt, ns_num = hourly_stats[(year, month, day, hour)]
                #
                writer.writerow([year, month, day, hour,
                                 all_dur, all_fare, all_num,
                                 ap_dur, ap_fare, ap_qt, ap_num,
                                 ns_dur, ns_fare, ns_qt, ns_num
                                 ])
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        # Persist the traceback for post-mortem of batch workers, then re-raise.
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
예제 #43
0
def process_file(fpath):
    """Estimate pairwise driver influence on spending time via OLS.

    For each driver in a reducer's .csv, regress 'spendingTime' on dummy
    columns of the other drivers (a dummy is kept only when its pick-up
    count clears an adaptively tightened threshold so the model retains
    enough residual degrees of freedom).  Coefficients that are
    statistically significant (p < SIGINIFICANCE_LEVEL) and NOT positive
    are recorded into the pickled influence graph for this reducer.
    """
    # BUGFIX: the except handler below calls format_exc(), but this function
    # never imported it (sibling functions import it locally) -- a failure
    # would have raised NameError instead of writing the dump file.
    from traceback import format_exc
    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        tm = 'spendingTime'
        st_graph_dpath = dpaths[tm, year, 'influenceGraph']
        st_graph_prefix = prefixs[tm, year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix, reducerID)
        if check_path_exist(SP_graph_fpath):
            return None
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        SP_graph = {}  # (influencer_did, influenced_did) -> coefficient
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' % (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            numObservations = len(did1_df)
            minDFResiduals = numObservations * MIN_RATIO_RESIDUAL
            did1_df = did1_df.drop(['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did', 'roamingTime'], axis=1)
            # a driver cannot influence himself
            if '%d' % did1 in did1_df.columns:
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            candi_dummies = []
            num_iter = 1
            # Tighten the pick-up-ratio threshold until the regressor count
            # leaves enough residual degrees of freedom (or no dummies remain).
            while True:
                # 'ci' (column index) -- renamed from 'i', which shadowed the
                # outer progress counter
                for ci, vs in enumerate(zip(*did1_df.values)):
                    if did1_df.columns[ci] == tm:
                        continue
                    if sum(vs) > numObservations * MIN_PICKUP_RATIO * num_iter:
                        candi_dummies.append(did1_df.columns[ci])
                numIndepVariables = len(candi_dummies)
                if numIndepVariables == 0:
                    break
                if numObservations < numIndepVariables + minDFResiduals:
                    # too many regressors for the sample size: retry stricter
                    candi_dummies = []
                    num_iter += 1
                else:
                    break
            if not candi_dummies:
                continue
            y = did1_df[tm]
            X = did1_df[candi_dummies]
            X = sm.add_constant(X)
            SP_res = sm.OLS(y, X, missing='drop').fit()
            # if SP_res.f_pvalue < SIGINIFICANCE_LEVEL:
            significant_drivers = set()
            for _did0, pv in SP_res.pvalues.iteritems():
                if _did0 == 'const':
                    continue
                if pv < SIGINIFICANCE_LEVEL:
                    significant_drivers.add(_did0)
            positive_ef_drivers = set()
            for _did0, cof in SP_res.params.iteritems():
                if _did0 == 'const':
                    continue
                if cof > 0:
                    positive_ef_drivers.add(_did0)
            # keep only significant AND non-positive (time-saving) effects
            for _did0 in significant_drivers.difference(positive_ef_drivers):
                SP_graph[int(_did0), did1] = SP_res.params[_did0]
        #
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)), 'w') as f:
            f.write(format_exc())
        raise
예제 #44
0
def process_file(yymm):
    ap_pkl_file_path = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix,
                                        yymm)
    ns_pkl_file_path = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix,
                                        yymm)
    if not (check_path_exist(ap_pkl_file_path)
            and check_path_exist(ns_pkl_file_path)):
        return None
    #
    # Load pickle files
    #
    ap_crossing_time, ns_crossing_time = load_pickle_file(
        ap_pkl_file_path), load_pickle_file(ns_pkl_file_path)
    #
    # Initiate csv files
    #
    ap_trip_fpath = '%s/%s%s.csv' % (ap_trips_dir, ap_trip_prefix, yymm)
    ns_trip_fpath = '%s/%s%s.csv' % (ns_trips_dir, ns_trip_prefix, yymm)
    if check_path_exist(ap_trip_fpath) and check_path_exist(ns_trip_fpath):
        return None
    print 'handle the file; %s' % yymm
    for fpath in [ap_trip_fpath, ns_trip_fpath]:
        with open(fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'tid', 'vid', 'did', 'start-time', 'end-time', 'duration',
                'fare', 'prev-trip-end-time', 'trip-mode', 'queue—join-time',
                'queueing-time'
            ]
            writer.writerow(new_headers)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm),
              'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        for row in reader:
            tid, did = row[hid['tid']], row[hid['did']]
            et, duration = row[hid['end-time']], row[hid['duration']]
            fare = row[hid['fare']]
            #
            ap_tm, ns_tm = int(row[hid['ap-trip-mode']]), int(
                row[hid['ns-trip-mode']])
            vid, st, prev_tet = row[hid['vid']], eval(
                row[hid['start-time']]), eval(row[hid['prev-trip-end-time']])
            #
            for tm, crossing_time, fpath in [
                (ap_tm, ap_crossing_time, ap_trip_fpath),
                (ns_tm, ns_crossing_time, ns_trip_fpath)
            ]:
                if tm == DIn_POut or tm == DOut_POut:
                    continue
                if tm == DIn_PIn:
                    queue_join_time = prev_tet
                elif tm == DOut_PIn:
                    try:
                        i = bisect(crossing_time[vid], st)
                    except KeyError:
                        print '%s-tid-%s' % (yymm, row[hid['tid']])
                        continue
                    queue_join_time = crossing_time[vid][
                        i - 1] if i != 0 else crossing_time[vid][0]
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    queueing_time = st - queue_join_time
                    if queueing_time < Q_LIMIT_MIN:
                        queueing_time = Q_LIMIT_MIN
                    new_row = [
                        tid, vid, did, st, et, duration, fare, prev_tet, tm,
                        queue_join_time, queueing_time
                    ]
                    writer.writerow(new_row)
    print 'end the file; %s' % yymm
예제 #45
0
def process_files(yymm):
    productivity_fpath = '%s/%s%s.csv' % (productivity_dir,
                                          productivity_prefix, yymm)
    if check_path_exist(productivity_fpath):
        return None

    print 'handle the file; %s' % yymm
    begin_datetime = datetime.datetime(2009, 1, 1, 0)
    last_datetime = datetime.datetime(2011, 2, 1, 0)
    hourly_stats, time_period_order = {}, []
    while begin_datetime < last_datetime:
        yyyy, mm, dd, hh = begin_datetime.year, begin_datetime.month, begin_datetime.day, begin_datetime.hour
        k = (yyyy, mm, dd, hh)
        hourly_stats[k] = [
            0 for _ in range(
                len([
                    ALL_DUR, ALL_FARE, ALL_NUM, AP_DUR, AP_FARE, AP_QUEUE,
                    AP_NUM, NS_DUR, NS_FARE, NS_QUEUE, NS_NUM
                ]))
        ]
        time_period_order.append(k)
        begin_datetime += datetime.timedelta(hours=1)
    #
    st_label, et_label, dur_label, fare_label = 'start-time', 'end-time', 'duration', 'fare'
    qt_label = 'queueing-time'
    # Productive duration
    print yymm, 'Productive duration'
    yyyy, mm = 2000 + int(yymm[:2]), int(yymm[2:])
    with open('%s/%s%s.csv' % (shift_pro_dur_dir, shift_pro_dur_prefix, yymm),
              'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        for row in reader:
            dd, hh = eval(row[hid['dd']]), eval(row[hid['hh']])
            hourly_stats[(yyyy, mm, dd, hh)][ALL_DUR] += eval(
                row[hid['pro-dur']]) * SEC60  # unit change; Minute -> Second
    # Total fare
    print yymm, 'Total fare'
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm),
              'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        for row in reader:
            st_ts, et_ts = eval(row[hid[st_label]]), eval(row[hid[et_label]])
            dur, fare = eval(row[hid[dur_label]]), eval(row[hid[fare_label]])
            sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare, ALL_FARE,
                              ALL_NUM, None)

    # Sum up fare, duration and queue time
    print yymm, 'Sum up fare, duration and queue time'
    for dir_path, file_prefix, id_DUR, id_FARE, id_QUEUE, id_NUM in [
        (ap_trips_dir, ap_trip_prefix, AP_DUR, AP_FARE, AP_QUEUE, AP_NUM),
        (ns_trips_dir, ns_trip_prefix, NS_DUR, NS_FARE, NS_QUEUE, NS_NUM)
    ]:
        with open('%s/%s%s.csv' % (dir_path, file_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                st_ts, et_ts = eval(row[hid[st_label]]), eval(
                    row[hid[et_label]])
                dur, fare = eval(row[hid[dur_label]]), eval(
                    row[hid[fare_label]])
                qt = eval(row[hid[qt_label]])
                #
                sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare,
                                  id_FARE, id_NUM, id_DUR)
                sum_queueing_time(hourly_stats, st_ts, qt, id_QUEUE)
    # Generate .csv file
    print yymm, 'Generate .csv file'
    with open(productivity_fpath, 'wb') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = [
            'yy', 'mm', 'dd', 'hh', 'all-duration', 'all-fare', 'all-num',
            'ap-duration', 'ap-fare', 'ap-queueing-time', 'ap-num',
            'ns-duration', 'ns-fare', 'ns-queueing-time', 'ns-num'
        ]
        writer.writerow(header)
        for yyyy, mm, dd, hh in time_period_order:
            all_dur, all_fare, all_num, \
            ap_dur, ap_fare, ap_qt, ap_num, \
            ns_dur, ns_fare, ns_qt, ns_num = hourly_stats[(yyyy, mm, dd, hh)]
            #
            writer.writerow([
                yyyy - 2000, mm, dd, hh, all_dur, all_fare, all_num, ap_dur,
                ap_fare, ap_qt, ap_num, ns_dur, ns_fare, ns_qt, ns_num
            ])
    print 'end the file; %s' % yymm