Пример #1
0
def load_schedule(tdate, dpath):
    """Build the scheduled stop-time table for services active on one date.

    Loads trips, stop times and the service calendar through the project's
    ``gtfs`` module, keeps the stop times of trips whose ``service_id`` is
    active on ``tdate``, and adds derived schedule columns.

    Args:
        tdate: Service date; passed unchanged to the ``gtfs`` loaders and to
            ``TransitCalendar.get_service_ids``.
        dpath: Location of the GTFS data; passed unchanged to the ``gtfs``
            loaders.

    Returns:
        A DataFrame indexed by ``(trip_id, stop_id)`` containing the
        stop-time columns plus:

        * ``route_id`` -- joined from the trips table.
        * ``sched_hour`` -- the first two characters of ``arrival_time``
          converted to ``int``.
        * ``sched_arrival_time`` -- ``arrival_time`` parsed with
          ``ttools.parseTime``.
        * ``sched_headway`` -- difference, in seconds, between consecutive
          ``sched_arrival_time`` values within each ``(route_id, stop_id)``
          group (missing for the first arrival of every group).

    Raises:
        ValueError: presumably raised by pandas when a ``(trip_id, stop_id)``
            pair is duplicated, because the index is built with
            ``verify_integrity=True``.
    """
    trips = gtfs.load_trips(tdate, dpath)
    # load_stop_times returns a pair; only the stop-time table is needed here.
    stop_times, _ = gtfs.load_stop_times(tdate, dpath)
    tcal = gtfs.TransitCalendar(tdate, dpath)
    active_services = tcal.get_service_ids(tdate)

    # Boolean mask over the trips table: True where the trip's service runs.
    active_trips = trips.service_id.isin(active_services)
    active_stops = stop_times.reset_index().set_index('trip_id').loc[active_trips]
    active_stops['sched_hour'] = active_stops.arrival_time.str[:2].astype(int)
    active_stops['sched_arrival_time'] = active_stops.arrival_time.apply(ttools.parseTime)

    sched_times = active_stops.join(trips['route_id'], how='left')
    # DataFrame.sort() was removed from pandas; sort_values() is the
    # replacement. Rows must be in arrival order before the grouped diff().
    sched_times = sched_times.reset_index().sort_values(['route_id', 'sched_arrival_time'])

    # Dividing a timedelta column by a one-second timedelta yields seconds.
    sec = ttools.datetime.timedelta(seconds=1)
    grouped_arrivals = sched_times.groupby(['route_id', 'stop_id'])['sched_arrival_time']
    sched_times['sched_headway'] = grouped_arrivals.diff() / sec

    sched_times.set_index(['trip_id', 'stop_id'], inplace=True, verify_integrity=True)
    return sched_times
Пример #2
0
def load_schedule(tdate, dpath):
    """Build the scheduled stop-time table for services active on one date.

    Loads trips, stop times and the service calendar through the project's
    ``gtfs`` module, keeps the stop times of trips whose ``service_id`` is
    active on ``tdate``, and adds derived schedule columns.

    Args:
        tdate: Service date; passed unchanged to the ``gtfs`` loaders and to
            ``TransitCalendar.get_service_ids``.
        dpath: Location of the GTFS data; passed unchanged to the ``gtfs``
            loaders.

    Returns:
        A DataFrame indexed by ``(trip_id, stop_id)`` containing the
        stop-time columns plus:

        * ``route_id`` -- joined from the trips table.
        * ``sched_hour`` -- the first two characters of ``arrival_time``
          converted to ``int``.
        * ``sched_arrival_time`` -- ``arrival_time`` parsed with
          ``ttools.parseTime``.
        * ``sched_headway`` -- difference, in seconds, between consecutive
          ``sched_arrival_time`` values within each ``(route_id, stop_id)``
          group (missing for the first arrival of every group).

    Raises:
        ValueError: presumably raised by pandas when a ``(trip_id, stop_id)``
            pair is duplicated, because the index is built with
            ``verify_integrity=True``.
    """
    trips = gtfs.load_trips(tdate, dpath)
    # load_stop_times returns a pair; only the stop-time table is needed here.
    stop_times, _ = gtfs.load_stop_times(tdate, dpath)
    tcal = gtfs.TransitCalendar(tdate, dpath)
    active_services = tcal.get_service_ids(tdate)

    # Boolean mask over the trips table: True where the trip's service runs.
    active_trips = trips.service_id.isin(active_services)
    active_stops = stop_times.reset_index().set_index(
        'trip_id').loc[active_trips]
    active_stops['sched_hour'] = active_stops.arrival_time.str[:2].astype(int)
    active_stops['sched_arrival_time'] = active_stops.arrival_time.apply(
        ttools.parseTime)

    sched_times = active_stops.join(trips['route_id'], how='left')
    # DataFrame.sort() was removed from pandas; sort_values() is the
    # replacement. Rows must be in arrival order before the grouped diff().
    sched_times = sched_times.reset_index().sort_values(
        ['route_id', 'sched_arrival_time'])

    # Dividing a timedelta column by a one-second timedelta yields seconds.
    sec = ttools.datetime.timedelta(seconds=1)
    grouped_arrivals = sched_times.groupby(
        ['route_id', 'stop_id'])['sched_arrival_time']
    sched_times['sched_headway'] = grouped_arrivals.diff() / sec

    sched_times.set_index(['trip_id', 'stop_id'],
                          inplace=True,
                          verify_integrity=True)
    return sched_times
scheduled arrival time at one stop for one example trip.
"""

import os
import pandas as pd

# these two modules are homemade
import gtfs
import arrivals
import time
# NOTE: this changes the process-wide working directory; the relative paths
# used below ('gtfs/', 'newdata_parsed.csv') are resolved against it.
os.chdir('/gpfs2/projects/project-bus_capstone_2016/workspace/share')

# get all the schedule data. (subset can be created later)
trips = gtfs.load_trips('gtfs/')
stops = gtfs.load_stops('gtfs/')
stop_times, tz_sched = gtfs.load_stop_times('gtfs/')
# Parenthesised print is valid as a Python 2 statement and a Python 3 call.
print('Finished loading GTFS data.')

# get the sample of parsed AVL data.  Beware, takes a few minutes.
bustime = pd.read_csv('newdata_parsed.csv')#,parse_dates=dt_columns)
# Keep only the three example trips named in the query string.
qstr = ('Trip == "MTA NYCT_MV_B6-Weekday-SDon-038500_M5_203" or '
    'Trip == "MTA NYCT_MV_B6-Weekday-SDon-036500_M5_202" or '
    'Trip == "MTA NYCT_MV_B6-Weekday-SDon-040000_M5_204"')
bustime = bustime.query(qstr)
# One row per (vehicle, timestamp); set_index below verifies uniqueness of
# the full five-column key.
bustime.drop_duplicates(['vehicleID','RecordedAtTime'],inplace=True)
bustime.set_index(['Line','Trip','TripDate','vehicleID','RecordedAtTime'],
                  inplace=True,drop=True,verify_integrity=True)

# for now, use a truncated data set.  just get data for one line (M5).
tripDateLookup = "2016-06-13"
lineLookup = "MTA NYCT_M5"

# For each trip id
#     For each record (different time stamps?)
#         Get position
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

import gtfs
import arrivals
# NOTE: this changes the process-wide working directory; the relative
# 'gtfs/' path below is resolved against it.
os.chdir('/green-projects/project-bus_capstone_2016/workspace/share')

# Load the schedule tables through the project's gtfs module.
trips = gtfs.load_trips('gtfs/')
stops = gtfs.load_stops('gtfs/')
# NOTE(review): other call sites in this file unpack load_stop_times() as
# `stop_times, tz_sched = ...`; here the whole return value is kept. Confirm
# which gtfs version this snippet targets.
stop_times = gtfs.load_stop_times('gtfs/')
# Parenthesised print is valid as a Python 2 statement and a Python 3 call.
print('Finished loading GTFS data.')
import gtfs #homemade module
# NOTE: changes the process-wide working directory; the relative 'gtfs/' path
# and the '<date>_schedules.csv' outputs below are resolved against it.
os.chdir('/gpfs2/projects/project-bus_capstone_2016/workspace/share')

# Start dates of successive schedule "seasons"; each season runs until the
# next entry in the list.
schedule_samples = ['2015-01-04','2015-04-05','2015-06-27','2015-07-06','2015-09-05','2015-09-15','2015-10-12']
for i in range(len(schedule_samples)):
    print 'Beginning season of ' + schedule_samples[i]
    # The last entry has no following date to bound its season, so stop here.
    if i == len(schedule_samples)-1:
        break
    # NOTE(review): `datetime` and `ttools` are not imported in the visible
    # part of this file -- presumably imported elsewhere; confirm.
    base = datetime.strptime(schedule_samples[i], '%Y-%m-%d')
    numdays = datetime.strptime(schedule_samples[i+1], '%Y-%m-%d') - base
    # NOTE(review): the offset is (x-1) with x starting at 0, so the list
    # begins one day BEFORE `base` -- confirm this is intended.
    date_list = [base + ttools.datetime.timedelta(days=(x-1)) for x in range(0, numdays.days)]

    ss = schedule_samples[i]
    # get all the schedule data. (subset can be created later)
    trips = gtfs.load_trips(ss,'gtfs/')
    stop_times, tz_sched = gtfs.load_stop_times(ss,'gtfs/')
    # Convert arrival_time to timedeltas so max/min can be subtracted below.
    stop_times['arrival_time'] = pd.to_timedelta(stop_times['arrival_time'])
    print 'Finished loading season schedule'

    # pd.DataFrame(columns=['date','count','mean','std','min','25%','50%','75%','max']).to_csv(ss+'_schedules.csv',index=False)
    # Write a header-only CSV for this season (note: index=False is not
    # passed here, unlike the commented-out variant above).
    pd.DataFrame(columns=['route_id','measure_name','measure','schedule_date']).to_csv(ss+'_schedules.csv')
    for dd in date_list:
        ds = datetime.strftime(dd,'%Y-%m-%d')
        # NOTE: the handler for this `try` lies outside the visible excerpt.
        try:     
            tcal=gtfs.TransitCalendar(ds)
            day_services = tcal.get_service_ids(ds)
            # Boolean mask over trips whose service runs on this date.
            day_trips = trips.service_id.isin(day_services)
            # Move index level 1 out to a column, filter by the mask, then
            # put stop_id back into the index (presumably level 1 is stop_id
            # and level 0 is trip_id -- confirm against gtfs.load_stop_times).
            day_stops = stop_times.reset_index(level=1).loc[day_trips]
            day_stops.set_index('stop_id',append=True,inplace=True)
            # Duration per index-level-0 group: latest minus earliest arrival.
            trip_durations = day_stops.groupby(level=(0))['arrival_time'].max()- day_stops.groupby(level=(0))['arrival_time'].min()
            # Attach each trip's route_id and append it to the index.
            trip_durations = pd.DataFrame(trip_durations).join(trips['route_id'],how='left').set_index('route_id',append=True)
def first_ping_index(row):
    """Return the smallest index of the KDTree points close to a row's point.

    Intended for use on rows of the stop_times table. The tree is looked up
    in the module-level ``trees`` object by the row's trip id (first element
    of ``row.name``) and the fixed date "2016-06-13".

    Args:
        row: A row whose ``name`` is a tuple starting with the trip id and
            whose first element (``row[0]``) holds the two coordinates to
            query (presumably a lat/lon pair -- confirm against the caller).

    Returns:
        The minimum of the indices returned by ``query_ball_point`` with
        radius 0.001, or ``None`` when no point lies within that radius.
    """
    trip_key = row.name[0]  # the trip id lives in the row's index tuple
    kdtree = trees.xs((trip_key, "2016-06-13"), level=(1, 2)).values[0]
    query_point = [row[0][0], row[0][1]]
    hits = kdtree.query_ball_point(query_point, r=0.001)
    # An empty hit list means nothing was within the search radius.
    return min(hits) if len(hits) > 0 else None


# get all the schedule data. (subset can be created later)
trips = gtfs.load_trips("gtfs/")
stops = gtfs.load_stops("gtfs/")
stop_times, tz_sched = gtfs.load_stop_times("gtfs/")
# Parenthesised print is valid as a Python 2 statement and a Python 3 call.
print("Finished loading GTFS data.")

# get the sample of parsed AVL data.  Beware, large files take more time.
bustime = pd.read_csv("newdata_parsed.csv")  # ,parse_dates=dt_columns)
bustime.drop_duplicates(["vehicleID", "RecordedAtTime"], inplace=True)
# Strip the agency prefix from the trip identifiers (presumably so they match
# the GTFS trip ids -- confirm against the schedule tables).
bustime["Trip"] = bustime["Trip"].str.replace("MTA NYCT_", "")
bustime.set_index(
    ["Line", "Trip", "TripDate", "vehicleID", "RecordedAtTime"], inplace=True, drop=True, verify_integrity=True
)

# for demonstration, use a subset. Just get data for one line (M5) on one day.
tripDateLookup = "2016-06-13"
lineLookup = "MTA NYCT_M5"
# Select on index levels 0 (Line) and 2 (TripDate) while keeping all levels.
bustime = bustime.xs((lineLookup, tripDateLookup), level=(0, 2), drop_level=False)
# note that the AVL dataframe must be sorted by timestamp, since iloc[]
Пример #7
0
                            how='left')
    masker = filtered.apply(valid_stop, axis=1)
    filtered.drop('stop_id', axis=1, inplace=True)
    return filtered[masker]


if __name__ == '__main__':
    # Command-line usage: <script> <infile> <sched_date> <gtfspath>
    #   infile     -- headerless CSV of parsed Bus Time (AVL) records
    #   sched_date -- schedule date handed to gtfs.load_stop_times
    #   gtfspath   -- location of the GTFS data
    infile = sys.argv[1]
    sched_date = sys.argv[2]
    gtfspath = sys.argv[3]
    # Drop the last four characters of the input name (e.g. ".csv") and
    # append the suffix for the cleaned output.
    outfile = infile[:-4] + '_cleaned.csv'

    # get the sample of parsed AVL data.  Beware, large files take more time.
    bustime = pd.read_csv(infile, header=None)
    bustime.columns = [
        'ROUTE_ID', 'latitude', 'longitude', 'recorded_time', 'vehicle_id',
        'TRIP_ID', 'trip_date', 'SHAPE_ID', 'STOP_ID', 'distance_stop',
        'distance_shape', 'status', 'destination'
    ]

    bustime.drop_duplicates(['vehicle_id', 'recorded_time'], inplace=True)
    # Strip agency prefixes from the trip and stop identifiers.
    bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTA NYCT_', '')
    bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTABC_', '')
    bustime['STOP_ID'] = bustime['STOP_ID'].str.replace('MTA_', '')
    # Parenthesised print is valid on both Python 2 and Python 3.
    print('Finished loading Bus Time data.')

    # Use the GTFS path given on the command line; it was previously parsed
    # but ignored in favour of a hard-coded 'gtfs/'.
    stop_times = gtfs.load_stop_times(sched_date, gtfspath)[0]
    print('Finished loading GTFS data.')
    filtered = filter_invalid_stops(bustime, stop_times)
    filtered.to_csv(outfile, index=False)
Пример #8
0
    valid_stops = st.groupby(level=0)['stop_id'].apply(list)
    filtered = avl_df.merge(pd.DataFrame(valid_stops),left_on='TRIP_ID',
                            right_index=True,how='left')
    masker = filtered.apply(valid_stop,axis=1)
    filtered.drop('stop_id',axis=1,inplace=True)
    return filtered[masker]

if __name__=='__main__':
    # Command-line usage: <script> <infile> <sched_date> <gtfspath>
    #   infile     -- headerless CSV of parsed Bus Time (AVL) records
    #   sched_date -- schedule date handed to gtfs.load_stop_times
    #   gtfspath   -- location of the GTFS data
    infile = sys.argv[1]
    sched_date = sys.argv[2]
    gtfspath = sys.argv[3]
    # Drop the last four characters of the input name (e.g. ".csv") and
    # append the suffix for the cleaned output.
    outfile = infile[:-4]+'_cleaned.csv'

    # get the sample of parsed AVL data.  Beware, large files take more time.
    bustime = pd.read_csv(infile,header=None)
    bustime.columns = ['ROUTE_ID','latitude','longitude','recorded_time',
                       'vehicle_id','TRIP_ID','trip_date','SHAPE_ID',
                       'STOP_ID','distance_stop','distance_shape','status',
                       'destination']

    bustime.drop_duplicates(['vehicle_id','recorded_time'],inplace=True)
    # Strip agency prefixes from the trip and stop identifiers.
    bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTA NYCT_','')
    bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTABC_','')
    bustime['STOP_ID'] = bustime['STOP_ID'].str.replace('MTA_','')
    # Parenthesised print is valid on both Python 2 and Python 3.
    print('Finished loading Bus Time data.')

    # Use the GTFS path given on the command line; it was previously parsed
    # but ignored in favour of a hard-coded 'gtfs/'.
    stop_times = gtfs.load_stop_times(sched_date,gtfspath)[0]
    print('Finished loading GTFS data.')
    filtered = filter_invalid_stops(bustime,stop_times)
    filtered.to_csv(outfile,index=False)