示例#1
0
def create_traffic_results_df(db_loc, table_name, trip_index):
    """                                  
    Pulls the data from the traffic database and performs some 
        processing.

    :param: db_loc: location of the db locations
    :type: db_loc: string
    
    :param: table_name: name of the table to query
    :type: table_name: string
    
    :param: trip_index: trip index number
    :type: trip_index: int
    
    :return: traffic_data_df: dataframe that contains the date and trip 
        duration
    :type: traffic_data_df: pandas dataframe
    """
    traffic_conn = sf.create_connection(db_loc)
    sql_query = """select * from %s where 
                    trip_index = %g""" % (table_name, trip_index)
    traffic_data_df = pd.read_sql_query(sql_query, traffic_conn)
    # convert the date column to datetime
    traffic_data_df['date'] = pd.to_datetime(traffic_data_df['date'],
                                             format="%Y-%m-%d")
    # convert the duration in traffic to minutes
    traffic_data_df[
        'duration_in_traffic'] = traffic_data_df['duration_in_traffic'] / 60
    traffic_data_df = traffic_data_df[['date', 'duration_in_traffic']]
    traffic_data_df = traffic_data_df.set_index('date')
    traffic_data_df.index = pd.DatetimeIndex(traffic_data_df.index)
    return traffic_data_df
示例#2
0
def create_transit_results_df(db_loc, table_name, trip_id, stop_id,
                              train_sched_trip_duration):
    """                                  
    Pulls the data from the gtfs-rt transit database and performs some 
        processing.

    :param: db_loc: location of the db locations
    :type: db_loc: string
    
    :param: table_name: name of the table to query
    :type: table_name: string
    
    :param: trip_id: train number
    :type: trip_id: string

    :param: stop_id: stop id number
    :type: stop_id: string

    :param: train_sched_trip_duration: the scheduled length of the train trip
    :type: train_sched_trip_duration: float
    
    :return: transit_data_df: dataframe that contains the date and trip 
        duration
    :type: transit_data_df: pandas dataframe
    """
    transit_conn = sf.create_connection(db_loc)
    sql_query = """select * from %s where 
                    trip_id = %g and stop_id = %g """ % (table_name, trip_id,
                                                         stop_id)
    transit_data_df = pd.read_sql_query(sql_query, transit_conn)
    # convert the date column to datetimes
    transit_data_df['TrainStartDate'] = pd.to_datetime(
        transit_data_df['TrainStartDate'], format="%Y-%m-%d")
    # calculate the length of the train trip
    transit_data_df['train_duration'] = (transit_data_df['DeperatureDelay'] +
                                         train_sched_trip_duration) / 60
    transit_data_df.rename(index=str,
                           columns={"TrainStartDate": "date"},
                           inplace=True)
    transit_data_df = transit_data_df[['date', 'train_duration']]
    transit_data_df = transit_data_df.set_index('date')
    transit_data_df.index = pd.DatetimeIndex(transit_data_df.index)
    return transit_data_df
示例#3
0
# delete test database
ff.remove_files([config.test_siri_data_sql])

# create the siri table
sf.create_transit_data_siri_table(config.siri_table_name,
                                  config.test_siri_data_sql)
# determine the files to import
file_to_import = ff.find_files_that_filename_contain(config.test_file_dir,
                                                     'siri')
schedule_monitor = pd.read_csv(config.schedule_monitor_csv, index_col=0)
schedule_monitor.set_index(['trip_id', 'stop_id'], inplace=True)
time_index = 0
for file in file_to_import:
    # import the json file
    with open(file, 'r') as f:
        data = json.load(f)
    # convert json to pandas
    parsed_data = dcf.parse_siri_transit_data(data, time_index)
    parsed_data_with_delays = dcf.compare_actual_to_schedule(
        parsed_data, schedule_monitor)
    # try updating entries
    sf.update_entries(config.test_siri_data_sql, config.siri_table_name,
                      parsed_data_with_delays, columns_to_compare)
    time_index = time_index + 1

conn = sf.create_connection(config.test_siri_data_sql)
cursor = conn.cursor()
parsed_data_with_delays = sf.prepare_pandas_to_sql(parsed_data_with_delays,
                                                   ttd.gtfs_dict)
# copy the data to a temporary table
parsed_data_with_delays.to_sql('temp', conn, if_exists='replace', index=False)
示例#4
0
# determine the minimum and maximum date
first_date = min(first_date_traffic, first_date_transit)
last_date = max(last_date_traffic, last_date_transit)
# round the first date to the bginning of the month for plotting
date_min = first_date.replace(day=1)
# round the last date to the end of the month for plotting
date_max = last_date.replace(day=1, month=last_date.month + 1)

trip_ind = 0

trip_id = schedule_trips.iloc[trip_ind]['trip_id']
stop_id = schedule_trips.iloc[trip_ind]['stop_id_stop']
train_sched_trip_duration = schedule_trips.iloc[trip_ind][
    'sched_trip_duration_secs']

transit_conn = sf.create_connection(config.gtfs_rt_data_sql)
gtfs_rt_data_df = da.create_transit_results_df(config.gtfs_rt_data_sql,
                                               'transit_data_gtfs_rt', trip_id,
                                               stop_id,
                                               train_sched_trip_duration)
traffic_data_df = da.create_traffic_results_df(config.traffic_data_sql,
                                               'traffic_data', trip_ind)
title_str = 'test'
leg_str = ['Car', 'Train']
# create pandas series with the missing dates = NaN
date_range = pd.date_range(date_min.date(), date_max.date())
traffic_data_df = traffic_data_df.reindex(date_range, fill_value=np.NAN)
# merge the
trip_date = traffic_data_df.merge(gtfs_rt_data_df,
                                  left_index=True,
                                  right_index=True,
示例#5
0
def create_plots(csv_path_in, traffic_db_loc, results_db_loc, ecdf_dir,
                 hist_dir, time_dir):
    """
    Plots the results and postprocesses the data to determine statistics for
    the train trips

    :param: csv_path_in: file location for schedule_trips.csv
    :type: csv_path_in: string
    
    :param: results_db_loc: file location to store the post processed data. 
        This includes the mean and standard deviation of the trip times.
    :type: results_db_loc: string   
    
    :param: ecdf_dir: directory to store the emperical distribution plots
    :type: ecdf_dir: string 
    
    :param: hist_dir: directory to store the histograms
    :type: hist_dir: string 
    
    :param: time_dir: directory to store the time history data
    :type: time_dir: string 
    """
    # read in the schedule trips
    schedule_trips = pd.read_csv(csv_path_in, index_col=0)
    # query the first and last date in the database
    first_date = min_max_date(traffic_db_loc, 'traffic_data', 'utc_time',
                              'min')
    last_date = min_max_date(traffic_db_loc, 'traffic_data', 'utc_time', 'max')
    # round the first date to the bginning of the month for plotting
    date_min = first_date.replace(day=1)
    # round the last date to the end of the month for plotting
    date_max = last_date.replace(day=1, month=last_date.month + 1)
    # connect to the data database
    traffic_conn = sf.create_connection(traffic_db_loc)

    for trip_index in list(schedule_trips.index):
        print("plotting trip_index = " + str(trip_index))
        # read in the data for a given trip_index
        sql_query = """select * from traffic_data where 
                        trip_index = %g""" % trip_index
        traffic_data_df = pd.read_sql_query(sql_query, traffic_conn)
        # convert the date column to datetime
        traffic_data_df['date'] = pd.to_datetime(traffic_data_df['date'],
                                                 format="%Y-%m-%d")
        # convert the duration in traffic to minutes
        traffic_data_df['duration_in_traffic'] = traffic_data_df[
            'duration_in_traffic'] / 60
        # scheduled trip time in minutes
        sched_trip_time = schedule_trips[
            schedule_trips['trip_index'] ==
            trip_index]['sched_trip_duration_secs'].values[0] / 60
        duration_in_traffic = traffic_data_df['duration_in_traffic'].values
        start_station_str = traffic_data_df['start_station'].iloc[0]
        end_station_str = traffic_data_df['end_station'].iloc[0]
        train_number = traffic_data_df['trip_id'].iloc[0]
        title_str = ('Train ' + str(train_number) + ' - ' + start_station_str +
                     ' to ' + end_station_str)
        filename = title_str + '.png'
        # plot the histogram of the data
        fig = plt.figure()
        n, bins, patches = plt.hist(duration_in_traffic,
                                    normed=1,
                                    facecolor='green',
                                    alpha=0.75)
        plt.title(title_str)
        plt.xlabel('Trip Duration [minutes]')
        plt.ylabel('Probability')
        fig.savefig(os.path.join(hist_dir, filename), bbox_inches='tight')
        plt.close(fig)
        # emperical cumulative density
        sorted_duration_in_traffic = np.sort(duration_in_traffic)
        fig = plt.figure()
        plt.plot(sorted_duration_in_traffic,
                 np.linspace(0, 1, len(duration_in_traffic), endpoint=False))
        plt.xlabel('Trip Duration [minutes]')
        plt.ylabel('ECDF')
        plt.title(title_str)
        fig.savefig(os.path.join(ecdf_dir, filename), bbox_inches='tight')
        plt.close(fig)
        # Trip duration versus date
        # create pandas series with the missing dates = NaN
        trip_date = traffic_data_df[['date', 'duration_in_traffic']]
        trip_date = trip_date.set_index('date')
        date_range = pd.date_range(first_date.date(), last_date.date())
        trip_date.index = pd.DatetimeIndex(trip_date.index)
        trip_date = trip_date.reindex(date_range, fill_value=np.NAN)
        # plot the time series data
        fig, ax = plt.subplots()
        trip_date.plot(style='o-', legend=False, ax=ax)
        plt.ylabel('Trip Duration [minutes]')
        plt.title(title_str)
        ax.fmt_xdata = mdates.DateFormatter('%Y-%m-%d')
        # rotate and align the tick labels so they look better
        fig.autofmt_xdate()
        # set the axes so that it starts and ends on the first of a month
        ax.set_xlim(date_min, date_max)
        fig.savefig(os.path.join(time_dir, filename), bbox_inches='tight')
        plt.close(fig)
        # determine the fraction of trips greater than the train time
        trip_fract = sum(sorted_duration_in_traffic > sched_trip_time) / len(
            sorted_duration_in_traffic)
        take_train = int(trip_fract >= take_train_fract)
        duration_in_traffic_mean = np.mean(duration_in_traffic)
        duration_in_traffic_std = np.std(duration_in_traffic)
        count = len(traffic_data_df.index)
        data = (trip_index, str(train_number), start_station_str,
                end_station_str, duration_in_traffic_mean,
                duration_in_traffic_std, trip_fract, take_train,
                sched_trip_time, count, filename)
        sf.insert_results(results_db_loc, data)
示例#6
0
ff.remove_files([cmp_transit_data_sql])

sf.create_transit_data_siri_table(config.siri_table_name, cmp_transit_data_sql)

sf.create_transit_data_gtfs_rt_table(config.gfts_rt_table_name,
                                     cmp_transit_data_sql)
# copy the data into the temp table
sql_attach = ("ATTACH DATABASE '%s' AS %s" %
              (config.siri_data_sql, 'siri_data'))
sql_copy = ('INSERT INTO %s SELECT * FROM %s.%s' %
            (config.siri_table_name, 'siri_data', config.siri_table_name))
sql_attach2 = ("ATTACH DATABASE '%s' AS %s" %
               (config.gtfs_rt_data_sql, 'gtfs_rt_data'))
sql_copy2 = (
    'INSERT INTO %s SELECT * FROM %s.%s' %
    (config.gfts_rt_table_name, 'gtfs_rt_data', config.gfts_rt_table_name))
sql_cmd = sf.delete_entries_in_common(config.siri_table_name,
                                      config.gfts_rt_table_name,
                                      columns_to_compare)

try:
    conn = sf.create_connection(cmp_transit_data_sql)
    cursor = conn.cursor()
    cursor.execute(sql_attach)
    cursor.execute(sql_copy)
    cursor.execute(sql_attach2)
    cursor.execute(sql_copy2)
    cursor.execute(sql_cmd)
finally:
    if conn:
        conn.close()