def create_traffic_results_df(db_loc, table_name, trip_index):
    """
    Pulls the data from the traffic database and performs some processing.

    :param: db_loc: location of the db locations
    :type: db_loc: string
    :param: table_name: name of the table to query
    :type: table_name: string
    :param: trip_index: trip index number
    :type: trip_index: int
    :return: traffic_data_df: dataframe that contains the date and trip
        duration (indexed by date, duration in minutes)
    :type: traffic_data_df: pandas dataframe
    """
    traffic_conn = sf.create_connection(db_loc)
    # A table name cannot be bound as a parameter, but the trip index can
    # be; binding avoids fragile %g interpolation (and SQL injection).
    sql_query = "select * from %s where trip_index = ?" % table_name
    traffic_data_df = pd.read_sql_query(sql_query, traffic_conn,
                                        params=(trip_index,))
    # convert the date column to datetime
    traffic_data_df['date'] = pd.to_datetime(traffic_data_df['date'],
                                             format="%Y-%m-%d")
    # convert the duration in traffic from seconds to minutes
    traffic_data_df['duration_in_traffic'] = (
        traffic_data_df['duration_in_traffic'] / 60)
    # keep only the columns needed downstream and index by date
    traffic_data_df = traffic_data_df[['date', 'duration_in_traffic']]
    traffic_data_df = traffic_data_df.set_index('date')
    traffic_data_df.index = pd.DatetimeIndex(traffic_data_df.index)
    return traffic_data_df
def create_transit_results_df(db_loc, table_name, trip_id, stop_id,
                              train_sched_trip_duration):
    """
    Pulls the data from the gtfs-rt transit database and performs some
    processing.

    :param: db_loc: location of the db locations
    :type: db_loc: string
    :param: table_name: name of the table to query
    :type: table_name: string
    :param: trip_id: train number
    :type: trip_id: string
    :param: stop_id: stop id number
    :type: stop_id: string
    :param: train_sched_trip_duration: the scheduled length of the train trip
    :type: train_sched_trip_duration: float
    :return: transit_data_df: dataframe that contains the date and trip
        duration (indexed by date, duration in minutes)
    :type: transit_data_df: pandas dataframe
    """
    transit_conn = sf.create_connection(db_loc)
    # trip_id and stop_id are strings, so the original %g (numeric) format
    # would raise TypeError; bind them as query parameters instead.
    sql_query = ("select * from %s where trip_id = ? and stop_id = ?"
                 % table_name)
    transit_data_df = pd.read_sql_query(sql_query, transit_conn,
                                        params=(trip_id, stop_id))
    # convert the date column to datetimes
    transit_data_df['TrainStartDate'] = pd.to_datetime(
        transit_data_df['TrainStartDate'], format="%Y-%m-%d")
    # calculate the length of the train trip in minutes
    # (column name 'DeperatureDelay' is spelled this way in the schema)
    transit_data_df['train_duration'] = (
        transit_data_df['DeperatureDelay'] + train_sched_trip_duration) / 60
    transit_data_df.rename(index=str, columns={"TrainStartDate": "date"},
                           inplace=True)
    transit_data_df = transit_data_df[['date', 'train_duration']]
    transit_data_df = transit_data_df.set_index('date')
    transit_data_df.index = pd.DatetimeIndex(transit_data_df.index)
    return transit_data_df
# delete test database ff.remove_files([config.test_siri_data_sql]) # create the siri table sf.create_transit_data_siri_table(config.siri_table_name, config.test_siri_data_sql) # determine the files to import file_to_import = ff.find_files_that_filename_contain(config.test_file_dir, 'siri') schedule_monitor = pd.read_csv(config.schedule_monitor_csv, index_col=0) schedule_monitor.set_index(['trip_id', 'stop_id'], inplace=True) time_index = 0 for file in file_to_import: # import the json file with open(file, 'r') as f: data = json.load(f) # convert json to pandas parsed_data = dcf.parse_siri_transit_data(data, time_index) parsed_data_with_delays = dcf.compare_actual_to_schedule( parsed_data, schedule_monitor) # try updating entries sf.update_entries(config.test_siri_data_sql, config.siri_table_name, parsed_data_with_delays, columns_to_compare) time_index = time_index + 1 conn = sf.create_connection(config.test_siri_data_sql) cursor = conn.cursor() parsed_data_with_delays = sf.prepare_pandas_to_sql(parsed_data_with_delays, ttd.gtfs_dict) # copy the data to a temporary table parsed_data_with_delays.to_sql('temp', conn, if_exists='replace', index=False)
# determine the minimum and maximum date first_date = min(first_date_traffic, first_date_transit) last_date = max(last_date_traffic, last_date_transit) # round the first date to the bginning of the month for plotting date_min = first_date.replace(day=1) # round the last date to the end of the month for plotting date_max = last_date.replace(day=1, month=last_date.month + 1) trip_ind = 0 trip_id = schedule_trips.iloc[trip_ind]['trip_id'] stop_id = schedule_trips.iloc[trip_ind]['stop_id_stop'] train_sched_trip_duration = schedule_trips.iloc[trip_ind][ 'sched_trip_duration_secs'] transit_conn = sf.create_connection(config.gtfs_rt_data_sql) gtfs_rt_data_df = da.create_transit_results_df(config.gtfs_rt_data_sql, 'transit_data_gtfs_rt', trip_id, stop_id, train_sched_trip_duration) traffic_data_df = da.create_traffic_results_df(config.traffic_data_sql, 'traffic_data', trip_ind) title_str = 'test' leg_str = ['Car', 'Train'] # create pandas series with the missing dates = NaN date_range = pd.date_range(date_min.date(), date_max.date()) traffic_data_df = traffic_data_df.reindex(date_range, fill_value=np.NAN) # merge the trip_date = traffic_data_df.merge(gtfs_rt_data_df, left_index=True, right_index=True,
def create_plots(csv_path_in, traffic_db_loc, results_db_loc, ecdf_dir,
                 hist_dir, time_dir):
    """
    Plots the results and postprocesses the data to determine statistics
    for the train trips.

    :param: csv_path_in: file location for schedule_trips.csv
    :type: csv_path_in: string
    :param: traffic_db_loc: file location of the traffic database
    :type: traffic_db_loc: string
    :param: results_db_loc: file location to store the post processed data.
        This includes the mean and standard deviation of the trip times.
    :type: results_db_loc: string
    :param: ecdf_dir: directory to store the emperical distribution plots
    :type: ecdf_dir: string
    :param: hist_dir: directory to store the histograms
    :type: hist_dir: string
    :param: time_dir: directory to store the time history data
    :type: time_dir: string
    """
    # read in the schedule trips
    schedule_trips = pd.read_csv(csv_path_in, index_col=0)

    # query the first and last date in the database
    first_date = min_max_date(traffic_db_loc, 'traffic_data', 'utc_time',
                              'min')
    last_date = min_max_date(traffic_db_loc, 'traffic_data', 'utc_time',
                             'max')

    # round the first date to the beginning of the month for plotting
    date_min = first_date.replace(day=1)
    # round the last date up to the first of the next month for plotting.
    # December must roll over into January of the next year; the plain
    # month + 1 form raises ValueError (month=13) in that case.
    if last_date.month == 12:
        date_max = last_date.replace(day=1, month=1,
                                     year=last_date.year + 1)
    else:
        date_max = last_date.replace(day=1, month=last_date.month + 1)

    # connect to the data database
    traffic_conn = sf.create_connection(traffic_db_loc)
    for trip_index in list(schedule_trips.index):
        print("plotting trip_index = " + str(trip_index))
        # read in the data for a given trip_index
        sql_query = """select * from traffic_data
                       where trip_index = ?"""
        traffic_data_df = pd.read_sql_query(sql_query, traffic_conn,
                                            params=(trip_index,))
        # convert the date column to datetime
        traffic_data_df['date'] = pd.to_datetime(traffic_data_df['date'],
                                                 format="%Y-%m-%d")
        # convert the duration in traffic to minutes
        traffic_data_df['duration_in_traffic'] = traffic_data_df[
            'duration_in_traffic'] / 60
        # scheduled trip time in minutes
        sched_trip_time = schedule_trips[
            schedule_trips['trip_index'] ==
            trip_index]['sched_trip_duration_secs'].values[0] / 60
        duration_in_traffic = traffic_data_df['duration_in_traffic'].values
        start_station_str = traffic_data_df['start_station'].iloc[0]
        end_station_str = traffic_data_df['end_station'].iloc[0]
        train_number = traffic_data_df['trip_id'].iloc[0]
        title_str = ('Train ' + str(train_number) + ' - ' +
                     start_station_str + ' to ' + end_station_str)
        filename = title_str + '.png'

        # plot the histogram of the data. The `normed` kwarg was removed
        # from matplotlib (3.1+); density=True is the supported equivalent.
        fig = plt.figure()
        n, bins, patches = plt.hist(duration_in_traffic, density=True,
                                    facecolor='green', alpha=0.75)
        plt.title(title_str)
        plt.xlabel('Trip Duration [minutes]')
        plt.ylabel('Probability')
        fig.savefig(os.path.join(hist_dir, filename), bbox_inches='tight')
        plt.close(fig)

        # emperical cumulative density
        sorted_duration_in_traffic = np.sort(duration_in_traffic)
        fig = plt.figure()
        plt.plot(sorted_duration_in_traffic,
                 np.linspace(0, 1, len(duration_in_traffic),
                             endpoint=False))
        plt.xlabel('Trip Duration [minutes]')
        plt.ylabel('ECDF')
        plt.title(title_str)
        fig.savefig(os.path.join(ecdf_dir, filename), bbox_inches='tight')
        plt.close(fig)

        # Trip duration versus date
        # create pandas series with the missing dates = NaN
        trip_date = traffic_data_df[['date', 'duration_in_traffic']]
        trip_date = trip_date.set_index('date')
        date_range = pd.date_range(first_date.date(), last_date.date())
        trip_date.index = pd.DatetimeIndex(trip_date.index)
        trip_date = trip_date.reindex(date_range, fill_value=np.nan)

        # plot the time series data
        fig, ax = plt.subplots()
        trip_date.plot(style='o-', legend=False, ax=ax)
        plt.ylabel('Trip Duration [minutes]')
        plt.title(title_str)
        ax.fmt_xdata = mdates.DateFormatter('%Y-%m-%d')
        # rotate and align the tick labels so they look better
        fig.autofmt_xdate()
        # set the axes so that it starts and ends on the first of a month
        ax.set_xlim(date_min, date_max)
        fig.savefig(os.path.join(time_dir, filename), bbox_inches='tight')
        plt.close(fig)

        # determine the fraction of trips greater than the train time
        trip_fract = sum(sorted_duration_in_traffic > sched_trip_time) / len(
            sorted_duration_in_traffic)
        # flag the trip when enough car trips exceed the scheduled train time
        take_train = int(trip_fract >= take_train_fract)
        duration_in_traffic_mean = np.mean(duration_in_traffic)
        duration_in_traffic_std = np.std(duration_in_traffic)
        count = len(traffic_data_df.index)
        data = (trip_index, str(train_number), start_station_str,
                end_station_str, duration_in_traffic_mean,
                duration_in_traffic_std, trip_fract, take_train,
                sched_trip_time, count, filename)
        sf.insert_results(results_db_loc, data)
# --- Rebuild the comparison database and copy both transit tables in. ---
ff.remove_files([cmp_transit_data_sql])
sf.create_transit_data_siri_table(config.siri_table_name,
                                  cmp_transit_data_sql)
sf.create_transit_data_gtfs_rt_table(config.gfts_rt_table_name,
                                     cmp_transit_data_sql)

# SQL to attach the source databases and copy their rows into this db
sql_attach = ("ATTACH DATABASE '%s' AS %s" % (config.siri_data_sql,
                                              'siri_data'))
sql_copy = ('INSERT INTO %s SELECT * FROM %s.%s' %
            (config.siri_table_name, 'siri_data', config.siri_table_name))
sql_attach2 = ("ATTACH DATABASE '%s' AS %s" % (config.gtfs_rt_data_sql,
                                               'gtfs_rt_data'))
sql_copy2 = ('INSERT INTO %s SELECT * FROM %s.%s' %
             (config.gfts_rt_table_name, 'gtfs_rt_data',
              config.gfts_rt_table_name))
# SQL to drop entries present in both tables
sql_cmd = sf.delete_entries_in_common(config.siri_table_name,
                                      config.gfts_rt_table_name,
                                      columns_to_compare)

# initialize conn before the try block: if create_connection itself
# raises, the finally clause would otherwise hit a NameError on `conn`
conn = None
try:
    conn = sf.create_connection(cmp_transit_data_sql)
    cursor = conn.cursor()
    cursor.execute(sql_attach)
    cursor.execute(sql_copy)
    cursor.execute(sql_attach2)
    cursor.execute(sql_copy2)
    cursor.execute(sql_cmd)
    # NOTE(review): no explicit conn.commit() here — confirm
    # sf.create_connection opens the db in autocommit mode, otherwise the
    # inserted rows are rolled back on close.
finally:
    if conn:
        conn.close()