# Connect to AWS
uf.set_env_path()
conn, cur = uf.aws_connect()

# Read in dockless trip data
csv_name = "feburaryaddedutc"
trips_df = pd.read_csv(os.path.join("data", csv_name + ".csv"),
                       index_col=[0], dtype=str)

# Drop duplicate records by Operator and TripID
trips_df = trips_df.drop_duplicates(["Operator", "TripID"])
print(len(trips_df['TripID']))

# Fill missing with 0
trips_df.fillna(0, inplace=True)

# Create a new trip_id that includes the first letter of the operator name
trips_df['UniqueTripID'] = trips_df['Operator'].str[0] + trips_df['TripID']

# Drop records without an Operator
trips_df = trips_df[trips_df['Operator'] != 0]

# Create clean version of operator
trips_df['OperatorClean'] = trips_df['Operator'].map(
    lambda x: x.split(" ")[0].lower().replace('limebike', 'lime'))

# Set end fields equal to start fields if missing
trips_df['EndDate'] = np.where(trips_df['EndDate'] == 0,
                               trips_df['StartDate'], trips_df['EndDate'])
trips_df['endposct'] = np.where(trips_df['endposct'] == 0,
                                trips_df['posct'], trips_df['endposct'])
trips_df['endutc'] = np.where(trips_df['endutc'] == 0,
                              trips_df['startutc'], trips_df['endutc'])

# Output final dataframe
outname = csv_name + "pipe_delimited"
trips_df.to_csv(os.path.join("data", outname + ".csv"), index=False, sep='|')

# Create Table
create_dockless_trips_org(cur)

# Load to Database
uf.aws_load(outname, "dockless_trips_org", cur)

# Commit changes to database
conn.commit()
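# NOTE (hedged sketch): `uf.aws_load` comes from an external utility module
# that is not shown in this excerpt. Judging from how these scripts call it
# (the name of a pipe-delimited CSV, a target table, and an open cursor), it
# is assumed to wrap PostgreSQL's COPY via psycopg2; a minimal version might
# look like this. The function name, the data/ path, and the header handling
# are all assumptions, not the repo's actual implementation.

import os

from psycopg2 import sql


def aws_load_sketch(outname, table, cur):
    """Bulk-load data/<outname>.csv into <table> with COPY (assumed behavior)."""
    with open(os.path.join("data", outname + ".csv")) as f:
        cur.copy_expert(
            sql.SQL("COPY {} FROM STDIN WITH (FORMAT csv, DELIMITER '|', "
                    "HEADER)").format(sql.Identifier(table)),
            f)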
uf.set_env_path()
conn, cur = uf.aws_connect()

# Loop through all CSVs in cabi trip data folder
cabi_trip_dir = r'./data/Cabi_Trips_MemberType'

# Convert trip data from CSV to dataframe
combined_df = trips_to_df(cabi_trip_dir)

# Sort by Startdate and add trip_id
combined_df.sort_values(['Startdate'], inplace=True)
combined_df.reset_index(inplace=True)
combined_df.drop(['index'], axis=1, inplace=True)
combined_df['trip_id'] = combined_df.index + 1

# Output dataframe as CSV
outname = "Cabi_Trips_MemberType"
combined_df.to_csv(os.path.join("data", outname + ".csv"),
                   index=False, sep='|')

# Create Table in AWS
create_cabi_trips_membertype(cur)

# Load to Database
uf.aws_load(outname, "cabi_trips_membertype", cur)

# Commit changes to database
conn.commit()
conn, cur = uf.aws_connect()

# Gather outage data
start_date = datetime.datetime(2018, 4, 1)
end_date = datetime.datetime(2018, 4, 30)
dates = date_list(start_date, end_date)
df_list = gather_out_data(dates)

# Combine daily dataframes
combined_df = pd.concat(df_list, axis=0)

# Drop any records that do not have a numeric terminal number
combined_df = combined_df[
    combined_df['Terminal Number'].astype(str).str.isdigit()]

# Add outage_id continuing from the last record in the AWS table
out_id_df = pd.read_sql("""SELECT outage_id
                           FROM cabi_out_hist
                           ORDER BY outage_id DESC
                           LIMIT 1""", con=conn)
last_outage_id = out_id_df['outage_id'].iloc[0]
combined_df.reset_index(inplace=True)
combined_df['outage_id'] = combined_df.index + 1 + last_outage_id

# Drop unneeded fields
combined_df.drop(['index', 'Station Name'], axis=1, inplace=True)

# Output dataframe as CSV
outname = ("CaBi_Tracker_Outage_History_From_" +
           start_date.strftime('%Y-%m-%d') +
           "_To_" + end_date.strftime('%Y-%m-%d'))
combined_df.to_csv(os.path.join("data", outname + ".csv"),
                   index=False, sep='|')

# Create Table
create_cabi_out_hist(cur)

# Load to Database
uf.aws_load(outname, "cabi_out_hist", cur)

# Commit changes to database
conn.commit()
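# NOTE (hedged sketch): `date_list` above is a helper defined elsewhere in this
# script and is assumed to expand a start/end pair into one datetime per day,
# inclusive. A minimal version consistent with that assumption could be:

import datetime


def date_list(start_date, end_date):
    """Return every date from start_date through end_date, inclusive."""
    n_days = (end_date - start_date).days + 1
    return [start_date + datetime.timedelta(days=i) for i in range(n_days)]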
summary_loc = os.path.join("data", "Summary data.xlsx")
summary_df = pd.read_excel(summary_loc, sheet_name="Summary")

# Lowercase Operator and replace limebike with lime
summary_df['Operator'] = summary_df['Operator'].str.lower().str.replace(
    'limebike', 'lime').str.split(' ').str[0]

# Standardize month
summary_df['Month'] = summary_df['Month'].apply(
    lambda x: convert_month_toDT(x, cur))

# Keep only Operator, Month, Trips, and Bikes
keep_cols = ['Operator', 'Month', 'TotalTrips', 'TotalBikes']
summary_df = summary_df[keep_cols]

# Drop records without Trips and Bikes
summary_df = summary_df[summary_df['TotalBikes'].notnull()]

# Output dataframe as CSV
outname = "dockless_summary"
summary_df.to_csv(os.path.join("data", outname + ".csv"),
                  index=False, sep='|')

# Create Table in AWS
create_dockless_summary(cur)

# Load to Database
uf.aws_load(outname, "dockless_summary", cur)

# Commit changes to database
conn.commit()
def create_cabi_system(cur):
    # This function creates the CaBi System AWS table
    cur.execute("""
        DROP TABLE cabi_system;
        CREATE TABLE cabi_system(
            region_id serial PRIMARY KEY,
            name varchar(100),
            code text
        )
        """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Pull CaBi System Regions
    regions_df = pull_region_info()

    # Merge on region code
    regions_df = regions_df.merge(region_code(), on='name', how='left')
    regions_df = regions_df[['region_id', 'name', 'code']]

    # Output dataframe as CSV
    outname = "CaBi_System"
    regions_df.to_csv(outname + ".csv", index=False, sep='|')

    # Create Table
    create_cabi_system(cur)

    # Load to Database
    uf.aws_load(outname, "cabi_system", cur)

    # Commit changes to database
    conn.commit()
        polygon geography);
        """)


if __name__ == '__main__':
    # Set environmental variable
    uf.set_env_path()

    # Connect to AWS
    conn, cur = uf.aws_connect()

    # Download DC ANCs and Neighborhood Clusters from the Open Data DC
    # website as JSONs
    json_id_dict = {
        'ngh': "f6c703ebe2534fc3800609a07bad8f5b_17",
        'anc': "fcfbf29074e549d8aff9b9c708179291_1"
    }
    for json_name, json_id in json_id_dict.items():
        combined_df = extract_json(json_id)

        # Output dataframe as CSV
        outname = json_name
        combined_df.to_csv(os.path.join("data", outname + ".csv"),
                           index=False, sep='|')

        # Create Table
        if json_name == 'ngh':
            create_ngh(cur)
        else:
            create_anc(cur)

        # Load to Database
        uf.aws_load(outname, json_name, cur)

    # Commit changes to database
    conn.commit()
        name varchar(200),
        region_id integer,
        rental_methods varchar(200),
        rental_url varchar(200),
        short_name varchar(20),
        station_id integer PRIMARY KEY)
        """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Pull CaBi station information
    station_df = pull_station_info()

    # Default any missing region ids to DC
    # (the only example as of 2/21 is the new station at Anacostia Park)
    station_df['region_id'] = np.where(station_df['region_id'].isnull(),
                                       42, station_df['region_id'])
    station_df['region_id'] = station_df['region_id'].astype(int)

    # Output dataframe as CSV
    outname = "CaBi_Stations_Temp"
    station_df.to_csv(os.path.join("data", outname + ".csv"),
                      index=False, sep='|')

    # Create Table
    create_cabi_stations_temp(cur)

    # Load to Database
    uf.aws_load(outname, "cabi_stations_temp", cur)

    # Commit changes to database
    conn.commit()
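# NOTE (hedged sketch): `pull_station_info` is defined earlier in this script
# and is assumed to flatten Capital Bikeshare's public GBFS station_information
# feed into a DataFrame. The feed URL below is an assumption.

import pandas as pd
import requests

GBFS_STATIONS_URL = ('https://gbfs.capitalbikeshare.com/gbfs/en/'
                     'station_information.json')


def pull_station_info_sketch():
    """Fetch the GBFS station_information feed and return it as a DataFrame."""
    stations = requests.get(GBFS_STATIONS_URL).json()['data']['stations']
    return pd.DataFrame(stations)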
final_dfs = [second_df, cabi_df, dless_df]
df_final = reduce(
    lambda left, right: pd.merge(
        left, right, how="left", left_index=True, right_index=True),
    final_dfs)
df_final.drop_duplicates(inplace=True)

# Define numeric columns for the CREATE TABLE statement prior to resetting
# the index to bring back the date field
numeric_cols = ", ".join([col + " numeric" for col in df_final.columns])
df_final.fillna(0, inplace=True)
df_final.reset_index(inplace=True)

# Output final DataFrame
outname = "final_db_pipe_delimited"
df_final.to_csv(outname + ".csv", index=False, sep='|')

# CREATE TABLE on AWS with and without timestamp
TIMESTR = time.strftime("%Y%m%d_%H%M%S")
db_name = "final_db_" + TIMESTR
cur.execute(
    sql.SQL("CREATE TABLE {0}(date date PRIMARY KEY," + numeric_cols + ")")
    .format(sql.Identifier(db_name)))
cur.execute(
    "DROP TABLE final_db; CREATE TABLE final_db(date date PRIMARY KEY," +
    numeric_cols + ")")

# Load to Database
uf.aws_load(outname, db_name, cur)
uf.aws_load(outname, 'final_db', cur)

# Commit changes to database
conn.commit()
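# NOTE (hedged sketch): the script above splices `numeric_cols` into its CREATE
# TABLE statements as a plain string, which is workable while the column names
# come from a trusted DataFrame. If the columns ever came from outside the
# pipeline, the whole statement could instead be composed with psycopg2.sql so
# every identifier is quoted; the helper below is an illustration, not the
# repo's actual code.

from psycopg2 import sql


def create_final_db_sketch(cur, table_name, columns):
    """Build the dynamic CREATE TABLE entirely from quoted identifiers."""
    col_defs = sql.SQL(", ").join(
        sql.SQL("{} numeric").format(sql.Identifier(col)) for col in columns)
    cur.execute(
        sql.SQL("CREATE TABLE {} (date date PRIMARY KEY, {})").format(
            sql.Identifier(table_name), col_defs))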
def create_jump_price(cur):
    # This function creates the jump_price AWS table
    cur.execute("""
        DROP TABLE jump_price;
        CREATE TABLE jump_price(
            min_seconds integer PRIMARY KEY,
            max_seconds integer,
            cost numeric
        )
        """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Read in Jump trip cost data
    csv_name = "JumpTripCosts"
    overage_df = pd.read_csv(csv_name + ".csv")

    # Output final dataframe
    outname = csv_name + "pipe_delimited"
    overage_df.to_csv(outname + ".csv", index=False, sep='|')

    # Create Table
    create_jump_price(cur)

    # Load to Database
    uf.aws_load(outname, "jump_price", cur)

    # Commit changes to database
    conn.commit()
conn, cur = uf.aws_connect()

# Load ofo users from the report provided by DDOT
user_report_loc = os.path.join("data", "User ids.xlsx")
ofo_df = pd.read_excel(user_report_loc, sheet_name="ofo User IDs")

# Drop the second row since it's a second header
ofo_df = ofo_df[ofo_df['October'] != 'UserID']
ofo_df_stack = ofo_df.stack()
ofo_df_stack = ofo_df_stack.reset_index()

# Define month as a datetime
ofo_df_stack['usage_month'] = ofo_df_stack['level_1'].apply(
    convert_month_toDT)
ofo_df_stack.drop(['level_0', 'level_1'], axis=1, inplace=True)
ofo_df_stack.columns = ['user_id', 'usage_month']

# Count trips by user and month
ofo_count_df = ofo_df_stack.groupby(['user_id', 'usage_month']).size()
ofo_count_df = ofo_count_df.reset_index()
ofo_count_df.columns = ['user_id', 'usage_month', 'trips']
ofo_count_df.sort_values(['user_id', 'usage_month'], inplace=True)

# Output dataframe as CSV
outname = "ofo_users"
ofo_count_df.to_csv(os.path.join("data", outname + ".csv"),
                    index=False, sep='|')

# Create Table in AWS
create_ofo_users(cur)

# Load to Database
uf.aws_load(outname, "ofo_users", cur)

# Commit changes to database
conn.commit()
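# NOTE (hedged sketch): `convert_month_toDT` is defined elsewhere in this repo
# and is assumed to turn a month-name column label like "October" into a
# first-of-month datetime. The year logic below (Oct-Dec in 2017, Jan onward
# in 2018, matching the dockless pilot period) is an assumption.

import datetime


def convert_month_toDT_sketch(month_name):
    """Map a month name such as 'October' to the first day of that month."""
    month_num = datetime.datetime.strptime(month_name, '%B').month
    year = 2017 if month_num >= 10 else 2018  # assumed pilot-period years
    return datetime.datetime(year, month_num, 1)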
results_df = pd.concat([results_df, first_record_df], axis=0)
results_df = results_df[:-1]

# Fill missing data with zeros
results_df.fillna(0, inplace=True)

# Convert precipIntensityMaxTime to integer
results_df['precipintensitymaxtime'] = results_df[
    'precipintensitymaxtime'].astype('int')

# Drop weather fields new to 2018
drop_cols = [
    'ozone', 'uvindex', 'uvindextime', 'windgust', 'windgusttime', 'time'
]
for drop_col in drop_cols:
    if drop_col in results_df.columns:
        results_df = results_df.drop([drop_col], axis=1)

# Reorder columns based on the current table
results_df['day_time'] = results_df['day_time'].astype(int)
results_df = results_df[first_record_df.columns]

# Output final dataframe
TIMESTR = time.strftime("%Y%m%d_%H%M%S")
outname = ("Dark_Sky_From_" + start_date.strftime('%Y-%m-%d') +
           "_To_" + end_date.strftime('%Y-%m-%d'))
results_df.to_csv(os.path.join("data", outname + ".csv"),
                  index=False, sep='|')

# Create Table
create_dark_sky_raw(cur)

# Load to Database
uf.aws_load(outname, "dark_sky_raw", cur)

# Commit changes to database
conn.commit()
if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Load raw CSV as Dataframe
    raw_df = pd.read_csv(os.path.join("data", "bikes_per_day.csv"))

    # Lowercase provider and replace limebike with lime
    raw_df['provider'] = raw_df['provider'].str.lower().str.replace(
        'limebike', 'lime')

    # Replace data outliers with a reasonable number for Jump
    jump_outlier_mask = (raw_df['ride_date'].isin(
        ['2018-02-22', '2018-02-23'])) & (raw_df['provider'] == 'jump')
    raw_df.loc[jump_outlier_mask, 'daily_bikes_available'] = 71
    print(raw_df.tail(20))

    # Output dataframe as CSV
    outname = "dockless_bikes_api"
    raw_df.to_csv(os.path.join("data", outname + ".csv"),
                  index=False, sep='|')

    # Create Table in AWS
    create_dockless_bikes_api(cur)

    # Load to Database
    uf.aws_load(outname, "dockless_bikes_api", cur)

    # Commit changes to database
    conn.commit()
def create_cabi_price(cur):
    # This function creates the CaBi Price AWS table
    cur.execute("""
        DROP TABLE cabi_price;
        CREATE TABLE cabi_price(
            min_seconds integer PRIMARY KEY,
            max_seconds integer,
            casual_cost numeric,
            member_cost numeric
        )
        """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Read in Overage Data
    csv_name = "CaBiOverageCosts"
    overage_df = pd.read_csv(csv_name + ".csv")

    # Output final dataframe
    outname = csv_name + "pipe_delimited"
    overage_df.to_csv(outname + ".csv", index=False, sep='|')

    # Create Table
    create_cabi_price(cur)

    # Load to Database
    uf.aws_load(outname, "cabi_price", cur)

    # Commit changes to database
    conn.commit()
df_dc_month = df_dc_month.append(year_month_1718)

# Fill down growth rates for population and bike share
df_dc_month['grow_rate'] = df_dc_month['grow_rate'].fillna(method='ffill')
df_dc_month['pct_bike'] = df_dc_month['pct_bike'].fillna(method='ffill')
df_dc_month = df_dc_month.reset_index()

# Apply growth rate to population figures
for x in range(11):
    building_month()
df_dc_month['citypop'] = df_dc_month['citypop'].fillna(value=0)
for x in df_dc_month.index:
    apply_rate()

# Keep final fields and drop any additional records
df_dc_month['day'] = 1
df_dc_month['pop_date'] = pd.to_datetime(
    df_dc_month[['year', 'month', 'day']])
df_dc_month = df_dc_month[['pop_date', 'citypop', 'grow_rate', 'pct_bike']]
df_dc_month = df_dc_month.dropna(how='any')

# Output dataframe as CSV
outname = "dc_pop"
df_dc_month.to_csv(outname + ".csv", index=False, sep='|')

# Create Table
create_dc_pop(cur)

# Load to Database
uf.aws_load(outname, "dc_pop", cur)

# Commit changes to database
conn.commit()
        'Local Purchase Month', 'Total Number Bikes Purchased'
    ]
    df = df[keep_cols]
    df.columns = ['Local Purchase Month', membership_type]

    # Set month as index
    df.set_index(['Local Purchase Month'], inplace=True)

    # Append dataframe to list
    df_list.append(df)

# Combine dataframes
combined_df = pd.concat(df_list, axis=1)
combined_df.reset_index(inplace=True)
combined_df['Local Purchase Month'] = combined_df[
    'Local Purchase Month'].astype('datetime64[ns]')

# Fill missing with zeros
combined_df = combined_df.fillna(0)

# Output dataframe as CSV
outname = "cabi_membership"
combined_df.to_csv(os.path.join("data", outname + ".csv"),
                   index=False, sep='|')

# Create Table in AWS
create_cabi_membership(cur)

# Load to Database
uf.aws_load(outname, "cabi_membership", cur)

# Commit changes to database
conn.commit()
        DROP TABLE IF EXISTS """ + ml_name + """;
        CREATE TABLE """ + ml_name + """(
            date date,
            dless numeric,
            error numeric,
            neg_error numeric,
            t date,
            yhat numeric,
            ytest numeric,
            dless_impact numeric);
        """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    for ml_file in os.listdir(os.path.join("data", "ml")):
        # Read in ML output data
        csv_name = ml_file.replace(".csv", "")
        ml_df = pd.read_csv(os.path.join("data", "ml", csv_name + ".csv"))

        # Output final dataframe
        outname = csv_name + "pipe_delimited"
        ml_df.to_csv(os.path.join("data", outname + ".csv"),
                     index=False, sep='|')

        # Create Table
        create_ml(cur, ml_name=csv_name)

        # Load to Database
        uf.aws_load(outname, csv_name, cur)

    # Commit changes to database
    conn.commit()
    # This function creates the dockless_price AWS table
    cur.execute("""
        DROP TABLE dockless_price;
        CREATE TABLE dockless_price(
            min_seconds integer PRIMARY KEY,
            max_seconds integer,
            limebike numeric,
            spin numeric,
            ofo numeric,
            mobike numeric
        )
        """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Read in dockless trip cost data
    csv_name = "DocklessTripCosts"
    overage_df = pd.read_csv(csv_name + ".csv")

    # Output final dataframe
    outname = csv_name + "pipe_delimited"
    overage_df.to_csv(outname + ".csv", index=False, sep='|')

    # Create Table
    create_dockless_price(cur)

    # Load to Database
    uf.aws_load(outname, "dockless_price", cur)

    # Commit changes to database
    conn.commit()
if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Connect to Google Calendar
    CAL = gcal_authorization()

    # Pull all WABA events
    results_df = pull_waba_events(CAL)

    # Limit calendar events to those that are significant based on
    # subjective filtering
    sign_events_df = limit_sign_events(results_df)

    # Combine start date and start datetime
    sign_events_df['final_date'] = combine_dates()

    # Keep only id, final date, and summary, sorted by final date
    sign_events_df = sign_events_df[['id', 'final_date', 'summary']]
    sign_events_df.sort_values(['final_date'], inplace=True)

    # Output full WABA calendar
    TIMESTR = time.strftime("%Y%m%d_%H%M%S")
    filename = "WABA_Calendar_" + TIMESTR + ".csv"
    results_df.to_csv(filename, index=True)

    # Output significant events
    outname = "WABA_Significant_Events_" + TIMESTR
    sign_events_df.to_csv(outname + ".csv", index=False, sep='|')

    # Create Table
    create_bike_events(cur)

    # Load to Database
    uf.aws_load(outname, "bike_events", cur)

    # Commit changes to database
    conn.commit()
if __name__ == "__main__":
    # Pull CaBi trips data from the AWS S3 store
    links = get_links()
    download_links(links)

    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Loop through all CSVs in the CaBi trip data folder
    cabi_trip_dir = '../cabi_trip_data'

    # Convert trip data from CSV to dataframe
    combined_df = trips_to_df(cabi_trip_dir)

    # Add trip_id continuing from the last record in the AWS table
    trip_id_df = pd.read_sql("""SELECT trip_id
                                FROM cabi_trips
                                ORDER BY trip_id DESC
                                LIMIT 1""", con=conn)
    last_trip_id = trip_id_df['trip_id'].iloc[0]
    combined_df.reset_index(inplace=True)
    combined_df['trip_id'] = combined_df.index + 1 + last_trip_id

    # Drop unneeded fields
    combined_df.drop(['index'], axis=1, inplace=True)

    # Output dataframe as CSV
    outname = "CaBi_Trip_Data"
    combined_df.to_csv(outname + ".csv", index=False, sep='|')

    # Create Table
    create_cabi_trips(cur)

    # Load to Database
    uf.aws_load(outname, "cabi_trips", cur)

    # Commit changes to database
    conn.commit()
    # Parse the page HTML with BeautifulSoup
    soup = BeautifulSoup(r.text, "lxml")

    # Extract the schedule table from the HTML
    schedule = soup.find('div', attrs={'class': 'overthrow table_container'})

    # Create dataframe
    schedule_df = pd.read_html(str(schedule))[0]

    # Drop extra header rows embedded in the dataframe
    schedule_df = schedule_df[schedule_df['Gm#'] != 'Gm#']

    # Format date field
    schedule_df['Date'] = format_date_field(schedule_df['Date'])

    # Append games from the current iteration (season) to the list
    schedule_df_list.append(schedule_df)

# Concat all the dataframes into one big schedule
nats_sched = pd.concat(schedule_df_list, axis=0)
nats_sched.rename(columns={'Unnamed: 2': 'BoxScore',
                           'Unnamed: 4': 'Home_Away'}, inplace=True)

# Keep only games played
nats_sched = nats_sched[nats_sched['BoxScore'] == 'boxscore']

# Fill in zeros for integer columns
integer_columns = ['R', 'RA', 'Inn', 'Rank', 'Attendance']
nats_sched[integer_columns] = nats_sched[integer_columns].fillna(value=0)

# Output dataframe as CSV
outname = "nats_attendance"
nats_sched.to_csv(os.path.join("data", outname + ".csv"),
                  index=False, sep='|')

# Create Table in AWS
create_nats_attendance(cur)

# Load to Database
uf.aws_load(outname, "nats_attendance", cur)

# Commit changes to database
conn.commit()
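# NOTE (hedged sketch): `format_date_field` is defined earlier in this script.
# Baseball-Reference schedule dates read like "Thursday, Apr 5", with a
# "(1)"/"(2)" suffix for doubleheaders and no year, so the helper is assumed
# to strip the suffix and attach the season year, roughly as follows. The
# `season` parameter is an assumption for illustration.

import pandas as pd


def format_date_field_sketch(date_series, season=2018):
    """Parse 'Thursday, Apr 5'-style strings into datetimes for one season."""
    cleaned = date_series.str.replace(r'\s*\(\d\)$', '', regex=True)
    return pd.to_datetime(cleaned + ', ' + str(season),
                          format='%A, %b %d, %Y')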
        tranwork integer,
        carpool integer,
        riders integer,
        trantime integer,
        qtrantim integer,
        qtranwor integer)
        """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Load CSV from IPUMS download as Dataframe
    csv_name = 'usa_00007'
    acs_df = pd.read_csv(csv_name + ".csv")

    # Set Primary Key
    acs_df.reset_index(inplace=True)

    # Fill all NA with zeros
    acs_df.fillna(0, inplace=True)

    # Output dataframe as CSV
    outname = csv_name + "_pipe_delimited"
    acs_df.to_csv(outname + ".csv", index=False, sep='|')

    # Create Table
    create_acs(cur)

    # Load to Database
    uf.aws_load(outname, "acs", cur)

    # Commit changes to database
    conn.commit()
def create_metro_hours(cur):
    # This function creates the metro hours AWS table
    cur.execute("""
        DROP TABLE IF EXISTS metro_hours;
        CREATE TABLE metro_hours (
            day_of_week integer,
            op_status text,
            start_time time,
            end_time time)
        """)


if __name__ == "__main__":
    # Connect to AWS
    uf.set_env_path()
    conn, cur = uf.aws_connect()

    # Load metro hours CSV as Dataframe
    csv_name = 'metro_hours'
    metro_hours_df = pd.read_csv(os.path.join("data", csv_name) + ".csv")

    # Output dataframe as CSV
    outname = csv_name + "_pipe_delimited"
    metro_hours_df.to_csv(os.path.join("data", outname) + ".csv",
                          index=False, sep='|')

    # Create Table
    create_metro_hours(cur)

    # Load to Database
    uf.aws_load(outname, "metro_hours", cur)

    # Commit changes to database
    conn.commit()