def to_redshift():
    pr.connect_to_redshift(dbname='pricing',
                           host='pricing.cfefnwtyvvt2.us-east-1.rds.amazonaws.com',
                           port='5432',
                           user='******',
                           password='******')
    print('Connected to Redshift')
    return pr.redshift_to_pandas('select * from kroton_pricing.bd_ccr')
def get_raw(sample_flag):
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)
    pr.connect_to_redshift(dbname='muni',
                           host='jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                           port='5439',
                           user=credentials['user'],
                           password=credentials['password'])
    if sample_flag:
        df = pr.redshift_to_pandas("""select * from vehicle_monitoring limit 1000""")
        df.to_csv('data/vehicle_monitoring_sample.csv', index=False)
    else:
        df = pr.redshift_to_pandas("""select * from vehicle_monitoring""")
        df.to_csv('data/vehicle_monitoring.csv', index=False)
    pr.close_up_shop()
    return df
def rs_data_select(query):
    pr.connect_to_redshift(dbname=DBNAME, host=HOST, port=PORT,
                           user=RS_ID, password=RS_PW)
    df = pr.redshift_to_pandas(query)
    pr.close_up_shop()
    df = df.round(2)
    return df
def db_pandas_query(query):
    """Read the result of a Redshift query into a pandas DataFrame."""
    pr.connect_to_redshift(dbname=DB_NAME, host=DB_HOST, port=DB_PORT,
                           user=DB_USER, password=DB_PASSWORD)
    data = pr.redshift_to_pandas(query)
    pr.close_up_shop()
    return data
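# Example usage (a sketch; the table name here is hypothetical):
# df = db_pandas_query('select * from analytics.sales limit 100')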
def get_distributions(sample_flag):
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)
    pr.connect_to_redshift(dbname='muni',
                           host='jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                           port='5439',
                           user=credentials['user'],
                           password=credentials['password'])
    if sample_flag:
        df = pr.redshift_to_pandas("""select departure_time_hour, departure_stop_id,
            arrival_stop_id, shape, scale, shape*scale as mean
            from distributions_gamma limit 1000""")
        df.to_csv('data/distributions_gamma_sample.csv', index=False)
    else:
        df = pr.redshift_to_pandas("""select departure_time_hour, departure_stop_id,
            arrival_stop_id, shape, scale, shape*scale as mean
            from distributions_gamma""")
        df.to_csv('data/distributions_gamma.csv', index=False)
    pr.close_up_shop()
    return df
def get_distributions():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)
    pr.connect_to_redshift(dbname='muni',
                           host='jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                           port='5439',
                           user=credentials['user'],
                           password=credentials['password'])
    df = pr.redshift_to_pandas("""select *,
        convert_timezone('US/Pacific', departure_time_hour) as local_departure_time_hour
        from distributions_gamma""")
    pr.close_up_shop()
    return df
def stops_to_durations():
    connect_to_redshift()
    df = pr.redshift_to_pandas("""select a.*
        from (select data_frame_ref, stop_id
              from stop_events
              group by data_frame_ref, stop_id) a
        left join (select data_frame_ref, departure_stop_id
                   from trip_durations
                   group by data_frame_ref, departure_stop_id) b
        on a.data_frame_ref = b.data_frame_ref
        and a.stop_id = b.departure_stop_id
        where b.data_frame_ref is null
        and b.departure_stop_id is null
        and a.data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
        order by a.data_frame_ref, a.stop_id;""")

    n_days_dep_stops = df.shape[0]
    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        dep_stop_id = row['stop_id']
        print("Processing data_frame_ref {}, departure_stop_id {} ({} of {})".format(
            data_frame_ref, dep_stop_id, (i + 1), n_days_dep_stops))
        pr.exec_commit("""insert into trip_durations
            select a.data_frame_ref,
                   a.trip_id,
                   a.stop_id as departure_stop_id,
                   a.stop_time as departure_time,
                   a.stop_time_unix as departure_time_unix,
                   s.stop_id as arrival_stop_id,
                   s.stop_time as arrival_time,
                   s.stop_time_unix as arrival_time_unix,
                   s.stop_time_unix - a.stop_time_unix as trip_duration,
                   date_trunc('hour', a.stop_time) as departure_time_hour
            from (select * from stop_events
                  where data_frame_ref = '{}'
                  and stop_id = {}) a
            join stop_events s
              on a.data_frame_ref = s.data_frame_ref
             and a.trip_id = s.trip_id
             and s.stop_time_unix > a.stop_time_unix""".format(data_frame_ref, dep_stop_id))
    pr.close_up_shop()
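# The .format() interpolation above builds SQL by string substitution, which is
# fine for trusted values but injection-prone otherwise. pandas_redshift's
# exec_commit only accepts a finished SQL string, so a parameterised variant
# needs a direct database connection. A minimal sketch, assuming a psycopg2
# connection opened with the same credentials (the helper and conn_params are
# hypothetical, not part of the original code):
import psycopg2

def count_stop_events(conn_params, data_frame_ref, stop_id):
    """Hypothetical helper: bound parameters instead of .format()."""
    with psycopg2.connect(**conn_params) as conn:
        with conn.cursor() as cur:
            cur.execute(
                "select count(*) from stop_events "
                "where data_frame_ref = %s and stop_id = %s",
                (data_frame_ref, stop_id))
            return cur.fetchone()[0]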
db_name = "info7374dbassignment2"#-------------------------------------Redshift: Database Name for gaming data master_username = "******"#----------------------------------------Redshift: Admin Username master_password = "******"#---------------------------------Redshift: Admin Password hostname = "info7374clusterproject.cwtvmzfhaqaf.us-east-1.redshift.amazonaws.com" #----------------Redshift: Hostname for database port_number = 5439 #----------------Redshift: Port Number for databse pr.connect_to_redshift(dbname = db_name , host = hostname, port = port_number, user = master_username, password =master_password) online = pr.redshift_to_pandas('select * from sales') online.head(5) # drop the row missing customerID online = online[online.customerid.notnull()] # extract year, month and day online['invoiceday'] = online.invoicedate.apply(lambda x: dt.datetime(x.year, x.month, x.day)) online.head() monthly_unique_customers_df = online.set_index('invoiceday')['customerid'].resample('M').nunique() monthly_unique_customers_df pd.DataFrame(monthly_unique_customers_df)['invoicedate']=pd.DataFrame(monthly_unique_customers_df).index
from flask import Flask, jsonify, json
import pandas as pd
import pandas_redshift as pr

pr.connect_to_redshift(dbname='habladb',
                       host='habla-ai.csvoexx0fghm.us-west-2.redshift.amazonaws.com',
                       port=5439,
                       user='******',
                       password='******')

# MEAN UPTIME AND DOWNTIME ACROSS MULTIPLE PLANTS
LAMB_WESTON_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.lbw_mean_uptime t')
LAMB_WESTON_DOWNTIME = pr.redshift_to_pandas('SELECT t.* FROM public.lbw_mean_downtime t')

# PASCO UPTIME BY DAY BY LINE
PASCO_L1_S6_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s6_up_groupby t')
PASCO_L1_S7_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s7_up_groupby t')
PASCO_L1_S8_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s8_up_groupby t')
PASCO_L1_S9_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s9_up_groupby t')
PASCO_L1_S10_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s10_up_groupby t')
PASCO_L2_S1_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s1_up t')
PASCO_L2_S2_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s2_up t')
PASCO_L2_S3_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s3_up t')
PASCO_L2_S4_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s4_up t')
PASCO_L2_S5_UPTIME = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l2_s5_up t')

# PASCO REASON LEVEL 1
PASCO_L1_S6_RLVL1 = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_rlv1_l1s6 t')
PASCO_L1_S7_RLVL1 = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_rlv1_l1s7 t')
def connect_to_redshift():
    # Reconstructed from the identical connection pattern used elsewhere in
    # this project; the original snippet was truncated mid-function.
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)
    pr.connect_to_redshift(dbname='muni',
                           host='jonobate.c9xvjgh0xspr.us-east-1.redshift.amazonaws.com',
                           port='5439',
                           user=credentials['user'],
                           password=credentials['password'])

def connect_to_s3():
    with open('credentials.json') as json_data:
        credentials = json.load(json_data)
    pr.connect_to_s3(aws_access_key_id=credentials['aws_access_key_id'],
                     aws_secret_access_key=credentials['aws_secret_access_key'],
                     bucket='jonobate-bucket')

if __name__ == '__main__':
    # Get raw data for processing
    connect_to_redshift()
    print('Getting vehicle_monitoring data from Redshift...')
    df = pr.redshift_to_pandas("""select * from vehicle_monitoring
        where data_frame_ref not in (select distinct data_frame_ref from stop_events)
        and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()));""")
    pr.close_up_shop()

    # Parse into stop events
    df = raw_to_stops(df)

    # Write results to stop_events
    connect_to_s3()
    connect_to_redshift()
    print('Writing stop_events data to Redshift...')
    pr.pandas_to_redshift(data_frame=df,
                          redshift_table_name='stop_events',
                          append=True)

    # Get stop events for processing
pr.pandas_to_redshift(data_frame=df, redshift_table_name='analytics.trip_fact')

dfroutes = (df.groupby(['"start station id"', '"end station id"'])
              .size()
              .sort_values(ascending=False)
              .reset_index(name='count'))
dfroutes.columns = ['start_station_id', 'end_station_id', 'count']
#print(type(dfroutes))
pr.pandas_to_redshift(data_frame=dfroutes,
                      redshift_table_name='analytics.most_used_routes',
                      append=True)

dataframecount = pr.redshift_to_pandas("select * from analytics.most_used_routes")
newdataframecount = pd.DataFrame(columns=('start_station_id', 'end_station_id', 'num_trips'))
print(dataframecount)

for index, row in df.iterrows():
    for routeindex, routerow in dataframecount.iterrows():
        if (int(row['"start station id"']) == int(routerow['start_station_id'])
                and int(row['"end station id"']) == int(routerow['end_station_id'])):
            # Increment the count on the underlying frame; the table written
            # above names this column 'count', and mutating the iterrows()
            # row copy would be silently discarded.
            dataframecount.at[routeindex, 'count'] += 1
            break
    else:
        # No existing route matched: record a new one. DataFrame.append
        # returns a new frame rather than modifying in place, and the three
        # fields belong in a single dict (the original passed three separate
        # positional dicts, which raises a TypeError).
        newdataframecount = newdataframecount.append(
            {'start_station_id': row['"start station id"'],
             'end_station_id': row['"end station id"'],
             'num_trips': 1},
            ignore_index=True)
from flask import Flask, jsonify, json
import pandas as pd
import pandas_redshift as pr

pr.connect_to_redshift(dbname='habladb',
                       host='habla-ai.csvoexx0fghm.us-west-2.redshift.amazonaws.com',
                       port=5439,
                       user='******',
                       password='******')

pasco_L1_S6_down = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s6_down t')
pasco_L1_S6_up = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s6_up t')
pasco_L1_S7_down = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s7_down t')
pasco_L1_S7_up = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s7_up t')
pasco_L1_S8_down = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s8_down t')
pasco_L1_S8_up = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s8_up t')
pasco_L1_S9_down = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s9_down t')
pasco_L1_S9_up = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s9_up t')
pasco_L1_S10_down = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s10_down t')
pasco_L1_S10_up = pr.redshift_to_pandas('SELECT t.* FROM public.pasco_l1_s10_up t')
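# A possible DRY alternative to the one-variable-per-table pattern above
# (a sketch, assuming the table list is maintained by hand): load each
# line/station table into a dict keyed by table name.
pasco_tables = [
    'pasco_l1_s6_down', 'pasco_l1_s6_up',
    'pasco_l1_s7_down', 'pasco_l1_s7_up',
    # ... remaining line/station tables
]
pasco_frames = {
    name: pr.redshift_to_pandas('SELECT t.* FROM public.{} t'.format(name))
    for name in pasco_tables
}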
def business_loss():
    images = [join("customer_lifetime_value/", f)
              for f in listdir("./static/customer_lifetime_value")]

    db_name = "info7374dbassignment2"  # Redshift: database name for gaming data
    master_username = "******"         # Redshift: admin username
    master_password = "******"         # Redshift: admin password
    hostname = "info7374clusterproject.cwtvmzfhaqaf.us-east-1.redshift.amazonaws.com"  # Redshift: hostname
    port_number = 5439                 # Redshift: port number

    pr.connect_to_redshift(dbname=db_name, host=hostname, port=port_number,
                           user=master_username, password=master_password)

    data = pr.redshift_to_pandas('select * from sales')
    data = data.drop_duplicates()
    data = data[pd.notnull(data['customerid'])]
    data = data[(data['quantity'] > 0)]

    # Most bought product
    data['description'].value_counts()[:10]

    # Which customer bought the most items?
    cust_data = pd.DataFrame()
    cust_data['customerid'] = list(set(data['customerid']))
    cust_data = cust_data.set_index('customerid')
    for cust_id in cust_data.index:
        cust_data.at[cust_id, 'Number_of_items'] = len(
            data[data['customerid'] == cust_id]['description'])
    cust_data = cust_data.sort_values('Number_of_items', ascending=False)

    # stockcode description states
    data = data[['customerid', 'invoicedate', 'invoiceno', 'quantity', 'unitprice']]

    # Calculate total purchase
    data['TotalPurchase'] = data['quantity'] * data['unitprice']

    data_group = data.groupby('customerid').agg({
        'invoicedate': lambda date: (date.max() - date.min()).days,
        'invoiceno': lambda num: len(num),
        'quantity': lambda quant: quant.sum(),
        'TotalPurchase': lambda price: price.sum()
    })

    # Rename the columns
    data_group.columns = ['num_days', 'num_transactions', 'num_units', 'spent_money']
    data_group.head()

    # Average order value
    data_group['avg_order_value'] = data_group['spent_money'] / data_group['num_transactions']

    purchase_frequency = sum(data_group['num_transactions']) / data_group.shape[0]

    # Repeat rate
    repeat_rate = data_group[data_group.num_transactions > 1].shape[0] / data_group.shape[0]

    # Churn rate
    churn_rate = 1 - repeat_rate
    purchase_frequency, repeat_rate, churn_rate

    # Profit margin (assumed 5% of spend)
    data_group['profit_margin'] = data_group['spent_money'].astype('float') * 0.05

    # Customer value
    data_group['CLV'] = (data_group['avg_order_value'].astype('float')
                         * purchase_frequency) / churn_rate

    # Customer lifetime value
    data_group['cust_lifetime_value'] = (data_group['CLV'].astype('float')
                                         * data_group['profit_margin'].astype('float'))
    data_group.head()

    clv = data_group.loc[:, "cust_lifetime_value"].mean() / 1000000

    # Drop rows missing customerid
    data = data[data.customerid.notnull()]

    # Extract year, month and day
    data['invoiceday'] = data.invoicedate.apply(lambda x: dt.datetime(x.year, x.month, x.day))
    data.head()

    monthly_unique_customers_df = data.set_index('invoiceday')['customerid'].resample('M').nunique()
    # Keep the month as a column (the original assigned into a throwaway
    # pd.DataFrame(...) that was immediately discarded).
    df = pd.DataFrame(monthly_unique_customers_df)
    df['invoicedate'] = df.index
    df = df.reset_index()

    Customer_count = df.loc[:, "customerid"].mean()
    df["CustomerIDshift"] = [0] + list(df["customerid"][:-1])
    df["ChurnRate"] = (df["CustomerIDshift"] - df["customerid"]) / df["CustomerIDshift"]
    df.rename(columns={'invoiceday': 'Month'}, inplace=True)
    df.loc[0, 'ChurnRate'] = 1
    data = df.drop(columns=['customerid', 'CustomerIDshift'])
    table1 = data
    table1
    from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
    from statsmodels.tsa.arima_model import ARIMA

    data = data.set_index('Month')
    data.index
    model = ARIMA(data, order=(2, 1, 0))
    model_fit = model.fit(disp=0)
    print(model_fit.summary())

    # Plot residual errors
    # residuals = pd.DataFrame(model_fit.resid)
    # residuals.plot()
    # plt.show()
    # residuals.plot(kind='kde')
    # plt.show()

    X = data.values
    history = [x for x in X]
    # Forecast horizon labels (the original listed '2019-06-31', which is not
    # a valid date; June has 30 days)
    test = ['2019-01-31', '2019-02-28', '2019-03-31',
            '2019-04-30', '2019-05-31', '2019-06-30']
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=(2, 1, 0))
        model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        history.append(yhat)
        predictions.append(yhat)
        print('predicted=%f' % (yhat))
    print(predictions)

    yes_array = []
    for value in predictions:
        print(value)
        yes_array.append(value)

    df_toplot = pd.DataFrame({"ChurnRate": yes_array, "Month": test})
    df_toplot["Business_Loss"] = df["ChurnRate"] * clv * Customer_count
    x = df_toplot["Business_Loss"].astype(int)
    df_toplot['Business Loss'] = x
    final_df = df_toplot
    del final_df['Business_Loss']
    table2 = final_df

    table1 = table1.to_html(classes="data")
    table2 = table2.to_html(classes="data")
    return render_template('business_loss.html',
                           tables=[table1, table2],
                           titles=["Blah", "Churn Rate", "Future Churn Rate"],
                           images=images,
                           clv=clv)
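# Quick numeric check of the CLV formulas used in business_loss() above
# (illustrative values only; these numbers are assumptions, not taken from
# the sales data):
avg_order_value = 50.0      # spent_money / num_transactions
purchase_frequency = 2.5    # total transactions / number of customers
churn_rate = 0.4            # 1 - repeat_rate
profit_margin = 25.0        # spent_money * 0.05, for spent_money = 500.0
clv_example = (avg_order_value * purchase_frequency) / churn_rate  # 312.5
lifetime_value = clv_example * profit_margin                       # 7812.5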
import pandas_redshift as pr

pr.connect_to_redshift(dbname='dev',
                       host='redshift-cluster-1.cajhj66uu5bu.ap-northeast-1.redshift.amazonaws.com',
                       port='5439',
                       user='******',
                       password='******')
df = pr.redshift_to_pandas('select * from test')
    print(dir(repos))
    connection = engine.connect()
    connection.execute(repos.execution_options(autocommit=True))
    connection.close()
    return file_json

pr_redshift = pr.connect_to_redshift(
    dbname='dev',
    host='redshift-cluster-1.cug5ajtfsvsw.us-west-2.redshift.amazonaws.com',
    port=5439,
    user='******',
    password='******')

for table in redshift_tables:
    # Interpolate the table name; the original queried the literal table
    # named "table" instead of the loop variable.
    data = pr.redshift_to_pandas('select * from {}'.format(table))
    data_ = data['data']

def get_modules_and_for_position(file):
    imports = []
    # Get all imported modules
    result = re.findall(r"(?<!from)import (\w+)[\n.]|from\s+(\w+)\s+import", file)
    # imports = [i for imp in result for i in imp if len(i) and i not in imports]
    for imp in result:
        for i in imp:
            if len(i) and i not in imports:
                imports.append(i)
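# Quick illustration of what the import-scanning regex above matches
# (the sample source string is made up for this sketch):
import re
sample_source = "import os\nimport re\nfrom collections import defaultdict\n"
matches = re.findall(r"(?<!from)import (\w+)[\n.]|from\s+(\w+)\s+import", sample_source)
# matches == [('os', ''), ('re', ''), ('', 'collections')]
# Each tuple has the plain-import capture first and the from-import capture
# second; the loop above flattens these into ['os', 're', 'collections'].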
str_s3bucket = secrets.str_s3bucket
str_s3subdirectory = secrets.str_s3subdirectory

# Delete secrets.py
del secrets

# Create pandas-redshift connection
pr.connect_to_redshift(dbname=str_dbname,
                       host=str_host,
                       port=str_port,
                       user=str_user,
                       password=str_pw)

# Create dataframe from redshift query
sql_query = "SELECT * FROM <database>.<schema>.<table>;"
df = pr.redshift_to_pandas(sql_query)
print("Shape of dataframe: ", df.shape)

# Create sample dataframe for upload
df_upload = pd.DataFrame({
    'a_col': ['red', 'green', 'blue'],
    'b_col': [1, 2, 3],
    'c_col': [True, False, True],
    'd_col': ['2020-01-01', '2020-02-04', '2020-03-06'],
})

# =============================================================
# Write a pandas DataFrame to Redshift. Requires access to an S3 bucket and
# a prior call to pr.connect_to_redshift. If the table currently exists IT
# WILL BE DROPPED and the pandas DataFrame will be put in its place. If you
# set append = True the table will be appended to (if it exists).
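# A minimal sketch of the upload described above. The AWS key variables are
# assumptions (in a real script they would be captured from secrets.py before
# it is deleted, like the bucket values above); connect_to_s3 must run before
# pandas_to_redshift can stage the CSV in S3.
pr.connect_to_s3(aws_access_key_id=str_aws_access_key,      # hypothetical, from secrets.py
                 aws_secret_access_key=str_aws_secret_key,  # hypothetical, from secrets.py
                 bucket=str_s3bucket,
                 subdirectory=str_s3subdirectory)
pr.pandas_to_redshift(data_frame=df_upload,
                      redshift_table_name='<schema>.<table>',
                      append=True)  # append instead of dropping the table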
def raw_to_stops():
    connect_to_redshift()
    connect_to_s3()

    # Load stop data
    df_stop_times = pd.read_csv('gtfs/stop_times.txt')

    print('Getting vehicle_monitoring data from Redshift...')
    df = pr.redshift_to_pandas("""select data_frame_ref
        from vehicle_monitoring
        where data_frame_ref not in (select distinct data_frame_ref from stop_events)
        and data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
        group by data_frame_ref""")

    n_days = df.shape[0]
    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        print("Processing data_frame_ref {} ({} of {})".format(data_frame_ref, (i + 1), n_days))

        df_cur = pr.redshift_to_pandas("""select * from vehicle_monitoring
            where data_frame_ref = '{}';""".format(data_frame_ref))

        # Only bother with this if we actually have data...
        if df_cur.shape[0] == 0:
            print("No data for {}, skipping...".format(data_frame_ref))
        else:
            # Convert datetimes
            df_cur['recorded_time'] = pd.to_datetime(df_cur['recorded_time'])
            df_cur['valid_until_time'] = pd.to_datetime(df_cur['valid_until_time'])
            df_cur['data_frame_ref'] = pd.to_datetime(df_cur['data_frame_ref'])
            df_cur['expected_arrival_time'] = pd.to_datetime(df_cur['expected_arrival_time'])
            df_cur['expected_departure_time'] = pd.to_datetime(df_cur['expected_departure_time'])

            # Sort values, reset index
            df_cur = df_cur.sort_values(['data_frame_ref', 'journey_ref', 'recorded_time'])
            df_cur = df_cur.reset_index(drop=True)
            df_cur['join_index'] = df_cur.index.astype(int)

            # Create offset dataframe
            df_next = df_cur[['data_frame_ref', 'journey_ref', 'recorded_time',
                              'stop_point_ref', 'stop_point_name']]
            df_next = df_next.add_suffix('_next')
            df_next['join_index'] = df_next.index
            df_next['join_index'] = df_next['join_index'].astype(int) - 1

            # Join data to offset data
            df_stops = df_cur.merge(df_next, on='join_index')

            # Filter to stop events
            df_stops = df_stops[
                (df_stops['data_frame_ref'] == df_stops['data_frame_ref_next'])
                & (df_stops['journey_ref'] == df_stops['journey_ref_next'])
                & (df_stops['stop_point_ref'] != df_stops['stop_point_ref_next'])]

            # Add in stop time column
            df_stops['stop_time'] = df_stops['recorded_time'] + (
                df_stops['recorded_time_next'] - df_stops['recorded_time']) / 2

            # Drop unneeded columns
            df_stops = df_stops[['data_frame_ref', 'journey_ref', 'stop_point_ref', 'stop_time']]

            # Create output dataframe
            df_final = pd.DataFrame(columns=['data_frame_ref', 'trip_id', 'stop_id',
                                             'stop_time', 'stop_time_unix'])

            n_trips = len(df_stops['journey_ref'].unique())

            # For each trip on that day...
            for j, trip_id in enumerate(df_stops['journey_ref'].unique()):
                print("  Processing trip_id {} ({} of {})".format(trip_id, (j + 1), n_trips))

                # Get actual data for this trip. Rename columns to match stop data.
                df_stops_actual = df_stops[df_stops['journey_ref'] == trip_id].rename(
                    index=str, columns={"journey_ref": "trip_id", "stop_point_ref": "stop_id"})

                # Get stop data for this trip
                df_stops_all = df_stop_times[df_stop_times['trip_id'] == trip_id]

                # Fix to deal with the fact that stop_ids are in a slightly different format
                df_stops_all['stop_id'] = ('1' + df_stops_all['stop_id'].astype(str)).astype(int)

                # Merge dataframes together
                df_merged = df_stops_all.merge(df_stops_actual,
                                               on=['trip_id', 'stop_id'], how='left')

                # Create unix time column
                df_merged['stop_time_unix'] = (df_merged['stop_time']
                                               - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

                # Interpolate timestamps for missing stop events
                df_merged['stop_time_unix'] = df_merged['stop_time_unix'].interpolate(
                    limit_area='inside')

                # Convert back to actual timestamps
                df_merged['stop_time'] = pd.to_datetime(df_merged['stop_time_unix'],
                                                        origin='unix', unit='s')

                # Fill missing data_frame_refs
                df_merged['data_frame_ref'] = df_merged['data_frame_ref'].fillna(data_frame_ref)

                # Drop unneeded columns
                df_merged = df_merged[['data_frame_ref', 'trip_id', 'stop_id',
                                       'stop_time', 'stop_time_unix']]

                # Remove NaNs (occurs if we are missing data at the start or end of a journey)
                df_merged = df_merged.dropna(subset=['stop_time'])

                # Add to final data frame
                df_final = pd.concat([df_final, df_merged])

            # Only bother with this if we actually have stop events...
            if df_final.shape[0] == 0:
                print("No stop events for {}, skipping...".format(data_frame_ref))
            else:
                pr.pandas_to_redshift(data_frame=df_final,
                                      redshift_table_name='stop_events',
                                      append=True)

    pr.close_up_shop()
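# The join_index/merge trick in raw_to_stops pairs each vehicle observation
# with the next observation of the same journey. A compact equivalent for
# reference (a sketch with made-up data, not the original implementation)
# uses groupby().shift(-1):
import pandas as pd

vm = pd.DataFrame({
    'journey_ref': ['a', 'a', 'a', 'b'],
    'recorded_time': pd.to_datetime(['2019-01-01 08:00', '2019-01-01 08:02',
                                     '2019-01-01 08:05', '2019-01-01 08:01']),
    'stop_point_ref': [1, 1, 2, 7],
})
nxt = vm.groupby('journey_ref')[['recorded_time', 'stop_point_ref']].shift(-1)
# A stop event is a change of stop_point_ref within the same journey
stops = vm[(nxt['stop_point_ref'].notna())
           & (vm['stop_point_ref'] != nxt['stop_point_ref'])].copy()
# Estimate the stop time as the midpoint between the two observations
stops['stop_time'] = vm['recorded_time'] + (nxt['recorded_time'] - vm['recorded_time']) / 2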
def durs_to_dists():
    connect_to_redshift()
    connect_to_s3()

    # Note: this processes data not already in distributions. Assumes we do
    # one hour at a time, no subdividing of hours.
    df = pr.redshift_to_pandas("""select a.*
        from (select data_frame_ref, departure_time_hour
              from trip_durations
              group by data_frame_ref, departure_time_hour) a
        left join (select data_frame_ref, departure_time_hour
                   from distributions_gamma
                   group by data_frame_ref, departure_time_hour) b
        on a.data_frame_ref = b.data_frame_ref
        and a.departure_time_hour = b.departure_time_hour
        where b.data_frame_ref is null
        and b.departure_time_hour is null
        and a.data_frame_ref < trunc(convert_timezone('US/Pacific', GETDATE()))
        order by a.data_frame_ref, a.departure_time_hour;""")

    # Randomize order, so we can get some samples from everywhere...
    df = df.sample(frac=1).reset_index(drop=True)

    n_days_hours = df.shape[0]

    # For each day and departure hour:
    for i, row in df.iterrows():
        data_frame_ref = row['data_frame_ref']
        departure_time_hour = row['departure_time_hour']
        print("Processing data_frame_ref {}, departure_time_hour {} ({} of {})".format(
            data_frame_ref, departure_time_hour, (i + 1), n_days_hours))

        # Calculate base timestamps for this day
        minutes = pd.DataFrame(np.arange(0, 60), columns=['minute'])
        minutes['key'] = 0

        df_hour = pr.redshift_to_pandas("""select *,
            date_trunc('min', departure_time) as departure_time_minute
            from trip_durations
            where data_frame_ref = '{}'
            and departure_time_hour = '{}'""".format(data_frame_ref, departure_time_hour))

        results = []
        n_dep_stops = len(df_hour['departure_stop_id'].unique())

        # For each departure stop:
        for j, departure_stop_id in enumerate(df_hour['departure_stop_id'].unique()):
            print("Processing departure_stop_id {} ({} of {})".format(
                departure_stop_id, (j + 1), n_dep_stops))

            # For each arrival stop:
            for k, arrival_stop_id in enumerate(
                    df_hour[df_hour['departure_stop_id'] ==
                            departure_stop_id]['arrival_stop_id'].unique()):

                # Select data
                df_dist = df_hour[(df_hour['departure_stop_id'] == departure_stop_id)
                                  & (df_hour['arrival_stop_id'] == arrival_stop_id)]

                # Create date array
                date = pd.DataFrame([departure_time_hour], columns=['departure_time_hour'])
                date['key'] = 0

                # Create base array of one row per minute in the hour
                base = date.merge(minutes)
                base['departure_time_minute'] = (base['departure_time_hour']
                                                 + pd.to_timedelta(base.minute, unit='m'))
                base = base[['departure_time_minute']]
                base['departure_time_minute_unix'] = (base['departure_time_minute']
                                                      - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

                df_dist = base.merge(df_dist, on='departure_time_minute', how='left')
                df_dist = df_dist.fillna(method='bfill')
                df_dist['total_journey_time'] = (df_dist['arrival_time_unix']
                                                 - df_dist['departure_time_minute_unix'])
                df_dist = df_dist.dropna(subset=['total_journey_time'])

                data = df_dist['total_journey_time']

                try:
                    # Fit gamma distribution to data. (Note: floc=True pins the
                    # location parameter at 1, since True == 1; floc=0, fixing
                    # the location at zero, is the more usual choice.)
                    params = st.gamma.fit(data, floc=True)
                    y, x = np.histogram(data)
                    x = (x + np.roll(x, -1))[:-1] / 2.0

                    # Separate parts of parameters
                    arg = params[:-2]
                    loc = params[-2]
                    scale = params[-1]

                    # Calculate fitted PDF and error with fit in distribution
                    pdf = st.gamma.pdf(x, loc=loc, scale=scale, *arg)
                    sse = np.sum(np.power(y - pdf, 2.0))

                    results.append([data_frame_ref, departure_time_hour, departure_stop_id,
                                    arrival_stop_id, arg[0], scale, sse])
                except Exception as e:
                    print(e)
                    continue

        # Only bother with this if we actually have distributions...
        if len(results) == 0:
            print("No distributions for data_frame_ref {}, departure_time_hour {}, skipping..."
                  .format(data_frame_ref, departure_time_hour))
        else:
            print("Writing distributions to Redshift...")
            df_results = pd.DataFrame(results,
                                      columns=['data_frame_ref', 'departure_time_hour',
                                               'departure_stop_id', 'arrival_stop_id',
                                               'shape', 'scale', 'sse'])
            pr.pandas_to_redshift(data_frame=df_results,
                                  redshift_table_name='distributions_gamma',
                                  append=True)

    pr.close_up_shop()
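# The shape/scale parameters written above can be turned back into summary
# statistics downstream; the gamma mean is shape * scale, which matches the
# "shape*scale as mean" column computed in the get_distributions query. A
# small sketch (the fitted values below are illustrative assumptions):
import scipy.stats as st

shape, scale = 2.5, 120.0          # illustrative fitted parameters
mean_journey_time = shape * scale  # 300.0 seconds
dist = st.gamma(shape, loc=0, scale=scale)
p90 = dist.ppf(0.9)                # 90th-percentile journey time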