def get_regression_features(zone_choice, arrival_choice=-1, cut_choice=-1, cap_mode_choice=2):
    """
    Builds regression inputs (features and discount target) from a zone's summary csv

    Args:
        zone_choice: Zone identifier, forwarded to get_zone_output_path and unstack_summary_df
        arrival_choice (int): Keep only rows whose 'arrival' equals this value; -1 keeps every row
        cut_choice (int): 0 or 1 keeps rows whose 'cut' equals that value, 2 keeps rows
            whose 'cut' differs from 2, any other value applies no cut filter
        cap_mode_choice (int): 0 -> ['eco', 'capacity'], 1 -> adds 'capacity_avg_day',
            anything else -> also adds 'capacity_avg_global'

    Returns:
        x (feature columns), y ('discount' column), df (the filtered unstacked dataframe)
    """
    zone_path, zone_file_prefix = get_zone_output_path(zone_choice, root)
    summary_csv = os.path.join(zone_path, zone_file_prefix + 'Summary.csv')

    df = unstack_summary_df(pd.read_csv(summary_csv), zone=zone_choice)
    # rows without a capacity are dropped before the remaining gaps are zero-filled
    df = df.dropna(subset=['capacity']).fillna(0)
    df['cut'] = to_categories(df['cut'])

    if cut_choice in (0, 1):
        df = df[df['cut'] == cut_choice]
    elif cut_choice == 2:
        df = df[df['cut'] != cut_choice]

    if arrival_choice != -1:
        df = df[df['arrival'] == arrival_choice]

    # each capacity mode extends the previous one by a single column
    feature_cols = ['eco', 'capacity']
    if cap_mode_choice != 0:
        feature_cols.append('capacity_avg_day')
        if cap_mode_choice != 1:
            feature_cols.append('capacity_avg_global')

    return df[feature_cols], df["discount"], df
def create_summary_by_zone(zones):
    """
    Builds the summary for every given zone and plots its arrivals

    For each zone the data files are loaded, the observed and active slots are
    derived, summarize() is called and the result is passed to plot_arrivals
    with a '60min' / 'mean' / 'ALL' configuration.
    NOTE(review): any files written come from the helpers called here
    (get_slots_observed, get_slots_active, summarize, plot_arrivals) - confirm there.

    Args:
        zones (iterable): Zone numbers
    """
    for zone in zones:
        zone_path, zone_file_prefix = get_zone_output_path(zone, data_path)
        df_avail, df_order, df_steer, df_cap = load_data_file_for_zone(
            zone, data_path)

        # only the plain observed slots are needed further down
        slots_observed, _ = get_slots_observed(
            zone_path, zone_file_prefix, df_avail, df_order.columns,
            check_existing=False)
        _, _, slots_offered, cslots_offered = get_slots_active(
            zone_path, zone_file_prefix, df_order, slots_observed)

        summary = summarize(
            zone_path, zone_file_prefix, df_avail, df_order, df_steer, df_cap,
            slots_offered, cslots_offered).reset_index()
        summary['EVENT_DTM'] = pd.to_datetime(summary['EVENT_DTM'])

        plot_arrivals(zone_path, zone_file_prefix, summary, '60min', 'mean', 'ALL')
def create_eco_and_discount_from_steering(zone):
    """
    Loads the data files of a zone (availability, order, steering, capacity)

    NOTE(review): the name and the earlier docstring promise eco and discount
    dataframes built from the steering file, but the visible body only loads the
    data and has no return statement, so the function currently returns None.

    Args:
        zone (str): Zone number in float
    """
    zone_path, zone_file_prefix = get_zone_output_path(zone, data_path)
    zone_frames = load_data_file_for_zone(zone, data_path)
    df_avail, df_order, df_steer, df_cap = zone_frames
def split_summary_by_day_and_cut():
    """
    Splits the zone summaries by arrival day and cut category

    For zones '700.0' and '500.0' the zone summary csv is read, arrivals are
    plotted per day, and for every (day, cut category) pair the matching rows are
    written to '<cut_prefix>Summary.csv' inside the cut folder returned by
    get_cut_path. Arrivals are plotted for a pair only when it has rows.

    Returns:
        None
    """
    days = ['0', '1', '2', '3', '4', '5', '6']
    cut_categories = ['BEFORE_CUT1', 'BEFORE_CUT2', 'MISSED_BOTH_CUTS']

    for zone in ['700.0', '500.0']:
        zone_path, zone_file_prefix = get_zone_output_path(zone, data_path)
        # zone[:-2] strips the trailing '.0' of the zone label
        summary_zone = pd.read_csv(
            os.path.join(zone_path, 'Zone_' + zone[:-2] + '_Summary.csv'))
        summary_zone['EVENT_DTM'] = pd.to_datetime(summary_zone['EVENT_DTM'])
        slots_zone_offered, _ = read_offered_slots(zone_path)

        for day in days:
            # NOTE(review): the prefix is combined with os.path.join, which inserts a
            # path separator between zone_file_prefix and '_Arrival_Day_' - confirm
            # that this is what plot_arrivals expects.
            plot_arrivals(
                os.path.join(zone_path, day),
                os.path.join(zone_file_prefix, '_Arrival_Day_' + day),
                summary_zone, '60min', 'mean', day)

            for cutcat in cut_categories:
                print(zone, day, cutcat)
                cut_path, cut_prefix = get_cut_path(day, cutcat, zone_path,
                                                    zone_file_prefix)
                summary = summary_zone.loc[
                    (summary_zone['ARRIVAL_CAT'] == cutcat)
                    & (summary_zone['ARRIVAL_DAY'] == int(day))]

                _, _, slots_offered, cslots_offered = get_slots_active(
                    cut_path, cut_prefix,
                    summary[slots_zone_offered + ['SLOT_CHOICE']],
                    slots_zone_offered)

                # first 16 summary columns, then the per-slot blocks of the offered slots
                cols = (
                    summary.columns[0:16].tolist()
                    + ['SLOTS_AVAILABLE', 'NO_PURCHASE'] + slots_offered
                    + ['CNO_PURCHASE'] + cslots_offered
                    + [col + '_Eco' for col in slots_offered]
                    + [col + '_Discount' for col in slots_offered]
                    + [col + '_Capacity' for col in slots_offered]
                )
                summary[cols].to_csv(
                    os.path.join(cut_path, cut_prefix + 'Summary.csv'), index=False)

                # pandas has no top-level `size` function (the old `pd.size(summary)`
                # raised AttributeError); DataFrame.empty is the supported check.
                if not summary.empty:
                    plot_arrivals(cut_path, cut_prefix, summary[cols],
                                  '60min', 'mean', day)
                    plt.close()
def get_regression_features_ranked(zone_choice, arrival_choice=-1, cut_choice=-1, cap_mode_choice=2):
    """
    Builds regression inputs from the ranked per-arrival-day summaries of a zone

    The seven 'RankData/<prefix>Arrival_Day_<d>_Summary.csv' files (d = 0..6) are
    concatenated, unstacked with unstack_summary_df_ranked, filtered, and the
    'slot' column is split at its first '_' into 'day' and 'slot', both of which
    are then one-hot encoded via col_to_one_hot.

    Args:
        zone_choice: Zone identifier, forwarded to get_zone_output_path and unstack_summary_df_ranked
        arrival_choice (int): Keep only rows whose 'arrival' equals this value; -1 keeps every row
        cut_choice (int): 0 or 1 keeps rows whose 'cut' equals that value, 2 keeps rows
            whose 'cut' differs from 2, any other value applies no cut filter
        cap_mode_choice (int): 0 -> 'capacity', 1 -> adds 'capacity_avg_day',
            anything else -> also adds 'capacity_avg_global'

    Returns:
        x (encoded feature dataframe), y ('discount' column), df (the filtered unstacked dataframe)
    """
    zone_path, zone_file_prefix = get_zone_output_path(zone_choice, root)
    path = os.path.join(zone_path, 'RankData',
                        zone_file_prefix + 'Arrival_Day_{}' + '_Summary.csv')

    # DataFrame.append was removed in pandas 2.0 and copied the accumulated frame on
    # every iteration; read all days first and concatenate once instead.
    df = pd.concat([pd.read_csv(path.format(day)) for day in range(7)])

    df = unstack_summary_df_ranked(
        df, zone=zone_choice).dropna(subset=['capacity'])
    df = df.fillna(0)
    df['cut'] = to_categories(df['cut'])

    if cut_choice in [0, 1]:
        df = df[df['cut'] == cut_choice]
    elif cut_choice == 2:
        df = df[df['cut'] != cut_choice]

    if arrival_choice != -1:
        df = df[df['arrival'] == arrival_choice]

    features = ['eco', 'arrival', 'cut', 'slot']
    if cap_mode_choice == 0:
        capacity_cols = ['capacity']
    elif cap_mode_choice == 1:
        capacity_cols = ['capacity', 'capacity_avg_day']
    else:
        capacity_cols = ['capacity', 'capacity_avg_day', 'capacity_avg_global']

    y = df["discount"]
    # copy so the column assignments below write to x itself rather than to a
    # slice of df (avoids chained-assignment warnings / silent no-ops)
    x = df[features + capacity_cols].copy()

    slot = x['slot'].str.split("_", n=1, expand=True)
    x['day'] = slot[0]
    x['slot'] = slot[1]
    x = col_to_one_hot(x, 'slot', prefix='slot', drop_first=True)
    x = col_to_one_hot(x, 'day', prefix='day', drop_first=True)
    return x, y, df
len(gr_cols)) print('Iteration=', i, 'loglikelihood =', log_likeli, 'beta_disc', beta_coef[-3], 'beta_eco', beta_coef[-2], 'beta_gr', beta_coef[-1]) if np.linalg.norm(beta_coef[:-1] - beta[:-1]) < 10**-6 or i > 500: predict_prob_df = pd.DataFrame(Q, columns=['NO_PURCHASE'] + slots_offered) beta_df = pd.DataFrame([np.array(beta_coef)], columns=['NO_PURCHASE'] + slots_offered + ['Discount', 'Eco', 'Gr']) predict_prob_df.to_csv(Location + filename + 'predprobfeatures.csv') beta_df.to_csv(Location + filename + 'betafeatures.csv') del summary, predict_prob_df, design_df, features_df, assortment_df, choice_df, design, features, assortment, choice break return beta_df.iloc[0] if __name__ == "__main__": zone_path, zone_file_prefix = get_zone_output_path(zone, root) summary = pd.read_csv( '/Users/anupamtripathi/PycharmProjects/RA_/results/gr/500.0/gr_unranked_stacked_arrival_0_cut_-1_cap_2.csv' ) summary = summary.drop(['NO_PURCHASE_Eco', 'NO_PURCHASE_Discount'], axis=1) summary = summary.dropna() slots_offered = pd.read_csv( os.path.join(zone_path, zone_file_prefix + 'SlotsOfferedTitle.csv')) MMfeaturesBoot(zone_path, zone_file_prefix, summary, slots_offered['slotsOffered'].tolist())
def unstack_summary_df_ranked(summary_df, zone, root='data', check_saved=False, save=False):
    """
    Unstacks the ranked summary dataframe into one row per (summary row, slot)

    For every observed slot plus 'NO_PURCHASE' a frame with the columns
    primary_key, arrival, cut, slot, day, capacity, discount, eco,
    capacity_avg_global, capacity_avg_day, order and avail is built; the frames
    are stacked and the 'slot' column is one-hot encoded (original column kept).

    NOTE: summary_df is modified in place - 'primary_key', 'capacity_sum_global'
    and 'capacity_sum_day_1' .. 'capacity_sum_day_7' are added to the caller's frame.

    Args:
        summary_df (dataframe): summary dataframe
        zone: Zone identifier, forwarded to get_zone_output_path and the slot helpers
        root (str): Root passed to get_zone_output_path
        check_saved (bool): Return the saved 'RankData/<prefix>unstacked.csv' if it exists
        save (bool): Write the result to 'RankData/<prefix>unstacked.csv'

    Returns:
        unstacked dataframe
    """
    zone_path, zone_file_prefix = get_zone_output_path(zone, root)
    unstacked_csv = os.path.join(zone_path, 'RankData',
                                 zone_file_prefix + 'unstacked.csv')
    # test the flag first so the filesystem is only touched when a cached file is wanted
    if check_saved and os.path.exists(unstacked_csv):
        return pd.read_csv(unstacked_csv, index_col=0)

    summary_df['primary_key'] = summary_df['EVENT_DTM'].astype(
        str) + '-' + summary_df['CUSTOMER_ID'].astype(str)
    slots_per_day = u.get_slots_per_day_for_zone_ranked(zone, root)
    slots, _ = u.get_slots_observed_ranked(zone_path, zone_file_prefix,
                                           summary_df)

    # get capacities sums
    summary_df['capacity_sum_global'] = summary_df.loc[
        :, slots[0] + '_Capacity':slots[-1] + '_Capacity'].fillna(0).sum(axis=1)

    # The per-day sums walk through consecutive column positions starting at the
    # first capacity column and taking slots_per_day[day] columns per day.
    # NOTE(review): relies on the '_Capacity' columns being contiguous and ordered by day.
    slot_end = list(summary_df.columns).index(slots[0] + '_Capacity')
    for day in range(1, 8):
        slot_start = slot_end
        slot_end = slot_start + slots_per_day[day]
        summary_df['capacity_sum_day_' + str(day)] = summary_df.iloc[
            :, slot_start:slot_end].fillna(0).sum(axis=1)

    summary_df = summary_df.fillna(0)

    # Collect the per-slot frames and concatenate once: DataFrame.append was removed
    # in pandas 2.0 and re-copied the growing frame on every iteration.
    per_slot_frames = []
    for n in tqdm(slots + ['NO_PURCHASE']):
        features = pd.DataFrame()
        features['primary_key'] = summary_df['primary_key']
        features['arrival'] = summary_df['ARRIVAL_DAY']
        features['cut'] = summary_df['ARRIVAL_CAT']
        features['slot'] = n
        features['day'] = n[0]
        if n != 'NO_PURCHASE':
            features['capacity'] = summary_df[n + '_Capacity']
            features['discount'] = summary_df[n + '_Discount']
            features['eco'] = summary_df[n + '_Eco']
            # average capacity of the *other* slots (globally / on the same day)
            # NOTE(review): divides by zero when there is a single slot (per day)
            features['capacity_avg_global'] = (
                summary_df['capacity_sum_global']
                - summary_df[n + '_Capacity']) / (len(slots) - 1)
            features['capacity_avg_day'] = (
                summary_df['capacity_sum_day_' + str(n[0])]
                - summary_df[n + '_Capacity']) / (slots_per_day[int(n[0])] - 1)
        else:
            # fixed placeholder values for the no-purchase alternative
            features['day'] = n
            features['capacity'] = 1
            features['discount'] = 0
            features['eco'] = 0
            features['capacity_avg_global'] = 1
            features['capacity_avg_day'] = 1
        features['order'] = summary_df[n]
        features['avail'] = summary_df['C' + n]
        per_slot_frames.append(features)
    df_slots = pd.concat(per_slot_frames)

    df_slots = u.col_to_one_hot(df_slots, 'slot', prefix='slot', delete=False)
    if save:
        df_slots.to_csv(unstacked_csv)
    return df_slots
def _rank_day_map(arrival_day):
    """Maps each slot weekday digit ('0'-'6') to its rank ('1'-'7') for an arrival day.

    The weekday right after the arrival day gets rank '1' and the arrival day
    itself gets rank '7'.
    """
    return {str(weekday): str((weekday - arrival_day - 1) % 7 + 1)
            for weekday in range(7)}


def create_rank_data_by_zone(zone, computername):
    """
    Rewrites a zone summary so that slots are named by rank relative to the arrival day

    Reads '<filename>Summary.csv', and for every arrival day 0..6 renames the slot
    columns (plain, 'C'-prefixed, '_Eco', '_Discount', '_Capacity') and the
    SLOT_CHOICE values from '<weekday>_<13-char hour key>...' to
    '<rank>_<hour label>...' using get_hour_list(zone). Each day is written to
    'RankData/<filename>Arrival_Day_<d>_Summary.csv'; the concatenation of all
    days, restricted to a fixed column set and extended by 'CUT2' and
    '<d>_ARRIVAL' indicator columns, is written to 'RankData/<filename>SummaryNew.csv'.

    Args:
        zone: Zone identifier, forwarded to get_zone_output_path and get_hour_list
        computername: Second argument of get_zone_output_path
    """
    location, filename = get_zone_output_path(zone, computername)
    hourlist = get_hour_list(zone)
    summary = pd.read_csv(os.path.join(location, filename + 'Summary.csv'))
    slots_offered, _ = u.get_slots_active_ranked(location, filename, summary)
    rank_folder = os.path.join(location, 'RankData')

    # all rank/hour combinations, rank-major
    slots_offered_rank = [str(rank) + '_' + hour
                          for rank in range(1, 8)
                          for hour in hourlist.values()]

    # membership sets built once instead of rebuilding lists for every column/value
    slot_names = set(slots_offered)
    slot_feature_names = slot_names.union(
        col + suffix
        for col in slots_offered
        for suffix in ('_Eco', '_Discount', '_Capacity'))
    cslot_names = {'C' + col for col in slots_offered}

    summaries = []
    for day in range(7):
        daylist = _rank_day_map(day)
        summary_day = summary[summary['ARRIVAL_DAY'] == day]

        def to_rank_name(name, daylist=daylist):
            # '<weekday>_<hour key>' + rest  ->  '<rank>_<hour label>' + rest
            return daylist[name[0]] + '_' + hourlist[name[2:15]] + name[15:]

        def to_rank_cname(name, daylist=daylist):
            # same as above for names carrying a one-character prefix ('C')
            return name[0] + daylist[name[1]] + '_' + hourlist[name[3:16]] + name[16:]

        summary_day = summary_day.rename(
            columns=lambda x: to_rank_name(x) if x in slot_feature_names else x)
        summary_day = summary_day.rename(
            columns=lambda x: to_rank_cname(x) if x in cslot_names else x)

        summary_day['SLOT_CHOICE'] = summary_day['SLOT_CHOICE'].fillna(
            'NO_PURCHASE')
        summary_day['SLOT_CHOICE'] = summary_day['SLOT_CHOICE'].apply(
            lambda row: to_rank_name(row) if row in slot_names else row)

        summary_day.to_csv(os.path.join(
            rank_folder, filename + 'Arrival_Day_{}_Summary.csv'.format(day)),
            index=False)
        summaries.append(summary_day)

    summary = pd.concat(summaries, axis=0, sort=False)
    print(slots_offered, list(summary.columns))

    choice_cols = ['NO_PURCHASE'] + slots_offered_rank
    # copy so the indicator columns below are added to an independent frame
    summary = summary[[
        'EVENT_DTM', 'CUSTOMER_ID', 'ORDER_ID', 'ARRIVAL_CAT', 'ARRIVAL_DAY',
        'DAY_OF_ORDER', 'CUT_OFF_1', 'CUT_OFF_2', 'SLOT_CHOICE', 'SLOT_STAMP',
        'WINDOWS_STEERING', 'ZONE', 'ESTIMATED_SUBTOTAL', 'TOTAL_TIMESLOTS',
        'CENSORED', 'TOTAL_ORDER'
    ] + choice_cols
      + ['C' + col for col in choice_cols]
      + [col + '_Eco' for col in slots_offered_rank]
      + [col + '_Discount' for col in slots_offered_rank]
      + [col + '_Capacity' for col in slots_offered_rank]].copy()

    summary['CUT2'] = (summary['ARRIVAL_CAT'] == 'BEFORE_CUT2').astype(int)
    for day in range(7):
        summary[str(day) + '_ARRIVAL'] = (summary['ARRIVAL_DAY'] == day).astype(int)

    summary.to_csv(os.path.join(rank_folder, filename + 'SummaryNew.csv'),
                   index=False)