def remove_columns(df):
    """Trim the dataframe down to the columns used for prediction.

    As a side effect, every row of the trimmed dataframe that holds at least
    one null value is written to ``null.csv`` in the current working directory.

    Args:
        df (pandas dataframe obj): Pandas dataframe that contains all of the
            columns listed under Returns.

    Returns:
        pandas dataframe obj: Pandas dataframe with only shipper, std_weight,
            zone, sender_state, recipient_state, distance, sender_pop,
            sender_pop_density, sender_houses, recipient_pop,
            recipient_pop_density, recipient_houses, same_msa, sender_in_msa,
            rec_in_msa, week_number, day_of_week, month, Y columns.
    """
    print("Removing unnecessary columns...")
    print(f"Starting with {report_df_stats(df)}\nRemoving columns...")
    timer_start = time.time()

    # Grouped by theme; the concatenation order is the output column order.
    shipment_cols = [
        'shipper', 'std_weight', 'zone', 'sender_state', 'recipient_state',
        'distance',
    ]
    sender_cols = ['sender_pop', 'sender_pop_density', 'sender_houses']
    recipient_cols = [
        'recipient_pop', 'recipient_pop_density', 'recipient_houses',
    ]
    msa_cols = ['same_msa', 'sender_in_msa', 'rec_in_msa']
    date_cols = ['week_number', 'day_of_week', 'month']
    wanted = (shipment_cols + sender_cols + recipient_cols + msa_cols +
              date_cols + ['Y'])

    trimmed = df[wanted]

    # Dump the incomplete rows (any null in the row) for later inspection.
    has_null = trimmed.isnull().any(axis=1)
    trimmed[has_null].to_csv('null.csv')

    print(f"Ending with {report_df_stats(trimmed)}.")
    utilities.print_elapsed_time(timer_start)
    return trimmed
def add_features(df):
    """Combines other functions to add features to dataframe.

    The input dataframe is modified in place (new columns are assigned on it)
    and also returned.

    Args:
        df (pandas dataframe obj): Pandas dataframe that must contain
            sender_zip, recipient_zip, shipment_date (datetime dtype), weight,
            package_count columns.

    Returns:
        pandas dataframe obj: Pandas dataframe with new columns sender_pop,
            sender_pop_density, sender_houses, recipient_pop,
            recipient_pop_density, recipient_houses, week_number, day_of_week,
            month, std_weight, distance, sender_in_msa, rec_in_msa, same_msa.
    """
    print("Adding features...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()

    # Create std_weight (weight/package count)
    df['std_weight'] = df['weight'] / df['package_count']

    # Add date time features based on shipment date
    print("Adding datetime features based on shipment date...")
    shipment_dt = df['shipment_date'].dt
    if hasattr(shipment_dt, 'isocalendar'):
        # `.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
        # isocalendar() yields a nullable UInt32 column; cast it to the dtype
        # the deprecated accessor used to return so downstream code sees the
        # same values (float64 only when there are missing dates).
        iso_week = shipment_dt.isocalendar().week
        df['week_number'] = iso_week.astype(
            'float64' if iso_week.hasnans else 'int64')
    else:
        # pandas < 1.1 has no isocalendar(); fall back to the old accessor.
        df['week_number'] = shipment_dt.week
    df['day_of_week'] = shipment_dt.dayofweek
    df['month'] = shipment_dt.month

    # Add distance between sender and recipient zips
    print("Adding distance...")
    df['distance'] = get_distance(df['sender_zip'].values,
                                  df['recipient_zip'].values)

    # Add sender_in_MSA, rec_in_MSA, same_MSA bools
    df = add_MSA_features(df)

    # Add population, population density, no. housing units.
    # NOTE(review): the return value is discarded here, so this relies on
    # add_zip_details adding its columns to `df` in place - confirm.
    add_zip_details(df)

    print(f"Ending with {report_df_stats(df)}.")
    utilities.print_elapsed_time(start_time)
    return df
def split_scale_data(X, y):
    """Split the data 80/20 into train and test sets and min-max scale it.

    The scaler is fitted on the training split only and then applied to the
    test split. Both the fitted scaler and the resulting arrays are written
    to disk, with file names derived from the module-level ``model_id``.

    Args:
        X (pandas dataframe obj): Pandas dataframe with dummified categorical
            features and other features.
        y (pandas dataframe obj): Pandas dataframe with target variable i.e.
            time window number.

    Returns:
        Dictionary of numpy arrays:
            X_train: the matrix of training data
            y_train: the array of training labels
            X_test: the matrix of testing data
            y_test: the array of testing labels
    """
    print("Splitting and scaling data...")
    print("Splitting data...")
    began = time.time()
    splits = train_test_split(X,
                              y,
                              train_size=0.8,
                              test_size=0.2,
                              random_state=71)
    train_features, test_features, train_labels, test_labels = splits

    # Fit the scaler on the training split only, then reuse it for the test split
    print("Scaling data...")
    min_max = preprocessing.MinMaxScaler()
    train_features = min_max.fit_transform(train_features)
    test_features = min_max.transform(test_features)

    # Persist the scaler so prediction can apply the same scaling
    print("Saving scaler...")
    scaler_file = "scaler_" + model_id + ".pkl.z"
    scaler_path = os.path.join(paths.model_scaler_dir, scaler_file)
    joblib.dump(min_max, scaler_path)
    print("Scaler saved in", scaler_path)

    datadict = {
        'X_train': train_features,
        'y_train': train_labels,
        'X_test': test_features,
        'y_test': test_labels,
    }

    # Persist the four arrays under their dictionary keys
    print("\nSaving data dictionary...")
    datadict_file = "datadict_" + model_id + ".npz"
    datadict_path = os.path.join(paths.data_delivery_prediction_datadict_dir,
                                 datadict_file)
    np.savez(datadict_path, **datadict)
    print("Data dictionary saved in", datadict_path)

    print("\nTraining and testing dataset sizes")
    print("X_train", train_features.shape, "y_train", train_labels.shape)
    print("X_test", test_features.shape, "y_test", test_labels.shape)
    utilities.print_elapsed_time(began)
    return datadict
def one_hot_encode(df):
    """Categorize and apply one-hot-encoding to all categorical columns.

    Nulls in float columns are replaced with the column mean and nulls in
    categorical columns with the column mode before encoding. The feature
    names (before and after encoding) are saved to an ``.npz`` file named
    after the module-level ``model_id``. The dtype conversion and null
    replacement are assigned on the input dataframe itself.

    Args:
        df (pandas dataframe obj): Pandas dataframe with only columns used
            for prediction

    Returns:
        X (pandas dataframe obj): Pandas dataframe with dummified categorical
            features and other features.
        y (pandas dataframe obj): Pandas dataframe with target variable i.e.
            time window number.
    """
    print("Categorizing columns...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()

    cat_cols = [
        'shipper', 'zone', 'sender_state', 'recipient_state', 'week_number',
        'day_of_week', 'month'
    ]
    float_cols = [
        'std_weight', 'distance', 'sender_pop', 'sender_pop_density',
        'sender_houses', 'recipient_pop', 'recipient_pop_density',
        'recipient_houses'
    ]
    df[cat_cols] = df[cat_cols].astype('category')
    df[float_cols] = df[float_cols].astype('float64')

    print("Number of nulls in each column:")
    print(df.isnull().sum())

    print("\nReplacing nulls with mean/mode...")
    df[float_cols] = df[float_cols].fillna(df[float_cols].mean())
    # DataFrame.mode() returns a DataFrame (one row per tied mode). Passing it
    # straight to fillna aligns on the row index and only fills rows labelled
    # like the mode rows, so take the first mode row as per-column fill values.
    cat_modes = df[cat_cols].mode()
    if not cat_modes.empty:
        # dropna() skips columns that have no mode (entirely null columns).
        df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0].dropna())

    print("Number of nulls in each column after replacement:")
    print(df.isnull().sum(), "\n")

    print("Applying one hot encoding on categorical columns...")
    y = df.Y
    df = df.drop(columns=['Y'])

    # Record feature names for current model
    feature_names = {'feature_names': df.columns.values}
    X = pd.get_dummies(df)
    feature_names['feature_names_dummified'] = X.columns.values
    print("\nFeature names:", feature_names['feature_names_dummified'])

    # Save feature names for use in prediction
    features_path = os.path.join(paths.data_delivery_prediction_features_dir,
                                 "feature_names_" + model_id + ".npz")
    np.savez(features_path,
             feature_names=feature_names['feature_names'],
             feature_names_dummified=feature_names['feature_names_dummified'])
    print(f"Feature names stored in {features_path}\n")

    print(f"Ending with {report_df_stats(X)}.")
    utilities.print_elapsed_time(start_time)
    return X, y
def add_time_windows(df):
    """Adds time windows as target variable, based on delivery date and time.

    Calculates number of business days (excl weekends, shipment date, federal
    holidays) for delivery. Adds delivery time (as a fraction of a day) to the
    number of business days to get time-in-transit. Window thresholds come
    from create_time_window_thresholds(). Lastly assigns time windows to
    shipments based on time-in-transit.

    The days_in_transit and days_taken_float columns are assigned on the
    input dataframe; the returned dataframe is a filtered copy of it.

    Args:
        df (pandas dataframe obj): Pandas dataframe that must contain
            shipment_date, delivery_date, delivery_time.

    Returns:
        pandas dataframe obj: Pandas dataframe with new columns
            days_in_transit, days_taken_float (time-in-transit),
            Y (target variable i.e. time window)
    """
    print("Adding time windows i.e. target variable...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()

    # Calculate days in transit (exclude shipment date, holidays, weekends).
    # The holiday list has to reach the latest delivery date as well:
    # a holiday between the last shipment and its delivery must not be
    # counted as a business day.
    first_date = df['shipment_date'].min()
    last_date = max(df['shipment_date'].max(), df['delivery_date'].max())
    holiday_calendar = USFederalHolidayCalendar()
    holidays = holiday_calendar.holidays(first_date, last_date).date.tolist()
    shipment_dates = [d.date() for d in df['shipment_date']]
    delivery_dates = [d.date() for d in df['delivery_date']]
    # -1 because we will add transit time
    df['days_in_transit'] = np.busday_count(
        shipment_dates, delivery_dates, holidays=holidays) - 1

    # Convert days in transit/delivery time to days taken (with decimals)
    # e.g. if parcel reaches at 12.00pm on 2nd business day, days taken is 1.5
    seconds_per_day = timedelta(days=1).total_seconds()
    delivery_percentage_of_day = [
        timedelta.total_seconds(d) / seconds_per_day
        for d in df['delivery_time']
    ]
    df['days_taken_float'] = df['days_in_transit'] + delivery_percentage_of_day

    # Keep rows whose days_in_transit is -1 .. max_days_to_keep - 1
    # (np.arange excludes its stop value). The rest are rare occurrences.
    max_days_to_keep = 5
    kept_days = np.arange(-1, max_days_to_keep)
    # .copy() so that adding the Y column below writes to an independent frame
    df = df[df['days_in_transit'].isin(kept_days)].copy()

    # Assign time windows. Only days_taken_float is needed, so apply over
    # that single column instead of building a row object per shipment.
    time_window_thresholds = create_time_window_thresholds()
    tqdm.pandas(desc="Assign time window")
    df['Y'] = df['days_taken_float'].progress_apply(
        lambda days_taken: assign_time_window(days_taken,
                                              time_window_thresholds))

    print(f"Ending with {report_df_stats(df)}.")
    utilities.print_elapsed_time(start_time)
    return df
def format_cost_savings(pred, pred_proba, rates_df):
    """Formats dataframe for predict one output

    Adds the predicted time window and the cumulative probability of arriving
    in or before each rate's scheduled window. The two new columns are
    assigned on ``rates_df`` itself; the returned dataframe is a new one
    without the scheduled_window_no column.

    Args:
        pred (list): Predicted ground delivery time window number for the
            single shipment (a one-element list/array, or a scalar).
        pred_proba (array): Predicted probability distribution across time
            windows for the single shipment. Any shape holding one
            probability per window is accepted, e.g. (1, no. time windows);
            it is flattened before use.
        rates_df (pandas dataframe obj): Dataframe with service type, ship
            cost, ground cost, cost savings, scheduled window numbers,
            scheduled time windows as columns

    Returns:
        Pandas dataframe object: Dataframe with service type, ship cost,
            ground cost, cost savings, scheduled time windows, predicted time
            windows, cumulative probability as columns

    Raises:
        TypeError: If rates_df is None (no rates were returned).
    """
    start_time = time.time()
    print("Calculating cost savings and probabilities...")

    # Load dict that maps time window numbers to time windows
    windows_cmu = joblib.load(paths.windows_cmu)

    if rates_df is None:
        # Previously this case was caught as a TypeError, reported, and then
        # failed with another TypeError on the next statement. Keep the
        # message and the exception type, but fail in one explicit place.
        print("Shipper did not return any results")
        raise TypeError("rates_df is None: there are no rates to format")

    # Insert predicted time window into dataframe.
    # `pred` is a sequence and cannot index the mapping directly, so look up
    # its single element. The value is repeated per row explicitly so that it
    # is stored correctly whether or not the window is itself a sequence.
    pred_window_no = np.asarray(pred).ravel()[0]
    pred_window = windows_cmu[pred_window_no]
    rates_df['pred_ground_window'] = [pred_window] * len(rates_df)

    # Flatten so that slicing runs over the time windows regardless of
    # whether the distribution came in as (1, n), (n, 1) or (n,).
    window_proba = np.asarray(pred_proba, dtype='float64').ravel()

    # Sum of the distribution up to and including the scheduled window is the
    # predicted cumulative probability of arriving before or in that window.
    cumulative = [
        window_proba[:int(window_no) + 1].sum()
        for window_no in rates_df['scheduled_window_no']
    ]

    # Converts fraction into percentage for display
    rates_df['pred_probability'] = [
        "{0:.2f}%".format(val * 100) for val in cumulative
    ]

    # Drop columns that are not needed for display
    rates_df = rates_df.drop(columns=['scheduled_window_no'])
    utilities.print_elapsed_time(start_time)
    print(" ")
    return rates_df
def full_query(start_year_month, end_year_month, frac):
    """Extracts a fraction of raw shipping records from the database without batch method.

    Does not query by batch: one query covers the whole date range. Faster
    but requires more RAM. The extracted records are then preprocessed in
    chunks. Also disables pandas' chained-assignment warning globally.

    Args:
        start_year_month (str): Start date in YYYY-MM format.
        end_year_month (str): End date in YYYY-MM format.
        frac (str): String representation of fraction of data to extract.

    Returns:
        pandas dataframe obj: Dataframe with the preprocessed records (empty
            if the query returned nothing).
    """
    # Establishes connection to a MySQL db
    print(f"Connecting to {credentials.db}...")
    start_time = time.time()
    db = MySQLdb.connect(credentials.host, credentials.user,
                         credentials.password, credentials.db)
    utilities.print_elapsed_time(start_time)

    try:
        print("Extracting records...")
        extraction_start_time = time.time()
        # Append day 1 to year and month to create datetime object.
        # Day does not affect result
        start_date = datetime.strptime(start_year_month + "-1",
                                       "%Y-%m-%d").date()
        end = datetime.strptime(end_year_month + "-1", "%Y-%m-%d").date()
        # Extend the end of the range to the last day of the end month
        last_date_of_month = calendar.monthrange(end.year, end.month)[1]
        end_date = f"{end.year}-{end.month}-{last_date_of_month}"

        # Set warning for chained assignment to None.
        pd.options.mode.chained_assignment = None

        results = query(db, start_date, end_date, frac)
    finally:
        # The connection is only needed for the query itself
        db.close()

    num_results = len(results)
    print(f"{num_results} records extracted.")
    utilities.print_elapsed_time(extraction_start_time)

    cleaned_chunks = []
    if num_results > 0:
        num_batches = 100
        # At least 1: with fewer records than batches the integer division
        # gives 0, which is not a valid range step.
        chunk_size = max(1, num_results // num_batches)
        print(f"Cleaning {num_results} records...")
        for start in trange(0, num_results, chunk_size):
            df_subset = results.iloc[start:start + chunk_size]
            cleaned_chunks.append(preprocess(df_subset))

    # Concatenate once instead of growing a dataframe chunk by chunk
    if cleaned_chunks:
        records = pd.concat(cleaned_chunks, ignore_index=True)
    else:
        records = pd.DataFrame()
    print(f"Returning {len(records)} records.\n")
    return records
def store(records, frac):
    """Dump the given records to a compressed pickle file.

    The file name comes from create_filename(frac) and the file is written
    into paths.data_extracted_dir with a ``.pkl.z`` extension.

    Args:
        records (pandas dataframe obj): Dataframe after general preprocessing.
        frac (str): String representation of fraction of data to extract.

    Returns:
        str: Output file path
    """
    print("Saving records...")
    began = time.time()

    target = os.path.join(paths.data_extracted_dir,
                          create_filename(frac) + ".pkl.z")
    joblib.dump(records, target)

    print(f"Data extracted and stored in {target}")
    utilities.print_elapsed_time(began)
    return target
def predict_time_windows(test, model):
    """Run the given model on the test input.

    Args:
        test (array): Input for model prediction.
        model (obj): Loaded model for prediction; must provide ``predict``
            and ``predict_proba``.

    Returns:
        pred: Output of ``model.predict(test)``, i.e. the predicted ground
            delivery time window numbers.
        pred_proba: Output of ``model.predict_proba(test)``, i.e. the
            predicted probability distribution across time windows.
    """
    began = time.time()
    print("Loading trained model and predicting time windows...")

    # Class prediction first, then the full distribution
    outputs = (model.predict(test), model.predict_proba(test))

    utilities.print_elapsed_time(began)
    return outputs
def remove_rows(df):
    """Removes unnecessary rows from dataframe.

    Keeps only "Ground" and "Home Delivery" shipments that have a delivery
    time other than 23:59:59 / 00:12:00 and whose sender and recipient
    zipcodes are 5-character numeric strings (after trimming whitespace).

    Args:
        df (pandas dataframe obj): Pandas dataframe with std_service_type,
            delivery_time, recipient_zip and sender_zip columns.

    Returns:
        pandas dataframe obj: Pandas dataframe with rows removed.
    """
    print("Removing unnecessary rows...")
    print(f"Starting with {report_df_stats(df)}")
    start_time = time.time()

    # Keep Ground and Home Delivery since both are day definite
    print(
        "Removing non-ground service types since we are only predicting for ground deliveries..."
    )
    services_kept = ["Ground", "Home Delivery"]
    df = df[df['std_service_type'].isin(services_kept)]

    # Keep rows with delivery time
    print("Removing rows without delivery time...")
    df = df.dropna(subset=['delivery_time'])

    # Remove rows that appear too often with abnormal delivery time = 23:59:59 and 00:12:00
    print("Removing rows with anomalous delivery time 23:59:59, 00.12.00...")
    df = df[~df['delivery_time'].isin(['23:59:59', '00:12:00'])]

    # Trim empty spaces / Replace empty strings with NA / Remove NA
    print("Removing rows with malformed zipcodes...")
    zip_cols = ['recipient_zip', 'sender_zip']
    # Work on an independent copy and assign the cleaned columns back
    # explicitly: an in-place replace on a selected column does not reliably
    # write through to the dataframe (it never does under Copy-on-Write).
    df = df.copy()
    for col in zip_cols:
        df[col] = df[col].str.strip().replace('', np.nan)
    df = df.dropna(subset=zip_cols)

    # Remove zipcodes with alphabets and zipcodes whose length != 5
    for col in zip_cols:
        zips = df[col]
        df = df[zips.str.isnumeric() & (zips.str.len() == 5)]

    print(f"Ending with {report_df_stats(df)}.")
    utilities.print_elapsed_time(start_time)
    return df
def format_batch_results(pred, pred_proba, df):
    """Formats dataframe for output.

    Adds scheduled/predicted time windows and cumulative arrival
    probabilities, prints a summary table and saves the results together
    with the input features to an xlsx file in
    paths.output_delivery_prediction_dir. The new columns are assigned on the
    input ``df``; the returned dataframe is a new one without the feature
    columns.

    Args:
        pred (sequence): Predicted ground delivery time window no. for each
            shipment (row of df), in row order.
        pred_proba (2D array): Predicted probability distribution across time
            windows, one row per shipment in the same order as df.
            Shape of (no. of test shipments, no. time windows).
        df (pandas dataframe obj): Pandas dataframe with scheduled_window_no
            and the feature columns that are dropped before display.

    Returns:
        Pandas dataframe object: Dataframe with scheduled time windows,
            predicted time windows, cumulative probability as columns.
    """
    print("Formatting batch results...")
    start_time = time.time()
    # Snapshot of the columns present before any result column is added
    df_features = df.copy(deep=False)

    # Load dict that maps time window numbers to time windows
    windows_cmu = joblib.load(paths.windows_cmu)

    # windows_cmu maps time window number to corresponding time window.
    # Values are assigned positionally so the result does not depend on the
    # dataframe having a default 0..n-1 index.
    scheduled_window_nos = list(df['scheduled_window_no'])
    df['scheduled_window'] = [windows_cmu[no] for no in scheduled_window_nos]
    df['pred_ground_window'] = [windows_cmu[window_no] for window_no in pred]

    proba = np.asarray(pred_proba, dtype='float64')

    # Sum of each shipment's distribution up to and including its scheduled
    # window is the predicted cumulative probability of the shipment arriving
    # before or in the scheduled window.
    df['prob_arrive_by_scheduled_window'] = [
        shipment_proba[:int(no) + 1].sum()
        for shipment_proba, no in zip(proba, scheduled_window_nos)
    ]

    # Get cumulative probability for all time windows for each shipment
    col_to_format = ['prob_arrive_by_scheduled_window']
    for window_no in list(windows_cmu.keys()):
        col_name = "prob_arrive_by_window_" + str(window_no)
        df[col_name] = proba[:, :window_no + 1].sum(axis=1)
        col_to_format.append(col_name)

    # Converts fraction into percentage for display
    for col_name in col_to_format:
        as_percent = df[col_name].astype('float64') * 100
        df[col_name] = as_percent.map("{0:.2f}%".format)

    # Drop columns that are not needed for prediction display
    df = df.drop(columns=[
        'scheduled_window_no', 'week_number', 'day_of_week', 'month',
        'distance', 'sender_in_msa', 'rec_in_msa', 'same_msa', 'sender_pop',
        'sender_pop_density', 'sender_houses', 'sender_state',
        'recipient_pop', 'recipient_pop_density', 'recipient_houses',
        'recipient_state'
    ])

    # Create df just for display: keep only the scheduled-window probability,
    # not the per-window probability columns
    df_display = df.drop(columns=col_to_format[1:])
    print(
        tabulate(df_display,
                 headers='keys',
                 showindex=False,
                 floatfmt=".2f",
                 tablefmt='psql'))
    utilities.print_elapsed_time(start_time)

    # Save to xlsx format
    timestamp = utilities.get_timestamp()
    output_path = os.path.join(paths.output_delivery_prediction_dir,
                               timestamp + "_predict_batch.xlsx")
    print("\nSaving results...")
    with pd.ExcelWriter(output_path) as writer:
        df.to_excel(writer, sheet_name='Predicted Window Probability')
        df_features.to_excel(writer, sheet_name='Features')
    print("Results saved to " + output_path + "\n")
    return df
def preprocess_batch(df, feature_names, scaler):
    """Combines all preprocessing functions to preprocess data for prediction.

    The datetime and distance features are assigned on the input dataframe.

    Args:
        df (pandas dataframe obj): Pandas dataframe after validation passed.
            Must contain shipment_date, sender_zip, recipient_zip, shipper,
            weight and zone columns.
        feature_names (npz object): Dictionary of numpy arrays containing
            feature names; 'feature_names_dummified' is used to align the
            columns with those the model was trained on.
        scaler (obj): Scaler corresponding to selected model.

    Returns:
        df (pandas dataframe obj): Preprocessed pandas dataframe with new
            features.
        X_test (array): Test data to be used for prediction.
    """
    print("Preprocessing batch...")
    start_time = time.time()

    print("Adding datetime features")
    # Get date time features
    df['shipment_date'] = pd.to_datetime(df['shipment_date'])
    shipment_dt = df['shipment_date'].dt
    if hasattr(shipment_dt, 'isocalendar'):
        # `.dt.week` was deprecated in pandas 1.1 and removed in 2.0. Cast
        # the nullable UInt32 result to the dtype the old accessor returned.
        iso_week = shipment_dt.isocalendar().week
        df['week_number'] = iso_week.astype(
            'float64' if iso_week.hasnans else 'int64')
    else:
        # pandas < 1.1 has no isocalendar(); fall back to the old accessor.
        df['week_number'] = shipment_dt.week
    df['day_of_week'] = shipment_dt.dayofweek
    df['month'] = shipment_dt.month

    print("Adding distance...")
    # Get distance
    df['distance'] = delivery_prediction_preprocess.get_distance(
        df['sender_zip'].values, df['recipient_zip'].values)

    # Get MSA details
    df = delivery_prediction_preprocess.add_MSA_features(df)

    # Get zip details
    df = add_zip_details(df)

    # Add scheduled time window
    print("Adding scheduled time windows")
    df = add_scheduled_windows(df)

    # Decide on columns to keep
    columns_kept = [
        'shipper', 'weight', 'zone', 'sender_state', 'recipient_state',
        'distance', 'sender_pop', 'sender_pop_density', 'sender_houses',
        'recipient_pop', 'recipient_pop_density', 'recipient_houses',
        'same_msa', 'sender_in_msa', 'rec_in_msa', 'week_number',
        'day_of_week', 'month'
    ]
    predict_df = df[columns_kept].fillna(0)

    cat_cols = [
        'shipper', 'zone', 'week_number', 'day_of_week', 'sender_state',
        'recipient_state', 'month'
    ]
    float_cols = [
        'weight', 'distance', 'sender_pop', 'sender_pop_density', 'same_msa',
        'sender_in_msa', 'rec_in_msa', 'sender_houses', 'recipient_pop',
        'recipient_pop_density', 'recipient_houses'
    ]
    predict_df[cat_cols] = predict_df[cat_cols].astype('category')
    predict_df[float_cols] = predict_df[float_cols].astype('float64')

    # Columns the model was trained on (read once from the npz object)
    trained_columns = list(feature_names['feature_names_dummified'])

    # The alignment below drops every column the model does not know and
    # zero-fills every column that is missing. If the model was trained on
    # 'std_weight' rather than 'weight', the weight would be silently
    # replaced by 0, so feed it in under the trained name.
    # NOTE(review): assumes the batch 'weight' is already a per-package
    # weight, i.e. comparable to the trained std_weight - confirm.
    if 'std_weight' in trained_columns and 'weight' not in trained_columns:
        predict_df = predict_df.rename(columns={'weight': 'std_weight'})

    print("One-hot-encoding features...")
    # Dummify dataframe
    predict_df = pd.get_dummies(predict_df)

    # Create empty dataframe in same shape as the one used in model
    df_full = pd.DataFrame(columns=trained_columns)

    # Execute a right join to align our test dataframe with full dataframe;
    # columns missing from the test dataframe are filled with 0s
    predict_df, _ = predict_df.align(df_full,
                                     join='right',
                                     axis=1,
                                     fill_value=0)

    # Convert dataframe to numpy array for prediction
    X_test = predict_df.values

    print("Scaling data with saved scaler...")
    # Scale data with saved min-max scaler
    X_test = scaler.transform(X_test)
    utilities.print_elapsed_time(start_time)
    return df, X_test
def preprocess_one(shipment_date, shipper, std_weight, sender_zip,
                   recipient_zip, scaler, feature_names):
    """Preprocesses input to create features that model will predict on.

    Args:
        shipment_date (str): Shipment date in YYYY-MM-DD format
        shipper (str): Shipper name. only accepts FedEx or UPS.
        std_weight (float): Shipment weight
        sender_zip (str): Sender 5-digit zipcode
        recipient_zip (str): Recipient 5-digit zipcode
        scaler (obj): Scaler based on model
        feature_names (npz object): Dictionary of numpy arrays that contain
            feature names ('feature_names' before one-hot-encoding,
            'feature_names_dummified' after)

    Returns:
        test (np array object): Input for model prediction, shape
            (1, no. of dummified features).
        rates_df (pandas dataframe obj): Dataframe with service type, ship
            cost, ground cost, cost savings, scheduled window numbers,
            scheduled time windows as columns
    """
    print("Preprocessing input...")
    start_time = time.time()

    # Get datetime features
    week_number, day_of_week, month = get_date_details(shipment_date)

    # Get sender_in_msa and recipient_in_MSA and same_msa booleans
    sender_in_msa, rec_in_msa, same_msa = get_msa_details(
        sender_zip, recipient_zip)

    # Get distance
    distance = round(get_distance(sender_zip, recipient_zip), 5)

    # Get population, density, no. houses, state code for recipient and sender
    search = SearchEngine()
    recipient_pop, recipient_pop_density, recipient_houses, recipient_state = get_zip_details(
        recipient_zip, search)
    sender_pop, sender_pop_density, sender_houses, sender_state = get_zip_details(
        sender_zip, search)

    # Get rates dataframe and zone
    rates_df, zone = get_shippo_details(shipper, std_weight, sender_zip,
                                        sender_state, recipient_zip,
                                        recipient_state)
    if rates_df is None:
        # NOTE(review): this starts a second copy of the program in a child
        # process and, once that exits, execution continues below with
        # rates_df still None. Kept as is because callers may rely on the
        # retry; consider raising an error the caller can handle instead.
        print('Shippo Api Error Ocurred: Please Try Again')
        os.system('python main.py')

    # Create empty dataframe with correct columns that model was trained on
    df = pd.DataFrame(columns=feature_names['feature_names'])

    # Add new row into df with test data. The values are positional, so
    # their order must match the order of feature_names['feature_names'].
    df.loc[0] = [
        shipper, std_weight, zone, sender_state, recipient_state, distance,
        sender_pop, sender_pop_density, sender_houses, recipient_pop,
        recipient_pop_density, recipient_houses, same_msa, sender_in_msa,
        rec_in_msa, week_number, day_of_week, month
    ]

    # Define categorical and float columns for one-hot-encoding purposes
    cat_cols = [
        'shipper', 'zone', 'week_number', 'day_of_week', 'sender_state',
        'recipient_state', 'month'
    ]
    float_cols = [
        'std_weight', 'distance', 'sender_pop', 'sender_pop_density',
        'sender_houses', 'recipient_pop', 'recipient_pop_density',
        'recipient_houses'
    ]
    df[cat_cols] = df[cat_cols].astype('category')
    df[float_cols] = df[float_cols].astype('float64')

    # Columns the model was trained on (read once from the npz object)
    trained_columns = list(feature_names['feature_names_dummified'])

    # The MSA flags can still be object dtype at this point, in which case
    # get_dummies would encode them (e.g. as 'same_msa_True') and the
    # alignment below would then drop them and zero-fill the real column.
    # When the model was trained on the plain flag columns, make them
    # numeric so they pass through unchanged.
    flag_cols = [
        col for col in ('same_msa', 'sender_in_msa', 'rec_in_msa')
        if col in trained_columns and col in df.columns
    ]
    if flag_cols:
        df[flag_cols] = df[flag_cols].astype('float64')

    # Create one-hot-encoded variables from categorical columns
    df = pd.get_dummies(df)

    # Create empty dataframe in same shape as the one used in model
    df_full = pd.DataFrame(columns=trained_columns)

    # Execute a right join to align test dataframe with dataframe that model
    # was trained on; columns missing from the test row are filled with 0s
    df, _ = df.align(df_full, join='right', axis=1, fill_value=0)

    # Convert dataframe to numpy array for prediction
    test = df.loc[0].values

    # Scale data with saved min-max scaler
    test = test.reshape(1, -1)
    test = scaler.transform(test)
    utilities.print_elapsed_time(start_time)
    return test, rates_df
def train(datadict, model_id, n_estimators=25, max_depth=50):
    """Fit a random forest classifier, then save the model and its statistics.

    Only the n_estimators and max_depth hyperparameters are exposed to the
    caller; the remaining ones are fixed in the constructor call below. The
    fitted model is dumped to paths.model_dir and an Excel workbook with
    accuracy, feature importance, classification report and model parameters
    is written to paths.output_delivery_prediction_stats_dir. Both file names
    carry the test accuracy and model_id.

    Args:
        datadict (dict): Dictionary of numpy arrays containing preprocessed
            train and test data (X_train, y_train, X_test, y_test).
        model_id (str): Timestamp used to identify model, scaler and feature
            names files.
        n_estimators (str): Number of trees in forest. Less likely to overfit
            with more trees.
        max_depth (str): The maximum depth of the tree. More likely to overfit
            if depth is large.
    """
    # Both hyperparameters may arrive as strings; the classifier needs ints
    tree_count = int(n_estimators)
    depth_limit = int(max_depth)

    print("\nTraining model...")
    began = time.time()

    features_train, labels_train = datadict['X_train'], datadict['y_train']
    features_test, labels_test = datadict['X_test'], datadict['y_test']

    forest = RandomForestClassifier(n_estimators=tree_count,
                                    max_depth=depth_limit,
                                    verbose=1,
                                    n_jobs=-1,
                                    bootstrap=False)

    print("\nFitting model...")
    print("Parameters used:", forest.get_params())
    forest.fit(features_train, labels_train)

    print("\nPredicting results...")
    predicted = forest.predict(features_test)

    print("\nCalculating accuracy...")
    accuracy_df = get_accuracy_windows(1, labels_test, predicted)
    accuracy = accuracy_score(labels_test, predicted) * 100
    # Shared prefix of the model and stats file names
    file_prefix = "acc-" + f"{accuracy:.2f}"

    # Save model
    print("\nSaving model...")
    model_file = file_prefix + "-model_" + model_id + ".pkl.z"
    model_path = os.path.join(paths.model_dir, model_file)
    print(f"Model saved in {model_path}")
    joblib.dump(forest, model_path)

    # Collect the model stats, keyed by the sheet each one is written to
    sheets = {
        'Accuracy': accuracy_df,
        'Feature Importance': get_feature_importance(forest, model_id),
        'Classification Report': get_classification_report(labels_test,
                                                           predicted),
        'Model Parameters': get_params(forest),
    }

    # Save stats to excel
    print("\nSaving model stats...")
    stats_file = file_prefix + "-stats_" + model_id + ".xlsx"
    stats_path = os.path.join(paths.output_delivery_prediction_stats_dir,
                              stats_file)
    print(f"Stats saved in {stats_path}")
    with pd.ExcelWriter(stats_path) as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, sheet_name=sheet_name)
    utilities.print_elapsed_time(began)
def batch_query(start_year_month, end_year_month, frac):
    """Extracts a fraction of raw shipping records from the database with batch method.

    Runs one query per calendar month from the start month to the end month
    (both inclusive) and preprocesses each month's records as they arrive.
    Also disables pandas' chained-assignment warning globally.

    Args:
        start_year_month (str): Start date in YYYY-MM format.
        end_year_month (str): End date in YYYY-MM format.
        frac (str): String representation of fraction of data to extract.

    Returns:
        pandas dataframe obj: Dataframe with batch query records (empty if no
            month returned any records).
    """
    # Establishes connection to a MySQL db
    print(f"Connecting to {credentials.db}...")
    start_time = time.time()
    db = MySQLdb.connect(credentials.host, credentials.user,
                         credentials.password, credentials.db)
    utilities.print_elapsed_time(start_time)

    monthly_records = []
    try:
        print("Extracting and preprocessing records...")
        extraction_start_time = time.time()
        # Append day 1 to year and month to create datetime object.
        # Day does not affect result
        start = datetime.strptime(start_year_month + "-1", "%Y-%m-%d").date()
        end = datetime.strptime(end_year_month + "-1", "%Y-%m-%d").date()

        # Set warning for chained assignment to None.
        pd.options.mode.chained_assignment = None

        # Calculate number of months between start and end year/month
        delta = relativedelta.relativedelta(end, start)
        num_batches = delta.years * 12 + delta.months + 1
        month = start.month
        year = start.year

        # tqdm progress bar reference: https://github.com/tqdm/tqdm
        pbar = trange(num_batches)
        for _ in pbar:
            pbar.set_description(
                f"Querying {year} {calendar.month_abbr[month]}")
            # First day of month in batch
            first_date_of_month = 1
            start_date = f"{year}-{month}-{first_date_of_month}"
            # Last day of month in batch
            last_date_of_month = calendar.monthrange(year, month)[1]
            end_date = f"{year}-{month}-{last_date_of_month}"

            # Query from first day to last day of given month
            results = query(db, start_date, end_date, frac)
            pbar.set_description(
                f"Preprocessing {len(results)} records for {year} {calendar.month_abbr[month]}"
            )
            # Only preprocess if batch has records
            if len(results) > 0:
                monthly_records.append(preprocess(results))

            # If current month is 12, increment year by 1 and reset month to 1
            # for next batch. Else, increment month by 1
            if month == 12:
                month = 1
                year += 1
            else:
                month += 1
    finally:
        # Release the connection even if a query or preprocessing step fails
        db.close()

    # Concatenate once instead of growing a dataframe month by month
    if monthly_records:
        records = pd.concat(monthly_records, ignore_index=True)
    else:
        records = pd.DataFrame()
    print(f"{len(records)} records extracted and preprocessed")
    utilities.print_elapsed_time(extraction_start_time)
    return records