def save_predictions(model, tag, partition):
    r"""Save the predictions to disk.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing the predictions to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    preds : numpy array
        The prediction vector.
    probas : numpy array
        The probability vector.

    """

    # Extract model parameters.

    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    separator = model.specs['separator']

    # Get date stamp to record file creation

    timestamp = get_datestamp()

    # Specify input and output directories

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Read the prediction frame

    file_spec = ''.join([datasets[partition], '*'])
    file_name = most_recent_file(input_dir, file_spec)
    file_name = file_name.split(SSEP)[-1].split(PSEP)[0]
    pf = read_frame(input_dir, file_name, extension, separator)

    # Cull records before the prediction date

    try:
        predict_date = model.specs['predict_date']
        found_pdate = True
    except KeyError:
        found_pdate = False

    if found_pdate:
        pd_indices = pf[pf.date >= predict_date].index.tolist()
        pf = pf.iloc[pd_indices]
    else:
        pd_indices = pf.index.tolist()

    # Save predictions for all projects

    logger.info("Saving Predictions")
    output_file = USEP.join(['predictions', timestamp])
    preds = model.preds[(tag, partition)].squeeze()
    if found_pdate:
        preds = np.take(preds, pd_indices)
    pred_series = pd.Series(preds, index=pd_indices)
    df_pred = pd.DataFrame(pred_series, columns=['prediction'])
    write_frame(df_pred, output_dir, output_file, extension, separator)

    # Save probabilities for classification projects

    probas = None
    if model_type == ModelType.classification:
        logger.info("Saving Probabilities")
        output_file = USEP.join(['probabilities', timestamp])
        probas = model.probas[(tag, partition)].squeeze()
        if found_pdate:
            probas = np.take(probas, pd_indices)
        prob_series = pd.Series(probas, index=pd_indices)
        df_prob = pd.DataFrame(prob_series, columns=['probability'])
        write_frame(df_prob, output_dir, output_file, extension, separator)

    # Save ranked predictions

    logger.info("Saving Ranked Predictions")
    pf['prediction'] = pred_series
    if model_type == ModelType.classification:
        pf['probability'] = prob_series
        pf.sort_values('probability', ascending=False, inplace=True)
    else:
        pf.sort_values('prediction', ascending=False, inplace=True)
    output_file = USEP.join(['rankings', timestamp])
    write_frame(pf, output_dir, output_file, extension, separator)

    # Return predictions and any probabilities

    return preds, probas
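
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the pipeline code): assuming a
# model whose specs point at a project directory containing 'input' and
# 'output' subdirectories, and whose preds/probas dictionaries are keyed by
# (tag, partition), a typical call after prediction looks like:
#
#     preds, probas = save_predictions(model, 'BEST', Partition.test)
#
# preds is the raw prediction vector; probas holds class probabilities for
# classification models and is None for regression.
# ---------------------------------------------------------------------------
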
def training_pipeline(model):
    r"""AlphaPy Training Pipeline

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The final results are stored in the model object.

    Raises
    ------
    IndexError
        If the number of columns of the train and test data
        do not match.
    KeyError
        If the scoring function is not found.

    """

    logger.info("Training Pipeline")

    # Unpack the model specifications

    calibration = model.specs['calibration']
    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    grid_search = model.specs['grid_search']
    model_type = model.specs['model_type']
    predict_mode = model.specs['predict_mode']
    rfe = model.specs['rfe']
    sampling = model.specs['sampling']
    scorer = model.specs['scorer']
    separator = model.specs['separator']
    target = model.specs['target']

    # Get train and test data

    X_train, y_train = get_data(model, Partition.train)
    X_test, y_test = get_data(model, Partition.test)

    # Determine if there are any test labels

    if y_test.any():
        logger.info("Test Labels Found")
        model.test_labels = True
    model = save_features(model, X_train, X_test, y_train, y_test)

    # Log feature statistics

    logger.info("Original Feature Statistics")
    logger.info("Number of Training Rows    : %d", X_train.shape[0])
    logger.info("Number of Training Columns : %d", X_train.shape[1])
    if model_type == ModelType.classification:
        uv, uc = np.unique(y_train, return_counts=True)
        logger.info("Unique Training Values for %s : %s", target, uv)
        logger.info("Unique Training Counts for %s : %s", target, uc)
    logger.info("Number of Testing Rows    : %d", X_test.shape[0])
    logger.info("Number of Testing Columns : %d", X_test.shape[1])
    if model_type == ModelType.classification and model.test_labels:
        uv, uc = np.unique(y_test, return_counts=True)
        logger.info("Unique Testing Values for %s : %s", target, uv)
        logger.info("Unique Testing Counts for %s : %s", target, uc)

    # Merge training and test data

    if X_train.shape[1] == X_test.shape[1]:
        split_point = X_train.shape[0]
        X = pd.concat([X_train, X_test])
    else:
        raise IndexError("The number of training and test columns [%d, %d] must match."
                         % (X_train.shape[1], X_test.shape[1]))

    # Apply treatments to the feature matrix

    all_features = apply_treatments(model, X)

    # Drop features

    all_features = drop_features(all_features, drop)

    # Save the train and test files with extracted and dropped features

    datestamp = get_datestamp()
    data_dir = SSEP.join([directory, 'input'])

    df_train = all_features.iloc[:split_point, :]
    df_train = pd.concat([df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
    output_file = USEP.join([model.train_file, datestamp])
    write_frame(df_train, data_dir, output_file, extension, separator)

    df_test = all_features.iloc[split_point:, :]
    if y_test.any():
        df_test = pd.concat([df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
    output_file = USEP.join([model.test_file, datestamp])
    write_frame(df_test, data_dir, output_file, extension, separator)

    # Create crosstabs for any categorical features

    if model_type == ModelType.classification:
        create_crosstabs(model)

    # Create initial features

    all_features = create_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Generate interactions

    all_features = create_interactions(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Remove low-variance features

    all_features = remove_lv_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Shuffle the data [if specified]

    model = shuffle_data(model)

    # Oversampling or Undersampling [if specified]

    if model_type == ModelType.classification:
        if sampling:
            model = sample_data(model)
        else:
            logger.info("Skipping Sampling")
        # Get sample weights (classification only)
        model = get_class_weights(model)

    # Perform feature selection, independent of algorithm

    if feature_selection:
        model = select_features(model)

    # Get the available classifiers and regressors

    logger.info("Getting All Estimators")
    estimators = get_estimators(model)

    # Get the available scorers

    if scorer not in scorers:
        raise KeyError("Scorer function %s not found" % scorer)

    # Model Selection

    logger.info("Selecting Models")

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # select estimator
        try:
            estimator = estimators[algo]
            scoring = estimator.scoring
            est = estimator.estimator
        except KeyError:
            logger.info("Algorithm %s not found", algo)
            continue
        # initial fit
        model = first_fit(model, algo, est)
        # recursive feature elimination
        if rfe:
            if scoring:
                model = rfecv_search(model, algo)
            elif hasattr(est, "coef_"):
                model = rfe_search(model, algo)
            else:
                logger.info("No RFE Available for %s", algo)
        # grid search
        if grid_search:
            model = hyper_grid_search(model, estimator)
        # predictions
        model = make_predictions(model, algo, calibration)

    # Create a blended estimator

    if len(model.algolist) > 1:
        model = predict_blend(model)

    # Generate metrics

    model = generate_metrics(model, Partition.train)
    model = generate_metrics(model, Partition.test)

    # Store the best estimator

    model = predict_best(model)

    # Generate plots

    generate_plots(model, Partition.train)
    if model.test_labels:
        generate_plots(model, Partition.test)

    # Save best features and predictions

    save_model(model, 'BEST', Partition.test)

    # Return the model

    return model
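
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): training_pipeline is normally driven by the
# package entry point, which builds the Model from the project's model.yml
# specifications. A minimal standalone invocation, assuming a get_model_config()
# loader for the specs dictionary (name illustrative), would be:
#
#     specs = get_model_config()          # assumed loader for model.yml specs
#     model = Model(specs)                # alphapy.Model wraps the specs
#     model = training_pipeline(model)    # fit, score, and save the best model
#
# The fitted estimators, predictions, and metrics are all stored on the
# returned model object.
# ---------------------------------------------------------------------------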