def pull_unseen(file_path):
    """
    Hybridize the predictions made on unseen data and write one CSV per draw.

    Does nothing unless the ``unseen_predictions`` flag from the enclosing
    scope is truthy. For each entry of the configured ``paths_to_new_data``
    list, the predictions are gathered with ``get_total``, combined with
    ``get_mean``, stripped of ``target_col`` and of every column named in
    ``other_algorithms``, and written to
    ``hybridized_unseen_predictions_draw<draw>.csv`` with the
    ``hybrid_prediction`` column renamed to ``value``.

    When more than two estimands are configured, the
    ``unseen_combined_predictions_draw<draw>`` files are used and the save
    directory is resolved without an estimator; otherwise the
    ``unseen_predictions_draw<draw>`` files of the current estimator are used.
    With more than one draw, ``'draws/'`` is appended to the save directory.

    ``config``, ``topic``, ``estimand``, ``estimator``, ``version``,
    ``common_cols``, ``hybrid_preserved_columns``, ``other_algorithms``,
    ``target_col``, ``estimator_col``, ``estimators`` and ``estimands`` are all
    read from the enclosing scope.

    :param file_path: never read; the output path is rebuilt for every draw.
    :return: None
    :raises ValueError: (or SyntaxError) from ``ast.literal_eval`` when a
        configured list is not a plain Python literal.
    """
    if not unseen_predictions:
        return

    print("\n")
    print("Working on hybridizing unseen data.\n")
    new_common_cols = common_cols + hybrid_preserved_columns
    print("Old common cols are " + str(common_cols))
    print("Hybrid cols to preserve are " + str(hybrid_preserved_columns))

    # The config stores the list as text. It is parsed with literal_eval, the
    # same way 'estimands' is parsed below, so that config text can only ever
    # yield a literal and can never execute code.
    paths_to_new_data = ast.literal_eval(str(config[topic]['paths_to_new_data']))
    num_draws = len(paths_to_new_data)

    # Loop-invariant: decide once whether the combined predictions are used.
    use_combined = len(ast.literal_eval(config[topic]['estimands'])) > 2

    for draw in range(num_draws):
        if use_combined:
            save_dir = get_save_dir(topic=topic, estimand=estimand, version=version)
            unseen_csv_name = 'unseen_combined_predictions_draw{}'.format(draw)
        else:
            save_dir = get_save_dir(topic=topic, estimand=estimand, estimator=estimator, version=version)
            unseen_csv_name = 'unseen_predictions_draw{}'.format(draw)
        if num_draws > 1:
            save_dir += 'draws/'

        new_total = get_total(other_algorithms, unseen_csv_name, target_col, estimator, estimator_col,
                              new_common_cols, save_dir)
        print(new_total.head())
        print("Estimator col is " + estimator_col)

        new_predictions = get_mean(topic, other_algorithms, unseen_csv_name, target_col, estimator_col,
                                   new_common_cols, new_total, estimators, estimands)
        print(new_predictions.columns.tolist())
        print(new_predictions.isnull().any())
        print(new_predictions.head())

        # Keep only the hybrid result: drop the target and the per-algorithm columns.
        new_predictions = new_predictions.drop(target_col, axis=1)
        for alg in other_algorithms:
            new_predictions = new_predictions.drop(alg, axis=1)
        new_predictions = new_predictions.rename(columns={'hybrid_prediction': 'value'})

        out_path = save_dir + 'hybridized_unseen_predictions_draw{}.csv'.format(draw)
        print("Printing hybridized unseen predictions to " + out_path)
        new_predictions.to_csv(out_path)
def plot_predictions_vs_estimator_data(topic, version, sups, year_start, year_end, estimator, estimand,
                                       predictions, algorithm):
    """
    Scatter the first model's predictions against the estimator's original data.

    ``predictions`` is merged with ``sups``; rows with
    ``year_start <= year_id <= year_end`` are drawn with one facet per
    ``me_name``, coloured by ``super_region_name``. The figure is saved as a
    PNG in the save directory resolved for ``topic``/``estimand``/``version``
    and the current figure is cleared afterwards.

    :return: None
    """
    plot_df = predictions.merge(sups).sort_values(['me_name', 'super_region_id'])
    x_col = estimator.lower() + '_data'
    y_col = '{}_prediction'.format(algorithm)

    # One or two modelable entities get that many columns; anything else wraps at three.
    me_count = len(plot_df.me_name.unique())
    num_cols = me_count if me_count in (1, 2) else 3

    in_year_window = (plot_df.year_id >= year_start) & (plot_df.year_id <= year_end)
    grid = sns.FacetGrid(plot_df[in_year_window],
                         col='me_name',
                         hue='super_region_name',
                         col_wrap=num_cols,
                         size=5,
                         sharex=False,
                         sharey=False)
    grid = grid.map(plt.scatter, x_col, y_col,
                    marker='o', alpha=0.5, facecolors='none', linewidth=1.5)
    grid.add_legend(title='Super Region')
    grid.set_axis_labels("{} mean".format(estimator), "{} prediction".format(estimand))
    grid.fig.subplots_adjust(top=.85)  # previously .9

    title = '{} Predictions vs {} Mean\nYear Id >={} and <= {}\nUsing {}: Version {}'.format(
        estimand, estimator, year_start, year_end, algorithm.upper(), version)
    grid.fig.suptitle(title, fontsize=16)

    # With a single facet, blank out the text annotations on the axes.
    if num_cols == 1:
        for axis in grid.axes.flat:
            plt.setp(axis.texts, text='')

    out_dir = get_save_dir(topic=topic, estimand=estimand, estimator=None, version=version)
    file_name = '{}_original_{}_predictions_vs_{}_data_{}_to_{}.png'.format(
        algorithm, estimand, estimator, year_start, year_end)
    file_path = out_dir + file_name

    print("\n\n")
    print('Saving figure to {}'.format(file_path))
    plt.savefig(file_path)
    plt.clf()
    return
def predict_second_data_on_first_model(topic, estimators_objects_dict, estimators, estimands, i):
    """
    Run model ``i`` on the matched data of every estimator that differs from estimand ``i``.

    For each position ``j`` with ``estimands[i] != estimators[j]``, the
    ``'model'`` stored under index ``i`` predicts on the ``'matched'`` data
    stored under index ``j``. The matched data is passed as is (no columns are
    renamed) and the output goes to the save directory resolved for
    ``(topic, estimands[i], estimators[j])``.

    :return: None
    """
    for j, other_estimator in enumerate(estimators):
        if estimands[i] == other_estimator:
            # Same quantity on both sides: nothing to cross-predict.
            continue

        first_model = estimators_objects_dict[i]['model']
        second_matched = estimators_objects_dict[j]['matched']
        out_dir = get_save_dir(topic, estimands[i], other_estimator)
        first_model.predict(new_df=second_matched, save_dir=out_dir, unseen=False)
    return
def step_two_crosswalking(topic, version, raw, sups, estimators_array, estimators, estimands, me_name_df):
    """
    Does the essential crosswalking part of this library.

    Stacks the first model's predictions (renamed so that the prediction
    column becomes ``<estimands[0]>_data``) on top of the rows of ``raw`` whose
    ``estimator`` equals ``estimands[0]``, optionally adds the chained
    predictions produced by ``combine_all``, and writes the result to
    ``<algorithm>_combined_predictions.csv`` in the estimand's save directory.

    ``og_feature_cols``, ``preserved_cols``, ``required_columns`` and
    ``algorithm`` are read from the enclosing scope.

    Side effect: ``estimators_array[0]['predictions']`` gains an ``estimator``
    column and is replaced by its merge with ``me_name_df``.

    :param topic: passed to ``get_save_dir`` and ``combine_all``.
    :param version: passed to ``get_save_dir``.
    :param raw: DataFrame with at least ``estimator`` and ``data`` columns.
    :param sups: DataFrame merged onto the estimand rows.
    :param estimators_array: sequence of dicts; entry 0 must hold ``'predictions'``.
    :param estimators: estimator names, parallel to ``estimators_array``.
    :param estimands: estimand names; ``estimands[0]`` is the target.
    :param me_name_df: DataFrame merged onto the predictions.
    :return: the combined DataFrame that was written to disk.
    :raises ValueError: if ``raw`` has no rows for the estimand, or if the
        first estimator's data column contains missing values.
    :raises AssertionError: if either frame has duplicate column names.
    """
    estimand_data = raw.copy(deep=True)
    print(raw.head())

    estimand_name = estimands[0].lower()
    estimand_col = estimand_name + '_data'

    # Observed rows for the estimand, with 'data' renamed to the estimand column.
    estimand_data = estimand_data[estimand_data.estimator == estimand_name]
    estimand_data = estimand_data.rename(columns={'data': estimand_col})
    estimand_data = estimand_data.merge(sups)
    estimand_data['estimator'] = estimand_name
    print("The estimand data are: ")
    print(estimand_data.head())
    if estimand_data.shape[0] == 0:
        raise ValueError("Oops! No estimand found in the original data set. "
                         "Did you mean to look for a match for " + estimands[0] + "?")

    # De-duplicate the column list before selecting.
    cols = list(set(og_feature_cols + preserved_cols + [estimand_col] + required_columns))
    estimand_data = estimand_data[cols]

    estimator_name = estimators[0].lower()
    estimator_col = estimator_name + '_data'
    first_entry = estimators_array[0]
    first_entry['predictions']['estimator'] = estimator_name
    first_entry['predictions'] = first_entry['predictions'].merge(me_name_df)
    print("The predictions data are: ")
    print(first_entry['predictions'].head())

    # The model's prediction column plays the role of estimand data from here on.
    total = first_entry['predictions'].rename(
        columns={'{}_prediction'.format(algorithm): estimand_col})

    print("Checking for null values in " + estimator_col)
    print(total[estimator_col].head())
    if total[estimator_col].isnull().any():
        raise ValueError("Oops! You have missing values in the " + estimator_name + "_data column!")

    # Column names must be unique in both frames before they are stacked.
    assert len(set(total.columns)) == len(list(total.columns))
    assert len(set(estimand_data.columns)) == len(list(estimand_data.columns))

    # pd.concat is what DataFrame.append delegated to; append itself no longer
    # exists in pandas >= 2.0.
    total = pd.concat([total, estimand_data])

    needs_chaining = len(estimators) > 2 or (
        len(estimators) > 1 and estimand_name != estimators[1].lower())
    if needs_chaining:
        total = combine_all(estimators_array, estimands, estimators, topic, estimand_data, me_name_df, total)

    print(total.head())
    print(total.describe())
    print(total[[estimand_col, estimator_col]].head())

    # Get save directory
    save_dir = get_save_dir(topic=topic, estimand=estimands[0], version=version, estimator=None)
    file_path = save_dir + '{}_combined_predictions.csv'.format(algorithm)
    print("Printing all of the predictions (combined) to " + file_path)
    print("\n\n")
    total.to_csv(file_path)
    return total
def pull_unseen(file_path):
    """
    Hybridize the predictions made on unseen data and write one CSV per draw.

    Does nothing unless the ``unseen_predictions`` flag from the enclosing
    scope is truthy. For each entry of the configured ``paths_to_new_data``
    list, the ``unseen_predictions_draw<draw>`` predictions are gathered with
    ``get_total``, combined with ``get_mean``, stripped of ``target_col`` and
    of every column named in ``other_algorithms``, and written without the
    index to ``hybridized_unseen_predictions_draw<draw>.csv``. In the output,
    ``hybrid_prediction`` is renamed to ``value`` and ``seq`` to
    ``crosswalk_parent_seq``. With more than one draw, ``'draws/'`` is
    appended to the save directory.

    ``config``, ``topic``, ``estimand``, ``estimator``, ``version``,
    ``common_cols``, ``hybrid_preserved_columns``, ``other_algorithms``,
    ``target_col``, ``estimator_col``, ``estimators`` and ``estimands`` are all
    read from the enclosing scope.

    :param file_path: never read; the output path is rebuilt for every draw.
    :return: None
    :raises ValueError: (or SyntaxError) from ``ast.literal_eval`` when the
        configured list is not a plain Python literal.
    """
    # Imported here so the function does not depend on a module-level import.
    import ast

    if not unseen_predictions:
        return

    new_common_cols = common_cols + hybrid_preserved_columns

    # The config stores the list as text. literal_eval only accepts Python
    # literals, so config text can never execute code the way eval() would.
    paths_to_new_data = ast.literal_eval(str(config[topic]['paths_to_new_data']))
    num_draws = len(paths_to_new_data)

    for draw in range(num_draws):
        save_dir = get_save_dir(topic=topic, estimand=estimand, estimator=estimator, version=version)
        unseen_csv_name = 'unseen_predictions_draw{}'.format(draw)
        if num_draws > 1:
            save_dir += 'draws/'

        new_total = get_total(other_algorithms, unseen_csv_name, target_col, estimator, estimator_col,
                              new_common_cols, save_dir)
        new_predictions = get_mean(other_algorithms, unseen_csv_name, target_col, estimator_col,
                                   new_common_cols, new_total, estimators, estimands)

        # Keep only the hybrid result: drop the target and the per-algorithm columns.
        new_predictions = new_predictions.drop(target_col, axis=1)
        for alg in other_algorithms:
            new_predictions = new_predictions.drop(alg, axis=1)
        new_predictions = new_predictions.rename(columns={
            'hybrid_prediction': 'value',
            'seq': 'crosswalk_parent_seq',
        })

        out_path = save_dir + 'hybridized_unseen_predictions_draw{}.csv'.format(draw)
        new_predictions.to_csv(out_path, index=False)
def predict_on_unseen_data(topic, estimator, estimand, version, og_feature_cols, og_categorical_cols, me_name_df,
                           covariates, required_columns, estimators_object, path_to_new_data):
    """
    Generate predictions on unseen data.

    Reads one CSV of unseen data, merges and one-hot encodes it the same way
    the training data is prepared, and calls ``predict`` on
    ``estimators_object['model']`` with ``unseen=True``.

    NOTE(review): the ``path_to_new_data`` parameter is never read. The file
    that is loaded is ``drives().j + paths_to_new_data[i]``, with both
    ``paths_to_new_data`` and ``i`` taken from the enclosing scope; ``i`` is
    also passed on as ``draw_number``. Confirm whether the parameter was meant
    to replace them.

    :param og_feature_cols: feature column names; copied, the input is not modified here.
    :param og_categorical_cols: categorical column names; copied likewise.
    :param estimators_object: mapping whose ``'model'`` entry exposes
        ``estimator``, ``estimand`` and ``predict``.
    :return: None
    """
    # Work on copies so the caller's lists are left alone by this function.
    feature_cols = list(og_feature_cols)
    categorical_cols = list(og_categorical_cols)

    unseen_raw = pd.read_csv(drives().j + paths_to_new_data[i])

    out_dir = get_save_dir(topic=topic, estimand=estimand, estimator=estimator, version=version)
    if len(paths_to_new_data) > 1:
        out_dir += 'draws/'
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError as err:
            # Tolerate the directory appearing between the check and the call.
            if err.errno != errno.EEXIST:
                raise

    # Make sure a 'data' column exists, even if it is entirely empty.
    if 'data' not in unseen_raw.columns:
        unseen_raw['data'] = None
    print(unseen_raw.shape)

    # Every column of the input file is carried through the merge.
    preserved_cols = unseen_raw.columns.tolist()
    unseen_merged = merge_data(unseen_raw,
                               topic=topic,
                               estimator=estimator,
                               me_name_df=me_name_df,
                               feature_cols=feature_cols,
                               categorical_cols=categorical_cols,
                               estimand=estimand,
                               covariates=covariates,
                               preserved_cols=preserved_cols,
                               required_cols=required_columns,
                               cov_dir='FILEPATH')
    print(unseen_merged.shape)

    unseen_encoded = one_hot_encoder(matched_df=unseen_merged,
                                     feature_cols=feature_cols,
                                     categorical_cols=categorical_cols)
    print(unseen_encoded.shape)

    unseen_estimator_data = get_estimator_data(merged_df=unseen_encoded,
                                               estimator=estimator,
                                               feature_cols=feature_cols)
    print(unseen_estimator_data.shape)
    print(unseen_estimator_data.isnull().any())
    print(unseen_estimator_data.head())

    model = estimators_object['model']
    print(estimator)
    print(estimand)
    print(model.estimator)
    print(model.estimand)

    # When the model maps one quantity onto another, rows without estimator
    # data cannot be used as input.
    if model.estimator.lower() != model.estimand.lower():
        has_estimator_data = unseen_estimator_data[estimator.lower() + '_data'].notnull()
        unseen_estimator_data = unseen_estimator_data[has_estimator_data]
    print(unseen_estimator_data.shape)

    new_preds = model.predict(new_df=unseen_estimator_data,
                              save_dir=out_dir,
                              unseen=True,
                              draw_number=i)
    print(new_preds.columns.tolist())
    print(new_preds[estimator.lower() + '_data'].head())
def step_one_training(topic, version, algorithm, covariates, estimator, estimand, feature_cols, me_name_df,
                      categorical_cols):
    """
    Prepare the data, train one model and produce its predictions and graphs.

    The steps are: merge the raw data, one-hot encode it, extract the
    estimator data, match it against the estimand (or use the estimator data
    directly when estimator and estimand are the same), validate the matched
    set, train the model, then run it and graph the results.

    ``raw``, ``preserved_cols``, ``required_columns``, ``og_feature_cols`` and
    ``og_categorical_cols`` are read from the enclosing scope.

    Note: when estimator and estimand are equal, ``<estimator>_data`` is
    removed from the caller's ``feature_cols`` list in place.

    :return: dict with keys ``merged``, ``estimator_data``, ``matched``,
        ``model``, ``predictions`` and ``graphing_df``.
    """
    out_dir = get_save_dir(topic=topic, estimand=estimand, estimator=estimator, version=version)
    print('Currently working on predicting {} data using {} data.\n'.format(estimand, estimator))

    merged = merge_data(raw,
                        topic=topic,
                        estimator=estimator,
                        me_name_df=me_name_df,
                        feature_cols=feature_cols,
                        categorical_cols=categorical_cols,
                        covariates=covariates,
                        estimand=estimand,
                        preserved_cols=preserved_cols,
                        required_cols=required_columns,
                        cov_dir='FILEPATH')
    oh_merged = one_hot_encoder(matched_df=merged,
                                feature_cols=feature_cols,
                                categorical_cols=categorical_cols)
    print(feature_cols)
    estimator_data = get_estimator_data(merged_df=oh_merged,
                                        estimator=estimator,
                                        feature_cols=feature_cols)

    if estimator.lower() == estimand.lower():
        # Training on ALL the estimand data, with no separate estimator data.
        matched = estimator_data
        # The target column must not stay among the features.
        feature_cols.remove(estimator.lower() + '_data')
    else:
        matched = match_data(merged, estimator, estimator_data, estimand, out_dir, og_feature_cols,
                             required_columns, covariates)

    # Skip if no common data points
    _validate_matched_dataset(matched, estimand, estimator)

    model = train_model(matched=matched,
                        algorithm=algorithm,
                        estimator=estimator,
                        estimand=estimand,
                        feature_cols=feature_cols,
                        save_dir=out_dir)
    graphing_df = build_graphing_dataset(model, me_name_df, og_categorical_cols)
    predictions = run_model_and_graph(estimator_data=estimator_data,
                                      graphing_df=graphing_df,
                                      model=model,
                                      algorithm=algorithm,
                                      save_dir=out_dir)

    # Size report for every intermediate data set.
    print("Raw shape: ")
    print(raw.shape)
    print("Merged shape: ")
    print(merged.shape)
    print("oh_merged shape: ")
    print(oh_merged.shape)
    print("Estimator data shape: ")
    print(estimator_data.shape)
    print("Matched shape: ")
    print(matched.shape)
    print(matched.year_id.unique())
    print(matched.age_group_id.unique())
    print("X_train shape: ")
    print(model.X_train.shape)
    print("X_test shape: ")
    print(model.X_test.shape)
    print("Graphing df shape: ")
    print(graphing_df.shape)
    print("Predictions shape: ")
    print(predictions.shape)
    print("\n")

    results = {
        'merged': merged,
        'estimator_data': estimator_data,
        'matched': matched,
        'model': model,
        'predictions': predictions,
        'graphing_df': graphing_df,
    }
    return results
def combine_all(estimators_array, estimands, estimators, topic, estimand_data, me_name_df, total):
    """
    Chain every additional model through the first model and stack the results onto ``total``.

    For each index ``i >= 1`` whose entry in ``estimators_array`` has a truthy
    ``'model'``, that entry's predictions are renamed so the prediction column
    looks like the first estimator's data column, fed into the first model,
    scored against the observed estimand data through ``Metrics``, labelled
    with ``estimators[i]`` and added to ``total``.

    ``version``, ``algorithm``, ``og_feature_cols`` and ``required_columns``
    are read from the enclosing scope.

    Side effects: creates ``<save_dir>/through_<second estimand>/`` when
    missing, and sets ``estimator`` on the first model to the second estimator
    before computing metrics.

    :param estimators_array: sequence of dicts with ``'model'`` and ``'predictions'``.
    :param estimands: estimand names, parallel to ``estimators_array``.
    :param estimators: estimator names, parallel to ``estimators_array``.
    :param topic: passed to ``get_save_dir``.
    :param estimand_data: observed data holding the ``<estimands[0]>_data`` column.
    :param me_name_df: DataFrame merged onto the new predictions.
    :param total: DataFrame the new predictions are stacked onto.
    :return: ``total`` with the chained predictions of every processed entry added.
    :raises ValueError: from ``list.remove`` if ``'estimator'`` is not among
        the join columns.
    """
    prediction_col = '{}_prediction'.format(algorithm)

    for i in range(1, len(estimators_array)):
        if not estimators_array[i]['model']:
            continue

        estimand = estimands[0].lower()
        print('Estimand is ' + estimand)
        first_estimator = estimators[0].lower()
        print('First estimator is ' + first_estimator)
        second_estimator = estimators[i].lower()
        print('Second estimator is ' + second_estimator)
        print('First target col is ' + estimand)
        second_estimand = estimands[i].lower()
        print('Second target col is ' + second_estimand)

        first_model = estimators_array[0]['model']
        second_predictions = estimators_array[i]['predictions']

        save_dir = get_save_dir(topic=topic, estimand=estimand, estimator=second_estimator, version=version)
        save_dir += 'through_' + second_estimand + '/'
        if not os.path.exists(save_dir):
            try:
                os.makedirs(save_dir)
            except OSError as e:
                # Tolerate the directory appearing between the check and the call.
                if e.errno != errno.EEXIST:
                    raise

        # Present the second model's output as input data for the first model.
        new_df = second_predictions.rename(columns={prediction_col: first_estimator + '_data'})
        print(new_df.head())
        new_predictions = first_model.predict(new_df=new_df, save_dir=save_dir)
        print(new_predictions.columns)

        # After the first pass the column is already called 'actual'; the
        # rename is then a no-op.
        estimand_data = estimand_data.rename(columns={estimand + '_data': 'actual'})
        print(estimand_data.columns)

        # Join on every feature/required column except the estimator label.
        join_cols = list(set(og_feature_cols + required_columns))
        join_cols.remove('estimator')
        new_graphing_df = new_predictions.merge(estimand_data,
                                                how='inner',
                                                on=join_cols,
                                                suffixes=('_predictions', '_actual'))

        super_regions_df = SharedLabels().super_regions()
        super_regions_df = super_regions_df.drop('location_id', axis=1)
        new_graphing_df = new_graphing_df.merge(super_regions_df)
        new_graphing_df = new_graphing_df.rename(columns={prediction_col: 'predicted'})
        print(new_graphing_df.head())
        print(new_graphing_df.shape)

        first_model.estimator = second_estimator
        metrics_obj = Metrics(new_graphing_df, algorithm=algorithm, model=first_model, save_dir=save_dir)
        metrics_obj.get_indiv_metrics()
        metrics_obj.print_facet_plots()

        new_predictions['estimator'] = estimators[i].lower()
        new_predictions = new_predictions.rename(columns={prediction_col: estimand + '_data'})
        new_predictions = new_predictions.merge(me_name_df)
        print(new_predictions.columns)
        print(total.columns)

        # pd.concat is what DataFrame.append delegated to; append itself no
        # longer exists in pandas >= 2.0.
        total = pd.concat([total, new_predictions])

    return total
def make_st_gpr_plots(topic, version, sups, estimand, loc_id, me, total, algorithm):
    """
    Graphs all predictions (including original data) in ST-GPR style.

    Draws the ``<estimand>_data`` column against ``year_id`` for one location
    and one modelable entity, with one facet per ``age_group_id`` and one
    colour per ``estimator``. Rows with ``age_group_id == 21`` or
    ``age_group_id > 30`` are left out. The figure is saved to
    ``<save_dir>/st_gpr_plots/<algorithm>/location_<loc_id>_me_<me>.png``
    (the directory is created when missing), shown, and then cleared.

    :param topic: passed to ``get_save_dir``.
    :param version: passed to ``get_save_dir`` and shown in the title.
    :param sups: DataFrame merged onto ``total``.
    :param estimand: name of the estimated quantity; selects the data column.
    :param loc_id: ``location_id`` to plot.
    :param me: ``me_name`` to plot.
    :param total: DataFrame of combined predictions and original data.
    :param algorithm: algorithm name used in the title and the output path.
    :return: None. Returns silently when there are no rows for the
        location/entity pair, and prints a message when no rows are left
        after the age-group filter.
    """
    data_col = '{}_data'.format(estimand.lower())

    to_graph = total.merge(sups).sort_values('me_name')
    to_graph = to_graph[[
        'age_group_id', 'location_id', 'super_region_id', 'sex_id', 'me_name', 'year_id',
        data_col, 'estimator'
    ]]

    # Select a few countries for which to make st-gpr plots
    to_graph = to_graph[(to_graph.location_id == loc_id) & (to_graph.me_name == me)]

    # Not every country has all modelable entities
    if to_graph.empty:
        return

    to_graph = to_graph[~(to_graph.age_group_id == 21) & (to_graph.age_group_id <= 30)]
    if to_graph.empty:
        print("Your graphing dataframe is empty!")
        return

    # FacetGrid creates its own figure. Calling plt.figure() first would only
    # leave an empty figure behind on every call that is never drawn on and
    # never closed.
    g = sns.FacetGrid(to_graph,
                      col='age_group_id',
                      hue='estimator',
                      col_wrap=4,
                      size=5,
                      sharex=False,
                      sharey=False)
    g = g.map(plt.scatter, 'year_id', data_col,
              marker='o', alpha=0.5, facecolors='none', linewidth=1.5)
    g.add_legend(title='Data Type')
    g.fig.subplots_adjust(top=.85)  # previously .9
    g.fig.suptitle(
        'Observed and Estimated {} Data Time Trends\nfor {}\nin Location ID {}\n Using {}, Version {}'
        .format(estimand, me, loc_id, algorithm.upper(), version),
        fontsize=16)

    save_dir = get_save_dir(topic=topic, estimand=estimand, estimator=None, version=version) + \
        'st_gpr_plots/' + algorithm + '/'
    if not os.path.exists(save_dir):
        try:
            os.makedirs(save_dir)
        except OSError as e:
            # Tolerate the directory appearing between the check and the call.
            if e.errno != errno.EEXIST:
                raise

    file_path = save_dir + 'location_{}_me_{}.png'.format(loc_id, me)
    print("\n\n")
    print('Saving figure to {}'.format(file_path))
    plt.savefig(file_path)
    plt.show()
    plt.clf()
    return
def predict_data_two_step(topic, algorithm, version, estimand_data, first_estimator, second_estimator, estimand,
                          first_estimand, second_estimand, first_model, second_predictions):
    """
    Feed the second model's predictions into the first model ("crosswalk" two-step).

    The prediction column of ``second_predictions`` is renamed to
    ``second_estimand`` and passed to ``first_model.predict``. The result is
    joined with ``estimand_data`` and the super-region labels, and metrics and
    facet plots are produced through ``Metrics``.

    Parameters
    ----------
    topic : passed to ``get_save_dir``
    algorithm : name of the algorithm; ``<algorithm>_prediction`` is the
        prediction column that gets renamed
    version : passed to ``get_save_dir``
    estimand_data : a dataFrame of the target data; its ``first_estimand``
        column is renamed to ``actual``
    first_estimator : name of the first estimator; names the ``through_<...>/``
        output subdirectory
    second_estimator : name of the second estimator; used for the save
        directory and assigned to ``first_model.estimator``
    estimand : passed to ``get_save_dir``
    first_estimand : column of ``estimand_data`` holding the observed values
    second_estimand : column name given to the second model's predictions
        before they are fed to the first model
    first_model : model object exposing ``predict``; its ``estimator``
        attribute is overwritten
    second_predictions : a dataFrame of predictions from the second model

    Returns
    -------
    new_predictions : the dataFrame returned by ``first_model.predict``

    Raises
    ------
    ValueError : if the first model returns no rows
    """
    prediction_col = '{}_prediction'.format(algorithm)

    out_dir = get_save_dir(topic=topic, estimand=estimand, estimator=second_estimator, version=version)
    out_dir += 'through_' + first_estimator.lower() + '/'
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError as err:
            # Tolerate the directory appearing between the check and the call.
            if err.errno != errno.EEXIST:
                raise

    # Present the second model's output under the column name the first model reads.
    first_model_input = second_predictions.rename(columns={prediction_col: second_estimand})
    print(first_model_input.head())

    new_predictions = first_model.predict(new_df=first_model_input, save_dir=out_dir)
    if new_predictions.shape[0] == 0:
        raise ValueError("Oops! You have no predictions.")

    observed = estimand_data.rename(columns={first_estimand: 'actual'})
    join_keys = ['year_id', 'age_group_id', 'sex_id', 'super_region_id', 'me_name', 'location_id']
    graphing_df = new_predictions.merge(observed,
                                        how='inner',
                                        suffixes=['_predicted', '_actual'],
                                        on=join_keys)

    super_regions = SharedLabels().super_regions()
    graphing_df = graphing_df.merge(super_regions)
    graphing_df = graphing_df.rename(columns={prediction_col: 'predicted'})
    print(graphing_df.head())

    # The metrics are reported under the second estimator's name.
    first_model.estimator = second_estimator
    metrics = Metrics(graphing_df, algorithm=algorithm, model=first_model, save_dir=out_dir)
    metrics.get_indiv_metrics()
    metrics.print_facet_plots()

    return new_predictions
# Load the estimators object that was pickled for this version, algorithm and
# estimator.
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only safe if the .pkl is always produced by this pipeline -- confirm.
with open(
        "FILEPATH/v" + version + "/estimators_obj_" + algorithm + "_" +
        estimator + ".pkl", 'rb') as f:
    estimators_object = pickle.load(f)

# The task id from the environment, minus one, is used as an index into
# paths_to_new_data (presumably the 1-based Sun Grid Engine array task id --
# TODO confirm). If SGE_TASK_ID is unset, os.getenv returns None and int()
# raises TypeError.
draw_num = int(os.getenv("SGE_TASK_ID")) - 1
print(draw_num)
path_to_new_data = paths_to_new_data[draw_num]
print(path_to_new_data)

# Copies, so that later changes do not touch the original column lists.
feature_cols = list(og_feature_cols)
categorical_cols = list(og_categorical_cols)
unseen_raw = pd.read_csv(path_to_new_data)

# Get output directory filepath
save_dir = get_save_dir(topic=topic,
                        estimand=estimand,
                        estimator=estimator,
                        version=version)
# With several input files, outputs go into a 'draws/' subdirectory.
if len(paths_to_new_data) > 1:
    save_dir += 'draws/'
if not os.path.exists(save_dir):
    try:
        os.makedirs(save_dir)
    except OSError as e:
        # Tolerate the directory appearing between the check and the call;
        # any other OS error is re-raised.
        if e.errno != errno.EEXIST:
            raise

# Add data column to unseen_raw if it's not already present
if 'data' not in unseen_raw.columns:
    unseen_raw['data'] = None