Example #1
def pull_unseen(file_path):
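    """ Hybridizes each draw of unseen predictions and writes one hybridized CSV per draw. """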

    if unseen_predictions:
        print("\n")
        print("Working on hybridizing unseen data.\n")
        new_common_cols = common_cols + hybrid_preserved_columns
        print("Old common cols are " + str(common_cols))
        print("Hybrid cols to preserve are " + str(hybrid_preserved_columns))

        # ast.literal_eval is safer than eval for parsing the list of paths from the config
        paths_to_new_data = ast.literal_eval(str(config[topic]['paths_to_new_data']))

        for draw in range(len(paths_to_new_data)):

            if len(ast.literal_eval(config[topic]['estimands'])) > 2:
                save_dir = get_save_dir(topic=topic, estimand=estimand, version=version)
                unseen_csv_name = 'unseen_combined_predictions_draw{}'.format(draw)

            else:
                save_dir = get_save_dir(topic=topic, estimand=estimand, estimator=estimator, version=version)
                unseen_csv_name = 'unseen_predictions_draw{}'.format(draw)

            if len(paths_to_new_data) > 1:
                save_dir += 'draws/'

            new_total = get_total(other_algorithms, unseen_csv_name, target_col, estimator, estimator_col,
                                  new_common_cols, save_dir)
            print(new_total.head())
            print("Estimator col is " + estimator_col)

            new_predictions = get_mean(topic, other_algorithms, unseen_csv_name, target_col, estimator_col,
                                       new_common_cols, new_total, estimators, estimands)
            print(new_predictions.columns.tolist())
            print(new_predictions.isnull().any())
            print(new_predictions.head())

            new_predictions = new_predictions.drop(target_col, axis=1)
            for alg in other_algorithms:
                new_predictions = new_predictions.drop(alg, axis=1)

            new_predictions = new_predictions.rename(columns={'hybrid_prediction': 'value'})

            file_path = save_dir + 'hybridized_unseen_predictions_draw{}.csv'.format(draw)
            print("Printing hybridized unseen predictions to " + file_path)
            new_predictions.to_csv(file_path)
Example #2
def plot_predictions_vs_estimator_data(topic, version, sups, year_start,
                                       year_end, estimator, estimand,
                                       predictions, algorithm):
    """ Graphs predictions from the first model versus the original data. """

    to_graph = predictions.merge(sups)
    to_graph = to_graph.sort_values(['me_name', 'super_region_id'])

    estimator_col = estimator.lower() + '_data'
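    # Use one facet column per modelable entity, up to three per row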
    if len(to_graph.me_name.unique()) == 1:
        num_cols = 1
    elif len(to_graph.me_name.unique()) == 2:
        num_cols = 2
    else:
        num_cols = 3

    g = sns.FacetGrid(to_graph[(to_graph.year_id >= year_start)
                               & (to_graph.year_id <= year_end)],
                      col='me_name',
                      hue='super_region_name',
                      col_wrap=num_cols,
                      height=5,
                      sharex=False,
                      sharey=False)

    g = g.map(plt.scatter,
              '{}'.format(estimator_col),
              '{}_prediction'.format(algorithm),
              marker='o',
              alpha=0.5,
              facecolors='none',
              linewidth=1.5)
    g.add_legend(title='Super Region')
    g.set_axis_labels("{} mean".format(estimator),
                      "{} prediction".format(estimand))
    g.fig.subplots_adjust(top=.85)  # previously .9
    g.fig.suptitle(
        '{} Predictions vs {} Mean\nYear Id >={} and <= {}\nUsing {}: Version {}'
        .format(estimand, estimator, year_start, year_end, algorithm.upper(),
                version),
        fontsize=16)
    if num_cols == 1:
        for ax in g.axes.flat:
            plt.setp(ax.texts, text='')

    save_dir = get_save_dir(topic=topic,
                            estimand=estimand,
                            estimator=None,
                            version=version)
    file_path = save_dir + '{}_original_{}_predictions_vs_{}_data_{}_to_{}.png'.format(
        algorithm, estimand, estimator, year_start, year_end)
    print("\n\n")
    print('Saving figure to {}'.format(file_path))
    plt.savefig(file_path)
    # plt.show()
    plt.clf()
    return
Example #3
def predict_second_data_on_first_model(topic, estimators_objects_dict, estimators, estimands, i):
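    """ Runs every other estimator's matched data through the i-th estimand's model, skipping estimators
    that match that estimand. """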
    for j in range(0, len(estimators)):
        if estimands[i] != estimators[j]:
            model = estimators_objects_dict[i]['model']
            matched_second_data = estimators_objects_dict[j]['matched']

            # Fake it till you make it
            # matched_second_data = matched_second_data.rename(columns={estimators[j].lower() + '_data':
            #                                                          estimands[i].lower() + '_data'})
            save_dir = get_save_dir(topic, estimands[i], estimators[j])
            model.predict(new_df=matched_second_data, save_dir=save_dir, unseen=False)
    return
Example #4
def step_two_crosswalking(topic, version, raw, sups, estimators_array, estimators, estimands, me_name_df):
    """ Does the essential crosswalking part of this library. """

    estimand_data = raw.copy(deep=True)
    print(raw.head())
    estimand_data = estimand_data[estimand_data.estimator == estimands[0].lower()]
    estimand_data = estimand_data.rename(columns={'data': estimands[0].lower() + '_data'})
    estimand_data = estimand_data.merge(sups)
    estimand_data['estimator'] = estimands[0].lower()
    print("The estimand data are: ")
    print(estimand_data.head())

    if estimand_data.shape[0] == 0:
        raise ValueError("Oops! No estimand found in the original data set. Did you mean to look for a match for " +
                         estimands[0] + "?")

    cols = og_feature_cols + preserved_cols + ['{}_data'.format(estimands[0].lower())] + required_columns
    cols = list(set(cols))
    estimand_data = estimand_data[cols]

    estimators_array[0]['predictions']['estimator'] = estimators[0].lower()
    estimators_array[0]['predictions'] = estimators_array[0]['predictions'].merge(me_name_df)
    print("The predictions data are: ")
    print(estimators_array[0]['predictions'].head())

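    # Treat the first estimator's predictions as estimand data so they can be appended to the raw estimand rows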
    total = estimators_array[0]['predictions'].rename(columns={'{}_prediction'.format(algorithm):
                                                               estimands[0].lower() + '_data'})

    print("Checking for null values in " + estimators[0].lower() + '_data')
    print(total[estimators[0].lower() + '_data'].head())
    if total[estimators[0].lower() + '_data'].isnull().any():
        raise ValueError("Oops! You have missing values in the " + estimators[0].lower() + "_data column!")

    assert len(set(total.columns)) == len(total.columns)
    assert len(set(estimand_data.columns)) == len(estimand_data.columns)
    total = pd.concat([total, estimand_data])

    if len(estimators) > 2 or (len(estimators) > 1 and estimands[0].lower() != estimators[1].lower()):
        total = combine_all(estimators_array, estimands, estimators, topic, estimand_data, me_name_df, total)

    print(total.head())
    print(total.describe())
    print(total[[estimands[0].lower() + '_data', estimators[0].lower() + '_data']].head())

    # Get save directory
    save_dir = get_save_dir(topic=topic, estimand=estimands[0], version=version, estimator=None)
    file_path = save_dir + '{}_combined_predictions.csv'.format(algorithm)

    print("Printing all of the predictions (combined) to " + file_path)
    print("\n\n")
    total.to_csv(file_path)
    return total
Example #5
def pull_unseen(file_path):
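    """ Hybridizes each draw of unseen predictions, carrying seq through as crosswalk_parent_seq,
    and writes one CSV per draw. """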

    if unseen_predictions:
        new_common_cols = common_cols + hybrid_preserved_columns

        # ast.literal_eval is safer than eval for parsing the list of paths from the config
        paths_to_new_data = ast.literal_eval(str(config[topic]['paths_to_new_data']))

        for draw in range(len(paths_to_new_data)):
            save_dir = get_save_dir(topic=topic,
                                    estimand=estimand,
                                    estimator=estimator,
                                    version=version)
            unseen_csv_name = 'unseen_predictions_draw{}'.format(draw)

            if len(paths_to_new_data) > 1:
                save_dir += 'draws/'

            new_total = get_total(other_algorithms, unseen_csv_name,
                                  target_col, estimator, estimator_col,
                                  new_common_cols, save_dir)

            new_predictions = get_mean(other_algorithms, unseen_csv_name,
                                       target_col, estimator_col,
                                       new_common_cols, new_total, estimators,
                                       estimands)

            new_predictions = new_predictions.drop(target_col, axis=1)
            for alg in other_algorithms:
                new_predictions = new_predictions.drop(alg, axis=1)

            new_predictions = new_predictions.rename(
                columns={
                    'hybrid_prediction': 'value',
                    'seq': 'crosswalk_parent_seq'
                })

            file_path = save_dir + 'hybridized_unseen_predictions_draw{}.csv'.format(
                draw)
            new_predictions.to_csv(file_path, index=False)
Example #6
def predict_on_unseen_data(topic, estimator, estimand, version, og_feature_cols, og_categorical_cols,
                           me_name_df, covariates, required_columns, estimators_object, path_to_new_data):
    """
    Generate predictions on unseen data

    :param feature_cols:
    :param categorical_cols:

    :return:
    """

    feature_cols = list(og_feature_cols)
    categorical_cols = list(og_categorical_cols)
    unseen_raw = pd.read_csv(drives().j + path_to_new_data)

    save_dir = get_save_dir(topic=topic, estimand=estimand, estimator=estimator, version=version)
    if len(paths_to_new_data) > 1:
        save_dir += 'draws/'

        if not os.path.exists(save_dir):
            try:
                os.makedirs(save_dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

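    # Add data column to unseen_raw if it's not already present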
    if 'data' not in unseen_raw.columns:
        unseen_raw['data'] = None

    print(unseen_raw.shape)
    preserved_cols = unseen_raw.columns.tolist()
    unseen_merged = merge_data(unseen_raw, topic=topic,
                               estimator=estimator,
                               me_name_df=me_name_df,
                               feature_cols=feature_cols,
                               categorical_cols=categorical_cols,
                               estimand=estimand,
                               covariates=covariates,
                               preserved_cols=preserved_cols,
                               required_cols=required_columns,
                               cov_dir='FILEPATH')
    print(unseen_merged.shape)

    oh_unseen_merged = one_hot_encoder(matched_df=unseen_merged, feature_cols=feature_cols,
                                       categorical_cols=categorical_cols)

    print(oh_unseen_merged.shape)

    unseen_estimator_data = get_estimator_data(merged_df=oh_unseen_merged, estimator=estimator,
                                               feature_cols=feature_cols)
    print(unseen_estimator_data.shape)
    print(unseen_estimator_data.isnull().any())
    print(unseen_estimator_data.head())
    print(estimator)
    print(estimand)
    print(estimators_object['model'].estimator)
    print(estimators_object['model'].estimand)

    if estimators_object['model'].estimator.lower() != estimators_object['model'].estimand.lower():
        unseen_estimator_data = unseen_estimator_data[unseen_estimator_data[estimator.lower() + '_data'].notnull()]
    # assert estimator == estimators_object['model'].estimator
    # assert estimand == estimators_object['model'].estimand
    print(unseen_estimator_data.shape)

    new_preds = estimators_object['model'].predict(new_df=unseen_estimator_data, save_dir=save_dir, unseen=True,
                                                   draw_number=i)
    print(new_preds.columns.tolist())
    print(new_preds[estimator.lower() + '_data'].head())
Example #7
def step_one_training(topic, version, algorithm, covariates, estimator, estimand, feature_cols, me_name_df,
                      categorical_cols):
    """ Matches data, performs one-hot encoding if necessary, trains a machine-learning model, and runs
    the model and graphs. Returns several objects along the way. """

    save_dir = get_save_dir(topic=topic, estimand=estimand, estimator=estimator, version=version)

    print('Currently working on predicting {} data using {} data.\n'.format(estimand, estimator))

    merged = merge_data(raw, topic=topic,
                        estimator=estimator,
                        me_name_df=me_name_df,
                        feature_cols=feature_cols,
                        categorical_cols=categorical_cols,
                        covariates=covariates,
                        estimand=estimand,
                        preserved_cols=preserved_cols,
                        required_cols=required_columns,
                        cov_dir='FILEPATH')

    oh_merged = one_hot_encoder(matched_df=merged, feature_cols=feature_cols,
                                categorical_cols=categorical_cols)

    print(feature_cols)
    estimator_data = get_estimator_data(merged_df=oh_merged,
                                        estimator=estimator,
                                        feature_cols=feature_cols)
    if estimator.lower() != estimand.lower():
        matched = match_data(merged, estimator, estimator_data, estimand, save_dir, og_feature_cols, required_columns,
                             covariates)
    else:  # Sometimes we are training the data on ALL the estimand data without any estimator data
        matched = estimator_data
        feature_cols.remove(estimator.lower() + '_data')  # Without removing this column, our target is in our features

    # Skip if no common data points
    _validate_matched_dataset(matched, estimand, estimator)

    model = train_model(matched=matched, algorithm=algorithm, estimator=estimator,
                        estimand=estimand, feature_cols=feature_cols,
                        save_dir=save_dir)

    graphing_df = build_graphing_dataset(model, me_name_df, og_categorical_cols)

    predictions = run_model_and_graph(estimator_data=estimator_data, graphing_df=graphing_df,
                                      model=model, algorithm=algorithm,
                                      save_dir=save_dir)

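    # Diagnostic output: data shapes at each stage of the pipeline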
    print("Raw shape: ")
    print(raw.shape)
    print("Merged shape: ")
    print(merged.shape)
    print("oh_merged shape: ")
    print(oh_merged.shape)
    print("Estimator data shape: ")
    print(estimator_data.shape)
    print("Matched shape: ")
    print(matched.shape)
    print(matched.year_id.unique())
    print(matched.age_group_id.unique())
    print("X_train shape: ")
    print(model.X_train.shape)
    print("X_test shape: ")
    print(model.X_test.shape)
    print("Graphing df shape: ")
    print(graphing_df.shape)
    print("Predictions shape: ")
    print(predictions.shape)
    print("\n")

    return {'merged': merged, 'estimator_data': estimator_data, 'matched': matched, 'model': model,
            'predictions': predictions, 'graphing_df': graphing_df}
Example #8
def combine_all(estimators_array, estimands, estimators, topic, estimand_data, me_name_df, total):
    """

    :param estimators_array:
    :param estimands:
    :param estimators:
    :param topic:
    :param estimand_data:
    :param me_name_df:
    :return:
    """

    for i in range(1, len(estimators_array)):
        if estimators_array[i]['model']:
            estimand = estimands[0].lower()
            print('Estimand is ' + estimand)
            first_estimator = estimators[0].lower()
            print('First estimator is ' + first_estimator)
            second_estimator = estimators[i].lower()
            print('Second estimator is ' + second_estimator)
            print('First target col is ' + estimand)
            second_estimand = estimands[i].lower()
            print('Second target col is ' + second_estimand)
            first_model = estimators_array[0]['model']
            second_predictions = estimators_array[i]['predictions']

            save_dir = get_save_dir(topic=topic, estimand=estimand, estimator=second_estimator, version=version)
            save_dir += 'through_' + second_estimand.lower() + '/'

            if not os.path.exists(save_dir):
                try:
                    os.makedirs(save_dir)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise

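            # Feed the second estimator's predictions into the first model under the first estimator's data column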
            new_df = second_predictions.rename(columns={'{}_prediction'.format(algorithm):
                                                        first_estimator.lower() + '_data'})
            print(new_df.head())

            new_predictions = first_model.predict(new_df=new_df, save_dir=save_dir)
            print(new_predictions.columns)

            estimand_data = estimand_data.rename(columns={estimand.lower() + '_data': 'actual'})
            print(estimand_data.columns)

            join_cols = list(set(og_feature_cols + required_columns))
            join_cols.remove('estimator')

            new_graphing_df = new_predictions.merge(estimand_data, how='inner', on=join_cols,
                                                    suffixes=('_predictions', '_actual'))

            super_regions_df = SharedLabels().super_regions()
            super_regions_df = super_regions_df.drop('location_id', axis=1)
            new_graphing_df = new_graphing_df.merge(super_regions_df)

            new_graphing_df = new_graphing_df.rename(columns={'{}_prediction'.format(algorithm): 'predicted'})
            print(new_graphing_df.head())
            print(new_graphing_df.shape)

            first_model.estimator = second_estimator

            metrics_obj = Metrics(new_graphing_df, algorithm=algorithm, model=first_model, save_dir=save_dir)
            metrics_obj.get_indiv_metrics()
            metrics_obj.print_facet_plots()

            new_predictions['estimator'] = estimators[i].lower()
            new_predictions = new_predictions.rename(columns={'{}_prediction'.format(algorithm):
                                                               estimand.lower() + '_data'})

            new_predictions = new_predictions.merge(me_name_df)
            print(new_predictions.columns)
            print(total.columns)

            total = pd.concat([total, new_predictions])
    return total
Example #9
def make_st_gpr_plots(topic, version, sups, estimand, loc_id, me, total,
                      algorithm):
    """ Graphs all predictions (including original data) in ST-GPR style. """

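    # Merge super-region labels and keep only the columns needed for the time-trend panels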
    to_graph = total.merge(sups).sort_values('me_name')
    to_graph = to_graph[[
        'age_group_id', 'location_id', 'super_region_id', 'sex_id', 'me_name',
        'year_id', '{}_data'.format(estimand.lower()), 'estimator'
    ]]

    # Select a few countries for which to make st-gpr plots
    to_graph = to_graph[(to_graph.location_id == loc_id)
                        & (to_graph.me_name == me)]

    # Not every country has all modelable entities
    if len(to_graph) == 0:
        return

    to_graph = to_graph[~(to_graph.age_group_id == 21)
                        & (to_graph.age_group_id <= 30)]
    if not to_graph.empty:

        plt.figure()
        g = sns.FacetGrid(to_graph,
                          col='age_group_id',
                          hue='estimator',
                          col_wrap=4,
                          height=5,
                          sharex=False,
                          sharey=False)
        g = g.map(plt.scatter,
                  'year_id',
                  '{}_data'.format(estimand.lower()),
                  marker='o',
                  alpha=0.5,
                  facecolors='none',
                  linewidth=1.5)
        g.add_legend(title='Data Type')
        g.fig.subplots_adjust(top=.85)  # previously .9
        g.fig.suptitle(
            'Observed and Estimated {} Data Time Trends\nfor {}\nin Location ID {}\n Using {}, Version {}'
            .format(estimand, me, loc_id, algorithm.upper(), version),
            fontsize=16)


        save_dir = get_save_dir(topic=topic, estimand=estimand, estimator=None, version=version) + \
            'st_gpr_plots/' + algorithm + '/'

        if not os.path.exists(save_dir):
            try:
                os.makedirs(save_dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

        file_path = save_dir + 'location_{}_me_{}.png'.format(loc_id, me)
        print("\n\n")
        print('Saving figure to {}'.format(file_path))
        plt.savefig(file_path)
        plt.show()
        plt.clf()
    else:
        print("Your graphing dataframe is empty!")

    return
Example #10
def predict_data_two_step(topic, algorithm, version, estimand_data,
                          first_estimator, second_estimator, estimand,
                          first_estimand, second_estimand, first_model,
                          second_predictions):
    """ This function is essential in the "crosswalk" two-step modeling process. It takes the predictions generated from
    the second model and feeds them into the first model. It returns a dataFrame containing new predictions from the
    two-step model

    Parameters
    ----------
    topic :
        a string declaring the risk topic of research
    algorithm :
        a string declaring the type of machine-learning algorithm being used
    version :
        a string declaring the model version (used for saving)
    estimand_data :
        a DataFrame of the target data
    first_estimator :
        a string declaring the estimator used in the first model
    second_estimator :
        same as above, but for the second model
    estimand :
        a string declaring the thing being estimated
    first_estimand :
        a string declaring the target column used in the first model
    second_estimand :
        same as above, but for the second model
    first_model :
        the first machine-learning model used in the two-step process
    second_predictions :
        a DataFrame resulting from generating predictions in the second model

    Returns
    -------
    new_predictions :
        a DataFrame of the predictions produced by running the second model's predictions through the first model
    """

    save_dir = get_save_dir(topic=topic,
                            estimand=estimand,
                            estimator=second_estimator,
                            version=version)
    save_dir += 'through_' + first_estimator.lower() + '/'

    if not os.path.exists(save_dir):
        try:
            os.makedirs(save_dir)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

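    # Rename the second model's prediction column before feeding it into the first model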
    new_df = second_predictions.rename(
        columns={'{}_prediction'.format(algorithm): second_estimand})
    print(new_df.head())

    new_predictions = first_model.predict(new_df=new_df, save_dir=save_dir)
    if new_predictions.shape[0] == 0:
        raise ValueError("Oops! You have no predictions.")

    estimand_data = estimand_data.rename(columns={first_estimand: 'actual'})
    new_graphing_df = new_predictions.merge(estimand_data,
                                            how='inner',
                                            suffixes=['_predicted', '_actual'],
                                            on=[
                                                'year_id', 'age_group_id',
                                                'sex_id', 'super_region_id',
                                                'me_name', 'location_id'
                                            ])

    labs = SharedLabels()
    sups = labs.super_regions()
    new_graphing_df = new_graphing_df.merge(sups)

    new_graphing_df = new_graphing_df.rename(
        columns={'{}_prediction'.format(algorithm): 'predicted'})
    print(new_graphing_df.head())

    first_model.estimator = second_estimator
    metrics_obj = Metrics(new_graphing_df,
                          algorithm=algorithm,
                          model=first_model,
                          save_dir=save_dir)
    metrics_obj.get_indiv_metrics()
    metrics_obj.print_facet_plots()
    return new_predictions
Example #11
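    # Load the pickled estimators object for this algorithm and estimator; SGE_TASK_ID selects the draw to process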
    with open(
            "FILEPATH/v" + version + "/estimators_obj_" + algorithm + "_" +
            estimator + ".pkl", 'rb') as f:
        estimators_object = pickle.load(f)
    draw_num = int(os.getenv("SGE_TASK_ID")) - 1
    print(draw_num)
    path_to_new_data = paths_to_new_data[draw_num]
    print(path_to_new_data)

    feature_cols = list(og_feature_cols)
    categorical_cols = list(og_categorical_cols)
    unseen_raw = pd.read_csv(path_to_new_data)

    # Get output directory filepath
    save_dir = get_save_dir(topic=topic,
                            estimand=estimand,
                            estimator=estimator,
                            version=version)
    if len(paths_to_new_data) > 1:
        save_dir += 'draws/'

        if not os.path.exists(save_dir):
            try:
                os.makedirs(save_dir)
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise

    # Add data column to unseen_raw if it's not already present
    if 'data' not in unseen_raw.columns:
        unseen_raw['data'] = None