Пример #1
0
def predict_prob():
    """Score the scraped query data with the persisted prediction pipeline.

    Settings are read from a fresh ``Data()`` instance. The pipeline is
    loaded from ``<prediction_model>_model.pkl`` in the saved-data
    directory, and ``predict_scraped.csv`` from the query-data directory
    is preprocessed with ``preprocess_train_df(..., predict=True)``
    before being passed to ``predict_proba``.

    Returns:
        Whatever ``pipeline.predict_proba`` returns for the preprocessed
        frame.
    """
    # NOTE(review): `sklearn.externals.joblib` was deprecated in
    # scikit-learn 0.21 and removed in 0.23 -- confirm the pinned
    # scikit-learn version still ships it.
    from sklearn.externals import joblib

    settings = Data()
    usecols = settings.cols_process
    output_cols = settings.cols_output
    statuses = settings.valid_status
    saved_dir = settings.dir_to_saved_data
    query_dir = settings.dir_to_query_data
    model_kind = settings.prediction_model

    # Trained pipeline persisted as "<model kind>_model.pkl".
    model_path = Path(saved_dir) / (model_kind + "_model.pkl")
    pipeline = joblib.load(model_path)

    # %%
    # Only the columns listed in `cols_process` are read from the CSV.
    query_csv = Path(query_dir) / "predict_scraped.csv"
    features = pd.read_csv(query_csv, usecols=usecols)
    features = preprocess_train_df(
        features, statuses, output_cols, predict=True)

    return pipeline.predict_proba(features)
Пример #2
0
    funded_stat.columns = ["FeatureName", "FeatureValueFunded"]

    expired_stat = df_expired_described.loc[stat_name_select]
    expired_stat = expired_stat.to_frame()
    expired_stat.reset_index(level=0, inplace=True)
    expired_stat.columns = ["FeatureName", "FeatureValueExpired"]

    funded_stat = funded_stat.merge(expired_stat, on="FeatureName")

    funded_stat.to_csv(path_to_output_df, index=False)

    return funded_stat


# %% ###########
# Script cell: copy the shared settings off a Data() instance into
# module-level names, then load the training CSV.
data_par = Data()
cols_process = data_par.cols_process
cols_output = data_par.cols_output
valid_status = data_par.valid_status
dir_to_saved_data = data_par.dir_to_saved_data
dir_to_query_data = data_par.dir_to_query_data
path_to_training_data = data_par.path_to_training_data
stat_name_select = data_par.stat_name_select

# NOTE(review): presumably the `predict` flag later handed to
# preprocess_train_df (training mode here) -- confirm against the code
# that follows this cell.
predict = False

# CSV file names for the tags / loan-use / description statistics.
# NOTE(review): presumably written under dir_to_saved_data -- confirm.
csv_name_tags = "stats_tags_df.csv"
csv_name_loanuse = "stats_loanuse_df.csv"
csv_name_desc = "stats_desc_df.csv"

# Only the columns listed in `cols_process` are read from the CSV.
df = pd.read_csv(path_to_training_data, usecols=cols_process)
Пример #3
0
def plot_factors(loan_id):
    """Create the comparison strip plots and text suggestions for one loan.

    Reads ``predict_scraped.csv`` from the query-data directory, runs it
    through ``preprocess_train_df``, ``fit_stats`` and ``transform_stats``
    and then, for each of the three text fields (``tags``, ``loanuse``,
    ``desc``):

    1. saves the transformed statistics with ``save_transformed_stats``,
    2. ranks them with ``get_top_features``,
    3. normalizes them with ``normalize_df``,
    4. keeps a fixed subset of rows selected on the ``"Info"`` column,
    5. derives text suggestions with ``text_suggestion``,
    6. relabels the feature names, melts the frame to long format and
       draws it with ``make_stripplot``.

    Args:
        loan_id: Identifier of the loan. It is stringified to prefix the
            per-loan CSV file names and is also passed to
            ``make_stripplot``.

    Returns:
        A 9-tuple, in this order: the ``make_stripplot`` results for
        desc, loanuse and tags, followed by the ``text_suggestion``
        results for desc words, desc sentences, desc paragraphs,
        loanuse words, tags words and tags hashtags.
    """
    data_par = Data()
    cols_process = data_par.cols_process
    cols_output = data_par.cols_output
    valid_status = data_par.valid_status
    dir_to_saved_data = data_par.dir_to_saved_data
    dir_to_query_data = data_par.dir_to_query_data

    n_top_features = 6
    fig_dpi = 400
    marker_size = 7

    # Processing order of the three text fields; a tuple keeps the order
    # explicit regardless of dict ordering guarantees.
    kinds = ("tags", "loanuse", "desc")

    # Rows (values of the "Info" column) kept for each text field.
    kept_features = {
        "tags": ["num_words", "num_hashtags"],
        "loanuse": ["num_words"],
        "desc": ["num_words", "num_sentences", "num_paragraphs"],
    }
    # Human-readable labels substituted for the raw feature names.
    feature_labels = {
        "num_words": "number of words",
        "num_hashtags": "number of hashtags",
        "num_sentences": "number of sentences",
        "num_paragraphs": "number of paragraphs",
    }
    # Per-field plot settings: (figure size, legend coordinates, image name).
    plot_settings = {
        "tags": ((3.5, 0.85), (0.3, 1.45), "Campaign_tags.png"),
        "loanuse": ((3.5, 0.65), (0.3, 1.65), "Campaign_loanuse.png"),
        "desc": ((3.5, 1), (0.3, 1.35), "Campaign_desc.png"),
    }

    # %%
    # Input: the scraped loan(s) to be scored.
    scraped_filename = Path(dir_to_query_data, "predict_scraped.csv")
    df = pd.read_csv(scraped_filename, usecols=cols_process)
    df = preprocess_train_df(df, valid_status, cols_output, True)

    fit_stats(dir_to_saved_data, df)
    # transform_stats returns one frame per text field.
    stats_tags_df, stats_loanuse_df, stats_desc_df = transform_stats(
        dir_to_saved_data, df)
    stats_frames = {
        "tags": stats_tags_df,
        "loanuse": stats_loanuse_df,
        "desc": stats_desc_df,
    }

    # Save the transformed statistics of the query data, one CSV per field.
    ranked_filenames = {
        kind: str(loan_id) + "_predict_ranked_" + kind + ".csv"
        for kind in kinds
    }
    for kind in kinds:
        save_transformed_stats(dir_to_query_data, stats_frames[kind],
                               ranked_filenames[kind])

    # %%
    top_features = {}
    for kind in kinds:
        # The second return value (a long-format frame) is discarded: the
        # long format is rebuilt below, after filtering and relabelling.
        top_features[kind], _ = get_top_features(
            n_top_features,
            dir_to_query_data,
            ranked_filenames[kind],
            # input - mean stats
            Path(dir_to_saved_data, "funded_stats_" + kind + "_df.csv"),
            # input - coefficients
            Path(dir_to_saved_data, "coefs_stats_df_" + kind + ".csv"),
            # per-loan comparison file
            Path(dir_to_query_data,
                 str(loan_id) + "_predict_compared_" + kind + ".csv"),
        )

    # Diagnostic output of the raw rankings.
    for kind in kinds:
        print(top_features[kind])

    # normalize data
    for kind in kinds:
        top_features[kind] = normalize_df(top_features[kind])

    print(top_features["tags"].columns)

    # select top features
    for kind in kinds:
        frame = top_features[kind]
        top_features[kind] = frame.loc[
            frame["Info"].isin(kept_features[kind])]

    # make suggestion
    desc_text_words = text_suggestion(top_features["desc"], "num_words")
    desc_text_sentences = text_suggestion(top_features["desc"],
                                          "num_sentences")
    desc_text_paragraphs = text_suggestion(top_features["desc"],
                                           "num_paragraphs")
    loanuse_text_words = text_suggestion(top_features["loanuse"],
                                         "num_words")
    tags_text_words = text_suggestion(top_features["tags"], "num_words")
    # The hashtag suggestion must be driven by the "num_hashtags" row;
    # using "num_words" here would just duplicate tags_text_words.
    tags_text_hashtags = text_suggestion(top_features["tags"],
                                         "num_hashtags")

    # Swap raw feature names for display labels, limited to the names
    # kept for each field.
    for kind in kinds:
        top_features[kind] = top_features[kind].replace(
            {name: feature_labels[name] for name in kept_features[kind]})

    # Diagnostic output of the filtered, relabelled frames.
    for kind in kinds:
        print(top_features[kind])

    # Long format: one row per ("Info", campaign column) pair.
    long_frames = {
        kind: pd.melt(
            top_features[kind],
            id_vars="Info",
            var_name="Campaigns",
            value_name="Performance",
        )
        for kind in kinds
    }

    image_names = {}
    for kind in kinds:
        fig_size, legend_coords, image_file = plot_settings[kind]
        image_names[kind] = make_stripplot(fig_size, fig_dpi, legend_coords,
                                           marker_size, image_file,
                                           long_frames[kind], loan_id)

    return (
        image_names["desc"],
        image_names["loanuse"],
        image_names["tags"],
        desc_text_words,
        desc_text_sentences,
        desc_text_paragraphs,
        loanuse_text_words,
        tags_text_words,
        tags_text_hashtags,
    )