def predict_prob():
    """Load the trained pipeline and return funding probabilities for the
    scraped prediction data.

    Reads the pickled model ``<prediction_model>_model.pkl`` from the
    saved-data directory, preprocesses ``predict_scraped.csv`` from the
    query-data directory, and returns the pipeline's ``predict_proba``
    output for that frame.
    """
    # sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
    # in 0.23 — use the standalone joblib package, falling back only for very
    # old environments where it is unavailable.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    data_par = Data()
    cols_process = data_par.cols_process
    cols_output = data_par.cols_output
    valid_status = data_par.valid_status
    dir_to_saved_data = data_par.dir_to_saved_data
    dir_to_query_data = data_par.dir_to_query_data
    pipeline_type = data_par.prediction_model

    saved_data_dir = Path(dir_to_saved_data)
    trained_model_filename = saved_data_dir.joinpath(pipeline_type + "_model.pkl")
    pipeline = joblib.load(trained_model_filename)
    # %%
    input_data_filename = Path(dir_to_query_data, "predict_scraped.csv")
    df = pd.read_csv(input_data_filename, usecols=cols_process)
    df = preprocess_train_df(df, valid_status, cols_output, predict=True)
    prob = pipeline.predict_proba(df)
    return prob
# NOTE(review): this collapsed line holds two distinct pieces: (1) the TAIL of a
# function whose `def` header is not visible in this chunk — it builds a
# FeatureName/FeatureValueFunded frame, merges in the matching expired-campaign
# stats on "FeatureName", writes the result to `path_to_output_df`, and returns
# it; and (2) after the `# %% ###########` marker, MODULE-LEVEL script setup
# that pulls config from Data() (column lists, status filter, data directories,
# training-data path, the stat name to select), sets predict = False, names the
# three stats output CSVs (tags / loanuse / desc), and loads the training CSV
# restricted to cols_process.
# TODO(review): the file appears whitespace-mangled (statements fused onto one
# physical line); restore original line breaks before further edits — the
# missing `def` header must be recovered from version control first.
funded_stat.columns = ["FeatureName", "FeatureValueFunded"] expired_stat = df_expired_described.loc[stat_name_select] expired_stat = expired_stat.to_frame() expired_stat.reset_index(level=0, inplace=True) expired_stat.columns = ["FeatureName", "FeatureValueExpired"] funded_stat = funded_stat.merge(expired_stat, on="FeatureName") funded_stat.to_csv(path_to_output_df, index=False) return funded_stat # %% ########### data_par = Data() cols_process = data_par.cols_process cols_output = data_par.cols_output valid_status = data_par.valid_status dir_to_saved_data = data_par.dir_to_saved_data dir_to_query_data = data_par.dir_to_query_data path_to_training_data = data_par.path_to_training_data stat_name_select = data_par.stat_name_select predict = False csv_name_tags = "stats_tags_df.csv" csv_name_loanuse = "stats_loanuse_df.csv" csv_name_desc = "stats_desc_df.csv" df = pd.read_csv(path_to_training_data, usecols=cols_process)
def plot_factors(loan_id):
    """Build comparison strip plots and text suggestions for one loan campaign.

    For the scraped campaign identified by `loan_id`, compares its statistical
    text features (tags, loan use, description) against funded-campaign
    averages, renders one strip plot per feature group, and returns the image
    names together with per-feature text suggestions.

    Parameters
    ----------
    loan_id : identifier used to name the per-campaign output CSVs and plots.

    Returns
    -------
    tuple of (img_name_desc, img_name_loanuse, img_name_tags,
              desc_text_words, desc_text_sentences, desc_text_paragraphs,
              loanuse_text_words, tags_text_words, tags_text_hashtags)
    """
    data_par = Data()
    cols_process = data_par.cols_process
    cols_output = data_par.cols_output
    valid_status = data_par.valid_status
    dir_to_saved_data = data_par.dir_to_saved_data
    dir_to_query_data = data_par.dir_to_query_data
    # %%
    predict = True
    N_top_features = 6
    scraped_filename = Path(
        dir_to_query_data, "predict_scraped.csv")  # input - predicted from scraping
    df = pd.read_csv(scraped_filename, usecols=cols_process)
    df = preprocess_train_df(df, valid_status, cols_output, predict)
    fit_stats(dir_to_saved_data, df)  # returns all 3 transformers
    stats_tags_df, stats_loanuse_df, stats_desc_df = transform_stats(
        dir_to_saved_data, df)

    # tags
    # input
    feature_stats_filename_tags = Path(
        dir_to_saved_data, "funded_stats_tags_df.csv")  # input - mean stats
    coefficient_ranking_filename_tags = Path(
        dir_to_saved_data, "coefs_stats_df_tags.csv")  # input - coefficients
    # output
    predicted_features_filename_tags = str(
        loan_id) + "_predict_ranked_tags.csv"
    top_feature_compared_filename_tags = Path(
        dir_to_query_data,
        str(loan_id) + "_predict_compared_tags.csv")  # input - predicted from scraping

    # loanuse
    # input
    feature_stats_filename_loanuse = Path(
        dir_to_saved_data, "funded_stats_loanuse_df.csv")  # input - mean stats
    coefficient_ranking_filename_loanuse = Path(
        dir_to_saved_data, "coefs_stats_df_loanuse.csv")  # input - coefficients
    # output
    predicted_features_filename_loanuse = str(
        loan_id) + "_predict_ranked_loanuse.csv"
    top_feature_compared_filename_loanuse = Path(
        dir_to_query_data,
        str(loan_id) + "_predict_compared_loanuse.csv")  # input - predicted from scraping

    # desc
    # input
    feature_stats_filename_desc = Path(
        dir_to_saved_data, "funded_stats_desc_df.csv")  # input - mean stats
    coefficient_ranking_filename_desc = Path(
        dir_to_saved_data, "coefs_stats_df_desc.csv")  # input - coefficients
    # output
    predicted_features_filename_desc = str(
        loan_id) + "_predict_ranked_desc.csv"
    top_feature_compared_filename_desc = Path(
        dir_to_query_data,
        str(loan_id) + "_predict_compared_desc.csv")  # input - predicted from scraping

    # transforming predicted statistical features
    save_transformed_stats(dir_to_query_data, stats_tags_df,
                           predicted_features_filename_tags)
    save_transformed_stats(dir_to_query_data, stats_loanuse_df,
                           predicted_features_filename_loanuse)
    save_transformed_stats(dir_to_query_data, stats_desc_df,
                           predicted_features_filename_desc)
    # %%
    top_features_tags, top_features_long_tags = get_top_features(
        N_top_features,
        dir_to_query_data,
        predicted_features_filename_tags,
        feature_stats_filename_tags,
        coefficient_ranking_filename_tags,
        top_feature_compared_filename_tags,
    )
    top_features_loanuse, top_features_long_loanuse = get_top_features(
        N_top_features,
        dir_to_query_data,
        predicted_features_filename_loanuse,
        feature_stats_filename_loanuse,
        coefficient_ranking_filename_loanuse,
        top_feature_compared_filename_loanuse,
    )
    top_features_desc, top_features_long_desc = get_top_features(
        N_top_features,
        dir_to_query_data,
        predicted_features_filename_desc,
        feature_stats_filename_desc,
        coefficient_ranking_filename_desc,
        top_feature_compared_filename_desc,
    )

    # normalize data
    print(top_features_tags)
    print(top_features_loanuse)
    print(top_features_desc)
    top_features_tags = normalize_df(top_features_tags)
    top_features_loanuse = normalize_df(top_features_loanuse)
    top_features_desc = normalize_df(top_features_desc)
    print(top_features_tags.columns)

    # select top features
    top_features_tags = top_features_tags.loc[top_features_tags["Info"].isin(
        ["num_words", "num_hashtags"])]
    top_features_loanuse = top_features_loanuse.loc[
        top_features_loanuse["Info"].isin(["num_words"])]
    top_features_desc = top_features_desc.loc[top_features_desc["Info"].isin(
        ["num_words", "num_sentences", "num_paragraphs"])]

    # make suggestion
    desc_text_words = text_suggestion(top_features_desc, "num_words")
    desc_text_sentences = text_suggestion(top_features_desc, "num_sentences")
    desc_text_paragraphs = text_suggestion(top_features_desc, "num_paragraphs")
    loanuse_text_words = text_suggestion(top_features_loanuse, "num_words")
    tags_text_words = text_suggestion(top_features_tags, "num_words")
    # BUG FIX: previously queried "num_words" a second time, so the hashtag
    # suggestion duplicated the word-count suggestion; the tags frame was
    # filtered to ["num_words", "num_hashtags"] above and the hashtag row is
    # the intended source here.
    tags_text_hashtags = text_suggestion(top_features_tags, "num_hashtags")

    # human-readable feature labels for the plots
    top_features_tags = top_features_tags.replace({
        "num_words": "number of words",
        "num_hashtags": "number of hashtags"
    })
    top_features_loanuse = top_features_loanuse.replace(
        {"num_words": "number of words"})
    top_features_desc = top_features_desc.replace({
        "num_words": "number of words",
        "num_sentences": "number of sentences",
        "num_paragraphs": "number of paragraphs",
    })
    print(top_features_tags)
    print(top_features_loanuse)
    print(top_features_desc)

    # reshape wide -> long for seaborn-style strip plotting
    top_features_long_tags = pd.melt(
        top_features_tags,
        id_vars="Info",
        var_name="Campaigns",
        value_name="Performance",
    )
    top_features_long_loanuse = pd.melt(
        top_features_loanuse,
        id_vars="Info",
        var_name="Campaigns",
        value_name="Performance",
    )
    top_features_long_desc = pd.melt(
        top_features_desc,
        id_vars="Info",
        var_name="Campaigns",
        value_name="Performance",
    )

    # plot layout constants (per-group legend placement and figure size)
    fig_dpi = 400
    marker_size = 7
    legend_coods_tags = (0.3, 1.45)
    legend_coods_loanuse = (0.3, 1.65)
    legend_coods_desc = (0.3, 1.35)
    fig_size_tags = (3.5, 0.85)
    fig_size_loanuse = (3.5, 0.65)
    fig_size_desc = (3.5, 1)
    img_name_tags = make_stripplot(fig_size_tags, fig_dpi, legend_coods_tags,
                                   marker_size, "Campaign_tags.png",
                                   top_features_long_tags, loan_id)
    img_name_loanuse = make_stripplot(fig_size_loanuse, fig_dpi,
                                      legend_coods_loanuse, marker_size,
                                      "Campaign_loanuse.png",
                                      top_features_long_loanuse, loan_id)
    img_name_desc = make_stripplot(fig_size_desc, fig_dpi, legend_coods_desc,
                                   marker_size, "Campaign_desc.png",
                                   top_features_long_desc, loan_id)
    return (
        img_name_desc,
        img_name_loanuse,
        img_name_tags,
        desc_text_words,
        desc_text_sentences,
        desc_text_paragraphs,
        loanuse_text_words,
        tags_text_words,
        tags_text_hashtags,
    )