def histogram_with_distributions(ax: plt.Axes, series: pd.Series, var: str):
    """Draw a density histogram of ``series`` on ``ax`` with fitted curves on top.

    Args:
        ax: Axes that receives both the histogram and the line chart.
        series: Observations to plot; they are sorted ascending before use.
        var: Variable name, used in the chart title and as the x-axis label.
    """
    ordered = series.sort_values().values

    # 20 bins, normalised to a density (density=True).
    ax.hist(ordered, 20, density=True)

    # compute_known_distributions is defined elsewhere; its result is handed
    # to the line chart as the set of series to draw against the sorted values.
    ds.multiple_line_chart(
        ordered,
        compute_known_distributions(ordered),
        ax=ax,
        title='Best fit for %s' % var,
        xlabel=var,
        ylabel='',
    )
def analyse_per_metric(rules: pd.DataFrame, metric: str, metric_values: list, save_dir: str) -> list:
    """Chart how rule quality evolves as the minimum ``metric`` threshold grows.

    For each threshold in ``metric_values`` the rules whose ``metric`` column is
    greater than or equal to the threshold are kept. For the kept rules the mean
    confidence, lift and leverage are recorded for three groups:

    * ``'avg'``    - every kept rule,
    * ``'top25%'`` - the ``int(0.25 * n)`` rules with the largest value of the
      measure being averaged (empty, hence a NaN mean, when fewer than four
      rules are kept),
    * ``'top10'``  - the ten rules with the largest value of that measure.

    The three measures are drawn on a 2x2 figure (the fourth panel is left
    unused) and the figure is saved under ``save_dir``. The top-10 rules of the
    last threshold are then passed to ``plot_top_rules`` for each measure.

    Args:
        rules: Association rules; must contain the ``metric`` column as well as
            ``'confidence'``, ``'lift'`` and ``'leverage'``.
        metric: Name of the column used for thresholding.
        metric_values: Thresholds to evaluate, in plotting order.
        save_dir: Prefix prepended (by plain string concatenation) to the saved
            figure name, and forwarded to ``plot_top_rules``.

    Returns:
        The number of rules kept for each threshold, in ``metric_values`` order.
    """
    print(f'Analyse per {metric}...')

    measures = ('confidence', 'lift', 'leverage')
    stats = {measure: {'avg': [], 'top25%': [], 'top10': []} for measure in measures}
    # Top-10 rules per measure for the most recent threshold. Stays an empty
    # list when metric_values is empty, as in the original implementation.
    top_rules = {measure: [] for measure in measures}
    nr_rules = []

    for threshold in metric_values:
        selected = rules[rules[metric] >= threshold]
        nr_rules.append(len(selected))
        quarter = int(0.25 * len(selected))

        for measure in measures:
            stats[measure]['avg'].append(selected[measure].mean(axis=0))

            best_quarter = selected.nlargest(quarter, measure)
            stats[measure]['top25%'].append(best_quarter[measure].mean(axis=0))

            top_rules[measure] = selected.nlargest(10, measure)
            stats[measure]['top10'].append(top_rules[measure][measure].mean(axis=0))

    _, axs = plt.subplots(2, 2, figsize=(20, 10), squeeze=False)
    panels = {'confidence': axs[0, 0], 'lift': axs[0, 1], 'leverage': axs[1, 0]}
    for measure, panel in panels.items():
        ds.multiple_line_chart(
            metric_values,
            stats[measure],
            ax=panel,
            title=f'Avg {measure.capitalize()} x {metric}',
            xlabel=metric,
            ylabel=f'Avg {measure}',
        )
    plt.savefig(save_dir + 'HFCR Pattern Mining - Association Rules analyse per ' + metric)

    # Use the save_dir argument here: the previous code passed the out-of-scope
    # name `subDir`, ignoring the directory the caller asked for.
    for measure in measures:
        plot_top_rules(top_rules[measure], measure, metric, save_dir)

    return nr_rules
test_acc_values.append(test_accuracy) if yvalues[-1] > last_best: best = (f, d, imp) best_model = (prd_trn_lst, prd_tst_lst) last_best = yvalues[-1] last_best_train = train_acc_values[-1] best_tree = tree values[d] = yvalues overfit_values[f][d] = {} overfit_values[f][d]['train'] = train_acc_values overfit_values[f][d]['test'] = test_acc_values ds.multiple_line_chart(min_impurity_decrease, values, ax=axs[0, k], title='Decision Trees with %s criteria' % f, xlabel='min_impurity_decrease', ylabel='accuracy', percentage=True) if (count == 0): text = key else: text = last_name + ' - ' + key accuracies[text] = [last_best_train, last_best] last_accuracy = last_best print( 'Best results achieved with %s criteria, depth=%d and min_impurity_decrease=%1.5f ==> accuracy=%1.5f' % (best[0], best[1], best[2], last_best)) fig.text( 0.5,
print('HFCR Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB') _, ax = plt.subplots(1, 4, figsize=(10, 3), squeeze=False) ds.multiple_bar_chart(LINKS, values_mse, title=f'Hierarchical MSE', xlabel='metric', ylabel='MSE', ax=ax[0, 0]) ds.multiple_bar_chart(LINKS, values_mae, title=f'Hierarchical MAE', xlabel='metric', ylabel='MAE', ax=ax[0, 1]) ds.multiple_bar_chart(LINKS, values_sc, title=f'Hierarchical SC', xlabel='metric', ylabel='SC', ax=ax[0, 2], percentage=True) ds.multiple_bar_chart(LINKS, values_db, title=f'Hierarchical DB', xlabel='metric', ylabel='DB', ax=ax[0, 3]) plt.suptitle('HFCR Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB') plt.savefig(subDir + 'HFCR Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB') """ ds.multiple_line_chart(N_CLUSTERS, fig_values_1, ax=ax[0, 0], title='K-Means', xlabel='k', ylabel='SC', percentage=True) ds.multiple_line_chart(N_CLUSTERS, fig_values_2, ax=ax[0, 1], title='EM', xlabel='k', ylabel='SC', percentage=True) #ds.multiple_line_chart(EPS, fig_values_3, ax=ax[0, 2], title='EPS', xlabel='k', ylabel='SC', percentage=True) ds.multiple_line_chart(N_CLUSTERS, fig_values_4, ax=ax[0, 2], title='Hierarchical',
test_acc_values.append(test_accuracy) if yvalues[-1] > last_best: best = (d, f, n) last_best = yvalues[-1] last_best_train = train_acc_values[-1] best_tree = rf best_model = (prd_trn_lst, prd_tst_lst) values[f] = yvalues overfit_values[d][f] = {} overfit_values[d][f]['train'] = train_acc_values overfit_values[d][f]['test'] = test_acc_values ds.multiple_line_chart(n_estimators, values, ax=axs[0, k], title='Random Forests with max_depth=%d' % d, xlabel='nr estimators', ylabel='accuracy', percentage=True) text = key if (do_feature_eng): text += ' with FS' best_accuracies[text] = [last_best_train, last_best] print( 'Best results with depth=%d, %1.2f features and %d estimators, with accuracy=%1.2f' % (best[0], best[1], best[2], last_best)) fig.text( 0.5, 0.03, 'Best results with depth=%d, %1.2f features and %d estimators, with accuracy=%1.2f'
best = (n, d) last_best = yvalues[-1] last_train_best = train_accuracy last_best_recall = yvalues_recall[-1] last_train_best_recall = train_recall values[d] = yvalues text = key if (do_feature_eng): text += ' with FS' best_accuracies[text] = [last_train_best, last_best] recalls[text] = [last_train_best_recall, last_best_recall] plt.figure() ds.multiple_line_chart(nvalues, values, title='KNN variants', xlabel='n', ylabel='accuracy', percentage=True) plt.suptitle('QOT KNN - ' + key + ' - parameters') plt.savefig(subDir + 'QOT KNN - ' + key + ' - parameters') print('Best results with %d neighbors and %s' % (best[0], best[1])) plt.figure() fig, axs = plt.subplots(1, len(dist), figsize=(32, 8), squeeze=False) i = 0 for k in range(len(dist)): d = dist[k] ds.multiple_line_chart(nvalues, overfitting_values[d], ax=axs[0, k], title='Overfitting for dist = %s' % (d),
last_best_train, last_best ] best_tree = gb values[lr] = yvalues overfitting_values[max_feat_string][d][lr] = {} overfitting_values[max_feat_string][d][lr][ 'train'] = train_acc_values overfitting_values[max_feat_string][d][lr][ 'test'] = test_acc_values ds.multiple_line_chart( n_estimators, values, ax=axs[w, k], title= 'Gradient Boorsting with max_features=%s max_depth=%d' % (max_feat_string, d), xlabel='nr estimators', ylabel='accuracy', percentage=True) print( 'Best results with max_features=%s, depth=%d, learning rate=%1.2f and %d estimators, with accuracy=%1.2f' % (best[0], best[1], best[2], best[3], last_best)) fig.text( 0.5, 0.03, 'Best results with max_features=%s, depth=%d, learning rate=%1.2f and %d estimators, with accuracy=%1.2f' % (best[0], best[1], best[2], best[3], last_best), fontsize=7, ha='center',
for imp in min_impurity_decrease: tree = DecisionTreeClassifier(min_samples_leaf=1, max_depth=d, criterion=f, min_impurity_decrease=imp) tree.fit(trnX, trnY) prdY = tree.predict(tstX) yvalues.append(metrics.accuracy_score(tstY, prdY)) if yvalues[-1] > last_best: best = (f, d, imp) last_best = yvalues[-1] best_tree = tree values[d] = yvalues ds.multiple_line_chart(min_impurity_decrease, values, ax=axs[0, k], title='Decision Trees with %s criteria' % f, xlabel='min_impurity_decrease', ylabel='accuracy', percentage=True) plt.show() print( 'Best results achieved with %s criteria, depth=%d and min_impurity_decrease=%1.2f ==> accuracy=%1.2f' % (best[0], best[1], best[2], last_best)) prd_trn = best_tree.predict(trnX) prd_tst = best_tree.predict(tstX) ds.plot_evaluation_results(pd.unique(y), trnY, prd_trn, tstY, prd_tst)
for d in (0, 1): mse = {} mae = {} for p in params: mse_lst = [] mae_lst = [] for q in params: mod = ARIMA(df, order=(p, d, q)) results = mod.fit() mse_lst.append(results.mse) mae_lst.append(results.mae) mse[p] = mse_lst mae[p] = mae_lst ds.multiple_line_chart(params, mse, ax=axs[d, 0], title=f'MSE with d={d}', xlabel='p', ylabel='mse') ds.multiple_line_chart(params, mae, ax=axs[d, 1], title=f'MAE with d={d}', xlabel='p', ylabel='mae') plt.savefig(graphsDir + 'Covid19 - MSE and MAE') def plot_forecasting(train: pd.Series, test: pd.Series, pred, ax: plt.Axes = None,
rf = RandomForestClassifier(n_estimators=n, max_depth=d, max_features=f) rf.fit(trnX, trnY) prdY = rf.predict(tstX) yvalues.append(metrics.accuracy_score(tstY, prdY)) if yvalues[-1] > last_best: best = (d, f, n) last_best = yvalues[-1] best_tree = rf values[f] = yvalues ds.multiple_line_chart(n_estimators, values, ax=axs[0, k], title='Random Forests with max_depth=%d' % d, xlabel='nr estimators', ylabel='accuracy', percentage=True) plt.show() print( 'Best results with depth=%d, %1.2f features and %d estimators, with accuracy=%1.2f' % (best[0], best[1], best[2], last_best)) #Performance prd_trn = best_tree.predict(trnX) prd_tst = best_tree.predict(tstX) ds.plot_evaluation_results(pd.unique(y), trnY, prd_trn, tstY, prd_tst) #ORAL TOXICITY