def histogram_with_distributions(ax: plt.Axes, series: pd.Series, var: str):
    # Overlay the best-fitting known distributions on a density histogram.
    values = series.sort_values().values
    ax.hist(values, bins=20, density=True)
    distributions = compute_known_distributions(values)
    ds.multiple_line_chart(values,
                           distributions,
                           ax=ax,
                           title='Best fit for %s' % var,
                           xlabel=var,
                           ylabel='')
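histogram_with_distributions relies on a compute_known_distributions helper that is not shown here. A minimal sketch of one plausible implementation, fitting Normal, Exponential and LogNormal models with scipy.stats (the label formats and the choice of distributions are assumptions):

import numpy as np
from scipy.stats import norm, expon, lognorm

def compute_known_distributions(x_values) -> dict:
    # Map a display label to the fitted pdf evaluated at x_values.
    distributions = {}
    # Gaussian: parameters from the sample moments
    mean, sigma = np.mean(x_values), np.std(x_values)
    distributions['Normal(%.1f,%.2f)' % (mean, sigma)] = norm.pdf(x_values, mean, sigma)
    # Exponential: scipy MLE returns (loc, scale)
    loc, scale = expon.fit(x_values)
    distributions['Exp(%.2f)' % (1 / scale)] = expon.pdf(x_values, loc, scale)
    # LogNormal: scipy MLE returns (shape, loc, scale)
    shape, loc, scale = lognorm.fit(x_values)
    distributions['LogNor(%.1f,%.2f)' % (np.log(scale), shape)] = lognorm.pdf(x_values, shape, loc, scale)
    return distributions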
def analyse_per_metric(rules: pd.DataFrame, metric: str, metric_values: list, save_dir: str) -> list:
    print(f'Analyse per {metric}...')
    conf = {'avg': [], 'top25%': [], 'top10': []}
    lift = {'avg': [], 'top25%': [], 'top10': []}
    leverage = {'avg': [], 'top25%': [], 'top10': []}
    top_conf = []
    top_lift = []
    top_leverage = []
    nr_rules = []
    for m in metric_values:
        rs = rules[rules[metric] >= m]
        nr_rules.append(len(rs))
        conf['avg'].append(rs['confidence'].mean(axis=0))
        lift['avg'].append(rs['lift'].mean(axis=0))
        leverage['avg'].append(rs['leverage'].mean(axis=0))

        top_conf = rs.nlargest(int(0.25*len(rs)), 'confidence')
        conf['top25%'].append(top_conf['confidence'].mean(axis=0))
        top_lift = rs.nlargest(int(0.25*len(rs)), 'lift')
        lift['top25%'].append(top_lift['lift'].mean(axis=0))
        top_leverage = rs.nlargest(int(0.25*len(rs)), 'leverage')
        leverage['top25%'].append(top_leverage['leverage'].mean(axis=0))

        top_conf = rs.nlargest(10, 'confidence')
        conf['top10'].append(top_conf['confidence'].mean(axis=0))
        top_lift = rs.nlargest(10, 'lift')
        lift['top10'].append(top_lift['lift'].mean(axis=0))
        top_leverage = rs.nlargest(10, 'leverage')
        leverage['top10'].append(top_leverage['leverage'].mean(axis=0))

    _, axs = plt.subplots(2, 2, figsize=(20, 10), squeeze=False)
    ds.multiple_line_chart(metric_values, conf, ax=axs[0, 0], title=f'Avg Confidence x {metric}',
                           xlabel=metric, ylabel='Avg confidence')
    ds.multiple_line_chart(metric_values, lift, ax=axs[0, 1], title=f'Avg Lift x {metric}',
                           xlabel=metric, ylabel='Avg lift')
    ds.multiple_line_chart(metric_values, leverage, ax=axs[1, 0], title=f'Avg Leverage x {metric}',
                           xlabel=metric, ylabel='Avg leverage')
    plt.savefig(save_dir + 'HFCR Pattern Mining - Association Rules analyse per ' + metric)

    plot_top_rules(top_conf, 'confidence', metric, save_dir)
    plot_top_rules(top_lift, 'lift', metric, save_dir)
    plot_top_rules(top_leverage, 'leverage', metric, save_dir)

    return nr_rules
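A hedged usage sketch: analyse_per_metric expects a rules DataFrame carrying support, confidence, lift and leverage columns, e.g. as produced by mlxtend (dummified_df, the threshold values and save_dir below are assumptions):

from mlxtend.frequent_patterns import apriori, association_rules

patterns = apriori(dummified_df, min_support=0.05, use_colnames=True)  # dummified_df: one-hot DataFrame (assumed)
rules = association_rules(patterns, metric='confidence', min_threshold=0.1)
nr_rules = analyse_per_metric(rules, 'support', [0.05, 0.1, 0.2, 0.3], save_dir)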
                    test_acc_values.append(test_accuracy)
                    if yvalues[-1] > last_best:
                        best = (f, d, imp)
                        best_model = (prd_trn_lst, prd_tst_lst)
                        last_best = yvalues[-1]
                        last_best_train = train_acc_values[-1]
                        best_tree = tree

                values[d] = yvalues
                overfit_values[f][d] = {}
                overfit_values[f][d]['train'] = train_acc_values
                overfit_values[f][d]['test'] = test_acc_values
            ds.multiple_line_chart(min_impurity_decrease,
                                   values,
                                   ax=axs[0, k],
                                   title='Decision Trees with %s criteria' % f,
                                   xlabel='min_impurity_decrease',
                                   ylabel='accuracy',
                                   percentage=True)

        text = key if count == 0 else last_name + ' - ' + key
        accuracies[text] = [last_best_train, last_best]

        last_accuracy = last_best

        print(
            'Best results achieved with %s criteria, depth=%d and min_impurity_decrease=%1.5f ==> accuracy=%1.5f'
            % (best[0], best[1], best[2], last_best))
        fig.text(
            0.5,

	print('HFCR Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB')
	_, ax = plt.subplots(1, 4, figsize=(10, 3), squeeze=False)
	ds.multiple_bar_chart(LINKS, values_mse, title='Hierarchical MSE', xlabel='metric', ylabel='MSE', ax=ax[0, 0])
	ds.multiple_bar_chart(LINKS, values_mae, title='Hierarchical MAE', xlabel='metric', ylabel='MAE', ax=ax[0, 1])
	ds.multiple_bar_chart(LINKS, values_sc, title='Hierarchical SC', xlabel='metric', ylabel='SC', ax=ax[0, 2], percentage=True)
	ds.multiple_bar_chart(LINKS, values_db, title='Hierarchical DB', xlabel='metric', ylabel='DB', ax=ax[0, 3])
	plt.suptitle('HFCR Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB')
	plt.savefig(subDir + 'HFCR Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB')
    """

ds.multiple_line_chart(N_CLUSTERS,
                       fig_values_1,
                       ax=ax[0, 0],
                       title='K-Means',
                       xlabel='k',
                       ylabel='SC',
                       percentage=True)
ds.multiple_line_chart(N_CLUSTERS,
                       fig_values_2,
                       ax=ax[0, 1],
                       title='EM',
                       xlabel='k',
                       ylabel='SC',
                       percentage=True)
#ds.multiple_line_chart(EPS, fig_values_3, ax=ax[0, 2], title='EPS', xlabel='k', ylabel='SC', percentage=True)
ds.multiple_line_chart(N_CLUSTERS,
                       fig_values_4,
                       ax=ax[0, 2],
                       title='Hierarchical',
                       xlabel='k',
                       ylabel='SC',
                       percentage=True)

                    test_acc_values.append(test_accuracy)
                    if yvalues[-1] > last_best:
                        best = (d, f, n)
                        last_best = yvalues[-1]
                        last_best_train = train_acc_values[-1]
                        best_tree = rf
                        best_model = (prd_trn_lst, prd_tst_lst)

                values[f] = yvalues
                overfit_values[d][f] = {}
                overfit_values[d][f]['train'] = train_acc_values
                overfit_values[d][f]['test'] = test_acc_values
            ds.multiple_line_chart(n_estimators,
                                   values,
                                   ax=axs[0, k],
                                   title='Random Forests with max_depth=%d' % d,
                                   xlabel='nr estimators',
                                   ylabel='accuracy',
                                   percentage=True)

        text = key
        if (do_feature_eng): text += ' with FS'
        best_accuracies[text] = [last_best_train, last_best]

        print(
            'Best results with depth=%d, %1.2f features and %d estimators, with accuracy=%1.2f'
            % (best[0], best[1], best[2], last_best))
        fig.text(
            0.5,
            0.03,
            'Best results with depth=%d, %1.2f features and %d estimators, with accuracy=%1.2f'
Example #6
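The fragment below begins mid grid-search; it presupposes neighbor counts and distance metrics roughly like these (illustrative values, not taken from the original):

nvalues = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]   # assumed n_neighbors grid
dist = ['manhattan', 'euclidean', 'chebyshev']  # assumed distance metrics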
                    best = (n, d)
                    last_best = yvalues[-1]
                    last_train_best = train_accuracy
                    last_best_recall = yvalues_recall[-1]
                    last_train_best_recall = train_recall
            values[d] = yvalues

        text = key
        if (do_feature_eng): text += ' with FS'
        best_accuracies[text] = [last_train_best, last_best]
        recalls[text] = [last_train_best_recall, last_best_recall]

        plt.figure()
        ds.multiple_line_chart(nvalues,
                               values,
                               title='KNN variants',
                               xlabel='n',
                               ylabel='accuracy',
                               percentage=True)
        plt.suptitle('QOT KNN - ' + key + ' - parameters')
        plt.savefig(subDir + 'QOT KNN - ' + key + ' - parameters')
        print('Best results with %d neighbors and %s' % (best[0], best[1]))

        fig, axs = plt.subplots(1, len(dist), figsize=(32, 8), squeeze=False)
        i = 0
        for k, d in enumerate(dist):
            ds.multiple_line_chart(nvalues,
                                   overfitting_values[d],
                                   ax=axs[0, k],
                                   title='Overfitting for dist = %s' % d,
                                   xlabel='n',
                                   ylabel='accuracy',
                                   percentage=True)
Example #7

                                    last_best_train, last_best
                                ]
                                best_tree = gb

                        values[lr] = yvalues
                        overfitting_values[max_feat_string][d][lr] = {}
                        overfitting_values[max_feat_string][d][lr][
                            'train'] = train_acc_values
                        overfitting_values[max_feat_string][d][lr][
                            'test'] = test_acc_values

                    ds.multiple_line_chart(
                        n_estimators,
                        values,
                        ax=axs[w, k],
                        title='Gradient Boosting with max_features=%s max_depth=%d'
                        % (max_feat_string, d),
                        xlabel='nr estimators',
                        ylabel='accuracy',
                        percentage=True)

            print(
                'Best results with max_features=%s, depth=%d, learning rate=%1.2f and %d estimators, with accuracy=%1.2f'
                % (best[0], best[1], best[2], best[3], last_best))
            fig.text(
                0.5,
                0.03,
                'Best results with max_features=%s, depth=%d, learning rate=%1.2f and %d estimators, with accuracy=%1.2f'
                % (best[0], best[1], best[2], best[3], last_best),
                fontsize=7,
                ha='center',
Example #8
File: Toxicity.py, Project: dsfca/CDados
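This fragment also starts inside its search loops; the surrounding grids presumably look like the following (illustrative values, not taken from Toxicity.py):

criteria = ['entropy', 'gini']                               # assumed split criteria
max_depths = [2, 5, 10, 15, 20, 25]                          # assumed depth grid
min_impurity_decrease = [0.025, 0.01, 0.005, 0.0025, 0.001]  # assumed impurity grid
# outer loops (not shown): for k, f in enumerate(criteria): for d in max_depths: ...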
        for imp in min_impurity_decrease:
            tree = DecisionTreeClassifier(min_samples_leaf=1,
                                          max_depth=d,
                                          criterion=f,
                                          min_impurity_decrease=imp)
            tree.fit(trnX, trnY)
            prdY = tree.predict(tstX)
            yvalues.append(metrics.accuracy_score(tstY, prdY))
            if yvalues[-1] > last_best:
                best = (f, d, imp)
                last_best = yvalues[-1]
                best_tree = tree

        values[d] = yvalues
    ds.multiple_line_chart(min_impurity_decrease,
                           values,
                           ax=axs[0, k],
                           title='Decision Trees with %s criteria' % f,
                           xlabel='min_impurity_decrease',
                           ylabel='accuracy',
                           percentage=True)

plt.show()
print(
    'Best results achieved with %s criteria, depth=%d and min_impurity_decrease=%1.5f ==> accuracy=%1.2f'
    % (best[0], best[1], best[2], last_best))

prd_trn = best_tree.predict(trnX)
prd_tst = best_tree.predict(tstX)
ds.plot_evaluation_results(pd.unique(y), trnY, prd_trn, tstY, prd_tst)
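The example assumes a pre-computed holdout split; a minimal sketch of that setup (the 'class' target column name is an assumption):

from sklearn.model_selection import train_test_split

y = data.pop('class').values   # target column name assumed
X = data.values
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)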
Example #9
# Assumed setup for this fragment: the statespace ARIMA (whose results expose
# .mse and .mae) and a 2x2 axes grid; `params` (the candidate AR/MA orders)
# is assumed to be defined earlier.
from statsmodels.tsa.arima.model import ARIMA

fig, axs = plt.subplots(2, 2, figsize=(16, 8))
for d in (0, 1):
    mse = {}
    mae = {}
    for p in params:
        mse_lst = []
        mae_lst = []
        for q in params:
            mod = ARIMA(df, order=(p, d, q))
            results = mod.fit()
            mse_lst.append(results.mse)
            mae_lst.append(results.mae)
        mse[p] = mse_lst
        mae[p] = mae_lst
    ds.multiple_line_chart(params,
                           mse,
                           ax=axs[d, 0],
                           title=f'MSE with d={d}',
                           xlabel='p',
                           ylabel='mse')
    ds.multiple_line_chart(params,
                           mae,
                           ax=axs[d, 1],
                           title=f'MAE with d={d}',
                           xlabel='p',
                           ylabel='mae')
plt.savefig(graphsDir + 'Covid19 - MSE and MAE')


def plot_forecasting(train: pd.Series,
                     test: pd.Series,
                     pred,
                     ax: plt.Axes = None,
Example #10
File: lab7.py, Project: dsfca/CDados
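As above, the random-forest search below starts mid-loop and presupposes hyper-parameter grids along these lines (illustrative values, not taken from lab7.py):

n_estimators = [5, 10, 25, 50, 75, 100, 150, 200, 250, 300]  # assumed grid
max_depths = [5, 10, 25]                                     # assumed grid
max_features = [.3, .5, .7, 1]                               # assumed grid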
            rf = RandomForestClassifier(n_estimators=n,
                                        max_depth=d,
                                        max_features=f)
            rf.fit(trnX, trnY)
            prdY = rf.predict(tstX)
            yvalues.append(metrics.accuracy_score(tstY, prdY))
            if yvalues[-1] > last_best:
                best = (d, f, n)
                last_best = yvalues[-1]
                best_tree = rf

        values[f] = yvalues
    ds.multiple_line_chart(n_estimators,
                           values,
                           ax=axs[0, k],
                           title='Random Forests with max_depth=%d' % d,
                           xlabel='nr estimators',
                           ylabel='accuracy',
                           percentage=True)

plt.show()
print(
    'Best results with depth=%d, %1.2f features and %d estimators, with accuracy=%1.2f'
    % (best[0], best[1], best[2], last_best))

#Performance
prd_trn = best_tree.predict(trnX)
prd_tst = best_tree.predict(tstX)
ds.plot_evaluation_results(pd.unique(y), trnY, prd_trn, tstY, prd_tst)

#ORAL TOXICITY