Example #1
alphas = [1, 0.8, 0.6, 0.4, 0.2, 0]

fig, axs = plt.subplots(1, 1, figsize=(16, 10), squeeze=False)
values = {}
yvalues = []
for a in alphas:
    nb = BernoulliNB(alpha=a)
    nb.fit(trnX, trnY)
    pred = nb.predict(valX)
    yvalues.append(metrics.accuracy_score(valY, pred))
values["Bern"] = yvalues
plot.multiple_line_chart(axs[0, 0],
                         alphas,
                         values,
                         'Bernoulli with various alpha',
                         'alpha',
                         'accuracy',
                         percentage=True)

plt.show()

# Alpha makes no difference except when it is 0, which makes results worse
# (alpha is the Laplace smoothing term; with alpha=0, a feature value unseen
# in a class zeroes out that class's likelihood).
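#%%
# For reference: plot.multiple_line_chart is a project helper not shown in
# these excerpts. A minimal sketch of a compatible implementation (an
# assumption, not the project's actual code): draw one line per key of the
# yvalues dict on the given axes.
def multiple_line_chart(ax, xvalues, yvalues, title, xlabel, ylabel,
                        percentage=False):
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if percentage:
        ax.set_ylim(0.0, 1.0)
    for name, series in yvalues.items():
        ax.plot(xvalues, series, label=str(name))
    ax.legend(loc='best')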
#%%
# binarize
binarizes = [1, 0.8, 0.6, 0.4, 0.2, 0]

fig, axs = plt.subplots(1, 1, figsize=(16, 10), squeeze=False)
values = {}
yvalues = []
# Loop body assumed, mirroring the alpha sweep above (the original cell was
# cut off here).
for b in binarizes:
    nb = BernoulliNB(binarize=b)
    nb.fit(trnX, trnY)
    pred = nb.predict(valX)
    yvalues.append(metrics.accuracy_score(valY, pred))
values["Bern"] = yvalues
plot.multiple_line_chart(axs[0, 0], binarizes, values,
                         'Bernoulli with various binarize thresholds',
                         'binarize', 'accuracy', percentage=True)

plt.show()

#%%
# KMeans: sweep the number of clusters, tracking inertia and silhouette.
X: np.ndarray = df.drop(to_clf, axis=1).values
n_clusters = list(range(2, 11))  # assumed sweep range; the original was cut off
inertias = []
sil = []
for n in n_clusters:
    kmeans = cluster.KMeans(n_clusters=n, random_state=1).fit(X)
    prdY = kmeans.labels_
    inertias.append(kmeans.inertia_)
    sil.append(metrics.silhouette_score(X, prdY))

ivalues = {}
svalues = {}
ivalues["norm"] = inertias
svalues["norm"] = sil
print(sil[4])

fig, axs = plt.subplots(1, 2, figsize=(16, 10), squeeze=False)
plot.multiple_line_chart(axs[0, 0], n_clusters, ivalues, '\nKMeans',
                         'nr clusters', 'inertia', percentage=False)
plot.multiple_line_chart(axs[0, 1], n_clusters, svalues, '\nKMeans',
                         'nr clusters', 'silhouette', percentage=False)

plt.show()

#%%
n_clusters = 6
algs = ["PCA", "selectkbest"]
fig, axs = plt.subplots(2, len(algs), figsize=(14, 8), squeeze=False)
for a in range(len(algs)):
    datar = datapp.feature_reduction(df, to_clf, categoric + [to_clf],
                                     n_features=2, as_int=True, alg=algs[a])

    y: np.ndarray = datar[to_clf].values
    X: np.ndarray = datar.drop([to_clf], axis=1).values
Example #3
                                          "class", ["class", "id"],
                                          d,
                                          alg=f)
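            # Note: sklearn's GradientBoostingClassifier stands in for the
            # "XGBoost" named in the chart titles below.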
            xg_clf = GradientBoostingClassifier()
            startTime = time.process_time()
            acc, sens, _ = eval.train_predict_kfold(df,
                                                    "class",
                                                    xg_clf,
                                                    bal=bal)
            print(time.process_time() - startTime)
            yvalues.append(acc)
            syvalues.append(sens)
        values[d] = yvalues
        svalues[d] = syvalues
    plot.multiple_line_chart(axs[0, k], thresholds, values,
                             'XGBoost with %s reduction' % f,
                             'threshold of reduction', 'accuracy')
    plot.multiple_line_chart(axs[1, k],
                             thresholds,
                             svalues,
                             'XGBoost with %s reduction' % f,
                             'threshold of reduction',
                             'sensitivity',
                             percentage=False)

plt.show()
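#%%
# For reference: eval.train_predict_kfold is a project helper not shown in
# these excerpts. A minimal sketch of a compatible version (an assumption,
# not the project's actual code; the 'bal' balancing argument is accepted
# but ignored here):
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score

def train_predict_kfold(df, target, clf, bal=None, n_splits=5):
    y = df[target].values
    X = df.drop(columns=[target]).values
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
    accs, sens = [], []
    for trn, tst in skf.split(X, y):
        clf.fit(X[trn], y[trn])
        pred = clf.predict(X[tst])
        accs.append(accuracy_score(y[tst], pred))
        sens.append(recall_score(y[tst], pred, average='macro'))
    # mean accuracy, mean sensitivity (macro recall), last fold's predictions
    return float(np.mean(accs)), float(np.mean(sens)), pred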

#%%
tr = 0.9
f = "selectkbest"
selectk = 0.75
Example #4
# Head reconstructed to match the parallel sweeps (the original snippet was
# cut off here).
fig, axs = plt.subplots(2, len(algs), figsize=(14, 8), squeeze=False)
for k in range(len(algs)):
    f = algs[k]
    values = {}
    svalues = {}
    for d in selects:
        yvalues = []
        syvalues = []
        for tr in thresholds:
            datared = datapp.preprocess_alt(data, "class", red_corr=True,
                                            tr=tr, n=5,
                                            normalization=normalization,
                                            ignore_classes=categoric,
                                            as_df=True)
            df = datapp.feature_reduction(datared, "class", ["class", "id"],
                                          d, alg=f)
            rf = RandomForestClassifier(random_state=rs)
            acc, sens, _ = eval.train_predict_kfold(df, "class", rf, bal=bal)
            yvalues.append(acc)
            syvalues.append(sens)
        values[d] = yvalues
        svalues[d] = syvalues
    plot.multiple_line_chart(axs[0, k], thresholds, values, 'Random Forests with %s reduction' % f,
                             'threshold of reduction', 'accuracy')
    plot.multiple_line_chart(axs[1, k], thresholds, svalues, 'Random Forests with %s reduction' % f,
                             'threshold of reduction', 'sensitivity', percentage=False)

plt.show()
#%%
tr = 0.95
f = "selectkbest"
selectk = 0.6
datared = datapp.preprocess_alt(data, "class", red_corr=True, tr=tr, n=5,
                                normalization=normalization,
                                ignore_classes=categoric, as_df=True)
df = datapp.feature_reduction(datared, "class", ["class", "id"],
                              n_features=selectk, alg=f)
df.shape
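#%%
# For reference: datapp.feature_reduction is a project helper not shown in
# these excerpts. A minimal sketch of a compatible version (an assumption):
# fractional n_features values in (0, 1] are read as a share of the candidate
# columns, matching calls like n_features=0.6 above.
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif

def feature_reduction(df, target, keep, n_features, alg="PCA", as_int=False):
    y = df[target]
    X = df.drop(columns=[c for c in keep if c in df.columns])
    k = n_features
    if 0 < k <= 1:
        k = max(1, int(round(k * X.shape[1])))
    if alg == "PCA":
        reduced = PCA(n_components=k).fit_transform(X)
    else:  # "selectkbest"
        reduced = SelectKBest(f_classif, k=k).fit_transform(X, y)
    out = pd.DataFrame(reduced)
    if as_int:
        out = out.round().astype(int)
    out[target] = y.values
    return out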

#%%
# Head reconstructed (the original snippet was cut off here); 'classifiers'
# holding the estimators to compare is an assumed name.
fig, axs = plt.subplots(2 * len(classifiers), len(algs),
                        figsize=(14, 16), squeeze=False)
for i in range(len(classifiers)):
    f = classifiers[i]
    for k in range(len(algs)):
        alg = algs[k]
        values = {}
        svalues = {}
        for d in selects:
            yvalues = []
            syvalues = []
            for tr in thresholds:
                datared = datapp.preprocess_alt(data, "class", red_corr=True,
                                                tr=tr, n=5,
                                                normalization=normalization,
                                                ignore_classes=categoric,
                                                as_df=True)
                df = datapp.feature_reduction(datared,
                                              "class", ["class", "id"],
                                              d,
                                              alg=alg)

                acc, sens, x = eval.train_predict_kfold(df,
                                                        "class",
                                                        f,
                                                        bal=bal)

                yvalues.append(acc)
                syvalues.append(sens)
            values[d] = yvalues
            svalues[d] = syvalues
        plot.multiple_line_chart(axs[0 + 2 * i, k],
                                 thresholds,
                                 values,
                                 '{} with {}'.format(f, str(alg)),
                                 'threshold of reduction',
                                 'accuracy',
                                 percentage=False)
        plot.multiple_line_chart(axs[1 + 2 * i, k],
                                 thresholds,
                                 svalues,
                                 '{} with {}'.format(f, str(alg)),
                                 'threshold of reduction',
                                 'sensitivity',
                                 percentage=False)

plt.show()
#%%
tr = 0.9
f = "PCA"
Example #6
q_lifts = []
c_lifts = []
for i in range(len(bins)):
    q_lifts.append(rules_q[i]["lift"].mean())
    c_lifts.append(rules_c[i]["lift"].mean())

q_sup = []
c_sup = []
for i in range(len(bins)):
    q_sup.append(rules_qsup[i]["support"].mean())
    c_sup.append(rules_csup[i]["support"].mean())
#%%
lvalues = {}
lvalues["cut"] = c_lifts
lvalues["qcut"] = q_lifts

svalues = {}
svalues["cut"] = c_sup
svalues["qcut"] = q_sup

fig, axs = plt.subplots(1, 2, figsize=(12, 6), squeeze=False)
axs[0, 0].set_xticks(bins)
axs[0, 1].set_xticks(bins)
plot.multiple_line_chart(axs[0, 0], bins, lvalues,
                         'Mean lift of top rules per number of bins', 'bins',
                         'lift')
plot.multiple_line_chart(axs[0, 1], bins, svalues,
                         'Mean support of top rules per number of bins',
                         'bins', 'support')

plt.show()
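#%%
# For reference: the cell that mined rules_c/rules_q above was cut off. A
# minimal sketch of how rules could be mined per bin count with mlxtend (an
# assumption; the helper name, thresholds, and discretizer wiring are
# illustrative, not the original code):
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

def mine_rules(df, nbins, discretize=pd.cut, min_support=0.2):
    # Discretize each column into nbins intervals (pd.cut: equal width,
    # pd.qcut: equal frequency), one-hot encode the intervals, then mine.
    binned = df.apply(lambda col: discretize(col, nbins, duplicates='drop'))
    onehot = pd.get_dummies(binned.astype(str))
    freq = apriori(onehot, min_support=min_support, use_colnames=True)
    return association_rules(freq, metric="confidence", min_threshold=0.7)

# e.g. rules_c = [mine_rules(numeric_df, b, pd.cut) for b in bins]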
Example #7
fig, axs = plt.subplots(2, len(algs), figsize=(14, 8), squeeze=False)
for k in range(len(algs)):
    f = algs[k]  # head reconstructed; the original snippet was cut off here
    values = {}
    svalues = {}
    for d in selects:
        yvalues = []
        syvalues = []
        for tr in thresholds:
            datared = datapp.preprocess_alt(data, "class", red_corr=True,
                                            tr=tr, n=5,
                                            normalization=normalization,
                                            ignore_classes=categoric,
                                            as_df=True)
            df = datapp.feature_reduction(datared, "class", ["class", "id"],
                                          d, alg=f)

            tree = DecisionTreeClassifier(random_state=rs)
            acc, sens, _ = eval.train_predict_kfold(df, "class", tree, bal=bal)

            yvalues.append(acc)
            syvalues.append(sens)
        values[d] = yvalues
        svalues[d] = syvalues
    plot.multiple_line_chart(axs[0, k], thresholds, values, 'Decision Trees with %s reduction' % f,
                             'threshold of reduction', 'accuracy')
    plot.multiple_line_chart(axs[1, k], thresholds, svalues, 'Decision Trees with %s reduction' % f,
                             'threshold of reduction', 'sensitivity', percentage=False)

plt.show()
#%%
tr = 0.95
f = "selectkbest"
selectk = 1
datared = datapp.preprocess_alt(data, "class", red_corr=True, tr=tr, n=5,
                                normalization=normalization,
                                ignore_classes=categoric, as_df=True)
df = datapp.feature_reduction(datared, "class", ["class", "id"],
                              n_features=selectk, alg=f)
df.shape
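#%%
# For reference: datapp.preprocess_alt is a project helper not shown in these
# excerpts. A sketch of the correlation-reduction step it appears to perform
# (an assumption; the n and normalization arguments are not modeled here):
import numpy as np

def reduce_correlated(df, tr=0.9, ignore=()):
    # Drop one feature from every pair whose absolute correlation exceeds tr.
    num = df.drop(columns=[c for c in ignore if c in df.columns])
    corr = num.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [c for c in upper.columns if (upper[c] > tr).any()]
    return df.drop(columns=to_drop)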

# %%
# PARAMETERS TO TEST: criterion, splitter, max_depth, min_samples_leaf, min_samples_split
criteria = ['entropy', 'gini']

fig, axs = plt.subplots(1, len(criteria), figsize=(12, 7), squeeze=False)
for k in range(len(criteria)):
    f = criteria[k]
    values = {}
    for d in max_depths:
        yvalues = []
        for n in min_samples_leaf:
            tree = DecisionTreeClassifier(min_samples_leaf=n, max_depth=d, criterion=f, random_state=rs)
            tree.fit(trnX, trnY)
            pred = tree.predict(valX)
            yvalues.append(metrics.accuracy_score(valY, pred))
        values[d] = yvalues
    plot.multiple_line_chart(axs[0, k], min_samples_leaf, values,
                             'Decision Trees with %s criterion' % f,
                             'min_samples_leaf', 'accuracy', percentage=True)

plt.show()

# %%
criterion = "gini"
# %%


min_samples_leaf = [.01, .0075, .005, .0025, .001]
max_depths = [5, 10, 25]
splitters = ['best', 'random']

fig, axs = plt.subplots(1, len(splitters), figsize=(12, 7), squeeze=False)
# Sweep body assumed, mirroring the criterion sweep above (the original cell
# was cut off here).
for k in range(len(splitters)):
    f = splitters[k]
    values = {}
    for d in max_depths:
        yvalues = []
        for n in min_samples_leaf:
            tree = DecisionTreeClassifier(min_samples_leaf=n, max_depth=d,
                                          splitter=f, criterion=criterion,
                                          random_state=rs)
            tree.fit(trnX, trnY)
            pred = tree.predict(valX)
            yvalues.append(metrics.accuracy_score(valY, pred))
        values[d] = yvalues
    plot.multiple_line_chart(axs[0, k], min_samples_leaf, values,
                             'Decision Trees with %s splitter' % f,
                             'min_samples_leaf', 'accuracy', percentage=True)

plt.show()

#%%
# KNN with feature reduction; loop head reconstructed (the original snippet
# was cut off here).
fig, axs = plt.subplots(2, len(algs), figsize=(14, 8), squeeze=False)
for k in range(len(algs)):
    f = algs[k]
    values = {}
    svalues = {}
    for d in selects:
        yvalues = []
        syvalues = []
        for tr in thresholds:
            datared = datapp.preprocess_alt(data, "class", red_corr=True,
                                            tr=tr, n=5,
                                            normalization=normalization,
                                            ignore_classes=categoric,
                                            as_df=True)
            df = datapp.feature_reduction(datared,
                                          "class", ["class", "id"],
                                          d,
                                          alg=f)

            knn = KNeighborsClassifier()
            acc, sens, x = eval.train_predict_kfold(df, "class", knn, bal=bal)

            yvalues.append(acc)
            syvalues.append(sens)
        values[d] = yvalues
        svalues[d] = syvalues
    plot.multiple_line_chart(axs[0, k], thresholds, values,
                             'KNN with %s reduction' % f,
                             'threshold of reduction', 'accuracy')
    plot.multiple_line_chart(axs[1, k],
                             thresholds,
                             svalues,
                             'KNN with %s reduction' % f,
                             'threshold of reduction',
                             'sensitivity',
                             percentage=False)

plt.show()
#%%
tr = 0.9
f = "selectkbest"
selectk = 1
datared = datapp.preprocess_alt(data, "class", red_corr=True, tr=tr, n=5,
                                normalization=normalization,
                                ignore_classes=categoric, as_df=True)
df = datapp.feature_reduction(datared, "class", ["class", "id"],
                              n_features=selectk, alg=f)

# %%
# Random Forests: accuracy vs nr estimators per max_depth, for each
# max_features setting. Head reconstructed (the original snippet was cut
# off); the sweep lists below are assumptions.
max_features = ['sqrt', 'log2', 0.3]
n_estimators = [50, 100, 200, 300]
max_depths = [10, 25, 50]
fig, axs = plt.subplots(1, len(max_features), figsize=(12, 7), squeeze=False)
for k in range(len(max_features)):
    f = max_features[k]
    values = {}
    for d in max_depths:
        print("max depth cycle")
        yvalues = []
        for n in n_estimators:
            rf = RandomForestClassifier(n_estimators=n,
                                        max_depth=d,
                                        max_features=f,
                                        random_state=rs)
            # rf = GaussianNB()
            rf.fit(trnX, trnY)
            pred = rf.predict(valX)
            yvalues.append(metrics.accuracy_score(valY, pred))
        values[d] = yvalues
    plot.multiple_line_chart(axs[0, k],
                             n_estimators,
                             values,
                             'Random Forests with %s features' % f,
                             'nr estimators',
                             'accuracy',
                             percentage=False)

plt.show()
# %%

max_features = 0.3

# %%
n_estimators = [50, 100, 200, 300, 400]
max_depths = [10, 25, 50]
criterions = ['gini', 'entropy']

plt.figure()
Example #11
# Head reconstructed (the original snippet was cut off here); 'lr' holds the
# learning-rate values being swept.
fig, axs = plt.subplots(1, len(lr), figsize=(12, 7), squeeze=False)
for k in range(len(lr)):
    f = lr[k]
    values = {}
    for d in n_estimators:
        yvalues = []
        for n in min_samples_leaf:
            gb = GradientBoostingClassifier(min_samples_leaf=n,
                                            n_estimators=d,
                                            loss=loss,
                                            learning_rate=f,
                                            random_state=rs)
            gb.fit(trnX, trnY)
            pred = gb.predict(valX)
            yvalues.append(metrics.accuracy_score(valY, pred))
        values[d] = yvalues
    plot.multiple_line_chart(axs[0, k], min_samples_leaf, values,
                             'Gradient Boosting with %s learning rate' % f,
                             'min_samples_leaf', 'accuracy')

plt.show()

#%%
lr = 0.5
#%%
min_samples_leaf = [.05, .025, .01, .005, .0025, .001]
n_estimators = [100, 200, 300]
max_depth = [3, 10, 15, 25]

fig, axs = plt.subplots(2, len(max_depth), figsize=(12, 7), squeeze=False)
for k in range(len(max_depth)):
    f = max_depth[k]