# Validation accuracy of Bernoulli naive Bayes for several values of alpha.
alphas = [1, 0.8, 0.6, 0.4, 0.2, 0]
# plt.subplots creates its own figure; a preceding plt.figure() call would only
# leave a stray empty figure open, so it is not issued here.
fig, axs = plt.subplots(1, 1, figsize=(16, 10), squeeze=False)
yvalues = []
for a in alphas:
    # Fit on the training split, score on the validation split.
    nb = BernoulliNB(alpha=a)
    nb.fit(trnX, trnY)
    pred = nb.predict(valX)
    yvalues.append(metrics.accuracy_score(valY, pred))
values = {"Bern": yvalues}
# The series is plotted against `alphas`, so the x-axis label names alpha
# (assumes the fifth argument of multiple_line_chart is the x label, as its
# other uses in this script suggest - TODO confirm).
plot.multiple_line_chart(axs[0, 0], alphas, values, 'Bernoulli with various alpha', 'alpha',
                         'accuracy', percentage=True)
plt.show()
# Alpha makes no difference, except for 0, which makes it worse.
#%%
#binarize
#%%
binarizes = [1, 0.8, 0.6, 0.4, 0.2, 0]
# As above: plt.subplots alone creates the figure for this cell.
fig, axs = plt.subplots(1, 1, figsize=(16, 10), squeeze=False)
values = {}
# K-means sweep: record inertia and silhouette score for every cluster count
# in `n_clusters`. `inertias`, `sil` and `axs` are created before this fragment.
X: np.ndarray = df.drop(to_clf, axis=1).values
for n in n_clusters:
    kmeans = cluster.KMeans(n_clusters=n, random_state=1).fit(X)
    prdY = kmeans.labels_
    inertias.append(kmeans.inertia_)
    sil.append(metrics.silhouette_score(X, prdY))
ivalues = {"norm": inertias}
svalues = {"norm": sil}
# Prints the entry at index 4 of `sil`.
print(sil[4])
# The x axis carries cluster counts, so it is labelled 'nr clusters'; the
# y label of the second chart is spelled 'silhouette'.
plot.multiple_line_chart(axs[0, 0], n_clusters, ivalues, '\nKmeans', 'nr clusters', 'inertia',
                         percentage=False)
plot.multiple_line_chart(axs[0, 1], n_clusters, svalues, '\nKmeans', 'nr clusters', 'silhouette',
                         percentage=False)
plt.show()
#%%
# From here on `n_clusters` is a single cluster count instead of a list.
n_clusters=6
algs = ["PCA", "selectkbest"]
# plt.subplots creates its own figure; no separate plt.figure() call is needed
# (it would only leave an empty extra figure open).
fig, axs = plt.subplots(2 ,len(algs), figsize=(14, 8), squeeze=False)
for a in range(len(algs)):
    # Reduce the data to two features with the a-th algorithm, then split it
    # into the target column (y) and the remaining columns (X).
    datar = datapp.feature_reduction(df, to_clf,categoric+[to_clf], n_features=2, as_int=True, alg=algs[a])
    y: np.ndarray = datar[to_clf].values
    X: np.ndarray = datar.drop([to_clf], axis=1).values
                                          "class", ["class", "id"], d, alg=f)
            # NOTE(review): this fragment starts in the middle of a call inside
            # nested loops whose headers are outside the visible source; the
            # indentation assumes the `for k` / `for d` / `for tr` nesting seen
            # in the sibling sweeps - confirm against the full file.
            xg_clf = GradientBoostingClassifier()
            # time.process_time() is a CPU-time clock, so the printed value is
            # the CPU time spent in the k-fold evaluation, not wall-clock time.
            startTime = time.process_time()
            acc, sens, _ = eval.train_predict_kfold(df, "class", xg_clf, bal=bal)
            print(time.process_time() - startTime)
            yvalues.append(acc)
            syvalues.append(sens)
        # One accuracy series and one sensitivity series per value of d.
        values[d] = yvalues
        svalues[d] = syvalues
    # NOTE(review): the chart titles say 'XGBoost', but the estimator built
    # above is GradientBoostingClassifier - confirm the intended wording.
    plot.multiple_line_chart(axs[0, k], thresholds, values, 'XGBoost with %s reduction' % f, 'threshold of reduction', 'accuracy')
    plot.multiple_line_chart(axs[1, k], thresholds, svalues, 'XGBoost with %s reduction' % f, 'threshold of reduction', 'sensitivity', percentage=False)
plt.show()
#%%
# Settings fixed for the following cells.
tr = 0.9
f = "selectkbest"
selectk = 0.75
    # NOTE(review): this fragment appears to sit inside an enclosing loop that
    # defines `k` and `f` (both are used below but not assigned here); the
    # indentation assumes one enclosing level - confirm against the full file.
    values = {}
    svalues = {}
    for d in selects:
        yvalues = []
        syvalues = []
        for tr in thresholds:
            # Preprocess with threshold tr, reduce features with setting d and
            # algorithm f, then evaluate a random forest with k-fold.
            datared = datapp.preprocess_alt(data, "class", red_corr=True, tr=tr, n=5, normalization=normalization, ignore_classes=categoric, as_df=True)
            df = datapp.feature_reduction(datared, "class",["class","id"], d, alg=f)
            rf = RandomForestClassifier(random_state=rs)
            acc, sens, _ = eval.train_predict_kfold(df, "class", rf, bal=bal)
            yvalues.append(acc)
            syvalues.append(sens)
        # One accuracy series and one sensitivity series per value of d.
        values[d] = yvalues
        svalues[d] = syvalues
    # Row 0 of column k shows accuracy, row 1 shows sensitivity.
    plot.multiple_line_chart(axs[0, k], thresholds, values, 'Random Forests with %s reduction' % f, 'threshold of reduction', 'accuracy')
    plot.multiple_line_chart(axs[1, k], thresholds, svalues, 'Random Forests with %s reduction' % f, 'threshold of reduction', 'sensitivity', percentage=False)
plt.show()
#%%
# Rebuild `df` once with fixed settings for the cells that follow.
tr=0.95
f= "selectkbest"
selectk = 0.6
datared = datapp.preprocess_alt(data, "class", red_corr=True, tr=tr, n=5, normalization=normalization, ignore_classes=categoric, as_df=True)
df = datapp.feature_reduction(datared, "class",["class","id"], n_features=selectk, alg=f)
# Bare expression: only displayed when run as an interactive cell.
df.shape
#%%
                                              d, alg=alg)
                # NOTE(review): this fragment starts in the middle of a call;
                # `i`, `k`, `f`, `alg`, `d` and `thresholds` come from code
                # outside the visible source. The indentation assumes two
                # enclosing loops (for `i` and `k`) around the `d` and
                # threshold loops - confirm against the full file.
                # Here `f` is what gets passed to train_predict_kfold as the
                # model and `alg` is the reduction algorithm.
                acc, sens, x = eval.train_predict_kfold(df, "class", f, bal=bal)
                yvalues.append(acc)
                syvalues.append(sens)
            # One accuracy series and one sensitivity series per value of d.
            values[d] = yvalues
            svalues[d] = syvalues
        # Each `i` takes two subplot rows: 2*i for accuracy, 2*i + 1 for
        # sensitivity. The title is built by formatting `f` and `alg`.
        plot.multiple_line_chart(axs[0 + 2 * i, k], thresholds, values, '{} with {}'.format(f, str(alg)), 'threshold of reduction', 'accuracy', percentage=False)
        plot.multiple_line_chart(axs[1 + 2 * i, k], thresholds, svalues, '{} with {}'.format(f, str(alg)), 'threshold of reduction', 'sensitivity', percentage=False)
plt.show()
#%%
# Settings fixed for the following cells; `f` is now a reduction-algorithm name.
tr = 0.9
f = "PCA"
# Mean lift of the rule sets, one entry per bin count; `q_lifts` and `c_lifts`
# are created before this fragment, so they are extended in place.
for i in range(len(bins)):
    q_lifts.append(rules_q[i]["lift"].mean())
    c_lifts.append(rules_c[i]["lift"].mean())
# Mean support of the rule sets, one entry per bin count.
q_sup = [rules_qsup[i]["support"].mean() for i in range(len(bins))]
c_sup = [rules_csup[i]["support"].mean() for i in range(len(bins))]
#%%
# One series per discretisation label ("cut" / "qcut") for each chart.
lvalues = {"cut": c_lifts, "qcut": q_lifts}
svalues = {"cut": c_sup, "qcut": q_sup}
# plt.subplots creates its own figure; a preceding plt.figure() call would only
# leave a stray empty figure open, so it is not issued here.
fig, axs = plt.subplots(1, 2, figsize=(12, 6), squeeze=False)
# Put one x tick on every bin count that was evaluated.
axs[0, 0].set_xticks(bins)
axs[0, 1].set_xticks(bins)
plot.multiple_line_chart(axs[0, 0], bins, lvalues, 'Lift of top rules of corresponding bins',
                         'bins', 'lift')
plot.multiple_line_chart(axs[0, 1], bins, svalues, 'Support of top rules of corresponding bins',
                         'bins', 'support')
plt.show()
    # NOTE(review): this fragment appears to sit inside an enclosing loop that
    # defines `k` and `f` (both are used below but not assigned here), and
    # `values` / `svalues` are created before it; the indentation assumes one
    # enclosing level - confirm against the full file.
    for d in selects:
        yvalues = []
        syvalues = []
        for tr in thresholds:
            # Preprocess with threshold tr, reduce features with setting d and
            # algorithm f, then evaluate a decision tree with k-fold.
            datared = datapp.preprocess_alt(data, "class", red_corr=True, tr=tr, n=5, normalization=normalization, ignore_classes=categoric, as_df=True)
            df = datapp.feature_reduction(datared, "class",["class","id"], d, alg=f)
            tree = DecisionTreeClassifier(random_state=rs)
            acc, sens, _ = eval.train_predict_kfold(df, "class", tree, bal=bal)
            yvalues.append(acc)
            syvalues.append(sens)
        # One accuracy series and one sensitivity series per value of d.
        values[d] = yvalues
        svalues[d] = syvalues
    # Row 0 of column k shows accuracy, row 1 shows sensitivity.
    plot.multiple_line_chart(axs[0, k], thresholds, values, 'Decision Trees with %s reduction' % f, 'threshold of reduction', 'accuracy')
    plot.multiple_line_chart(axs[1, k], thresholds, svalues, 'Decision Trees with %s reduction' % f, 'threshold of reduction', 'sensitivity', percentage=False)
plt.show()
#%%
# Rebuild `df` once with fixed settings for the cells that follow.
tr=0.95
f= "selectkbest"
selectk = 1
datared = datapp.preprocess_alt(data, "class", red_corr=True, tr=tr, n=5, normalization=normalization, ignore_classes=categoric, as_df=True)
df = datapp.feature_reduction(datared, "class",["class","id"], n_features=selectk, alg=f)
# Bare expression: only displayed when run as an interactive cell.
df.shape
# %%
# PARAMETERS TO TEST: criterion, splitter, max_depth, min_sample_leaf, min_samples_split
criteria = ['entropy', 'gini'] plt.figure() fig, axs = plt.subplots(1, len(criteria), figsize=(12, 7), squeeze=False) for k in range(len(criteria)): f = criteria[k] values = {} for d in max_depths: yvalues = [] for n in min_samples_leaf: tree = DecisionTreeClassifier(min_samples_leaf=n, max_depth=d, criterion=f, random_state=rs) tree.fit(trnX, trnY) pred = tree.predict(valX) yvalues.append(metrics.accuracy_score(valY, pred)) values[d] = yvalues plot.multiple_line_chart(axs[0, k], min_samples_leaf, values, 'Decision Trees with %s criteria' % f, 'min_samples_leaf', 'accuracy', percentage=True) plt.show() # %% criterion = "gini" # %% min_samples_leaf = [.01, .0075, .005, .0025, .001] max_depths = [5, 10, 25] splitters = ['best', 'random'] plt.figure() fig, axs = plt.subplots(1, len(splitters), figsize=(12, 7), squeeze=False) for k in range(len(splitters)):
                                            ignore_classes=categoric, as_df=True)
            # NOTE(review): this fragment starts in the middle of a call inside
            # nested loops whose headers are outside the visible source; the
            # indentation assumes the `for k` / `for d` / `for tr` nesting seen
            # in the sibling sweeps - confirm against the full file.
            # Reduce features with setting d and algorithm f, then evaluate a
            # default-parameter KNN classifier with k-fold.
            df = datapp.feature_reduction(datared, "class", ["class", "id"], d, alg=f)
            knn = KNeighborsClassifier()
            acc, sens, x = eval.train_predict_kfold(df, "class", knn, bal=bal)
            yvalues.append(acc)
            syvalues.append(sens)
        # One accuracy series and one sensitivity series per value of d.
        values[d] = yvalues
        svalues[d] = syvalues
    # Row 0 of column k shows accuracy, row 1 shows sensitivity.
    plot.multiple_line_chart(axs[0, k], thresholds, values, 'KNN with %s reduction' % f, 'threshold of reduction', 'accuracy')
    plot.multiple_line_chart(axs[1, k], thresholds, svalues, 'KNN with %s reduction' % f, 'threshold of reduction', 'sensitivity', percentage=False)
plt.show()
#%%
# Settings fixed for the following cells; the preprocess_alt call below
# continues past the end of this fragment.
tr = 0.9
f = "selectkbest"
selectk = 1
datared = datapp.preprocess_alt(data,
        # NOTE(review): this fragment starts inside nested loops whose headers
        # are outside the visible source (`k`, `f` and `d` are used but not
        # assigned here); the indentation assumes a `for k` loop around a
        # `for d` loop - confirm against the full file.
        print("max depth cycle")
        yvalues = []
        for n in n_estimators:
            # Fit on the training split, score on the validation split.
            rf = RandomForestClassifier(n_estimators=n, max_depth=d, max_features=f, random_state=rs)
            # rf = GaussianNB()
            rf.fit(trnX, trnY)
            pred = rf.predict(valX)
            yvalues.append(metrics.accuracy_score(valY, pred))
        # One accuracy series per value of d.
        values[d] = yvalues
    plot.multiple_line_chart(axs[0, k], n_estimators, values, 'Random Forests with %s features' % f, 'nr estimators', 'accuracy', percentage=False)
plt.show()
# %%
# max_features fixed for the following cells.
max_features = 0.3
# %%
# Grid for the next sweep; the cell continues past the end of this fragment.
n_estimators = [50, 100, 200, 300, 400]
max_depths = [10, 25, 50]
criterions = ['gini', 'entropy']
# NOTE(review): if a plt.subplots call follows (as in the sibling cells), this
# plt.figure() only opens an extra empty figure - confirm and consider removing.
plt.figure()
    # NOTE(review): this fragment starts inside a loop over `k` whose header is
    # outside the visible source; the indentation assumes one enclosing level -
    # confirm against the full file.
    # `lr` is indexed here, so at this point it is a sequence of learning rates.
    f = lr[k]
    values = {}
    for d in n_estimators:
        yvalues = []
        for n in min_samples_leaf:
            # Fit on the training split, score on the validation split.
            # `loss` and `rs` are defined outside this fragment.
            gb = GradientBoostingClassifier(min_samples_leaf=n, n_estimators=d, loss=loss, learning_rate=f, random_state=rs)
            gb.fit(trnX, trnY)
            pred = gb.predict(valX)
            yvalues.append(metrics.accuracy_score(valY, pred))
        # One accuracy series per number of estimators.
        values[d] = yvalues
    plot.multiple_line_chart(axs[0, k], min_samples_leaf, values, 'Gradient Boosting with %s lf' % f, 'min_samples_leaf', 'accuracy')
plt.show()
#%%
# From here on `lr` is a single learning rate instead of a sequence.
lr = 0.5
#%%
# Max-depth sweep over a new grid; the loop body continues past this fragment.
min_samples_leaf = [.05, .025, .01, .005, .0025, .001]
n_estimators = [100, 200, 300]
max_depth = [3, 10, 15, 25]
# NOTE(review): plt.subplots below creates its own figure, so this plt.figure()
# only opens an extra empty figure - consider removing.
plt.figure()
fig, axs = plt.subplots(2, len(max_depth), figsize=(12, 7), squeeze=False)
for k in range(len(max_depth)):
    f = max_depth[k]