def stack(X, y, X_test, y_test):
    """Two-stage stacked classifier: tree-leaf embedding + linear model.

    Half of the training data fits an ExtraTrees forest; its leaf indices
    are one-hot encoded and the other half trains an SGD logistic model on
    that embedding. Prints a confusion summary plus precision/recall on the
    held-out test set.

    Parameters:
        X, y: training features and labels (binary 0/1 assumed — the
            confusion counts below test labels for truthiness).
        X_test, y_test: evaluation features and labels.

    Returns:
        The (fpr, tpr, thresholds) tuple from roc_curve on the positive-class
        probabilities.
    """
    # First half trains the forest embedding, second half trains the
    # linear stage on the encoded leaves (avoids leaf-id leakage).
    X, X1, y, y1 = train_test_split(X, y, test_size=0.5)
    # min_samples_split must be an int >= 2 in scikit-learn; the original
    # value of 1 raises ValueError.
    clf1 = ExtraTreesClassifier(n_estimators=10, max_depth=None,
                                min_samples_split=2, random_state=0)
    # NOTE(review): sklearn >= 1.1 renamed loss 'log' to 'log_loss' —
    # update this if the project runs on a recent scikit-learn.
    clf2 = linear_model.SGDClassifier(loss='log')
    enc = OneHotEncoder()
    # Stage 1: fit the forest, then learn the one-hot encoding of its
    # per-tree leaf indices.
    clf1.fit(X, y)
    enc.fit(clf1.apply(X))
    # Stage 2: train the linear model on the leaf embedding of the
    # held-out half.
    clf2.fit(enc.transform(clf1.apply(X1)), y1)
    # SGDClassifier handles sparse input natively, so no .toarray()
    # densification is needed (it wasted memory in the original).
    prob = clf2.predict_proba(enc.transform(clf1.apply(X_test)))[:, 1]
    res = clf2.predict(enc.transform(clf1.apply(X_test)))
    # Confusion counts; the four cases are mutually exclusive.
    tp, tn, fp, fn = 0, 0, 0, 0
    for value, prediction in zip(y_test, res):
        if prediction and value:
            tp += 1
        elif prediction and not value:
            fp += 1
        elif not prediction and value:
            fn += 1
        else:
            tn += 1
    print('TP: {0}, TN: {1}, FP: {2}, FN: {3}'.format(tp, tn, fp, fn))
    print("Precision Score : %f" % metrics.precision_score(y_test, res))
    print("Recall Score : %f" % metrics.recall_score(y_test, res))
    return roc_curve(y_test, prob)
def Extreme_rf_dis(n_trees, X, Y, train_indices, test_indices, seed):
    """Random-forest similarity features from an ExtraTrees model.

    Fits an ExtraTreesClassifier on the training rows, then builds an
    n_samples x n_samples co-leaf similarity matrix: entry (i, j) is the
    fraction of trees in which samples i and j fall into the same leaf.
    The matrix is sliced so each returned row is a sample's similarity to
    every TRAINING sample.

    Parameters:
        n_trees: number of trees in the forest (also the similarity
            normalizer).
        X, Y: full feature matrix and labels.
        train_indices, test_indices: row indices for the split.
        seed: random_state for the forest.

    Returns:
        (train_features, test_features, weight, pred) where weight is the
        test-set accuracy and pred the test-set predictions.
    """
    # Bug fix: the forest size was hard-coded to 500 while the similarity
    # below divides by n_trees — use n_trees consistently so the matrix
    # entries are true fractions in [0, 1].
    # bootstrap=True is required by scikit-learn whenever oob_score=True
    # (ExtraTrees defaults to bootstrap=False, which raises ValueError).
    clf = ExtraTreesClassifier(n_estimators=n_trees, random_state=seed,
                               bootstrap=True, oob_score=True, n_jobs=-1)
    clf = clf.fit(X[train_indices], Y[train_indices])
    pred = clf.predict(X[test_indices])
    weight = clf.score(X[test_indices], Y[test_indices])
    n_samples = X.shape[0]
    # res[i, t] = index of the leaf that sample i reaches in tree t.
    res = clf.apply(X)
    dis = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        dis[i][i] = 1  # a sample always shares every leaf with itself
        for j in range(i + 1, n_samples):
            # Fraction of trees where i and j land in the same leaf.
            d = float(np.sum(np.ravel(res[i]) == np.ravel(res[j]))) / n_trees
            dis[i][j] = dis[j][i] = d
    # dis is symmetric, so the original's transpose/slice/transpose chain
    # reduces to plain row/column selection.
    train_features = dis[np.ix_(train_indices, train_indices)]
    test_features = dis[np.ix_(test_indices, train_indices)]
    return train_features, test_features, weight, pred
# Raw-feature UMAP baseline: project the original feature matrix to 2-D.
X = training_data.to_numpy()
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(X)
# Scatter the 2-D embedding, colored by label, with integer colorbar ticks.
plt.figure()
plt.scatter(embedding[:, 0], embedding[:, 1], c=y, cmap="Spectral", s=8)
plt.gca().set_aspect("equal", "datalim")
cb = plt.colorbar()
# One tick per distinct class label (assumes y holds small non-negative
# integers — TODO confirm against how y is built upstream).
loc = np.arange(0, max(y) + 0.5, 1)
cb.set_ticks(loc)
plt.title("UMAP projection of Titanic dataset")
# Use Extra Trees Classifier Embedding
# Second view: embed each sample by the leaf indices it reaches in a
# fitted ExtraTrees forest, then UMAP those leaf ids. Hamming metric is
# used because leaf indices are categorical, not ordinal.
model = ExtraTreesClassifier(n_estimators=100, min_samples_leaf=10)
model.fit(X, y)
# leaves[i, t] = leaf index sample i falls into in tree t.
leaves = model.apply(X)
reducer = umap.UMAP(metric='hamming', random_state=42)
embedding = reducer.fit_transform(leaves)
# plotting the embedding
plt.figure()
plt.scatter(embedding[:, 0], embedding[:, 1], c=y, cmap="Spectral", s=8)
plt.gca().set_aspect("equal", "datalim")
cb = plt.colorbar()
loc = np.arange(0, max(y) + 0.5, 1)
cb.set_ticks(loc)
plt.title(
    "UMAP Projection of Titanic Dataset\n Using Extra Trees Classifier Embedding"
)
# Use DecisionTreeClassifier Embedding
# Third view starts here; its fit/apply/plot steps continue past this chunk.
model = DecisionTreeClassifier(max_leaf_nodes=2)