def test_classification_toy():
    """Check classification on a toy dataset."""
    # Random forest
    clf = RandomForestClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    clf = RandomForestClassifier(n_estimators=10, max_features=1, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    # also test apply
    leaf_indices = clf.apply(X)
    assert_equal(leaf_indices.shape, (len(X), clf.n_estimators))

    # Extra-trees
    clf = ExtraTreesClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    clf = ExtraTreesClassifier(n_estimators=10, max_features=1, random_state=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert_equal(10, len(clf))

    # also test apply
    leaf_indices = clf.apply(X)
    assert_equal(leaf_indices.shape, (len(X), clf.n_estimators))
def modelselect(input_filename, num_test_examples, block_size, n_estimators=100):
    # Perform some model selection to determine good parameters
    # Load data
    X_train, y_train, X_test, y_test, scaler = loaddata(input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_train = encoder.transform(forest.apply(X_train))

    learner = SGDClassifier(
        loss="hinge",
        penalty="l2",
        learning_rate="invscaling",
        alpha=0.001,
        average=10 ** 4,
        eta0=0.5,
        class_weight="balanced",
    )

    metric = "f1"
    losses = ["log", "hinge", "modified_huber", "squared_hinge", "perceptron"]
    penalties = ["l2", "l1", "elasticnet"]
    alphas = 10.0 ** numpy.arange(-5, 0)
    learning_rates = ["constant", "optimal", "invscaling"]
    param_grid = [{"alpha": alphas, "loss": losses, "penalty": penalties, "learning_rate": learning_rates}]
    grid_search = GridSearchCV(learner, param_grid, n_jobs=-1, verbose=2, scoring=metric, refit=True)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_, grid_search.best_score_)

    return grid_search
def fit(self, **kwargs) -> Model:
    feature_list = kwargs.get('feature_list', None)
    if not feature_list:
        self.name = self.name + '(-irt)'
    self.train_x = self.select_features(self.feature.features_train, feature_list)
    self.train_y = self.feature.label_train.values
    self.feature_names = self.train_x.columns
    self.train_x, self.train_y = self.tf_sample(self.train_x, self.train_y)

    rf = RandomForestClassifier(**self.param)
    rf_enc = OneHotEncoder()
    rf_lm = LogisticRegression(penalty='l2', C=1, solver='lbfgs')
    rf.fit(self.train_x, self.train_y)
    rf_enc.fit(rf.apply(self.train_x))
    rf_lm.fit(rf_enc.transform(rf.apply(self.train_x)), self.train_y)

    self.rf = rf
    self.rf_enc = rf_enc
    self.model = rf_lm

    # Evaluate performance on the training set
    self.train_y_pred = self.predict(self.train_x)
    self.train_y = np.array(self.train_y)
    self.train_y_pred = np.array(self.train_y_pred)
    self.train_ev = self.evaluation.evaluate(y_true=self.train_y, y_pred=self.train_y_pred, threshold=0.5)
    return self
def runRfStack(inputfile, outputfile):
    '''Takes an input and an output file path.'''
    df_all = pd.read_csv(inputfile)
    df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:]))
    # NaNs are filled with 0 by default; an explicit negative value could be tried instead
    df_all.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_all = df_all.fillna(0)

    features = list(df_all.columns)
    features.remove('EID')
    label = 'TARGET'

    clf = RandomForestClassifier(
        n_estimators=50,  # 50 trees
        max_depth=7,
        n_jobs=4,
        random_state=101)

    df_all_prov11, df_all_prov12 = split_data_with_prov(df_all)

    # prov == 11
    df_train11, df_test11 = xtrain_and_test(df_all_prov11)
    X_train11 = df_train11[features]
    Y_label11 = df_train11[label]
    X_test11 = df_test11[features]
    clf.fit(X_train11, Y_label11)
    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature11 = pd.DataFrame(clf.apply(df_all_prov11[features]), columns=column)
    df_all_prov11[column] = df_new_feature11

    # prov == 12
    df_train12, df_test12 = xtrain_and_test(df_all_prov12)
    X_train12 = df_train12[features]
    Y_label12 = df_train12[label]
    X_test12 = df_test12[features]
    clf.fit(X_train12, Y_label12)
    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature12 = pd.DataFrame(clf.apply(df_all_prov12[features]), columns=column)
    df_all_prov12[column] = df_new_feature12

    # Merge the two partitions and write the result
    df_all = df_all_prov11.append(df_all_prov12)
    df_all.to_csv(outputfile, index=False, index_label=False)
    del df_all_prov11, df_all_prov12, df_all
    return outputfile
def RandomForestLR():
    RF = RandomForestClassifier(n_estimators=100, max_depth=4)
    RF.fit(X_train, Y_train)
    OHE = OneHotEncoder()
    OHE.fit(RF.apply(X_train))
    LR = LogisticRegression()
    LR.fit(OHE.transform(RF.apply(X_train_lr)), Y_train_lr)
    Y_pred = LR.predict_proba(OHE.transform(RF.apply(X_test)))[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_pred)
    auc = roc_auc_score(Y_test, Y_pred)
    print("RandomForest+LogisticRegression:", auc)
    return fpr, tpr
def rf_lr_model():
    """RandomForest + LR"""
    rf = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth)
    rf_enc = OneHotEncoder()
    rf_lm = LogisticRegression()
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
    y_pred = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    print('RandomForest+LR AUC: {0}'.format(auc(fpr, tpr)))
def RandomForestLR(X_train, y_train, X_test, y_test, X_train_lr, y_train_lr):
    rf = RandomForestClassifier(max_depth=3, n_estimators=50)
    rf_enc = OneHotEncoder()
    rf_lr = LogisticRegression()
    rf.fit(X_train, y_train)
    rf_enc.fit(rf.apply(X_train))
    rf_lr.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
    y_pred_rf_lr = rf_lr.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
    fpr_rf_lr, tpr_rf_lr, _ = roc_curve(y_test, y_pred_rf_lr)
    auc = roc_auc_score(y_test, y_pred_rf_lr)
    print("RF+LR:", auc)
    return fpr_rf_lr, tpr_rf_lr
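# A minimal, self-contained sketch of the same leaf-encoding recipe used in the
# functions above (assumptions: scikit-learn >= 0.22 and synthetic data; the
# variable names below are illustrative, not taken from any snippet in this file).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
# Hold out separate sets for the forest, the logistic regression, and evaluation
X_rf, X_rest, y_rf, y_rest = train_test_split(X, y, test_size=0.5, random_state=0)
X_lr, X_te, y_lr, y_te = train_test_split(X_rest, y_rest, test_size=0.5, random_state=0)

rf = RandomForestClassifier(n_estimators=50, max_depth=3, random_state=0).fit(X_rf, y_rf)
enc = OneHotEncoder(handle_unknown='ignore').fit(rf.apply(X_rf))  # one column block per tree
lr = LogisticRegression(max_iter=1000).fit(enc.transform(rf.apply(X_lr)), y_lr)
print(roc_auc_score(y_te, lr.predict_proba(enc.transform(rf.apply(X_te)))[:, 1]))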
def RF_Logit(X_train, y_train, X_test):
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5)
    grd = RandomForestClassifier(max_depth=10, max_features=9)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression()
    grd.fit(X_train, y_train)
    grd_enc.fit(grd.apply(X_train))
    grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)), y_train_lr)
    y_hat_RF_log = grd_lm.predict_proba(grd_enc.transform(grd.apply(X_test)))[:, 1]
    return y_hat_RF_log
def main():
    # initialize sklearn objects
    rf = RandomForestClassifier(n_estimators=300, max_depth=3, verbose=1, random_state=SEED)
    logitsgd = SGDClassifier(loss='log', n_jobs=-1, verbose=1)
    encoder = OneHotEncoder()

    train, click = load_train_data(train_loc)

    # rf feature transformation
    rf.fit(train, click)
    train_rf = rf.apply(train)
    train = None

    # encode rf leaf indices for the logistic SGD model
    print('fitting encoder ... ')
    encoder.fit(train_rf)
    print('transforming ...')
    # transform the whole leaf-index matrix at once (the original row-by-row
    # vstack loop was both slow and broken, and the prediction step referred to
    # an undefined `embedded_test` variable)
    embedded = encoder.transform(train_rf)
    train_rf = None

    # train model
    logitsgd.fit(X=embedded, y=click)
    embedded = None

    # load testing data
    test = load_test_data(test_loc)

    # rf transform test
    test_rf = rf.apply(test)
    test = None

    # encode test
    print('transforming ...')
    embedded = encoder.transform(test_rf)
    test_rf = None

    # make predictions
    prediction = logitsgd.predict_proba(embedded)

    # save predictions
    prediction = np.array(prediction)
    np.savetxt("predictions.csv", prediction, delimiter=",")
def smts(aTrainY, aTrainX, aTestX, aR, aJins, aNTree):
    tTrainOut = pd.DataFrame()
    tTestOut = pd.DataFrame()
    tTrainX = aTrainX.iloc[:, 1:len(aTrainX.columns)]
    # list of train sids
    tTrSids = pd.DataFrame(aTrainX[aTrainX.columns[0]], columns=[aTrainX.columns[0]])
    tTestX = aTestX.iloc[:, 1:len(aTestX.columns)]
    # list of test sids
    tTeSids = pd.DataFrame(aTestX[aTestX.columns[0]], columns=[aTestX.columns[0]])
    # sids after deduplication
    sid_tr = pd.DataFrame()
    sid_te = pd.DataFrame()

    # build aJins RF learners
    for i in range(aJins):
        clf = RandomForestClassifier(n_estimators=aNTree, max_leaf_nodes=int((aR + 2) / 2))
        # fit on the training data
        clf.fit(tTrainX, aTrainY)
        # compute leaf-node indices for both train and test
        tIdxTr = clf.apply(tTrainX)
        tIdxTe = clf.apply(tTestX)
        # transformation for one forest finished: [N x R]
        sid_tr, tTrX = H_jX(tIdxTr, tTrSids, aR)
        sid_te, tTeX = H_jX(tIdxTe, tTeSids, aR)
        # append to the output
        tTrainOut = pd.concat([tTrainOut, tTrX], axis=1, ignore_index=True)
        tTestOut = pd.concat([tTestOut, tTeX], axis=1, ignore_index=True)
        print("Done " + str(i + 1) + "th RF")

    # TrainOut is [sid] + [N x R*Jins]; TestOut is [sid] + [N x R*Jins]
    # return both as DataFrames
    return pd.concat([sid_tr, tTrainOut], axis=1), pd.concat([sid_te, tTestOut], axis=1)
def _most_informative(self, X, clusterer, neighborhoods):
    n = X.shape[0]
    l = len(neighborhoods)

    neighborhoods_union = set()
    for neighborhood in neighborhoods:
        for i in neighborhood:
            neighborhoods_union.add(i)

    unqueried_indices = set(range(n)) - neighborhoods_union

    # If there is only one neighborhood then choose the point randomly
    if l <= 1:
        return np.random.choice(list(unqueried_indices)), [1]

    # Learn a random forest classifier
    n_estimators = 50
    rf = RandomForestClassifier(n_estimators=n_estimators)
    rf.fit(X, clusterer.labels_)

    # Compute the similarity matrix
    leaf_indices = rf.apply(X)
    S = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            S[i, j] = (leaf_indices[i, ] == leaf_indices[j, ]).sum()
    S = S / n_estimators

    p = np.empty((n, l))
    uncertainties = np.zeros(n)
    expected_costs = np.ones(n)

    # For each point that is not in any neighborhood...
    for x_i in range(n):
        if x_i not in neighborhoods_union:
            for n_i in range(l):
                p[x_i, n_i] = (S[x_i, neighborhoods[n_i]].sum() / len(neighborhoods[n_i]))

            # If the point is not similar to any neighborhood, set equal
            # probabilities of belonging to each neighborhood
            if np.all(p[x_i, ] == 0):
                p[x_i, ] = np.ones(l)

            p[x_i, ] = p[x_i, ] / p[x_i, ].sum()

            if not np.any(p[x_i, ] == 1):
                positive_p_i = p[x_i, p[x_i, ] > 0]
                uncertainties[x_i] = -(positive_p_i * np.log2(positive_p_i)).sum()
                expected_costs[x_i] = (positive_p_i * range(1, len(positive_p_i) + 1)).sum()
            else:
                uncertainties[x_i] = 0
                expected_costs[x_i] = 1  # ?

    normalized_uncertainties = uncertainties / expected_costs

    most_informative_i = np.argmax(normalized_uncertainties)
    return most_informative_i, p[most_informative_i]
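# The nested similarity loop above is O(n^2 * n_estimators) in pure Python. A hedged
# sketch of an equivalent vectorized computation (assuming `leaf_indices` has shape
# (n_samples, n_estimators), as returned by RandomForestClassifier.apply):
import numpy as np

def leaf_cooccurrence_similarity(leaf_indices):
    """Fraction of trees in which each pair of samples lands in the same leaf."""
    # Broadcast-compare every pair of rows: (n, 1, T) == (1, n, T) -> (n, n, T)
    same_leaf = leaf_indices[:, None, :] == leaf_indices[None, :, :]
    return same_leaf.mean(axis=2)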
def kfingerprinting(X_train, X_test, y_train, y_test):
    # logger.info('training...')
    model = RandomForestClassifier(n_jobs=-1, n_estimators=1000, oob_score=True)
    model.fit(X_train, y_train)
    # M = model.predict(X_test)
    # for i in range(0, len(M)):
    #     x = M[i]
    #     label = str(Y_test[i][0]) + '-' + str(Y_test[i][1])
    #     logger.info('%s: %s' % (str(label), str(x)))
    acc = model.score(X_test, y_test)
    # logger.info('Accuracy = %.4f' % acc)
    train_leaf = model.apply(X_train)
    test_leaf = model.apply(X_test)
    # print(model.feature_importances_)
    # joblib.dump(model, 'dirty-trained-kf.pkl')
    return train_leaf, test_leaf
class Forest():
    def __init__(self, n_estimators=10, categorical_features=[]):
        # note: OneHotEncoder(categorical_features=...) is the API of older
        # scikit-learn releases (< 0.22); newer versions use ColumnTransformer instead
        self.encoder = OneHotEncoder(categorical_features=categorical_features)
        self.forest = RandomForestClassifier(n_estimators=n_estimators)

    def fit(self, X, y):
        self.encoder.fit(X)
        self.forest.fit(self.encoder.transform(X), y)
        return self

    def predict(self, X):
        return self.forest.predict(self.encoder.transform(X))

    def votes(self, X):
        # FIXME There is probably a more clever way of doing this
        X_enc = self.encoder.transform(X)
        predictions = [t.predict(X_enc) for t in self.forest.estimators_]
        votes = pd.DataFrame.from_dict(
            dict(zip(range(len(self.forest.estimators_)), predictions)))
        return votes.transpose().sum()

    def score(self, X, y):
        return self.forest.score(self.encoder.transform(X), y)

    def apply(self, X):
        # encode first, so apply() sees the same feature space the forest was fit on
        return self.forest.apply(self.encoder.transform(X))
def rf_dis(n_trees, X, Y, train_indices, test_indices, seed):
    # note: the forest is hard-coded to 500 trees while the similarity below is
    # normalised by the n_trees argument, so callers are expected to pass n_trees=500
    clf = RandomForestClassifier(n_estimators=500, random_state=seed, oob_score=True, n_jobs=-1)
    clf = clf.fit(X[train_indices], Y[train_indices])
    pred = clf.predict(X[test_indices])
    weight = clf.score(X[test_indices], Y[test_indices])
    # print(1 - clf.oob_score_)
    n_samples = X.shape[0]
    dis = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        dis[i][i] = 1
    res = clf.apply(X)
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            a = np.ravel(res[i])
            b = np.ravel(res[j])
            score = a == b
            d = float(score.sum()) / n_trees
            dis[i][j] = dis[j][i] = d
    X_features1 = np.transpose(dis)
    X_features2 = X_features1[train_indices]
    X_features3 = np.transpose(X_features2)
    return X_features3[train_indices], X_features3[test_indices], weight, pred
def rf_dis(n_trees, X, Y, train_indices, test_indices, seed):
    # note: the forest is hard-coded to 500 trees while the loops below iterate
    # over n_trees, so callers are expected to pass n_trees=500
    clf = RandomForestClassifier(n_estimators=500, random_state=seed, oob_score=True, n_jobs=1)
    clf = clf.fit(X[train_indices], Y[train_indices])
    pred = clf.predict(X[test_indices])
    prediction = clf.predict(X)
    prob = clf.predict_proba(X[test_indices])
    weight = clf.oob_score_  # clf.score(X[test_indices], Y[test_indices])
    print(clf.score(X[train_indices], Y[train_indices]))
    # print(1 - clf.oob_score_)
    n_samples = X.shape[0]
    trees = clf.estimators_
    dis = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        dis[i][i] = 1
    res = clf.apply(X)
    www = 0.9
    pre = np.zeros((n_trees, n_samples))
    prepro = np.zeros((n_trees, n_samples))
    for i in range(n_trees):
        pre[i] = trees[i].predict(X)
        sss = trees[i].predict_proba(X)
        ss = []
        for j in range(n_samples):
            ss.append(max(sss[j]))
        prepro[i] = ss
    pre = pre.transpose()
    prepro = prepro.transpose()
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            a = np.ravel(res[i])
            b = np.ravel(res[j])
            c = np.ravel(pre[i])
            d = np.ravel(pre[j])
            e = np.ravel(prepro[i])
            f = np.ravel(prepro[j])
            score = 0
            for k in range(n_trees):
                if a[k] == b[k]:
                    s1 = 1
                else:
                    s1 = 0
                if c[k] == d[k]:
                    s2 = min(e[k], f[k])
                else:
                    s2 = 0
                # print(s2)  # debug output; commented out to avoid flooding stdout
                s = s1 * www + s2 * (1 - www)
                score = score + s
            dis[i][j] = dis[j][i] = score / n_trees
    X_features1 = np.transpose(dis)
    X_features2 = X_features1[train_indices]
    X_features3 = np.transpose(X_features2)
    return X_features3[train_indices], X_features3[test_indices], weight, pred, prob, clf
def test_drf_classifier_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.RandomForestClassifier

    # Run the h2o4gpu version of RandomForest classification
    drf = Solver(backend=backend, random_state=1234, oob_score=True)
    print("h2o4gpu fit()")
    drf.fit(X, y)

    # Run the Scikit-learn version of RandomForest classification
    from sklearn.ensemble import RandomForestClassifier
    drf_sk = RandomForestClassifier(random_state=1234, oob_score=True, max_depth=3)
    print("Scikit fit()")
    drf_sk.fit(X, y)

    if backend == "sklearn":
        assert (drf.predict(X) == drf_sk.predict(X)).all() == True
        assert (drf.predict_log_proba(X) == drf_sk.predict_log_proba(X)).all() == True
        assert (drf.predict_proba(X) == drf_sk.predict_proba(X)).all() == True
        assert (drf.score(X, y) == drf_sk.score(X, y)).all() == True
        assert (drf.decision_path(X)[1] == drf_sk.decision_path(X)[1]).all() == True
        assert (drf.apply(X) == drf_sk.apply(X)).all() == True

        print("Estimators")
        print(drf.estimators_)
        print(drf_sk.estimators_)

        print("n_features")
        print(drf.n_features_)
        print(drf_sk.n_features_)
        assert drf.n_features_ == drf_sk.n_features_

        print("n_classes_")
        print(drf.n_classes_)
        print(drf_sk.n_classes_)
        assert drf.n_classes_ == drf_sk.n_classes_

        print("classes_")
        print(drf.classes_)
        print(drf_sk.classes_)
        assert (drf.classes_ == drf_sk.classes_).all() == True

        print("n_outputs")
        print(drf.n_outputs_)
        print(drf_sk.n_outputs_)
        assert drf.n_outputs_ == drf_sk.n_outputs_

        print("Feature importance")
        print(drf.feature_importances_)
        print(drf_sk.feature_importances_)
        assert (drf.feature_importances_ == drf_sk.feature_importances_).all() == True

        print("oob_score")
        print(drf.oob_score_)
        print(drf_sk.oob_score_)
        assert drf.oob_score_ == drf_sk.oob_score_
def train(input_filename, num_train_examples, num_test_examples, block_size):
    # Load initial training data and test data
    X_train, y_train, X_test, y_test, scaler = loaddata(input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=150, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_test = encoder.transform(forest.apply(X_test))

    # Make sure that classes are weighted inversely to their frequencies
    weights = float(y_train.shape[0]) / (2 * numpy.bincount(y_train))
    class_weights = {0: weights[0], 1: weights[1]}

    learner = SGDClassifier(loss="hinge",
                            penalty="l2",
                            learning_rate="invscaling",
                            alpha=0.0001,
                            average=10 ** 4,
                            eta0=1.0,
                            class_weight=class_weights)

    num_passes = 3
    aucs = []

    for j in range(num_passes):
        for i in range(0, num_train_examples, block_size):
            df = pandas.read_csv(input_filename, header=None, skiprows=i, nrows=block_size)
            X_train = df.values[:, 1:]
            X_train = scaler.transform(X_train)
            X_train = encoder.transform(forest.apply(X_train))
            y_train = numpy.array(df.values[:, 0], dtype=int)
            del df

            learner.partial_fit(X_train, y_train, classes=numpy.array([0, 1]))
            y_pred_prob = learner.decision_function(X_test)
            auc = roc_auc_score(y_test, y_pred_prob)
            aucs.append([i + num_train_examples * j, auc])
            print(aucs[-1])

    df = pandas.DataFrame(aucs, columns=["Iterations", "AUC"])
    df = df.set_index("Iterations")
    return df
def kfingerprinting(X_train, X_test, y_train, y_test):
    logger.info('training...')
    model = RandomForestClassifier(n_jobs=-1, n_estimators=1000, oob_score=True)
    model.fit(X_train, y_train)
    # M = model.predict(X_test)
    # for i in range(0, len(M)):
    #     x = M[i]
    #     label = str(Y_test[i][0]) + '-' + str(Y_test[i][1])
    #     logger.info('%s: %s' % (str(label), str(x)))
    acc = model.score(X_test, y_test)
    logger.info('Accuracy = %.4f' % acc)
    # materialise the (leaf-vector, label) pairs so they can be iterated more than once
    train_leaf = list(zip(model.apply(X_train), y_train))
    test_leaf = list(zip(model.apply(X_test), y_test))
    joblib.dump(model, 'ranpad2_0610_2057_norm.pkl')
    return train_leaf, test_leaf
def _fit(self, dataset, **options):
    # self.param = param
    # print('model GBDT_LR fit begin:')
    # random forest model (adapted from a GBDT + LR recipe)
    rf = RandomForestClassifier(**options)
    rf.fit(dataset.x, dataset.y)

    enc = OneHotEncoder()
    # RandomForestClassifier.apply already returns a 2-D (n_samples, n_trees) array,
    # so the [:, :, 0] indexing used for gradient boosting is not needed here
    enc.fit(rf.apply(dataset.x))

    lm = LogisticRegression(**self.model_params)
    x = enc.transform(rf.apply(dataset.x))
    lm.fit(x, dataset.y)

    self.tree = rf
    self.enc = enc
    self.m = lm
class Train(object):
    """docstring for TrainModel"""

    def preprocess_model(self):
        '''This allows preprocessing using logistic regression'''
        X_train, X_train_lr, y_train, y_train_lr = train_test_split(
            self.train, self.predictors, test_size=0.5)
        encode = OneHotEncoder()
        logistic = LogisticRegression()
        self.clf = RandomForestClassifier(n_estimators=512, oob_score=True, n_jobs=-1)
        self.clf.fit(X_train, y_train)
        encode.fit(self.clf.apply(X_train))
        self.predmodel = logistic.fit(
            encode.transform(self.clf.apply(X_train_lr)), y_train_lr)

    def train_model(self):
        '''This is standard model training.

        For RandomForestClassifier to work there must be no NaN values; one way of
        handling this is to use the --impute option. This uses mean imputation,
        which is the least-informative imputer; imputation is done per feature.
        '''
        if np.any(np.isnan(self.train)):
            warnings.warn('RandomForestClassifier requires no missing data, '
                          'features being imputed by mean')
            X = self.train
            imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
            imp.fit(X)
            self.train = imp.transform(X)
        self.clf = RandomForestClassifier(n_estimators=512, oob_score=True, n_jobs=-1)
        self.predmodel = self.clf.fit(X=self.train, y=self.predictors, sample_weight=self.weights)

    def __init__(self, train):
        self.train = train.train
        self.predictors = train.predictors
        self.features = train.feature_names
        self.weights = train.weights
def RF_openworld(mon_type, path_to_dict=dic_of_feature_data):
    '''Produces leaf vectors used for classification.'''
    mon_training, mon_test = mon_train_test_references(mon_type, path_to_dict)
    unmon_training, unmon_test = unmon_train_test_references(path_to_dict)

    training = mon_training + unmon_training
    test = mon_test + unmon_test

    tr_data, tr_label1 = zip(*training)
    tr_label = list(zip(*tr_label1))[0]
    te_data, te_label1 = zip(*test)
    te_label = list(zip(*te_label1))[0]

    print("Training ...")
    model = RandomForestClassifier(n_jobs=-1, n_estimators=num_Trees, oob_score=True)
    model.fit(tr_data, tr_label)

    train_leaf = list(zip(model.apply(tr_data), tr_label))
    test_leaf = list(zip(model.apply(te_data), te_label))
    return train_leaf, test_leaf
def rf_lr(X_train, X_test, y_train, y_test):
    """
    RF + LR
    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :return:
    """
    # Supervised transformation based on a random forest
    rf = RandomForestClassifier(n_estimators=n_estimator, max_depth=3)
    rf.fit(X_train, y_train)

    # Fit a one-hot encoding of the leaf indices
    rf_enc = OneHotEncoder(categories='auto')
    rf_enc.fit(rf.apply(X_train))

    # Train LR on the one-hot encoded leaves
    # (X_train_lr / y_train_lr are expected to come from an earlier split at module level)
    rf_lr = LogisticRegression(solver='lbfgs', max_iter=1000)
    rf_lr.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

    # Predict with LR
    y_pred_rf_lr = rf_lr.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
    fpr_rf_lr, tpr_rf_lr, _ = roc_curve(y_test, y_pred_rf_lr)
    return fpr_rf_lr, tpr_rf_lr
class EntityEmbeddingTree(BaseEstimator, TransformerMixin):
    def __init__(self, *, numeric_columns, categorical_columns):
        self.__numeric_columns = numeric_columns
        self.__categorical_columns = categorical_columns
        self.__target_encoder, self.__one_hot_encoder = [None for _ in range(2)]
        self.__max_target, self.__max_param = [None for _ in range(2)]
        self.__clf = None

    def fit(self, X, y):
        X = X.copy(deep=True)
        y = y.copy(deep=True)

        self.__target_encoder = TargetEncoder()
        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna("missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.fit_transform(
            X[self.__categorical_columns], y)

        self.__max_target, self.__max_param = optimize_rf(X, y)
        self.__clf = RandomForestClassifier(
            min_samples_leaf=max(min(self.__max_param["min_samples_leaf"], 1.0), 0),
            n_estimators=max(int(round(self.__max_param["n_estimators"])), 1))
        self.__clf.fit(X, y)
        gc.collect()

        return self

    def transform(self, X):
        X = X.copy(deep=True)

        X[self.__numeric_columns] = X[self.__numeric_columns].fillna(-9999.0)
        X[self.__categorical_columns] = X[self.__categorical_columns].fillna("missing").astype(str)
        X[self.__categorical_columns] = self.__target_encoder.transform(
            X[self.__categorical_columns])
        gc.collect()

        return pd.DataFrame(self.__clf.apply(X)).astype(str)

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X=X, y=y)
        return self.transform(X)
def detect(self, X, y):
    X, y = self._check_everything(X, y)

    forest = RandomForestClassifier(n_estimators=self.n_estimators,
                                    max_leaf_nodes=self.max_leaf_nodes,
                                    n_jobs=self.n_jobs,
                                    random_state=self.random_state).fit(X, y)

    Xs = forest.apply(X)

    knn = KNeighborsClassifier(n_neighbors=self.n_neighbors,
                               metric='hamming',
                               algorithm='brute',
                               weights=self.weight,
                               n_jobs=self.n_jobs).fit(Xs, y)

    return self._get_kdn(knn, y)
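# Side note (a hedged sketch, not part of the class above): the hamming metric on
# RandomForestClassifier.apply() output is exactly 1 minus the fraction of trees in
# which two samples share a leaf, so the KNN above neighbours samples by forest similarity.
import numpy as np
from scipy.spatial.distance import hamming
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=100, random_state=0)
leaves = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y).apply(X)
shared = (leaves[0] == leaves[1]).mean()                            # fraction of trees sharing a leaf
print(np.isclose(hamming(leaves[0], leaves[1]), 1.0 - shared))      # True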
def runRfStack(inputfile, outputfile):
    '''Takes an input and an output file path.'''
    df_all = pd.read_csv(inputfile)
    df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:]))
    # NaNs are filled with 0 by default; an explicit negative value could be tried instead
    df_all.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_all = df_all.fillna(0)

    features = list(df_all.columns)
    features.remove('EID')
    label = 'TARGET'

    df_train, df_test = xtrain_and_test(df_all)
    clf = RandomForestClassifier(
        n_estimators=50,  # 50 trees
        max_depth=7,
        n_jobs=4,
        random_state=101)

    X_train = df_train[features]
    Y_label = df_train[label]
    X_test = df_test[features]
    clf.fit(X_train, Y_label)

    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature = pd.DataFrame(clf.apply(df_all[features]), columns=column)
    df_all[column] = df_new_feature
    df_all.to_csv(outputfile, index=False, index_label=False)
    del df_train, df_test, df_all
    return outputfile
def rf_embed(feature, labels, n_estimators=100, max_depth=3):
    """construct an embedding using a random forest.

    Args:
        feature (np.ndarray): a matrix of shape (len(labels), D) with D > 0.
        labels (list): a list of integers on the range [0, 1]
        n_estimators (int): the number of trees in the random forest
        max_depth (int): the maximum depth of each tree

    Returns:
        np.ndarray: a matrix containing all pairwise similarities for a single
            categorical distribution (feature).
    """
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 bootstrap=True,
                                 criterion='entropy',
                                 class_weight='balanced')
    clf.fit(feature, labels)
    leaves = clf.apply(feature)
    embedded = np.array(
        OneHotEncoder(categories='auto').fit_transform(leaves).todense())
    return 1. - cosine_distances(embedded)
# After fitting the random forest, one-hot encode the leaf indices (.apply)
# and feed them to logistic regression
rf = RandomForestClassifier(n_estimators=20, min_samples_split=10, min_samples_leaf=5,
                            max_features=4, max_depth=3, bootstrap=True)
onehot = OneHotEncoder()
lr = LogisticRegression(solver='lbfgs', max_iter=1000)

"""
Your Code Here
(Hint: the random forest leaf encoding (.apply) does not need the [:, :, 0] indexing;
call rf.apply() directly. Otherwise the code is the same as in the rest of the assignment.)
"""
rf.fit(train_X, train_Y)
onehot.fit(rf.apply(train_X))
lr.fit(onehot.transform(rf.apply(val_X)), val_Y)

# Output the random forest + leaf encoding + logistic regression results
pred_rf_lr = lr.predict_proba(onehot.transform(rf.apply(test_X)))[:, 1]
fpr_rf_lr, tpr_rf_lr, _ = roc_curve(test_Y, pred_rf_lr)

# Output the plain random forest results
pred_rf = rf.predict_proba(test_X)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(test_Y, pred_rf)

import matplotlib.pyplot as plt

# Plot the results
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='RandomForest')
plt.plot(fpr_rf_lr, tpr_rf_lr, label='RandomForest + LR')
plt.xlabel('False positive rate')
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

wine = load_wine()
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.3)

dtc = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=0, oob_score=True)
dtc.fit(x_train, y_train)
rfc.fit(x_train, y_train)
print("out-of-bag score", rfc.oob_score_)

rfc = RandomForestClassifier(n_estimators=25)
rfc.fit(x_train, y_train)
score = rfc.score(x_test, y_test)
fi = rfc.feature_importances_
result_apply = rfc.apply(x_test)
result_predict = rfc.predict(x_test)
result_proba = rfc.predict_proba(x_test)
# y_pred_rf = model.predict_proba(enc.transform(model.apply(X_test)))[:, 1]
# model_prob = model.predict_proba(X_test)
# score = log_loss(Y_test, model_prob)
# score_mean = mean_squared_error(Y_test, model.predict(X_test))
# print("Score:", score)
# print("MSE:", score_mean)
# print("model_prob", model_prob)
# model_prob = model_prob.reshape(1, -1)

rf = RandomForestClassifier(max_depth=3)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegressionCV(cv=5)
rf.fit(X_train, Y_train.ravel())
rf_enc.fit(rf.apply(X_train))
model_used = rf_lm.fit(rf_enc.transform(rf.apply(X_test)), Y_test.ravel())
preds = rf.predict(X_test)
print(type(model_used))

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(Y_test, y_pred_rf_lm, pos_label='pos')
roc_auc = auc(fpr_rf_lm, tpr_rf_lm)

plt.plot(fpr_rf_lm, tpr_rf_lm, label=str(roc_auc))
plt.title('ROC curve-Test Set (With Imbalance removal using SMOTE), AUC: ' + str(roc_auc))
plt.xlabel('False positive rate')
plt.plot([0, 1], [0, 1], 'k--')
# plt.legend('AUC:' + str(round(roc_auc)))
plt.ylabel('True positive rate')
class Classifier:
    def __init__(self, max_depth, directory, tree_file_name='tree.txt',
                 title='Embedded Values', already_classified=False, n_neighbors=100,
                 train=True, n_estimators=10, file_included_in_directory=False,
                 file_name='Untitled.csv', reduce=False, path='auto', dimension=3,
                 mock=False, index='gene_callers_id', create_folder=False,
                 folder_name='folder', separator=None, norm=True, _filter=epsilon,
                 rows=100, columns=100):
        self.algorithm_filename = 'metagenome-centric_classifier_algorithm.sav'
        self.train = train
        self.n_estimators = n_estimators
        self.max_depth = max_depth

        self.embedded = Embedding(n_neighbors, directory, path=path,
                                  reduced_dimension=dimension, train=train, mock=mock,
                                  file_name=file_name, index=index,
                                  create_folder=create_folder, folder_name=folder_name,
                                  separator=separator, norm=norm, _filter=_filter,
                                  rows=rows, columns=columns, tree_file='tree.txt')

        self.directory = self.embedded.directory
        self.dataframe = self.embedded.embedded_dataframe
        self.X = self.embedded.coverage_values

        if train:
            self.train_data(tree_file_name, title)
        self.fit_data()

    def train_data(self, tree_file_name, title):
        '''Allows user to manually train the data'''
        directory = self.embedded.directory
        coverage_values_file = self.embedded.embedded_coverage_values_file
        classified_values_file = self.embedded.embedded_classified_values_file

        training_data = training.Train(
            directory=directory,
            coverage_values_file=coverage_values_file,
            classified_values_file=classified_values_file,
            tree_file=tree_file_name,
            title=title)

        self.X = training_data.coverage_values
        self.y = training_data.classified_values

    def save_model(self):
        '''Saves model to a binary file using pickle'''
        with open(self.algorithm_filename, 'wb') as f:
            pickle.dump(self.model, f)

    def load_model(self):
        '''Loads model from a binary file using pickle'''
        # open for reading; the original opened with 'wb', which truncates the file
        with open(self.algorithm_filename, 'rb') as f:
            return pickle.load(f)

    def fit_data(self):
        '''Runs the random forest classifier'''
        # check whether a random forest classifier has already been saved
        try:
            self.model = self.load_model()
        except (EOFError, FileNotFoundError):
            self.model = RandomForestClassifier(n_estimators=self.n_estimators,
                                                max_depth=self.max_depth)
            if self.train:
                self.model.fit(self.X, self.y)
            else:
                self.model.apply(self.X)
        self.save_model()
# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator, random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

# Supervised transformation based on gradient boosted trees
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
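# A small shape check (hedged sketch with synthetic data) for why the gradient-boosting
# branch above needs [:, :, 0] while the random-forest branch does not:
# RandomForestClassifier.apply returns (n_samples, n_estimators), whereas
# GradientBoostingClassifier.apply returns (n_samples, n_estimators, n_trees_per_stage).
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

Xd, yd = make_classification(n_samples=200, random_state=0)
print(RandomForestClassifier(n_estimators=5, random_state=0).fit(Xd, yd).apply(Xd).shape)      # (200, 5)
print(GradientBoostingClassifier(n_estimators=5, random_state=0).fit(Xd, yd).apply(Xd).shape)  # (200, 5, 1)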
def show_roc(classifier, with_probas):
    cv = StratifiedKFold(labels[:-1], n_folds=5)
    for i, (train, test) in enumerate(cv):
        vectorizer = CountVectorizer(vocabulary=vocab)
        features = vectorizer.fit_transform(data[train])
        # transformer = TfidfTransformer()
        # tfidf_features = transformer.fit(features).transform(features)
        # X = np.array(tfidf_features.todense())
        # X = preprocess(features.toarray())
        X = features.toarray()
        y = labels[train]

        X, X1, y, y1 = train_test_split(X, y, test_size=0.5)
        clf1 = RandomForestClassifier(n_estimators=20)
        enc = OneHotEncoder()
        clf2 = RandomForestClassifier(n_estimators=10)
        clf1.fit(X, y)
        enc.fit(clf1.apply(X))
        clf2.fit(enc.transform(clf1.apply(X1)), y1)
        # clf = classifier.fit(X, y)

        X_test = vectorizer.transform(data[test])
        # t_f = preprocess(t_features.toarray())
        y_test = labels[test]
        # res = clf.predict(t_f)
        res = clf2.predict(enc.transform(clf1.apply(X_test)))
        if with_probas:
            res_p = clf2.predict_proba(enc.transform(clf1.apply(X_test)))
            # res_p = clf.predict_proba(t_features)
            fpr, tpr, _ = roc_curve(y_test, res_p[:, 1])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        check = zip(y_test, res)
        tp, tn, fp, fn = 0, 0, 0, 0
        for value, prediction in check:
            if (prediction and value):
                tp += 1
            if (prediction and not value):
                fp += 1
            if (not prediction and value):
                fn += 1
            if (not prediction and not value):
                tn += 1

        print('TP: {0}, TN: {1}, FP: {2}, FN: {3}'.format(tp, tn, fp, fn))
        print("Precision Score : %f" % metrics.precision_score(y_test, res))
        print("Recall Score : %f" % metrics.recall_score(y_test, res))
        print("Accuracy : %.4g" % metrics.accuracy_score(y_test, res))
        print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test, res))

    if with_probas:
        plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic example')
        plt.legend(loc="lower right")
        plt.show()