def eval_subset(train, test):
    """Score one train/test pair with supervised and clustering probes.

    :param train: indexable; ``train[0]`` is used as features, ``train[1]``
        as regression targets and ``train[2]`` as class labels.
    :param test: same layout as ``train``.
    :return: ``(MSELR, MSE, acc, DTacc, cnmi, cacc)`` where ``MSELR`` is the
        linear-regression test MSE, ``MSE`` the test MSE of ``decoder``,
        ``acc`` the 1-nearest-neighbour test accuracy, ``DTacc`` the
        extra-trees test accuracy, and ``cnmi`` / ``cacc`` the two values
        returned by ``unsupervised_evaluation.evaluation`` averaged over
        10 runs on the training features.
    """
    n_clusters = len(np.unique(train[2]))

    clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1)
    clf.fit(train[0], train[2])
    DTacc = float(clf.score(test[0], test[2]))

    clf = KNeighborsClassifier(n_neighbors=1, algorithm='brute', n_jobs=1)
    clf.fit(train[0], train[2])
    acc = float(clf.score(test[0], test[2]))

    LR = LinearRegression(n_jobs=-1)
    LR.fit(train[0], train[1])
    MSELR = float(((LR.predict(test[0]) - test[1])**2).mean())

    decoded = decoder((train[0], train[1]), (test[0], test[1]))
    MSE = float(((decoded - test[1])**2).mean())

    max_iters = 10
    cnmi, cacc = 0.0, 0.0
    for _ in range(max_iters):
        # Use a dedicated name for the clustering accuracy: rebinding `acc`
        # here would overwrite the 1-NN accuracy reported below.
        nmi, cluster_acc = unsupervised_evaluation.evaluation(
            train[0], n_clusters=n_clusters, y=train[2])
        cnmi += nmi / max_iters
        cacc += cluster_acc / max_iters

    print('nmi = {:.3f}, acc = {:.3f}'.format(cnmi, cacc))
    print('acc = {:.3f}, DTacc = {:.3f}, MSELR = {:.3f}, MSE = {:.3f}'.format(
        acc, DTacc, MSELR, MSE))
    return MSELR, MSE, acc, DTacc, float(cnmi), float(cacc)
def predict_et():
    """Train extra trees on the CSV data and write class probabilities.

    Reads ``data/X_train.csv``, ``data/y_train.csv``, ``data/X_test.csv``
    and ``data/y_test.csv``, prints train accuracy, test accuracy and the
    value of ``multiclass_log_loss`` on the test set, then writes the
    per-class probabilities to ``et_output.csv`` indexed by the test ``id``
    column.  No column is dropped, so ``id`` is also fed to the model as a
    feature.
    """
    X = pd.read_csv('data/X_train.csv', header=0)
    y = pd.read_csv('data/y_train.csv', header=0)['fault_severity']
    testX = pd.read_csv('data/X_test.csv', header=0)
    testY = pd.read_csv('data/y_test.csv', header=0)['fault_severity']

    et = ExtraTreesClassifier(n_estimators=440, random_state=1)
    et.fit(X, y)
    print(et.score(X, y))
    print(et.score(testX, testY))

    # Predict once and reuse the result; one column per class the model saw.
    proba = et.predict_proba(testX)
    pred_cols = ['predict_{}'.format(i) for i in range(proba.shape[1])]
    submission = pd.DataFrame(proba, index=testX.id, columns=pred_cols)
    print(multiclass_log_loss(testY.values, submission.values))
    submission.to_csv('et_output.csv', index_label='id')
def plot_confusion_matrix(model, relevant_features_new, y_new, threshold_classification):
    """Fit extra trees on one stratified fold and plot two confusion matrices.

    The first of three unshuffled stratified folds is used as the test
    split.  A confusion matrix and accuracy are produced first for the test
    rows selected by ``get_indexes_with_valid_predictions`` and then for the
    whole test split.

    :param model: passed through to ``Base_Classification``.
    :param relevant_features_new: feature frame (indexed with ``.iloc``).
    :param y_new: label frame (indexed with ``.iloc[rows, :]``).
    :param threshold_classification: forwarded to
        ``get_indexes_with_valid_predictions`` together with the predicted
        probabilities -- presumably a confidence cut-off; confirm against
        that helper.
    """
    extra_trees = ExtraTreesClassifier(n_estimators=1000, random_state=0)
    base_classification = Base_Classification(model, extra_trees)

    # random_state is meaningless without shuffling and current scikit-learn
    # rejects the combination, so it is not passed.
    sss = StratifiedKFold(n_splits=3, shuffle=False)
    train_index, test_index = next(iter(sss.split(relevant_features_new, y_new)))
    x_train = relevant_features_new.iloc[train_index, :]
    x_test = relevant_features_new.iloc[test_index, :]
    y_train = y_new.iloc[train_index, :]
    y_test = y_new.iloc[test_index, :]

    extra_trees.fit(x_train, y_train)
    pred = pd.DataFrame(extra_trees.predict_proba(x_test),
                        columns=extra_trees.classes_)

    accuracy_tools = base_classification.get_accuracy
    valid_indexes = accuracy_tools.get_indexes_with_valid_predictions(
        pred, threshold_classification)
    x_test_valid = x_test.iloc[valid_indexes, :]
    y_test_valid = y_test.iloc[valid_indexes, :]

    accuracy_tools.plot_confusion_matrix(x_test_valid, y_test_valid, extra_trees)
    print("Accuracy => {}".format(extra_trees.score(x_test_valid, y_test_valid)))
    accuracy_tools.plot_confusion_matrix(x_test, y_test, extra_trees)
    print("Accuracy => {}".format(extra_trees.score(x_test, y_test)))
def model_training(X, y):
    """Pick the best forest size on a hold-out split and persist the model.

    Forests with 60, 80, ..., 180 trees are trained on 75% of the data and
    scored on the remaining 25%; the first one reaching the highest
    accuracy is dumped to ``model_extratrees.pkl``.

    :param X: feature matrix.
    :param y: labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=5)
    best_acc = None
    best_model = None
    for n_trees in np.arange(60, 200, 20):
        print('training extra tree classifier, n estimators = {}'.format(n_trees))
        etf = ExtraTreesClassifier(n_estimators=n_trees, max_depth=None,
                                   min_samples_split=2,
                                   random_state=5).fit(X_train, y_train)
        acc = etf.score(X_test, y_test)  # scored once, reused below
        print('accuracy {}'.format(round(acc, 3)))
        # Strict '>' keeps the earliest of several equally good models.
        if best_model is None or acc > best_acc:
            best_acc, best_model = acc, etf
    print('top accuracy {}'.format(round(best_acc, 3)))
    # The winner is kept instead of being retrained: with the same data and
    # random_state a refit would reproduce the same forest.
    joblib.dump(best_model, 'model_extratrees.pkl')
    print('model saved')
def dimensionReduction(data, target, fea_alg='et'):
    """Evaluate a feature-selection strategy on the last of 5 stratified folds.

    :param data: 2-D feature array (indexed as ``data[rows]`` and
        ``data[:, cols]``).
    :param target: label array aligned with ``data``.
    :param fea_alg: ``'et'`` (extra-trees importances), ``'lsvc'``
        (L1-penalised LinearSVC coefficients) or ``'nb'`` (Gaussian naive
        Bayes, features ranked by ascending summed per-class variance).
    :return: for ``'nb'`` the columns of ``data`` selected by the best
        scoring prefix of the ranking; for ``'et'`` / ``'lsvc'`` the input
        ``data`` unchanged (those branches only print the number of selected
        features, the score before and after selection and the first five
        weights).
    """
    nFold = 5
    skf = StratifiedKFold(n_splits=nFold)
    # The splitter is exhausted on purpose: only the last fold is used.
    for train_index, test_index in skf.split(data, target):
        pass
    trainingSet, trainingLabels = data[train_index], target[train_index]
    testSet, testLabels = data[test_index], target[test_index]

    if fea_alg in ('et', 'lsvc'):
        if fea_alg == 'et':
            clf = ExtraTreesClassifier(n_estimators=300, random_state=0,
                                       max_features="sqrt")
        else:
            # Fitted on the training fold only; the original also fitted on
            # the full data first, which the next fit discarded.
            clf = LinearSVC(C=0.01, penalty="l1", dual=False)
        clf.fit(trainingSet, trainingLabels)
        # Snapshot the weights before clf is refitted on the reduced data.
        select = clf.feature_importances_ if fea_alg == 'et' else clf.coef_
        score0 = clf.score(testSet, testLabels)
        model = SelectFromModel(clf, prefit=True)
        train_new = model.transform(trainingSet)
        test_new = model.transform(testSet)
        score1 = clf.fit(train_new, trainingLabels).score(test_new, testLabels)
        print('%s %s %s %s' % (train_new.shape[1], score0, score1, select[:5]))
    elif fea_alg == 'nb':
        clf = GaussianNB()
        clf.fit(trainingSet, trainingLabels)
        # `sigma_` was renamed to `var_` in newer scikit-learn releases.
        feature_rank = clf.var_ if hasattr(clf, 'var_') else clf.sigma_
        ind = np.argsort(np.sum(feature_rank, axis=0))
        max_score = 0
        max_i = 0
        for i in range(0, len(ind) + 1, 10):
            cols = ind[:i + 1]
            score = clf.fit(trainingSet[:, cols],
                            trainingLabels).score(testSet[:, cols], testLabels)
            if score > max_score:
                max_score = score
                max_i = i
        feature_ind = ind[:max_i + 1]
        # Select feature columns (the original indexed rows here).
        data = data[:, feature_ind]
    return data
def train_l2_et(x_train, x_test, y_train, y_test):
    """Fit a 256-tree extra-trees model and return its training predictions.

    :param x_train: training features.
    :param x_test: test features; only scored when ``y_test`` is given.
    :param y_train: training labels.
    :param y_test: test labels, or ``None`` to report the training score.
    :return: predictions for ``x_train`` reshaped to a single column.
    """
    model = ExtraTreesClassifier(n_estimators=256)
    model.fit(x_train, y_train)
    if y_test is None:
        accuracy = model.score(x_train, y_train)
    else:
        accuracy = model.score(x_test, y_test)
    print('ExtraTreesClassifier:', accuracy)
    train_predictions = model.predict(x_train)
    return np.reshape(train_predictions, (-1, 1))
def get_ERT(Xtrain, Xtest, Ytrain, Ytest, gtree):
    """Fit a 1000-tree extra-trees model using a tuned tree's settings.

    :param Xtrain: training features.
    :param Xtest: test features.
    :param Ytrain: training labels.
    :param Ytest: test labels.
    :param gtree: object whose ``best_estimator_`` supplies
        ``max_features``, ``max_depth`` and ``min_samples_split``.
    :return: the fitted classifier; train and test scores are printed as
        percentages.
    """
    tuned = gtree.best_estimator_
    ert = ExtraTreesClassifier(
        n_estimators=1000,
        max_features=tuned.max_features,
        max_depth=tuned.max_depth,
        min_samples_split=tuned.min_samples_split,
        n_jobs=-1,
    )
    ert.fit(Xtrain, Ytrain)
    train_score = ert.score(Xtrain, Ytrain)
    test_score = ert.score(Xtest, Ytest)
    print('Extremely Randomized Trees, train: {0:.02f}% '.format(train_score * 100))
    print('Extremely Randomized Trees, test: {0:.02f}% '.format(test_score * 100))
    return ert
def train_l1_et(x_train, x_test, y_train, y_test):
    """Fit a 256-tree extra-trees model and predict both splits.

    :param x_train: training features.
    :param x_test: test features.
    :param y_train: training labels.
    :param y_test: test labels, or ``None`` to report the training score.
    :return: ``[train_predictions, test_predictions]``, each reshaped to a
        single column.
    """
    model = ExtraTreesClassifier(n_estimators=256, n_jobs=-1)
    model.fit(x_train, y_train)
    if y_test is None:
        accuracy = model.score(x_train, y_train)
    else:
        accuracy = model.score(x_test, y_test)
    print('ExtraTreesClassifier:', accuracy)
    # Same order as the original return value: training split first.
    on_train = np.reshape(model.predict(x_train), (-1, 1))
    on_test = np.reshape(model.predict(x_test), (-1, 1))
    return [on_train, on_test]
def EnsembleMethods(X, y):
    """Compare three tree-based classifiers on PCA-projected data.

    The data is split with ``TRAIN_TEST_SPLIT_RATIO`` as test size, projected
    onto 120 whitened randomized-PCA components fitted on the training part,
    and a decision tree, a 10-tree random forest and a 10-tree extra-trees
    model are fitted in turn.  Train and test scores are printed for each.

    :param X: feature matrix.
    :param y: labels.
    """
    # divide our data set into a training set and a test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

    # fit the randomized PCA model on the training set only
    num_components = 120
    print("Extracting the top %d eigenfaces from %d faces" %
          (num_components, X_train.shape[0]))
    pca = RandomizedPCA(n_components=num_components, whiten=True).fit(X_train)

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done ")

    # min_samples_split must be at least 2: current scikit-learn rejects 1,
    # and a one-sample node cannot be split anyway.
    contenders = (
        ("====== Decision Tree Classifier ========",
         DecisionTreeClassifier(max_depth=None, min_samples_split=2,
                                random_state=0)),
        ("====== Random Forest Classifier ========",
         RandomForestClassifier(n_estimators=10, max_depth=None,
                                min_samples_split=2, random_state=0)),
        ("====== Extra Trees Classifier ========",
         ExtraTreesClassifier(n_estimators=10, max_depth=None,
                              min_samples_split=2, random_state=0)),
    )
    for banner, classifier in contenders:
        classifier.fit(X_train_pca, y_train)
        print(banner)
        print('TRAIN SCORE', classifier.score(X_train_pca, y_train))
        print('TEST SCORE', classifier.score(X_test_pca, y_test))
def test3():
    """Compare three classifiers on the full-harmonization data.

    Loads the train/dev/test chorale sequences, keeps feature columns 0-9,
    concatenates train and dev into one training set, drops every frame
    whose label equals the padding label (taken to be the largest label in
    the test set), writes the resulting arrays to ``data/chorales_sm.hdf5``
    and prints train/test accuracy for a random forest (raw features) and
    for extra trees and multinomial logistic regression (features returned
    by ``encode``).
    """
    print("3. Testing softmax for full harmonization...")
    trainXc, trainyc = load_dataset("train", "data/chorales_rnn.hdf5")
    devXc, devyc = load_dataset("dev", "data/chorales_rnn.hdf5")
    testXc, testyc = load_dataset("test", "data/chorales_rnn.hdf5")

    # Remove Oracle features
    kept_columns = list(range(0, 10))
    trainXc = [X[:, kept_columns] for X in trainXc]
    devXc = [X[:, kept_columns] for X in devXc]
    testXc = [X[:, kept_columns] for X in testXc]

    # Aggregate data: one stack call instead of pairwise re-stacking.
    Xtrain = numpy.vstack(trainXc + devXc)
    ytrain = numpy.hstack(list(trainyc) + list(devyc))
    Xtest = numpy.vstack(testXc)
    ytest = numpy.hstack(list(testyc))

    # Remove padding
    ypadding = ytest.max()
    train_mask = ytrain != ypadding
    test_mask = ytest != ypadding
    Xtrain, ytrain = Xtrain[train_mask], ytrain[train_mask]
    Xtest, ytest = Xtest[test_mask], ytest[test_mask]

    _, Xtrainsparse, Xtestsparse = encode(Xtrain, Xtest)

    # Keyword arguments: newer scikit-learn no longer accepts these
    # hyper-parameters positionally.
    RF = RandomForestClassifier(n_estimators=10, criterion="entropy",
                                max_depth=None)
    RF.fit(Xtrain, ytrain)

    # Write full harmonization data
    with h5py.File('data/chorales_sm.hdf5', "w", libver="latest") as f:
        f.create_dataset("Xtrain", Xtrain.shape, dtype="i", data=Xtrain)
        f.create_dataset("ytrain", ytrain.shape, dtype="i", data=ytrain)
        f.create_dataset("Xtest", Xtest.shape, dtype="i", data=Xtest)
        f.create_dataset("ytest", ytest.shape, dtype="i", data=ytest)
    print("Full harmonization data written")

    score_RF_train = RF.score(Xtrain, ytrain)
    score_RF_test = RF.score(Xtest, ytest)
    print("R-FOREST: %.2f%% training, %.2f%% test" %
          (score_RF_train * 100, score_RF_test * 100))

    # min_samples_split=1 is rejected by current scikit-learn; 2 is the
    # smallest node that can actually be split.
    ERF = ExtraTreesClassifier(n_estimators=40, max_depth=None,
                               min_samples_split=2, random_state=0)
    ERF.fit(Xtrainsparse, ytrain)
    score_ERF_train = ERF.score(Xtrainsparse, ytrain)
    score_ERF_test = ERF.score(Xtestsparse, ytest)
    print("EXTRA TREES: %.2f%% training, %.2f%% test" %
          (score_ERF_train * 100, score_ERF_test * 100))

    logit = linear_model.LogisticRegression(multi_class='multinomial',
                                            solver='lbfgs', C=1)
    logit.fit(Xtrainsparse, ytrain)
    score_logit_train = logit.score(Xtrainsparse, ytrain)
    score_logit_test = logit.score(Xtestsparse, ytest)
    print("LOGIT: %.2f%% training, %.2f%% test" %
          (score_logit_train * 100, score_logit_test * 100))
def extratrees_clf():
    """Fit extra trees on the module-level split and print both scores.

    Reads the globals ``train_dataset_X`` (only for its column count),
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    """
    # 3.2. Create classifier: sqrt(number of columns) features per split.
    n_columns = train_dataset_X.shape[1]
    model = ExtraTreesClassifier(
        n_estimators=100,
        max_features=int(np.sqrt(n_columns)),
        n_jobs=-1,
        random_state=42,
    )

    # 3.3. Fit classifier.
    model.fit(X_train, y_train)

    # 4. Calculate score.
    # Notes kept from the original source: "FAILED TO CONVERGE",
    # train 0.9925, test 0.782.
    train_score = model.score(X_train, y_train)
    print("Train set score: {0}".format(train_score))
    test_score = model.score(X_test, y_test)
    print("Test set score: {0}".format(test_score))
def learn(f):
    """Train and evaluate an extra-trees model on the labelled EXECUTE rows.

    Rows of the module-level ``raw_data`` whose label is ``'unknown'`` or
    whose ``'file type'`` is not ``'EXECUTE'`` are discarded.  A 10-fold
    cross-validated accuracy is printed, then the model is fitted on a
    seeded train/test split, its hold-out accuracy is printed, and the 20
    most important features are listed.

    :param f: iterable of feature column names.
    :return: the classifier fitted on the training part of the split.
    """
    columns = list(f)
    print('testing classifier')
    data = raw_data[raw_data['label'] != 'unknown']
    data = data[data['file type'] == 'EXECUTE']
    # DataFrame.as_matrix no longer exists in current pandas.
    X = data[columns].values
    y = np.array(data['label'].tolist())

    clf = ExtraTreesClassifier(n_estimators=100)
    # NOTE(review): sklearn.cross_validation only exists in old scikit-learn
    # releases; left as-is because it depends on the file's imports.
    scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10)
    print("predicted accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    seed = 3301
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    print("actual accuracy: %0.2f" % accuracy)

    # sorted() works whether zip returns a list or an iterator.
    importances = sorted(zip(columns, clf.feature_importances_),
                         key=lambda pair: pair[1], reverse=True)
    for name, importance in importances[0:20]:
        print("%s %s" % (name.ljust(30), importance))
    return clf
def train_model(train):
    """Train an extra-trees loan-status classifier on oversampled data.

    The ``'Loan Status'`` column of ``train`` is label-encoded **in place**
    (the caller's frame is modified), ``'Term'`` and ``'Home Ownership'``
    are one-hot encoded, a fixed set of columns is dropped, classes are
    balanced with random oversampling, and a default
    ``ExtraTreesClassifier`` is fitted on 80% of the resampled rows.

    :param train: DataFrame containing the columns referenced above.
    :return: ``(model, accuracy)`` with accuracy measured on the held-out
        20% of the resampled data.
    """
    from imblearn.over_sampling import RandomOverSampler

    train['Loan Status'] = LabelEncoder().fit_transform(train['Loan Status'])
    train = pd.get_dummies(data=train, columns=['Term', 'Home Ownership'],
                           drop_first=True)
    y = train['Loan Status']
    # The label is dropped up front and taken from the resampler's second
    # return value, instead of being carried through the feature frame.
    X = train.drop(columns=[
        'Purpose', 'Monthly Debt', 'Years of Credit History',
        'Number of Open Accounts', 'Number of Credit Problems',
        'Current Credit Balance', 'Maximum Open Credit', 'Bankruptcies',
        'Tax Liens', 'Loan Status'
    ])

    # fit_resample replaces the fit_sample alias dropped by imbalanced-learn.
    X_ros, y_ros = RandomOverSampler().fit_resample(X, y)
    X = X_ros.values
    y = y_ros.values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)
    model = ExtraTreesClassifier()
    model.fit(X_train, y_train)
    return model, model.score(X_test, y_test)
def ERFC_Classifier(X_train, X_cv, X_test, Y_train, Y_cv, Y_test, Actual_DS):
    """Fit a 100-tree extra-trees model and return class probabilities.

    Prints the validation accuracy, a confusion table of decoded labels
    (decoded with the module-level ``label_enc``) with a ``pct`` column, the
    validation log loss and the elapsed time.

    :param X_train: training features.
    :param X_cv: validation features.
    :param X_test: test features.
    :param Y_train: training labels.
    :param Y_cv: validation labels.
    :param Y_test: unused.
    :param Actual_DS: features of the data set to predict.
    :return: three DataFrames of predicted probabilities, for ``X_cv``,
        ``X_test`` and ``Actual_DS`` respectively.
    """
    print("***************Starting Extreme Random Forest Classifier***************")
    t0 = time()
    clf = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)
    print("Extreme Random Forest Classifier - {0:.2f}%".format(100 * score))

    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    # Normalise each row by its own total (axis=0 aligns the row sums with
    # the row index), then report the largest share per actual class.
    row_share = Summary.divide(Summary.sum(axis=1), axis=0)
    Summary['pct'] = row_share.max(axis=1) * 100
    print(Summary)

    # Check with log loss function
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending Extreme Random Forest Classifier***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
def test_vocabulary(vocabulary_sizes, desc_sel, desc_tr, desc_te):
    """Score a bag-of-words extra-trees classifier per vocabulary size.

    For every size the k-means model is obtained from
    ``load_or_compute_pickle``, 150 training and 150 test histograms are
    built, and a 100-tree, depth-10 extra-trees model is fitted and scored.
    The list of scores is pickled to ``vocabulary_scores.pickle``.

    :param vocabulary_sizes: iterable of cluster counts to evaluate.
    :param desc_sel: unused.
    :param desc_tr: training descriptors passed to ``bag_of_words_histogram``.
    :param desc_te: test descriptors passed to ``bag_of_words_histogram``.
    :return: list of test scores, one per vocabulary size.
    """
    # Labels are the same for every vocabulary size: 150 samples in
    # consecutive groups of 15, and both sets share the same layout.
    train_labels = [i // 15 for i in range(150)]
    test_labels = train_labels

    score_list = []
    for num_clusters in vocabulary_sizes:
        kmeans = load_or_compute_pickle(num_clusters)

        # Calculate the bag of words for training and test data
        data_train = bag_of_words_histogram(desc_tr, kmeans, num_clusters).reshape(
            150, num_clusters)
        data_test = bag_of_words_histogram(desc_te, kmeans, num_clusters).reshape(
            150, num_clusters)

        print('Computing RF for a vocabulary of', num_clusters)
        RFC = ExtraTreesClassifier(n_estimators=100, max_depth=10,
                                   bootstrap=False,
                                   random_state=0).fit(data_train, train_labels)
        score = RFC.score(data_test, test_labels)
        score_list.append(score)
        print('score:', score)

    # `with` guarantees the file is closed even if pickling fails.
    with open('vocabulary_scores.pickle', 'wb') as pickle_out:
        pickle.dump(score_list, pickle_out)
    return score_list
def get_ERT(Xtrain, Ytrain, baseTree, Xtest=None, Ytest=None, verbose=0):
    """Fit a 1000-tree extra-trees model using a tuned tree's settings.

    :param Xtrain: training features.
    :param Ytrain: training labels.
    :param baseTree: object whose ``best_estimator_`` supplies
        ``max_features``, ``max_depth`` and ``min_samples_split``.
    :param Xtest: optional test features.
    :param Ytest: optional test labels (used only when ``Xtest`` is given).
    :param verbose: when ``1``, print the train score and, if ``Xtest`` is
        given, the test score as percentages.
    :return: the fitted classifier.
    """
    tuned = baseTree.best_estimator_
    ert = ExtraTreesClassifier(
        n_estimators=1000,
        max_features=tuned.max_features,
        max_depth=tuned.max_depth,
        min_samples_split=tuned.min_samples_split,
        n_jobs=-1,
    )
    ert.fit(Xtrain, Ytrain)

    if verbose == 1:
        train_score = ert.score(Xtrain, Ytrain)
        print('Extremely Randomized Trees, train: {0:.02f}% '.format(train_score * 100))
        # Identity test: `!=` on an array would compare element-wise.
        if Xtest is not None:
            test_score = ert.score(Xtest, Ytest)
            print('Extremely Randomized Trees, test: {0:.02f}% '.format(test_score * 100))
    return ert
def learn(f):
    """Train and evaluate an extra-trees model on the labelled EXECUTE rows.

    Rows of the module-level ``raw_data`` whose label is ``'unknown'`` or
    whose ``'file type'`` is not ``'EXECUTE'`` are discarded.  A 10-fold
    cross-validated accuracy is printed, then the model is fitted on a
    seeded train/test split, its hold-out accuracy is printed, and the 20
    most important features are listed.

    :param f: iterable of feature column names.
    :return: the classifier fitted on the training part of the split.
    """
    columns = list(f)
    print('testing classifier')
    data = raw_data[raw_data['label'] != 'unknown']
    data = data[data['file type'] == 'EXECUTE']
    # DataFrame.as_matrix no longer exists in current pandas.
    X = data[columns].values
    y = np.array(data['label'].tolist())

    clf = ExtraTreesClassifier(n_estimators=100)
    # NOTE(review): sklearn.cross_validation only exists in old scikit-learn
    # releases; left as-is because it depends on the file's imports.
    scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10)
    print("predicted accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    seed = 3301
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    print("actual accuracy: %0.2f" % accuracy)

    # sorted() works whether zip returns a list or an iterator.
    importances = sorted(zip(columns, clf.feature_importances_),
                         key=lambda pair: pair[1], reverse=True)
    for name, importance in importances[0:20]:
        print("%s %s" % (name.ljust(30), importance))
    return clf
def do_extra_trees(md=None):
    """Return the test score of a 100-tree extra-trees model on the glass data.

    :param md: ``max_depth`` for the trees (``None`` leaves it unlimited).
    :return: score on the test split returned by ``analysis_glass``.
    """
    from sklearn.ensemble import ExtraTreesClassifier

    train_X, train_Y, test_X, test_Y = analysis_glass()
    forest = ExtraTreesClassifier(max_depth=md, n_estimators=100)
    forest.fit(train_X, train_Y)
    test_score = forest.score(test_X, test_Y)
    return test_score
def get_ERT(Xtrain, Ytrain, tree, Xtest=None, Ytest=None, verbose=0):
    """Fit a 1000-tree extra-trees model using a tuned tree's settings.

    :param Xtrain: training features.
    :param Ytrain: training labels.
    :param tree: object whose ``best_estimator_`` supplies
        ``max_features``, ``max_depth`` and ``min_samples_split``.
    :param Xtest: optional test features.
    :param Ytest: optional test labels (used only when ``Xtest`` is given).
    :param verbose: when ``1``, print the train score and, if ``Xtest`` is
        given, the test score as percentages.
    :return: the fitted classifier.
    """
    tuned = tree.best_estimator_
    ert = ExtraTreesClassifier(
        n_estimators=1000,
        max_features=tuned.max_features,
        max_depth=tuned.max_depth,
        min_samples_split=tuned.min_samples_split,
        n_jobs=-1,
    )
    ert.fit(Xtrain, Ytrain)

    if verbose == 1:
        train_score = ert.score(Xtrain, Ytrain)
        print('Extremely Randomized Trees, train: {0:.02f}% '.format(train_score * 100))
        # Identity test: `!=` on an array would compare element-wise.
        if Xtest is not None:
            test_score = ert.score(Xtest, Ytest)
            print('Extremely Randomized Trees, test: {0:.02f}% '.format(test_score * 100))
    return ert
def random_forest_cross_validate(targets, features, nprocesses=-1):
    """Run 5-fold cross-validation of a 100-tree extra-trees model.

    For every fold the score and the value of ``get_metric`` are printed.
    Afterwards the non-zero feature importances of the last fitted model are
    printed in descending order and pickled to ``important_features.p``.

    NOTE(review): the original was collapsed onto one line; the importance
    report is assumed to run once after the fold loop.  The per-fold metric
    values are collected in ``results`` but never returned.

    :param targets: labels, indexable by the fold masks.
    :param features: feature table, indexable by the fold masks and exposing
        ``columns``.
    :param nprocesses: ``n_jobs`` used while fitting.
    """
    cv = cross_validation.KFold(len(features), k=5, indices=False)

    # iterate through the training and test cross validation segments and
    # run the classifier on each one, aggregating the results into a list
    results = []
    for i, (traincv, testcv) in enumerate(cv):
        cfr = ExtraTreesClassifier(
            n_estimators=100,
            max_features=None,
            verbose=2,
            compute_importances=True,
            n_jobs=nprocesses,
            random_state=0,
        )
        print("Fitting cross validation #{0}".format(i))
        cfr.fit(features[traincv], targets[traincv])

        print("Scoring cross validation #{0}".format(i))
        cfr.set_params(n_jobs=1)  # scoring runs single-process
        score = cfr.score(features[testcv], targets[testcv])
        print("Score for cross validation #{0}, score: {1}".format(i, score))

        mean_diff = get_metric(cfr, features[testcv], targets[testcv])
        print("Mean difference: {0}".format(mean_diff))
        results.append(mean_diff)

    print("Features importance")
    features_list = [
        (features.columns[j], importance)
        for j, importance in enumerate(cfr.feature_importances_)
        if importance > 0.0
    ]
    features_list = sorted(features_list, key=lambda x: x[1], reverse=True)
    for j, tup in enumerate(features_list):
        print("{0} {1}".format(j, tup))

    # `with` closes the file; the original left the handle open.
    with open("important_features.p", 'wb') as handle:
        pickle.dump(features_list, handle)
    print("Mean difference: {0}".format(mean_diff))
def ExtrExtraTrees_classification(train, test, train_labels, test_labels, res=None):
    """
    Fit a default ExtraTreesClassifier and report its test performance.

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: optional dict; an ``"ExtraTrees"`` entry holding the model,
        its accuracy and its name is stored in it.  A fresh dict is used
        when omitted.
    :return: ``(score, model)``.  Reporting is delegated to
        ``utils.report_and_confmat``.
    """
    # A `{}` default would be shared between all calls.
    if res is None:
        res = {}

    print("Classifying with ExtraTrees...")
    extra = ExtraTreesClassifier()
    extra.fit(train, train_labels)
    prediction = extra.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ExtraTrees")
    score = extra.score(test, test_labels)
    res["ExtraTrees"] = {
        "model": extra,
        "accuracy": score,
        "name": "ExtraTreesClassifier"
    }
    print("ExtraTrees ended...")
    return score, extra
def many_classify_dtree(X, Y):
    """Train one 10-tree extra-trees model per data chunk and average scores.

    ``X`` and ``Y`` are cut into four label-based ``.loc`` slices; each
    chunk is split 90/10 and scored independently, and the mean of the four
    scores is printed.

    NOTE(review): ``.loc`` slices include both end points, so with a default
    integer index rows 15000, 30000 and 45000 fall into two chunks, and the
    last ``X`` slice ends at 59999 while the ``Y`` slice ends at 60000.  The
    bounds are kept exactly as in the original.

    :param X: feature frame.
    :param Y: label frame.
    """
    print("Building the model for decision trees...")
    x_bounds = ((0, 15000), (15000, 30000), (30000, 45000), (45000, 59999))
    y_bounds = ((0, 15000), (15000, 30000), (30000, 45000), (45000, 60000))
    x = [X.loc[start:stop] for start, stop in x_bounds]
    y = [Y.loc[start:stop] for start, stop in y_bounds]

    scores = []
    for x_chunk, y_chunk in zip(x, y):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            x_chunk, y_chunk, test_size=0.1)
        clf = ExtraTreesClassifier(n_estimators=10)
        clf = clf.fit(X_train, np.ravel(y_train))
        scores.append(clf.score(X_test, np.ravel(y_test)))

    print("Classification Score using Decision Tree with Drift Detection:" +
          str(sum(scores) / 4))
def do_extra_trees(md=None):
    """Return the test score of a 100-tree extra-trees model on the glass data.

    :param md: ``max_depth`` for the trees (``None`` leaves it unlimited).
    :return: score on the test split returned by ``analysis_glass``.
    """
    from sklearn.ensemble import ExtraTreesClassifier

    train_X, train_Y, test_X, test_Y = analysis_glass()
    forest = ExtraTreesClassifier(max_depth=md, n_estimators=100)
    forest.fit(train_X, train_Y)
    test_score = forest.score(test_X, test_Y)
    return test_score
def Extreme_rf_dis(n_trees, X, Y, train_indices, test_indices, seed):
    """Build forest-proximity features from an extra-trees model.

    The model is fitted on the training rows.  For every pair of samples the
    proximity is the fraction of trees in which both land in the same leaf,
    so it is 1 on the diagonal.

    :param n_trees: number of trees in the forest.
    :param X: feature array for all samples.
    :param Y: labels for all samples.
    :param train_indices: row selector for the training samples.
    :param test_indices: row selector for the test samples.
    :param seed: ``random_state`` for the forest.
    :return: ``(train_features, test_features, weight, pred)``: proximities
        of the training rows and of the test rows to the training samples,
        the test score of the forest, and its test predictions.
    """
    # oob_score is not requested: it needs bootstrap=True, which extra trees
    # do not use by default, and the value was never read.
    clf = ExtraTreesClassifier(n_estimators=n_trees, random_state=seed, n_jobs=-1)
    clf = clf.fit(X[train_indices], Y[train_indices])
    pred = clf.predict(X[test_indices])
    weight = clf.score(X[test_indices], Y[test_indices])

    leaves = clf.apply(X)  # one leaf index per sample and tree
    n_samples = X.shape[0]
    # Normalise by the number of trees actually present in `leaves`.
    tree_count = float(leaves.shape[1])
    dis = np.empty((n_samples, n_samples))
    for i in range(n_samples):
        # One vectorised comparison per row instead of a pairwise loop.
        dis[i] = (leaves == leaves[i]).sum(axis=1) / tree_count

    against_train = dis[:, train_indices]
    return against_train[train_indices], against_train[test_indices], weight, pred
def learnly():
    """Fit a 30-tree extra-trees model on the module-level data and score it.

    Reads the globals ``features_train``, ``labels_train``,
    ``features_test`` and ``labels_test``.

    :return: ``(classifier, test_score)``; the score is also printed.
    """
    clf = ExtraTreesClassifier(n_estimators=30)
    clf.fit(features_train, labels_train)
    score = clf.score(features_test, labels_test)
    print(score)
    return clf, score
def extraTreesClassifier(X, Y, X_test, Y_test):
    """Fit a seeded 10-tree extra-trees model and print both scores.

    :param X: training features.
    :param Y: training labels.
    :param X_test: test features.
    :param Y_test: test labels.
    """
    model = ExtraTreesClassifier(n_estimators=10, random_state=0)
    model.fit(X, Y)
    train_score = model.score(X, Y)
    print('Training set score: ' + str(train_score))
    test_score = model.score(X_test, Y_test)
    print('Test set score: ' + str(test_score))
def EnsembleMethod(X, y):
    """Compare a decision tree, a random forest and extra trees.

    The data is split with ``TRAIN_TEST_SPLIT_RATIO`` as test size; each
    model is fitted on the training part and its train and test scores are
    printed.

    :param X: feature matrix.
    :param y: labels.
    """
    # divide our data set into a training set and a test set
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=TRAIN_TEST_SPLIT_RATIO)

    # min_samples_split must be at least 2: current scikit-learn rejects 1,
    # and a one-sample node cannot be split anyway.
    contenders = (
        ("====== Decision Tree Classifier ========",
         DecisionTreeClassifier(max_depth=None, min_samples_split=2,
                                random_state=0)),
        ("====== Random Forest Classifier ========",
         RandomForestClassifier(n_estimators=10, max_depth=None,
                                min_samples_split=2, random_state=0)),
        ("======= Extra Trees Classifier ========",
         ExtraTreesClassifier(n_estimators=10, max_depth=None,
                              min_samples_split=2, random_state=0)),
    )
    for banner, classifier in contenders:
        classifier.fit(X_train, y_train)
        print(banner)
        print('TRAIN SCORE', classifier.score(X_train, y_train))
        print('TEST SCORE', classifier.score(X_test, y_test))
def classify(X, Y):
    """Fit a 10-tree extra-trees model on a 90/10 split and print its score.

    The printed messages mention random forests, but the estimator is an
    ``ExtraTreesClassifier``.

    :param X: feature matrix.
    :param Y: labels; flattened with ``np.ravel`` before use.
    """
    print("Building the model for random forests...")
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, Y, test_size=0.1)
    flat_train = np.ravel(y_train)
    flat_test = np.ravel(y_test)
    model = ExtraTreesClassifier(n_estimators=10).fit(X_train, flat_train)
    accuracy = model.score(X_test, flat_test)
    print("Classification Score using Random Forests:" + str(accuracy))
def do_predict_by_cv_and_norm(x_train, y_train, x_test, y_test, times_num, flag='ExtraTrees'):
    """
    Measure a model's accuracy on the test set using normalised features.

    Features are scaled with a ``Normalizer`` fitted on ``x_train``.  The
    score, the true and predicted labels and their lengths are printed and
    appended to a result file, and every misclassified
    ``"<true>-<predicted>"`` pair is counted in the module-level ``compare``
    dict.

    :param x_train: feature vectors of the training samples
    :param y_train: labels of the training samples
    :param x_test: feature vectors of the test samples
    :param y_test: labels of the test samples
    :param times_num: step identifier written to the log header (originally
        documented as the model's n_estimators value, but the visible code
        always uses 10 estimators)
    :param flag: model selector: ``'RandomForest'``, ``'DecisionTree'``,
        ``'SVM'``; any other value selects extra trees
    :return: the accuracy on the test set
    """
    file_path = '/Users/ming.zhou/NLP/DiscourseStructures/result/' + flag + 'PredictResult20171117.text'
    if flag == 'RandomForest':
        clf = RandomForestClassifier(n_estimators=10)
    elif flag == 'DecisionTree':
        clf = DecisionTreeClassifier()
    elif flag == 'SVM':
        clf = svm.SVC()
    else:
        clf = ExtraTreesClassifier(n_estimators=10)

    normalizer = preprocessing.Normalizer().fit(x_train)
    x_train_norm = normalizer.transform(x_train)
    x_test_norm = normalizer.transform(x_test)
    clf.fit(x_train_norm, y_train)
    y_pred = clf.predict(x_test_norm)
    score = clf.score(x_test_norm, y_test)

    print(str(score) + '\n')
    # Labels are stringified first: a list or array cannot be added to a str.
    print(str(y_test) + '\n')
    print(str(y_pred) + '\n')
    print('test_Y len is %s,y_pred len is %s \n' % (len(y_test), len(y_pred)))

    result_content = [
        '**********************step=' + str(times_num) + '**********************' + '\n',
        str(score) + '\n',
        'test_Y:' + '\n',
        str(y_test) + '\n',
        'y_pred:' + '\n',
        str(y_pred) + '\n',
        'test_Y len is ' + str(len(y_test)) + ',and y_pred len is ' + str(len(y_pred)) + '\n',
    ]

    # Tally every confusion as "<true>-<predicted>".
    for expected, predicted in zip(y_test, y_pred):
        if expected != predicted:
            y_and_ypred = str(expected) + '-' + str(predicted)
            compare[y_and_ypred] = compare.get(y_and_ypred, 0) + 1

    # `with` closes the log file even if writing fails.
    with open(file_path, 'a') as result:
        result.writelines(result_content)
    return score
def et_classifier(x_trn: pd.DataFrame, y_trn: np.ndarray, x_val: pd.DataFrame, y_val: np.ndarray) -> tuple:
    """Fit a class-balanced extra-trees model and evaluate it.

    The inputs are copied before use.

    :param x_trn: training features.
    :param y_trn: training labels.
    :param x_val: validation features.
    :param y_val: validation labels.
    :return: ``(model, training_score, validation_score, clf_report,
        ck_score)`` where the last two are the classification report and
        Cohen's kappa on the validation set.
    """
    x_trn, x_val = x_trn.copy(), x_val.copy()
    y_trn, y_val = y_trn.copy(), y_val.copy()

    model = ExtraTreesClassifier(n_estimators=400, min_samples_leaf=16,
                                 class_weight='balanced', n_jobs=-1,
                                 random_state=7)
    model.fit(x_trn, y_trn)

    training_score = model.score(x_trn, y_trn)
    validation_score = model.score(x_val, y_val)

    # Predict the validation set once and reuse it for both metrics.
    val_pred = model.predict(x_val)
    clf_report = classification_report(y_val, val_pred)
    ck_score = cohen_kappa_score(y_val, val_pred)
    return model, training_score, validation_score, clf_report, ck_score
def evaluate_et(trainX, trainy, testX, testy, params):
    """Standardise the features, fit extra trees and evaluate on the test set.

    :param trainX: training features; the scaler is fitted on these.
    :param trainy: training labels.
    :param testX: test features, transformed with the training scaler.
    :param testy: test labels.
    :param params: keyword arguments for ``ExtraTreesClassifier``.
    :return: ``(model, test_accuracy, test_probabilities)``.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(trainX)
    scaled_test = scaler.transform(testX)

    model = ExtraTreesClassifier(**params)
    model.fit(scaled_train, trainy)

    test_acc = model.score(scaled_test, testy)
    pred = model.predict_proba(scaled_test)
    return model, test_acc, pred
def classify(X, Y, test_data, test_labels):
    """Fit a 10-tree extra-trees model and predict the test data.

    The printed messages mention random forests, but the estimator is an
    ``ExtraTreesClassifier``.

    :param X: training features.
    :param Y: training labels; flattened with ``np.ravel``.
    :param test_data: test features.
    :param test_labels: test labels; flattened with ``np.ravel``.
    :return: predictions for ``test_data``.
    """
    print("Building the model for random forests...")
    flat_train = np.ravel(Y)
    flat_test = np.ravel(test_labels)
    model = ExtraTreesClassifier(n_estimators=10).fit(X, flat_train)
    accuracy = model.score(test_data, flat_test)
    print("Classification Score using Random Forests:" + str(accuracy))
    return model.predict(test_data)
def get_ERT(Xtrain, Xtest, Ytrain, Ytest, gtree):
    """Fit a 1000-tree extra-trees model using a tuned tree's settings.

    :param Xtrain: training features.
    :param Xtest: test features.
    :param Ytrain: training labels.
    :param Ytest: test labels.
    :param gtree: object whose ``best_estimator_`` supplies
        ``max_features``, ``max_depth`` and ``min_samples_split``.
    :return: the fitted classifier; train and test scores are printed as
        percentages.
    """
    tuned = gtree.best_estimator_
    ert = ExtraTreesClassifier(
        n_estimators=1000,
        max_features=tuned.max_features,
        max_depth=tuned.max_depth,
        min_samples_split=tuned.min_samples_split,
        n_jobs=-1,
    )
    ert.fit(Xtrain, Ytrain)
    train_score = ert.score(Xtrain, Ytrain)
    test_score = ert.score(Xtest, Ytest)
    print('Extremely Randomized Trees, train: {0:.02f}% '.format(train_score * 100))
    print('Extremely Randomized Trees, test: {0:.02f}% '.format(test_score * 100))
    return ert
def classify(X, Y):
    """Fit a 10-tree extra-trees model on a 90/10 split and print its score.

    The printed messages mention random forests, but the estimator is an
    ``ExtraTreesClassifier``.

    :param X: feature matrix.
    :param Y: labels; flattened with ``np.ravel`` before use.
    """
    print("Building the model for random forests...")
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, Y, test_size=0.1)
    flat_train = np.ravel(y_train)
    flat_test = np.ravel(y_test)
    model = ExtraTreesClassifier(n_estimators=10).fit(X_train, flat_train)
    accuracy = model.score(X_test, flat_test)
    print("Classification Score using Random Forests:" + str(accuracy))
def classify(X, Y, test_data, test_labels):
    """Fit a 10-tree extra-trees model and predict the test data.

    The printed messages mention random forests, but the estimator is an
    ``ExtraTreesClassifier``.

    :param X: training features.
    :param Y: training labels; flattened with ``np.ravel``.
    :param test_data: test features.
    :param test_labels: test labels; flattened with ``np.ravel``.
    :return: predictions for ``test_data``.
    """
    print("Building the model for random forests...")
    flat_train = np.ravel(Y)
    flat_test = np.ravel(test_labels)
    model = ExtraTreesClassifier(n_estimators=10).fit(X, flat_train)
    accuracy = model.score(test_data, flat_test)
    print("Classification Score using Random Forests:" + str(accuracy))
    return model.predict(test_data)
def extract_tree(train_vecs, y_train, test_vecs, y_test):
    """Fit a small extra-trees model, persist it and return its test score.

    The fitted model is dumped to ``storedpaths + 'model_extracttree.pkl'``
    before scoring.

    :param train_vecs: training feature vectors.
    :param y_train: training labels.
    :param test_vecs: test feature vectors.
    :param y_test: test labels.
    :return: score on the test data.
    """
    model = ExtraTreesClassifier(
        n_estimators=10,
        max_depth=10,
        min_samples_split=2,
        n_jobs=1,
        random_state=0,
    )
    model.fit(train_vecs, y_train)
    model_path = storedpaths + 'model_extracttree.pkl'
    joblib.dump(model, model_path)
    return model.score(test_vecs, y_test)
def et_classify(self):
    """Fit default extra trees on the instance data and print the results.

    Trains on ``self.descr`` / ``self.target`` and prints the predictions
    and score for ``self.test_descr`` / ``self.test_target`` followed by the
    feature importances.
    """
    # Every message is a single pre-formatted string so the output is the
    # same whether print is a statement or a function.
    print("Extra Trees")
    clf = ExtraTreesClassifier()
    clf.fit(self.descr, self.target)
    mean = clf.score(self.test_descr, self.test_target)
    pred = clf.predict(self.test_descr)
    print("Pred  %s" % (pred,))
    print("Mean : %3f" % mean)
    print("Feature Importances  %s" % (clf.feature_importances_,))
def ExRF(n_trees, seed, train_x, train_y, test_x, test_y):
    """Fit an extra-trees model and return its score on the test data.

    :param n_trees: number of trees.
    :param seed: ``random_state`` for the forest.
    :param train_x: training features.
    :param train_y: training labels.
    :param test_x: test features.
    :param test_y: test labels.
    :return: the value of ``clf.score(test_x, test_y)``.  NOTE(review): the
        original stored this in a local named ``test_error``, but it is the
        score itself, not ``1 - score``; the return value is unchanged.
    """
    # oob_score is not requested: it needs bootstrap=True, which extra trees
    # do not use by default, and the resulting value was never used.
    clf = ExtraTreesClassifier(n_estimators=n_trees, random_state=seed)
    clf = clf.fit(train_x, train_y)
    return clf.score(test_x, test_y)
def train_model(stats, X_train, Y_train, X_test=None, Y_test=None):
    """Train the configured classifier and record accuracy statistics.

    The estimator is built from the module-level ``Classifier``,
    ``n_estimators`` and ``nodesize``.

    :param stats: dict updated in place with ``train_acc`` and, when a test
        set is given, ``test_acc``, ``test_acc_TP`` (rows labelled 1),
        ``test_acc_FP`` (rows labelled 0) and ``ROC_AUC``.
    :param X_train: training features.
    :param Y_train: training labels.
    :param X_test: optional test features.
    :param Y_test: optional test labels (0/1).
    :return: ``(classifier, stats)``.
    """
    # Every message is a single pre-formatted string so the output is the
    # same whether print is a statement or a function.
    print("Training ExtraTrees classifier")
    clf = Classifier(n_estimators=n_estimators, n_jobs=30,
                     min_samples_leaf=nodesize)
    clf.fit(X_train, Y_train)
    stats["train_acc"] = clf.score(X_train, Y_train)
    print("Training complete")
    print('Training Accuracy: %.3f' % stats["train_acc"])

    # Breakout early if no test set is given
    if X_test is None:
        return clf, stats

    stats["test_acc"] = clf.score(X_test, Y_test)
    print('Testing Accuracy: %.3f' % stats["test_acc"])

    positives = Y_test == 1
    stats["test_acc_TP"] = clf.score(X_test[positives], Y_test[positives])
    print('Testing Accuracy TP: %.3f' % stats["test_acc_TP"])

    negatives = Y_test == 0
    stats["test_acc_FP"] = clf.score(X_test[negatives], Y_test[negatives])
    print('Testing Accuracy FP: %.3f' % stats["test_acc_FP"])

    pred_probas = clf.predict_proba(X_test)[:, 1]
    Y_predict = clf.predict(X_test)
    total_contacts = Y_test.sum()
    predicted_contacts = Y_predict[positives].sum()
    print('Total contacts predicted %i/%i' % (predicted_contacts, total_contacts))

    fpr, tpr, _ = roc_curve(Y_test, pred_probas)
    stats["ROC_AUC"] = auc(fpr, tpr)
    print("ROC area under the curve %s" % (stats["ROC_AUC"],))
    return clf, stats
def train_data_and_score_tree(features, labels, cv, depth):
    """Fit depth-limited extra trees on a seeded split and score them.

    :param features: feature matrix.
    :param labels: labels.
    :param cv: passed to ``train_test_split`` as ``test_size``.
    :param depth: ``max_depth`` of the trees.
    :return: ``(test_score, fitted_classifier)``.
    """
    split = cross_validation.train_test_split(
        features, labels, test_size=cv, random_state=0)
    f_train, f_test, l_train, l_test = split
    model = ExtraTreesClassifier(max_depth=depth).fit(f_train, l_train)
    return model.score(f_test, l_test), model
def main():
    """Train, persist and score one extra-trees model per currency.

    For every entry of the module-level ``currencies`` a daily CSV is
    loaded, features and rewards are derived with ``extractFeatures`` and
    ``calculateRewards``, a 60/40 train/test split is made, and an
    ``ExtraTreesClassifier`` is fitted and dumped to ``models/<currency>.pkl``.
    The test score and out-of-bag score of each model are logged at the end.
    """
    results = {}
    for currency in currencies:
        logging.info('Currency: {0}'.format(currency))

        # get data
        data = pd.read_csv(
            r'../../data/' + currency + '1440.csv',
            names=['date', 'time', 'open', 'high', 'low', 'close', 'volume'],
            parse_dates=[[0, 1]],
            index_col=0,
        ).astype(float)
        logging.info('Loaded {0} rows'.format(len(data)))

        # extract features
        features = extractFeatures(data)

        # set rewards; keep only the trailing rows so that rewards and
        # features have the same length
        rewards = calculateRewards(data)
        rewards = rewards[-len(features):]

        # train split
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            features,
            rewards,
            test_size=0.40,
            # random_state=shuffle,
        )
        logging.info('Data splitted')

        # create classifier (bootstrap=True is required for oob_score)
        logging.info('Classifier: training...')
        rfc = ExtraTreesClassifier(n_estimators=20, oob_score=True,
                                   bootstrap=True)
        rfc.fit(X_train, y_train)

        # saving
        logging.info('Classifier: saving...')
        externals.joblib.dump(rfc, 'models/' + currency + '.pkl', compress=9)

        # score
        logging.info('Classifier: scoring...')
        results[currency] = {
            'score': rfc.score(X=X_test, y=y_test),
            'oob': rfc.oob_score_,
        }

    # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
    for currency, scores in results.items():
        logging.info('{0} score:{1:.2f} oob:{2:.2f}'.format(
            currency, scores['score'], scores['oob']))
def train_classifier(self):
    """Vectorise the training documents, fit extra trees and report scores.

    Reads the module-level ``docs_train`` / ``y_train`` / ``docs_test`` /
    ``y_test`` and the ``_``-prefixed hyper-parameter settings.  Prints the
    10-fold weighted-F1 cross-validation scores on the training data and
    the accuracy, classification report and confusion matrix on the test
    data.  Returns ``(clf, count_vect)``.
    """
    # Bag-of-words counts for the training documents.
    count_vect = CountVectorizer(stop_words=stopwords, min_df=3,
                                 max_df=0.90, ngram_range=_ngram_range)
    train_counts = count_vect.fit_transform(docs_train)
    print("Shape of train data is " + str(train_counts.shape))

    # Re-weight the counts with tf-idf.
    tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
    train_tfidf = tfidf_transformer.fit_transform(train_counts)

    print("Fitting data ...")
    clf = ExtraTreesClassifier(n_estimators=_n_estimators,
                               criterion=_criterion,
                               max_depth=_max_depth,
                               min_samples_split=_min_samples_split)
    clf = clf.fit(train_tfidf, y_train)

    # 10-fold cross validation on the training data (weighted F1).
    scores = cross_val_score(clf, train_tfidf, y_train, cv=10,
                             scoring='f1_weighted')
    print("Cross validation score: " + str(scores))
    spread = scores.std() * 2
    print("Cross validation accuracy: %0.2f (+/- %0.2f)"
          % (scores.mean(), spread))

    # Push the test documents through the already-fitted transformers.
    test_counts = count_vect.transform(docs_test)
    print("Shape of test data is " + str(test_counts.shape))
    test_tfidf = tfidf_transformer.transform(test_counts)
    y_predicted = clf.predict(test_tfidf)

    test_accuracy = clf.score(test_tfidf, y_test)
    print("Classifier score on test data is: %0.2f " % test_accuracy)
    print(metrics.classification_report(y_test, y_predicted))
    print(metrics.confusion_matrix(y_test, y_predicted))

    return clf, count_vect
def do_classify(X, y, Xtest, ytest):
    """Average feature importances and test accuracy over 100 forests.

    Each of the 100 repetitions fits a fresh 10-tree
    ``ExtraTreesClassifier`` on ``(X, y)`` and scores it on
    ``(Xtest, ytest)``.  Returns ``(mean_importance, mean_score)`` where
    the importances are averaged per feature (axis 0).
    """
    run_importances = []
    run_scores = []
    for _ in range(100):
        forest = ExtraTreesClassifier(n_estimators=10).fit(X, y)
        run_scores.append(forest.score(Xtest, ytest))
        run_importances.append(forest.feature_importances_)
    return np.mean(run_importances, axis=0), np.mean(run_scores)
def extreme_tree(X_train, X_test, Y_train, Y_test):
    """Grid over extra-trees settings and print the test accuracy of each.

    Every combination of ``n_estimators``, ``criterion`` and
    ``max_features`` listed below is fitted on the training data and
    scored on the test data; one line per combination is printed.
    Returns nothing.
    """
    estimators = [10, 100, 500]
    criterion = ["gini", "entropy"]
    max_features = ["auto", "sqrt", "log2"]

    for est in estimators:
        for cr in criterion:
            for mf in max_features:
                # A fresh, unseeded RandomState per model: runs are
                # intentionally not reproducible.
                extre_model = ExtraTreesClassifier(
                    n_jobs=8,
                    random_state=np.random.RandomState(),
                    n_estimators=est,
                    criterion=cr,
                    max_features=mf)
                extre_model.fit(X_train, Y_train)
                score = extre_model.score(X_test, Y_test)
                # Single-argument print(...) works on Python 2 and 3.
                print("ExtraTreesClassifier(n_jobs=8, random_state=np.random.RandomState(), n_estimators=%d, criterion=%s, max_features=%s) -> %.4f" % (est, cr, mf, score))
def trainClassifiersAndSave(computeScore=False):
    """Ensure a saved classifier exists for every db; optionally score it.

    For each name in the module-level ``dbs``: when ``clfs/<db>`` does not
    exist a new ``ExtraTreesClassifier`` is handed to
    ``saveTrainedClassifier``; otherwise the stored one is loaded, but only
    if a score is requested.  With ``computeScore`` set, ``<db>.csv`` is
    loaded via ``loadDB``, split into features (all but the last column)
    and labels (last column), and the classifier's score is printed.
    """
    for db in dbs:
        model_path = "clfs/" + db
        if not os.path.exists(model_path):
            clf = ExtraTreesClassifier(n_estimators=100, random_state=0,
                                       n_jobs=-1, verbose=100)
            saveTrainedClassifier(db, clf)
        elif computeScore:
            clf = joblib.load(model_path)

        if not computeScore:
            continue

        print("Loading test data...")
        table = loadDB(db + ".csv")
        X_test = table[:, 0:-1]
        y_test = table[:, -1]
        print("Normalized score is {}".format(clf.score(X_test, y_test)))
        # Drop the references to the arrays before the next iteration.
        X_test = y_test = 0
def extreamly_random_forest(train_data, predictors):
    """Pick the tree count with the best 10-fold score, then submit.

    Tries ``n_estimators`` from 1 to 99; for each value the mean accuracy
    over a shuffled 10-fold split of ``train_data`` is computed, using the
    ``predictors`` columns as features and ``"Survived"`` as the target.
    The best count and its score are printed and a classifier with that
    count is passed to ``create_submission`` together with the
    module-level ``test_data``.
    """
    # Applying method
    best_n, max_score = 0, 0
    for n_trees in range(1, 100):
        candidate = ExtraTreesClassifier(n_estimators=n_trees)
        mean_score = 0.
        folds = KFold(len(train_data), n_folds=10, shuffle=True)
        for fold_train, fold_test in folds:
            candidate.fit(train_data[predictors].T[fold_train].T,
                          train_data["Survived"].T[fold_train].T)
            fold_score = candidate.score(
                train_data[predictors].T[fold_test].T,
                train_data["Survived"].T[fold_test].T)
            mean_score += fold_score/10
        if mean_score > max_score:
            max_score = mean_score
            best_n = n_trees
    print(best_n, max_score)

    # Creating submission
    rfc = ExtraTreesClassifier(best_n)
    create_submission(rfc, train_data, test_data, predictors,
                      "rfcsurvivors.csv")
def runLogitAndNB(Xtrainsparse, Xtestsparse):
    """Compare several classifier families on every output column.

    For each column ``i`` of the module-level label arrays
    (``ytrainraw`` and ``ydevraw`` stacked for training, ``ytestraw`` for
    testing) this fits random forests, extra trees, multinomial naive
    Bayes and multinomial logistic regression on ``Xtrainsparse`` and
    prints their accuracy on ``Xtestsparse``, followed by the frequency of
    the most common class as a baseline.  Returns nothing.
    """
    for i in range(len(ytrainraw[0])):
        # Single-argument print(...) works on Python 2 and 3.
        print("Output type %i" % i)

        ytrain = numpy.hstack((ytrainraw[:, i], ydevraw[:, i]))
        ytest = ytestraw[:, i]

        # Random forests with 1, 10 and 20 trees (fit order preserved).
        forests = [RandomForestClassifier(n_trees, "entropy", None)
                   for n_trees in (1, 10, 20)]
        for forest in forests:
            forest.fit(Xtrainsparse, ytrain)
        scores = [forest.score(Xtestsparse, ytest) for forest in forests]
        print("R-FOREST: Best score %.2f%%, min of %.2f%%"
              % (max(scores) * 100, min(scores) * 100))

        # min_samples_split=2 is the smallest value that can ever trigger a
        # split and, unlike 1, is accepted by current scikit-learn.
        ERF = ExtraTreesClassifier(n_estimators=40, max_depth=None,
                                   min_samples_split=2, random_state=0)
        ERF.fit(Xtrainsparse, ytrain)
        print("EXTRA TREES: Best score %.2f%%"
              % (ERF.score(Xtestsparse, ytest) * 100))

        bayes_models = [
            naive_bayes.MultinomialNB(alpha=alpha, fit_prior=True,
                                      class_prior=None)
            for alpha in (0.01, 0.1, 1)
        ]
        for model in bayes_models:
            model.fit(Xtrainsparse, ytrain)
        scores = [model.score(Xtestsparse, ytest) for model in bayes_models]
        print("MULTI-NB: Best score %.2f%%" % (max(scores) * 100))

        logit_models = [
            linear_model.LogisticRegression(multi_class='multinomial',
                                            solver='lbfgs', C=C)
            for C in (1, 100, 10000)
        ]
        for model in logit_models:
            model.fit(Xtrainsparse, ytrain)
        scores = [model.score(Xtestsparse, ytest) for model in logit_models]
        print("LOGIT: Best score %.2f%%" % (max(scores) * 100))

        # Majority-class baseline: the largest count is the count of the
        # most common label, found in one pass instead of list.count per
        # distinct value.
        train_top = max(Counter(ytrain).values())
        test_top = max(Counter(ytest).values())
        print("Most common class frequency: %.1f%% (train) %.1f%% (test)"
              % (train_top / float(len(ytrain)) * 100.,
                 test_top / float(len(ytest)) * 100.))
        print("")
# ------------- Random Forest (Extra Trees) ---------------
# Fits an extra-trees forest on X, reports the fit time and the accuracy on
# the training data itself, and plots the feature importances.
# Note, 2 procs is faster than 8
truthLabels = np.array([int(x['label']) for x in p])

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

fCount = 5#featuresNorm.shape[1]
# forest = ExtraTreesClassifier(n_estimators=10, compute_importances=False, n_jobs=4, bootstrap=False, random_state=0, max_features=1)#26)
# NOTE(review): compute_importances is only accepted by old scikit-learn
# releases - confirm the version this script is pinned to.
forest = ExtraTreesClassifier(n_estimators=100, compute_importances=True, n_jobs=7, bootstrap=True, random_state=0, max_features=fCount)
# forest = RandomForestClassifier(n_estimators=30, compute_importances=True, n_jobs=4, bootstrap=True, random_state=0, max_features=10)#26)

t0 = time.time()
forest.fit(X, truthLabels)
# Single-argument print(...) works on Python 2 and 3.
print("Time: %s" % (time.time() - t0))

importances = forest.feature_importances_
forestScore = forest.score(X, truthLabels) # 100%
predF = forest.predict(X)
print(forestScore)

if 1:
    # Bar chart of the importance assigned to each feature.
    figure(3)
    bar(range(fCount), importances, color='k')
    xticks(arange(.5, featuresNorm.shape[1]+.5), featureNames, fontsize=14)
    yticks(fontsize=12)
    title('Importance Weighting of Random Forest Features', fontsize=28)
    xlabel("Features", fontsize=22)
    ylabel("Weighting", fontsize=22)
    axis([-.25, fCount, 0, .2])

# forest = RandomForestClassifier(n_estimators=200, compute_importances=False, n_jobs=1, bootstrap=True, random_state=0, max_features=fCount) # BEST
# forest = RandomForestClassifier(n_estimators=100, compute_importances=False, n_jobs=1, bootstrap=True, random_state=0, max_features=fCount)
# forest = ExtraTreesClassifier(n_estimators=20, compute_importances=False, n_jobs=1, bootstrap=True, random_state=0, max_features=fCount)
# Trains an extra-trees classifier on the wine data in red.csv, reports its
# accuracy and feature importances on red_test.csv, and plots the true
# quality against the predicted quality.
from math import *
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
import re
import os

# Input columns, shared by the training and the test table.
feature_columns = ['fixed acidity', 'volatile acidity', 'citric acid',
                   'residual sugar', 'chlorides', 'free sulfur dioxide',
                   'total sulfur dioxide', 'density', 'pH', 'sulphates',
                   'alcohol']

data = pd.read_csv('red.csv')
x = data[feature_columns]
y = data['quality']

# min_samples_split=2 is the smallest value that can ever trigger a split
# and, unlike 1, is accepted by current scikit-learn.
clf = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                           min_samples_split=2, random_state=0)
clf.fit(x, y)

test = pd.read_csv('red_test.csv')
x = test[feature_columns]
y = test['quality']
p = clf.predict(x)

# Single-argument print(...) works on Python 2 and 3.
print(clf.score(x, y))
print(clf.feature_importances_)

# One x position per test row instead of a hard-coded 100, so the plot does
# not fail when the test file has a different number of rows.
t = np.arange(float(len(test)))
plt.plot(t, test['quality'], '--', t, p, '-')
plt.show()
# Fits a random forest, an extra-trees forest and an AdaBoost ensemble on
# the training split, prints each validation score and finally the
# estimators with their parameters.
n_features = X_train.shape[1]
# Single-argument print(...) works on Python 2 and 3.
print("XX: %s" % n_features)
g = 1.0/float((3*n_features))
print(g)

print("Training.")
clf = RandomForestClassifier(n_estimators=850, max_depth=None,
                             max_features=int(math.sqrt(n_features)),
                             min_samples_split=100, random_state=144,
                             n_jobs=4)
clf.fit(X_train, y_train)
print("Validation set score: RF  %s" % clf.score(X_val, y_val))

clf_etree = ExtraTreesClassifier(n_estimators=1000, max_depth=None,
                                 max_features=int(math.sqrt(n_features)),
                                 min_samples_split=100, random_state=144,
                                 n_jobs=4)
clf_etree.fit(X_train, y_train)
print("Validation set score: ERF  %s" % clf_etree.score(X_val, y_val))

# Boosted decision stumps (max_depth=1).
clf_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                               algorithm="SAMME", n_estimators=500,
                               random_state=74494, learning_rate=0.8)
clf_boost.fit(X_train, y_train)
print("Validation set score: ABOOST  %s" % clf_boost.score(X_val, y_val))

#clf_gboost = GradientBoostingClassifier(n_estimators=int(reg), random_state=74494, learning_rate=0.2)
#clf_gboost.fit(X_train, y_train)
#print "Validation set score:LR " , clf_gboost.score(X_val, y_val)

print("Classifier:")
print("%s %s" % (clf, clf.get_params()))
print("%s %s" % (clf_etree, clf_etree.get_params()))
print("%s %s" % (clf_boost, clf_boost.get_params()))
"kernel_window":kernel_window, "n_estimators":n_estimators, "clf_name":clf_name, "test_pdb":fold["test"].tolist(), "train_pdb":fold["train"].tolist(), "diag_window":diag_window, "ratio_TP_to_FP":ratio_TP_to_FP, } print fold["test"] print "Training ExtraTrees classifier" clf = Classifier(n_estimators=n_estimators,n_jobs=28,) #class_weight='subsample') #class_weight="auto") # ExtraTrees clf.fit(X_train,Y_train) stats["train_acc"] = clf.score(X_train, Y_train) print "Training complete" print 'Training Accuracy: %.3f'%stats["train_acc"] del X_train, Y_train gc.collect() # For testing, now load the entire dataset! X_test,Y_test = load_fold_dataset(fold["test"],load_all=True) stats["test_acc"] = clf.score(X_test, Y_test) print 'Testing Accuracy: %.3f'%stats["test_acc"] X_test_TP = X_test[Y_test==1] Y_test_TP = Y_test[Y_test==1]
def use_pipeline_temporal(self):
    """Grid-search a tf-idf + extra-trees pipeline, then refit and test it.

    Training data comes from the module-level ``X`` / ``y``; the test
    documents and labels are read from the CSV at
    ``path_to_labelled_test_data_file_temporal``.  After the grid search
    the winning settings are printed, a vectoriser / tf-idf transformer /
    classifier are rebuilt with those settings, and cross-validation and
    test metrics are printed.

    Returns ``(clf, count_vect)`` - the refitted classifier and the fitted
    ``CountVectorizer``.
    """
    docs_train, docs_test, y_train, y_test = train_test_split(
        X, y, test_size=0.0, random_state=42)

    # docs_test and y_test are replaced by the dedicated test file.
    dataset_test = pd.read_csv(path_to_labelled_test_data_file_temporal,
                               header=0, names=['posts', 'class'])
    docs_test = dataset_test['posts']
    y_test = dataset_test['class']

    # Vectorizer / classifier pipeline that filters out tokens that are
    # too rare or too frequent.
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)),
        ('clf', ExtraTreesClassifier()),
    ])

    # Parameter grid explored by the search.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__use_idf': (True, False),
        'clf__n_estimators': (50, 100),
        # 'clf__criterion': ("gini", "entropy"),
        # 'clf__max_depth': (None, 2, 4),
        # 'clf__min_samples_split': (2, 4, 6),
    }

    # Exhaustive search over the grid; every combination is evaluated on
    # the cv splits and the best one is retained.
    cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2,
                                random_state=42)
    grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv,
                               n_jobs=-1)
    clf_gs = grid_search.fit(docs_train, y_train)

    # Pick the entry with the highest mean validation score.
    best_parameters, score, _ = max(clf_gs.grid_scores_, key=lambda x: x[1])
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
    print("Score for gridsearch is %0.2f" % score)

    # Run the classifier again with the best parameters in order to get a
    # stand-alone 'clf' for get_important_feature.
    ngram_range = best_parameters['vect__ngram_range']
    use_idf = best_parameters['vect__use_idf']
    # Strip the 'clf__' pipeline prefix so the tuned values reach the
    # classifier; previously the refit silently used the defaults.
    clf_params = dict((name[len('clf__'):], value)
                      for name, value in best_parameters.items()
                      if name.startswith('clf__'))

    # vectorisation
    count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90,
                                 ngram_range=ngram_range)
    X_CV = count_vect.fit_transform(docs_train)
    print("Shape of train data is " + str(X_CV.shape))

    # tfidf transformation
    tfidf_transformer = TfidfTransformer(use_idf=use_idf)
    X_tfidf = tfidf_transformer.fit_transform(X_CV)

    # train the classifier
    print("Fitting data with best parameters ...")
    clf = ExtraTreesClassifier(**clf_params).fit(X_tfidf, y_train)

    # 10-fold cross validation on the training data (weighted F1).
    scores = cross_val_score(clf, X_tfidf, y_train, cv=10,
                             scoring='f1_weighted')
    print("Cross validation score: " + str(scores))
    print("Cross validation accuracy: %0.2f (+/- %0.2f)"
          % (scores.mean(), scores.std() * 2))

    # run classifier on test data
    X_test_CV = count_vect.transform(docs_test)
    X_test_tfidf = tfidf_transformer.transform(X_test_CV)
    y_predicted = clf.predict(X_test_tfidf)

    # mean accuracy on the given test data and labels
    print("Classifier score on test data is: %0.2f "
          % clf.score(X_test_tfidf, y_test))

    # classification report and confusion matrix
    print(metrics.classification_report(y_test, y_predicted))
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)

    return clf, count_vect
def use_pipeline_with_fs(self):
    """Grid-search tf-idf + feature selection + extra trees, then refit.

    Uses the module-level ``docs_train`` / ``y_train`` / ``docs_test`` /
    ``y_test``.  After the grid search the winning settings are printed,
    the vectoriser, tf-idf transformer, selector and classifier are rebuilt
    with those settings, the selector's support mask is written to
    ``path_to_store_feature_selection_boolean_file`` (one value per line),
    and cross-validation and test metrics are printed.

    Returns ``(clf, count_vect)``.
    """
    # Vectorizer / selector / classifier pipeline that filters out tokens
    # that are too rare or too frequent.
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)),
        ("selector", SelectPercentile()),
        ('clf', ExtraTreesClassifier()),
    ])

    # Parameter grid explored by the search.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'vect__use_idf': (True, False),
        'clf__n_estimators': (50, 100),
        'clf__criterion': ("gini", "entropy"),
        'clf__max_depth': (None, 2, 4),
        'clf__min_samples_split': (2, 4, 6),
        'selector__score_func': (chi2, f_classif),
        'selector__percentile': (85, 95, 100),
    }

    # Exhaustive search over the grid; every combination is evaluated on
    # the cv splits and the best one is retained.
    cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2,
                                random_state=42)
    grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv,
                               n_jobs=-1)
    clf_gs = grid_search.fit(docs_train, y_train)

    # Pick the entry with the highest mean validation score.
    best_parameters, score, _ = max(clf_gs.grid_scores_, key=lambda x: x[1])
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, best_parameters[param_name]))
    print("Score for gridsearch is %0.2f" % score)

    # Run the classifier again with the best parameters in order to get a
    # stand-alone 'clf' for get_important_feature.
    ngram_range = best_parameters['vect__ngram_range']
    use_idf = best_parameters['vect__use_idf']
    score_func = best_parameters['selector__score_func']
    percentile = best_parameters['selector__percentile']
    # Strip the 'clf__' pipeline prefix so the tuned values reach the
    # classifier; previously the refit silently used the defaults.
    clf_params = dict((name[len('clf__'):], value)
                      for name, value in best_parameters.items()
                      if name.startswith('clf__'))

    # vectorisation (fitted here only to report the vocabulary size; the
    # pipeline below fits it again)
    count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90,
                                 ngram_range=ngram_range)
    X_CV = count_vect.fit_transform(docs_train)
    print("Shape of train data is " + str(X_CV.shape))

    # The tf-idf transformer is fitted inside the pipeline below, so no
    # separate fit_transform is needed here.
    tfidf_transformer = TfidfTransformer(use_idf=use_idf)

    # feature selection
    selector = SelectPercentile(score_func=score_func, percentile=percentile)
    combined_features = Pipeline([
        ("vect", count_vect),
        ("tfidf", tfidf_transformer),
        ("feat_select", selector),
    ])
    X_features = combined_features.fit_transform(docs_train, y_train)
    X_test_features = combined_features.transform(docs_test)
    print("Shape of train data after feature selection is "
          + str(X_features.shape))
    print("Shape of test data after feature selection is "
          + str(X_test_features.shape))

    # run classifier on selected features
    clf = ExtraTreesClassifier(**clf_params).fit(X_features, y_train)

    # Write the mask of selected features; the with-block closes the file
    # even if a write fails.
    feature_boolean = selector.get_support(indices=False)
    with open(path_to_store_feature_selection_boolean_file, 'w') as f:
        for fb in feature_boolean:
            f.write(str(fb) + '\n')

    # 10-fold cross validation on the training data (weighted F1).
    scores = cross_val_score(clf, X_features, y_train, cv=10,
                             scoring='f1_weighted')
    print("Cross validation score: " + str(scores))
    print("Cross validation accuracy: %0.2f (+/- %0.2f)"
          % (scores.mean(), scores.std() * 2))

    # run classifier on test data
    y_predicted = clf.predict(X_test_features)

    # mean accuracy on the given test data and labels
    print("Classifier score on test data is: %0.2f "
          % clf.score(X_test_features, y_test))

    # classification report and confusion matrix
    print(metrics.classification_report(y_test, y_predicted))
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)

    return clf, count_vect
def train_classifier_use_feature_selection(self):
    """Train extra trees on percentile-selected tf-idf features and test it.

    Uses the module-level ``docs_train`` / ``y_train`` / ``docs_test`` /
    ``y_test`` and the ``_``-prefixed settings.  The selector's support
    mask is written to ``path_to_store_feature_selection_boolean_file``
    (one value per line).  Prints 10-fold weighted-F1 cross-validation
    scores on the training data and the accuracy, classification report
    and confusion matrix on the test data.

    Returns ``(clf, count_vect)``.
    """
    # Get list of features
    count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90,
                                 ngram_range=_ngram_range)
    X_CV = count_vect.fit_transform(docs_train)
    print("Shape of train data is " + str(X_CV.shape))

    # tfidf transformation
    tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
    X_tfidf = tfidf_transformer.fit_transform(X_CV)

    # feature selection
    selector = SelectPercentile(score_func=_score_func,
                                percentile=_percentile)
    print("Fitting data with feature selection ...")
    selector.fit(X_tfidf, y_train)

    # how many features are left after feature selection
    X_features = selector.transform(X_tfidf)
    print("Shape of array after feature selection is "
          + str(X_features.shape))

    clf = ExtraTreesClassifier(n_estimators=_n_estimators,
                               criterion=_criterion,
                               max_depth=_max_depth,
                               min_samples_split=_min_samples_split
                               ).fit(X_features, y_train)

    # Write the mask of selected features; the with-block closes the file
    # even if a write fails.
    feature_boolean = selector.get_support(indices=False)
    with open(path_to_store_feature_selection_boolean_file, 'w') as f:
        for fb in feature_boolean:
            f.write(str(fb) + '\n')

    # 10-fold cross validation on the training data (weighted F1).
    scores = cross_val_score(clf, X_features, y_train, cv=10,
                             scoring='f1_weighted')
    print("Cross validation score: " + str(scores))
    print("Cross validation accuracy: %0.2f (+/- %0.2f)"
          % (scores.mean(), scores.std() * 2))

    # test clf on test data
    X_test_CV = count_vect.transform(docs_test)
    print("Shape of test data is " + str(X_test_CV.shape))
    X_test_tfidf = tfidf_transformer.transform(X_test_CV)

    # apply feature selection on test data too
    X_test_selector = selector.transform(X_test_tfidf)
    print("Shape of array for test data after feature selection is "
          + str(X_test_selector.shape))

    y_predicted = clf.predict(X_test_selector)

    # mean accuracy on the given test data and labels
    print("Classifier score on test data is: %0.2f "
          % clf.score(X_test_selector, y_test))
    print(metrics.classification_report(y_test, y_predicted))
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)

    return clf, count_vect
print(scores.mean())
submission(TestK, y_pred, name="ForestUpdatedGini.csv")


# ## Extremely Randomized Trees
# this one led to .8134 in kaggle with training set splitted 70:30

# In[431]:

from sklearn.ensemble import ExtraTreesClassifier

# min_samples_split=2 is the smallest value that can ever trigger a split
# and, unlike 1, is accepted by current scikit-learn.
ert = ExtraTreesClassifier(n_estimators=100, criterion="entropy",
                           max_depth=None, min_samples_split=2,
                           random_state=0)
ert.fit(X_train, y_train)
y_pred = ert.predict(TestK)
#print('Misclassified samples: %d' % (y_test != y_pred).sum())
print('Training accuracy:', ert.score(X_train, y_train))
print('Test accuracy:', ert.score(X_test, y_test))

# Cross-validated score on the training data (default cv and scoring).
scores = cross_val_score(ert, X_train, y_train)
print(scores.mean())
submission(TestK, y_pred, name="ERDupdate.csv")


# ## Logistic regression

# In[207]:

from sklearn.linear_model import LogisticRegression

# The original also built a LogisticRegression(penalty='l1') instance whose
# result was discarded; only the configured estimator below is kept.
lr = LogisticRegression(penalty='l1', C=0.1)
def train_model():
    """Build the TIL/decoy data set, train an extra-trees model and save it.

    Steps, all driven by module-level helpers (``feat``, ``Word2Vec``,
    ``getWordVecs``, ``query_skip_decoys`` ...):

    1. Count the corpora and derive how many wiki / false-positive rows to
       skip.
    2. Chain the TIL corpus, the false positives and the decoys; the first
       ``TIL_n`` rows are labelled 1, all others 0.
    3. Split 80/20, turn the texts into word vectors, fit a
       ``StandardScaler`` on the training vectors and dump it to
       ``f_norm_scale``.
    4. Fit the classifier, print the test accuracies, dump it to ``f_clf``
       and show the ROC curve.

    Returns nothing.
    """
    TIL_n = feat.count_TIL_corpus()
    decoy_n = TIL_n*_DECOY_PROPORTION

    FP_n = feat.count_TIL_false_pos()
    wiki_n = feat.count_WIKI_corpus()
    skip_wiki_n = wiki_n // decoy_n

    # Keep the number of false positives in about the same Order-of-Mag
    skip_FP = FP_n // TIL_n
    # Single-argument print(...) works on Python 2 and 3.
    print("Skipping every {} value in FP".format(skip_FP))

    if FLAG_BUILD_DECOY_LIST:
        build_skip_query(skip_wiki_n)

    print("Loading features")
    features = Word2Vec.load(feat.f_features)
    dimension = 100  # default dimension

    ITR_decoy = query_skip_decoys()

    print("Building training set")
    ITR_train = list(feat.TIL_full_corpus_iter())

    print("Building the false positive set")
    ITR_FP = list(feat.TIL_false_pos_iter(skip_FP))

    print("Building corpus iter")
    ITR = feat.chainer(ITR_train, ITR_FP, ITR_decoy)
    ITR = list(ITR)

    # Labels: the first TIL_n chained rows are marked positive.
    # NOTE(review): presumably these are exactly the ITR_train rows -
    # confirm that feat.chainer preserves the argument order.
    Y = np.zeros(len(ITR))
    Y[:TIL_n] = 1.0

    TTS = train_test_split
    x_train, x_test, y_train, y_test = TTS(ITR, Y, test_size=0.2)

    print("Proportion of answers {}/{}".format(y_train.sum(), y_test.sum()))

    print("Calculating the wordVecs for train")
    vec_train = np.concatenate([getWordVecs(text, weight, features, dimension)
                                for text, weight in x_train])

    print("Building the scalar")
    scaler = preprocessing.StandardScaler().fit(vec_train)

    print("Saving the scaler")
    joblib.dump(scaler, f_norm_scale)

    print("Scaling train vectors")
    vec_train = scaler.transform(vec_train)

    print("Calculating the wordVecs for test")
    vec_test = np.concatenate([getWordVecs(text, weight, features, dimension)
                               for text, weight in x_test])

    print("Scaling test vectors")
    vec_test = scaler.transform(vec_test)

    print("Train size/TP in sample %s %s"
          % (vec_train.shape, (y_train == 1).sum()))
    print("Test size/TP in sample %s %s"
          % (vec_test.shape, (y_test == 1).sum()))

    print("Training classifer")
    #from sklearn.linear_model import SGDClassifier as Classifier
    #from sklearn.linear_model import LogisticRegression as Classifier
    #from sklearn.linear_model import BayesianRidge as Classifier
    #from sklearn.naive_bayes import BernoulliNB as Classifier
    #from sklearn.naive_bayes import GaussianNB as Classifier
    #from sklearn.ensemble import RandomForestClassifier as Classifier
    from sklearn.ensemble import ExtraTreesClassifier as Classifier

    # This seems to be the best... but high FP rate
    #clf = Classifier(loss='log', penalty='l1',verbose=2) # SGD
    #clf = Classifier(C=2500,verbose=2) # LogisiticRegression
    #clf = Classifier() # Naive Bayes
    clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees

    clf.fit(vec_train, y_train)
    print('Test Accuracy: %.3f' % clf.score(vec_test, y_test))

    # Accuracy on the positive (label > 0) test rows only.
    idx_TP = np.array(y_test) > 0
    vec_TP = np.array(vec_test)[idx_TP]
    y_TP = np.array(y_test)[idx_TP]
    print('Test Accuracy on TP: %.3f' % clf.score(vec_TP, y_TP))

    # Accuracy on the remaining test rows.
    vec_FP = np.array(vec_test)[~idx_TP]
    y_FP = np.array(y_test)[~idx_TP]
    print('Test Accuracy on FP: %.3f' % clf.score(vec_FP, y_FP))

    print("Saving the classifer")
    joblib.dump(clf, f_clf)

    # Create ROC curve
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    pred_probas = clf.predict_proba(vec_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')

    plt.show()
# Splits X / Y 83/17, scales both parts with the existing scaler, trains an
# extra-trees classifier and prints the overall and per-status accuracy.
TTS = train_test_split
x_train, x_test, y_train, y_test = TTS(X, Y, test_size=0.17)

# Single-argument print(...) works on Python 2 and 3.
print("Scaling train vectors")
x_train = scalar.transform(x_train)

print("Scaling text vectors")
x_test = scalar.transform(x_test)

print("Training classifer")
from sklearn.ensemble import ExtraTreesClassifier as Classifier
clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees

clf.fit(x_train, y_train)
print('Test Accuracy: %.3f' % clf.score(x_test, y_test))

y_test = np.array(y_test)
for n in _INV_STATUS_MAP:
    idx = y_test == n
    try:
        score = clf.score(x_test[idx], y_test[idx])
    except Exception:
        # Best effort: report -1 when a status cannot be scored (for
        # example no test rows carry it).  Catching Exception rather than
        # everything lets Ctrl-C and SystemExit still stop the script.
        score = -1
    print('Test Accuracy on {}: {:0.3f}'.format(_INV_STATUS_MAP[n], score))

print("")
print("Suggesting some new entries")