def predict_TestData(Food_df, People_df):
    """Train people-vs-food classifiers on a random split and save test predictions.

    Rows of each frame are assigned to the training or the test set by an
    unseeded coin flip.  People rows are labelled 1 and food rows 0; every
    column from position 2 onwards is used as a feature.  An ExtraTrees model
    and a LinearSVC model are fitted and their predictions for the held-out
    rows are written to ``prediction_for_TestData.csv`` together with the
    ``urls`` column and the true label.

    Args:
        Food_df: DataFrame holding the negative (label 0) examples.
        People_df: DataFrame holding the positive (label 1) examples.
    """
    # Boolean row masks; food is drawn before people so the RNG is consumed
    # in the same order as before.
    cTrainF = rand(len(Food_df)) > .5
    cTrainP = rand(len(People_df)) > .5

    people_train, people_test = People_df[cTrainP], People_df[~cTrainP]
    food_train, food_test = Food_df[cTrainF], Food_df[~cTrainF]

    TrainX_df = pd_concat([people_train, food_train], axis=0)
    TestX_df = pd_concat([people_test, food_test], axis=0)

    # `.ix` has been removed from pandas; `.iloc` is the positional
    # equivalent for these string-labelled columns.
    TrainX = TrainX_df.iloc[:, 2:].values
    TestX = TestX_df.iloc[:, 2:].values

    TrainY = concatenate([ones(len(people_train)), zeros(len(food_train))])
    TestY = concatenate([ones(len(people_test)), zeros(len(food_test))])

    # min_samples_split=2 grows the same trees as the former value 1 (a
    # one-sample node can never be split) and is accepted by every
    # scikit-learn release, whereas 1 is rejected from 0.18 onwards.
    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None,
                                         min_samples_split=2, random_state=0)
    ET_classifier.fit(TrainX, TrainY)
    ET_prediction = ET_classifier.predict(TestX)

    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX, TrainY)
    LinSVC_predict = LinSVC_classifier.predict(TestX)

    a = DataFrame()
    a["url"] = TestX_df.urls.values
    a["answer"] = TestY
    a["ET_predict"] = ET_prediction
    a["LinSVC_predict"] = LinSVC_predict
    a.to_csv("prediction_for_TestData.csv")
def crossVal(positions, X, y, missedYFile):
    """Cross-validate an ExtraTrees model and log the false positives.

    Four stratified shuffle splits (10% test) are evaluated.  For every round
    a classification report is printed, and each test sample whose true label
    is ``'0'`` but was predicted ``'1'`` is written to ``missedYFile`` as
    ``<position>\\t<round index>``.

    Args:
        positions: Sequence of position identifiers (strings), one per row of X.
        X: Feature matrix indexable by integer arrays.
        y: Label array indexable by integer arrays; labels are compared
            against the strings ``'0'`` and ``'1'``.
        missedYFile: Path of the output file (overwritten).
    """
    posArray = np.array(positions)

    # Split into training and test
    sss = StratifiedShuffleSplit(y, 4, test_size=0.1, random_state=442)

    # The `with` block guarantees the log is closed even if fitting fails.
    with open(missedYFile, 'w') as outF:
        for cvRound, (train_index, test_index) in enumerate(sss):
            # min_samples_leaf=13 already governs splitting; 2 is simply the
            # smallest min_samples_split accepted by current scikit-learn.
            clf = ExtraTreesClassifier(n_estimators=300,
                                       random_state=13,
                                       bootstrap=True,
                                       max_features=20,
                                       min_samples_split=2,
                                       max_depth=8,
                                       min_samples_leaf=13,
                                       n_jobs=4)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            pos_test = posArray[test_index]

            clf = clf.fit(X_train, y_train)
            preds = clf.predict(X_test)

            # Reuse the predictions instead of predicting a second time.
            print(metrics.classification_report(y_test, preds))

            for loc, t, p in zip(pos_test, y_test, preds):
                if t == '0' and p == '1':
                    outF.write(loc + '\t' + str(cvRound) + '\n')
def train_UsingExtraTreesClassifier(df, header, x_train, y_train, x_test, y_test):
    """Fit a 200-tree ExtraTrees model, report its fit and plot feature importances.

    Prints the test-set score and the out-of-bag score, shows a horizontal bar
    chart of the importances rescaled so that the largest equals 10, and
    returns the predicted labels for ``x_test``.

    NOTE(review): ``compute_importances`` and ``Index - list`` only exist in
    old scikit-learn / pandas releases; confirm the target versions before
    upgrading.
    """
    # 'entropy' was also tried; 'gini' gave a marginally better fit both in
    # sample and out of sample.
    forest = ExtraTreesClassifier(n_estimators=200, random_state=0, criterion='gini',
                                  bootstrap=True, oob_score=1, compute_importances=True)
    forest.fit(x_train, y_train)

    # Goodness of fit
    print("Estimation of goodness of fit using the ExtraTreesClassifier is : %f  \n" % forest.score(x_test, y_test))
    print("Estimation of out of bag score using the ExtraTreesClassifier is : %f \n \n  " % forest.oob_score_)

    # Parameters are fetched but not used, as in the original flow.
    forest.get_params()

    y_test_predicted = forest.predict(x_test)

    # Every column except the last header entry (the target column).
    X = df[df.columns - [header[-1]]]

    # Rescale so the most important feature scores 10.
    raw_importance = forest.feature_importances_
    feature_importance = 10.0 * (raw_importance / raw_importance.max())
    order = np.argsort(feature_importance)
    bar_pos = np.arange(order.shape[0]) + .5

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 1, 1)
    plt.barh(bar_pos, feature_importance[order], align='center')
    plt.yticks(bar_pos, X.columns[order])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

    return y_test_predicted
class Identifier:
    """Recognise known items in an image from ORB descriptors.

    Each ORB descriptor of the image is classified by an ExtraTrees model;
    the item that collects the most descriptor votes among the allowed
    candidates wins.
    """

    def __init__(self, grabable=None, clf=None):
        """Create the ORB detector and either adopt or train a classifier.

        Args:
            grabable: Optional set of grabable items (currently unused).  A
                fresh empty set is created per instance when omitted.
            clf: Optional pre-trained classifier.  When ``None`` a new model
                is trained from ``labels.pkl`` and ``features.pkl`` in the
                current directory.
        """
        # The old default `set([])` was one object shared by every instance.
        self.grabable = set() if grabable is None else grabable  # TODO if we care to, not used at the mo
        self.orb = cv2.ORB(nfeatures=1000)  # ,nlevels = 20, scaleFactor = 1.05)
        self.items = [
            "champion_copper_plus_spark_plug",
            "cheezit_big_original",
            "crayola_64_ct",
            "dove_beauty_bar",
            "elmers_washable_no_run_school_glue",
            "expo_dry_erase_board_eraser",
            "feline_greenies_dental_treats",
            "first_years_take_and_toss_straw_cups",
            "genuine_joe_plastic_stir_sticks",
            "highland_6539_self_stick_notes",
            "kong_air_dog_squeakair_tennis_ball",
            "kong_duck_dog_toy",
            "kong_sitting_frog_dog_toy",
            "kygen_squeakin_eggs_plush_puppies",
            "mark_twain_huckleberry_finn",
            "mead_index_cards",
            "mommys_helper_outlet_plugs",
            "munchkin_white_hot_duck_bath_toy",
            "one_with_nature_soap_dead_sea_mud",
            "oreo_mega_stuf",
            "paper_mate_12_count_mirado_black_warrior",
            "rollodex_mesh_collection_jumbo_pencil_cup",
            "safety_works_safety_glasses",
            "sharpie_accent_tank_style_highlighters",
            "stanley_66_052",
        ]
        # `is None` rather than truthiness: ensembles define __len__, so
        # `not clf` inspects (or fails on) the fitted estimator list.
        if clf is None:
            print("Training new classifier")
            # min_samples_split=2 yields the same trees as 1 and is valid in
            # all scikit-learn releases.
            # NOTE(review): class_weight='subsample' is the pre-0.17 spelling
            # of 'balanced_subsample' - confirm the installed version.
            self.clf = ExtraTreesClassifier(min_samples_split=2, n_jobs=-1,
                                            n_estimators=150, class_weight='subsample')
            labels = np.ascontiguousarray(joblib.load('labels.pkl'))
            features = np.ascontiguousarray(joblib.load('features.pkl'), dtype=np.float64)
            features = preprocessing.scale(features)
            self.clf.fit(features, labels)
        else:
            self.clf = clf

    def identify(self, im, possibilites):
        """Return the index of the most-voted item among ``possibilites``.

        Args:
            im: Image to analyse, or ``None``.
            possibilites: Set of candidate item indices.

        Returns:
            The winning item index, ``-1`` when no candidate received a vote
            (or the image yielded no descriptors), or ``None`` when ``im`` is
            ``None``.
        """
        import sys

        if im is None:
            print("Image to recognize is None")
            return None

        kpTest, desTest = self.orb.detectAndCompute(im, None)
        # ORB yields no descriptors for featureless images; scaling that
        # would raise, so report "nothing recognised" instead.
        if desTest is None or len(desTest) == 0:
            return -1

        pred = self.clf.predict(preprocessing.scale(np.array(desTest, dtype=np.float64)))
        c = Counter(pred)
        ranked = sorted(set(c.keys()) & possibilites, key=lambda k: c[k], reverse=True)
        if not ranked:
            return -1

        item = ranked[0]
        # Name followed by a space and no newline, like the old `print x,`.
        sys.stdout.write("%s " % self.items[item])
        return item
def automatic_bernulli():
    """Evaluate ExtraTrees accuracy for 3..15 selected features on the signs data.

    For every feature count the best features are chosen with ``SelectKBest``
    (ANOVA F-test), accuracy is averaged over 1000 repetitions of shuffled
    6-fold cross-validation, and a permutation test is run using the last
    fold object created.  Results are printed only.

    NOTE(review): the label vector is shuffled before use, which breaks the
    pairing between rows and labels - confirm this is an intentional
    chance-level baseline.
    """
    frame = pd.read_csv('/home/vasiliy/Study/StadiumProject/Classifier/signs.csv', sep=';')
    Y = np.array(frame['fight'].get_values())
    np.random.shuffle(Y)
    frame.drop(['match', 'city', 'date', 'fight'], 1, inplace=True)
    X = frame.as_matrix()

    for features_number in range(3, 16):
        X_new = SelectKBest(f_classif, k=features_number).fit_transform(X, Y)
        classifier = ExtraTreesClassifier()

        super_means = []
        for _ in range(1000):
            kf = KFold(len(X_new), n_folds=6, shuffle=True)
            fold_accuracy = []
            for training, testing in kf:
                classifier.fit(X_new[training], Y[training])
                guessed = classifier.predict(X_new[testing])
                fold_accuracy.append(np.mean(guessed == Y[testing]))
            super_means.append(np.mean(fold_accuracy))

        print('features_number= %s %s' % (
            features_number,
            'Mean accuracy: {:.1%} '.format(np.mean(super_means))))

        # `kf` is the fold generator left over from the final repetition.
        score, permutation_scores, pvalue = permutation_test_score(
            classifier, X_new, Y, scoring="accuracy", cv=kf,
            n_permutations=len(Y), n_jobs=1)
        print("Classification score %s (pvalue : %s)" % (score, pvalue))
def ERFC_Classifier(X_train, X_cv, X_test, Y_train, Y_cv, Y_test, Actual_DS):
    """Fit a 100-tree ExtraTrees model and return class probabilities.

    Prints the validation accuracy, a cross-tab of actual vs predicted labels
    (decoded through the module-level ``label_enc``), the validation log loss
    and the elapsed time.

    Returns:
        Three DataFrames with the predicted probabilities for ``X_cv``,
        ``X_test`` and ``Actual_DS`` respectively.  ``Y_test`` is not used.
    """
    print("***************Starting Extreme Random Forest Classifier***************")
    started = time()

    forest = ExtraTreesClassifier(n_estimators=100, n_jobs=-1)
    forest.fit(X_train, Y_train)

    cv_labels = forest.predict(X_cv)
    accuracy = forest.score(X_cv, Y_cv)
    print("Extreme Random Forest Classifier - {0:.2f}%".format(100 * accuracy))

    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(cv_labels),
                          rownames=['actual'], colnames=['preds'])
    # Largest share per row, as a percentage.
    shares = Summary.divide(Summary.sum(axis=1), axis=1)
    Summary['pct'] = shares.max(axis=1) * 100
    print(Summary)

    # Check with log loss function
    cv_proba = forest.predict_proba(X_cv)
    cv_log_loss = log_loss(Y_cv, cv_proba, eps=1e-15, normalize=True)
    print(cv_log_loss)
    print("done in %0.3fs" % (time() - started))

    test_proba = forest.predict_proba(X_test)
    actual_proba = forest.predict_proba(Actual_DS)

    print("***************Ending Extreme Random Forest Classifier***************")
    return pd.DataFrame(cv_proba), pd.DataFrame(test_proba), pd.DataFrame(actual_proba)
def extratreeclassifier(input_file, Output, test_size):
    """Train/test-split evaluation of a 10-tree ExtraTrees classifier.

    The CSV ``input_file`` is loaded without its last column; column 0 is the
    label and the remaining columns are features.  Accuracy, precision,
    recall and F1 on the held-out part are printed and written, together with
    every (true, predicted) pair, to
    ``<Output>_Extremely_Random_Forest_metrics_test.txt``.  A confusion-matrix
    plot is saved through ``plot_confusion_matrix``.

    Args:
        input_file: Path of the comma-separated input file.
        Output: Prefix used for the report and plot file names.
        test_size: Fraction (or count) of samples held out for testing.
    """
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:, 1:]
    y = data[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print("%s %s" % (X_train.shape, X_test.shape))

    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Each metric is computed once and reused for console and report.
    print("Extremely Randomized Trees")
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("classification accuracy: %s" % accuracy)
    precision = metrics.precision_score(y_test, y_pred)
    print("precision: %s" % precision)
    recall = metrics.recall_score(y_test, y_pred)
    print("recall: %s" % recall)
    f1 = metrics.f1_score(y_test, y_pred)
    print("f1 score: %s" % f1)
    print("\n")

    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    # `with` closes the report even if a write fails.
    with open(results, "w") as report:
        report.write("Extremely Random Forest Classifier estimator accuracy\n")
        report.write("Classification Accuracy Score: %f\n" % accuracy)
        report.write("Precision Score: %f\n" % precision)
        report.write("Recall Score: %f\n" % recall)
        report.write("F1 Score: %f\n" % f1)
        report.write("\n")
        report.write("True Value, Predicted Value, Iteration\n")
        for n, (truth, guess) in enumerate(zip(y_test, y_pred), 1):
            report.write("%f,%f,%i\n" % (truth, guess, n))

    title = "Extremely Randomized Trees %f" % test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
def extremeRand(trainData, testData, trainOuts, testOuts):
    """Fit a small ExtraTrees model and print its predictions and accuracy.

    Prints the fitted estimator, the predictions for ``testData`` and
    ``1 - error`` where the error comes from ``sup.crunchTestResults`` with a
    0.5 threshold.
    """
    # min_samples_split must be >= 2 in current scikit-learn; 2 grows the
    # same trees as the former value 1 since a one-sample node cannot split.
    clf = ExtraTreesClassifier(n_estimators=5, max_depth=10,
                               min_samples_split=2, random_state=2)
    print(clf.fit(trainData, trainOuts))
    predictions = clf.predict(testData)
    print(predictions)
    misses, error = sup.crunchTestResults(predictions, testOuts, .5)
    print(1-error)
def classify(X, Y, test_data, test_labels):
    """Fit a 10-tree ExtraTrees model, print its test score and return predictions.

    ``Y`` and ``test_labels`` are flattened to 1-D before use.
    """
    print("Building the model for random forests...")
    train_targets = np.ravel(Y)
    expected = np.ravel(test_labels)

    model = ExtraTreesClassifier(n_estimators=10).fit(X, train_targets)
    accuracy = model.score(test_data, expected)
    print("Classification Score using Random Forests:" + str(accuracy))

    return model.predict(test_data)
def EXRT(X_train, t_train, x, t, predict):
    """Sweep ``max_features`` from 2 to 16 for a 500-tree ExtraTrees model.

    For each setting the model is fitted and used to predict ``x``; the
    predictions are either written out (``predict`` truthy) or scored against
    ``t``.
    """
    for n_features in range(2, 17):
        forest = ExtraTreesClassifier(n_estimators=500, max_depth=None,
                                      max_features=n_features)
        forest.fit(X_train, t_train)
        guessed = forest.predict(x)
        if not predict:
            get_accuracy(guessed, t)
        else:
            write_predictions(t, guessed)
def et_classify(self):
    """Fit a default ExtraTrees model on the instance data and print its results.

    Trains on ``self.descr`` / ``self.target`` and reports predictions, mean
    accuracy and feature importances for ``self.test_descr`` /
    ``self.test_target``.
    """
    print("Extra Trees")
    model = ExtraTreesClassifier()
    model.fit(self.descr, self.target)

    accuracy = model.score(self.test_descr, self.test_target)
    predicted = model.predict(self.test_descr)

    print("%s %s" % ("Pred ", predicted))
    print("Mean : %3f" % accuracy)
    print("%s %s" % ("Feature Importances ", model.feature_importances_))
def extratree_cla(train_data, train_id, test_data, seed=None):
    """Fit a 1000-tree ExtraTrees model and predict classes and probabilities.

    Args:
        train_data: Training feature matrix.
        train_id: Training labels.
        test_data: Feature matrix to predict.
        seed: Optional ``random_state`` for reproducible forests.

    Returns:
        Tuple ``(pred_class, pred_prob)`` for ``test_data``.
    """
    # The unused `param_grid` dict was dropped; no grid search is run here.
    clf = ExtraTreesClassifier(n_estimators=1000, n_jobs=4, random_state=seed)  # , max_features="log2")
    clf.fit(train_data, train_id)
    pred_class = clf.predict(test_data)
    pred_prob = clf.predict_proba(test_data)
    return pred_class, pred_prob
def test_extra_trees_3():
    """Ensure that the TPOT ExtraTreesClassifier outputs the same as the sklearn version when min_weight > 0.5"""
    tpot_result = TPOT()._extra_trees(training_testing_data, 0, 1., 0.6)
    testing_rows = tpot_result[tpot_result['group'] == 'testing']

    # Reference model: a min_weight of 0.6 is expected to behave like 0.5.
    reference = ExtraTreesClassifier(n_estimators=500, random_state=42, max_features=1.,
                                     min_weight_fraction_leaf=0.5, criterion='gini')
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def train_classifier(self):
    """Train an ExtraTrees text classifier on TF-IDF features and evaluate it.

    Uses the module-level ``docs_train`` / ``y_train`` / ``docs_test`` /
    ``y_test`` data and the ``_``-prefixed configuration globals.  Prints the
    10-fold cross-validation scores (``f1_weighted``), the test-set score, a
    classification report and the confusion matrix.

    Returns:
        Tuple ``(clf, count_vect)``: the fitted classifier and vectorizer.
    """
    # Bag-of-words features
    count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90,
                                 ngram_range=_ngram_range)
    train_counts = count_vect.fit_transform(docs_train)
    # number of unique words (n_features)
    print("Shape of train data is " + str(train_counts.shape))

    # TF-IDF weighting
    tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
    train_tfidf = tfidf_transformer.fit_transform(train_counts)

    print("Fitting data ...")
    clf = ExtraTreesClassifier(n_estimators=_n_estimators, criterion=_criterion,
                               max_depth=_max_depth, min_samples_split=_min_samples_split)
    clf = clf.fit(train_tfidf, y_train)

    # 10-fold cross-validation on the training data
    scores = cross_val_score(clf, train_tfidf, y_train, cv=10, scoring='f1_weighted')
    print("Cross validation score: " + str(scores))
    # Mean and two standard deviations of the fold scores
    print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Evaluation on the test documents
    test_counts = count_vect.transform(docs_test)
    print("Shape of test data is " + str(test_counts.shape))
    test_tfidf = tfidf_transformer.transform(test_counts)
    y_predicted = clf.predict(test_tfidf)

    print("Classifier score on test data is: %0.2f " % clf.score(test_tfidf, y_test))
    print(metrics.classification_report(y_test, y_predicted))
    print(metrics.confusion_matrix(y_test, y_predicted))

    return clf, count_vect
def _gi_csv_path(output_root, posture, i_user, block):
    """Return the genuine/impostor CSV path for one user, posture and block."""
    return os.path.join(
        output_root,
        "E3-Genuine Impostor data per user per posture",
        "posture-" + str(posture),
        "User-" + str(i_user),
        "1-" + str(i_user) + "-" + str(posture) + "-" + str(block) + "-GI.csv")


def _genuine_labels(data, i_user, posture):
    """Return 1.0 for rows whose first three columns are [1, i_user, posture], else 0.0."""
    return np.all(data[:, 0:3] == [1, i_user, posture], axis=1).astype(float)


def PCA_reduction(posture, trainblock, componenet, n_users=31, n_blocks=6):
    """Evaluate PCA + ExtraTrees per user and block and save the AUC table.

    For every user the model is trained on block ``trainblock`` (min-max
    scaled, reduced to ``componenet`` principal components) and tested on
    each block.  The AUC of the hard predictions is collected into an
    ``n_users x n_blocks`` matrix and written to
    ``<parent>/Output Files/E5-Dimensionality Reduction/posture-<p>/TrainBlock-<b>/PCA-<c>-Component.csv``,
    where ``<parent>`` is two levels above the current directory.

    Args:
        posture: Posture identifier used in file names and labels.
        trainblock: Block number used for training.
        componenet: Number of PCA components.
        n_users: Number of users to process (users are numbered from 1).
        n_blocks: Number of test blocks per user (numbered from 1).
    """
    feature_cols = [3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17]

    # Two levels above the working directory.  All paths are built with
    # os.path.join so the created directory and the file written below always
    # agree (the old backslash literal only worked on Windows).
    parentdirectory = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
    output_root = os.path.join(parentdirectory, "Output Files")
    path = os.path.join(output_root, "E5-Dimensionality Reduction",
                        "posture-" + str(posture), "TrainBlock-" + str(trainblock))
    if not os.path.exists(path):
        os.makedirs(path)

    AUC = []
    for i_user in range(1, n_users + 1):
        # The training block does not depend on the test block, so it is
        # loaded, labelled and scaled once per user.
        train_data = np.genfromtxt(_gi_csv_path(output_root, posture, i_user, trainblock),
                                   dtype=float, delimiter=",")
        target_train = _genuine_labels(train_data, i_user, posture)
        sample_train = train_data[:, feature_cols]
        scaler = preprocessing.MinMaxScaler().fit(sample_train)
        sample_train_scaled = scaler.transform(sample_train)

        for block in range(1, n_blocks + 1):
            test_data = np.genfromtxt(_gi_csv_path(output_root, posture, i_user, block),
                                      dtype=float, delimiter=",")
            target_test = _genuine_labels(test_data, i_user, posture)
            sample_test_scaled = scaler.transform(test_data[:, feature_cols])

            pca = PCA(n_components=componenet)
            sample_train_pca = pca.fit(sample_train_scaled).transform(sample_train_scaled)
            sample_test_pca = pca.transform(sample_test_scaled)

            clf = ExtraTreesClassifier(n_estimators=100)
            clf.fit(sample_train_pca, target_train)
            prediction = clf.predict(sample_test_pca)
            AUC.append(metrics.roc_auc_score(target_test, prediction))

    print(AUC)
    AUC = np.array(AUC).reshape(n_users, n_blocks)
    np.savetxt(os.path.join(path, "PCA-" + str(componenet) + "-Component.csv"),
               AUC, delimiter=",")
def extraTree(X, y, train, valid):
    """Fit ExtraTrees on the ``train`` rows and score it on the ``valid`` rows.

    Prints accuracy, a classification report and the ROC AUC, saves the
    positive-class probabilities to ``y_extratree.csv`` and returns them.
    """
    forest = ExtraTreesClassifier(n_jobs=-1, n_estimators=300, verbose=2,
                                  random_state=1, max_depth=10, bootstrap=True)
    forest.fit(X[train], y[train])

    X_valid, y_valid = X[valid], y[valid]
    yhat = forest.predict(X_valid)
    # Column 1 holds the probability of the second class.
    yhat_prob = forest.predict_proba(X_valid)[:, 1]

    print("extra tree randomForest" + str(accuracy_score(y_valid, yhat)))
    print(classification_report(y_valid, yhat))
    print("extra tree randomForest roc_accuracy" + str(roc_auc_score(y_valid, yhat_prob)))

    np.savetxt("y_extratree.csv", yhat_prob)
    return yhat_prob
def main():
    """Train ExtraTrees on pre-extracted features and write ``Poutput.csv``.

    Labels come from the first column of ``train.csv``; features are read
    from ``Ptrfeatures.csv`` / ``Ptestfeatures.csv``.  The output file has an
    ``ImageId,Label`` header followed by 1-based ids and predictions.
    """
    print("Reading training data")
    trdata = csvtolist2D('train.csv')
    print("%s %s" % ("Length of training data : ", len(trdata)))
    print("Reading test data")
    testdata = csvtolist2D('test.csv')
    print("%s %s" % ("Length of test data : ", len(testdata)))

    # first row is for Headings
    trdata = trdata[1:]
    testdata = testdata[1:]

    # Split the label (first column) from the pixel data.
    labels = [int(row[0]) for row in trdata]
    trdata = [row[1:] for row in trdata]

    # Feature extraction was run once and cached to CSV:
    #   trfeatures = featureextractor(trdata)
    #   list2DtoCSV(trfeatures, "Ptrfeatures.csv")
    #   testfeatures = featureextractor(testdata)
    #   list2DtoCSV(testfeatures, "Ptestfeatures.csv")
    print("reading features....")
    trfeatures = csvtolist2D("Ptrfeatures.csv")
    testfeatures = csvtolist2D("Ptestfeatures.csv")

    # Optional standardisation (disabled):
    #   scaler = preprocessing.StandardScaler().fit(trfeatures)
    #   trfeatures = scaler.transform(trfeatures)
    #   testfeatures = scaler.transform(testfeatures)
    print("Starting training...")
    # Alternatives tried: svm.SVC(), RandomForestClassifier(n_estimators=150)
    clf = ExtraTreesClassifier(n_estimators=150).fit(trfeatures, labels)

    print("Predicting result...")
    RFCresult = clf.predict(testfeatures)

    output = [['ImageId', 'Label']]
    output.extend([image_id, label] for image_id, label in enumerate(RFCresult, 1))
    list2DtoCSV(output, "Poutput.csv")
def extratreeclassifier(input_data, output_labels, filename, m_d=3, n_est=10, rs=0):
    """Evaluate an ExtraTrees classifier as a comparison model.

    Runs the project's cross-validation, fits on a 75/25 split (seed 42),
    then produces the confusion matrix, ROC plot and coefficient of
    determination via the project helpers and prints the feature importances.

    Args:
        input_data: Feature matrix.
        output_labels: Target labels.
        filename: Base name for the generated plot files.
        m_d: Maximum tree depth.
        n_est: Number of trees.
        rs: Random state of the forest.
    """
    from sklearn.ensemble import ExtraTreesClassifier

    forest = ExtraTreesClassifier(max_depth=m_d, n_estimators=n_est, random_state=rs)
    crossValidation(input_data, output_labels, forest)

    X_train, X_test, Y_train, Y_test = train_test_split(
        input_data, output_labels, test_size=0.25, random_state=42)
    forest.fit(X_train, Y_train)
    held_out_pred = forest.predict(X_test)

    calc_conf_matrix(Y_test, held_out_pred,
                     'Extra Tree Classifier confusion matrix', filename + '_cm')
    roc_plot(input_data, output_labels, forest, 'roc_' + filename)
    coeff_of_deterimination(forest, input_data, output_labels, 3)
    print(forest.feature_importances_)
def test_extra_trees_3():
    """Ensure that the TPOT ExtraTreesClassifier outputs the same as the sklearn version when max_features > the number of features"""
    tpot_obj = TPOT()

    is_training = training_testing_data['group'] == 'training'
    training_features = training_testing_data.loc[is_training].drop(
        tpot_obj.non_feature_columns, axis=1)
    num_features = len(training_features.columns)

    # Ask TPOT for one feature more than exists.
    tpot_result = tpot_obj._extra_trees(training_testing_data, 0, num_features + 1)
    testing_rows = tpot_result[tpot_result['group'] == 'testing']

    reference = ExtraTreesClassifier(n_estimators=500, random_state=42,
                                     max_features=num_features, criterion='gini')
    reference.fit(training_features, training_classes)
    expected = reference.predict(testing_features)

    assert np.array_equal(testing_rows['guess'].values, expected)
def execute(fdata):
    """Compare ExtraTrees on all features with a linear SVM on selected features.

    Each line of ``fdata`` is ``label,feature,feature,...``.  The data is
    split 75/25 (seed 0), an ExtraTrees model is fitted, and
    ``SelectFromModel`` keeps its important features for a linear SVC.

    Args:
        fdata: Iterable of comma-separated text lines.

    Returns:
        Tuple ``(extra_trees_accuracy, svm_accuracy)`` on the test split.
    """
    target = []
    rows = []
    for line in fdata:
        fields = line.split(",")
        target.append(int(fields[0]))
        rows.append([float(value) for value in fields[1:]])
    data = np.array(rows)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        data, target, test_size=0.25, random_state=0)

    clf = ExtraTreesClassifier()
    clf = clf.fit(X_train, y_train)

    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X_train)
    clfNew = svm.SVC(kernel='linear', C=1).fit(X_new, y_train)

    # Apply the same selector to the test rows.  The previous code recovered
    # the kept columns by matching feature *values* of one training row,
    # which picks wrong columns when values repeat and can raise ValueError.
    X_test_new = model.transform(X_test)

    return (accuracy_score(y_test, clf.predict(X_test)),
            accuracy_score(y_test, clfNew.predict(X_test_new)))
def feature_selection(train, y, max_feats=16):
    """Search for the best number of top-ranked features for gradient boosting.

    Features are ranked by the importances of an ExtraTrees model fitted on a
    stratified 70/30 split (seed 42).  For 1, 2, ... features (at most
    ``max_feats``) a GradientBoostingClassifier is fitted on the top-ranked
    features and scored on the held-out part; whenever the F1 score improves,
    the boolean column mask is saved to ``./features/clf_sel.npy``.

    Args:
        train: DataFrame of features.
        y: Series of binary labels aligned with ``train``.
        max_feats: Largest number of features to try.
    """
    sss = StratifiedShuffleSplit(y, n_iter=1, test_size=.3, random_state=42)
    train_idx, test_idx = next(iter(sss))
    xtrain = train.iloc[train_idx].values
    ytrain = y.iloc[train_idx].values
    xtest = train.iloc[test_idx].values
    ytest = y.iloc[test_idx].values

    clf_et = ExtraTreesClassifier().fit(xtrain, ytrain)
    et_preds = clf_et.predict(xtest)
    print("%s %s" % ('initial f1 score based on extra trees classifier: ',
                     f1_score(ytest, et_preds)))

    feat_imp = clf_et.feature_importances_
    sorted_fi = feat_imp[np.argsort(feat_imp)[::-1]]  # descending sort
    print("%s %s" % ('feature importance: ', feat_imp))
    print("%s %s" % ('sorted feature importances: ', sorted_fi))

    clf_gb = GradientBoostingClassifier()
    feats_tot = xtrain.shape[1]
    f1_best = 0
    print("output format:")
    print("no of features, f1-score, roc-score of class-predictions, roc-score of probabilities")
    for feats in range(1, feats_tot + 1):
        if feats < len(sorted_fi):
            # Keep everything strictly more important than the next-ranked
            # feature, i.e. the top `feats` features (fewer on ties).
            select = feat_imp > sorted_fi[feats]
        else:
            # All features requested: there is no next-ranked importance to
            # compare with (indexing it used to raise IndexError).
            select = np.ones(len(feat_imp), dtype=bool)

        clf_gb.fit(xtrain[:, select], ytrain)
        tmp_preds = clf_gb.predict(xtest[:, select])
        tmp_probs = clf_gb.predict_proba(xtest[:, select])[:, 1]
        f1 = f1_score(ytest, tmp_preds)
        roc_pred = roc_auc_score(ytest, tmp_preds)
        roc_prob = roc_auc_score(ytest, tmp_probs)
        if f1 > f1_best:
            f1_best = f1
            np.save('./features/clf_sel.npy', select)
        print("%s %s %s %s" % (feats, f1, roc_pred, roc_prob))
        if feats >= max_feats:
            break
    print("%s %s" % ("f1_best:", f1_best))
def cross_val(clf_name, X, y, n_folds=5, proba=False, score=accuracy_score, *params, **kwargs):
    """Produce out-of-fold predictions for a named classifier.

    Args:
        clf_name: Short classifier name ("extra", "grad", "cgrad", "cmulti",
            "multi", "bag", "bern", "gauss", "random", "lda", "logistic",
            "svm", "knn", "near", "ridge" or "sgd").
        X: Feature matrix indexable as ``X[rows, :]``.
        y: Label array.
        n_folds: Number of stratified folds (shuffled, seed 41).
        proba: When true, collect ``predict_proba`` output (one column per
            class) instead of hard predictions.
        score: Metric called as ``score(y_true, y_pred)``.
        *params, **kwargs: Forwarded to the classifier constructor (not used
            for "sgd").

    Returns:
        Array of out-of-fold predictions; 1-D for hard predictions, 2-D
        ``(n_samples, n_classes)`` when ``proba`` is true.

    Raises:
        ValueError: If ``clf_name`` is not a known classifier name.
    """
    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=41)

    # Factories are lazy so only the requested estimator is built.
    factories = {
        "extra": lambda: ExtraTreesClassifier(
            12, *params, max_depth=23, max_features=10, n_jobs=-1, **kwargs),
        "grad": lambda: GradientBoostingClassifier(
            *params, n_estimators=40, learning_rate=0.1, **kwargs),
        "cgrad": lambda: CalibratedClassifierCV(
            base_estimator=GradientBoostingClassifier(
                *params, n_estimators=20, learning_rate=0.1, **kwargs),
            method='isotonic', cv=10),
        "cmulti": lambda: CalibratedClassifierCV(
            base_estimator=MultinomialNB(*params, alpha=alpha_multi, **kwargs),
            method='isotonic', cv=10),
        "multi": lambda: MultinomialNB(*params, **kwargs),
        "bag": lambda: BaggingClassifier(
            base_estimator=MultinomialNB(*params, alpha=0.5, **kwargs),
            n_estimators=100, n_jobs=-1),
        "bern": lambda: BernoulliNB(*params, alpha=0.00000000001, **kwargs),
        "gauss": lambda: GaussianNB(*params, **kwargs),
        "random": lambda: RandomForestClassifier(
            1200, *params, max_depth=23, max_features=10, n_jobs=-1, **kwargs),
        "lda": lambda: LinearDiscriminantAnalysis(*params, **kwargs),
        "logistic": lambda: LogisticRegression(*params, C=1, **kwargs),
        "svm": lambda: LinearSVC(*params, C=100, **kwargs),
        "knn": lambda: KNeighborsClassifier(*params, n_neighbors=20, **kwargs),
        "near": lambda: NearestCentroid(*params, **kwargs),
        "ridge": lambda: OneVsOneClassifier(RidgeClassifier(*params, alpha=0.1, **kwargs)),
        "sgd": lambda: SGDClassifier(loss="hinge", penalty="l2", n_iter=50,
                                     alpha=0.000001, fit_intercept=True, average=True),
    }
    try:
        build = factories[clf_name]
    except KeyError:
        # Previously an unknown name surfaced later as a NameError on `c`.
        raise ValueError("Unknown classifier name: %r" % (clf_name,))
    c = build()

    y_pred = None
    score_list = []
    for i, (train, test) in enumerate(cv):
        c.fit(X[train, :], y[train])
        fold_pred = c.predict_proba(X[test, :]) if proba else c.predict(X[test, :])
        if y_pred is None:
            # Sized from the first fold so probability matrices fit; a 1-D
            # buffer could never hold (n_test, n_classes) output.
            y_pred = np.zeros((len(y),) + fold_pred.shape[1:])
        y_pred[test] = fold_pred
        score_list.append(score(y[test], y_pred[test]))
        print(score_list[i])
    print("Final score", score(y, y_pred))
    return y_pred
def binary_cbf(oversampling=(0, 0)):
    """
    Binary content-based classification of review samples with ExtraTrees.

    :param oversampling: Tuple(Int), double review samples with star classes in range
    :return: None
    """
    t = time()
    feature_names = [
        'brcnt',
        'bstar',
        'checkins',
        'compliments',
        'fans',
        'rdate',
        'urcnt',
        'ustar',
        'uvotes',
        'ysince',
    ]
    with sqlite3.connect(DB_PATH) as conn:
        y = FeatureReformer(conn, 'r_samples', ['rstar']).transform('y2').transpose()[0]
        X = FeatureReformer(conn, 'r_samples', feature_names).transform()

    # oversampling
    ovsp = over_sampling(y, oversampling)
    y = y[ovsp]
    X = X[ovsp]
    n_samples, n_features = X.shape
    print(X.shape)
    print('Done with collecting & reforming data from database, using ', time()-t, 's')

    t = time()
    rec_scorer = RecScorer(n_class=2)
    div = ShuffleSplit(n_samples, n_iter=5, test_size=0.2, random_state=0)
    model = ExtraTreesClassifier(n_estimators=5)
    for train, test in div:
        train_rows = np.array(train)
        test_rows = np.array(test)
        model.fit(X[train_rows], y[train_rows])
        y_pred = model.predict(X[test_rows])

        # Metrics below
        rec_scorer.record(y_true=y[test_rows], y_pred=y_pred)
        print(time()-t, 's used >>\n')

    print('Done with 5-fold training & cross validating, using ', time()-t, 's')
    rec_scorer.finalScores()
class ERFTrainer(object):
    """ExtraTrees classifier that is trained on, and answers with, word labels."""

    def __init__(self, X, label_words):
        """Encode ``label_words`` numerically and fit the forest on ``X``."""
        self.le = preprocessing.LabelEncoder()
        self.clf = ExtraTreesClassifier(n_estimators=100, max_depth=16, random_state=0)
        targets = self.encode_labels(label_words)
        self.clf.fit(np.asarray(X), targets)

    def encode_labels(self, label_words):
        """Fit the label encoder and return the labels as float32 codes."""
        self.le.fit(label_words)
        codes = self.le.transform(label_words)
        return np.array(codes, dtype=np.float32)

    def classify(self, X):
        """Predict ``X`` and return the corresponding word labels."""
        predicted = self.clf.predict(np.asarray(X))
        # Predictions come back as floats; the encoder needs integer codes.
        codes = [int(value) for value in predicted]
        return self.le.inverse_transform(codes)
def etclassifier(training_samples, eval_samples, do_grid_search=True):
    """Fit ExtraTrees (optionally grid-searched) and predict the evaluation set.

    Args:
        training_samples: Pair ``(X_train, Y_train)``.
        eval_samples: Pair ``(X_eval, Y_eval)``; only ``X_eval`` is used.
        do_grid_search: Tune ``max_features``, ``min_samples_split`` and
            ``min_samples_leaf`` with 5-fold log-loss grid search when true;
            otherwise print 5-fold cross-validation scores of the fixed model.

    Returns:
        Tuple ``(Y_eval, Y_prob)``: predicted labels and class probabilities
        for ``X_eval``.
    """
    X_train, Y_train = training_samples
    X_eval, _ = eval_samples

    clf = ExtraTreesClassifier(max_depth=None,
                               n_estimators=1000,
                               min_weight_fraction_leaf=0.0,
                               max_features=None,
                               min_samples_split=16,
                               criterion='gini',
                               min_samples_leaf=2,
                               max_leaf_nodes=None,
                               oob_score=False,
                               bootstrap=True,
                               n_jobs=10,
                               random_state=None,
                               verbose=0,
                               warm_start=False,
                               class_weight=None)

    if do_grid_search:
        # Earlier best: {'max_features': None, 'min_samples_split': 10,
        #                'n_estimators': 1000, 'min_samples_leaf': 2}
        to_be_tuned_parameters = {
            # 'n_estimators': [500, 2000, 4000],
            'max_features': ['log2', 'auto', None],
            'min_samples_split': [2, 8, 16],
            'min_samples_leaf': [1, 2],
        }
        clf = GridSearchCV(clf, to_be_tuned_parameters, cv=5, n_jobs=5, scoring='log_loss')

    print(clf)
    clf.fit(X_train, Y_train)

    if not do_grid_search:
        scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=5,
                                                  n_jobs=5, scoring='log_loss')
        print("%s %s %s" % (scores, np.mean(scores), np.median(scores)))
    else:
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        for params, mean_score, scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))

    Y_eval = clf.predict(X_eval)
    Y_prob = clf.predict_proba(X_eval)
    return Y_eval, Y_prob
def trainExtraRandomForest(self, trainingData, evaluationData, validCols):
    """
    Train the extra random forest model, and return the predictions and the instantiated trained model

    Args:
        trainingData: DataFrame with the feature columns and a 'Result' column.
        evaluationData: DataFrame to predict; must contain ``validCols``.
        validCols: Names of the feature columns to use.

    Returns:
        Tuple ``(predictions, model)`` for ``evaluationData``.
    """
    # Get the results list
    trainingResults = trainingData['Result'].tolist()

    # The forest size is drawn at random for every call.
    nEstimators = randint(50, 1000)
    logging.info('Extra Random Forest - Model Iterations %i', nEstimators)

    # min_samples_split=2 grows the same trees as the former value 1 and is
    # the smallest value current scikit-learn accepts.
    extraModel = ExtraTreesClassifier(n_estimators=nEstimators, max_depth=None,
                                      min_samples_split=2, random_state=0)

    # Train the model.  time.clock() no longer exists on Python 3.8+, so the
    # training time is measured as wall-clock time.
    start = time.time()
    extraModel = extraModel.fit(trainingData[validCols], trainingResults)
    elapsed = (time.time() - start)
    logging.info('Extra Random Forest - Training Time %f secs', elapsed)

    # Return the model predictions for evaluation
    return extraModel.predict(evaluationData[validCols]), extraModel
def allfeatures_001():
    """Train and score an ExtraTrees plan predictor on the engineered features.

    Builds the feature pipeline (last shopping point filter plus a union of
    column transformers), fits a 100-tree forest on the binarised plan
    columns A-G of the training split, and scores the decoded predictions on
    the test split through the ``classes`` helpers.
    """
    train = classes.get_train_data()

    copy = ColumnExtractor(['group_size', 'homeowner', 'car_age', 'age_oldest',
                            'age_youngest', 'married_couple'])
    day = DayTransformer()
    state = StateTransformer()
    car_val = FillEncoderBinarizer('car_value', 'z')
    risk_factor = FillEncoderBinarizer('risk_factor', 0)
    c_prev = FillEncoderBinarizer('C_previous', 0)
    c_dur = FillEncoderBinarizer('duration_previous', -1)
    last_plan = LastObservedPlan()  # built but currently left out of the union

    features = FeatureUnion([
        ('copy', copy),
        ('day', day),
        ('state', state),
        ('car_val', car_val),
        ('risk_factor', risk_factor),
        ('c_prev', c_prev),
        ('c_dur', c_dur),
        # ('last_plan', last_plan)
    ])
    pipeline = Pipeline([
        ('filter', LastShoppingPointSelector()),
        ('features', features),
    ])

    train, test = classes.train_test_split(train)
    train_x = pipeline.fit_transform(train)

    train_y = classes.split_plan(classes.get_actual_plan(train))
    y_encoder = classes.MultiColLabelBinarizer()
    y = y_encoder.fit_transform(train_y[list('ABCDEFG')])

    est = ExtraTreesClassifier(n_estimators=100, verbose=3)
    est.fit(train_x, y)

    actuals = classes.split_plan(classes.get_actual_plan(test))
    test_x = pipeline.transform(classes.truncate(test))
    decoded = y_encoder.inverse_transform(est.predict(test_x))
    pred = classes.concatenate_plan(decoded)

    score = classes.score_df(pred, actuals)
    scores = classes.col_score_df(pred, actuals)
def train_model(stats, X_train, Y_train, X_test=None, Y_test=None):
    """Fit the configured classifier and record accuracy statistics.

    The model is built from the module-level ``Classifier``,
    ``n_estimators`` and ``nodesize``.  Training accuracy is always stored in
    ``stats``; when a test set is given, overall / positive-only /
    negative-only test accuracy and the ROC AUC are stored as well.

    Returns:
        Tuple ``(clf, stats)``.
    """
    print("Training ExtraTrees classifier")
    clf = Classifier(n_estimators=n_estimators, n_jobs=30,
                     min_samples_leaf=nodesize,
                     # class_weight='balanced_subsample',
                     )
    clf.fit(X_train, Y_train)
    stats["train_acc"] = clf.score(X_train, Y_train)
    print("Training complete")
    print('Training Accuracy: %.3f' % stats["train_acc"])

    # Breakout early if no test set is given
    if X_test is None:
        return clf, stats

    stats["test_acc"] = clf.score(X_test, Y_test)
    print('Testing Accuracy: %.3f' % stats["test_acc"])

    # Accuracy restricted to positive, then to negative, test samples.
    positives = Y_test == 1
    stats["test_acc_TP"] = clf.score(X_test[positives], Y_test[positives])
    print('Testing Accuracy TP: %.3f' % stats["test_acc_TP"])

    negatives = Y_test == 0
    stats["test_acc_FP"] = clf.score(X_test[negatives], Y_test[negatives])
    print('Testing Accuracy FP: %.3f' % stats["test_acc_FP"])

    pred_probas = clf.predict_proba(X_test)[:, 1]
    Y_predict = clf.predict(X_test)

    total_contacts = Y_test.sum()
    predicted_contacts = Y_predict[Y_test == 1].sum()
    print('Total contacts predicted %i/%i' % (predicted_contacts, total_contacts))

    fpr, tpr, _ = roc_curve(Y_test, pred_probas)
    stats["ROC_AUC"] = auc(fpr, tpr)
    print("%s %s" % ("ROC area under the curve", stats["ROC_AUC"]))

    return clf, stats
def makeAllEvals(dataset, dbtype='CATH', level=1, k_iters=10):
    """Stratified k-fold evaluation of a 300-tree ExtraTrees model.

    The data comes from ``dbParser``, using its ``'vectors'`` and
    ``'target_names'`` entries.  Per fold the accuracy, classification report
    and confusion matrix are printed, followed by the mean and standard
    deviation of the fold accuracies.
    """
    dataDict = dbParser(dataset, level=level, dbtype=dbtype)
    print(dataDict)

    labels = dataDict['target_names']
    vectors = dataDict['vectors']
    skf = StratifiedKFold(labels, k_iters)

    # Level 1 used n_estimators=100; the level 2 setting is active.
    clf = ExtraTreesClassifier(n_estimators=300, min_samples_split=2, max_depth=None)

    accsList = []
    for train, test in skf:
        print('\n--------------------------------------------------\n')
        fold_train = [vectors[i] for i in train]
        fold_test = [vectors[i] for i in test]
        fold_targets = [labels[i] for i in train]

        clf.fit(fold_train, fold_targets)

        y_true = [labels[i] for i in test]
        y_pred = clf.predict(fold_test)

        localAccuracy = accuracy_score(y_true, y_pred)
        accsList.append(localAccuracy)
        print("%s %s" % ('*** localAccuracy:', localAccuracy))

        print(classification_report(y_true, y_pred))
        print(confusion_matrix(y_true, y_pred, labels=clf.classes_))

    print('\n[ FINAL SUMMARY ]')
    print("%s %s" % (' *** ACCURACY: ', np.mean(accsList)))
    print("%s %s" % (' *** ACCURACY deviation: ', np.std(accsList)))
class extraTrees:
    """Thin wrapper around ExtraTreesClassifier working on labelled columns."""

    def __init__(self, predictorsLabel, targetLabel='target', n_estimators=5):
        """Store the column names and create the (unfitted) forest.

        Args:
            predictorsLabel: Names of the feature columns.
            targetLabel: Name of the target column in the training data.
            n_estimators: Number of trees.
        """
        self.predictorsLabel = predictorsLabel
        self.targetLabel = targetLabel
        self.ETC = ExtraTreesClassifier(n_estimators)

    def training(self, trainingData, n_estimators=5):
        """Fit the forest on ``trainingData``; missing values become -1.0.

        ``n_estimators`` is accepted for backward compatibility and ignored;
        the forest size is fixed in the constructor.
        """
        target = trainingData[self.targetLabel]
        train = pd.DataFrame(trainingData, columns=self.predictorsLabel)
        train = train.replace(np.nan, -1.0)
        self.ETC.fit(train, target)

    def prediction(self, predictionData):
        """Predict the target for ``predictionData``.

        Returns:
            DataFrame with the columns ``ID`` and ``target``.
        """
        test = pd.DataFrame(predictionData, columns=self.predictorsLabel)
        test = test.replace(np.nan, -1.0)
        predTarget = self.ETC.predict(test)
        # The ID is only present in `test` when it is also a predictor;
        # otherwise read it from the input (this used to raise KeyError).
        ids = test['ID'] if 'ID' in test.columns else predictionData['ID']
        pred = pd.DataFrame(np.array([ids, predTarget]).T,
                            columns=["ID", "target"])
        return pred
# Extra Trees: fit, report train/test accuracy and plot the confusion matrix.
# `etc`, the data arrays, `class_names` and `start_time_ET` are defined
# earlier in the script.
etc.fit(X_train, y_train)
print("\n----------ET----------")
print('Accuracy of ET classifier on training set: {:.3f}'.format(etc.score(X_train, y_train)))
# test data set acc
print('Accuracy of ET classifier on test set: {:.3f}'.format(etc.score(X_new_bal, y_new_bal)))

# ' test data matrix'
# y_pred = etc.fit(X_train, y_train).predict(X_test)
# # Plot non-normalized confusion matrix
# plot_confusion_matrix(y_test, y_pred, classes=class_names,
#                       title='Confusion matrix, without normalization')
# plt.show()

# Confusion Matrix
y_pred_et = etc.predict(X_new_bal)
# Plot non-normalized confusion matrix
plot_confusion_matrix(y_new_bal, y_pred_et, classes=class_names,
                      title='Confusion matrix, without normalization')
# The label used to say "DT", copied from the decision-tree section.
print("Time consuming of ET is: ", time.time() - start_time_ET)
plt.show()

# ' roc for test'
# roc_curve_plot(X_test, y_test, etc)
# ' pre_re for test'
# precision_recall_curve(X_Balance_test, y_Balance_test, etc)

# 2.7 Logistic Regression
start_time_LR = time.time()
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(solver='lbfgs')
# Wow! We can see that we have a lot of features for nothing! Basicly the Tickets Features. # These features generate noise # In[ ]: #Selected features g = sns.barplot(y=col[:X_test_best.shape[1]], x=importances[indices][:X_test_best.shape[1]], orient='h') g.set_xlabel("Relative importance", fontsize=12) g.set_ylabel("Features", fontsize=12) g.tick_params(labelsize=9) g.set_title("Feature importance") plt.show() # So, my new features "Groups", "Ticket_tog" and "CabinYN" are some of the important features !! # They are not among the most important features such as: Embarked and Tickets. # The best algorithm is Extra Tree Classifier. # Create the results file. # In[ ]: ExtC.fit(X_train_best, y_train) test_Survived = pd.Series(ExtC.predict(X_test_best), name="Survived") r = pd.DataFrame(test_Survived, dtype="int64") results = pd.concat([IDtest, r], axis=1) results.to_csv("result.csv", index=False) #
# Cross-validated score of the tree classifier.
result = cross_val_score(cls, x_train, x_target, cv=skf,
                         scoring=make_scorer(acc), n_jobs=-1)

# Print mean and standard deviation of the per-split scores.
print("ACC: %0.2f (+/- %0.2f)" % (result.mean(), result.std()))

# Fit on the full data.
cls.fit(x_train, x_target)

# Confusion matrix -- note it is computed on the data the model was fitted on.
prediction = cls.predict(x_train)
cnf_matrix = confusion_matrix(x_target, prediction)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cnf_matrix,
                      classes=range(len(set(x_target))),
                      normalize=False,
                      title='Confusion matrix')
# Save the same figure once per output format.
for figure_path in ("bankTreeConfusion.png", "bankTreeConfusion.pdf"):
    plt.savefig(figure_path, bbox_inches='tight')

# Get important features
importances = cls.feature_importances_
max_depth=15, max_features='auto', max_leaf_nodes=None, min_impurity_split=1e-07, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False) xtra.fit(train[features], train['Mood']) print "Extra Trees Classifier" print accuracy_score(test['Mood'], xtra.predict(test[features])) ada = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1, n_estimators=300, random_state=None) ada.fit(train[features], train['Mood']) print "Ada Boost Classifier" print accuracy_score(test['Mood'], ada.predict(test[features])) knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean', metric_params=None, n_jobs=1,
best_params = {
    'class_weight': 'balanced',
    'criterion': 'entropy',
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 5,
    'min_samples_split': 10,
    'n_estimators': 500,
}
et_clf = ExtraTreesClassifier(**best_params)
et_clf.fit(X_train, y_train)

print('test accuracy')
y_hat = et_clf.predict(X_test)
print(classification_report(y_test, y_hat))

print('all towns')
y_all_hat = et_clf.predict(X_data)
print(classification_report(y.targets, y_all_hat))
selected_data['predicted'] = y_all_hat

# %%
print('predicted positive')
predicted_positive = selected_data[selected_data.predicted == 1]
for pop_id, r in predicted_positive.iterrows():
    print(pop_id, r.predicted, r.city_type, r.city_name)
# dropna() removes None *and* NaN. The previous `is not None` test let NaN
# through, which then broke sorted()/join() on mixed float/str values.
pp = predicted_positive['city_name'].dropna().tolist()
print(', '.join(sorted(set(pp))))
rfe = rfe.fit(x, y)
print(rfe.support_)
print(rfe.ranking_)

# define the training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
# Was a Python-2 print statement; the call form matches the other prints
# in this script and is valid on Python 3.
print(x_test.shape)

# Decision tree algorithm
# train the model on the training set
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
print(model)

# make predictions for the test set
expected = y_test
predicted = model.predict(x_test)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

sales_test['return'] = model.predict(z)
print(sales_test['return'].value_counts())

# 10-fold cross-validated accuracy on the full data
Kscore = cross_val_score(model, x, y, cv=10, scoring='accuracy')
print(Kscore)
print(Kscore.mean())

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
#clf = RandomForestClassifier(n_estimators=10)
clf = KNeighborsClassifier(n_neighbors=1)
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
# Everything except the 'target' column is used as a feature.
features = tpot_data.drop('target', axis=1).values
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
     features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.7593650793650794
exported_pipeline = ExtraTreesClassifier(
    bootstrap=False,
    criterion="entropy",
    max_features=0.6500000000000001,
    min_samples_leaf=5,
    min_samples_split=9,
    n_estimators=100,
)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Base hyper-parameters; an optional first CLI argument 'balance' adds
# class weighting, any other argument is rejected.
params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 0}
if len(sys.argv) > 1:
    if sys.argv[1] != 'balance':
        raise TypeError("Invalid input argument; should be 'balance'")
    params['class_weight'] = 'balanced'

classifier = ExtraTreesClassifier(**params)
classifier.fit(x_train, y_train)
visualize_classifier(classifier, x_train, y_train, 'Training dataset')

y_test_pred = classifier.predict(x_test)
visualize_classifier(classifier, x_test, y_test, 'Test dataset')

# Evaluate classifier performance
class_names = ['Class-0', 'Class-1']
rule = "#" * 40
print("\n" + rule)
print("\nClassifier performance on training dataset\n")
y_train_pred = classifier.predict(x_train)
print(classification_report(y_train, y_train_pred, target_names=class_names))
print(rule + "\n")

print(rule)
print("\nClassifier performance on test dataset\n")
print(classification_report(y_test, y_test_pred, target_names=class_names))
print(rule + "\n")
tmp = np.zeros(self.shape) height, width = self.shape for row in xrange(height): for col in xrange(width): tmp[row][col] = self.getElement(row, col) return tmp if __name__ == "__main__": from sklearn.ensemble import ExtraTreesClassifier classif = ExtraTreesClassifier() nbObj = 10 nbFeat = 7 nbClass = 4 Xl = np.random.rand(nbObj, nbFeat) yl = np.random.randint(0, nbClass - 1, nbObj) Xlv = FeatureBiaser(Xl, [(0, 3), (1, 3)]) Xlv = Xlv.asContiguousArray() print Xl.shape, Xlv.shape classif.fit(Xlv, yl) Xt = np.random.rand(nbObj, nbFeat) Xtv = FeatureBiaser(Xt, [(0, 3), (1, 3)]) Xtv = Xtv.asContiguousArray() print Xt.shape, Xtv.shape classif.predict(Xtv)
# In[ ]:

evaluate(y, y_pred2)

# In[ ]:

model3 = ExtraTreesClassifier()

# In[ ]:

# The first 700 rows are used for fitting, the remainder for evaluation.
n_fit_rows = 700
model3.fit(X[:n_fit_rows], y[:n_fit_rows])

# In[ ]:

y_pred3 = model3.predict(X[n_fit_rows:])

# In[ ]:

evaluate(y[n_fit_rows:], y_pred3)

# ## TODO: Only use these as features.
# ## 1. Generate word bags using code similar to naive_bayes_classifier
# ## 2. The reval characteristic is captured by the n-day rate movement; we first set n == 1
# ## 3. Fit a naive Bayes model to learn which word/words could have the most impact on the rate.

# ## Future work
# 1. Use title information and treat title words differently from those in the body of the article
class TreeRegression(RegressionModel):
    """Regression-model variant backed by an ``ExtraTreesClassifier``.

    Reference to the library used:
    https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    """

    def __init__(self, data, normalize=False, n_estimators=1000,
                 min_samples_leaf=1, max_depth=None, **kwargs):
        """Initialise the instance and build the (unfitted) forest.

        Args:
            data: Passed through to ``RegressionModel.__init__``.
            normalize: Passed through to ``RegressionModel.__init__``.
            n_estimators: Number of trees in the forest.
            min_samples_leaf: Forwarded to ``ExtraTreesClassifier``.
            max_depth: Forwarded to ``ExtraTreesClassifier``.
            **kwargs: Passed through to ``RegressionModel.__init__``.
        """
        # call parent function.
        RegressionModel.__init__(self, data, normalize=normalize, **kwargs)

        # Tree-ensemble classifier used as the underlying model.
        self.model = ExtraTreesClassifier(n_estimators=n_estimators,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf)

    def train(self):
        """Fit the model with the training data."""
        # call parent function.
        RegressionModel.train(self)

        # build the forest from the training set
        self.model.fit(self.train_x, self.train_y)

        # update the is_trained variable.
        self.is_trained = True

    def describe(self):
        """Display model information (currently only the parent's output)."""
        # call parent function.
        RegressionModel.describe(self)

        # Tree-based classifiers expose `feature_importances_`, which could
        # be printed or plotted here; that code is currently disabled:
        #print(self.model.feature_importances_)
        #feat_importances = pd.Series(self.model.feature_importances_, index=self.train_x.columns)
        #feat_importances.nlargest(len(self.train_x.columns)).plot(kind='barh')
        #plt.show()

    def _predict_as_series(self, features):
        """Predict *features* and return an int32 Series clipped below at 0."""
        numpy_predictions = self.model.predict(features).flatten().astype(int)
        return pd.Series(numpy_predictions, dtype="int32").clip(lower=0)

    def test(self):
        """Generate train/test predictions from the fitted model and assess them."""
        # call parent function.
        RegressionModel.test(self)

        # predict TRAINING data, then TESTING data.
        self.train_predictions = self._predict_as_series(self.train_x)
        self.test_predictions = self._predict_as_series(self.test_x)

        # assess the performance of the predictions.
        self.assess_performance()
annot=True, cmap="RdYlGn") # ### Applying logistic regression and getting the coefficients # In[970]: model = LogisticRegression(C=10**2) ytest = ytest.to_numpy() xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.10, random_state=0, stratify=y) model.fit(xtrain, ytrain) predicted_classes = model.predict(xtest) accuracy = accuracy_score(ytest.to_numpy().flatten(), predicted_classes) parameters = model.coef_ print("Accuracy: ", accuracy) print("Parameters: ", parameters) # printing the coefficients cm = confusion_matrix(ytest, predicted_classes) print(cm) # ### Getting important features using Random Forrest # In[954]: sel = SelectFromModel(RandomForestClassifier(n_estimators=100)) sel.fit(xtrain, ytrain) # In[955]:
bootstrap=True, bootstrap_features=False, n_jobs=1, random_state=1) bag.fit(X_train, y_train) y_pred = bag.predict(X_test) score = roc_auc_score(y_test, y_pred) print("Bag: Area under ROC {0}".format(score)) model_scores.append(score) s = precision_recall_fscore_support(y_test, y_pred) scores_f1_pre_re.append(s) from sklearn.ensemble import ExtraTreesClassifier etc = ExtraTreesClassifier(n_estimators=150, random_state=20, n_jobs=-1) etc.fit(X_train, y_train) y_pred = etc.predict(X_test) score = roc_auc_score(y_test, y_pred) print("ET: Area under ROC {0}".format(score)) model_scores.append(score) s = precision_recall_fscore_support(y_test, y_pred) scores_f1_pre_re.append(s) import xgboost as xgb gbm = xgb.XGBClassifier(max_depth=500, n_estimators=150, learning_rate=0.15, colsample_bytree=0.35) gbm.fit(X_train, y_train) y_pred = gbm.predict(X_test) score = roc_auc_score(y_test, y_pred)
# Feature subset used both for fitting and for scoring.
best_columns = [
    'f_138', 'f_11', 'f_96', 'f_200', 'f_76', 'f_41',
    'f_83', 'f_156', 'f_131', 'f_84', 'f_182',
]

# -0.8605
exported_pipeline = ExtraTreesClassifier(
    max_features=0.367266672504996,
    criterion='entropy',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=4464,
)
exported_pipeline.fit(X[best_columns], Y['target'])

# --- answer module ---
score_dataset = pd.read_csv('../original_data/x_test.csv',
                            delimiter=';',
                            names=names)
y_pred = exported_pipeline.predict(score_dataset[best_columns])
# Write the predictions without the index column.
answer = pd.Series(y_pred)
answer.to_csv('../data/answer.csv', index=False)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print(accuracy_score(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
# print() returns None, so the count has to be computed before printing;
# otherwise the variable would hold None.
n_errors_Ran = (y_pred2 != y_test).sum()
print(n_errors_Ran)
# The kappa score was computed and thrown away; print it like the others.
print(cohen_kappa_score(y_test, y_pred2))
print(accuracy_score(y_train, model2.predict(X_train)))

################################### ExtraTreesClassifier ###################################
from sklearn.ensemble import ExtraTreesClassifier
model3 = ExtraTreesClassifier()
model3.fit(X_train, y_train)
y_pred3 = model3.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))
print(classification_report(y_test, y_pred3))
n_errors_ext = (y_pred3 != y_test).sum()
print(n_errors_ext)
print(cohen_kappa_score(y_test, y_pred3))
print(accuracy_score(y_train, model3.predict(X_train)))

################################### Support Vector Machine ###################################
from sklearn.svm import SVC
model4 = SVC()
# In[17]:

from sklearn.model_selection import train_test_split

# In[18]:

# Hold out 30% of the rows for evaluation.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    x, y, test_size=0.3)

# In[20]:

from sklearn.ensemble import ExtraTreesClassifier

# fit() returns the estimator itself, so construction and fitting chain.
modelo = ExtraTreesClassifier().fit(x_treino, y_treino)
resultado = modelo.score(x_teste, y_teste)
print("Acurácia:", resultado)

# In[21]:

# Predict three held-out rows to compare against their true labels below.
previsoes = modelo.predict(x_teste[400:403])

# In[22]:

previsoes

# In[23]:

y_teste[400:403]

# In[ ]:
feature_set_test.append(feature_extraction(Xtest[i][j])) feature_sets_train = np.array(feature_set_train) feature_sets_test = np.array(feature_set_test) print("Loading Feature Set Matrix...") print("FeatureSet Train: ", feature_sets_train.shape) print("FeatureSet Test: ", feature_sets_test.shape) # In[9]: ytrain = ytrain.reshape(-1, ) ytest = ytest.reshape(-1, ) # print ("ytrain Reshaped!") # In[10]: Emodel = ExtraTreesClassifier(n_estimators=150) Emodel.fit(feature_sets_train, ytrain) # In[11]: t1 = time() pred = Emodel.predict(feature_sets_test[0].reshape(1, -1)) print("Running the Classifier, Sony Mixed mode... ") print("Predicted Label: ", pred[0]) t2 = time() print("Time taken per prediction (in sec): ", t2 - t1) # In[ ]:
n_estimators=30, bootstrap=True, max_features=None, max_depth=7, max_leaf_nodes=7) et_clf # In[39]: et_clf.fit(x_train, y_train) print(et_clf.score(x_train, y_train)) print(et_clf.score(x_test, y_test)) # In[40]: print(confusion_matrix(et_clf.predict(x_test), y_test)) print(f1_score(et_clf.predict(x_test), y_test, average='macro')) # In[41]: test_pred = et_clf.predict(test_data[train_columns]) submission['target'] = np.where(test_pred == 0, 'low', np.where(test_pred == 1, 'medium', 'high')) print(np.unique(submission['target'], return_counts=True)) submission.to_csv(cwd + "/submission_v2.csv", index=False) # ### Analysing with Logistic Regression Model # In[42]: from sklearn.linear_model import LogisticRegression
n_estimators=50, bootstrap=True, n_jobs=-1, oob_score=True, bootstrap_features=True, max_features=0.5) print('Training model..') bag_clf_2.fit(X_train, y_train) print('Done') print('oob score:', bag_clf_2.oob_score_) print('Making predictions..') y_pred = bag_clf_2.predict(X_test) print('Accuracy:', accuracy_score(y_test, y_pred)) RandomForest Classifier rf_clf = RandomForestClassifier(n_estimators=50, bootstrap=True, max_leaf_nodes=16, n_jobs=-1, oob_score=True) print('Training model..') rf_clf.fit(X_train, y_train) print('Done.') print('oob score:', rf_clf.oob_score_) print('Making predictions..') y_pred = rf_clf.predict(X_test) print('Accuracy:', accuracy_score(y_test, y_pred)) # ExtraTree Classifier ext_clf = ExtraTreesClassifier(n_estimators=50, bootstrap=True, max_leaf_nodes=16, n_jobs=-1, oob_score=True) print('Training model..') ext_clf.fit(X_train, y_train) print('Done.') print('oob score:', ext_clf.oob_score_) print('Making predictions..') y_pred = ext_clf.predict(X_test) print('Accuracy:', accuracy_score(y_test, y_pred))
# Stacked model: note that fit() also receives X_test and predict() takes
# no argument.
stacking_model = StackingModel(topLayer_model, base_model_list)
stacking_model.fit(X_train, y_train, X_test)
print('stacking_model:', getAuc(y_test, stacking_model.predict()))

print("other_model>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
rf_model = RandomForestClassifier()
adb_model = AdaBoostClassifier()
gdbc_model = GradientBoostingClassifier()
et_model = ExtraTreesClassifier()

# Baselines with default settings: fit all of them first, then report each
# one with its label, in the same order.
baselines = [
    ('rf_model:', rf_model),
    ('adb_model:', adb_model),
    ('gdbc_model:', gdbc_model),
    ('et_model:', et_model),
]
for _, baseline in baselines:
    baseline.fit(X_train, y_train)
for label, baseline in baselines:
    print(label, getAuc(y_test, baseline.predict(X_test)))

# Author's note (translated): finally got stacking working -- so happy! It
# used to look terribly mysterious; once understood, it is really simple.
'''
    终于搞定stacking,太开心了!!!
    之前一直以为,这个东西贼他妈神秘;搞懂了,发现贼他妈简单!!!!
'''
###################################################################################################### # Train X_train, X_test, y_train, y_test = train_test_split(data2, training_labels, stratify=training_labels, test_size=0.25) percent = list() for bagging_run in range(0, 30): # train 3 classifiers final_preds = [] for x in range(0, 3): clf = ExtraTreesClassifier(n_estimators=100) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) final_preds.append(y_pred) # print(classification_report(y_test, y_pred, labels=[1, 2, 3, 4, 5, 6, 7])) summary = np.zeros(shape=(len(final_preds[0]), len(final_preds) + 1)) # Create an array with all predictions in a row # with the correct prediction at the end # summary = pred pred pred ... correct_y # loop through all predictions for j in range(len(final_preds[0])): # loop through number of predictor models for i in range(len(final_preds)): summary[j][i] = final_preds[i][j] # Append correct pred summary[j][summary[0].shape[0] - 1] = y_test.iloc[j]
# py-earth `Earth` model (the original header called this "naive bayes")
from matplotlib import pyplot

# create model
from pyearth import Earth

model = Earth()

# fit the earth model
model.fit(X, y)
print(" Model:")
print(model)

# make predictions on the same data the model was fitted on
expected = y
predicted = model.predict(X)

# since the quality can only be a number, round all the outputs off
for i, raw_value in enumerate(predicted):
    predicted[i] = int(round(raw_value))

# check how far the predictions are from actual values
difference = []
total_diff = 0.
for i in range(len(y)):
    diff = predicted[i] - expected[i]
    total_diff += abs(diff)
    difference.append(diff)

# mean absolute deviation of the rounded predictions
diversion = total_diff / len(y)

# check how many of predictions match actual values
class ExtraTreesClassifier:
    """Wrapper that grows a scikit-learn extra-trees forest step by step.

    The wrapped estimator is created with ``warm_start=True`` and its
    ``n_estimators`` is raised on each call to :meth:`iterative_fit` until
    it reaches :meth:`get_max_iter`.
    """

    def __init__(self, criterion, min_samples_leaf, min_samples_split,
                 max_features, bootstrap, max_leaf_nodes, max_depth,
                 min_weight_fraction_leaf, min_impurity_decrease,
                 oob_score=False, n_jobs=1, random_state=None, verbose=0,
                 class_weight=None):
        """Validate and normalise the hyper-parameters.

        Raises:
            ValueError: If ``criterion`` is neither ``"gini"`` nor ``"entropy"``.
        """
        self.n_estimators = self.get_max_iter()
        self.estimator_increment = 10

        if criterion not in ("gini", "entropy"):
            raise ValueError(
                "'criterion' is not in ('gini', 'entropy'): %s" % criterion)
        self.criterion = criterion

        # "None-like" values (as decided by check_none) disable the limit.
        self.max_depth = None if check_none(max_depth) else int(max_depth)
        self.max_leaf_nodes = (None if check_none(max_leaf_nodes)
                               else int(max_leaf_nodes))

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.max_features = float(max_features)
        self.bootstrap = check_for_bool(bootstrap)
        self.min_weight_fraction_leaf = float(min_weight_fraction_leaf)
        self.min_impurity_decrease = float(min_impurity_decrease)
        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.class_weight = class_weight
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        """Fit from scratch, adding trees in growing steps until complete.

        Returns:
            ``self``.
        """
        # Start with a fresh 2-tree forest, then add 2, 4, 8, ... trees.
        self.iterative_fit(X, y, n_iter=2, refit=True,
                           sample_weight=sample_weight)
        step = 2
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=int(2 ** step / 2),
                               sample_weight=sample_weight)
            step += 1
        return self

    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        """Create the forest, or add up to ``n_iter`` trees to it, and fit.

        Args:
            X: Training features; ``X.shape[1]`` is read.
            y: Training targets.
            sample_weight: Forwarded to the estimator's ``fit``.
            n_iter: Number of trees to start with or to add.
            refit: When true, discard any existing estimator first.

        Returns:
            ``self``.
        """
        from sklearn.ensemble import ExtraTreesClassifier as ETC

        if refit:
            self.estimator = None

        if self.estimator is not None:
            # Warm start: raise the tree count, capped at the maximum.
            grown = self.estimator.n_estimators + n_iter
            self.estimator.n_estimators = min(grown, self.n_estimators)
        else:
            # max_features is used as an exponent on the feature count.
            max_features = int(X.shape[1] ** float(self.max_features))
            self.estimator = ETC(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight,
                warm_start=True)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def configuration_fully_fitted(self):
        """Return True once the forest holds the maximum number of trees."""
        if self.estimator is None:
            return False
        return len(self.estimator.estimators_) >= self.n_estimators

    def predict(self, X):
        """Predict with the fitted forest.

        Raises:
            NotImplementedError: If no estimator has been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Return class probabilities, passed through the multilabel converter.

        Raises:
            NotImplementedError: If no estimator has been fitted yet.
        """
        if self.estimator is None:
            raise NotImplementedError()
        raw_probas = self.estimator.predict_proba(X)
        return convert_multioutput_multiclass_to_multilabel(raw_probas)

    @staticmethod
    def get_max_iter():
        """Maximum number of trees the forest is grown to."""
        return 512
# Report for the classifier fitted just before this section.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
scores = cross_val_score(ada_boost_classifier, X_train, y_train,
                         cv=10, scoring='accuracy')
print(scores.mean())

# Each section below: fit on the training split, print the confusion matrix
# and accuracy on the test split, then the mean 10-fold CV accuracy.
print("BaggingClassifier")
bagging_classifier = BaggingClassifier()
y_pred = bagging_classifier.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
scores = cross_val_score(bagging_classifier, X_train, y_train,
                         cv=10, scoring='accuracy')
print(scores.mean())

print("ExtraTreesClassifier")
extra_trees_classifier = ExtraTreesClassifier(n_estimators=100)
y_pred = extra_trees_classifier.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
scores = cross_val_score(extra_trees_classifier, X_train, y_train,
                         cv=10, scoring='accuracy')
print(scores.mean())

print("GradientBoostingClassifier")
gradient_boosting_classifier = GradientBoostingClassifier()
y_pred = gradient_boosting_classifier.fit(X_train, y_train).predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
scores = cross_val_score(gradient_boosting_classifier, X_train, y_train,
                         cv=10, scoring='accuracy')
print(scores.mean())

print("RandomForestClassifier")
random_forest_classifier = RandomForestClassifier(n_estimators=100)
random_forest_classifier.fit(X_train, y_train)
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# In[ ]:

# random_forest = RandomForestClassifier(n_estimators=100)
# random_forest.fit(x_tr, y_tr)
# random_forest.score(x_tr, y_tr)

# Fit a 400-tree extra-trees model on all of x/y and predict the test rows.
etc = ExtraTreesClassifier(n_estimators=400)
etc.fit(x, y)
ypred = etc.predict(test_x)

# In[ ]:

# ypred=random_forest.predict(test_x)

# In[ ]:

passenger = test2['PassengerId']

# In[ ]:

# Pair every passenger id with its predicted label.
submission_columns = {"PassengerId": passenger, "Survived": ypred}
submission = pd.DataFrame(submission_columns)

# In[ ]:
print(f1)
print(f2)
# harmonic mean of f1 and f2
print(f1 * f2 / (f1 + f2) * 2)

rf3 = RandomForestClassifier(n_estimators=100,
                             max_depth=6,
                             min_samples_split=2,
                             class_weight="balanced")

###### extratrees
from sklearn.ensemble import ExtraTreesClassifier

etc = ExtraTreesClassifier(n_estimators=100,
                           max_depth=10,
                           min_samples_split=2,
                           random_state=0)
etc.fit(x1, y1)
r4 = etc.predict(xt1)
r5 = etc.predict(ax1)
r6 = etc.predict(nx1)
r7 = etc.predict(x1)

# Absolute and relative frequencies of the predicted labels.
r5_counts = pd.Series(r5).value_counts()
r6_counts = pd.Series(r6).value_counts()
r6_shares = pd.Series(r6).value_counts(normalize=True)

print(pd.Series(r4).value_counts())
print(pd.Series(r5).value_counts(normalize=True)[0])
print(r6_shares[1])
print(pd.Series(r7).value_counts())

# f1: share of label 1 among the predictions for nx1.
# f2: label-1 count for nx1 over the label-1 count for nx1 and ax1 together.
f1 = r6_shares[1]
f2 = r6_counts[1] / (r6_counts[1] + r5_counts[1])
print(f1)
print(f2)
print(f1 * f2 / (f1 + f2) * 2)

##### adaboost
def cross_val(clf_name, X, y, n_folds=5, proba=False, score=accuracy_score, *params, **kwargs):
    """Cross-validate one of a fixed set of preconfigured classifiers.

    Args:
        clf_name: Key selecting the classifier (e.g. ``"extra"``, ``"grad"``,
            ``"random"``, ``"svm"``, ...).
        X: Feature matrix, indexed as ``X[rows, :]``.
        y: Target array, indexed as ``y[rows]``.
        n_folds: Number of stratified folds.
        proba: If true, collect ``predict_proba`` output instead of labels.
        score: Callable ``score(y_true, y_pred)`` printed per fold and overall.
        *params, **kwargs: Forwarded to the selected estimator's constructor
            (not used for ``"sgd"``).

    Returns:
        Out-of-fold predictions: shape ``y.shape`` for labels, or
        ``(len(y), n_classes)`` when ``proba`` is true.

    Raises:
        ValueError: If ``clf_name`` is not a known key.
    """
    # Factories are lazy so that only the requested estimator is built.
    factories = {
        "extra": lambda: ExtraTreesClassifier(
            12, max_depth=23, max_features=10, n_jobs=-1, *params, **kwargs),
        "grad": lambda: GradientBoostingClassifier(
            n_estimators=40, learning_rate=0.1, *params, **kwargs),
        "cgrad": lambda: CalibratedClassifierCV(
            base_estimator=GradientBoostingClassifier(
                n_estimators=20, learning_rate=0.1, *params, **kwargs),
            method='isotonic', cv=10),
        "cmulti": lambda: CalibratedClassifierCV(
            base_estimator=MultinomialNB(alpha=alpha_multi, *params, **kwargs),
            method='isotonic', cv=10),
        "multi": lambda: MultinomialNB(*params, **kwargs),
        "bag": lambda: BaggingClassifier(
            base_estimator=MultinomialNB(alpha=0.5, *params, **kwargs),
            n_estimators=100, n_jobs=-1),
        "bern": lambda: BernoulliNB(alpha=0.00000000001, *params, **kwargs),
        "gauss": lambda: GaussianNB(*params, **kwargs),
        "random": lambda: RandomForestClassifier(
            1200, max_depth=23, max_features=10, n_jobs=-1, *params, **kwargs),
        "lda": lambda: LinearDiscriminantAnalysis(*params, **kwargs),
        "logistic": lambda: LogisticRegression(C=1, *params, **kwargs),
        "svm": lambda: LinearSVC(C=100, *params, **kwargs),
        "knn": lambda: KNeighborsClassifier(n_neighbors=20, *params, **kwargs),
        "near": lambda: NearestCentroid(*params, **kwargs),
        "ridge": lambda: OneVsOneClassifier(
            RidgeClassifier(alpha=0.1, *params, **kwargs)),
        "sgd": lambda: SGDClassifier(
            loss="hinge", penalty="l2", n_iter=50, alpha=0.000001,
            fit_intercept=True, average=True),
    }
    if clf_name not in factories:
        # Previously an unknown name left the classifier unbound and failed
        # later with a NameError.
        raise ValueError("unknown clf_name %r; expected one of %s"
                         % (clf_name, sorted(factories)))

    cv = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=41)
    c = factories[clf_name]()

    if proba:
        # predict_proba returns one column per class, so the buffer must be
        # 2-D; this assumes every class occurs in each training fold.
        y_pred = np.zeros((len(y), len(np.unique(y))))
    else:
        y_pred = np.zeros(y.shape)

    score_list = []
    for train, test in cv:
        c.fit(X[train, :], y[train])
        if proba:
            y_pred[test] = c.predict_proba(X[test, :])
        else:
            y_pred[test] = c.predict(X[test, :])
        fold_score = score(y[test], y_pred[test])
        score_list.append(fold_score)
        print(fold_score)
    print("Final score", score(y, y_pred))
    return y_pred
# Standardise with statistics learned from the training set only and reuse
# them for the test set. Scaling each set independently with scale() gave
# the test features a different transformation from the one the models
# were fitted on.
from sklearn.preprocessing import StandardScaler

feature_scaler = StandardScaler().fit(training_features)
training_features = feature_scaler.transform(training_features)
testing_features = feature_scaler.transform(testing_features)

SVM_lin.fit(training_features, labels_array_train)
predictions_SVM_lin = SVM_lin.predict(testing_features)
print("f1 Score SVM with linear kernel : ",
      f1_score(y_true=labels_array_test, y_pred=predictions_SVM_lin))

SVM_rbf.fit(training_features, labels_array_train)
predictions_SVM_rbf = SVM_rbf.predict(testing_features)
print("f1 Score SVM with Gaussian kernel : ",
      f1_score(y_true=labels_array_test, y_pred=predictions_SVM_rbf))

xtr.fit(training_features, labels_array_train)
predictions_XTR = xtr.predict(testing_features)
print("f1 Score XTR : ",
      f1_score(y_true=labels_array_test, y_pred=predictions_XTR))

gbm.fit(training_features, labels_array_train)
predictions_XGB = gbm.predict(testing_features)
print("f1 Score XGB : ",
      f1_score(y_true=labels_array_test, y_pred=predictions_XGB))

#model.fit(training_features, labels_array_train, nb_epoch = 4, batch_size = 258)
#predictions_DL = model.predict_classes(testing_features)
#print("f1 Score DL : ", f1_score(y_true=labels_array_test, y_pred = np.round(predictions_DL)))

#==============================================================================
# Issue prediction
np.array(gbdt.predict_proba(x_test)[:, 1]) > 0.4)) #--Weighted averaging--# wa_prob = 0.2 * gbdt.predict_proba(x_test)[:, 1] + 0.5 * rf.predict_proba( x_test)[:, 1] + 0.2 * et.predict_proba( x_test)[:, 1] + 0.1 * lr1.predict_proba(x_test)[:, 1] wa_brier.append(brier_score_loss(y_test, wa_prob)) print('accuracy rate of weighted averaging:', accuracy_score(y_test, np.array(wa_prob) >= 0.5)) wa_score1.append(accuracy_score(y_test, np.array(wa_prob) >= 0.5)) del wa_prob #--Majority voting--# mv_prob = 0.24 * gbdt.predict(x_test) + 0.27 * rf.predict( x_test) + 0.24 * lr1.predict(x_test) + 0.25 * et.predict(x_test) mv_brier.append(brier_score_loss(y_test, mv_prob)) print('accuracy rate of majority voting:', accuracy_score(y_test, np.array(mv_prob) >= 0.5)) mv_score1.append(accuracy_score(y_test, np.array(mv_prob) >= 0.5)) del mv_prob #----------Output accuracy rate----------# #print ('accuracy rate of support vector machine:', mean(svc_accuracy) ) print('accuracy rate of k nearest neighbors:', mean(knn_accuracy)) print('accuracy rate of logistic regression with lasso:', mean(lr1_accuracy)) print('accuracy rate of logistic regression with ridge:', mean(lr2_accuracy)) print('accuracy rate of decision tree:', mean(dt_accuracy)) print('accuracy rate of extremely randomized trees:', mean(et_accuracy)) print('accuracy rate of random forest:', mean(rf_accuracy))
class Extractor(BaseEstimator, ClassifierMixin):
    """
    An sklearn-style classifier that extracts the main content (and/or comments)
    from an HTML document.

    Args:
        blockifier (``Blockifier``): Object whose ``blockify()`` method splits
            an HTML document into a sequence of blocks.
        features (str or List[str], ``Features`` or List[``Features``], or List[Tuple[str, ``Features``]]):
            One or more features to be used to transform blocks into a matrix of
            numeric values. If more than one, a :class:`FeatureUnion` is
            automatically constructed. See :func:`get_and_union_features`.
        model (:class:`ClassifierMixin`): A scikit-learn classifier that takes
            a numeric matrix of features and outputs a binary prediction of
            1 for content or 0 for not-content. If None, a :class:`ExtraTreesClassifier`
            with default parameters is used.
        to_extract (str or Sequence[str]): Type of information to extract from
            an HTML document: 'content', 'comments', or both via ['content', 'comments'].
        prob_threshold (float): Minimum prediction probability of a block being
            classified as "content" for it actually be taken as such. If None,
            the model's ``predict()`` output is used directly.
        max_block_weight (int): Maximum weight that a single block may be given
            when training the extractor model, where weights are set equal to
            the number of tokens in each block. If None, weights are not capped.

    Note:
        If ``prob_threshold`` is not None, then ``model`` must implement the
        ``predict_proba()`` method.
    """

    def __init__(self, blockifier=TagCountNoCSSReadabilityBlockifier,
                 features=('kohlschuetter', 'weninger', 'readability'),
                 model=None,
                 to_extract='content', prob_threshold=0.5, max_block_weight=200):
        self.blockifier = blockifier
        # goes through the ``features`` property setter below
        self.features = features
        # initialize model
        if model is None:
            self.model = ExtraTreesClassifier()
        elif isinstance(model, ClassifierMixin):
            self.model = model
        else:
            raise TypeError('invalid `model` type: "{}"'.format(type(model)))
        # normalise ``to_extract`` to a tuple so membership tests are uniform
        if isinstance(to_extract, string_):
            self.to_extract = (to_extract, )
        else:
            self.to_extract = tuple(to_extract)
        self.prob_threshold = prob_threshold
        self.max_block_weight = max_block_weight
        # column of ``predict_proba`` output for class 1; resolved lazily
        self._positive_idx = None

    @property
    def features(self):
        """The feature transformer built from the ``features`` argument."""
        return self._features

    @features.setter
    def features(self, feats):
        self._features = get_and_union_features(feats)

    def fit(self, documents, labels, weights=None):
        """
        Fit :class:`Extractor` features and model to a training dataset.

        Documents that blockify into fewer than three blocks are dropped,
        together with their labels and weights, before training.

        Args:
            documents (Sequence): Documents to pass to ``blockifier.blockify()``,
                one per training example.
            labels (Sequence[``np.ndarray``]): One array of block labels per
                document.
            weights (Sequence[``np.ndarray``]): One array of block weights per
                document, passed to the model as ``sample_weight``. If None,
                the model is fit without sample weights.

        Returns:
            :class:`Extractor`: ``self``
        """
        # Keep the per-document groups in plain lists: they are ragged, and
        # building an ndarray from ragged sequences fails on recent NumPy.
        block_groups = [self.blockifier.blockify(doc) for doc in documents]
        mask = [self._has_enough_blocks(blocks) for blocks in block_groups]
        kept_groups = [
            blocks for blocks, keep in zip(block_groups, mask) if keep]
        labels = np.concatenate(
            [doc_labels for doc_labels, keep in zip(labels, mask) if keep])
        # TODO: This only 'fit's one doc at a time. No feature fitting actually
        # happens for now, but this might be important if the features change
        features_mat = np.concatenate(
            [self.features.fit_transform(blocks) for blocks in kept_groups])
        if weights is None:
            self.model.fit(features_mat, labels)
        else:
            weights = np.concatenate(
                [doc_weights for doc_weights, keep in zip(weights, mask) if keep])
            self.model.fit(features_mat, labels, sample_weight=weights)
        # the model's class ordering may have changed; re-resolve on next predict
        self._positive_idx = None
        return self

    def get_html_labels_weights(self, data):
        """
        Gather the html, labels, and weights of many files' data.
        Primarily useful for training/testing an :class:`Extractor`.

        Args:
            data: Output of :func:`dragnet.data_processing.prepare_all_data`;
                iterated as ``(html, content, comments)`` triples.

        Returns:
            Tuple[np.array, np.array, np.array]: All html, all labels, and all
            weights, respectively, with one entry per item of ``data``.
        """
        all_html = []
        all_labels = []
        all_weights = []
        for html, content, comments in data:
            all_html.append(html)
            labels, weights = self._get_labels_and_weights(content, comments)
            all_labels.append(labels)
            all_weights.append(weights)
        return np.array(all_html), np.array(all_labels), np.array(all_weights)

    def _has_enough_blocks(self, blocks):
        """Return True if ``blocks`` has at least 3 items; otherwise log a warning and return False."""
        if len(blocks) < 3:
            logging.warning('extraction failed: too few blocks (%s)', len(blocks))
            return False
        return True

    def _get_labels_and_weights(self, content, comments):
        """
        Select the block labels and weights matching ``self.to_extract``.

        Args:
            content (Tuple[np.array[int], np.array[int], List[str]])
            comments (Tuple[np.array[int], np.array[int], List[str]])

        Returns:
            Tuple[np.array[int], np.array[int]]: Labels and weights. Weights
            are capped at ``self.max_block_weight`` unless that is None.
        """
        want_content = 'content' in self.to_extract
        want_comments = 'comments' in self.to_extract
        if want_content and want_comments:
            # a block is positive if it is either content or comments
            labels = np.logical_or(content[0], comments[0]).astype(int)
            weights = content[1]
        elif want_content:
            labels = content[0]
            weights = content[1]
        else:
            # extract comments only
            labels = comments[0]
            weights = comments[1]
        if self.max_block_weight is not None:
            weights = np.minimum(weights, self.max_block_weight)
        return labels, weights

    def extract(self, html, encoding=None, as_blocks=False):
        """
        Extract the main content and/or comments from an HTML document and
        return it as a string or as a sequence of block objects.

        Args:
            html (str): HTML document as a string.
            encoding (str): Encoding of ``html``. If None (encoding unknown), the
                original encoding will be guessed from the HTML itself.
            as_blocks (bool): If False, return the main content as a combined
                string; if True, return the content-holding blocks as a list of
                block objects.

        Returns:
            str or List[Block]
        """
        preds, blocks = self.predict(html, encoding=encoding, return_blocks=True)
        if as_blocks is False:
            # block text is joined with a bytes separator, then cast to str
            return str_cast(b'\n'.join(blocks[ind].text for ind in np.flatnonzero(preds)))
        else:
            return [blocks[ind] for ind in np.flatnonzero(preds)]

    def predict(self, documents, **kwargs):
        """
        Predict class (content=1 or not-content=0) of the blocks in one or many
        HTML document(s).

        Args:
            documents (str or List[str]): HTML document(s)
            **kwargs: Forwarded to :meth:`_predict_one` (``encoding``,
                ``return_blocks``).

        Returns:
            ``np.ndarray``: Binary predictions for content (1) or not-content
            (0). For a single document this is whatever :meth:`_predict_one`
            returns; for several documents the per-document results are
            concatenated into one array.
        """
        if isinstance(documents, (str, bytes, unicode_, np.unicode_, etree._Element)):
            return self._predict_one(documents, **kwargs)
        else:
            return np.concatenate(
                [self._predict_one(doc, **kwargs) for doc in documents])

    def _predict_one(self, document, encoding=None, return_blocks=False):
        """
        Predict class (content=1 or not-content=0) of each block in an HTML
        document.

        Args:
            document (str): HTML document
            encoding (str): Passed through to ``blockifier.blockify()``.
            return_blocks (bool): If True, also return the blocks the
                predictions refer to.

        Returns:
            ``np.ndarray``: array of binary predictions for content (1) or
            not-content (0); or the tuple ``(predictions, blocks)`` if
            ``return_blocks`` is True.
        """
        # blockify
        blocks = self.blockifier.blockify(document, encoding=encoding)
        # get features
        try:
            features = self.features.transform(blocks)
        except ValueError:
            # Can't make features, predict no content
            preds = np.zeros((len(blocks)))
        # make predictions
        else:
            if self.prob_threshold is None:
                preds = self.model.predict(features)
            else:
                # Compare against None explicitly: a cached index of 0 is
                # falsy and would otherwise be recomputed on every call.
                if self._positive_idx is None:
                    self._positive_idx = list(self.model.classes_).index(1)
                preds = self.model.predict_proba(features) > self.prob_threshold
                preds = preds[:, self._positive_idx].astype(int)

        if return_blocks:
            return preds, blocks
        else:
            return preds