def extratree(typ, X_train, Y_train, X_test, Y_test, text):
    text.delete(1.0, tk.END)
    text.insert(
        tk.END,
        "\n\nIMPORTING ExtraTree" + "\nProcessing this might take a while...",
        "bold")
    text.update_idletasks()
    from sklearn.tree import ExtraTreeClassifier
    ETC = ExtraTreeClassifier()
    ETC.fit(X_train, Y_train)
    Y_pred = ETC.predict(X_test)
    # Report, ROC AUC and confusion matrix use the (y_true, y_pred) argument order.
    text.insert(
        tk.END,
        "\n\nExtra Tree Classifier report \n" + classification_report(Y_test, Y_pred),
        "bold")
    text.insert(
        tk.END,
        "*****roc_auc_score: %0.3f*****\n" % roc_auc_score(Y_test, Y_pred),
        "bold")
    text.insert(
        tk.END,
        "Extra Tree Classifier confusion matrix \n" + str(confusion_matrix(Y_test, Y_pred)),
        "bold")
    score = accuracy_score(Y_test, Y_pred)
    text.insert(tk.END, "Extra tree score= " + str(score), "bold")
    text.update_idletasks()
    roc_curve_acc(Y_test, Y_pred, 'ETC')
    if typ == "s":
        plt.show()
    elif typ == "a":
        pass
def dTree(data, labels, test, impurity="gini", mdepth=None):
    newData = pd.DataFrame()
    newTest = pd.DataFrame()
    le = LabelEncoder()
    # Note: the encoder is re-fit on every column of both frames, so train and test
    # columns must share the same categories for the encodings to stay consistent.
    for datum in data:
        newData[datum] = le.fit_transform(data[datum])
    for testItem in test:
        newTest[testItem] = le.fit_transform(test[testItem])
    tree1 = DecisionTreeClassifier(criterion=impurity, max_depth=mdepth, random_state=42)
    tree2 = ExtraTreeClassifier(criterion=impurity, max_depth=mdepth, random_state=42)
    tree3 = RandomForestClassifier(criterion=impurity, max_depth=mdepth, random_state=42)
    tree1.fit(newData, labels)
    tree2.fit(newData, labels)
    tree3.fit(newData, labels)
    predict1 = tree1.predict(newTest)
    print("tree1", evaluate(predict1, validation_genres))
    predict2 = tree2.predict(newTest)
    print("tree2", evaluate(predict2, validation_genres))
    predict3 = tree3.predict(newTest)
    print("tree3", evaluate(predict3, validation_genres))
    combined_prediction = voting([predict1, predict2, predict3], [1, 1, 1])
    return combined_prediction
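# The voting() helper called above is not shown in this snippet. A minimal
# weighted-majority-vote sketch consistent with the call voting([p1, p2, p3], [1, 1, 1])
# could look like the following; the name and tie-breaking behavior are assumptions.
from collections import Counter

def voting(predictions, weights):
    """Combine per-classifier prediction arrays by weighted majority vote."""
    combined = []
    for sample_preds in zip(*predictions):
        tally = Counter()
        for label, weight in zip(sample_preds, weights):
            tally[label] += weight
        combined.append(tally.most_common(1)[0][0])  # highest-weighted label wins
    return combined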
def apply_extra_trees_classifier(trainData, targetTrain, testData, targetTest):
    """Applies the extra tree algorithm to the dataset, tuning various parameters.

    Args:
        trainData, targetTrain: training features and class labels.
        testData, targetTest: test features and class labels.
    """
    # Fit an extremely randomized tree to the data.
    # max_features='sqrt' replaces the removed 'auto' option (same behavior for classifiers).
    etc = ExtraTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                              max_features='sqrt', max_leaf_nodes=None,
                              min_samples_leaf=1, min_samples_split=2,
                              min_weight_fraction_leaf=0.0, random_state=None,
                              splitter='random')
    etc.fit(trainData, targetTrain)
    print(etc)
    # make predictions
    expected = targetTest
    predicted = etc.predict(testData)
    # summarize the fit of the model
    print(accuracy_score(expected, predicted))
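# A minimal usage sketch for apply_extra_trees_classifier, assuming the iris dataset
# is an acceptable stand-in for the real data; the split sizes are illustrative.
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
apply_extra_trees_classifier(X_tr, y_tr, X_te, y_te)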
def extra_tree_classifier(self):
    self.log.writeToLog('Running Extra Tree Classifier Model...')
    X_train, X_test, y_train, y_test = self.train_test_split()
    et = ExtraTreeClassifier()
    trained_model = et.fit(X_train, y_train)
    self.save_pickle(trained_model)
    y_pred = et.predict(X_test)
    self.model_auc_roc(y_test, y_pred, "Extra Tree Classifier Model")
    self.model_evaluation(y_test, y_pred, "Extra Tree Classifier Model")
class ExtraTreeClassifier(Classifier):
    def __init__(self, matrixdatabase):
        self._matrix_database = matrixdatabase
        self._has_fit = False
        self._etc = ETC()

    def learn(self, ingredients, cuisine):
        return

    def classify(self, ingredients):
        if not self._has_fit:
            matrix, classes = self._matrix_database.make_train_matrix()
            self._etc = self._etc.fit(matrix, classes)
            print('Fitting complete...')
            self._has_fit = True
        output = self._etc.predict(
            self._matrix_database.make_row_from_recipe(ingredients))
        return output[0]
def fit(self, X, y):
    """Build a random decision tree based classifier from the training set (X, y)."""
    # Remove protected features
    X_protect = np.delete(X, [self.prot_class], axis=1)
    num_tr = len(y)
    num_prot_1 = sum(X[:, self.prot_class])
    num_prot_0 = num_tr - num_prot_1
    i = 0
    fair_trees = []
    predictions = []
    # Keep sampling trees until num_fair_trees of them satisfy the fairness threshold
    while i < self.num_fair_trees:
        new_tree = ExtraTreeClassifier(
            max_depth=self.max_depth,
            min_samples_leaf=self.min_samples_leaf,
            max_features=1)
        new_tree.fit(X_protect, y)
        new_prediction = new_tree.predict(X_protect)
        # Statistical parity: compare the rate at which dropout (label 0) is predicted
        # for the protected group (prot_class == 1) versus the rest.
        num_pred_1 = len([
            e for e in range(0, num_tr)
            if new_prediction[e] == 0 and X[e, self.prot_class] == 1
        ])
        num_pred_0 = len([
            e for e in range(0, num_tr)
            if new_prediction[e] == 0 and X[e, self.prot_class] == 0
        ])
        stat_parity = abs(num_pred_1 / num_prot_1 - num_pred_0 / num_prot_0)
        if stat_parity < self.rho:
            i += 1
            fair_trees.append(new_tree)
            predictions.append(new_prediction)
    self.ridge_model.fit(np.transpose(np.asarray(predictions)), y)
    self.decision_trees = fair_trees
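# Standalone sketch of the statistical-parity quantity the loop above thresholds
# against rho. The helper name and the favorable-label convention are illustrative,
# not part of the original class.
import numpy as np

def statistical_parity_difference(y_pred, protected, favorable_label=0):
    """Absolute difference in the rate of the favorable prediction between groups."""
    y_pred = np.asarray(y_pred)
    protected = np.asarray(protected)
    rate_protected = np.mean(y_pred[protected == 1] == favorable_label)
    rate_rest = np.mean(y_pred[protected == 0] == favorable_label)
    return abs(rate_protected - rate_rest)

# Example: predictions balanced across groups give a difference of 0.0
print(statistical_parity_difference([0, 1, 0, 1], [1, 1, 0, 0]))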
def train_extratree_model():
    results_extratree_model = {}
    results_extratree_model['acc'] = []
    results_extratree_model['p_r_f1_s'] = []
    for i in range(30):
        train_features, train_labels = get_train_data()
        test_features, test_labels = get_test_data()
        clf = ExtraTreeClassifier()
        clf.fit(train_features, train_labels)
        predictions = clf.predict(test_features)
        p_r_f1_s = precision_recall_fscore_support(test_labels, predictions)
        acc = accuracy_score(test_labels, predictions)
        print("ExtraTree Model Classifier : ", acc)
        print("ExtraTree Model Classifier Precision, Recall, F1-Score, Support: ",
              p_r_f1_s)
        results_extratree_model['acc'].append(acc)
        results_extratree_model['p_r_f1_s'].append(p_r_f1_s)
        time.sleep(10)
    pickle.dump(results_extratree_model,
                open('results_extratree_model.pkl', 'wb'))
clf = ExtraTreeClassifier(random_state=103, splitter='random', max_features=9)

## Get dataset
X = np.array(traindata.iloc[:, :10])
y = np.array(traindata.iloc[:, 10])

## Build decision tree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=11)
clf.fit(X_train, y_train)
print('Finish Extra tree training')
predicttest = clf.predict(X_test)

## Count clicks (0 or 1)
countClick = [0, 0]
for i in predicttest:
    if i == 0:
        countClick[0] += 1
    else:
        countClick[1] += 1
print(countClick)

## Get accuracy, precision, recall, f_measure
tn, fp, fn, tp = confusion_matrix(y_test, predicttest).ravel()
print('tp: {0}, tn: {1}, fp: {2}, fn: {3}'.format(tp, tn, fp, fn))
acc = float((tp + tn) / (tp + tn + fp + fn))
precision = float(tp / (tp + fp))
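# The comment above also mentions recall and f_measure; they could be completed from
# the same confusion-matrix counts as follows (a sketch, not part of the original code).
recall = float(tp / (tp + fn))
f_measure = 2 * precision * recall / (precision + recall)
print('accuracy: {0}, precision: {1}, recall: {2}, f_measure: {3}'.format(
    acc, precision, recall, f_measure))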
sc = MinMaxScaler(feature_range=(0, 1))
X_train = sc.fit_transform(X_train)

features2 = features2.replace('mod', 0)
features2 = features2.replace('unm', 1)
features2 = features2.replace(np.nan, 0, regex=True)
# print(features)
X_test = features2[['q1', 'q2', 'q3', 'q4', 'q5',
                    'mis1', 'mis2', 'mis3', 'mis4', 'mis5']].astype(float)
y_test = features2['sample'].astype(int)
# Reuse the scaler fitted on the training data instead of refitting it on the test set.
X_test = sc.transform(X_test)
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, shuffle=True)

# Classifier
from sklearn.ensemble import BaggingClassifier
clf = ExtraTreeClassifier()
# clf = BaggingClassifier(clf, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Use a distinct name so sklearn's confusion_matrix function is not shadowed.
conf_mat = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(conf_mat, annot=True)
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
plt.show()
print "Cross validation" scores = cross_val_score(RandomForestClassifier(), training, classes, cv=KFold(n=len(training), n_folds=5, random_state=42), scoring="accuracy") print("CV error = %f +-%f" % (1. - np.mean(scores), np.std(scores))) print("Accuracy =", accuracy_score(y_test, tlf.predict(X_test))) print("Precision =", precision_score(y_test, tlf.predict(X_test))) print("Recall =", recall_score(y_test, tlf.predict(X_test))) print("F =", fbeta_score(y_test, tlf.predict(X_test), beta=1)) print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" print "Extra Tree classifier" rlf = ExtraTreeClassifier() rlf.fit(training, classes) print("Training error =", zero_one_loss(classes, rlf.predict(training))) X_train, X_test, y_train, y_test = train_test_split(training, classes) rlf = ExtraTreeClassifier() rlf.fit(X_train, y_train) print("Training error =", zero_one_loss(y_train, rlf.predict(X_train))) print("Test error =", zero_one_loss(y_test, rlf.predict(X_test))) scores = [] print "K-fold cross validation" for train, test in KFold(n=len(training), n_folds=5, random_state=42): X_train, y_train = training[train], classes[train] X_test, y_test = training[test], classes[test] rlf = ExtraTreeClassifier().fit(X_train, y_train) scores.append(zero_one_loss(y_test, rlf.predict(X_test))) #
# ExtraTree
# max_features='sqrt' replaces the removed 'auto' option, and the deprecated
# min_impurity_split argument is dropped; both match the old defaults.
brk = ExtraTreeClassifier(criterion='gini', splitter='random', max_depth=None,
                          min_samples_split=2, min_samples_leaf=1,
                          min_weight_fraction_leaf=0.0, max_features='sqrt',
                          random_state=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, class_weight=None,
                          ccp_alpha=0.0).fit(X_train_counts, y_train['prdtypecode'])
pred_ETC = brk.predict(X_test_counts)

# In[16]:

# Adding extratree predictions to dataframe
df2['ExtraTree'] = pred_ETC

# In[18]:

from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting
gb_clf_test = GradientBoostingClassifier(n_estimators=10).fit(
    X_train_counts, y_train['prdtypecode'])
pred_gb_test = gb_clf_test.predict(X_test_counts)
def readData(fname):
    res = []
    labels = []
    # data = ''
    with open(fname) as f:
        for s in f:
            tmp = list(map(int, s.split()))
            labels.append(tmp[-1])
            res.append(tmp[:-1])
            # data += (str(tmp)[1:-1]).replace(',', '') + '\n'
    # with open('out.txt', 'w') as o:
    #     o.write(str(data)[1:-1])
    return res, labels

X, Y = readData('german.data-numeric.txt')
Xt = X[:-200]; Yt = Y[:-200]
XT = X[-200:]; YT = Y[-200:]
print(len(Xt))

clf = ExtraTreeClassifier(max_depth=None, random_state=0)
clf = clf.fit(Xt, Yt)
# proba = clf.predict_proba(XT)
# print(len(proba))
# print(proba)
err = 0
for i, x in enumerate(XT):
    # predict expects a 2-D array, so wrap the single sample in a list
    if clf.predict([x]) != YT[i]:
        prob = clf.predict_proba([x])
        # print(prob)
        err += 1
print(err)
# Repeated stratified 5-fold cross-validation (2 repeats x 5 folds)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
scores = np.zeros((len(preprocs), 5 * 2, len(metrics)))

for fold_id, (train, test) in enumerate(rskf.split(X, y)):
    for preproc_id, preproc in enumerate(preprocs):
        clf = clone(clf)  # fresh, unfitted copy of the base classifier
        if preprocs[preproc] is None:
            X_train, y_train = X[train], y[train]
        else:
            X_train, y_train = preprocs[preproc].fit_resample(X[train], y[train])
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X[test])
        for metric_id, metric in enumerate(metrics):
            scores[preproc_id, fold_id, metric_id] = metrics[metric](y[test], y_pred)

# Save scores to a file
writeResToFile(scores)
# Load scores from file
scores = loadResFromFile()
# Results table
table = getResultsFromFileAsArray(scores)
print(table)
# Save table to a file
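# The experiment above assumes X, y, clf, preprocs and metrics are defined earlier.
# An illustrative setup is sketched below; the dataset, oversamplers and metric
# choices are assumptions, not the original configuration (imblearn is required
# for the oversamplers).
import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import ExtraTreeClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=42)
clf = ExtraTreeClassifier(random_state=42)
preprocs = {'none': None,
            'ros': RandomOverSampler(random_state=42),
            'smote': SMOTE(random_state=42)}
metrics = {'balanced_accuracy': balanced_accuracy_score, 'f1': f1_score}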
Gboost = GradientBoostingClassifier()
Xgboost = XGBClassifier()  # reg_lambda=2
# i = 0
for train, test in sfk.split(X, Y):
    # print(i)
    x_train = X.iloc[train, :]
    x_test = X.iloc[test, :]
    y_train = Y.iloc[train]
    y_test = Y.iloc[test]

    LogReg.fit(x_train, y_train)
    all_prediction.iloc[0, test] = LogReg.predict(x_test)

    Extr_tree.fit(x_train, y_train)
    all_prediction.iloc[1, test] = Extr_tree.predict(x_test)

    D_tree.fit(x_train, y_train)
    all_prediction.iloc[2, test] = D_tree.predict(x_test)

    Rnd_frst.fit(x_train, y_train)
    all_prediction.iloc[3, test] = Rnd_frst.predict(x_test)

    Gboost.fit(x_train, y_train)
    all_prediction.iloc[4, test] = Gboost.predict(x_test)

    Xgboost.fit(x_train, y_train)
    all_prediction.iloc[5, test] = Xgboost.predict(x_test)
    # i += 1

print(accuracy_score(Y, all_prediction.iloc[:, :].values[5]) * 100)
clf_entropy.fit(X_train, y_train)  # Training entropy tree

# Creating SVM with polynomial kernel
clf_svc = svm.SVC(random_state=100, kernel='poly')
clf_svc.fit(X_train, y_train)  # Training SVM

# Extra trees classifier
clf_ext = ExtraTreeClassifier(random_state=100, max_depth=3, min_samples_leaf=5)
clf_ext.fit(X_train, y_train)  # Training extra tree

clf_Lin = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')

y_pred_gi = clf_gini.predict(X_test)     # gini tree prediction test
y_pred_en = clf_entropy.predict(X_test)  # entropy tree prediction test
y_pred_sv = clf_svc.predict(X_test)      # SVM prediction test
y_pred_et = clf_ext.predict(X_test)      # extra tree prediction test

clf_Lin.fit(X_train, y_train)
y_pred_L = clf_Lin.predict(X_test)

# Print accuracy scores
print("Gini accuracy score: ", accuracy_score(y_test, y_pred_gi) * 100)
print("Entropy accuracy score: ", accuracy_score(y_test, y_pred_en) * 100)
print("SVM accuracy score: ", accuracy_score(y_test, y_pred_sv) * 100)
print("Extra tree accuracy score: ", accuracy_score(y_test, y_pred_et) * 100)
print("LinearDiscriminant accuracy score: ", accuracy_score(y_test, y_pred_L) * 100)
print(y_test)
print(y_pred_sv)
def extratree(typ, X_train, Y_train, X_test, Y_test, text):
    text.delete(1.0, tk.END)
    text.insert(
        tk.END,
        "\n\nIMPORTING ExtraTree" + "\nProcessing this might take a while...",
        "bold")
    text.update_idletasks()
    from sklearn.tree import ExtraTreeClassifier
    ETC = ExtraTreeClassifier()
    text.insert(tk.END,
                "\n\n Number of Features for Training : " + str(len(X_train)),
                "bold")
    text.update_idletasks()
    text.insert(tk.END,
                "\n\n Number of Labels for Training : " + str(len(Y_train)),
                "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n *** Training ExtraTree using the above Features and Labels ***",
        "bold")
    text.update_idletasks()
    ETC.fit(X_train, Y_train)
    text.insert(tk.END, "\n\n Number of Test Features : " + str(len(X_test)),
                "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n Predicting the Test Labels for the above Test Features using ExtraTree ",
        "bold")
    text.update_idletasks()
    Y_pred = ETC.predict(X_test)
    text.insert(tk.END, "\n\n Number of Actual Labels : " + str(len(Y_test)),
                "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n Number of Test Labels Predicted by ExtraTree --> " + str(len(Y_pred)),
        "bold")
    text.insert(tk.END,
                "\n\n ---------------------------------------------------")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n Number of LABELS MATCHED : " +
        str(accuracy_score(Y_test, Y_pred, normalize=False)),
        "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n Calculating Accuracy of ExtraTree = Labels Matched/Actual Labels ",
        "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n Accuracy Score : " +
        str(accuracy_score(Y_test, Y_pred, normalize=True)),
        "bold")
    text.update_idletasks()
    text.insert(
        tk.END,
        "\n\n ExtraTree report \n" + classification_report(Y_test, Y_pred),
        "bold")
    text.update_idletasks()
    roc_curve_acc(Y_test, Y_pred, 'ETC')
    if typ == "s":
        plt.show()
    elif typ == "a":
        pass
for col in cols:
    hotlab[col] = data[col]
promoted = data[["is_promoted"]]

#%%
x_train, x_test, y_train, y_test = train_test_split(hotlab, promoted)
sm = SMOTE(random_state=20)
train_input_new, train_output_new = sm.fit_resample(x_train, y_train)

#%%
# Train on the SMOTE-resampled training data
class1 = ExtraTreeClassifier()
class1.fit(train_input_new, train_output_new)
pred1 = class1.predict(x_test)
score = f1_score(y_test, pred1)

#%%
confusion = confusion_matrix(y_test, pred1)

#%%
# For submission
submission_data = pd.read_csv("D:\\Hackathons\\Promotion\\test_2umaH9m.csv")

#%%
submission_data["education"] = submission_data["education"].fillna("Unknown")
submission_data["previous_year_rating"] = submission_data["previous_year_rating"].fillna(
    np.mean(submission_data["previous_year_rating"]))
submission_data["education"] = np.where(submission_data["age"] > 35,
                                        "Does not matter",
                                        submission_data["education"])
class ExtraTreeClass:
    """
    Name : ExtraTreeClassifier
    Attribute : None
    Method : predict, predict_by_cv, save_model
    """

    def __init__(self):
        # Algorithm name
        self._name = 'extratree'
        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
        # Suppress warning messages
        warnings.filterwarnings('ignore')
        # Load the raw data
        data = pd.read_csv(self._f_path + "/classifier/resource/classifier_sample.csv",
                           sep=",", encoding="utf-8")
        # Split features and label (target) data
        self._x = data.drop("quality", axis=1)
        self._y = data["quality"]
        # Split training and test data
        self._x_train, self._x_test, self._y_train, self._y_test = train_test_split(
            self._x, self._y, test_size=0.2, shuffle=True, random_state=42)
        # Declare the model
        self._model = ExtraTreeClassifier()
        # Train the model
        self._model.fit(self._x_train, self._y_train)

    # Plain prediction
    def predict(self):
        # Predict
        y_pred = self._model.predict(self._x_test)
        # Print the report
        print(classification_report(self._y_test, y_pred))
        score = accuracy_score(self._y_test, y_pred)
        # Check the score
        print(f'Score = {score}')
        # Return the score
        return score

    # CV prediction (cross-validation)
    def predict_by_cv(self):
        cv = KFold(n_splits=5, shuffle=True)
        # Check whether CV is supported
        if hasattr(self._model, "score"):
            cv_score = cross_val_score(self._model, self._x, self._y, cv=cv)
            # Check the score
            print(f'Score = {cv_score}')
            # Return the score
            return cv_score
        else:
            raise Exception('Not Support CrossValidation')

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model
        if not renew:
            # First save
            joblib.dump(self._model, self._f_path + f'/model/{self._name}.pkl')
        else:
            # Replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}.pkl',
                    self._f_path + f'/model/{str(self._name) + str(time.time())}.pkl')
            joblib.dump(self._model, self._f_path + f'/model/{self._name}.pkl')

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
plot_step = 0.02
for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]):
    print(pair, pairidx)
    X = iris.data[:, pair]
    Y = iris.target
    clf = ExtraTreeClassifier(max_depth=3).fit(X, Y)
    plt.subplot(2, 3, pairidx + 1)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
    plt.axis("tight")
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(Y == i)
        print(i)
        # print(idx)
        # plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
        #             cmap=plt.cm.Paired)
    plt.axis("tight")

plt.suptitle("Examples of tree classifiers")  # originally "Ejemplos de clasificador de arboles"
sgd.score(x_test_3, y_test_3)

# loss='log_loss' is the current name of the former 'log' option
sgd = SGDClassifier(loss='log_loss', shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)

sgd = SGDClassifier(shuffle=True, random_state=171)
sgd.fit(x_train_3, y_train_3)
sgd.predict(x_train_3)
sgd.score(x_test_3, y_test_3)

submission = pd.DataFrame({'Id': test.Id, 'Cover_Type': ensemble_test_pred})
submission.head()
submission.to_csv('submission.csv', index=False)

submission_tree = pd.DataFrame({'Id': test.Id, 'Cover_Type': tree_test_pred})
submission_tree.head()
submission_tree.to_csv('submission2.csv', index=False)

# Extra tree classifier is a tree based model for classification problems
et = ExtraTreeClassifier()
et.fit(x_train_3, y_train_3)
et.predict(x_train_3)
et.score(x_test_3, y_test_3)

from sklearn.semi_supervised import LabelPropagation
lb = LabelPropagation()
lb.fit(x_train_3, y_train_3)
lb.predict(x_train_3)
lb.score(x_test_3, y_test_3)

from sklearn.neighbors import KNeighborsClassifier
knng = KNeighborsClassifier()
knng.fit(x_train_3, y_train_3)
knng.predict(x_train_3)
knng.score(x_test_3, y_test_3)
# In[ ]:

DTree = DecisionTreeClassifier(max_depth=3)
DTree.fit(x_train, y_train)
yhat = DTree.predict(x_test)
print("DecisionTreeClassifier")
print("Train set Accuracy: ",
      metrics.accuracy_score(y_train, DTree.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# In[ ]:

ETree = ExtraTreeClassifier(max_depth=3)
ETree.fit(x_train, y_train)
yhat = ETree.predict(x_test)
print("ExtraTreeClassifier")
print("Train set Accuracy: ",
      metrics.accuracy_score(y_train, ETree.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# In[ ]:

Ada = AdaBoostClassifier()
Ada.fit(x_train, y_train)
yhat = Ada.predict(x_test)
print("AdaBoostClassifier")
print("Train set Accuracy: ",
      metrics.accuracy_score(y_train, Ada.predict(x_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plotting feature importance
plt.figure(figsize=(10, 5))
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices], color="g",
        yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), features, rotation=90)
plt.xlim([-1, X.shape[1]])
plt.show()

# Importing, initiating and fitting the extra tree classifier
from sklearn.tree import ExtraTreeClassifier
extree = ExtraTreeClassifier(max_features=11, min_samples_split=21,
                             random_state=101, max_depth=28)
extree.fit(X_train_sm1, y_train_sm1)
extree_predict = extree.predict(X_test)

# Checking performance of the extra tree classifier
print(confusion_matrix(y_test, extree_predict))
print(classification_report(y_test, extree_predict))

# Importing test data
test = pd.read_csv('FIA_predictions.csv')
# Getting the same columns as the training data
test = test.iloc[:, 0:33]
# Converting data types for categorical variables
test['NAICS2'] = test['NAICS2'].astype('category')
test['NAICS4'] = test['NAICS4'].astype('category')
test['NAICS_CD'] = test['NAICS_CD'].astype('category')
test['Restricted_Vertical'] = test['Restricted_Vertical'].astype('category')
test['LCTN_TYP_VAL'] = test['LCTN_TYP_VAL'].astype('category')
test['srvc_five_dgt_zip'] = test['srvc_five_dgt_zip'].astype('category')
test['data_srvc_rnge_6_flg'] = test['data_srvc_rnge_6_flg'].astype('category')
print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, clf_cart.predict(rnd_test_X))) roc_auc_scorer = get_scorer("roc_auc") print("ROC AUC = %s"%roc_auc_scorer(clf_cart, rnd_test_X, rnd_test_y)) fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_cart.predict_proba(rnd_test_X)[:, 1]) axes_roc.plot(fpr, tpr, label = 'CART-2') ## randomized tree with default setting clf_rnd_tree = ExtraTreeClassifier() clf_rnd_tree.fit(rnd_training_X, rnd_training_y) export_graphviz(clf_rnd_tree, out_file = 'default_rnd_tree.dot', feature_names = attribute_names, class_names = bi_class_target_attrs, filled = True, rounded = True, special_characters = True) print(check_output('dot -Tpdf default_rnd_tree.dot -o default_rnd_tree.pdf', shell = True)) print("Accuracy = %s"%accuracy_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X))) print("Precision = %s"%precision_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X))) print("Recall = %s"%recall_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X))) print("F = %s"%fbeta_score(rnd_test_y, clf_rnd_tree.predict(rnd_test_X), beta=1)) print("Confusion matrix = %s"%confusion_matrix(rnd_test_y, clf_rnd_tree.predict(rnd_test_X))) fpr, tpr, thresholds = roc_curve(rnd_test_y, clf_rnd_tree.predict_proba(rnd_test_X)[:, 1]) axes_roc.plot(fpr, tpr, label = "Randomized tree-1") axes_roc.set_title("ROC of CART and a randomized tree") axes_roc.set_xlabel("FPR") axes_roc.set_ylabel("TPR") axes_roc.set_ylim(0, 1.1) axes_roc.legend(loc = 'best', fontsize = 'medium') roc_auc_scorer = get_scorer("roc_auc") print("ROC AUC = %s"%roc_auc_scorer(clf_rnd_tree, rnd_test_X, rnd_test_y)) # randomized tree with max_depth = 4, min_samples_leaf = 5
def myclassify_AudPow(numfiers, xtrain_1, xtrain_2, ytrain_1, ytrain_2, xtest):
    # Remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest = xtest[~np.isnan(xtest).any(axis=1), :]
    xtest = xtest[~np.isinf(xtest).any(axis=1), :]

    xtrain = np.append(xtrain_1, xtrain_2, 0)
    ytrain = np.append(ytrain_1, ytrain_2)
    ytrain = np.ravel(ytrain)

    xtrunclength = sio.loadmat('../Files/xtrunclength.mat')
    xtrunclength = xtrunclength['xtrunclength'][0]

    # If xtest is an NxM matrix, build an N x numfiers matrix where each column
    # holds one classifier's prediction vector.
    predictionMat = np.empty((xtest.shape[0], numfiers))
    predictionStringMat = []
    finalPredMat = []

    # Voting classifier: combines completely different machine learning classifiers
    # and uses a majority vote.
    eclf = VotingClassifier(estimators=[
        ('svc', SVC()), ('rfc', RFC(bootstrap=False)), ('etc', ETC()),
        ('knn', neighbors.KNeighborsClassifier()), ('qda', quadda())])

    # Candidate classifiers in a fixed order; only the first numfiers are fitted.
    classifiers = [
        BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        ETC(),
        BaggingClassifier(ETC()),
        eclf,
        SVC(),
        quadda(),   # quadratic discriminant analysis - quadratic decision boundary
        DTC(),
        neighbors.KNeighborsClassifier(),  # classifies by the k nearest neighbors
        linda(),    # linear discriminant analysis - linear decision boundary
        RFC(),
        BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        RFC(bootstrap=False),
        GBC(),
        neighbors.KNeighborsClassifier(n_neighbors=10),
        neighbors.KNeighborsClassifier(n_neighbors=3),
        neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        NearestCentroid(),
        ABC(),
    ]

    for count, clf in enumerate(classifiers[:numfiers]):
        clf.fit(xtrain, ytrain)
        predictionMat[:, count] = clf.predict(xtest)

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        modeCol = predWindowVecModeFinder(tempCol, xtrunclength)
        modeStr = predVec2Str(modeCol)
        predictionStringMat.append(modeStr)
        finalPredMat += list(map(int, modeCol))

    return predictionStringMat, finalPredMat
def myclassify_practice_set(numfiers, xtrain, ytrain, xtltrain, xtltest, xtest,
                            ytarget=None, testing=False, grids='ABCDEFGHI'):
    # NOTE: we might not need xtltrain.
    # xtrain and ytrain are the training set; xtltrain holds the indices of the
    # corresponding recordings and is always present.
    # xtest is the testing set and xtltest the corresponding recording indices
    # (for the practice set xtltest = xtrunclength).
    # ytarget is optional and depends on whether a testing set or the practice set is used.

    # Remove NaN, Inf, and -Inf values from the xtest feature matrix
    xtest, xtltest, ytarget = removeNanAndInf(xtest, xtltest, ytarget)
    # print('finished removal of Nans')

    ytrain = np.ravel(ytrain)
    ytarget = np.ravel(ytarget)

    # If xtest is an NxM matrix, build an N x numfiers matrix where each column
    # holds one classifier's prediction vector.
    predictionMat = np.empty((xtest.shape[0], numfiers))
    predictionStringMat = []
    finalPredMat = []
    targetStringMat = []
    targets1 = []
    predictions1 = []

    # Voting classifier: combines completely different machine learning classifiers
    # and uses a majority vote.
    eclf = VotingClassifier(estimators=[
        ('svc', SVC()), ('rfc', RFC(bootstrap=False)), ('etc', ETC()),
        ('knn', neighbors.KNeighborsClassifier()), ('qda', quadda())])

    # Candidate classifiers in a fixed order; only the first numfiers are fitted.
    classifiers = [
        eclf,
        BaggingClassifier(ETC(), bootstrap=False, bootstrap_features=False),
        ETC(),
        BaggingClassifier(ETC()),
        SVC(),
        quadda(),   # quadratic discriminant analysis - quadratic decision boundary
        DTC(),
        neighbors.KNeighborsClassifier(),  # classifies by the k nearest neighbors
        linda(),    # linear discriminant analysis - linear decision boundary
        RFC(),
        BaggingClassifier(RFC(), bootstrap=False, bootstrap_features=False),
        BaggingClassifier(SVC(), bootstrap=False, bootstrap_features=False),
        RFC(bootstrap=False),
        GBC(),
        neighbors.KNeighborsClassifier(n_neighbors=10),
        neighbors.KNeighborsClassifier(n_neighbors=3),
        neighbors.KNeighborsClassifier(algorithm='ball_tree'),
        neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        NearestCentroid(),
        ABC(),
    ]

    for count, clf in enumerate(classifiers[:numfiers]):
        clf.fit(xtrain, ytrain)
        predictionMat[:, count] = clf.predict(xtest)

    for colCount in range(predictionMat.shape[1]):
        tempCol = predictionMat[:, colCount]
        if testing:
            modeCol = temppredWindowVecModeFinder(tempCol, xtltest, 4, grids, isPrint=0)
        else:
            modeCol = predWindowVecModeFinder(tempCol, xtltest, 4, isPrint=0)

        ytarg = predWindowVecModeFinder(ytarget, xtltest, 1, isPrint=0)
        if testing:
            modeStr = temppredVec2Str(modeCol, grids)
        else:
            modeStr = predVec2Str(modeCol)
        modeStrans = predVec2Str(ytarg)

        predictionStringMat.append(modeStr)
        predictions1.append(modeCol)
        finalPredMat += list(map(int, modeCol))
        targetStringMat.append(modeStrans)
        targets1.append(ytarg)

    if not testing:
        if ytarget is not None:
            # print(targets1)
            # print(predictions1)
            confusionme = confusion_matrix(targets1[0], predictions1[0])
            # print("Confusion Matrix is:")
            # print(confusionme)

    return predictionStringMat, targetStringMat, finalPredMat
class stacked_generalization():

    def __init__(self, data, target):
        self.data = data
        if len(target.shape) == 2:
            # Convert 2-dim target array into 1-dim target array
            self.target = target.reshape(target.shape[0])
        else:
            self.target = target
        self.training_data = None
        self.training_target = None
        self.test_data = None
        self.test_target = None

        # Construct 4 Tier-1 (base) classifiers
        self.Tier1_classifier1 = LogisticRegression(solver="lbfgs")
        self.Tier1_classifier2 = MultinomialNB()
        self.Tier1_classifier3 = LinearSVC(penalty="l2")
        self.Tier1_classifier4 = ExtraTreeClassifier()
        # self.Tier1_classifier5 = SGDClassifier(max_iter=1000, tol=1e-3)

        # Construct the Tier-2 (meta) classifier
        # self.meta_classifier = LogisticRegression(solver="lbfgs")
        # self.meta_classifier = MultinomialNB()
        # self.meta_classifier = LinearSVC(penalty="l2")
        self.meta_classifier = ExtraTreeClassifier()
        # self.meta_classifier = XGBClassifier()
        # self.meta_classifier = RandomForestClassifier(n_estimators=100)

    # Divide the training data into n_split training blocks and evaluation blocks.
    # Create T Tier-1 classifiers, C1,..,CT, based on a cross-validation partition of
    # the training data: the entire training dataset is divided into B blocks, each
    # Tier-1 classifier is first trained on (a different set of) B-1 blocks, and each
    # classifier is then evaluated on the Bth (pseudo-test) block.
    def TrainingData_Stratified_KFold_split(self, n_split=5, shuffle=False):
        # n_splits cannot be greater than the number of members in each class
        skf_blocks = StratifiedKFold(n_splits=n_split, shuffle=shuffle)
        # Create the indexes of the blocks of training data; the number of blocks is n_split
        training_blocks_index = []
        evaluation_blocks_index = []
        for trainingBlock_index, evaluationBlock_index in skf_blocks.split(
                self.training_data, self.training_target):
            training_blocks_index.append(trainingBlock_index)
            evaluation_blocks_index.append(evaluationBlock_index)
        training_blocks_data = [
            self.training_data[index, :] for index in training_blocks_index
        ]
        training_blocks_target = [
            self.training_target[index] for index in training_blocks_index
        ]
        evaluation_blocks_data = [
            self.training_data[index, :] for index in evaluation_blocks_index
        ]
        evaluation_blocks_target = [
            self.training_target[index] for index in evaluation_blocks_index
        ]
        return training_blocks_data, training_blocks_target, evaluation_blocks_data, evaluation_blocks_target

    def train_meta_classifier(self):
        training_blocks_data, training_blocks_target, evaluation_blocks_data, evaluation_blocks_target = self.TrainingData_Stratified_KFold_split()
        # The classification outputs of all Tier-1 classifiers on each training data
        # block are collected in the list Tier1_outputs.
        Tier1_outputs = []
        for block in range(len(training_blocks_data)):
            # All Tier-1 base classifiers fit B-1 training data blocks (B blocks in total)
            self.Tier1_classifier1.fit(training_blocks_data[block], training_blocks_target[block])
            self.Tier1_classifier2.fit(training_blocks_data[block], training_blocks_target[block])
            self.Tier1_classifier3.fit(training_blocks_data[block], training_blocks_target[block])
            self.Tier1_classifier4.fit(training_blocks_data[block], training_blocks_target[block])
            # self.Tier1_classifier5.fit(training_blocks_data[block], training_blocks_target[block])

            # Each Tier-1 base classifier then predicts the held-out evaluation block
            output_C1 = self.Tier1_classifier1.predict(evaluation_blocks_data[block])
            output_C1 = output_C1.reshape(output_C1.shape[0], 1)
            output_C2 = self.Tier1_classifier2.predict(evaluation_blocks_data[block])
            output_C2 = output_C2.reshape(output_C2.shape[0], 1)
            output_C3 = self.Tier1_classifier3.predict(evaluation_blocks_data[block])
            output_C3 = output_C3.reshape(output_C3.shape[0], 1)
            output_C4 = self.Tier1_classifier4.predict(evaluation_blocks_data[block])
            output_C4 = output_C4.reshape(output_C4.shape[0], 1)
            # output_C5 = self.Tier1_classifier5.predict(evaluation_blocks_data[block])
            # output_C5 = output_C5.reshape(output_C5.shape[0], 1)

            # Horizontally combine the Tier-1 outputs on this block
            block_outputs = np.hstack((output_C1, output_C2, output_C3, output_C4))
            Tier1_outputs.append(block_outputs)

        # Vertically combine the Tier-1 outputs over all training data blocks
        # (np.vstack accepts a list)
        Tier1_outputs = np.vstack(Tier1_outputs)
        # Combine the real labels of all evaluation blocks
        evaluation_blocks_target = np.concatenate([
            eva_block_target for eva_block_target in evaluation_blocks_target
        ])
        # Train the meta classifier on the Tier-1 outputs and the real labels
        self.meta_classifier.fit(Tier1_outputs, evaluation_blocks_target)
        print("The training of the meta classifier is finished")

    # Train stacked generalization with a cross-validation partition and return
    # accuracy, recall and precision on the test data.
    def train_stacked_generalization_CV(self, n_split=5, shuffle=False):
        # Cross-validation partition; n_splits cannot be greater than the number of
        # members in each class
        skf_cv = StratifiedKFold(n_splits=n_split, shuffle=shuffle)
        # Create the indexes of training data and test data
        training_sets_index = []
        test_sets_index = []
        for training_index, test_index in skf_cv.split(self.data, self.target):
            training_sets_index.append(training_index)
            test_sets_index.append(test_index)
        training_sets_data = [self.data[index, :] for index in training_sets_index]
        training_sets_target = [self.target[index] for index in training_sets_index]
        test_sets_data = [self.data[index, :] for index in test_sets_index]
        test_sets_target = [self.target[index] for index in test_sets_index]

        # Store the cross-validation metrics in separate lists
        test_cv_accuracy = []
        test_cv_recall = []
        test_cv_precision = []

        time_start = time.time()  # start time
        for cv_time in range(n_split):
            self.training_data = training_sets_data[cv_time]
            self.training_target = training_sets_target[cv_time]
            self.test_data = test_sets_data[cv_time]
            self.test_target = test_sets_target[cv_time]

            # Train the meta classifier
            self.train_meta_classifier()

            # Retrain all Tier-1 base classifiers on the full training data
            self.Tier1_classifier1.fit(self.training_data, self.training_target)
            self.Tier1_classifier2.fit(self.training_data, self.training_target)
            self.Tier1_classifier3.fit(self.training_data, self.training_target)
            self.Tier1_classifier4.fit(self.training_data, self.training_target)
            # self.Tier1_classifier5.fit(self.training_data, self.training_target)

            # All retrained Tier-1 base classifiers predict the test data
            testset_output_C1 = self.Tier1_classifier1.predict(self.test_data)
            testset_output_C1 = testset_output_C1.reshape(testset_output_C1.shape[0], 1)
            testset_output_C2 = self.Tier1_classifier2.predict(self.test_data)
            testset_output_C2 = testset_output_C2.reshape(testset_output_C2.shape[0], 1)
            testset_output_C3 = self.Tier1_classifier3.predict(self.test_data)
            testset_output_C3 = testset_output_C3.reshape(testset_output_C3.shape[0], 1)
            testset_output_C4 = self.Tier1_classifier4.predict(self.test_data)
            testset_output_C4 = testset_output_C4.reshape(testset_output_C4.shape[0], 1)
            # testset_output_C5 = self.Tier1_classifier5.predict(self.test_data)
            # testset_output_C5 = testset_output_C5.reshape(testset_output_C5.shape[0], 1)

            # Horizontally combine the Tier-1 predictions on the test data
            testset_outputs_Tier1 = np.hstack(
                (testset_output_C1, testset_output_C2, testset_output_C3, testset_output_C4))
            # The meta classifier predicts the test labels from the Tier-1 predictions
            testset_outputs_meta = self.meta_classifier.predict(testset_outputs_Tier1)
            # Round the meta classifier's predictions (needed for regressors such as xgboost)
            testset_outputs_meta = np.round(testset_outputs_meta)

            # Store the metrics of this fold
            test_cv_accuracy.append(accuracy_score(self.test_target, testset_outputs_meta))
            test_cv_recall.append(recall_score(self.test_target, testset_outputs_meta))
            test_cv_precision.append(precision_score(self.test_target, testset_outputs_meta))

        # Convert the lists into numpy arrays so that mean, min, max and std can be computed
        test_cv_accuracy = np.array(test_cv_accuracy)
        test_cv_recall = np.array(test_cv_recall)
        test_cv_precision = np.array(test_cv_precision)

        time_end = time.time()  # end time
        print("\nTime cost: ", time_end - time_start, "seconds")

        cv_scores = {
            "test_accuracy": test_cv_accuracy,
            "test_recall": test_cv_recall,
            "test_precision": test_cv_precision
        }
        return cv_scores
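# A minimal usage sketch for the class above, assuming the class and its scikit-learn
# imports are in scope. The breast cancer dataset is used as illustrative stand-in data
# (non-negative features, so MultinomialNB is applicable, and a binary target).
from sklearn.datasets import load_breast_cancer

data, target = load_breast_cancer(return_X_y=True)
sg = stacked_generalization(data, target)
cv_scores = sg.train_stacked_generalization_CV(n_split=5)
print({name: scores.mean() for name, scores in cv_scores.items()})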