print('f1 score for DecisionTreeClassifier', f1_tree)
print('precision for DecisionTreeClassifier', precision_tree)
print('recall for DecisionTreeClassifier', recall_tree)

if (precision_tree >= min_precision) and (recall_tree >= min_recall):
    print('DecisionTree is a good classifier with set parameters')
else:
    print('Low precision and recall, DecisionTree is not a good classifier '
          'with set parameters')

print('################### Try DecisionTreeClassifier ###################################')

# DecisionTree - 1
tree_clf_1 = tree.DecisionTreeClassifier(random_state=0)

# Create a feature union: PCA components combined with univariate selection
features_pipeline = []
features_pipeline.append(('pca', PCA(n_components=3)))
features_pipeline.append(('select_best', SelectKBest(k=k)))
feature_union = FeatureUnion(features_pipeline)

# Create a pipeline with feature selection and classification
pipe = Pipeline([('feature_union', feature_union),
                 ('feature_selection', SelectKBest(k=k)),
                 ('classification', tree_clf_1)])

# Check the parameters that can be set for DecisionTreeClassifier,
# and use them to create a param_grid
estimated = tree_clf_1.get_params().keys()
print('param_keys########################', estimated)
def main():
    item_type = 'hotel'
    # item_type = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_file = my_folder + 'classified_' + item_type + '_reviews.json'
    binary_reviews_file = my_folder + 'classified_' + item_type + '_reviews.pkl'
    my_records = ETLUtils.load_json_file(my_file)

    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    num_features = 2
    my_metrics = numpy.zeros((len(my_reviews), num_features))
    for index in range(len(my_reviews)):
        my_metrics[index] =\
            review_metrics_extractor.get_review_metrics(my_reviews[index])
    review_metrics_extractor.normalize_matrix_by_columns(my_metrics)

    count_specific = 0
    count_generic = 0
    for record in my_records:
        if record['specific'] == 'yes':
            count_specific += 1
        if record['specific'] == 'no':
            count_generic += 1

    print('count_specific: %d' % count_specific)
    print('count_generic: %d' % count_generic)
    print('specific percentage: %f%%' %
          (100.0 * count_specific / len(my_records)))
    print('generic percentage: %f%%' %
          (100.0 * count_generic / len(my_records)))

    my_labels = numpy.array(
        [record['specific'] == 'yes' for record in my_records])

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        # DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf'),
        SVC(C=1.0, kernel='linear'),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(),
        LinearSVC()
    ]
    scores = [[] for _ in range(len(classifiers))]

    Xtrans = my_metrics

    cv = KFold(n_splits=5)
    for i in range(len(classifiers)):
        for train, test in cv.split(Xtrans):
            x_train, y_train = Xtrans[train], my_labels[train]
            x_test, y_test = Xtrans[test], my_labels[test]
            clf = classifiers[i]
            clf.fit(x_train, y_train)
            scores[i].append(clf.score(x_test, y_test))

    for classifier, score in zip(classifiers, scores):
        print("%s\tMean(scores)=%.5f\tStddev(scores)=%.5f" %
              (type(classifier).__name__, numpy.mean(score), numpy.std(score)))

    plot(my_metrics, my_labels)
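# A more compact route to the same evaluation (a sketch, not part of the
# original script, and it would need to sit inside main() where classifiers,
# my_metrics and my_labels are in scope): cross_val_score runs the same
# K-fold loop internally.
from sklearn.model_selection import cross_val_score

for clf in classifiers:
    cv_scores = cross_val_score(clf, my_metrics, my_labels, cv=5)
    print('%s: mean=%.5f stddev=%.5f' %
          (type(clf).__name__, cv_scores.mean(), cv_scores.std()))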
import pandas as pd
import matplotlib.pyplot as plt
from skompiler import skompile
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dia = pd.read_csv("10.1 diabetes.csv.csv")
df = dia.copy()
df = df.dropna()
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)
# X = df["Pregnancies"]
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,
                                                    random_state=42)

cart = DecisionTreeClassifier()
cart_model = cart.fit(X_train, y_train)
print(skompile(cart_model.predict))

y_pred = cart_model.predict(X_test)
print(accuracy_score(y_test, y_pred))

cart_grid = {"max_depth": range(1, 10),
             "min_samples_split": range(2, 50)}
cart = tree.DecisionTreeClassifier()
cart_cv = GridSearchCV(cart, cart_grid, cv=10, n_jobs=-1, verbose=2)
cart_cv_model = cart_cv.fit(X_train, y_train)
print("best parameters: " + str(cart_cv_model.best_params_))

cart = tree.DecisionTreeClassifier(max_depth=5, min_samples_split=19)
cart_tuned = cart.fit(X_train, y_train)
y_pred = cart_tuned.predict(X_test)
print(accuracy_score(y_test, y_pred))
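# Optional sanity check (a sketch, not in the original): confirm the tuned
# depth/split settings hold up across folds, reusing the cross_val_score
# import above.
cv_scores = cross_val_score(cart_tuned, X_train, y_train, cv=10)
print("mean CV accuracy: %.4f" % cv_scores.mean())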
principalComponents2 = pca1.transform(test_set)
print("These are the important principal components")
print(pca1.explained_variance_ratio_)
principalDf = pd.DataFrame(data=principalComponents1,
                           columns=[
                               'principal component 1', 'principal component 2',
                               'principal component 3', 'principal component 4',
                               'principal component 5', 'principal component 6',
                               'principal component 7'
                           ])

# Decision tree method
print("****THIS PART IS DECISION TREE CLASSIFICATION****")
feature_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal'
]
data = tree.DecisionTreeClassifier()
data = data.fit(train[feature_columns], train['heartdisease'])
predictions_data = data.predict(test[feature_columns])

# Count correct predictions over the full test set
predictright = 0
for i in range(predictions_data.shape[0]):
    if predictions_data[i] == test.iloc[i][13]:
        predictright += 1
accuracy = predictright / predictions_data.shape[0]
import pandas as pd
from sklearn import tree

data = pd.read_csv("data/_ea07570741a3ec966e284208f588e50e_titanic.csv",
                   index_col='PassengerId')
data = data[['Pclass', 'Fare', 'Age', 'Sex', 'Survived']]
data = data.dropna()
data.loc[data['Sex'] != 'female', 'Sex'] = 0
data.loc[data['Sex'] == 'female', 'Sex'] = 1
print(data)

X = data[['Pclass', 'Fare', 'Age', 'Sex']]
Y = data['Survived']

clf = tree.DecisionTreeClassifier(random_state=241)
clf.fit(X, Y)

importances = clf.feature_importances_
print(importances.round(4))
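# Readability sketch (not in the original): pair each importance with its
# column name and sort, so the most informative feature is listed first.
for name, importance in sorted(zip(X.columns, importances),
                               key=lambda pair: pair[1], reverse=True):
    print('%-8s %.4f' % (name, importance))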
def MaildecisionTree(vec, lab):
    # Trains on the full set and reports (training) accuracy
    mode = tree.DecisionTreeClassifier(criterion='gini')
    mode.fit(vec, lab)
    res = mode.predict(vec)
    print("Accuracy: " + str(getAc(res, lab)))
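# `getAc` is not defined in this snippet (GraphdecisionTree below uses it
# too); a minimal sketch of what it plausibly computes -- plain accuracy.
# This definition is an assumption, not the original helper.
def getAc(predicted, actual):
    correct = sum(1 for p, a in zip(predicted, actual) if p == a)
    return float(correct) / len(actual)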
def setUp(self):
    self.tmp_fn = 'Tmp'
    self.iris = load_iris()
    self.n_features = len(self.iris.data[0])
    self.clf = tree.DecisionTreeClassifier(random_state=0)
    self.clf.fit(self.iris.data, self.iris.target)
Image(graph[0].create_png())

#############################################################################################
############# decision tree ##################################################################
#############################################################################################

from sklearn import tree

X_train, X_test, y_train, y_test = train_test_split_for_oneclass(data_X, data_Y)

# Making the instance
model_DT = tree.DecisionTreeClassifier(criterion='gini')

# Hyper-parameter set
params = {
    'max_features': ['sqrt', 'log2'],
    'max_depth': [2, 3, 4, 5, 10, 20],
    'min_samples_split': [2, 3, 4, 5, 10, 50, 100, 200],
    'min_samples_leaf': [2, 3, 4, 5, 10, 100],
    'random_state': [random_state]
}

# Making models with hyper-parameter sets
model_DT = GridSearchCV(model_DT, param_grid=params, n_jobs=-1, cv=10,
                        scoring='roc_auc')
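# Usage sketch (the fitting step is not shown in the original snippet):
# run the search and inspect the winning parameters and mean CV ROC-AUC.
model_DT.fit(X_train, y_train)
print(model_DT.best_params_)
print(model_DT.best_score_)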
def test_with_invalid_k(self):
    with self.assertRaises(AssertionError):
        MRBBagging(0, tree.DecisionTreeClassifier())
def get_classifiers():
    dict_clfs = {}
    dict_clfs[CLF_TYPES.RandomForestClassifier50] = RandomForestClassifier(
        n_estimators=50, n_jobs=12)
    dict_clfs[CLF_TYPES.RandomForestClassifier5] = RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, n_jobs=12)
    dict_clfs[CLF_TYPES.BernoulliRBM] = Pipeline(steps=[
        ('rbm', BernoulliRBM(n_components=200, n_iter=1, learning_rate=0.01,
                             verbose=False)),
        ('logistic', LogisticRegression(C=10000))])
    dict_clfs[CLF_TYPES.MLPClassifier] = MLPClassifier(
        hidden_layer_sizes=(75,), max_iter=250, alpha=1e-4, solver='sgd',
        verbose=0, tol=1e-4, random_state=RANDOM_SEED,
        learning_rate_init=.1, early_stopping=True)

    kwargs = dict(n_estimators=50, learning_rate=1., algorithm='SAMME.R',
                  random_state=RANDOM_SEED)
    dict_clfs[CLF_TYPES.AdaBoostClassifier] = AdaBoostClassifier(**kwargs)

    kwargs = {'algorithm': 'auto', 'leaf_size': 5, 'metric': 'minkowski',
              'n_jobs': 12, 'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
    # kwargs = {}
    dict_clfs[CLF_TYPES.KNN] = KNeighborsClassifier(**kwargs)

    kwargs = dict(learning_rate=0.1)
    # kwargs = {}
    dict_clfs[CLF_TYPES.GBC] = GradientBoostingClassifier(**kwargs)

    dict_clfs[CLF_TYPES.GNB] = GaussianNB()

    kwargs = {'alpha': 0.10526315789473684}
    # kwargs = {}
    dict_clfs[CLF_TYPES.MultinomialNB] = MultinomialNB(**kwargs)

    kwargs = {'alpha': 0.10526315789473684, 'norm': False}
    # kwargs = {}
    dict_clfs[CLF_TYPES.ComplementNB] = ComplementNB(**kwargs)

    kwargs = {'alpha': 0.05263157894736842, 'binarize': 0.9473684210526315}
    # kwargs = {}
    dict_clfs[CLF_TYPES.BernoulliNB] = BernoulliNB(**kwargs)

    # max_depth must be an int (or None), not a float
    kwargs = {'criterion': 'gini', 'max_depth': 1, 'max_features': 3,
              'min_samples_leaf': 0.4, 'min_samples_split': 0.01,
              'min_weight_fraction_leaf': 0.4, 'random_state': 42,
              'splitter': 'best'}
    # kwargs = {}
    dict_clfs[CLF_TYPES.DecisionTreeClassifier] = tree.DecisionTreeClassifier(**kwargs)
    dict_clfs[CLF_TYPES.ExtraTreeClassifier] = tree.ExtraTreeClassifier()

    kwargs = {'C': 10, 'gamma': 0.001, 'kernel': 'rbf', 'random_state': 42,
              'probability': True}
    # kwargs = {}
    dict_clfs[CLF_TYPES.SVC] = svm.SVC(**kwargs)

    kwargs = {'C': 1.0, 'dual': False, 'fit_intercept': True, 'max_iter': 1000,
              'multi_class': 'ovr', 'penalty': 'l1', 'solver': 'liblinear'}
    # kwargs = {}
    dict_clfs[CLF_TYPES.LogReg100] = LogisticRegression(**kwargs)
    dict_clfs[CLF_TYPES.LogReg10k] = LogisticRegression(C=10000)

    return dict_clfs
# -------------------------------------------------------------------------------------------------------------------------#
# Definition of the classifiers in a dictionary
clf_init = None
clfs = {
    # Naive Bayes classifier
    'NBS': GaussianNB(),
    # Random forest
    'RF': RandomForestClassifier(n_estimators=100),
    # K nearest neighbours
    'KNN': KNeighborsClassifier(n_neighbors=10, weights='uniform',
                                algorithm='auto', p=2, metric='minkowski'),
    # CART decision trees
    'CART': tree.DecisionTreeClassifier(min_samples_split=50, random_state=99,
                                        criterion='gini'),
    # AdaBoost with a decision tree base estimator
    'ADAB': AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1, random_state=99, criterion='gini'),
        algorithm="SAMME", n_estimators=100),
    # MLP multi-layer perceptron
    'MLP': MLPClassifier(solver='lbfgs', alpha=1e-5,
                         hidden_layer_sizes=(200, 10), random_state=3,
                         learning_rate='adaptive'),
    # Gradient boosting classifier
    'GBC': GradientBoostingClassifier(
        loss='deviance', learning_rate=0.1, n_estimators=100, subsample=0.3,
        min_samples_split=2, min_samples_leaf=1, max_depth=1, init=clf_init,
        random_state=1, max_features=None, verbose=0),
    # Bagging with K nearest neighbours
    'BC With KNN': BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                     max_features=0.5),
    # Extra Trees classifier
def GraphdecisionTree(trainvec, trainlab, testvec, testlab):
    # Trains on the training split, reports accuracy on the held-out split
    mode = tree.DecisionTreeClassifier(criterion='gini')
    mode.fit(trainvec, trainlab)
    res = mode.predict(testvec)
    print("Accuracy: " + str(getAc(res, testlab)))
        continue
    if 2 not in cluster_datasets[index]:
        continue

    X_cluster = cluster_datasets[index][0]
    Y_cluster = cluster_datasets[index][1]
    X_cluster_unlabelled = cluster_datasets[index][2]
    index_list = cluster_datasets[index][3]

    trash = False
    if len(np.unique(Y_cluster)) == 1 and len(X_cluster_unlabelled) < 10:  # and len(Y_cluster) > 10
        continue

    if not trash:
        # Fit a tree on the labelled members of the cluster, then propagate
        # labels to unlabelled members only where the tree is highly confident
        model_cluster = tree.DecisionTreeClassifier()
        model_cluster.fit(X_cluster, Y_cluster)
        Y_cluster_guess = model_cluster.predict(X_cluster_unlabelled)
        Y_cluster_guess_proba = model_cluster.predict_proba(X_cluster_unlabelled)
        model_cluster = None
        for index2 in range(len(Y_cluster_guess)):
            label_index = index_list[index2]
            if np.max(Y_cluster_guess_proba[index2]) > 0.99:
                labels_[label_index] = Y_cluster_guess[index2]
            else:
                labels_[label_index] = -1
    else:
        for index2 in range(len(X_cluster_unlabelled)):
            label_index = index_list[index2]
            labels_[label_index] = -1
    'SVC': {
        'resampler': resamplers,
        'classifier': [SVC(random_state=RANDOM_STATE)],
        'classifier__kernel': ['rbf', 'linear'],
        'classifier__C': [0.1, 1.0, 10, 100, 1000]
        # 'classifier__C': [0.1, 1.0, 10]
    },
    'KNeighborsClassifier': {
        'resampler': resamplers,
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': [1, 2, 5, 10, 20],
        'classifier__weights': ['uniform', 'distance']
    },
    'DecisionTreeClassifier': {
        'resampler': resamplers,
        'classifier': [tree.DecisionTreeClassifier(random_state=RANDOM_STATE)],
        'classifier__max_depth': [None, 2, 3, 5, 10],
        'classifier__min_samples_leaf': [2, 5, 10]
    },
    'RandomForestClassifier': {
        'resampler': resamplers,
        'classifier': [RandomForestClassifier(random_state=RANDOM_STATE)],
        'classifier__n_estimators': [10, 50, 100, 200]
    }
}

# PARAM_GRID_MAP = {
#     'SVC None': {
#         'resampler': [None],
#         'classifier': [SVC(random_state=RANDOM_STATE)],
#         'classifier__C': [0.1, 1.0, 10, 100, 1000]
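# Consumption sketch (not part of the original file): param names of the
# form 'resampler'/'classifier__*' imply a two-step pipeline searched with
# GridSearchCV. The `PARAM_GRID_MAP` name is taken from the commented block
# above; the pipeline object and training data are assumptions here.
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([('resampler', None), ('classifier', None)])
for name, param_grid in PARAM_GRID_MAP.items():
    search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    print(name, search.best_params_, search.best_score_)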
################################################################################
### your code here! name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

neigh = KNeighborsClassifier(n_neighbors=10, weights='distance')
neigh.fit(features_train, labels_train)
outcome_knn = neigh.predict(features_test)
print("KNN accuracy >> ", accuracy_score(labels_test, outcome_knn))  # 94%

ada = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=5),
                         algorithm="SAMME", n_estimators=200)
ada.fit(features_train, labels_train)
outcome_ada = ada.predict(features_test)
print("ADA accuracy >> ", accuracy_score(labels_test, outcome_ada))  # 92.4%

rf = RandomForestClassifier(bootstrap=True, max_depth=2, max_features='auto',
                            n_estimators=200)
rf.fit(features_train, labels_train)
outcome_rf = rf.predict(features_test)
outcome_rf_prb = rf.predict_proba(features_test)
print("RF accuracy >> ", accuracy_score(labels_test, outcome_rf))
# print("RF probabilities >> ", outcome_rf_prb)
def test__group_data_with_none(self):
    mrbbagging = MRBBagging(1, tree.DecisionTreeClassifier())
    x = [[1, 1, 1], [2, 2, 2], [3, 3, 3]]
    y = ["A", None, "C"]
    with self.assertRaises(AssertionError):
        mrbbagging._group_data(x, y)
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels

# features_train, features_test, labels_train, labels_test = preprocess()
features_train, labels_train, features_test, labels_test = makeTerrainData()

#########################################################
### your code goes here ###

from sklearn import tree
from sklearn.metrics import accuracy_score

clear()
print("Start execution")

# min_samples_split = 50
classifier = tree.DecisionTreeClassifier(min_samples_split=50)
classifier.fit(features_train, labels_train)
prediction = classifier.predict(features_test)

pictureName = "decision_tree_classifier_bigger.png"
accuracy = accuracy_score(labels_test, prediction)
print(accuracy)
prettyPicture(classifier, features_test, labels_test, pictureName)
show_img(pictureName)
#########################################################
from pandas import read_csv
from sklearn import linear_model, tree
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz
from obprueba import Diabetes as odi

reglog = linear_model.LogisticRegression()
navidad = tree.DecisionTreeClassifier()
archivo = 'dataset_final.csv'
df = read_csv(archivo)
# print(df)

arreglox = df[df.columns[1:-1]].to_numpy()
arregloy = df[df.columns[-1]].to_numpy()
# print(arregloy)

# Playing with the model
X_train, X_test, y_train, y_test = train_test_split(arreglox, arregloy)
entrena = navidad.fit(X_train, y_train)
entrena2 = reglog.fit(X_train, y_train)

print(entrena)
print(str(entrena.score(X_test, y_test)) + ' decision tree score')
print(entrena2)
print(str(entrena2.score(X_test, y_test)) + ' logistic regression score')
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    test_size=0.3,
                                                    random_state=0)
predictions = {}

# classifier - logistic regression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predict = classifier.predict(X_test)
predictions["Logistic Regression"] = predict

# classifier - decision trees
classifier = tree.DecisionTreeClassifier()
classifier.fit(X_train, y_train)
predict = classifier.predict(X_test)
predictions["Decision Tree"] = predict

# classifier - k neighbours
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(X_train, y_train)
predict = classifier.predict(X_test)
predictions["K Neighbours"] = predict

# classifier - naive bayes
classifier = GaussianNB()
classifier.fit(X_train, y_train)
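# Scoring sketch (not in the original snippet): compare every stored
# prediction against y_test; assumes accuracy_score is importable.
from sklearn.metrics import accuracy_score

for name, predict in predictions.items():
    print('%s accuracy: %.4f' % (name, accuracy_score(y_test, predict)))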
def getTrainAndTest(self):
    # df = pd.read_csv('H:\pc programming\Django(Prac)\ML\Classification\Classification\Review_Testing_Format.txt')
    df = pd.read_csv('Review_Testing_Format.txt')
    df.replace('?', -99999, inplace=True)
    df.drop(columns=['id'], inplace=True)

    X = np.array(df.drop(columns=['class']))
    y = np.array(df['class'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.10)

    # Built-in decision tree
    self.clf_DTree = tree.DecisionTreeClassifier()
    self.clf_DTree.fit(X_train, y_train)
    accuracy = self.clf_DTree.score(X_test, y_test)
    print("Accuracy in Decision Tree: %s" % accuracy)

    # Built-in k-nearest neighbours
    self.clf_KNN = neighbors.KNeighborsClassifier()
    self.clf_KNN.fit(X_train, y_train)
    accuracy = self.clf_KNN.score(X_test, y_test)
    print("Accuracy in KNN: %s" % accuracy)

    # Built-in support vector machine
    self.clf_SVM = svm.SVC()
    self.clf_SVM.fit(X_train, y_train)
    accuracy = self.clf_SVM.score(X_test, y_test)
    print("Accuracy in SVM: %s" % accuracy)

    Y = label_binarize(y, classes=['A', 'B', 'C'])
    n_classes = Y.shape[1]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5)
    classifier = OneVsRestClassifier(svm.LinearSVC(random_state=None))
    classifier.fit(X_train, Y_train)
    y_score = classifier.decision_function(X_test)

    # For each class
    precision = dict()
    recall = dict()
    average_precision = dict()
    '''
    for i in range(n_classes):
        average_precision[i] = average_precision_score(Y_test[:, i], y_score[:, i])

    average_precision["micro"] = average_precision_score(Y_test, y_score, average="micro")
    average_precision["macro"] = average_precision_score(Y_test, y_score, average="macro")
    average_precision["weighted"] = average_precision_score(Y_test, y_score, average="weighted")
    print('Average precision score, micro-averaged over all classes: {0:0.2f}'
          .format(average_precision["micro"]))

    recall["micro"] = recall_score(Y_test, y_score, average="micro")
    print('Recall score, micro over all classes: {0:0.2f}'
          .format(recall["micro"]))
    '''
    '''
y = np.array(recipes['label'])
print("feature load done")

# x_train, x_test, y_train, y_test = tts(X, y, test_size=0.6)
# print("train-test done")

### classifiers
# clf_nb = MultinomialNB()
print("model start")
# clf_svm = svm.LinearSVC(verbose=True)
clf_lr = LogisticRegression(verbose=True)
clf_tree = tree.DecisionTreeClassifier()
clf_knn = KNeighborsClassifier(n_neighbors=5)
clf_nb = MultinomialNB()

#########
## model save
print("training start.........")
print(".")
print("tree start")
clf_tree.fit(X, y)
filename = 'tree.sav'
pickle.dump(clf_tree, open(filename, 'wb'))
print("tree done")
print(".")
print("lr start")
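# Reload sketch (the mirror of the pickle.dump above; 'tree.sav' comes from
# the snippet, the sample rows passed to predict are an assumption):
with open('tree.sav', 'rb') as f:
    loaded_tree = pickle.load(f)
print(loaded_tree.predict(X[:5]))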
    trees_str = gen_quant_trees_str(tree, precisions)
    with open(filename, 'w') as f:
        f.write(trees_str)


def get_tree_results(tree, Xtest):
    """
    Runs data through a quantized DecisionTreeClassifier

    :param tree: DTC function handle
    :param Xtest: data to test
    :returns: predicted results
    """
    results = [tree(X) for X in Xtest]
    return np.array([results], ndmin=1).T


if __name__ == '__main__':
    DIR = r'C:\Users\brady\GitHub\MinVAD\feature_extract'
    tr_data = np.load(os.path.join(DIR, 'train_130k.npy'))
    tr_class = np.load(os.path.join(DIR, 'train_130k_class.npy'))

    myData = np.hstack((tr_data, tr_class))
    np.random.shuffle(myData)
    cutoff = int(np.floor(0.8 * len(tr_class)))

    clf = tree.DecisionTreeClassifier(max_depth=5)
    clf = clf.fit(myData[:cutoff, :19], myData[:cutoff, 20])
    test_str = gen_quant_trees_str(clf, np.arange(16, 15, -1))
    print(test_str)