items = [] if '),' in item: items = item.split('),') else: items = item.split(',', 1) for i in items: i = i.lstrip().rstrip() i = i.lstrip('\'').rstrip('\'') i = i.lstrip('\"').rstrip('\"') nx += [i] for i in range(len(nx)): if i % 2 == 0: if nx[i] == 'base_estimator': p['base_estimator'] = tree.DecisionTreeClassifier( splitter='random', max_depth=1) elif nx[i] == 'learning_rate': p[nx[i].lstrip().rstrip()] = float(nx[i + 1]) elif nx[i] == 'n_estimators': p[nx[i].lstrip().rstrip()] = int(nx[i + 1]) else: p[nx[i].lstrip().rstrip()] = nx[i + 1] if type( nx[i + 1]) != str else nx[i + 1].lstrip().rstrip() (accuracy_train, accuracy_test) = runDecisionTreeClassifier( x_train, y_train, x_test, y_test, p) print("\t".join([ 'decision tree', data, str(accuracy_train), str(accuracy_test), fs
def _build_reuters_feature_sets(n_terms):
    """Build ``(features, category)`` pairs from the NLTK Reuters corpus.

    For every category, each non-stopword is scored as
    ``term count in the document / number of documents of the category
    containing it`` and the ``n_terms`` best-scoring words are kept as the
    feature dictionary of each document.

    Args:
        n_terms: number of words to keep per document (anything ``int()``
            accepts).

    Returns:
        A ``(train_set, test_set)`` tuple of lists of ``(dict, category)``.
    """
    limit = int(n_terms)
    train_set, test_set = [], []
    # Hoisted: the stopword list does not change between documents.
    english_stopwords = set(stopwords.words('english'))
    for cat in reuters.categories():
        dfs = defaultdict(int)
        tfs = defaultdict(lambda: defaultdict(int))
        tfsidfs = defaultdict(int)
        for file_id in reuters.fileids(cat):
            # Iterating a set, so every word is visited once per document.
            for w in set(reuters.words(file_id)) - english_stopwords:
                dfs[w] += 1
                tfs[file_id][w] += 1
        for file_id in tfs:
            for w in tfs[file_id]:
                tfsidfs[w] = float(tfs[file_id][w]) / dfs[w]
            # NOTE(review): tfsidfs is shared by all documents of the
            # category, so scores from earlier documents carry over. Kept
            # as in the original -- confirm whether a per-document reset
            # was intended.
            top_terms = dict(
                sorted(tfsidfs.items(),
                       key=operator.itemgetter(1),
                       reverse=True)[:limit])
            if file_id.startswith('train'):
                train_set.append((top_terms, cat))
            else:
                test_set.append((top_terms, cat))
    return train_set, test_set


def _make_classifier(selection, name):
    """Return the NLTK ``SklearnClassifier`` for a selection/classifier pair.

    With ``selection == 'chi2'`` the estimator is wrapped in a pipeline that
    first keeps the 290 best features according to the chi-squared test.
    GaussianNB and DecisionTree are fed dense matrices (``sparse=False``).
    """
    factories = {
        'MultinomialNB': (MultinomialNB, True),
        'GaussianNB': (GaussianNB, False),
        'SVM': (svm.LinearSVC, True),
        'DecisionTree': (tree.DecisionTreeClassifier, False),
        'KNN': (lambda: KNeighborsClassifier(n_neighbors=5), True),
    }
    make_estimator, sparse = factories[name]
    estimator = make_estimator()
    if selection == 'chi2':
        estimator = Pipeline([('chi2', SelectKBest(chi2, k=290)),
                              ('svm', estimator)])
    return SklearnClassifier(estimator, sparse=sparse)


def main():
    """Train and evaluate a text classifier on the Reuters corpus.

    Command line: ``python categorisation.py <N> <selection> <classifier>``.
    Feature dictionaries are cached in the ``featuresDictonaries`` shelf
    under ``train_<N>`` / ``test_<N>`` and rebuilt when either is missing or
    empty. A scikit-learn classification report is printed for the test set.

    NOTE(review): the block nesting was reconstructed from a source whose
    indentation was lost -- confirm against the original layout.
    """
    selections = ('freq', 'chi2')
    classifiers = ('MultinomialNB', 'GaussianNB', 'SVM', 'DecisionTree',
                   'KNN')
    if (len(sys.argv) != 4 or sys.argv[2] not in selections
            or sys.argv[3] not in classifiers):
        print('usage:\n'
              'python categorisation.py <N> <selection> <classifier>\n\n'
              'with:\n'
              'N = number of relevant terms by text\n'
              "selection = 'freq' or 'chi2' (feature selection method)\n"
              "classifier = 'MultinomialNB' or 'GaussianNB' or 'SVM' or "
              "'DecisionTree' or 'KNN'")
        sys.exit()

    n_terms, selection, classifier_name = sys.argv[1:4]
    train_key = 'train_' + n_terms
    test_key = 'test_' + n_terms

    # The context manager closes the shelf even if reading raises.
    with shelve.open('featuresDictonaries') as cache:
        train_set = cache.get(train_key, [])
        test_set = cache.get(test_key, [])

    if not (train_set and test_set):
        print('building freq features dictionaries...')
        # Rebuild both sets from scratch so a half-populated cache cannot
        # leave duplicated entries in one of them.
        train_set, test_set = _build_reuters_feature_sets(n_terms)
        print('done')
        print('saving to featuresDictonaries...')
        with shelve.open('featuresDictonaries') as cache:
            cache[train_key] = train_set
            cache[test_key] = test_set
        print('done')

    print('classifying...')
    classifier = _make_classifier(selection, classifier_name)
    classifier.train(train_set)

    test_features = [features for features, _ in test_set]
    test_labels = [label for _, label in test_set]
    # NLTK 3 renamed batch_classify() to classify_many().
    predicted = classifier.classify_many(test_features)

    # Labels and display names must be in the same order, otherwise report
    # rows are attributed to the wrong category.
    labels = sorted(set(test_labels))
    print(classification_report(test_labels, predicted,
                                labels=labels, target_names=labels))
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree

# Plot two iris features against each other, coloured by species.
sns.set(style="ticks")
df = sns.load_dataset("iris")
df02 = df.iloc[:, [0, 2, 4]]  # pick one pair of features (plus the species column)
print(df02)
sns.pairplot(df02, hue="species")
plt.show()

# ===================================
print('=======================================')
clf = tree.DecisionTreeClassifier()

# Rows 0-29, 50-79 and 100-129 are used for training; the remaining 20 rows
# of each 50-row group are used for testing.
train_index = [row for start in (0, 50, 100)
               for row in range(start, start + 30)]
test_index = [row for start in (30, 80, 130)
              for row in range(start, start + 20)]

feature_columns = [0, 1]
target_column = 2
train_data = df02.iloc[train_index, feature_columns]
train_target = df02.iloc[train_index, target_column]
test_data = df02.iloc[test_index, feature_columns]
test_target = df02.iloc[test_index, target_column]

clf = clf.fit(train_data, train_target)
print(clf)

# Results
test_val = clf.predict(test_data)
print(test_val)

# Keep the predictions that match the expected species.
right = [predicted for predicted, expected in zip(test_val, test_target)
         if predicted == expected]
percent = len(right) / len(test_target)
print(percent)  # 0.95
def decisionTree(self):
    """Print the 5-fold cross-validation scores of a default decision tree.

    Reads ``self.X_selected`` (features) and ``self.y`` (targets), then
    prints the per-fold scores followed by their mean.
    """
    print("***** Testing Decision Tree *****")
    clf = tree.DecisionTreeClassifier()
    scores = cross_val_score(clf, self.X_selected, self.y, cv=5)
    # print() with two arguments gives the same space-separated output as
    # the former Python 2 print statement.
    print(scores, scores.mean())
def crossval(Matching, Mapping_exercises, Big_tbl, Worktlb):
    """Cross-validate one balanced decision tree per exercise column.

    For every column name in ``Matching`` the label is 1 where
    ``Big_tbl[column]`` is not null and 0 otherwise. A depth-5,
    class-balanced decision tree is trained on ``Worktlb`` with a shuffled
    5-fold split, and the balanced accuracy on train and test is averaged
    over the folds that could be trained.

    Args:
        Matching: iterable of column names of the form
            ``"<number>_frequency"``.
        Mapping_exercises: DataFrame with ``number`` and ``name`` columns.
        Big_tbl: DataFrame holding the exercise columns.
        Worktlb: DataFrame of features, row-aligned with ``Big_tbl``.

    Returns:
        DataFrame with one row per exercise and the columns ``Exercise``,
        ``number``, ``Type_of_algorithm``, ``mean_Bcr_train``,
        ``mean_Bcr_test`` and ``Used_fold`` (number of folds actually used).
    """
    columns = ["Exercise", "number", "Type_of_algorithm", "mean_Bcr_train",
               "mean_Bcr_test", 'Used_fold']
    used_fold = 5
    # A fixed random_state yields the same folds on every split() call, so
    # one splitter serves all exercises.
    kf = KFold(n_splits=used_fold, shuffle=True, random_state=42)
    rows = []

    # Pick one exercise column at a time and make it the label.
    for exercise_number in Matching:
        # Exercise number as text (example: "1001").
        number = exercise_number.replace("_frequency", "")
        # Full name of the exercise (example: Exercise 1 (K): Circulation).
        name_of_exercise = Mapping_exercises.loc[
            Mapping_exercises['number'] == int(number), 'name'].values[0]

        # 1-D label, so scikit-learn does not have to flatten a column frame.
        label = Big_tbl[exercise_number].notnull().astype(int)

        train_scores = []
        test_scores = []
        for train_index, test_index in kf.split(Worktlb):
            train, test = Worktlb.iloc[train_index], Worktlb.iloc[test_index]
            label_train = label.iloc[train_index]
            label_test = label.iloc[test_index]

            # No positive sample in the training fold: nothing to learn.
            if label_train.sum() == 0:
                print('Issue in kflod of ' + str(name_of_exercise))
                continue

            clf = tree.DecisionTreeClassifier(max_depth=5,
                                              class_weight='balanced')
            clf.fit(train, label_train)
            train_scores.append(
                balanced_accuracy_score(label_train, clf.predict(train)))
            test_scores.append(
                balanced_accuracy_score(label_test, clf.predict(test)))

        # Average over the folds that were really evaluated; dividing by
        # used_fold would understate the mean whenever a fold was skipped.
        valid_folds = len(train_scores)
        mean_Bcr_train = sum(train_scores) / valid_folds if valid_folds else 0
        mean_Bcr_test = sum(test_scores) / valid_folds if valid_folds else 0

        # NOTE(review): a row is recorded even when no fold was usable
        # (Used_fold == 0); the original nesting was ambiguous -- confirm.
        rows.append({"Exercise": name_of_exercise,
                     "number": number,
                     "Type_of_algorithm": "Tree",
                     "mean_Bcr_train": mean_Bcr_train,
                     "mean_Bcr_test": mean_Bcr_test,
                     "Used_fold": valid_folds})

    # DataFrame.append was removed in pandas 2.0; build the frame once.
    return pd.DataFrame(rows, columns=columns)
y = targets = labels = train_df['Results of Last election'].values y = np.array(y) columns = ["Average of last 3 elections", "Average of last 5 elections", "Average of polls 1 mo before election (>0 = Repub)", "% of registered republicans", "% of registered democrats", "State unemployment rate", "Party of governers"] >>>>>>> 6b7f67e5c85747354cf8c8924176d441a10ccfcc features = train_df[list(columns)].values features = np.array(features) print("Y data \n" + str(y)) print("------------------------") print("X data \n" + str(features)) print("-------------------------") X = features <<<<<<< HEAD clf = tree.DecisionTreeClassifier(criterion="entropy") ======= clf = tree.DecisionTreeClassifier(criterion="entropy", max_features=3) >>>>>>> 6b7f67e5c85747354cf8c8924176d441a10ccfcc clf = clf.fit(X, y) print("X shape: " + str(X.shape)) print("Y shape: " + str(y.shape)) print("--------------------------") f = tree.export_graphviz(clf, out_file="decisiontree.dot", feature_names=columns) <<<<<<< HEAD test_df = pd.read_csv('test/florida_test.csv') features2 = test_df[list(columns)].values features2 = np.array(features2) # print(features2.shape) importance = clf.feature_importances_ for i, o in zip(columns, importance):
Balanced_class_dataset.drop('Session_ID', axis=1, inplace=True) #Split of the dataset into training set and testing set X_train, X_test, Y_train, Y_test = train_test_split( Balanced_class_dataset.iloc[:, :-1], Balanced_class_dataset['Buy_Outcome'], test_size=0.3, random_state=1) #Define Decision Tree Depths_Leaves = [(10, 5), (3, 7), (30, 1000), (10, 20)] fposDT, trposDT, threshDT = [], [], [] for item in Depths_Leaves: print(item) clfDT = tree.DecisionTreeClassifier(max_depth=item[0], max_leaf_nodes=item[1], random_state=1) #Training the classifiers clfDT.fit(X_train, Y_train) #Test the trained model on the test set y_test_pred_DT = clfDT.predict(X_test) #Confusion matrix of our model towards the test data confMatrix_Test_DT = confusion_matrix(Y_test, y_test_pred_DT, labels=None) print( f'Decision Tree Depth: {clfDT.get_depth()}, Leaves: {clfDT.get_n_leaves()}' ) print('Confusion Matrix') print(confMatrix_Test_DT, '\n') pr_y_test_pred_DT = clfDT.predict_proba(X_test)
import pandas as pd
import numpy as np
from subprocess import call
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Train a depth-limited decision tree on the one-hot encoded dataset and
# print test accuracy followed by train accuracy (both in percent).
data = pd.read_csv("../Dataset/onehot.csv", delimiter=",")

# The last column is the target. Selecting it as a Series (not a one-column
# DataFrame) gives scikit-learn the 1-D y it expects.
features = data.iloc[:, :-1]
target = data.iloc[:, -1]
xTrain, xTest, yTrain, yTest = train_test_split(features,
                                                target,
                                                test_size=0.33,
                                                random_state=0)

# "auto" was an alias of "sqrt" for classifiers and has been removed from
# recent scikit-learn releases.
treeClassifier = tree.DecisionTreeClassifier(max_depth=12,
                                             max_features="sqrt")
treeClassifier.fit(xTrain, yTrain)

yPredict = treeClassifier.predict(xTest)
yTrainPredict = treeClassifier.predict(xTrain)
print(accuracy_score(yTest, yPredict) * 100)
print(accuracy_score(yTrain, yTrainPredict) * 100)

# file = "../Visualization/binary.dot"
# tree.export_graphviz(treeClassifier, out_file=file, feature_names = data.columns[:-1], class_names = True, filled=True, rounded=True, special_characters=True)
# call(['dot', '-Tpng', file, '-o', 'binary.png', '-Gdpi=600'])
#EDA titanic_train.shape titanic_train.info() titanic_train1 = pd.get_dummies(titanic_train, columns=['Pclass', 'Sex', 'Embarked']) titanic_train1.shape titanic_train1.info() titanic_train1.head(6) X_train = titanic_train1.drop( ['PassengerId', 'Age', 'Cabin', 'Ticket', 'Name', 'Survived'], 1) y_train = titanic_train['Survived'] #Note that we take entire data into consideration in boosting. dt_estimator = tree.DecisionTreeClassifier() #Ensemble.AdaBoostClassifier by passing base_estimator as dt_Estimator and n_estimators(no of. trees to grow) = 5 ada_tree_estimator1 = ensemble.AdaBoostClassifier(dt_estimator, 5) scores = model_selection.cross_val_score(ada_tree_estimator1, X_train, y_train, cv=10) print(scores.mean()) ada_tree_estimator1.fit(X_train, y_train) ada_tree_estimator1.estimators_ #extracting all the trees build by ada boost algorithm #This tree building is only for display and understanding purpose but not requiered in reality n_tree = 0 #Since we gave n_estimators(no of. trees to grow) = 5, it builds 5 trees
y_train = y[ind[:split_ind]] y_test = y[ind[split_ind:]] ############################### KNN ########################################### #print('\nKNN') model_KNN = KNeighborsClassifier(weights='distance') start = time.time() model_KNN.fit(x_train, y_train) predTrain = model_KNN.predict(x_train) pred = model_KNN.predict(x_test) elapsed_time = time.time() - start print('{0:.6f} '.format(elapsed_time)) print((np.sum(predTrain == y_train) / len(y_train)) * 100) print((np.sum(pred == y_test) / len(y_test)) * 100) ############################### Decision Tree ################################# print('\nDecision Tree') model_DT = tree.DecisionTreeClassifier(criterion='entropy') #criterion='entropy',max_depth = 34,min_samples_split=2, splitter= 'best' start = time.time() model_DT.fit(x_train, y_train) predTrain = model_DT.predict(x_train) pred = model_DT.predict(x_test) elapsed_time = time.time() - start print('{0:.6f} '.format(elapsed_time)) print((np.sum(predTrain == y_train) / len(y_train)) * 100) print((np.sum(pred == y_test) / len(y_test)) * 100, "\n") ############################### Random Forests ################################ #print('\nRandom Forests') model_RF = RandomForestClassifier(n_estimators=10, max_features='log2') #n_estimators=65 , criterion='entropy', max_features='auto' , max_depth = none,min_samples_split=2 start = time.time() model_RF.fit(x_train, y_train)
def trainVectorizer(
        train_dir=r"C:\Users\Lucas\Documents\EC\10º Período\RI\RI-part1\Treino",
        test_dir=r"C:\Users\Lucas\Documents\EC\10º Período\RI\RI-part1\Teste\html"):
    """Train several classifiers on TF-IDF features and save their results.

    Documents are loaded with ``sklearn.datasets.load_files``, vectorised
    with ``CountVectorizer`` + ``TfidfTransformer`` (fitted on the training
    documents only) and fed to each classifier in turn. For every
    classifier the fit time is measured and ``saveVectorizer`` is called
    with the test predictions.

    Each model is now fitted, evaluated and saved before the next one
    starts, so results of earlier models are kept if a later one fails.

    Args:
        train_dir: folder with one sub-folder per class of training
            documents.
        test_dir: folder with one sub-folder per class of test documents.
    """
    train_set = sklearn.datasets.load_files(container_path=train_dir,
                                            random_state=42)
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(
        count_vect.fit_transform(train_set.data))

    test_set = sklearn.datasets.load_files(container_path=test_dir,
                                           random_state=42)
    X_test_tfidf = tfidf_transformer.transform(
        count_vect.transform(test_set.data))

    # (report name, estimator, needs a dense matrix)
    models = [
        ("MLP", MLPClassifier(hidden_layer_sizes=(10, 5), solver='lbfgs'),
         False),
        ("MultinomialNB", MultinomialNB(), False),
        # GaussianNB is the only model given a densified matrix.
        ("GaussianNB", GaussianNB(), True),
        ("RandomForest", RandomForestClassifier(n_estimators=100), False),
        ("LogisticRegression", linear_model.LogisticRegression(), False),
        ("DecisionTree", tree.DecisionTreeClassifier(), False),
        ("SVM", LinearSVC(), False),
    ]

    for name, clf, needs_dense in models:
        # perf_counter() is monotonic, unlike the wall clock. Densification
        # stays inside the timed section, as it was originally.
        start_time = time.perf_counter()
        clf.fit(X_train_tfidf.toarray() if needs_dense else X_train_tfidf,
                train_set.target)
        train_time = time.perf_counter() - start_time

        predicted = clf.predict(
            X_test_tfidf.toarray() if needs_dense else X_test_tfidf)

        # Save the results
        saveVectorizer(predicted, test_set, name, "TF-IDF",
                       X_train_tfidf.shape, train_time)
# Build a decision tree classification model
from sklearn import tree
import pandas as pd
import time
from json import *

print("Scripts starts...")
start = time.time()
inputfile = 'data.xls'  # input data
outputfile = 'tree.xls'  # model output file
picture = 'tree.pdf'
data = pd.read_excel(inputfile)  # load the data
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array.
y = data.iloc[:, 62].values  # label column
x = data.iloc[:, 0:46].values  # feature columns
clf = tree.DecisionTreeClassifier(splitter='random')
clf.fit(x, y)
# Predict once and reuse the result below (the fitted model is
# deterministic, so repeated predict() calls returned the same array).
predicted = clf.predict(x)
end1 = time.time()
print("modeltime: %f s" % (end1 - start))

# Number of correct predictions. The model is scored on its own training
# data, so this is a training accuracy.
count = int((predicted == y).sum())
print("预测准确度为:%f" % (float(count) / len(y)))

r = pd.DataFrame(predicted, columns=[u'预测结果'])
pd.concat([data.iloc[:, :63], r], axis=1).to_excel(outputfile)
from sklearn import datasets
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Compare a few classifiers on a 50/50 split of the iris dataset.
iris = datasets.load_iris()
X = iris.data
Y = iris.target

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.5)

my_classifier = tree.DecisionTreeClassifier()
my_classifier.fit(X_train, Y_train)
predictions = my_classifier.predict(X_test)
print("Decision Tree:-> ", accuracy_score(Y_test, predictions))

my_classifier = KNeighborsClassifier()
my_classifier.fit(X_train, Y_train)
predictions = my_classifier.predict(X_test)
print("Kneighbors classifier:-> ", accuracy_score(Y_test, predictions))

my_classifier = RandomForestClassifier(max_depth=5,
                                       n_estimators=10,
                                       max_features=1)
my_classifier.fit(X_train, Y_train)
def __init__(self):
    """Create an untrained decision tree and announce the trainer in use."""
    trainer_name = "skLearn decisionTree"
    self.trainer = trainer_name
    self.clf = tree.DecisionTreeClassifier()
    print("Using %s Classifier" % trainer_name)
from sklearn.model_selection import GridSearchCV from imblearn.combine import SMOTETomek from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss, ClusterCentroids from imblearn.over_sampling import ADASYN from imbalance.classifyCrossValidation import ClassifyCV from imbalance.crossValidationStratified import CrossValidationStratified import pandas as pd import time, datetime if __name__ == '__main__': trainSets = ['p2p_lendingclub_70_1percent.csv'] testSets = ['p2p_lendingclub_30.csv'] classe = 'loan_status' # defino a lista de classificadores clfs = [GaussianNB(), tree.DecisionTreeClassifier(), linear_model.LogisticRegression()] names = ["Naive Bayes", "Decision Tree", "Logistic Regression"] final_names = list() for set in testSets: for name in names: final_names.append(str(name+'_'+set[:-4])) # defino a lista de tecnicas de sampling sTechniques = [RandomUnderSampler(random_state=1), SMOTE(random_state=1)] technique_names = ["RU", "SM"] def getParamsReSampling(reSamplingTechnique): if type(reSamplingTechnique) is SMOTETomek: return dict(smt__ratio=[0.8, 0.9, 1.0], smt__k=[1, 3, 5, 7], smt__m=[1, 3, 5, 7])
def fit(self, X, y):
    """Train the boosted ensemble of depth-1 decision trees.

    Each round fits a stump with uniform sample weights on the current
    data, scores it, re-weights the samples (misclassified ones up, the
    others down), normalises the weights and asks ``self.new_data`` for the
    data set of the next round.

    Inputs:
    X: pd.DataFrame with rows as samples and columns as features
       (shape N x P, N samples and P columns).
    y: pd.Series with the output variable of each sample (length N).

    NOTE(review): ``y[i]`` is used with ``i`` in ``range(len(y))``; this
    presumes ``self.new_data`` returns a ``y`` indexable that way -- confirm.
    """
    self.out_classes = list(set(list(y)))
    self.data = X
    self.labels = y

    for estimator in range(self.n_estimators):
        print("--------------------------------------", estimator,
              "----------------------------------------------")
        self.all_Xs.append(X)
        self.all_ys.append(y)

        # Every round starts from uniform weights over the current data.
        n_samples = len(X)
        weights = [1 / n_samples] * n_samples

        stump = tree.DecisionTreeClassifier(criterion='entropy', max_depth=1)
        stump.fit(X, y, sample_weight=weights)
        self.estimators_list.append(stump)

        predicted = stump.predict(X)
        is_wrong = [predicted[i] != y[i] for i in range(len(y))]

        # Weighted share of misclassified samples. Accumulated with a plain
        # loop to keep the exact float summation order.
        weighted_error = 0
        for missed, weight in zip(is_wrong, weights):
            if missed:
                weighted_error += weight
        # Small offset so a perfect stump does not divide by zero.
        weighted_error += 0.00000001

        # Amount of say of this stump in the final vote.
        amount_of_say = 0.5 * (math.log2(
            ((1 - weighted_error) / weighted_error)))
        self.all_amount_of_says.append(amount_of_say)

        # Re-weight: boost the misclassified samples, shrink the others.
        reweighted = []
        for missed, weight in zip(is_wrong, weights):
            if missed:
                reweighted.append(weight * math.exp(amount_of_say))
            else:
                reweighted.append(weight * math.exp(-amount_of_say))

        # Normalise so the weights sum to one.
        total_weight = sum(reweighted)
        weights = [weight / total_weight for weight in reweighted]

        # Data for the next round, derived from the new weights.
        X, y = self.new_data(X, y, weights)
sys.path.append("../tools/")
from email_preprocess import preprocess

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
from sklearn import tree
from sklearn.metrics import accuracy_score

clf = tree.DecisionTreeClassifier(min_samples_split=40)

t0 = time()
clf = clf.fit(features_train, labels_train)
print("training time:", round(time() - t0, 3), "s")

t1 = time()
pred = clf.predict(features_test)
# This interval measures prediction, not training.
print("prediction time:", round(time() - t1, 3), "s")

acc = accuracy_score(labels_test, pred)
print(acc)
#########################################################
#print(featureList) #特征向量化 vec = DictVectorizer() dummyX = vec.fit_transform(featureList).toarray() #print(vec.feature_names_) #print(dummyX) #针对class label做向量化 lb = preprocessing.LabelBinarizer() dummyY = lb.fit_transform(labelList) #print(dummyY) #使用决策树算法来分类 clf = tree.DecisionTreeClassifier( criterion="entropy" ) #分类器 criterion默认选择cart算法的标准来计算结点,现在指定是用id3算法中计算信息熵的方法来选择结点 entropy 信息熵 clf = clf.fit(dummyX, dummyY) #建模 #print(clf) #生成一个dot文件 以展示决策树 with open(r"C:\Users\Administrator\Desktop\1.dot", "w") as f: f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f) #dos命令 打开命令提示符 将dot文件转pdf 能可视化地更直观的展示决策树 #dot -Tpdf C:\Users\Administrator\Desktop\1.dot -o output.pdf #应用决策树来预测,先取原来数据集的第一行,然后改一改弄个新的 oneRowX = dummyX[0, :]
min_samples_split=5, random_state=1) RF.fit(x_train, y_train) predictions_RF = RF.predict(x_test) probablity_RF = RF.predict_proba(x_test) fpr_RF, tpr_RF, threshold_RF = roc_curve(y_test, probablity_RF[:, 1]) ##(6) Linear Regression LiR = LinearRegression() LiR.fit(x_train, y_train) predictions_LiR = LiR.predict(x_test) #probablity_LiR= LiR.predict_proba(x_test) fpr_LiR, tpr_LiR, threshold_LiR = roc_curve(y_test, predictions_LiR[:, 0]) ###(7)Decision Tree mode = tree.DecisionTreeClassifier(criterion='gini') mode.fit(x_train, y_train) predictions_tree = mode.predict(x_test) probablity_tree = mode.predict_proba(x_test) fpr_tree, tpr_tree, threshold_tree = roc_curve(y_test, probablity_tree[:, 1]) ###(8)Deeplearn model = models.Sequential() model.add(layers.Dense(16, activation='relu', input_shape=(105, ))) model.add(layers.Dense(16, activation='relu')) model.add(layers.Dense(1, activation='sigmoid')) model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy']) model.fit(x_train, y_train, epochs=4, batch_size=512) y_pred_label = model.predict_classes(x_test)
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn import tree

# Fit a depth-2 decision tree on columns 2 and 3 of the iris data and draw
# its decision regions.
iris = load_iris()
X = iris.data[:, [2, 3]]
y = iris.target

clf = tree.DecisionTreeClassifier(max_depth=2)
clf.fit(X, y)

# plt.plot()
# plt.scatter(X[:, 0], X[:, 1])
# plt.show()

# Plot area: the data range padded by one unit on each side.
first_feature = X[:, 0]
second_feature = X[:, 1]
x_min = first_feature.min() - 1
x_max = first_feature.max() + 1
y_min = second_feature.min() - 1
y_max = second_feature.max() + 1

# Grid with a 0.1 step on which the classifier is evaluated.
grid_step = 0.1
xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_step),
                     np.arange(y_min, y_max, grid_step))

plt.plot()
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf.predict(grid_points)
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.rainbow)
plt.scatter(first_feature, second_feature, c=y, alpha=1,
            cmap=plt.cm.YlOrRd)
plt.title('Decision Tree')
plt.xlabel('Petal.Length')
plt.ylabel('Petal.Width')
#test_data = r"/home/pgupta/Dropbox/Shared with Parth/test-dssm.only2.svm"; trainX, trainY = load_svmlight_file(train_data); testX, testY = load_svmlight_file(test_data); train_set = trainX.toarray(); test_set = testX.toarray(); trainY = [int(round(trainY[i])) for i in xrange(len(trainY))] testY = [int(round(testY[i] )) for i in xrange(len(testY))] n_features=24 num_estimators = 20 ##DTC clf = tree.DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0) clf.fit(train_set, trainY) y_predicted = clf.predict( train_set ) score_train = clf.score( train_set, trainY ) y_predicted = clf.predict( test_set ) score_test = clf.score( test_set, testY ) print ("DTC") print (precision_recall_fscore_support(testY, y_predicted, average='binary')) ##RFC clf = ensemble.RandomForestClassifier(n_estimators=num_estimators, max_features = 5, max_depth=None, min_samples_split=1,
diag_map = {'B': 'benign', 'M': 'malignant'} df['diagnosis'] = df['diagnosis'].map(diag_map) labs = df['diagnosis'] df.drop(['Unnamed: 32', 'id', 'diagnosis'], axis=1, inplace=True) #Split into train and test and fit decision trees output_dir = output + 'decision_trees/' if not os.path.exists(output_dir): os.makedirs(output_dir) kf = KFold(n_splits=5, shuffle=True, random_state=42) fold_accuracy = [] for train_indices, test_indices in kf.split(df): X_train, X_test = df.iloc[train_indices], df.iloc[test_indices] Y_train, Y_test = labs[train_indices], labs[test_indices] tree_model = tree.DecisionTreeClassifier(random_state=42) tree_model.fit(X_train, Y_train) preds = tree_model.predict(X_test) accuracy = round((sum(preds == Y_test) / len(Y_test)) * 100, 3) print(' '.join([ 'Fold', str(len(fold_accuracy) + 1), 'Accuracy:', str(accuracy) + '%' ])) fname = ' '.join(['Decision Tree Fold', str(len(fold_accuracy) + 1)]) with (open(output_dir + fname + '.dot', 'w')) as f: export_graphviz(tree_model, out_file=f, filled=True, rounded=True,
from sklearn import tree clf = tree.DecisionTreeClassifier(criterion="entropy", min_impurity_split=0.02, min_samples_split=370) ll = [] tcpORudp = [] with open('traceDMA.txt') as f: content = f.readlines() # you may also want to remove whitespace characters like `\n` at the end of each line content = [x.strip() for x in content] for i in range(len(content)): l = [] a, b, c = content[i].split() #print(a) l.append(a) #print(b) l.append(b) #print(c) ll.append(l) tcpORudp.append(c) print("XXXXXXXXX") #print (ll) #print (tcpORudp ) clf = clf.fit(ll, tcpORudp) #prediction = clf.predict([[80, 1480]]) #print(prediction) import graphviz featurenames = ["PORT", "SIZE"]
def decision_tree_classifier(train_x, train_y):
    """Return a default scikit-learn decision tree fitted on the given data.

    Args:
        train_x: training features, passed straight to ``fit``.
        train_y: training targets, passed straight to ``fit``.
    """
    from sklearn.tree import DecisionTreeClassifier

    # fit() returns the estimator itself.
    return DecisionTreeClassifier().fit(train_x, train_y)
def main(multi_mode='ovo', winL=90, winR=90, do_preprocess=True, use_weight_class=True, maxRR=True, use_RR=True, norm_RR=True, compute_morph={''}, oversamp_method='', pca_k='', feature_selection='', do_cross_val='', C_value=0.001, gamma_value=0.0, reduced_DS=False, leads_flag=[1, 0]): print("Runing train_SVM.py!") # db_path = '/home/mondejar/dataset/ECG/mitdb/m_learning/scikit/' db_path = 'C:/Users/Matteo/Desktop/data_mining_prog/mit-bih-database/m_learning/scikit/' # Load train data [tr_features, tr_labels, tr_patient_num_beats] = load_mit_db('DS1', winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag) # Load Test data [eval_features, eval_labels, eval_patient_num_beats] = load_mit_db('DS2', winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag) if reduced_DS == True: np.savetxt('mit_db/' + 'exp_2_' + 'DS2_labels.csv', eval_labels.astype(int), '%.0f') else: np.savetxt('mit_db/' + 'DS2_labels.csv', eval_labels.astype(int), '%.0f') # if reduced_DS == True: # np.savetxt('mit_db/' + 'exp_2_' + 'DS1_labels.csv', tr_labels.astype(int), '%.0f') # else: # np.savetxt('mit_db/' + 'DS1_labels.csv', tr_labels.astype(int), '%.0f') ############################################################## # 0) TODO if feature_Selection: # before oversamp!!????? # TODO perform normalization before the oversampling? 
if oversamp_method: # Filename oversamp_features_pickle_name = create_oversamp_name(reduced_DS, do_preprocess, compute_morph, winL, winR, maxRR, use_RR, norm_RR, pca_k) # Do oversampling tr_features, tr_labels = perform_oversampling(oversamp_method, db_path + 'oversamp/python_mit', oversamp_features_pickle_name, tr_features, tr_labels) # Normalization of the input data # scaled: zero mean unit variance ( z-score ) scaler = StandardScaler() scaler.fit(tr_features) tr_features_scaled = scaler.transform(tr_features) # scaled: zero mean unit variance ( z-score ) eval_features_scaled = scaler.transform(eval_features) ############################################################## # 0) ????????????? feature_Selection: also after Oversampling??? if feature_selection: print("Runing feature selection") best_features = 7 tr_features_scaled, features_index_sorted = run_feature_selection(tr_features_scaled, tr_labels, feature_selection, best_features) eval_features_scaled = eval_features_scaled[:, features_index_sorted[0:best_features]] # 1) if pca_k > 0: # Load if exists?? # NOTE PCA do memory error! # NOTE 11 Enero: TEST WITH IPCA!!!!!! start = time.time() print("Runing IPCA " + str(pca_k) + "...") # Run PCA IPCA = sklearn.decomposition.IncrementalPCA(pca_k, batch_size=pca_k) # gamma_pca # tr_features_scaled = KPCA.fit_transform(tr_features_scaled) IPCA.fit(tr_features_scaled) # Apply PCA on test data! tr_features_scaled = IPCA.transform(tr_features_scaled) eval_features_scaled = IPCA.transform(eval_features_scaled) """ print("Runing TruncatedSVD (singular value decomposition (SVD)!!!) 
(alternative to PCA) " + str(pca_k) + "...") svd = decomposition.TruncatedSVD(n_components=pca_k, algorithm='arpack') svd.fit(tr_features_scaled) tr_features_scaled = svd.transform(tr_features_scaled) eval_features_scaled = svd.transform(eval_features_scaled) """ end = time.time() print("Time runing IPCA (rbf): " + str(format(end - start, '.2f')) + " sec") ############################################################## # 2) Cross-validation: if do_cross_val: print("Runing cross val...") start = time.time() # TODO Save data over the k-folds and ranked by the best average values in separated files perf_measures_path = create_svm_model_name( 'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/' + multi_mode, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '/') # TODO implement this method! check to avoid NaN scores.... if do_cross_val == 'pat_cv': # Cross validation with one fold per patient cv_scores, c_values = run_cross_val(tr_features_scaled, tr_labels, tr_patient_num_beats, do_cross_val, len(tr_patient_num_beats)) if not os.path.exists(perf_measures_path): os.makedirs(perf_measures_path) np.savetxt(perf_measures_path + '/cross_val_k-pat_cv_F_score.csv', (c_values, cv_scores.astype(float)), "%f") elif do_cross_val == 'beat_cv': # cross validation by class id samples k_folds = {5} for k in k_folds: ijk_scores, c_values = run_cross_val(tr_features_scaled, tr_labels, tr_patient_num_beats, do_cross_val, k) # TODO Save data over the k-folds and ranked by the best average values in separated files perf_measures_path = create_svm_model_name( 'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/' + multi_mode, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '/') if not os.path.exists(perf_measures_path): 
os.makedirs(perf_measures_path) np.savetxt(perf_measures_path + '/cross_val_k-' + str(k) + '_Ijk_score.csv', (c_values, ijk_scores.astype(float)), "%f") end = time.time() print("Time runing Cross Validation: " + str(format(end - start, '.2f')) + " sec") else: ################################################################################################ # 3) Train models models_path_randomForest = db_path + 'models/' + 'random_forest/'+multi_mode + '_rbf' models_path_kNN = db_path + 'models/' + 'kNN/' + multi_mode + '_rbf' models_path_c45 = db_path + 'models/' + 'c45/' + multi_mode + '_rbf' models_path_randomForest=create_svm_model_name(models_path_randomForest, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '_') models_path_kNN=create_svm_model_name(models_path_kNN, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '_') models_path_c45=create_svm_model_name(models_path_c45, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '_') if os.path.isfile(models_path_randomForest): # Load the trained model! randomForest = joblib.load(models_path_randomForest) else: print("Training model on MIT-BIH DS1: " + models_path_randomForest + "...") randomForest = RandomForestRegressor(n_estimators=100,random_state=42) start = time.time() randomForest.fit(tr_features_scaled, tr_labels) end = time.time() print("Trained completed!\n\t" + models_path_randomForest+ "\n \ \tTime required: " + str(format(end - start, '.2f')) + " sec") # Export model: save/write trained SVM model joblib.dump(randomForest, models_path_randomForest) if os.path.isfile(models_path_kNN): # Load the trained model! 
kNN = joblib.load(models_path_kNN) else: print("Training model on MIT-BIH DS1: " + models_path_kNN + "...") kNN = KNeighborsClassifier(n_neighbors=5) start = time.time() kNN.fit(tr_features_scaled, tr_labels) end = time.time() print("Trained completed!\n\t" + models_path_kNN + "\n \ \tTime required: " + str(format(end - start, '.2f')) + " sec") # Export model: save/write trained SVM model joblib.dump(kNN, models_path_kNN) if os.path.isfile(models_path_c45): # Load the trained model! c45 = joblib.load(models_path_c45) else: print("Training model on MIT-BIH DS1: " + models_path_c45 + "...") c45 = tree.DecisionTreeClassifier() start = time.time() c45.fit(tr_features_scaled, tr_labels) end = time.time() print("Trained completed!\n\t" + models_path_c45+ "\n \ \tTime required: " + str(format(end - start, '.2f')) + " sec") # Export model: save/write trained SVM model joblib.dump(c45, models_path_c45) # 4) Test SVM model print("Testing model on MIT-BIH DS2: " + models_path_randomForest + "...") # Evaluate the model on the training data perf_measures_path = create_svm_model_name( 'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/randomForest/' + multi_mode, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '/') print("Evaluation of randon forest on DS1 ...") eval_other_model(randomForest, tr_features_scaled, tr_labels, perf_measures_path,'Training') print("Evaluation of randon forest on DS2 ...") eval_other_model(randomForest, eval_features_scaled, eval_labels, perf_measures_path, 'Testing') perf_measures_path = create_svm_model_name( 'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/kNN/' + multi_mode, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '/') print("Evaluation of kNN on DS1 ...") 
eval_other_model(kNN, tr_features_scaled, tr_labels, perf_measures_path, 'Training') print("Evaluation of kNN forest on DS2 ...") eval_other_model(kNN, eval_features_scaled, eval_labels, perf_measures_path, 'Testing') perf_measures_path = create_svm_model_name( 'C:/Users/Matteo/Desktop/data_mining_prog/ecg-classification-master/python/results/c45/' + multi_mode, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '/') print("Evaluation of randon forest on DS1 ...") eval_other_model(c45, tr_features_scaled, tr_labels, perf_measures_path, 'Training') print("Evaluation of randon forest on DS2 ...") eval_other_model(c45, eval_features_scaled, eval_labels, perf_measures_path, 'Testing') ''' # TODO load best params from cross validation! use_probability = False model_svm_path = db_path + 'svm_models/' + multi_mode + '_rbf' model_svm_path = create_svm_model_name(model_svm_path, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '_') if gamma_value != 0.0: model_svm_path = model_svm_path + '_C_' + str(C_value) + '_g_' + str(gamma_value) + '.joblib.pkl' else: model_svm_path = model_svm_path + '_C_' + str(C_value) + '.joblib.pkl' print("Training model on MIT-BIH DS1: " + model_svm_path + "...") if os.path.isfile(model_svm_path): # Load the trained model! 
svm_model = joblib.load(model_svm_path) else: class_weights = {} for c in range(4): class_weights.update({c: len(tr_labels) / float(np.count_nonzero(tr_labels == c))}) # class_weight='balanced', if gamma_value != 0.0: # NOTE 0.0 means 1/n_features default value svm_model = svm.SVC(C=C_value, kernel='rbf', degree=3, gamma=gamma_value, coef0=0.0, shrinking=True, probability=use_probability, tol=0.001, cache_size=200, class_weight=class_weights, verbose=False, max_iter=-1, decision_function_shape=multi_mode, random_state=None) else: svm_model = svm.SVC(C=C_value, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=use_probability, tol=0.001, cache_size=200, class_weight=class_weights, verbose=False, max_iter=-1, decision_function_shape=multi_mode, random_state=None) # Let's Train! start = time.time() svm_model.fit(tr_features_scaled, tr_labels) end = time.time() # TODO assert that the class_ID appears with the desired order, # with the goal of ovo make the combinations properly print("Trained completed!\n\t" + model_svm_path + "\n \ \tTime required: " + str(format(end - start, '.2f')) + " sec") # Export model: save/write trained SVM model joblib.dump(svm_model, model_svm_path) # TODO Export StandardScaler() ######################################################################### ''' '''
# Score the pipeline fitted earlier in the file against X_test.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

# <h1>Decision Tree</h1>

# In[55]:

from sklearn import tree

# Token counts -> TF-IDF weighting -> decision tree with default settings.
dt_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', tree.DecisionTreeClassifier()),
]
text_clf = Pipeline(dt_steps)

# ``fit`` returns the pipeline itself, so fitting and predicting can be chained.
predicted = text_clf.fit(X_train, y_train).predict(X_test)
print(metrics.classification_report(y_test, predicted))

# <h1>Random Forest</h1>

# In[56]:
test_Boosting = True
cm_plot = True
times_plot = True

# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=rs)

""" Decision tree - test decision for learning curve, model complexity, hyper parm tuning """

# Learning curve: scores and fit times for five training-set sizes.
if test_DT:
    clf_dt = tree.DecisionTreeClassifier(random_state=rs)
    train_sizes = np.linspace(0.1, 1.0, 5)
    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        clf_dt, X_train, y_train, cv=cv, n_jobs=-1,
        train_sizes=train_sizes, return_times=True)

    # Mean and standard deviation along axis 1 of each returned array.
    train_scores_mean, test_scores_mean, fit_times_mean = (
        np.mean(per_fold, axis=1)
        for per_fold in (train_scores, test_scores, fit_times))
    train_scores_std, test_scores_std, fit_times_std = (
        np.std(per_fold, axis=1)
        for per_fold in (train_scores, test_scores, fit_times))

    # Keep the decision-tree means under model-specific names.
    DT_train_mean, DT_test_mean, DT_fit_mean = (
        train_scores_mean, test_scores_mean, fit_times_mean)

    # plot learning curve
# Preprocessing: fill missing values with the column mean, then apply
# scikit-learn's normalize() followed by scale().
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_salida = imp.fit_transform(X_inicial)
Aprepro = preprocessing.normalize(X_salida)
Aprepro = preprocessing.scale(Aprepro)
aux1 = Aprepro

# Features: the preprocessed data without column index 20.
X = np.delete(aux1, 20, axis=1)

# Targets: the raw input with columns 0-19 removed. They come from
# `X_inicial`, so they are neither imputed nor scaled.
# NOTE(review): presumably column 20 is the label column -- confirm.
y = np.delete(X_inicial, np.arange(20), axis=1)

from sklearn import tree

clasificador = tree.DecisionTreeClassifier(criterion='entropy')
# First fit uses every sample; the fit on the training split below replaces it.
clasificador.fit(X, y)

# Test data (disabled):
# yp = pd.read_csv(r'test.csv')
# yp = datos.to_numpy()
# yp = np.delete(yp, 0, axis=1)

from sklearn import model_selection
from sklearn.metrics import confusion_matrix

# Hold out 33% of the samples, then refit the classifier on the rest.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33)
clasificador.fit(X_train, y_train)
# Number of test samples, reported in both "mislabeled" messages below.
n_test = x_test.shape[0]

# Misclassification count for the predictions in `y_pred`.
nb_errors = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d"
      % (n_test, nb_errors))

# Accuracy = correctly predicted samples / number of predictions.
correct = (y_test == y_pred).sum()
accuracy = correct / len(y_pred)
print('Accuracy NB:', accuracy * 100)

# Confusion matrix for the `gnb` model.
# NOTE(review): `metrics.plot_confusion_matrix` has been removed from newer
# scikit-learn releases -- confirm the installed version still provides it.
names = ['recurr', 'no-recurr']
metrics.plot_confusion_matrix(gnb, x_test, y_test, display_labels=names)

# Decision tree: Gini criterion with cost-complexity pruning (ccp_alpha).
# `fit` returns the estimator, so construction and training are chained.
clf = tree.DecisionTreeClassifier(
    criterion='gini', ccp_alpha=0.0075).fit(x_train, y_train)

# Predict the test split and derive the same two figures as above.
tree_pred = clf.predict(x_test)
corr = (y_test == tree_pred).sum()
accuracy_tree = corr / len(tree_pred)

tree_errors = (y_test != tree_pred).sum()
print("Number of mislabeled points out of a total %d points : %d"
      % (n_test, tree_errors))
print('Accuracy Tree:', accuracy_tree * 100)

# Confusion matrix for the decision tree.
metrics.plot_confusion_matrix(clf, x_test, y_test, display_labels=names)
import sklearn.metrics as metrics

# Iris data set with all features (a two-feature variant would be
# iris.data[:, [2, 3]]).
iris = datasets.load_iris()
X, y = iris.data, iris.target

from sklearn.preprocessing import StandardScaler

# Standardise the features; `fit` returns the scaler, so the calls chain.
# NOTE(review): the scaler is fitted on all samples before the split, so the
# test rows contribute to its statistics -- confirm this is intended.
sc = StandardScaler()
X = sc.fit(X).transform(X)

# split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(X_train, y_train)

# Predict each split once and reuse the result for all three reports.
train_pred = clf.predict(X_train)
test_pred = clf.predict(X_test)

# generate evaluation metrics
print("Train - Accuracy :", metrics.accuracy_score(y_train, train_pred))
print("Train - Confusion matrix :",
      metrics.confusion_matrix(y_train, train_pred))
print("Train - classification report :",
      metrics.classification_report(y_train, train_pred))

print("Test - Accuracy :", metrics.accuracy_score(y_test, test_pred))
print("Test - Confusion matrix :",
      metrics.confusion_matrix(y_test, test_pred))
print("Test - classification report :",
      metrics.classification_report(y_test, test_pred))

# Write the fitted tree to a Graphviz .dot file.
tree.export_graphviz(clf, out_file='tree.dot')