def tree(labels, X, df, i):
    tree = DT(max_depth=4)
    tree.fit(X, labels)
    impt = tree.feature_importances_
    para = tree.get_params()
    export_graphviz(tree, out_file=OUTPUT_DIRECTORY + str(i) + "_tree.dot",
                    feature_names=df.columns)
    return impt
def fit_sktree(path, index_filter=None, class_filter=None, feature_filter=None, folds=10, inverse=False, max_depth=10, min_samples_split=20, lc_filter=None): data = pd.read_csv(path, index_col=0) data, y = utils.filter_data(data, index_filter, class_filter, feature_filter, lc_filter) skf = cross_validation.StratifiedKFold(y, n_folds=folds) results = [] for train_index, test_index in skf: if inverse: aux = train_index train_index = test_index test_index = aux train_X, test_X = data.iloc[train_index], data.iloc[test_index] train_y, test_y = y.iloc[train_index], y.iloc[test_index] clf = None clf = DecisionTreeClassifier(criterion='entropy', max_depth=max_depth, min_samples_split=min_samples_split) clf.fit(train_X, train_y) results.append(metrics.predict_table(clf, test_X, test_y)) return pd.concat(results)
def get_clfs(rank, Nfeatures=20, Nscores=10):
    """Train a decision tree on a chunk of the data; return the fitted classifier, the rank and the in-sample log loss."""
    df = pd.read_csv('data/train_%d.csv' % rank, names=headers)
    print rank, df.shape
    np.random.seed(rank)
    fselect = np.random.choice(range(2, Nscores), Nfeatures, replace=False)
    print rank, fselect
    indexes = np.array(scores_indexes)[fselect]
    Nr, Nc = df.shape
    Nf = len(indexes)
    X = np.zeros([Nr, Nf + 1])
    y = np.zeros([Nr])
    get_X_y(X, y, df, features_touples, indexes)
    print rank, 'Xy read'
    del df
    if rank == 0:
        print 'Size of numpy array in GB:', X.nbytes / 1.e9
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y)
    y_pred = clf.predict_proba(X)
    etmp = log_loss(y, y_pred)
    del X, y
    print 'IN error on rank:', rank, 'is', etmp
    return (clf, rank, etmp)
def decision_tree_entropy(training_data):
    clf = DecisionTreeClassifier(criterion="entropy", random_state=0)
    clf.fit(training_data[0], training_data[1])
    # with open("/media/deeksha/e/Deeksha/Dropbox/Coursework/MachineLearning/HW3/entropy.dot", 'w') as f:
    #     f = tree.export_graphviz(clf, out_file=f)
    print "entropy:Number of Nodes", clf.tree_.node_count
    return clf
def quize1(data):
    # 1. Load the sample from the file titanic.csv using the pandas package.
    # 2. Keep four features: passenger class (Pclass), ticket fare (Fare), age (Age) and sex (Sex).
    # 3. Note that the Sex feature has string values.
    # 4. Extract the target variable -- it is stored in the Survived column.
    # 5. The data contain missing values -- for example, the age of some passengers is unknown.
    # 6. Such records become NaN when read into pandas.
    #    Find all objects that have missing features and remove them from the sample.
    # Train a decision tree with random_state=241 and the remaining parameters at their defaults.
    # Compute the feature importances and find the two features with the highest importance.
    # Their names are the answer to this task
    # (give the feature names separated by a comma or a space; the order does not matter).
    dataF = data[['Pclass', 'Fare', 'Age', 'Sex', 'Survived']]
    dataF = dataF.dropna()
    Y = dataF['Survived']
    dataF = dataF[['Pclass', 'Fare', 'Age', 'Sex']]
    clf = DecisionTreeClassifier(random_state=241)
    dataF.loc[dataF['Sex'] != 'male', 'Sex'] = 0
    dataF.loc[dataF['Sex'] == 'male', 'Sex'] = 1
    print(dataF)
    clf.fit(dataF, Y)
    importances = clf.feature_importances_
    print(importances)
    # d = zip(dataF.columns, clf.feature_importances_)
    # print(d)
    return
def test(train_feature,train_label,test_feature,test_label): from sklearn import metrics from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier # fit a CART model to the data model = DecisionTreeClassifier() from sklearn.svm import SVC # fit a SVM model to the data # model = SVC() # model = GaussianNB() # model = LogisticRegression() from sklearn.neighbors import KNeighborsClassifier # fit a k-nearest neighbor model to the data import time currenttime = time.time() # model = KNeighborsClassifier() model.fit(train_feature, train_label) print(model) # make predictions expected = test_label predicted = model.predict(test_feature) # summarize the fit of the model print(metrics.classification_report(expected, predicted)) print(metrics.confusion_matrix(expected, predicted)) print(metrics.accuracy_score(expected,predicted))
class Transformer: def __init__(self, use_PCA=True): self._clf = DecisionTreeClassifier(min_samples_leaf=10) self._idx = None self._scaler = StandardScaler() self._trans = PCA('mle') self._use_PCA = use_PCA def fit(self, X, y): X = np.array(X) self._clf.fit(X, y) self._idx = filter(lambda x: self._clf.feature_importances_[x] > 0, \ range(len(self._clf.feature_importances_))) new_set = [X[i][self._idx] for i in xrange(len(X))] # new_set = self._scaler.fit_transform(new_set) if self._use_PCA: new_set = self._trans.fit_transform(new_set) return new_set def transform(self, features): features = features[self._idx] # features = self._scaler.transform(features.astype(float)) if self._use_PCA: features = self._trans.transform(features) return features
def Train(self, X, y):
    N = X.shape[0]
    New = np.zeros(N)
    for i in self.names:
        A = []
        C = []
        weight = np.ones(N) / N
        New[np.where(y == i)[0]] = 1
        New[np.where(y != i)[0]] = -1
        for j in range(self.itr):
            # Input a generic sklearn classifier
            clf = DecisionTreeClassifier(max_depth=self.dep)
            clf.fit(X, New, sample_weight=weight)
            Pre = clf.predict(X)
            err = weight.dot((New != Pre).astype(int))
            if (err != 0):
                A.append(.5 * np.log((1 - err) / err))
            else:
                A.append(1)
            C.append(clf)
            weight *= np.exp(-A[j] * New * Pre)
            weight = weight / np.sum(weight)  # renormalize the sample weights for the next round
        self.C.append(C)
        self.A.append(A)
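# The Train method above stores, for every class name, a list of weak trees C and their
# boosting weights A. The following is a hypothetical companion sketch (not part of the
# original class) showing how such per-class ensembles could be combined at prediction
# time, assuming the standard one-vs-rest AdaBoost rule: sum the weighted +/-1 votes per
# class and return the class with the largest score. numpy is assumed imported as np.
def predict_one_vs_rest(self, X):
    names = list(self.names)
    scores = np.zeros((X.shape[0], len(names)))
    for k, (A, C) in enumerate(zip(self.A, self.C)):
        # accumulate the weighted votes of this class's weak trees
        for alpha, clf in zip(A, C):
            scores[:, k] += alpha * clf.predict(X)
    best = np.argmax(scores, axis=1)
    return np.array([names[b] for b in best])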
def decision_tree(train_bow, train_labels, test_bow, test_labels, bow_indexes):
    print("Training decision tree")
    dt_classifier = DecisionTreeClassifier()
    dt_classifier.fit(train_bow, train_labels)
    print("Testing decision tree")
    test(dt_classifier, "dt", test_bow, test_labels, bow_indexes)
def test_scoring(): X, y = iris_data() clf1 = LogisticRegression(random_state=1, solver='liblinear', multi_class='ovr') clf2 = DecisionTreeClassifier(random_state=1) X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=0.5, random_state=123) score1 = clf1.fit(X_train, y_train).score(X_test, y_test) score2 = clf2.fit(X_train, y_train).score(X_test, y_test) assert round(score1, 2) == 0.96, round(score1, 2) assert round(score2, 2) == 0.91, round(score2, 2) t, p = paired_ttest_kfold_cv(estimator1=clf1, estimator2=clf2, X=X, y=y, scoring='accuracy', random_seed=1) assert round(t, 3) == -1.861, t assert round(p, 3) == 0.096, p t, p = paired_ttest_kfold_cv(estimator1=clf1, estimator2=clf2, X=X, y=y, scoring='recall_micro', random_seed=1) assert round(t, 3) == -1.861, t assert round(p, 3) == 0.096, p
def evaluateDecisionTree(train_x, train_y, test_x, test_y):
    clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=5, max_depth=20)
    clf.fit(train_x, train_y)
    p = clf.predict_proba(test_x)[:, 1]
    auc = roc_auc_score(test_y, p)
    plotAUC(test_y, clf.predict_proba(test_x)[:, 1], 'DT')
    return auc
def test_graphviz_errors(): """Check for errors of export_graphviz""" clf = DecisionTreeClassifier(max_depth=3, min_samples_split=1) clf.fit(X, y) out = StringIO() assert_raises(IndexError, export_graphviz, clf, out, feature_names=[])
def test_importances(): """Check variable importances.""" X, y = datasets.make_classification(n_samples=2000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) for name, Tree in CLF_TREES.items(): clf = Tree(random_state=0) clf.fit(X, y) importances = clf.feature_importances_ n_important = np.sum(importances > 0.1) assert_equal(importances.shape[0], 10, "Failed with {0}".format(name)) assert_equal(n_important, 3, "Failed with {0}".format(name)) X_new = clf.transform(X, threshold="mean") assert_less(0, X_new.shape[1], "Failed with {0}".format(name)) assert_less(X_new.shape[1], X.shape[1], "Failed with {0}".format(name)) # Check on iris that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) clf.fit(iris.data, iris.target) clf2 = DecisionTreeClassifier(random_state=0, max_leaf_nodes=len(iris.data)) clf2.fit(iris.data, iris.target) assert_array_equal(clf.feature_importances_, clf2.feature_importances_)
def buildTree(options, treefile, dataFile = None): dt = loadTree(treefile) if dt is not None: return dt if dataFile is None: raise ValueError("No data file specified") dt = DecisionTreeClassifier(min_samples_split=20, random_state=99) files = [] featureFrames = [] targetFrames = [] if os.path.isdir(dataFile): files = getFiles(dataFile, ".csv") else: files.append(dataFile) for _file in files: print("Loading data %s" % _file) (featureValues, targetValues, features, df) = loadData(_file, options) featureFrames.append(featureValues) targetFrames.append(targetValues) dt.fit(pd.concat(featureFrames), pd.concat(targetFrames)) saveTree(treefile, dt) print("Building graph") visualize_tree(treefile, dt, features) return dt
def train_adaboost(features, labels, learning_rate, n_lab, n_runs, n_estim, n_samples): uniqLabels = np.unique(labels) print 'Taking ', str(n_lab), ' labels' uniqLabels = uniqLabels[:n_lab] used_labels = uniqLabels pbar = start_progressbar(len(uniqLabels), 'training adaboost for %i labels' %len(uniqLabels)) allLearners = [] for yy ,targetLab in enumerate(uniqLabels): runs=[] for rrr in xrange(n_runs): #import ipdb;ipdb.set_trace() feats,labs = get_binary_sets(features, labels, targetLab, n_samples) #print 'fitting stump' #import ipdb;ipdb.set_trace() baseClf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=10, min_samples_split=10) baseClf.fit(feats, labs) ada_real = AdaBoostClassifier( base_estimator=baseClf, learning_rate=learning_rate, n_estimators=n_estim, algorithm="SAMME.R") #import ipdb;ipdb.set_trace() runs.append(ada_real.fit(feats, labs)) allLearners.append(runs) update_progressbar(pbar, yy) end_progressbar(pbar) return allLearners, used_labels
def plot_tree(max_depth=1): fig, ax = plt.subplots(1, 2, figsize=(15, 7)) h = 0.02 x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) if max_depth != 0: tree = DecisionTreeClassifier(max_depth=max_depth, random_state=1).fit(X, y) Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] Z = Z.reshape(xx.shape) faces = tree.tree_.apply(np.c_[xx.ravel(), yy.ravel()].astype(np.float32)) faces = faces.reshape(xx.shape) border = ndimage.laplace(faces) != 0 ax[0].contourf(xx, yy, Z, alpha=.4) ax[0].scatter(xx[border], yy[border], marker='.', s=1) ax[0].set_title("max_depth = %d" % max_depth) ax[1].imshow(tree_image(tree)) ax[1].axis("off") else: ax[0].set_title("data set") ax[1].set_visible(False) ax[0].scatter(X[:, 0], X[:, 1], c=np.array(['b', 'r'])[y], s=60) ax[0].set_xlim(x_min, x_max) ax[0].set_ylim(y_min, y_max) ax[0].set_xticks(()) ax[0].set_yticks(())
def decision_tree_prediction(features_train, labels_train, features_test, ids): X_train, X_test, y_train, y_test = cross_validation.train_test_split(features_train, labels_train, random_state=1301, stratify=labels_train, test_size=0.3) clf = DecisionTreeClassifier(criterion='gini', min_samples_split=10, max_depth=10, max_leaf_nodes=16, max_features=2) #clf_acc = clf.fit(X_train, y_train) # print(clf.best_estimator_) #feature_importance = clf.feature_importances_ #print (feature_importance) #pred = clf_acc.predict_proba(X_test)[:,1] #print (y_test, pred) # acc = accuracy_score(y_test, pred) # print ("Acc {}".format(acc)) clf = clf.fit(features_train, labels_train) pred = clf.predict_proba(features_test)[:,1] predictions_file = open("data/canivel_decision_tree.csv", "wb") predictions_file_object = csv.writer(predictions_file) predictions_file_object.writerow(["ID", "TARGET"]) predictions_file_object.writerows(zip(ids, pred)) predictions_file.close()
def calculate_single_tree(noisy_fold_single_tree, folds_single_tree):
    accuracy = 0.0
    clf = DecisionTreeClassifier(criterion='entropy', splitter='best', min_samples_split=49)
    for k in range(0, len(noisy_fold_single_tree)):
        learn_group_x_single_tree = get_union_of_all_but_i(noisy_fold_single_tree, k)
        learn_group_y_single_tree = []
        for l in learn_group_x_single_tree:
            learn_group_y_single_tree.append(l.pop())
        curr_tree_single_tree = clf.fit(learn_group_x_single_tree, learn_group_y_single_tree)
        num_of_success = 0
        for m in folds_single_tree[k]:
            ans = m.pop()
            tree_ans = curr_tree_single_tree.predict([m])
            m.append(ans)
            if ans == tree_ans:
                num_of_success += 1
        for l in learn_group_x_single_tree:
            l.append(learn_group_y_single_tree.pop(0))
        accuracy += num_of_success / (float(len(folds_single_tree[k])))
    accuracy /= float(len(noisy_fold_single_tree))
    print('single tree. acc: {}'.format(accuracy))
class TreeClassifier(Classifier):
    def __init__(self, min_samples_split=20, random_state=99):
        self.classifier = DecisionTreeClassifier(min_samples_split=min_samples_split,
                                                 random_state=random_state)

    def do_train(self, X, y):
        self.classifier.fit(X, y)

    def do_classification(self, X, y):
        # label-based column slice: assumes X supports slicing by column label (e.g. a DataFrame via .loc)
        self.classifier.predict(X[:, 'age':'thal'])
        print('wtf')


def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f, feature_names=feature_names)
    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except Exception as e:
        print(e)
        exit("Could not run dot, ie graphviz, to produce visualization")
def train(self, X, Y):
    N, D = X.shape
    for t in xrange(self.boostrap_sample):
        sampleX, sampleY = self.get_sample(X, Y)
        clf = DecisionTreeClassifier(criterion="entropy", max_depth=1)
        clf.fit(sampleX, sampleY)
        self.weak_clfs.append(clf)
def decisionTree(dataTrain,featuresTrain,dataTest,featuresTest,filename='result'): criterion=['gini','entropy'] splitter=['best','random'] max_features=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,None,"log2","sqrt"] max_accuracy=0.0 recall_score=0.0 best_clf=None for param in product(criterion,splitter,max_features): print("\n========================================================") clf=DecisionTreeClassifier(criterion=param[0],splitter=param[1],max_features=param[2]) result=clf.fit(dataTrain,featuresTrain) print result print result.score(dataTest,featuresTest) resultFeatures=clf.predict(dataTest) accuracy=metrics.accuracy_score(featuresTest,resultFeatures) recallu=metrics.recall_score(featuresTest,resultFeatures) print accuracy,recallu if accuracy > max_accuracy: max_accuracy=accuracy recall_score=recallu best_clf=result predict_features=resultFeatures print("\n========================================================") print ("\n Get result") print ("Best accuracy is %0.6f\nBest paramters is %s" % (max_accuracy,best_clf)) fd = open("DT"+filename+".txt",'a') fd.write("Best accuracy and recall is %0.6f\t%0.6f\nBest paramters is %s" % (max_accuracy,recall_score,best_clf)) fd.close()
def main():
    data = run_game()
    clf = DecisionTreeClassifier(criterion='entropy')
    game_data = [[i[0], i[1]] for i in data]
    profits = [i[2] for i in data]
    clf.fit(game_data, profits)
    with open('tree.dot', 'w') as dotfile:
        export_graphviz(
            clf,
            dotfile,
            feature_names=['coin', 'bet']
        )
    # predict expects a 2-D array of samples, hence the nested lists
    predictions_lose1 = [clf.predict([[0, 0]]) for x in xrange(100)]
    predictions_lose2 = [clf.predict([[0, 1]]) for x in xrange(100)]
    predictions_win = [clf.predict([[1, 1]]) for x in xrange(100)]
    print 'All these profit predictions should be zero:'
    print predictions_lose1
    print 'Accuracy was', calculate_accuracy(predictions_lose1, np.array([0]))
    print 'All these profit predictions should be zero:'
    print predictions_lose2
    print 'Accuracy was', calculate_accuracy(predictions_lose2, np.array([0]))
    print 'All these profit predictions should be two:'
    print predictions_win
    print 'Accuracy was', calculate_accuracy(predictions_win, np.array([2]))
class MultEstimator(BaseEstimator): def __init__(self, categories): self.categories = categories def fit(self, X, y, **params): self.models = {_: None for _ in self.categories} self.tot_model = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100) categ = X[:, -1] data = X[:, :-1] self.tot_model.fit(data, y) for c in self.models.keys(): mask = categ == c m = DecisionTreeClassifier(max_depth=8, min_samples_leaf=100) m.fit(data[mask], y[mask]) self.models[c] = m def predict(self, X): categ = X[:, -1] data = X[:, :-1] p = self.tot_model.predict(data) for c in self.models.keys(): mask = categ == c if mask.any(): p[mask] = self.models[c].predict(data[mask]) return p def predict_proba(self, X): categ = X[:, -1] data = X[:, :-1] p = self.tot_model.predict_proba(data) for c in self.models.keys(): mask = categ == c if mask.any(): p[mask] = self.models[c].predict_proba(data[mask]) return p
def programmer_2():
    datafile = 'data/model.xls'
    data = pd.read_excel(datafile)
    data = data.values  # .as_matrix() was removed in newer pandas versions
    shuffle(data)  # shuffle the data randomly

    # split the data into training and test sets at a ratio of 8:2
    p = 0.8
    train = data[:int(len(data) * p), :]
    test = data[int(len(data) * p):, :]

    # build the CART decision tree model
    treefile = 'tmp/tree.pkl'
    tree = DecisionTreeClassifier()
    tree.fit(train[:, :3], train[:, 3])
    joblib.dump(tree, treefile)

    cm_plot(train[:, 3], tree.predict(train[:, :3])).show()  # show the confusion matrix visualization
    # Note that scikit-learn's predict method returns the predicted classes directly.

    fpr, tpr, thresholds = roc_curve(
        test[:, 3], tree.predict_proba(test[:, :3])[:, 1], pos_label=1)
    plt.plot(fpr, tpr, linewidth=2, label='ROC of CART', color='green')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # set the axis limits
    plt.ylim(0, 1.05)
    plt.xlim(0, 1.05)
    plt.legend(loc=4)
    plt.show()
    print(thresholds)
def test_graphviz_errors(): # Check for errors of export_graphviz clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2) # Check not-fitted decision tree error out = StringIO() assert_raises(NotFittedError, export_graphviz, clf, out) clf.fit(X, y) # Check if it errors when length of feature_names # mismatches with number of features message = ("Length of feature_names, " "1 does not match number of features, 2") assert_raise_message(ValueError, message, export_graphviz, clf, None, feature_names=["a"]) message = ("Length of feature_names, " "3 does not match number of features, 2") assert_raise_message(ValueError, message, export_graphviz, clf, None, feature_names=["a", "b", "c"]) # Check class_names error out = StringIO() assert_raises(IndexError, export_graphviz, clf, out, class_names=[]) # Check precision error out = StringIO() assert_raises_regex(ValueError, "should be greater or equal", export_graphviz, clf, out, precision=-1) assert_raises_regex(ValueError, "should be an integer", export_graphviz, clf, out, precision="1")
def text_learning_experiment(words_to_remove=[]): from_sara = open("../text_learning/from_sara.txt", "r") from_chris = open("../text_learning/from_chris.txt", "r") word_data, authors = vectorize_emails(from_sara, from_chris, max_emails=300, words_to_remove=words_to_remove) features_train, features_test, labels_train, labels_test = \ cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42) vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english') features_train = vectorizer.fit_transform(features_train) features_test = vectorizer.transform(features_test).toarray() features_train = features_train[:150].toarray() labels_train = labels_train[:150] clf = DecisionTreeClassifier() clf.fit(features_train, labels_train) predict_train = clf.predict(features_train) predict_test = clf.predict(features_test) print "train acc:", accuracy_score(labels_train, predict_train) print "test acc: ", accuracy_score(labels_test, predict_test) feature_index = np.argmax(clf.feature_importances_) feature_importance = clf.feature_importances_[feature_index] feature_name = vectorizer.get_feature_names()[feature_index] print "Most important feature, and relative importance:", feature_name, ":", feature_importance return feature_name, feature_importance
def decision_trees(features, labels): classifier = DecisionTreeClassifier(random_state=0, criterion="entropy") classifier.fit(features, labels) scores = cross_validation.cross_val_score( classifier, features, labels, cv=10, score_func=metrics.precision_recall_fscore_support ) print_table("Decision Trees", numpy.around(numpy.mean(scores, axis=0), 2))
def main(percentage): """Given a percentage for splitting the dataset, fit the training set and apply the rest as a test set.""" df = pd.read_csv('cellStrength.log') df.drop('SSID', 1, inplace=True) processed = preprocess(df) location_col = processed[0].shape[1]-4 hash_to_location = {y:x for x,y in processed[1].items()} df2, targets = encode_target(processed[0], location_col) msk = np.random.rand(len(df)) < percentage test = df2[~msk].copy() train = df2[msk].copy() open('golden.csv', 'w').write(','.join([hash_to_location[p] for p in test['Target'].tolist()]) + '\n' ) test.drop(186, 1, inplace=True) test.drop('Target', 1, inplace=True) features = list(df2.columns[:location_col]) + list(df2.columns[location_col+1:-1]) y = train['Target'] X = train[features] dt = DecisionTreeClassifier(min_samples_split=3, random_state=99) try: dt.fit(X, y) except ValueError: return predictions = dt.predict(test).tolist() open('golden.csv', 'a').write(','.join([hash_to_location[p] for p in predictions])) # get_code(dt, features, targets) return get_accuracy('golden.csv')
def train_dtc(X, y):
    """ Create and train the Decision Tree Classifier. """
    dtc = DecisionTreeClassifier()
    dtc.fit(X, y)
    return dtc
def main(args): exec "import main.pandas_talib.sig_%s as conf" % args.signame build.work2(20, 'sp500Top50', args.signame) df = base.get_merged(conf.__name__, yeod.get_sp500Top50()) df.to_csv("ta.csv") tree = DecisionTreeClassifier() feat_names = base.get_feat_names(df) dfTrain = df[(df.date>='1970-01-01') & (df.date <='2009-12-31')] npTrainFeat = dfTrain.loc[:,feat_names].values.copy() npTrainLabel = dfTrain.loc[:,"label5"].values.copy() npTrainLabel[npTrainLabel > 1.0] = 1 npTrainLabel[npTrainLabel < 1.0] = 0 tree.fit(npTrainFeat, npTrainLabel) joblib.dump(tree, "tree.pkl", compress = 3) dfTest = df[(df.date>='2010-01-01') & (df.date <='2099-12-31')] npTestFeat = dfTest.loc[:, feat_names].values.copy() npPred = tree.predict_proba(npTestFeat) dfTest.loc[:,"pred"] = npPred[:,1] print dfTest['pred'].head() dfPos = dfTest[ dfTest['pred'] > 0.55 ] print 1.0 * len(dfPos[dfPos['label5']>1]) / len(dfPos) print 1.0 * len(dfTest[dfTest['label5']>1]) / len(dfTest)
x__train, x__test, y__train, y__test = train_test_split(x_train, y_train, test_size=0.25, random_state=0) x__train.shape x__test.shape y__train.shape y__test.shape #Decision tree from sklearn import tree from sklearn.tree import DecisionTreeClassifier parameters = {'max_depth': [5], 'min_samples_split': [10, 30, 50]} decision_tree = DecisionTreeClassifier(criterion='gini') from sklearn.model_selection import GridSearchCV grid_search = GridSearchCV(decision_tree, parameters, n_jobs=-1, cv=3) grid_search.fit(x__train, y__train) print(grid_search.best_params_) predicted = grid_search.predict(x__test) from sklearn.metrics import accuracy_score accuracy = accuracy_score(y__test, predicted) * 100 print("Accuracy = {}".format(accuracy)) # In[4]: from sklearn import tree from sklearn.tree import DecisionTreeClassifier
y = iris.target
sns_plot = seaborn.countplot(y)
sns_plot.figure.savefig("rating.png")
plt.close()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

sns_plot = seaborn.countplot(y_test)
sns_plot.figure.savefig("rating_test.png")
plt.close()

sns_plot = seaborn.countplot(y_train)
sns_plot.figure.savefig("rating_train.png")
plt.close()

decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
decision_tree_predictions = decision_tree.predict(x_test)

cnf_matrix = confusion_matrix(y_test, decision_tree_predictions)
sns_plot = seaborn.heatmap(cnf_matrix, annot=True, center=0)
sns_plot.figure.savefig("cnf_matrix.png")
plt.close()

normalized_cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(
    axis=1)[:, np.newaxis]
sns_plot = seaborn.heatmap(normalized_cnf_matrix, annot=True, center=0)
sns_plot.figure.savefig("normalized_cnf_matrix.png")
plt.close()

print("decision tree precision: ",
# In[25]: # Train,Test Splitting of data from sklearn.model_selection import train_test_split X_trainset, X_testset, y_trainset, y_testset = train_test_split(X_del_zero, y_del_zero, test_size=0.3, random_state=3) # In[26]: # Prediction Using decision tree Algo Clf_dt = DecisionTreeClassifier(criterion="entropy", max_depth=4) print(Clf_dt) # it shows the default parameters Clf_dt.fit(X_trainset, y_trainset) predTree = Clf_dt.predict(X_testset) # In[27]: # Metrics and Accuracy from sklearn import metrics import matplotlib.pyplot as plt print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))
plt.show()

# First split the data, i.e. separate it into a training set and a test set
from sklearn.model_selection import train_test_split  # import sklearn's splitting utility to build the training and test sets
all_inputs = df[['alcohol', 'malic_acid', 'ash', 'alcalinity ash', 'magnesium']].values
all_species = df['species'].values
(X_train, X_test, Y_train, Y_test) = train_test_split(all_inputs, all_species, train_size=0.85, random_state=1)  # 85% of the data is used for training

# Train with the decision tree algorithm
from sklearn.tree import DecisionTreeClassifier  # import DecisionTreeClassifier from sklearn to build the decision tree
# Define a decision tree object
decision_tree_classifier = DecisionTreeClassifier()
# Train the model
model = decision_tree_classifier.fit(X_train, Y_train)
# Print the model's accuracy
print(decision_tree_classifier.score(X_test, Y_test))
# ~ print([[13.52,3.17,2.72,23.5,97],[12.42,2.55,2.27,22,90],[13.76,1.53,2.7,19.5,132]])  # test with 3 samples, i.e. use 3 rows as the model input
result = model.predict([[13.52, 3.17, 2.72, 23.5, 97], [12.42, 2.55, 2.27, 22, 90], [13.76, 1.53, 2.7, 19.5, 132]])
dict1 = {'Sauvignon': "赤霞珠", 'Syrah': "西拉", 'Zinfandel': "先粉黛"}
list2 = []
for i in result:
    word = dict1.get(i)
    list2.append(word)
print(list2)  # print the test result, i.e. the model's predictions
from sklearn.naive_bayes import ComplementNB
clf2 = ComplementNB()
clf2.fit(x_train, y_train)

print("\n", "GaussianNB:", nb.score(x_test, y_test),
      "\n", "MultinomialNB:", clf1.score(x_test, y_test),
      "\n", "ComplementNB:", clf2.score(x_test, y_test))
# GaussianNB was chosen as the most suitable model because its accuracy is the highest
predictionnb = nb.predict(x_test)
y_prednb = nb.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_prednb))
print(confusion_matrix(y_test, y_prednb))
print("GaussianNB")
print(classification_report(y_test, y_prednb))

#%% Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion="entropy", max_depth=None, min_samples_split=10, max_features=18, random_state=0)
dt = dt.fit(x_train, y_train)
predictiondt = dt.predict(x_test)
y_preddt = dt.predict(x_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_preddt))
print(confusion_matrix(y_test, y_preddt))
print("Decision Tree")
print(classification_report(y_test, y_preddt))

#%% Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# logreg = LogisticRegression()

#%%
from sklearn import model_selection
models = []
plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap) plt.xlim(xx1.min(), xx1.max()) plt.ylim(xx2.min(), xx2.max()) for idx, cl in enumerate(np.unique(y)): plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8, c=colors[idx], marker=markers[idx], label=cl, edgecolor='black') ## Building a decision tree tree = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1) tree.fit(X_train1, y_train1) X_combined = np.vstack((X_train1, X_test1)) y_combined = np.hstack((y_train1, y_test1)) plot_decision_regions(X_combined, y_combined, classifier=tree, test_idx=range(105, 150)) plt.xlabel('petal length [cm]') plt.ylabel('petal width [cm]') plt.legend(loc='upper left') plt.tight_layout() #plt.savefig('images/03_20.png', dpi=300) plt.show()
label_set = numpy.loadtxt( './ml_datasets/titanic_passengers_dataset_min.csv', delimiter=',', skiprows=1, usecols=(1,) ) target_names = ["survived", "not survived"] for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]): # We only take the two corresponding features feature_set = feature_data[:, pair] # Train clf = DecisionTreeClassifier().fit(feature_set, label_set) # Plot the decision boundary plt.subplot(2, 3, pairidx + 1) x_min, x_max = feature_set[:, 0].min() - 1, feature_set[:, 0].max() + 1 y_min, y_max = feature_set[:, 1].min() - 1, feature_set[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), np.arange(y_min, y_max, plot_step)) plt.xlabel(feature_names[pair[0]]) plt.ylabel(feature_names[pair[1]]) plt.axis("tight") # Plot the training points for i, color in zip(range(n_classes), plot_colors):
from sklearn.ensemble import AdaBoostClassifier from sklearn.tree import DecisionTreeClassifier from models.sentence_encoders import HandcraftedEncoder #sent_encoder = HandcraftedEncoder() sent_encoder = HandcraftedEncoder(precomputed_embeddings=settings.PRECOMPUTED_HANDCRAFTED_EMBEDDINGS_FNAME) feature_list = ["Quote_count", "Sent_position", "R_difficult", "POS_PRP", "POS_VB", "A_concreteness"] #HandcraftedEncoder._all_features + "best" #feature = "best" for feature in feature_list: print(feature) sent_encoder.set_features(feature) model = SimplePQModel(sent_encoder=sent_encoder, clf_type=AdaBoostClassifier, clf_args={'n_estimators':100, 'base_estimator':DecisionTreeClassifier(max_depth=1, class_weight="balanced")}) print("training {}...".format(feature)) model.fit(train_articles) print("generating...") combined_samples[feature] = generate_samples(model, test_articles) elif model_name == "ngrams": from models.sentence_encoders import NGramEncoder for mode, n in [('char', 2), ('word', 1)]: print(mode, n) sent_encoder = NGramEncoder(mode=mode, n=n, store_results=False, vocab_size=1000) print("preparing encoder...")
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

## create simulated data
X, y = make_gaussian_quantiles(n_samples=13000, n_features=10,
                               n_classes=3, random_state=1)
n_split = 3000
X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]

# build two models that differ in the boosting algorithm; bdt_real uses the default SAMME.R
bdt_real = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                              n_estimators=600,
                              learning_rate=1)
bdt_discrete = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                                  n_estimators=600,
                                  learning_rate=1,
                                  algorithm="SAMME")
bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

# accuracy_score gives the prediction accuracy of each individual staged classifier;
# estimator_errors_ holds the prediction error rates
real_test_errors = []      # error rate of every classifier in the first model
discrete_test_errors = []  # error rate of every classifier in the second model
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import Binarizer from sklearn.decomposition import PCA from sklearn.preprocessing import PolynomialFeatures from sklearn.neural_network import MLPRegressor from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.neighbors import KNeighborsRegressor from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier tree_classification_pipeline = Pipeline([ ('tree', DecisionTreeClassifier()), # Forest instead of Trees # ('forest', RandomForestClassifier()) ]) ridge_regression_pipeline = Pipeline([ # Apply scaling to Ridge Regression # ('scale', StandardScaler()), ('ridge', Ridge()) ]) lasso_regression_pipeline = Pipeline([ # Apply scaling to Lasso Regression # ('scale', StandardScaler()), ('lasso', Lasso())
from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import KFold from sklearn.model_selection import cross_val_score import random import pickle from sklearn.decomposition import PCA clf = DecisionTreeClassifier(random_state=0) feature_vector = [] with open('final_data.csv', 'r') as fp: for i, line in enumerate(fp): if i == 0: pass else: feature_vector.append([int(x.strip()) for x in line.split(',')]) random.shuffle(feature_vector) X = [x[:-1] for x in feature_vector] Y = [y[-1] for y in feature_vector] pca = PCA(n_components=5) X = [x[:-1] for x in feature_vector] Y = [y[-1] for y in feature_vector] X = pca.fit_transform(X) pickle.dump(pca, open("pca_decision.p", "wb")) k_fold = KFold(10) results = []
plt.figure() colors = rainbow(np.linspace(0, 1, len(kernels))) plt.bar(kernels, svc_scores, color = colors) for i in range(len(kernels)): plt.text(i, svc_scores[i], svc_scores[i]) plt.xlabel('Kernels') plt.ylabel('Scores') plt.title('Support Vector Classifier scores for different kernels') #Decision Tree Classifier dt_scores = [] for i in range(1, len(X.columns) + 1): dt_classifier = DecisionTreeClassifier(max_features = i, random_state = 0) dt_classifier.fit(X_train, y_train) dt_scores.append(dt_classifier.score(X_test, y_test)) plt.figure() plt.plot([i for i in range(1, len(X.columns) + 1)], dt_scores, color = 'green') for i in range(1, len(X.columns) + 1): plt.text(i, dt_scores[i-1], (i, dt_scores[i-1])) plt.xticks([i for i in range(1, len(X.columns) + 1)]) plt.xlabel('Max features') plt.ylabel('Scores') plt.title('Decision Tree Classifier scores for different number of maximum features') #Random Forest Classifier
# df = (df - df.min()) / (df.max() - df.min())
# df = df.reset_index()

########## split into training and test sets #############
X = df.iloc[:, :-1].values
y = df.iloc[:, 5].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

##### choose at least 3 classifiers #####
######## k-nearest neighbors, support vector machine, decision tree ##########################
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

SVC_model = SVC()
DTC_model = DecisionTreeClassifier()
KNN_model = KNeighborsClassifier(n_neighbors=5)

SVC_model.fit(X_train, y_train)
KNN_model.fit(X_train, y_train)
DTC_model.fit(X_train, y_train)

SVC_prediction = SVC_model.predict(X_test)
KNN_prediction = KNN_model.predict(X_test)
DTC_prediction = DTC_model.predict(X_test)

# Accuracy is the simplest way to evaluate how a classifier performs
print(accuracy_score(SVC_prediction, y_test))
print(accuracy_score(KNN_prediction, y_test))
print(accuracy_score(DTC_prediction, y_test))

############ Part 2.2 ##############################################
data2 = pd.read_csv('6.csv')
# split dataset into training and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)  # reuse the scaler fitted on the training data; do not refit on the test set

# start making decision tree classifier
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='entropy', random_state=0)
clf.fit(x_train, y_train)

# predict with our model
predict = clf.predict(x_test)

# create confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predict)

# visualization of train data
from matplotlib.colors import ListedColormap
x_set, y_set = x_train, y_train
x1, x2 = np.meshgrid(
    np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
from sklearn.datasets import load_iris iris = load_iris() examples = iris.data truths = iris.target from sklearn.model_selection import train_test_split train_examples, test_examples, train_truths, test_truths = train_test_split(examples, truths, test_size=0.33) from sklearn.tree import DecisionTreeClassifier decision_tree = DecisionTreeClassifier() decision_tree.fit(train_examples, train_truths) prediction = decision_tree.predict(test_examples) from sklearn.metrics import accuracy_score print("result with decision tree:", accuracy_score(test_truths, prediction)) from sklearn.tree import export_graphviz export_graphviz(decision_tree, out_file='decision_tree_iris.dot')
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets


def get_iris_df():
    ds = sklearn.datasets.load_iris()
    df = pd.DataFrame(ds['data'], columns=ds['feature_names'])
    code_species_map = dict(zip(range(3), ds['target_names']))
    df['species'] = [code_species_map[c] for c in ds['target']]
    return df


df = get_iris_df()
CLASS_MAP = {'Logistic Regression': ('-', LogisticRegression()),
             'Naive Bayes': ('--', GaussianNB()),
             'Decision Tree': ('.-', DecisionTreeClassifier(max_depth=5)),
             'Random Forest': (':', RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),}

X, Y = df[df.columns[:3]], (df['species'] == 'virginica')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.8)

for name, (line_fmt, model) in CLASS_MAP.items():
    model.fit(X_train, Y_train)
    preds = model.predict_proba(X_test)
    pred = pd.Series(preds[:, 1])
    fpr, tpr, thresholds = roc_curve(Y_test, pred)
    auc_score = auc(fpr, tpr)
    label = '%s: auc=%f' % (name, auc_score)
    plt.plot(fpr, tpr, line_fmt, linewidth=5, label=label)

plt.legend(loc="lower right")
plt.title("Comparisons among classifiers")
# In[23]: # We can try different combination of variables predictor_var = ['Credit_History','Education','Married','Self_Employed','Property_Area'] classification_model(model, train, predictor_var,outcome_var) # The Credit History variable is a relatively dominating predictor since the additional variables seem to have little effect on the scores. # #### Decision Tree # In[24]: model = DecisionTreeClassifier() predictor_var = ['Credit_History'] classification_model(model, train, predictor_var, outcome_var) # In[25]: #We can try different combination of variables: train.head() predictor_var = ['Credit_History','Loan_Amount_Term','LoanAmount_log'] classification_model(model, train, predictor_var,outcome_var) # #### Random Forest
import random data_set = DataSet() data, label, class_names = data_set.get_train_data_set() indexs = random.sample(range(len(data)), 50000) data = data[indexs] label = label[indexs] X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.33, random_state=42) est = [('count_vect', CountVectorizer()), ('tr', TruncatedSVD(n_components=10, n_iter=100, random_state=42)), ('clf_DT', DecisionTreeClassifier())] pipeline_DT = Pipeline(est) pipeline_DT = pipeline_DT.fit(X_train, y_train) y_pred = pipeline_DT.predict(X_test) print("F1 score - DT:", f1_score(y_test, pipeline_DT.predict(X_test), average='micro')) print("Accuracy Score - DT:", accuracy_score(y_test, pipeline_DT.predict(X_test))) cnf_matrix = confusion_matrix(y_test, y_pred) plt.figure() plt = plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Normalized confusion matrix DT')
        self.baseModels = copy.deepcopy(self.oriBaseModels)
        self.metaModel = copy.deepcopy(self.oriMetaModel)
        self.fit(trainX, trainy)
        print("Performance on the training set:")
        y_train = self.predict(trainX)
        cm = confusion_matrix(trainy, y_train)
        print(cm)
        y_pred = self.predict(testX)
        cm = confusion_matrix(testy, y_pred)
        cmTotal += np.array(cm)
        print("Performance on the test set:")
        print(cm)
        print(cmTotal)
    return cmTotal


if __name__ == '__main__':
    clf = StackingClassifier()
    from sklearn import datasets
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neural_network import MLPClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB, BaseNB
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    clf.setBaseModels(baseModels={'desisionTree': DecisionTreeClassifier(),
                                  'mlp': MLPClassifier(hidden_layer_sizes=(50)),
                                  'KNN': KNeighborsClassifier(n_neighbors=10),
                                  "NB": GaussianNB()})
    clf.setMetaModel(DecisionTreeClassifier())
    clf.kFoldValidatoin({'desisionTree': X, 'mlp': X, 'KNN': X, 'NB': X}, y, classNum=3)
# - KNN # - Logistic regression # - Linear Discriminant Analysis # In[ ]: # Cross validate model with Kfold stratified cross val kfold = StratifiedKFold(n_splits=10) # In[ ]: # Modeling step Test differents algorithms random_state = 7 classifiers = [] classifiers.append(SVC(random_state=random_state)) classifiers.append(DecisionTreeClassifier(random_state=random_state)) classifiers.append( AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state), random_state=random_state, learning_rate=0.1)) classifiers.append(RandomForestClassifier(random_state=random_state)) classifiers.append(ExtraTreesClassifier(random_state=random_state)) classifiers.append(GradientBoostingClassifier(random_state=random_state)) classifiers.append(MLPClassifier(random_state=random_state)) classifiers.append(KNeighborsClassifier()) classifiers.append(LogisticRegression(random_state=random_state)) classifiers.append(LinearDiscriminantAnalysis()) cv_results = [] for classifier in classifiers: cv_results.append(
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

from sklearn.tree import DecisionTreeClassifier
t0 = time()
clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
score = clf.score(features_test, labels_test)
pred = clf.predict(features_test)
print 'accuracy', score
print "Decision tree algorithm time:", round(time()-t0, 3), "s"

importances = clf.feature_importances_
import numpy as np
indices = np.argsort(importances)[::-1]
print 'Feature Ranking: '
for i in range(11):
    # features_list[0] is the target, so offset the sorted index by one to get the matching feature name
    print "{} feature {} ({})".format(i+1, features_list[indices[i]+1], importances[indices[i]])
import numpy as np import matplotlib.pyplot as pt import pandas as pd import time from sklearn.tree import DecisionTreeClassifier x = int(input("Give the index of the test image you would like to predict : ")) #Read from train.csv data = pd.read_csv("train.csv").as_matrix() #Declare a Decision Tree Classifier Object clf = DecisionTreeClassifier() #training data xtrain = data[20000:, 1:] train_label = data[20000:, 0] #testing data xtest = data[0:21000, 1:] actual_label = data[0:21000:, 0] #training the classifier print("training classifier.....") clf.fit(xtrain, train_label) #Calculating accuracy print("Predicting the image at index %d ....." % x) time.sleep(2) p = clf.predict(xtest) count = 0 for i in range(0, 21000):
forest.fit(X,y) feature_importances = pd.DataFrame(forest.feature_importances_, index = xTrain.columns, columns=['importance']).sort_values('importance', ascending=False) print(feature_importances.head(10)) exit(2) ''' import warnings warnings.filterwarnings('ignore') # "error", "ignore", "always", "default", "module" or "once" # prepare models models = [] models.append(('Nearest Neighbors', KNeighborsClassifier(n_neighbors=29,weights='distance'))) models.append(('Linear SVM', SVC(kernel='linear', C=0.025))) models.append(('Decision Tree', DecisionTreeClassifier(max_depth=5))) models.append(('Random Forest', RandomForestClassifier(n_estimators = 1000, random_state = 42))) models.append(('Neural Net', MLPClassifier(alpha=1, max_iter=1000))) models.append(('AdaBoost', AdaBoostClassifier())) models.append(('Naive Bayes', GaussianNB())) models.append(('SGD Classifier', linear_model.SGDClassifier(max_iter=1000, tol=1e-3))) models.append(('LogisticRegressionCV', linear_model.LogisticRegressionCV(cv=5,max_iter=60000))) models.append(('K-means',KMeans(n_clusters=2))) models.append(('Gradient-Boost',GradientBoostingClassifier())) # evaluate each model in turn results = [] names = [] scoring = 'f1_macro' for name, model in models: cv_results = model_selection.cross_val_score(model, X, y, cv=5, scoring=scoring)
x = x.drop("index", axis=1) x_test = x_test.drop("index", axis=1) x.info() # In[ ]: y = train.Survived x_train, x_cv, y_train, y_cv = train_test_split(x, y, test_size=0.25, random_state=0) # In[ ]: model = DecisionTreeClassifier(min_samples_split = 10) model.fit(x_train,y_train) print("Decision Tree") print(model.score(x_train,y_train)) print(model.score(x_cv,y_cv)) imp = pd.DataFrame({"Features":x_train.columns}) imp["DecTree"] = model.feature_importances_ print("----------------------------") rf = RandomForestClassifier(min_samples_split =20, n_estimators=100) rf.fit(x_train,y_train) print("Random Forest") print(rf.score(x_train,y_train)) print(rf.score(x_cv,y_cv))
def create_classifiers_features(): clf_list = [] params_kbest = {"kbest__k": [1,2, 3, 5, 10, 15]} kbest_clf_naive = GaussianNB() kbest_params_naive={} kbest_params_naive.update(params_kbest) kbest = SelectKBest() clf_list.append( (Pipeline([("kbest", kbest), ("naive", kbest_clf_naive)]), kbest_params_naive) ) # kbest_clf_tree = DecisionTreeClassifier() kbest_params_tree = {"tree__min_samples_split":[2, 5, 10, 20], "tree__criterion": ('gini', 'entropy'),'tree__random_state':[50]} kbest_params_tree.update(params_kbest) kbest = SelectKBest() clf_list.append((Pipeline([("kbest", kbest), ("tree", kbest_clf_tree)]), kbest_params_tree)) # kbest_clf_linearsvm = LinearSVC() kbest_params_linearsvm = {"svm__C": [0.1, 1, 5, 10, 100], "svm__tol": [10**-1, 10**-3, 10**-5], "svm__class_weight": ['balanced'] } kbest_params_linearsvm.update(params_kbest) kbest = SelectKBest() clf_list.append((Pipeline([("kbest", kbest), ("svm", kbest_clf_linearsvm)]), kbest_params_linearsvm)) # kbest_clf_adaboost = AdaBoostClassifier() kbest_params_adaboost = { "adaboost__n_estimators":[20, 50, 100], 'adaboost__learning_rate': [0.4, 0.6, 1]} kbest_params_adaboost.update(params_kbest) kbest = SelectKBest() clf_list.append((Pipeline([("kbest", kbest), ("adaboost", kbest_clf_adaboost)]), kbest_params_adaboost)) kbest_clf_random_tree = RandomForestClassifier() kbest_params_random_tree = { "random_tree__n_estimators":[2, 3, 5,10,15], "random_tree__criterion": ('gini', 'entropy'), 'random_tree__min_samples_split': [1, 2, 4] } kbest_params_random_tree.update(params_kbest) kbest = SelectKBest() clf_list.append((Pipeline([("kbest", kbest), ("random_tree", kbest_clf_random_tree )]), kbest_params_random_tree )) # kbest_clf_log = LogisticRegression() kbest_params_log = { "log__C":[0.05, 0.5, 1, 10, 10**2,10**5,], "log__tol":[10**-1, 10**-5, 10**-10], "log__penalty":['l2','l1'], "log__class_weight":['balanced'] } kbest_params_log.update(params_kbest) kbest = SelectKBest() clf_list.append((Pipeline([("kbest", kbest), ("log", kbest_clf_log)]), kbest_params_log)) return clf_list
X_test_counts = count_vect.fit_transform(test['text']) X_train_counts = count_vect.transform(train['text']) X_train, X_test, y_train, y_test = train_test_split(X_train_counts, train['sentiment'], test_size=0.4, random_state=0) clf = MLPClassifier(alpha=1, random_state=65) clf.fit(X_train_counts, train['sentiment']) clf2 = SVC(probability=True, gamma=2, C=1) clf2.fit(X_train_counts, train['sentiment']) clf3 = DecisionTreeClassifier(random_state=0) clf3.fit(X_train_counts, train['sentiment']) clf5 = BaggingClassifier(random_state=54) clf5.fit(X_train, y_train) clf6 = ExtraTreesClassifier(random_state=0) clf6.fit(X_train, y_train) clf7 = GradientBoostingClassifier(random_state=32) clf7.fit(X_train, y_train) vc = VotingClassifier(estimators=[('mlp', clf), ('dt', clf3), ('et', clf6), ('bag', clf5), ('grad', clf7)], voting='soft', weights=[0.3, 0.1, 0.2, 0.1, 0.3])
y_test = pd.read_csv(filePath + "y_test.csv") featureList = [] for i in range(len(x_train.columns)): featureList.append("x" + str(i)) x_train.columns = featureList x_test.columns = featureList return x_train, x_test, y_train, y_test x_train, x_test, y_train, y_test = load_dataset() # y_train = y_train.values.ravel() # y_test = y_test.values.ravel() pipeline = Pipeline([('enc', OneHotEncoder(handle_unknown='ignore')), ('clf', DecisionTreeClassifier(criterion='entropy', min_samples_split=200, max_depth=24))]) pipeline.fit(x_train, y_train) y_pred = pipeline.predict(x_test) # USE FOR CLASSIFICATION GRID SEARCH """ pipeline = Pipeline([('oh', OneHotEncoder(handle_unknown='ignore')), ('dt', DecisionTreeClassifier(criterion='entropy', min_samples_split=200, max_depth=24))]) # pipeline = Pipeline([('enc', OneHotEncoder(handle_unknown='ignore')), ('clf', RandomForestClassifier())]) # Create lists of parameter for Decision Tree Classifier n_estimators = list(range(200, 1001, 200))
def create_classifiers(): clf_list = [] params_pca = {"pca__n_components": [2, 3, 5, 10, 15], "pca__whiten": [False]} # clf_naive = GaussianNB() params_naive = {} clf_list.append( (clf_naive, params_naive) ) pca_clf_naive = GaussianNB() pca_params_naive={} pca_params_naive.update(params_pca) pca = PCA() clf_list.append( (Pipeline([("pca", pca), ("naive", pca_clf_naive)]), pca_params_naive) ) # clf_tree = DecisionTreeClassifier() params_tree = { "min_samples_split":[2, 5, 10, 20], "criterion": ('gini', 'entropy'), 'random_state':[50] } clf_list.append( (clf_tree, params_tree) ) pca_clf_tree = DecisionTreeClassifier() pca_params_tree = {"tree__min_samples_split":[2, 5, 10, 20], "tree__criterion": ('gini', 'entropy'),'tree__random_state':[50]} pca_params_tree.update(params_pca) pca = PCA() clf_list.append((Pipeline([("pca", pca), ("tree", pca_clf_tree)]), pca_params_tree)) # clf_linearsvm = LinearSVC() params_linearsvm = {"C": [0.1, 1, 5, 10, 100], "tol":[10**-1, 10**-3, 10**-5], "class_weight":['balanced'] } clf_list.append( (clf_linearsvm, params_linearsvm) ) pca_clf_linearsvm = LinearSVC() pca_params_linearsvm = {"svm__C": [0.1, 1, 5, 10, 100], "svm__tol": [10**-1, 10**-3, 10**-5], "svm__class_weight": ['balanced'] } pca_params_linearsvm.update(params_pca) pca = PCA() clf_list.append((Pipeline([("pca", pca), ("svm", pca_clf_linearsvm)]), pca_params_linearsvm)) # clf_adaboost = AdaBoostClassifier() params_adaboost = { "n_estimators":[20, 50, 100], 'learning_rate': [0.4, 0.6, 1]} clf_list.append( (clf_adaboost, params_adaboost) ) # pca_clf_adaboost = AdaBoostClassifier() # pca_params_adaboost = { "adaboost__n_estimators":[20, 50, 100], # 'adaboost__learning_rate': [0.4, 0.6, 1]} # pca_params_adaboost.update(params_pca) # pca = PCA() # clf_list.append((Pipeline([("pca", pca), ("adaboost", pca_clf_adaboost)]), pca_params_adaboost)) # clf_random_tree = RandomForestClassifier() params_random_tree = { "n_estimators":[2, 3, 5,10,15], "criterion": ('gini', 'entropy'), 'min_samples_split': [1, 2, 4], 'max_features': [1, 2, 3,'sqrt',5,10] } clf_list.append( (clf_random_tree, params_random_tree) ) pca_clf_random_tree = RandomForestClassifier() pca_params_random_tree = { "random_tree__n_estimators":[2, 3, 5,10,15], "random_tree__criterion": ('gini', 'entropy'), 'random_tree__min_samples_split': [1, 2, 4] } pca_params_random_tree.update(params_pca) pca = PCA() clf_list.append((Pipeline([("pca", pca), ("random_tree", pca_clf_random_tree )]), pca_params_random_tree )) # clf_log = LogisticRegression() params_log = { "C":[0.05, 0.5, 1, 10, 10**2,10**5,], "tol":[10**-1, 10**-5, 10**-10], "class_weight":['balanced'], "penalty": ['l2', 'l1'] } clf_list.append( (clf_log, params_log) ) pca_clf_log = LogisticRegression() pca_params_log = { "log__C":[0.05, 0.5, 1, 10, 10**2,10**5,], "log__tol":[10**-1, 10**-5, 10**-10], "log__penalty":['l2','l1'], "log__class_weight":['balanced'] } pca_params_log.update(params_pca) pca = PCA() clf_list.append((Pipeline([("pca", pca), ("log", pca_clf_log)]), pca_params_log)) return clf_list
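# create_classifiers() above (like create_classifiers_features() earlier) only builds a list of
# (estimator-or-pipeline, parameter-grid) pairs. The following is a minimal, hypothetical sketch
# of how such pairs might be consumed with scikit-learn's GridSearchCV; the helper name, the
# cv=3 setting and the 'f1' scoring choice are illustrative assumptions, not part of the original code.
from sklearn.model_selection import GridSearchCV

def pick_best_classifier(clf_list, features, labels, scoring='f1'):
    best_estimator, best_score = None, -1.0
    for clf, params in clf_list:
        # exhaustively search this candidate's parameter grid with cross-validation
        search = GridSearchCV(clf, params, scoring=scoring, cv=3)
        search.fit(features, labels)
        if search.best_score_ > best_score:
            best_score = search.best_score_
            best_estimator = search.best_estimator_
    return best_estimator, best_score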
def cart(X, y, params):
    clf = CART(n_jobs=params['n_jobs'] if 'n_jobs' in params else 1)
    clf.fit(X, y)
    return clf
# import time # for i, clf in enumerate(clf_list): # start_time = time.time() # result=evaluate_classifier(clf, labels,features,scv) # summary_list1[i]=result # summary_list[clf] = result # print clf,result # print("--- %s seconds ---" % (time.time() - start_time)) # # ordered_list = sorted(summary_list.keys(), key=lambda k: summary_list[k][3], reverse=True) # print [(key,summary_list[key]) for key in summary_list.keys() if summary_list[key][1]>0.3 and summary_list[key][2]>0.3] # print ordered_list # print "*"*100 # print summary_list # print "*"*100 # # clf = ordered_list[0] # scores = summary_list[clf] # print "Best classifier is ", clf # print "With scores of accuracy,recall, precision,f1,f2: ", scores clf= DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,max_features=None, max_leaf_nodes=None, min_samples_leaf=20,min_samples_split=20, min_weight_fraction_leaf=0.0,presort=False, random_state=50, splitter='best') test_classifier(clf, my_dataset, features_list) # Example starting point. Try investigating other evaluation techniques! ### Task 6: Dump your classifier, dataset, and features_list so anyone can ### check your results. You do not need to change anything below, but make sure ### that the version of poi_id.py that you submit can be run on its own and ### generates the necessary .pkl files for validating your results. dump_classifier_and_data(clf, my_dataset, features_list)