# Render a fitted decision tree with graphviz. Importing export_graphviz
# directly avoids clashing with the tree() helper (assumed to be defined
# elsewhere and to return a fitted classifier); iris is sklearn's iris bunch.
import graphviz
from sklearn.tree import export_graphviz


def graph():
    dt_data = export_graphviz(tree(),
                              out_file=None,
                              feature_names=iris.feature_names,
                              class_names=iris.target_names,
                              filled=True,
                              rounded=True,
                              special_characters=True)
    graph = graphviz.Source(dt_data)
    graph.render('output')
def getSimEntDataset(f, words, task):
    """Load tab-separated (sentence1, sentence2, label) examples for a
    similarity ("sim") or entailment ("ent") task."""
    with open(f, 'r') as data:
        lines = data.readlines()
    examples = []
    for i in lines:
        i = i.strip()
        if len(i) > 0:
            i = i.split('\t')
            if len(i) == 3:
                if task == "sim":
                    e = (tree(i[0], words), tree(i[1], words), float(i[2]))
                    examples.append(e)
                elif task == "ent":
                    e = (tree(i[0], words), tree(i[1], words), i[2])
                    examples.append(e)
                else:
                    raise ValueError('Params.traintype not set correctly.')
            else:
                print(i)
    return examples
def getSentimentDataset(f, words):
    """Load tab-separated (sentence, label) sentiment examples."""
    with open(f, 'r') as data:
        lines = data.readlines()
    examples = []
    for i in lines:
        i = i.strip()
        if len(i) > 0:
            i = i.split('\t')
            if len(i) == 2:
                e = (tree(i[0], words), i[1])
                examples.append(e)
            else:
                print(i)
    return examples
def main(argv):
    if FLAGS.dataset == 'toy':
        train_X, train_y, test_X, test_y, num_classes = get_toy_dataset()
    elif FLAGS.dataset == 'mnist':
        train_X, train_y, test_X, test_y, num_classes = get_mnist()

    train_pred = None
    if FLAGS.method == 'knn':
        pred = knn(train_X, train_y, test_X)
    elif FLAGS.method == 'svm':
        train_pred, pred = svm(train_X, train_y, test_X)
    elif FLAGS.method == 'tree':
        pred = tree(train_X, train_y, test_X)
    elif FLAGS.method == 'boosting':
        pred = boosting(train_X, train_y, test_X)
    elif FLAGS.method == 'nn':
        train_pred, pred = nn(train_X, train_y, test_X, num_classes)

    if train_pred is not None:
        print('Train Accuracy: %f' % compute_accuracy(train_pred, train_y))
    print('Accuracy: %f' % compute_accuracy(pred, test_y))
#     ('knn', neighbors.KNeighborsClassifier()),
#     # SVM: http://scikit-learn.org/stable/modules/svm.html
#     ('svc', svm.SVC(probability=True)),
#     # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
#     ('xgb', XGBClassifier())
# ]
#
# vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
# vote_hard_cv = model_selection.cross_validate(vote_hard, data1[data1_x_bin], data1[Target], cv=cv_split)
# vote_hard.fit(data1[data1_x_bin], data1[Target])
#
# # print("VOTING_CLASSIFIER Parameters: ", dtree.get_params())
# # print("VOTING_CLASSIFIER Training w/bin score mean: {:.2f}".format(vote_hard_cv['train_score'].mean()))
# print("HARD_VOTING_CLASSIFIER Test w/bin score mean: {:.2f}".format(vote_hard_cv['test_score'].mean()))
#
# vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
# vote_soft_cv = model_selection.cross_validate(vote_soft, data1[data1_x_bin], data1[Target], cv=cv_split)
# vote_soft.fit(data1[data1_x_bin], data1[Target])
#
# # print("VOTING_CLASSIFIER Parameters: ", dtree.get_params())
# # print("VOTING_CLASSIFIER Training w/bin score mean: {:.2f}".format(vote_hard_cv['train_score'].mean()))
# print("SOFT_VOTING_CLASSIFIER Test w/bin score mean: {:.2f}".format(vote_soft_cv['test_score'].mean()))

# Write the tree predictions as a Kaggle submission file.
data_val['Survived'] = tree(data_val).astype(int)
submit = data_val[['PassengerId', 'Survived']]
submit.to_csv("submission2.csv", index=False)
print(submit.info())
import pandas as pd
from collections import Counter

data = pd.read_pickle('/home/hudson/Downloads/prostate.df')
data.head(2)

# cell 5
y = data.values[:, -1]
print(y.shape, Counter(y.tolist()))
x = data.values[:, :-1]
print(x.shape)

# cell 6
# ## Task 1 (You can use the DecisionTree implementation from scikit-learn.)
# Try a decision tree on the above dataset. Consider different values for the
# maximum depth of the tree ('max_depth') and the minimum number of samples
# required at a leaf node ('min_samples_leaf'). Conduct 10-fold
# cross-validation and:
# - plot training error and testing error vs. tree depth
# - plot training error and testing error vs. min. samples for leaf nodes
# Error should be measured as the percentage of misclassifications
# (i.e., returning 'normal' for 'tumor' and vice versa).

# cell 7
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import KFold  # sklearn.cross_validation is deprecated

n_folds = 10  # 10-fold cross-validation
kf = KFold(n_splits=n_folds)
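# A minimal sketch of the cross-validation loop Task 1 describes, assuming the
# x, y, kf, and n_folds defined above. The depth grid and variable names are
# illustrative, not from the original notebook; the same pattern applies to
# min_samples_leaf.
import numpy as np
import matplotlib.pyplot as plt

depths = range(1, 21)
train_err, test_err = [], []
for d in depths:
    tr, te = [], []
    for train_idx, test_idx in kf.split(x):
        clf = tree.DecisionTreeClassifier(max_depth=d)
        clf.fit(x[train_idx], y[train_idx])
        # misclassification rate = 1 - accuracy
        tr.append(1 - metrics.accuracy_score(y[train_idx], clf.predict(x[train_idx])))
        te.append(1 - metrics.accuracy_score(y[test_idx], clf.predict(x[test_idx])))
    train_err.append(np.mean(tr))
    test_err.append(np.mean(te))

plt.plot(depths, train_err, label='train error')
plt.plot(depths, test_err, label='test error')
plt.xlabel('max_depth')
plt.ylabel('misclassification rate')
plt.legend()
plt.show()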
def predict(obj):
    # tree() is assumed to return a fitted classifier (defined elsewhere).
    p = tree().predict([obj])
    print('this is ', p)
classifier.predict(testing_set.drop('class', axis=1)))  # fragment continued from a helper above

dataframe = pd.read_csv("diabetes.csv")
classes = [x for x in list(dataframe['class'].unique())]

# split dataframe (67% / 33%)
training_set = dataframe.sample(frac=0.67)
testing_set = dataframe[~dataframe.isin(training_set).all(1)]
print(training_set)
print(testing_set)

nn3 = knn(training_set, 3)
nn5 = knn(training_set, 5)
nn11 = knn(training_set, 11)
tree_clf = tree(training_set)  # renamed so the tree() helper is not shadowed
gnb_clf = gnb(training_set)    # likewise for gnb()

functions = [nn3, nn5, nn11, tree_clf, gnb_clf]
classifiers = ["3NN", "5NN", "11NN", "tree", "naive_bayes"]
scores = [get_score(x, testing_set) for x in functions]

## confusion matrices
for classifier, function, score in zip(classifiers, functions, scores):
    print("\n" + classifier + " classifier")
    print("accuracy = " + str(round(score * 100, 2)) + "%")
    print(get_confusion_matrix(function, testing_set))

## bar chart
plt.bar(classifiers, scores, align='center')
plt.ylabel("score")
df_test_tmp = df_test.replace("male", 0).replace("female", 1)
df_test_tmp = df_test_tmp.replace("C", 0).replace("Q", 1).replace("S", 2)
df_test_tmp["Age"].fillna(df_test_tmp["Age"].median(), inplace=True)
df_test_tmp["Fare"].fillna(df_test_tmp["Fare"].median(), inplace=True)
df_test_tmp["Embarked"].fillna(df_test_tmp["Embarked"].median(), inplace=True)
test_data = df_test_tmp.loc[:, [
    "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"
]]

# In[76]:

max_acc = 0
max_depth = 0
max_model = None
for i in range(1, 100):
    ret = tree(i, train_data, train_target, valid_data, valid_target)
    if max_acc < ret[1]:
        max_acc = ret[1]
        max_depth = i
        max_model = ret[0]
print(max_depth, ",", max_acc)

# In[79]:

predicted = max_model.predict(test_data)
with open("predict_result_data.csv", "w") as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(["PassengerId", "Survived"])
    for pid, survived in zip(df_test["PassengerId"], predicted):
        writer.writerow([pid, survived])
def diseases(node, depth):
    if tree_.feature[node] != _tree.TREE_UNDEFINED:
        disease_name = feature_name[node]
        threshold = tree_.threshold[node]
        print("\n" + disease_name + "?\n")
        # Ask the user whether they have the displayed symptom.
        answer = input()
        if answer == 'Yes':
            val = 1
        else:
            val = 0
        if val <= threshold:
            diseases(tree_.children_left[node], depth + 1)
        else:
            symptoms_present.append(disease_name)
            diseases(tree_.children_right[node], depth + 1)
    else:
        present_disease = probable_disease(tree_.value[node])
        print("Possible disease: " + present_disease)
        red_column = data.columns
        symptoms_given = red_column[
            data.loc[present_disease].values[0].nonzero()]
        print("\nPresent symptoms: " + str(list(symptoms_present)))
        print("\nKnown symptoms of the disease: " + str(list(symptoms_given)))


diseases(0, 1)
tree(classifier, column)
    return f1_score(y_test, clf.predict(X_test), average='macro')


def less_X():
    # X and y are assumed to be module-level globals; the original version
    # read X before assigning it, which raises UnboundLocalError.
    global X, y
    X = pd.merge(X.iloc[:, 0:25], X.iloc[:, 1024:1049],
                 how='outer', left_index=True, right_index=True)
    X, y = shuffle(X, y, random_state=0)
    X_train, X_valid, X_test = (X[:int(0.6 * len(X))],
                                X[int(0.6 * len(X)):int(0.8 * len(X))],
                                X[int(0.8 * len(X)):])
    y_train, y_valid, y_test = (y[:int(0.6 * len(X))],
                                y[int(0.6 * len(X)):int(0.8 * len(X))],
                                y[int(0.8 * len(X)):])
    randomforest(100, 50)


if __name__ == "__main__":
    svm()
    randomforest(100, 100)
    randomforest(100, 50)
    NB()
    less_X()

    plot_data = []
    for i in range(0, 100):  # xrange is Python 2 only
        plot_data.append(tree())
    plt.plot(plot_data)