def linear_regression_model(train, validation, alpha, depth=None, random_state=None):
    """Fit and score three regressors (ridge, decision tree, random forest).

    NOTE(review): despite its name, this fits tree models too, not only a
    linear model.

    BUG FIX: the original Ridge(...) call referenced undefined names
    (fit_intercept, normalize, max_iter, tol, random_state), raising
    NameError, and the loop then assigned `m.alpha = alpha` on every model —
    meaningless for the tree estimators. `alpha` is now passed directly to
    Ridge only, and `random_state` is an explicit keyword parameter
    (backward-compatible addition, default None).

    Params:
        train, validation: dict-like with 'X' and 'y' entries exposing
            `.values` (e.g. DataFrames). The first feature column is dropped
            (assumed to be an index/intercept column — TODO confirm).
        alpha: ridge regularization strength.
        depth: max_depth for both tree models (None = unlimited).
        random_state: seed forwarded to all three estimators.

    Returns:
        dict with keys 'type', 'model', 'score_train', 'score_valid',
        'mse_train', 'mse_valid' (lists aligned by model index).
    """
    X_train = train['X'].values[:, 1:]
    y_train = np.ravel(train['y'].values)
    X_validation = validation['X'].values[:, 1:]
    y_validation = np.ravel(validation['y'].values)

    models = {
        'type': ['ridge', 'decision tree', 'random forest'],
        'model': [
            Ridge(alpha=alpha, random_state=random_state),
            # NOTE(review): criterion='mse' was renamed 'squared_error' in
            # scikit-learn 1.0+; kept as-is for compatibility with the
            # version this project pins — confirm before upgrading.
            dt(criterion='mse', splitter='best', max_depth=depth,
               min_samples_split=2, min_samples_leaf=1,
               random_state=random_state),
            rfr(n_estimators=100, criterion='mse', max_depth=depth,
                min_samples_split=2, min_samples_leaf=1,
                random_state=random_state),
        ],
        'score_train': [],
        'score_valid': [],
        'mse_train': [],
        'mse_valid': [],
    }

    for m in models['model']:
        m.fit(X_train, y_train)
        # R^2 on both splits, plus mean squared error of the predictions.
        models['score_train'].append(m.score(X_train, y_train))
        models['score_valid'].append(m.score(X_validation, y_validation))
        models['mse_train'].append(mse(y_train, m.predict(X_train)))
        models['mse_valid'].append(mse(y_validation, m.predict(X_validation)))

    print('models: ', models['type'])
    print('R2 training:', models['score_train'])
    print('R2 validation:', models['score_valid'])
    print('MSE training:', models['mse_train'])
    print('MSE validation:', models['mse_valid'])
    return models
def DecisionTree(data_directory, model_dir, features):
    """Grid-search a decision-tree regressor over split criteria and dump
    the best estimator to `model_dir`, then print its test-set predictions
    against the reference (PBE) band gaps.
    """
    # `pre` returns the splits plus a possibly reduced feature list.
    X_train, X_test, y_train, y_test, predict_X, features = pre(data_directory, features)
    os.chdir(model_dir)

    search_space = {
        'criterion': ['mse', 'friedman_mse', 'mae'],
        'splitter': ['best', 'random'],
        'max_features': ['auto', 'sqrt', 'log2'],
    }
    grid = gs(estimator=dt(random_state=1), param_grid=search_space, cv=5)
    grid.fit(X_train, y_train)

    best = grid.best_estimator_
    print(grid.best_params_)
    print(best.score(X_test, y_test))

    # Persist the winner; filename encodes feature count and test R^2.
    joblib.dump(best, 'dtr_%d_%.4f.m'%(len(features),best.score(X_test, y_test)))

    # Side-by-side comparison of predicted vs. reference band gaps.
    df = pd.DataFrame(columns=['ml_bandgap', 'pbe_bandgap'])
    df['pbe_bandgap'] = y_test
    df['ml_bandgap'] = best.predict(X_test)
    print(df)
def doit(inp, k):
    # Train a depth-limited decision tree on the first 10k training rows and
    # return, for each row of `inp`, the top-k class labels ordered by
    # descending predicted probability.
    #
    # Params:
    #   inp: feature matrix to classify (same feature layout as loadData output
    #        — TODO confirm shape/dtype against loadData).
    #   k:   number of most-probable class labels to return per row.
    # Returns:
    #   array of shape (len(inp), k), most probable label first in each row.
    x, y = loadData("train", 225)
    x = x.toarray()  # loadData appears to return a sparse matrix; densify it
    train_x = x[0:10000]
    train_y = y[0:10000]
    # NOTE(review): the "test" slice 9000:10000 lies INSIDE the 0:10000
    # training slice, so test_x/test_y overlap the training data (and are
    # unused below) — confirm intent.
    test_x = x[9000:10000]
    test_y = y[9000:10000]
    model = dt(max_depth=10)
    model.fit(train_x, train_y)
    ret = model.predict_proba(X=inp)
    predict = model.predict(X=inp)  # NOTE(review): unused except in the commented print
    clas = model.classes_
    # Overwrite each probability row IN PLACE with the class labels sorted by
    # ascending probability (argsort indices into classes_).
    for i in range(ret.shape[0]):
        ret[i] = clas[np.argsort(ret[i])]
        # print(predict[i],ret[i][ret[0].size-1:ret[0].size])
    # Keep the last k columns (highest probabilities) and flip left-right so
    # the most probable label comes first.
    return np.flip(ret[:, ret[0].size - k:ret[0].size], axis=1)
def import_lib(models):
    """Lazily import and instantiate an estimator for each requested model name.

    Params:
        models: iterable of display names. Recognised values are
            "Decision Tree" and "Logistic Regression"; any other name is
            silently skipped.

    Returns:
        dict mapping each recognised name to a freshly constructed,
        default-configured estimator instance.
    """
    instances = {}
    for name in models:
        # Imports live inside the branches so sklearn modules are only
        # loaded for the model families actually requested.
        if name == "Decision Tree":
            from sklearn.tree import DecisionTreeClassifier as dt
            instances[name] = dt()
        elif name == "Logistic Regression":
            from sklearn.linear_model import LogisticRegression as lr
            instances[name] = lr()
    return instances
# In[24]: # In[22]: from sklearn.tree import DecisionTreeClassifier as dt # In[25]: # In[23]: model=dt(criterion='entropy') # In[26]: # In[24]: model.fit(train[predictors],train[target]) # In[27]: # In[25]:
plt.legend(loc='upper left')
plt.savefig('iris_petal_lengthvswidth.png')
plt.close()

# Standardize features: fit the scaler on the training set only, then apply
# the SAME transform to both splits.
mean_scaler = StandardScaler()
# use fit to estimate mean and var for X_train individual features
mean_scaler.fit(X_train)
# BUG FIX: the transform() return values were previously discarded (never
# assigned), so the data was never actually scaled even though the print
# below claims it is.
X_train = mean_scaler.transform(X_train)
X_test = mean_scaler.transform(X_test)
print('scaled train:\n{}\nscaled test:\n{}'.format(X_train[:2], X_test[:2]))

# single decision tree using a criterion and max_depth for regularization
tree = dt(
    criterion='entropy',
    max_depth=6,
    random_state=7
)
print('DT params:\n{}'.format(tree))

# using scaled features for better decision boundary viz
tree.fit(X_train, Y_train)

# check decision boundary for test points with the max_depth=6 tree
# test labels 45 105 -> 150
# NOTE(review): X_comb/Y_comb are built in an earlier cell — confirm they
# were combined from the scaled splits so the boundary matches the model.
plot_decision_region(X_comb, Y_comb, clsfr=tree, test_idx=range(105,150))
plt.xlabel('Petal length(cm)')
plt.ylabel('Petal Width(cm)')
plt.legend(loc='upper left')
plt.title('0->setosa 1->versicolor 2->virginica')
plt.savefig('dt_6dentropy_iris_petal_length_width.png')
# In[102]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

# # Feature Scaling

# In[112]:

# BUG FIX: scaling used to run AFTER the model was fitted, so the model was
# trained on unscaled data while later predictions would see scaled data
# (train/serve skew). Scale first, then fit.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)  # fit on train only
x_test = sc.transform(x_test)        # reuse train statistics on test
print(x_train)

# # Decision Tree Classification Model

# In[111]:
from sklearn.tree import DecisionTreeClassifier as dt

model = dt(max_depth=10, random_state=100)
model.fit(x_train, y_train)

# # Predicting Model

# In[113]:
from sklearn.datasets import load_iris as ir
import numpy as np
from sklearn.tree import DecisionTreeClassifier as dt

# Class legend: 0 - sat, 1 - ves, 2 - vt
iris = ir()

# Hold out the first sample of each class (rows 0, 50, 100) for testing;
# train on everything else.
holdout = [0, 50, 100]
xtrain = np.delete(iris.data, holdout, axis=0)
ytrain = np.delete(iris.target, holdout)
xtest = iris.data[holdout]
ytest = iris.target[holdout]

clf = dt()
clf.fit(xtrain, ytrain)

# True labels of the held-out rows, then the model's predictions for them.
print(ytest)
print("prediction = ", clf.predict(xtest))
#Accuracy of Naive bayes classifier print('accuracy by Naive Bayes classifier with PCA =', accuracy_score(Y_test, y_predict_NB)) a_NB = accuracy_score(Y_test, y_predict_NB) #Mathews correlation coefficient for Naive bayes classifier m_NB = matthews_corrcoef(Y_test, y_predict_NB) print( 'Mathew\'s correlation coefficient for Naive Bayes classifier with PCA =', m_NB) # In[29]: #Decision Tree Classifier without PCA _withoutpca from sklearn.tree import DecisionTreeClassifier as dt classifier_withoutpca = dt(criterion='entropy', random_state=0) classifier_withoutpca.fit(X_train, y_train) y_predict_DT_withoutpca = classifier_withoutpca.predict(X_test) #Accuracy of Decision Tree Classifier a_DT_withoutpca = accuracy_score(y_test, y_predict_DT_withoutpca) print('accuracy by Decision Tree without PCA=', accuracy_score(y_test, y_predict_DT_withoutpca)) #Matthews correlation coefficient for Decision Tree Classifier m_DT_withoutpca = matthews_corrcoef(y_test, y_predict_DT_withoutpca) print( 'Matthew\'s correlation coefficient for Decision Tree Classifier without PCA =', m_DT_withoutpca)
# 1. Load the breast-cancer dataset (`cancer` bound in an earlier cell).
df_cancer = cancer()
df_cancer.keys()
# dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

# 2. train set, test set split (default 75/25, fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(df_cancer['data'],
                                                    df_cancer['target'],
                                                    random_state=0)

# 3. Fit a default decision-tree classifier.
# NOTE(review): dt_r (the regressor) is imported but unused here —
# presumably for a later cell.
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.tree import DecisionTreeRegressor as dt_r

m_dt = dt()
m_dt.fit(x_train, y_train)

# 4. Model evaluation (mean accuracy on the held-out split)
m_dt.score(x_test, y_test)  # 88%

# 5. Parameter tuning: sweep min_samples_split over 2..20 and record
# train/test accuracy for each setting (lists consumed by a later cell,
# presumably for plotting — confirm).
score_train = []
score_test = []
for i in np.arange(2, 21):
    m_dt = dt(min_samples_split=i)
    m_dt.fit(x_train, y_train)
    score_train.append(m_dt.score(x_train, y_train))
    score_test.append(m_dt.score(x_test, y_test))
# Accuracy of the Naive Bayes classifier fitted in an earlier cell.
accuracy = accuracy_score(label_test, predicts)
print('Accuracy of Naive Bayes classifier :',accuracy)  # =82%

# 3. Using Decision Tree Induction Classification
# First we need to convert continuous values into categorical bins where we can.
print(max(featuers.age),min(featuers.age))  # to know bin range  #70 #32
ageCategory=pd.cut(featuers.age,bins=[0,17,32,65,100],labels=['child','teenager','adult','elderly'])
cigsPerDayCategory=pd.cut(featuers.cigsPerDay,bins=[-1,2.0,5.0,7.0,20.0],labels=['low','medium','high','veryHigh'])
# Insert the binned columns near their continuous originals.
# NOTE(review): the insert positions (2 and 4) assume a fixed column layout
# from earlier cells — confirm against the actual frame.
featuers.insert(2,"ageCategory",ageCategory)
featuers.insert(4,"cigsPerDayCategory",cigsPerDayCategory)
# After adding the categorical columns, drop the continuous originals.
del featuers['age']
del featuers['cigsPerDay']
''' All totChol,sysBP,diaBP,BMI,heartRate,glucose has continuous values between : -0.8,+0.8'''
# NOTE(review): the bin edges [-0.9, -0.3, 4, 9] suggest these columns were
# standardized in an earlier cell — confirm, since raw clinical values would
# fall outside these bins (labelled NaN by pd.cut).
continuousValuesWithSameRange=["totChol","sysBP","diaBP","BMI","heartRate","glucose"]
for column in continuousValuesWithSameRange:
    columnCategory = pd.cut(featuers[column], bins=[-0.9, -0.3,4,9], labels=['low', 'medium', 'high'])
    featuers.insert(13, column+"Category",columnCategory)
    # After adding the categorical column, drop the continuous original.
    del featuers[column]
print("Categorial data set:",featuers)

# NOW the data is ready for Decision Tree Induction classification.
# NOTE(review): feature_train/feature_test come from an earlier split —
# confirm they reflect the categorical transformation applied above.
from sklearn.tree import DecisionTreeClassifier as dt
print("here")
model = dt(random_state=1)
model.fit(feature_train, label_train)
predicts=model.predict(feature_test)
print("PREDICT RESULT Using Decision Tree Induction:",predicts)
accuracy = accuracy_score(label_test, predicts)
print('Accuracy of Decision Tree Induction classifier :',accuracy)  # =77%
# In[124]: accuracy = confusion_matrix(y_test, clf_pred) accuracy # In[127]: from sklearn.metrics import accuracy_score accuracy = accuracy_score(y_test, clf_pred) accuracy # # This exaample to determine accuracy via Decision Tree # In[130]: clf_DT = dt() # In[131]: clf_DT.fit(X_train, y_train) # In[132]: clf_dt_pred = clf_DT.predict(X_test) # In[133]: clf_dt_pred # In[134]:
def main():
    """Load the cancer train/test CSVs, tune and train four classifiers
    (SVM, DT-Gini, DT-IG, LDA) plus a Random Forest, then print and plot
    precision / recall / F-measure for each.
    """
    train, test = load("cancer-data-train.csv"), load("cancer-data-test.csv")
    X_train, y_train = train
    X_test, y_test = test
    # Command-line handling; may alter the feature matrices and decides
    # whether predictions are printed (exact behavior lives in arguments()).
    X_train, X_test, print_pred = arguments(sys.argv, X_train, X_test)
    fig = plot.figure()
    # Passing training data and classes to find best C and number of leaf nodes to use. Also creating graphs to display this info
    classifier_plotter(X_train, y_train)
    # Setting up graphs for each plot (bottom row of a 2x3 grid)
    ax1 = fig.add_subplot(234)
    ax1.set_title('Average Precsion Scores')
    ax1.set_ylabel('Precsion Score')
    ax1.set_xlabel('Classifier')
    ax2 = fig.add_subplot(235)
    ax2.set_title('Average Recall Scores')
    ax2.set_ylabel('Recall Score')
    ax2.set_xlabel('Classifier')
    ax3 = fig.add_subplot(236)
    ax3.set_title('Average F-measures')
    ax3.set_ylabel('F-measure')
    ax3.set_xlabel('Classifier')
    # Create and train the classifiers (parallel tuple assignments keep the
    # four models in lockstep: svm, dt-gini, dt-ig, lda).
    classifier_svm, classifier_gini, classifier_ig, classifier_lda = svm(
        kernel='linear', C=0.1), dt(criterion='gini', max_leaf_nodes=10), dt(
            criterion='entropy', max_leaf_nodes=5), lda()
    classifier_svm.fit(X_train, y_train), classifier_gini.fit(
        X_train, y_train), classifier_ig.fit(X_train, y_train), classifier_lda.fit(
            X_train, y_train)
    # Make the predictions
    pred_svm, pred_gini, pred_ig, pred_lda = classifier_svm.predict(
        X_test), classifier_gini.predict(X_test), classifier_ig.predict(
            X_test), classifier_lda.predict(X_test)
    # Calculate the precision, recall, f-measure for each model
    avg_precision_svm, avg_precision_gini, avg_precision_ig, avg_precision_lda = average_precision_score(
        y_test, pred_svm), average_precision_score(
            y_test, pred_gini), average_precision_score(
                y_test, pred_ig), average_precision_score(y_test, pred_lda)
    recall_svm, recall_gini, recall_ig, recall_lda = recall_score(
        y_test, pred_svm, average='weighted'), recall_score(
            y_test, pred_gini, average='weighted'), recall_score(
                y_test, pred_ig, average='weighted'), recall_score(y_test,
                                                                   pred_lda,
                                                                   average='weighted')
    f_svm, f_gini, f_ig, f_lda = f1_score(
        y_test, pred_svm, average='weighted'), f1_score(
            y_test, pred_gini, average='weighted'), f1_score(
                y_test, pred_ig, average='weighted'), f1_score(y_test,
                                                               pred_lda,
                                                               average='weighted')

    ################## Extra Credit #########################
    # Train classifier and make predictions on test set
    classifier_rfc = rfc(n_estimators=100, max_depth=2)
    classifier_rfc.fit(X_train, y_train)
    pred_rfc = classifier_rfc.predict(X_test)
    # Calculate precision, recall and f-measure for Random Forest Classifier
    avg_precision_rfc = average_precision_score(y_test, pred_rfc)
    recall_rfc = recall_score(y_test, pred_rfc, average='weighted')
    f_rfc = f1_score(y_test, pred_rfc, average='weighted')
    #########################################################

    # Printing scores and predictions
    print_scores([[
        avg_precision_svm, avg_precision_gini, avg_precision_ig,
        avg_precision_lda, avg_precision_rfc
    ], [recall_svm, recall_gini, recall_ig, recall_lda, recall_rfc],
                  [f_svm, f_gini, f_ig, f_lda, f_rfc]])
    print_predictions([pred_svm, pred_gini, pred_ig, pred_lda, pred_rfc],
                      print_pred)
    # Create the graphs for the scores
    score_plotter(ax1, [
        avg_precision_svm, avg_precision_gini, avg_precision_ig,
        avg_precision_lda, avg_precision_rfc
    ])
    score_plotter(ax2,
                  [recall_svm, recall_gini, recall_ig, recall_lda, recall_rfc])
    score_plotter(ax3, [f_svm, f_gini, f_ig, f_lda, f_rfc])
    plot.tight_layout(w_pad=1.5, h_pad=2.0)
    plot.show()
def classifier_plotter(X_train, y_train):
    '''
    Takes the training data and runs through SVM, DT-Gini and DT-IG with
    multiple C values and max_leaf_nodes to try. The method then creates a
    graph by taking the average of cross validation scores for that C value
    or max_leaf_node.

    Params:
    X_train: List/s of features already standardized from the initial dataset
    y_train: List of classifiers for X_train taken from the original dataset

    Return:
    Outputs a graph of the average cross validation scores.
    '''
    # NOTE(review): `i` is incremented but never read; `d` guards one-time
    # subplot setup per section. Kept as-is.
    i, d = 1, 0
    # Values to test
    c_values = [0.01, 0.1, 1, 10, 100]
    k_values = [2, 5, 10, 20]
    classifiers = ["SVM", "DT-Gini & DT-IG"]
    for clf in classifiers:
        count = 1
        if clf == "SVM":
            # Set up the SVM subplot only once.
            if d == 0:
                ax = plot.subplot(231)
                ax.set_title(clf)
                plot.ylabel('F-measure')
                plot.xlabel('C values')
                d += 1
            print('SVM')
            # Sweep C; each point is the mean of 10-fold CV scores.
            for c in c_values:
                classi = svm(kernel='linear', C=c).fit(X_train, y_train)
                scores = cross_val_score(classi, X_train, y_train, cv=10)
                ax.plot(str(c), scores.mean(), 'bs')
                print('%d.) %.4f%%' % (count, scores.mean() * 100))
                count += 1
            plot.axis([None, None, 0.90, 1])
            print('\n')
            i += 1
            d = 0
        elif clf == "DT-Gini & DT-IG":
            count = 1
            if d == 0:
                ax = plot.subplot(232)
                plot.ylabel('F-measure')
                plot.xlabel('Max Leaf Nodes')
            print(' Gini\tIG')
            # Sweep max_leaf_nodes for both split criteria; 10-fold CV means.
            for k in k_values:
                gini_class, ig_class = dt(criterion='gini', max_leaf_nodes=k), dt(
                    criterion='entropy', max_leaf_nodes=k)
                score_gini, score_ig = cross_val_score(gini_class,
                                                       X_train,
                                                       y_train,
                                                       cv=10), cross_val_score(
                                                           ig_class, X_train,
                                                           y_train, cv=10)
                # Red dots = Gini, green dots = IG (entropy).
                ax.plot(str(k), score_gini.mean(), 'r.', str(k),
                        score_ig.mean(), 'g.')
                print('%d.) %.4f%%\t%.4f%%' %
                      (count, score_gini.mean() * 100, score_ig.mean() * 100))
                count += 1
            plot.axis([None, None, 0.889, 0.96])
            ax.legend(('Gini', 'IG'), loc=2)
            print('\n')
            i += 1
            d = 0
        else:
            return "Should not get here."
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

# BUG FIX: converted Python 2 print statements to print() calls for
# consistency with the rest of the code base (which is Python 3).
print(len(features_train[0]))

from sklearn.tree import DecisionTreeClassifier as dt

clf = dt(min_samples_split=40)

# Time the fit; `[:]` keeps the original full-slice behavior.
t0 = time()
clf.fit(features_train[:], labels_train[:])
print('Training time:', round(time() - t0, 3), 's')

# Time prediction and count predictions per class (1 = Chris, 0 = Sara —
# presumably; confirm against email_preprocess label encoding).
t0 = time()
pred = clf.predict(features_test).tolist()
print('Chis Occurrences:', pred.count(1))
print('Shara Occurrences:', pred.count(0))
# print('Predictions: ', pred)
print('Predicting time:', round(time() - t0, 3), 's')

from sklearn.metrics import accuracy_score

t0 = time()
accuracy = accuracy_score(pred, labels_test)
# 23. Split train into training and validation dataset X_train, X_test, y_train, y_test = train_test_split( X, target, test_size = 0.3) # 23.1 X_train.shape # 43314 X 135 if no kmeans: (43314, 126) X_test.shape # 18564 X 135; if no kmeans: (18564, 126) # 24 Decision tree classification # 24.1 Create an instance of class clf = dt(min_samples_split = 5, min_samples_leaf= 5 ) start = time.time() # 24.2 Fit/train the object on training data # Build model clf = clf.fit(X_train, y_train) end = time.time() (end-start)/60 # 1 minute # 24.3 Use model to make predictions classes = clf.predict(X_test) # 24.4 Check accuracy
def GridSearchCV_hp_tuning(X_train, X_test, y_train, y_test):
    """Hyper-parameter tuning for SVM, KNN and Decision Tree via GridSearchCV,
    followed by per-parameter sweeps (C, kernel, max_depth, k) with accuracy
    plots.

    BUG FIX: the final KNN sweep printed `grid.best_params_` /
    `grid.best_estimator_` — the *stale decision-tree* search object — instead
    of `knn_grid`'s results; it now reports the KNN search. Also removed the
    unused `a = grid.fit(...)` binding and a redundant duplicate
    `grid.predict(X_test)` call in the max_depth sweep, plus duplicate
    mid-function re-imports.

    Relies on a module-level `Accuracy(predictions, y_true)` helper.
    Side effects: prints search results and shows four matplotlib figures.
    """
    ## for SVM for best parameters
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    import matplotlib.pyplot as plt
    from sklearn.metrics import classification_report

    param_grid = {'C': [1, 10], 'gamma': ('auto','scale'), 'kernel': ['linear']}
    grid = GridSearchCV(SVC(), param_grid, cv=2)
    grid.fit(X_train, y_train)
    print('Best parameters: ', grid.best_params_)
    print('Best estimator: ', grid.best_estimator_)
    grid_predictions = grid.predict(X_test)
    acc = Accuracy(grid_predictions, y_test)
    print('Acc: ', acc)
    print(classification_report(y_test, grid_predictions))

    ## for Knn for best parameters
    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier()
    params_knn = {'n_neighbors': [2,3,4,5],
                  'weights': ['uniform'],
                  'metric': ['euclidean']}
    knn_grid = GridSearchCV(knn, params_knn, cv=3)
    knn_grid.fit(X_train, y_train)
    print('Best parameters: ', knn_grid.best_params_)
    print('Best estimator: ', knn_grid.best_estimator_)
    grid_predictions = knn_grid.predict(X_test)
    acc = Accuracy(grid_predictions, y_test)
    print('Acc: ', acc)
    print(classification_report(y_test, grid_predictions))

    ## for Decision Tree for best parameters
    from sklearn.tree import DecisionTreeClassifier as dt

    clf = dt()
    param_grid = {'max_depth': [1,2,3],
                  'min_samples_leaf': [1,2,3,4,5],
                  'min_samples_split': [2,3,4],
                  'criterion': ['gini','entropy']}
    grid = GridSearchCV(clf, param_grid, cv=10)
    grid.fit(X_train, y_train)
    print('Best parameters: ', grid.best_params_)
    print('Best estimator: ', grid.best_estimator_)
    grid_predictions = grid.predict(X_test)
    acc = Accuracy(grid_predictions, y_test)
    print('Acc: ', acc)
    print(classification_report(y_test, grid_predictions))

    ## Tuning the hyperparameters
    ## for SVM: parameter C — sweep C values one at a time and plot accuracy.
    c_values = [0.1, 1, 10, 100]
    acc = []
    for i in c_values:
        param_grid = {'C': [i], 'gamma': ('auto','scale'), 'kernel': ['linear']}
        grid = GridSearchCV(SVC(), param_grid, cv=2)
        grid.fit(X_train, y_train)
        print('Best parameters: ', grid.best_params_)
        print('Best estimator: ', grid.best_estimator_)
        grid_predictions = grid.predict(X_test)
        acc_1 = Accuracy(grid_predictions, y_test)
        acc.append(acc_1)
    xi = list(range(len(c_values)))
    plt.plot(xi, acc, marker='o', linestyle='--', color='r', label='acc')
    plt.xlabel('C values',fontweight="bold",fontsize = 12)
    plt.ylabel('accuracy',fontweight="bold",fontsize = 12)
    plt.title("C vs accuracy for GridSearchCV SVM",fontweight="bold",fontsize = 16)
    plt.xticks(xi, c_values)
    plt.legend()
    plt.show()

    ## for SVM: parameter kernel — sweep kernels at fixed C=10.
    kernel_values = ['linear', 'rbf']
    acc_k = []
    for i in kernel_values:
        # defining parameter range
        param_grid = {'C': [10], 'gamma': ('auto','scale'), 'kernel': [i]}
        grid = GridSearchCV(SVC(), param_grid, cv=2)
        grid.fit(X_train, y_train)
        print('Best parameters: ', grid.best_params_)
        print('Best estimator: ', grid.best_estimator_)
        grid_predictions = grid.predict(X_test)
        acc_1 = Accuracy(grid_predictions, y_test)
        acc_k.append(acc_1)
    xi = list(range(len(kernel_values)))
    plt.plot(xi, acc_k, marker='o', linestyle='--', color='r', label='acc')
    plt.xlabel('kernel',fontweight="bold",fontsize = 12)
    plt.ylabel('accuracy',fontweight="bold",fontsize = 12)
    plt.title("kernels vs accuracy for GridSearchCV SVM",fontweight="bold",fontsize = 16)
    plt.xticks(xi, kernel_values)
    plt.legend()
    plt.show()

    ## for decision tree: parameter max_depth — one grid search per depth.
    max_depth_values = [1, 2, 3]
    acc_dep = []
    clf = dt()
    for i in max_depth_values:
        param_grid = {'max_depth': [i],
                      'min_samples_leaf': [1,2,3,4,5],
                      'min_samples_split': [2,3,4],
                      'criterion': ['gini','entropy']}
        grid = GridSearchCV(clf, param_grid, cv=10)
        grid.fit(X_train, y_train)
        print('Best parameters: ', grid.best_params_)
        print('Best estimator: ', grid.best_estimator_)
        grid_predictions = grid.predict(X_test)
        acc = Accuracy(grid_predictions, y_test)
        acc_dep.append(acc)
    xi = list(range(len(max_depth_values)))
    plt.plot(xi, acc_dep, marker='o', linestyle='--', color='r', label='acc')
    plt.xlabel('max_depth values',fontweight="bold",fontsize = 12)
    plt.ylabel('accuracy',fontweight="bold",fontsize = 12)
    plt.title("max_depth vs accuracy for GridSearchCV Decision Tree",fontweight="bold",fontsize = 16)
    plt.xticks(xi, max_depth_values)
    plt.legend()
    plt.show()

    ## for Knn: parameter K — one grid search per neighbor count.
    knn = KNeighborsClassifier()
    acc_knn = []
    n_values = [2, 3, 4, 5]
    for i in n_values:
        params_knn = {'n_neighbors': [i],
                      'weights': ['uniform'],
                      'metric': ['euclidean']}
        knn_grid = GridSearchCV(knn, params_knn, cv=3)
        knn_grid.fit(X_train, y_train)
        # BUG FIX: report this loop's KNN search, not the stale `grid`.
        print('Best parameters: ', knn_grid.best_params_)
        print('Best estimator: ', knn_grid.best_estimator_)
        grid_predictions = knn_grid.predict(X_test)
        acc_1 = Accuracy(grid_predictions, y_test)
        print('acc: ',acc_1)
        acc_knn.append(acc_1)
    xi = list(range(len(n_values)))
    plt.plot(xi, acc_knn, marker='o', linestyle='--', color='r', label='acc')
    plt.xlabel('k values',fontweight="bold",fontsize = 12)
    plt.ylabel('accuracy',fontweight="bold",fontsize = 12)
    plt.title("k vs accuracy for GridSearchCV Knn",fontweight="bold",fontsize = 16)
    plt.xticks(xi, n_values)
    plt.legend()
    plt.show()
import pandas as pd
from sklearn.tree import DecisionTreeRegressor as dt
from sklearn.metrics import mean_absolute_error as me  # for calculating errors
from sklearn.ensemble import RandomForestRegressor as rf  # another model

# Load the iris data; modelling y = mx + c + E(epsilon).
iris_data = pd.read_csv('iris.csv')
#print(iris_data.columns)

# Feature matrix and target.
feature_cols = ['sepal length', 'sepal width', 'petal length', 'petal width']
x = iris_data[feature_cols]
y = iris_data[['category']]
#print(y)

# Decision-tree regressor: fit on the full data set.
tree_model = dt()
tree_model.fit(x, y)
#print(x.head())

# Predictions for the first few rows.
print(tree_model.predict(x.head()))

# Training-set mean absolute error — lower means a closer fit.
print(me(y, tree_model.predict(x)))

# Random-forest counterpart for comparison.
forest_model = rf()
forest_model.fit(x, y)
print(forest_model.predict(x.head()))
y[index] = 0 else: y[index] = 1 np.save('x.npy', x) np.save('y.npy', y) else: print('Loading from x.npy, y.npy') x = np.load('x.npy') y = np.load('y.npy') # Training # Split into test/train x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) clf = dt(max_depth=10) print("Starting decision tree training...") clf = clf.fit(x_train, y_train) print("DT result: ", clf.score(x_test, y_test)) target_names = ['Not Churn', 'Churn'] y_pred = clf.predict(x_test) print(classification_report(y_test, y_pred, target_names=target_names)) print("Starting SVM training...") clf = SVC(C=1, kernel='rbf', gamma=0.125, decision_function_shape='ovr') clf = clf.fit(x_train, y_train) print("SVM result: ", clf.score(x_test, y_test)) target_names = ['Not Churn', 'Churn'] y_pred = clf.predict(x_test) print(classification_report(y_test, y_pred, target_names=target_names))
enc.fit(y_train) # Let the object learn data y_tr = enc.transform(y_train) # Let it encode y_tr # 2.3 Check mapping enc.classes_ # array(['setosa', 'versicolor', 'virginica'] # Corresponds to 0,1,2 # 2.4 Verify: enc.transform(['setosa','versicolor', 'virginica']) # 3. Start modeling # 3.1 Initialize our decision tree object. # Supply relevant parameters ct = dt( criterion="gini", # Alternative 'entropy' max_depth=None # Alternative, specify an integer # 'None' means full tree till single leaf ) # 3.2 Train our decision tree c_tree = ct.fit(X_train,y_tr) # 4.0 Make predictions of test data # 4.1 First transform y_test into inetgers # just as in y_tr # We use the already trained enc() object y_te = enc.transform(y_test) # 4.2 Now make prediction out = ct.predict(X_test) out
import sklearn
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.model_selection import train_test_split

# Keep only the columns used for this model; `data` comes from an earlier cell.
datasub = data[['Age', 'EstimatedSalary', 'Purchased']]
datasub.head(5)

# Features and binary target.
X = datasub[['Age', 'EstimatedSalary']]
y = datasub['Purchased']

# 80/20 split; shuffle=False keeps the original row order (last 20% is test).
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)
X_train

# Fit a default decision-tree classifier and predict the held-out rows.
dtclf = dt()
dtclf.fit(X_train, y_train)
pred = dtclf.predict(X_test)

from sklearn.metrics import accuracy_score
# NOTE(review): arguments are (pred, y_test) — reversed from the documented
# (y_true, y_pred) order, but accuracy is symmetric so the value is the same.
print('accuracy is ', accuracy_score(pred, y_test))

from sklearn.metrics import classification_report

# Side-by-side actual vs. predicted labels for inspection.
df = pd.DataFrame({'Actual': y_test, 'Predicted': pred})
df

import matplotlib.pyplot as plt
scores.append(sum(stats['test_score']) / len(stats['test_score'])) for score in scores: print('monk', i, ':', score) print('-----------------------------') return sum(scores) / len(scores) x1, y1 = read_data("monks-1.csv") x2, y2 = read_data("monks-2.csv") x3, y3 = read_data("monks-3.csv") feats = [x1, x2, x3] labs = [y1, y2, y3] print('***Using 3-fold validation***') worst = show_stats(feats, labs, pct(max_iter=100, tol=0), 3, 'perceptron') best = show_stats(feats, labs, dt(max_depth=10), 3, 'decision tree') show_stats(feats, labs, knn(n_neighbors=3), 3, 'K-nearest-neighbors') show_stats(feats, labs, gnb(), 3, 'Gaussian Naive Bayes') print('t test between perceptron and decision tree:', ttest_ind(worst, best)) print('***Using Leave-one-out***') worst = show_stats(feats, labs, pct(max_iter=50, tol=0), loo(), 'perceptron') best = show_stats(feats, labs, dt(max_depth=10), loo(), 'decision tree') show_stats(feats, labs, knn(n_neighbors=3), loo(), 'K-nearest-neighbors') show_stats(feats, labs, gnb(), loo(), 'Gaussian Naive Bayes') print('t test between perceptron and decision tree:', ttest_ind(worst, best))
#%% library imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeRegressor as dt

#%% read the data (semicolon-separated)
df = pd.read_csv("dataset.csv", sep=";")

#%% wrap each column's values in its own DataFrame
dfTemp = pd.DataFrame(df.iloc[:, 0].values)   # temperature (feature)
dfRainy = pd.DataFrame(df.iloc[:, 1].values)  # rainy (target)

#%% fit a decision-tree regressor
tree = dt()
tree.fit(dfTemp, dfRainy)

#%% dense grid over the observed temperature range for a smooth curve
temps = df.iloc[:, 0].values
dfTempArange = np.arange(min(temps), max(temps), 0.001).reshape(-1, 1)

#%% model predictions over the grid
y_results = tree.predict(dfTempArange).reshape(-1, 1)

#%% plot observations (scatter) and the fitted step function (line)
plt.scatter(dfTemp, dfRainy, color="black")
plt.xlabel("Temperature")
plt.ylabel("Rainy")
plt.plot(dfTempArange, y_results, color="gray")
plt.show()