coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale',
              kernel='rbf', max_iter=-1, probability=False, random_state=None,
              shrinking=True, tol=0.001, verbose=False)),
          ('KNN', KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                       metric='minkowski', metric_params=None,
                                       n_jobs=None, n_neighbors=5, p=2,
                                       weights='uniform')),
          ('NB', GaussianNB(priors=None, var_smoothing=1e-09))]

results = []
names = []
folds = 7
for name, model in models:
    # shuffle=True is required when passing random_state to KFold
    kfold = model_selection.KFold(n_splits=folds, shuffle=True, random_state=folds)
    accuracy = model_selection.cross_val_score(model, X, np.ravel(Y), cv=kfold,
              if labels_train[ii] == 1]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
# keep the (bumpiness, grade) axis order consistent with the labels below
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()

################################################################################
from sklearn.neighbors import KNeighborsClassifier

# Create KNN classifier
knn_clf = KNeighborsClassifier(n_neighbors=13, p=6)

# Fit the training data to build the model
knn_clf.fit(features_train, labels_train)

# Pass testing features to get predictions out of the classifier
pred = knn_clf.predict(features_test)

from sklearn.metrics import accuracy_score

# Get accuracy of the prediction by comparing with the testing labels
acc = accuracy_score(labels_test, pred)
print(acc)

#########################################################
# Accuracy = 0.94
try:
    prettyPicture(knn_clf, features_test, labels_test)
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
        (Perceptron(max_iter=50, tol=1e-3), "Perceptron"),
        (PassiveAggressiveClassifier(max_iter=50, tol=1e-3), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))
    # Train SGD model
    results.append(
        benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))
X = pd.read_csv('.\\Datasets\\wheat.data', index_col=0)  # load dataset
X.dropna(inplace=True)
y = X.wheat_type.copy().map({'canadian': 0, 'kama': 1, 'rosa': 2})  # create labels
X.drop('wheat_type', axis=1, inplace=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=7)

svc = SVC(kernel='linear', C=C)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier(max_depth=9, random_state=2)

benchmark(decision_tree_model, X_train, X_test, y_train, y_test,
          'Decision Tree Classifier')
drawPlots(decision_tree_model, X_train, X_test, y_train, y_test,
          'Decision Tree Classifier')

benchmark(knn, X_train, X_test, y_train, y_test, 'KNeighbors')
drawPlots(knn, X_train, X_test, y_train, y_test, 'KNeighbors')

benchmark(svc, X_train, X_test, y_train, y_test, 'SVC')
drawPlots(svc, X_train, X_test, y_train, y_test, 'SVC')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35,
                                                    random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
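    # The meshgrid call above is cut off in the original; the arguments below
    # are a sketch of the usual decision-boundary grid (the 0.01 step size is
    # an assumption, not taken from the original).
    np.arange(X_set[:, 0].min() - 1, X_set[:, 0].max() + 1, 0.01),
    np.arange(X_set[:, 1].min() - 1, X_set[:, 1].max() + 1, 0.01))

# Predict over the grid and draw the boundary plus the training points.
# Two classes and the red/green colormap are illustrative assumptions.
Z = classifier.predict(np.c_[X1.ravel(), X2.ravel()]).reshape(X1.shape)
plt.contourf(X1, X2, Z, alpha=0.4, cmap=ListedColormap(('red', 'green')))
for idx, cls in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == cls, 0], X_set[y_set == cls, 1],
                color=ListedColormap(('red', 'green'))(idx), label=cls)
plt.legend()
plt.show()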
def train_test(code_classifier):
    training_set = []
    class_training = []
    annot_training = []
    total_training_set = []
    total_class_training = []
    testing_set = []
    class_testing = []
    annot_testing = []
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    Prec = 0
    Rec = 0
    Fscore = 0
    Spec = 0
    res_sequence = []
    list_name = read_training_testing_file(2)

    if code_classifier == 0:
        clf = svm.SVC()
        clf_name = "SVM"
    elif code_classifier == 1:
        clf = tree.DecisionTreeClassifier(random_state=1)
        clf_name = "Decision_Tree"
    elif code_classifier == 2:
        clf = KNeighborsClassifier(n_neighbors=1, leaf_size=40)
        clf_name = "KNN"
    else:
        clf = LogisticRegression(C=1e8)
        clf_name = "Logistic"

    # Leave-one-group-out: each name in turn is the test set, the rest train
    for name in list_name:
        for sub_name in list_name:
            if name == sub_name:
                print("loading test samples")
                testing_set, class_testing = read_file(sub_name)
            else:
                print("loading training samples " + str(sub_name))
                training_set, class_training = read_file(sub_name)
                for i in range(len(training_set)):
                    total_training_set.append(training_set[i])
                    total_class_training.append(class_training[i])

        print("training and testing " + name)
        print(len(total_training_set))
        clf = clf.fit(total_training_set, total_class_training)
        prediction_val = clf.predict(testing_set)
        TP, FP, TN, FN = calc_metrics(prediction_val, class_testing, name)

        # The case when the result is on the edge
        if TP == 0 and FP == 0:
            Prec = 0
        else:
            Prec = float(TP) / (TP + FP) * 100
        Rec = float(TP) / (TP + FN) * 100
        Fscore = float(2 * TP) / ((2 * TP) + FP + FN) * 100
        Spec = float(TN) / (FP + TN) * 100

        del total_training_set[:]
        del total_class_training[:]
        result_metric = [name, TP, FP, TN, FN, Prec, Rec, Fscore, Spec]
        res_sequence.append(result_metric)
        # print_read_classifier(fin_ml, clf_name, code, percentage, True)

    return res_sequence
print(counter)
# print(df)
# print(df.shape)

""" KNN """
# Create arrays for features and target variable
y = df['osteoporosis']
X = df.drop('osteoporosis', axis=1)

# Train/test split
(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

knn = KNeighborsClassifier(n_neighbors=3, algorithm='brute', metric='euclidean')
cv_results = cross_val_score(knn, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_results.mean(), cv_results.std() * 2))
print("Execution time : %0.3f seconds" % (time.time() - start_time))
"""
Accuracy: 0.56 (+/- 0.12)
Execution time : 2.072 seconds
"""
def knnClassify(trainData, trainLabel):
    # default: k = 5; set your own k with KNeighborsClassifier(n_neighbors=10)
    knnClf = KNeighborsClassifier()
    # ravel returns a contiguous flattened array (labels as a 1-D vector)
    knnClf.fit(trainData, ravel(trainLabel))
    return knnClf
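# A minimal usage sketch for the helper above, assuming numpy's ravel is in
# scope (from numpy import ravel); the toy arrays are illustrative, not from
# the original.
import numpy as np
trainData = np.array([[0.0, 0.1], [0.9, 1.0], [0.1, 0.0], [1.0, 0.9]])
trainLabel = np.array([[0], [1], [0], [1]])  # column vector, flattened by ravel
model = knnClassify(trainData, trainLabel)
print(model.predict(np.array([[0.05, 0.05]])))  # expected: [0]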
kf = ms.KFold(n_splits=5, shuffle=True, random_state=42)

k_score = list()
for k in range(1, 51):
    k_score.append((
        ms.cross_val_score(
            estimator=KNeighborsClassifier(n_neighbors=k),
            X=X, y=y, cv=kf, scoring='accuracy').mean(),
        k))

print(sorted(k_score, reverse=True
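# The sorted(...) call above is cut off in the original; the closing parens
# and the follow-up that refits the best k are a sketch of the natural
# continuation, not from the original.
))

best_score, best_k = max(k_score)  # tuples sort by mean accuracy first
final_model = KNeighborsClassifier(n_neighbors=best_k).fit(X, y)
print('best k = {0}, CV accuracy = {1:.3f}'.format(best_k, best_score))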
features = scaler.fit_transform(features)

### Try a variety of classifiers
# Import classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
#from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Initialize classifiers
clf_NB = GaussianNB()
clf_DT = tree.DecisionTreeClassifier(min_samples_split=5, criterion='entropy')
#clf_SVC = SVC()
clf_KN = KNeighborsClassifier()
clf_RF = RandomForestClassifier()
clf_AB = AdaBoostClassifier()

# Leverage tester.py to fit and test the classifiers
test_classifier(clf_NB, my_dataset, features_list)
#test_classifier(clf_DT, my_dataset, features_list)
#test_classifier(clf_SVC, my_dataset, features_list)
#test_classifier(clf_KN, my_dataset, features_list)
#test_classifier(clf_RF, my_dataset, features_list)
#test_classifier(clf_AB, my_dataset, features_list)

# Apply Grid Search to fine-tune the parameters
# (note: sklearn.grid_search is the legacy module; modern scikit-learn uses
#  sklearn.model_selection.GridSearchCV instead)
from sklearn import grid_search

# Set the parameters for my two chosen classifiers
feature_data = pd.DataFrame(x_scaled, columns=feature_data.columns)
body_type_labels = body_type_df["body_type"]

body_type_train, body_type_test, body_type_labels_train, body_type_labels_test = \
    train_test_split(feature_data, body_type_labels, train_size=0.8,
                     test_size=0.2, random_state=6)

### CLASSIFICATION
start = timeit.default_timer()

### KNeighbors
classifier = KNeighborsClassifier(n_neighbors=80)
classifier.fit(body_type_train, body_type_labels_train)
print("Score: " + str(classifier.score(body_type_test, body_type_labels_test)))

prediction = classifier.predict(body_type_test)
stop = timeit.default_timer()

print("\nAccuracy score: " + str(accuracy_score(body_type_labels_test, prediction)))
print("Recall score: " + str(recall_score(body_type_labels_test, prediction,
                                          average='micro')))
print("Precision score: " + str(precision_score(body_type_labels_test, prediction,
                                                average='micro')))
def random_subspace(n_estimators, M0, M1, verbose=False):
    if n_estimators == 0:
        ave_sub_model_acc = 0
        acc = 0
        sub_model_accuracies = []  # keep the return statement well-defined
    else:
        standard = False
        # M__pca_ideal = 147
        # M__lda_ideal = 46
        # if verbose:
        #     print('M__pca_ideal = ', M__pca_ideal)
        #     print('M__lda_ideal = ', M__lda_ideal)
        M_pca_bag = N - 1
        M_pca = 147  # M__pca_ideal
        M_lda = 46   # M__lda_ideal
        assert (M1 <= (N - 1 - M0))
        assert (M0 + M1 > M_lda)

        # Base estimator: LDA projection followed by 1-NN
        estimators = [('lda', LinearDiscriminantAnalysis(n_components=M_lda)),
                      ('knn', KNeighborsClassifier(n_neighbors=1))]
        base_est = Pipeline(estimators)
        base_est.fit(X_train.T, y_train.T.ravel())
        acc = base_est.score(X_test.T, y_test.T.ravel())
        if verbose:
            print('Accuracy of base estimator with no pre PCA = %.2f%%' % (acc * 100))

        pca = PCA(n_components=M_pca_bag)
        W_train = pca.fit_transform(X_train.T)
        W_test = pca.transform(X_test.T)
        base_est.fit(W_train, y_train.T.ravel())
        acc = base_est.score(W_test, y_test.T.ravel())
        if verbose:
            print('Accuracy of base estimator with pre PCA applied = %.2f%%' % (acc * 100))

        estimators = []
        sub_model_accuracies = []
        masks = []
        for i in range(n_estimators):
            # Keep the first M0 components, then sample M1 of the rest at random
            mask0 = np.arange(M0)
            mask1 = np.random.choice(np.arange(M0, (N - 1)), M1, replace=False)
            mask1 = np.array(mask1).ravel()
            mask = np.concatenate((mask0, mask1), axis=None)
            masks.append(mask)
            W_bag = W_train[:, mask]
            y_bag = y_train
            estimator = clone(base_est)
            estimator.fit(W_bag, y_bag.T.ravel())
            name = 'est_' + str(i + 1)
            estimators.append((name, estimator))
            sub_model_acc = estimator.score(W_test[:, mask], y_test.T.ravel())
            sub_model_accuracies.append(sub_model_acc)
            if verbose:
                print('Accuracy of sub model ', i + 1,
                      ' = %.2f%%' % (sub_model_acc * 100))

        ave_sub_model_acc = sum(sub_model_accuracies) / n_estimators
        if verbose:
            print('Average accuracy of sub models = %.2f%%' % (ave_sub_model_acc * 100))

        y_hat = []
        for w in W_test:
            prediction_sum = 0
            predictions = np.empty(n_estimators, dtype=np.int64)
            for i, (name, estimator) in enumerate(estimators):
                y = estimator.predict(w[masks[i]].reshape(1, -1))
                prediction_sum = prediction_sum + float(y[0])
                predictions[i] = int(y[0])
            # sum
            prediction = round(prediction_sum / n_estimators)
            # y_hat.append(prediction)
            # voting
            counts = np.bincount(predictions)
            y_hat.append(np.argmax(counts))

        acc = accuracy_score(y_test.T, y_hat)
        if verbose:
            print('Accuracy of ensemble models = %.2f%%' % (acc * 100))

    return acc, ave_sub_model_acc, sub_model_accuracies
def first_generation(X, y, seed=None):
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 16],
                                            [0, 0.2, 0.5, 0.9],
                                            [0.3, 0.6]))
    mlp_clf = [MLPClassifier(hidden_layer_sizes=(h,), momentum=m,
                             learning_rate_init=a)
               for (h, m, a) in mlp_parameters]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 20)]
    weighting_methods = ['uniform', 'distance', lambda x: abs(1 - x)]
    knn_clf = [KNeighborsClassifier(n_neighbors=nn, weights=w)
               for (nn, w) in itertools.product(neigbhors_number,
                                                weighting_methods)]
    knn_name = ['knn_{0}_{1}'.format(*param)
                for param in itertools.product(
                    neigbhors_number, ['uniform', 'distance', 'similarity'])]

    C = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]
    degree = [2, 3]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [SVC(C=c, kernel='poly', degree=d)
                    for (c, d) in itertools.product(C, degree)]
    svm_clf_poly_name = ['svm_poly_{0}_{1}'.format(*param)
                         for param in itertools.product(C, degree)]
    svm_clf_rbf = [SVC(C=c, kernel='rbf', gamma=g)
                   for (c, g) in itertools.product(C, gamma)]
    svm_clf_rbf_name = ['svm_rbf_{0}_{1}'.format(*param)
                        for param in itertools.product(C, gamma)]

    dt_max_depth_params = list(itertools.product(['gini', 'entropy'],
                                                 [1, 2, 3, 4, None]))
    dt_max_depth_clf = [DecisionTreeClassifier(criterion=c, max_depth=d)
                        for (c, d) in dt_max_depth_params]
    dt_max_depth_name = ['dt_max_depth_{0}_{1}'.format(*param)
                         for param in dt_max_depth_params]

    dt_max_features_params = list(itertools.product(['gini', 'entropy'],
                                                    [None, 'sqrt', 'log2', 0.5]))
    dt_max_features_clf = [DecisionTreeClassifier(criterion=c, max_features=f)
                           for (c, f) in dt_max_features_params]
    dt_max_features_name = ['dt_max_features_{0}_{1}'.format(*param)
                            for param in dt_max_features_params]

    dt_min_leaf_params = [2, 3]
    dt_min_leaf_clf = [DecisionTreeClassifier(min_samples_leaf=l)
                       for l in dt_min_leaf_params]
    dt_min_leaf_name = ['dt_min_leaf_{0}'.format(param)
                        for param in dt_min_leaf_params]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_max_depth_clf + \
        dt_max_features_clf + dt_min_leaf_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + \
        dt_max_depth_name + dt_max_features_name + dt_min_leaf_name

    ensemble = VotingClassifier(estimators=list(zip(pool_name, pool)))
    ensemble.fit(X, y)
    estimators = ensemble.estimators_
    return estimators, pool_name
def third_generation(X, y, size=200, seed=None):
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],
                                            [0, 0.2, 0.5, 0.9],
                                            [0.1, 0.3, 0.6]))
    mlp_clf = [MLPClassifier(hidden_layer_sizes=(h,), momentum=m,
                             learning_rate_init=a)
               for (h, m, a) in mlp_parameters]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)]
    weighting_methods = ['uniform', 'distance']
    knn_clf = [KNeighborsClassifier(n_neighbors=nn, weights=w)
               for (nn, w) in itertools.product(neigbhors_number,
                                                weighting_methods)]
    knn_name = ['knn_{0}_{1}'.format(*param)
                for param in itertools.product(neigbhors_number,
                                               ['uniform', 'distance'])]

    C = np.logspace(-3, 7, num=11)
    degree = [2, 3, 4]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [SVC(C=c, kernel='poly', degree=d)
                    for (c, d) in itertools.product(C, degree)]
    svm_clf_poly_name = ['svm_poly_{0}_{1}'.format(*param)
                         for param in itertools.product(C, degree)]
    svm_clf_rbf = [SVC(C=c, kernel='rbf', gamma=g)
                   for (c, g) in itertools.product(C, gamma)]
    svm_clf_rbf_name = ['svm_rbf_{0}_{1}'.format(*param)
                        for param in itertools.product(C, gamma)]

    dt_params = list(itertools.product(['gini', 'entropy'],
                                       [1, 2, 3, 4, 5, None],
                                       [None, 'sqrt', 'log2'],
                                       ['best', 'random']))
    dt_clf = [DecisionTreeClassifier(criterion=c, max_depth=d,
                                     max_features=f, splitter=s)
              for (c, d, f, s) in dt_params]
    dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]
    et_clf = [ExtraTreeClassifier(criterion=c, max_depth=d,
                                  max_features=f, splitter=s)
              for (c, d, f, s) in dt_params]
    et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    ada_params = list(itertools.product([2**i for i in range(1, 14)],
                                        [1, 2, 3]))
    ada_dt_clf = [AdaBoostClassifier(
                      n_estimators=n,
                      base_estimator=DecisionTreeClassifier(max_depth=m))
                  for (n, m) in ada_params]
    ada_et_clf = [AdaBoostClassifier(
                      n_estimators=n,
                      base_estimator=ExtraTreeClassifier(max_depth=m))
                  for (n, m) in ada_params]
    ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params]
    ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params]

    nb_bag_est = 50
    nb_bag_stumps = 200
    bag_dt = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=DecisionTreeClassifier())
    bag_et = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=ExtraTreeClassifier())
    bag_stumps = BaggingClassifier(
        n_estimators=nb_bag_stumps,
        base_estimator=DecisionTreeClassifier(max_depth=1))
    bag_dt.fit(X, y)
    bag_et.fit(X, y)
    bag_stumps.fit(X, y)
    dt_bag_clf = bag_dt.estimators_
    et_bag_clf = bag_et.estimators_
    stump_bag_clf = bag_stumps.estimators_
    dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    stump_bag_name = ['stump_bag_{0}'.format(nb_est)
                      for nb_est in range(nb_bag_stumps)]
    bag_dt_clf = [bag_dt]
    bag_et_clf = [bag_et]  # was [bag_dt]; the ExtraTree bagger was never pooled
    bag_stump_clf = [bag_stumps]
    bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))]
    bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))]
    bag_stump_name = ['bag_stump_{0}'.format(str(nb_bag_stumps))]

    nb_rf = 15
    rf = RandomForestClassifier(n_estimators=nb_rf)
    rf.fit(X, y)
    dt_rf_clf = rf.estimators_
    dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)]

    log_parameters = list(itertools.product(['l1', 'l2'],
                                            np.logspace(-5, 9, num=15),
                                            [True, False]))
    log_clf = [LogisticRegression(penalty=l, C=c, fit_intercept=f)
               for (l, c, f) in log_parameters]
    log_name = ['log_{0}_{1}_{2}'.format(*param) for param in log_parameters]

    sgd_parameters = list(itertools.product(
        ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
         'squared_loss', 'huber', 'epsilon_insensitive',
         'squared_epsilon_insensitive'],
        ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1)))
    sgd_clf = [SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1)
               for (l, p, f, l1) in sgd_parameters]
    sgd_name = ['sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + \
        ada_dt_clf + ada_et_clf + dt_bag_clf + et_bag_clf + stump_bag_clf + \
        bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + log_clf + sgd_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + \
        dt_name + et_name + ada_dt_name + ada_et_name + dt_bag_name + \
        et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \
        bag_stump_name + dt_rf_name + log_name + sgd_name

    for model in pool:
        if not check_model_is_fitted(model, X[0, :].reshape((1, -1))):
            model.fit(X, y)

    np.random.seed(seed)
    order = np.random.permutation(range(len(pool)))
    estimators = [pool[i] for i in order[:size]]
    return estimators, pool_name
#CV = model_selection.LeaveOneOut()
errors = np.zeros((K, L))
i = 0
for train_index, test_index in CV.split(X):
    print('Crossvalidation fold: {0}/{1}'.format(i + 1, K))

    # Extract training and test set for current CV fold
    X_train = X[train_index, :]
    y_train = y[train_index]
    X_test = X[test_index, :]
    y_test = y[test_index]
    # print(test_index)

    # Fit classifier and classify the test points (consider 1 to 40 neighbors)
    for l in range(1, L + 1):
        knclassifier = KNeighborsClassifier(n_neighbors=l)
        knclassifier.fit(X_train, y_train)
        y_est = knclassifier.predict(X_test)
        errors[i, l - 1] = np.sum(y_est != y_test)
    i += 1

# Plot the classification error rate
#figure()
#plot(100*sum(errors,0)/N)
#xlabel('Number of neighbors')
#ylabel('Classification error rate (%)')
#show()

#error_sum = 0
#for i in range(0,len(y_test)-1):
#    if y_test[i]!=y_est[i]:
print(classification_report(y_test, y_pred))

# Accuracy
from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(y_test, y_pred))
# Recall
from sklearn.metrics import recall_score
print("Recall: ", recall_score(y_test, y_pred, average='weighted'))
# Precision
from sklearn.metrics import precision_score
print("Precision: ", precision_score(y_test, y_pred, average='weighted'))

"""# **ML Model**"""

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_knn_pred = knn.predict(X_test)

# Accuracy
print("Accuracy: ", accuracy_score(y_test, y_knn_pred))
# Recall
print("Recall: ", recall_score(y_test, y_knn_pred, average='weighted'))
# Precision
print("Precision: ", precision_score(y_test, y_knn_pred, average='weighted'))

"""# **Simple Neural Network**"""

# Number of samples per gradient update over the training data
BATCH_SIZE = 1000
# Normalize data
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))

# Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=4)
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

# Classification with KNN
# Import library from sklearn
from sklearn.neighbors import KNeighborsClassifier

# Training
k = 4
# Train model and predict
neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
neigh

# Predicting
yhat = neigh.predict(X_test)

# Accuracy evaluation
from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# Find the best K
Ks = 10
mean_acc = np.zeros((Ks - 1))
std_acc = np.zeros((Ks - 1))
ConfustionMx = []
for n in range(1, Ks):
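    # The original is cut off at the loop header above; this body is a sketch
    # of the usual continuation (assumed, not from the original): record the
    # mean test accuracy and its standard error for each candidate k.
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    mean_acc[n - 1] = metrics.accuracy_score(y_test, yhat)
    std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])

print("Best accuracy", mean_acc.max(), "with k =", mean_acc.argmax() + 1)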
# Let's look at these 6 algorithms:
#
# Logistic Regression (LR)
# Linear Discriminant Analysis (LDA)
# K-Nearest Neighbors (KNN)
# Classification and Regression Trees (CART)
# Gaussian Naive Bayes (NB)
# Support Vector Machines (SVM)
#
# The list mixes simple linear algorithms (LR and LDA) with nonlinear ones
# (KNN, CART, NB and SVM). We reset the random seed before each run so that
# every algorithm is evaluated on the same data splits, which keeps the final
# results directly comparable.
#
# Let's build and evaluate the models:

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train,
                                                 cv=kfold, scoring=scoring)
    # cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold)
    # results.append(cv_results)
    # names.append(name)
    # msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    # print(msg)
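# A small follow-up sketch: with the commented appends above re-enabled so
# that results/names are populated, the usual side-by-side comparison is a
# boxplot of the per-fold accuracies (plt is assumed to be matplotlib.pyplot).
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()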
def predefined_estimators(estimator, random_state, n_jobs, p):
    """
    Provides the classifiers and parameters used by the module

    Parameters
    ----------
    estimator : str
        Name of scikit-learn estimator.
    random_state : Any number
        Seed to use in randomized components.
    n_jobs : int
        Number of processing cores to use.
    p : dict
        Classifier settings (keys) and values.

    Returns
    -------
    clf : object
        Scikit-learn classifier object
    mode : str
        Flag to indicate whether classifier performs classification or
        regression.
    """
    try:
        from sklearn.experimental import enable_hist_gradient_boosting
    except ImportError:
        pass

    from sklearn.linear_model import (
        LogisticRegression,
        LinearRegression,
        SGDRegressor,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.ensemble import (
        RandomForestClassifier,
        RandomForestRegressor,
        ExtraTreesClassifier,
        ExtraTreesRegressor,
    )
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  GradientBoostingRegressor)
    from sklearn.svm import SVC, SVR
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor

    estimators = {
        "SVC": SVC(C=p["C"], probability=True, random_state=random_state),
        "SVR": SVR(C=p["C"], epsilon=p["epsilon"]),
        "LogisticRegression": LogisticRegression(
            C=p["C"],
            solver="liblinear",
            random_state=random_state,
            multi_class="auto",
            n_jobs=1,
            fit_intercept=True,
        ),
        "LinearRegression": LinearRegression(n_jobs=n_jobs, fit_intercept=True),
        "SGDClassifier": SGDClassifier(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            n_jobs=n_jobs,
            random_state=random_state,
        ),
        "SGDRegressor": SGDRegressor(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            random_state=random_state,
        ),
        "DecisionTreeClassifier": DecisionTreeClassifier(
            max_depth=p["max_depth"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "DecisionTreeRegressor": DecisionTreeRegressor(
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "RandomForestClassifier": RandomForestClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "RandomForestRegressor": RandomForestRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "ExtraTreesClassifier": ExtraTreesClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            bootstrap=True,
            oob_score=True,
        ),
        "ExtraTreesRegressor": ExtraTreesRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            bootstrap=True,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "GradientBoostingClassifier": GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "GradientBoostingRegressor": GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        # NOTE: the two "Hist" entries fall back to the plain GradientBoosting
        # estimators; HistGradientBoostingClassifier/Regressor take different
        # parameters (e.g. max_iter instead of n_estimators).
        "HistGradientBoostingClassifier": GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "HistGradientBoostingRegressor": GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "MLPClassifier": MLPClassifier(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "MLPRegressor": MLPRegressor(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "GaussianNB": GaussianNB(),
        "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
        "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
        "KNeighborsClassifier": KNeighborsClassifier(
            n_neighbors=p["n_neighbors"], weights=p["weights"], n_jobs=n_jobs),
        "KNeighborsRegressor": KNeighborsRegressor(
            n_neighbors=p["n_neighbors"], weights=p["weights"], n_jobs=n_jobs),
    }

    # define classifier
    model = estimators[estimator]

    # classification or regression
    classifiers = {
        "LogisticRegression", "SGDClassifier", "MLPClassifier",
        "DecisionTreeClassifier", "RandomForestClassifier",
        "ExtraTreesClassifier", "GradientBoostingClassifier",
        "HistGradientBoostingClassifier", "GaussianNB",
        "LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis",
        "SVC", "KNeighborsClassifier",
    }
    mode = "classification" if estimator in classifiers else "regression"

    return (model, mode)
def main(verbose):
    # Data to train/test
    sentences, language = prepare_data()
    tests_language, tests_text = get_directory_content(
        "identification_langue/corpus_test1/*.txt")

    # Use cases for test
    test_cases = [
        ClassifierTest('MultinomialNB', MultinomialNB(), 1),
        ClassifierTest('MultinomialNB', MultinomialNB(), 2),
        ClassifierTest('MultinomialNB', MultinomialNB(), 3),
        ClassifierTest('LogisticRegression', LogisticRegression(), 1),
        ClassifierTest('LogisticRegression', LogisticRegression(), 2),
        ClassifierTest('LogisticRegression', LogisticRegression(), 3),
        ClassifierTest('KNeighborsClassifier 3 neighbors', KNeighborsClassifier(3), 1),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 3 neighbors', KNeighborsClassifier(3), 2),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 3 neighbors', KNeighborsClassifier(3), 3),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 5 neighbors', KNeighborsClassifier(5), 1),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 5 neighbors', KNeighborsClassifier(5), 2),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 5 neighbors', KNeighborsClassifier(5), 3),  # Strange predictions
        ClassifierTest('LinearSVC', LinearSVC(random_state=0, tol=1e-5), 1),  # -
        ClassifierTest('LinearSVC', LinearSVC(random_state=0, tol=1e-5), 2),  # GOOD
        ClassifierTest('LinearSVC', LinearSVC(random_state=0, tol=1e-5), 3),  # GOOD
        ClassifierTest('SVC gamma auto', SVC(gamma='auto'), 1),  # strange
        ClassifierTest('SVC gamma auto', SVC(gamma='auto'), 2),  # strange
        ClassifierTest('SVC gamma auto', SVC(gamma='auto'), 3),  # strange
        ClassifierTest('SVC avec linear', SVC(kernel="linear", C=0.025), 1),  # the linear kernel with 1-grams beats the other setups
        ClassifierTest('SVC avec linear', SVC(kernel="linear", C=0.025), 2),  # GOOD
        ClassifierTest('SVC avec linear', SVC(kernel="linear", C=0.025), 3),  # GOOD
        ClassifierTest('SVC gamma 2', SVC(gamma=2, C=1), 1),  # always english...
        ClassifierTest('SVC gamma 2', SVC(gamma=2, C=1), 2),  # always english...
        ClassifierTest('SVC gamma 2', SVC(gamma=2, C=1), 3),  # always english...
        ClassifierTest('DecisionTreeClassifier', DecisionTreeClassifier(max_depth=5), 1),  # very bad
        ClassifierTest('DecisionTreeClassifier', DecisionTreeClassifier(max_depth=5), 2),  # Strange results
        ClassifierTest('DecisionTreeClassifier', DecisionTreeClassifier(max_depth=5), 3),  # Strange results
        ClassifierTest('SGDClassifier ', SGDClassifier(max_iter=1000), 1),
        ClassifierTest('SGDClassifier ', SGDClassifier(max_iter=1000), 2),  # GOOD
        ClassifierTest('SGDClassifier ', SGDClassifier(max_iter=1000), 3),  # GOOD
        ### ClassifierTest('GaussianNB', GaussianNB(), 1),  # Doesn't work... "too dense" error
    ]

    # Just to show header
    headerClassifier = ClassifierTest('Header', None, 0)
    print(headerClassifier.str_keys())

    # Test our cases
    for test_case in test_cases:  # [:1]
        classifier = Classifier(test_case, language, sentences, verbose=False)
        predictions = []
        for test in tests_text:
            prediction = classifier.predict(test)
            predictions.append(prediction[0])
            if verbose:
                print('# Prediction: {} | Text: {}'.format(
                    prediction, test[:70].replace("\n", "")))
        mean = np.mean(np.array(predictions) == tests_language)
        print("{}{}{} | {}".format(bcolors.HEADER, test_case, bcolors.ENDC, mean))
Ytest = Xtest - np.ones((Ntest, 1)) * X.mean(0)

# Obtain the PCA solution by calculating the SVD of Y
U, S, V = linalg.svd(Y, full_matrices=False)
V = V.T

# Repeat classification for different values of K
error_rates = []
for k in K:
    # Project data onto principal component space
    Z = Y @ V[:, :k]
    Ztest = Ytest @ V[:, :k]

    # Classify data with knn classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=1)
    knn_classifier.fit(Z, y.ravel())
    y_estimated = knn_classifier.predict(Ztest)

    # Compute classification error rates
    y_estimated = y_estimated.T
    er = (sum(ytest != y_estimated) / float(len(ytest))) * 100
    error_rates.append(er)
    print('K={0}: Error rate: {1:.1f}%'.format(k, er))

# Visualize error rates vs. number of principal components considered
figure()
plot(K, error_rates, 'o-')
xlabel('Number of principal components K')
ylabel('Error rate [%]')
show()
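# An equivalent formulation as a sketch: scikit-learn's PCA + KNN in a single
# Pipeline gives the same center-project-classify flow without the manual SVD
# (PCA centers with the fitted training mean internally). The names X, y,
# Xtest, ytest and n_components=10 here are illustrative assumptions.
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

pca_knn = Pipeline([('pca', PCA(n_components=10)),
                    ('knn', KNeighborsClassifier(n_neighbors=1))])
pca_knn.fit(X, y.ravel())
print('Error rate: {0:.1f}%'.format(
    (1 - pca_knn.score(Xtest, ytest.ravel())) * 100))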
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from Group4_SelectProcessOrganize import *

if __name__ == '__main__':
    # Define local variables, isolating explanatory variables from the response
    feature_count = 8
    preprocessed = run_data_job()
    test_partition = 0.4
    X = preprocessed[:, :feature_count]
    y = preprocessed[:, feature_count]

    # Show us what we're working with
    print("Size of Feature Data : ", X.shape)
    print("Size of Label Data : ", y.shape)

    # Generate random test and train sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_partition)

    # A couple of model types we experimented with
    default_model = KNeighborsClassifier(n_neighbors=2)
    LDA_model = LinearDiscriminantAnalysis()
    LR_model = LogisticRegression()

    # Fit the default model with the training set and generate predictions
    # for the test set
    default_model.fit(X_train, y_train)
    y_modeled = default_model.predict(X_test)

    # Evaluate the model
    delta = abs(y_modeled - y_test)
    error_count = np.count_nonzero(delta)
    print("Classifier Accuracy:", 1 - (error_count / len(y_test)))
    print("Average Absolute Error", np.mean(delta))
    print(confusion_matrix(y_test, y_modeled))
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier  # needed for "Neural Net" below
from xgboost import XGBClassifier

# define a dictionary for different classifiers and their parameters
classifiers = {
    "Dummy":         DummyClassifier(strategy='uniform', random_state=2),
    "KNN(3)":        KNeighborsClassifier(3),
    "RBF SVM":       SVC(gamma=2, C=1),
    "Decision Tree": DecisionTreeClassifier(max_depth=7),
    "Random Forest": RandomForestClassifier(max_depth=7, n_estimators=10,
                                            max_features=4),
    "xgboost":       XGBClassifier(),
    "Neural Net":    MLPClassifier(alpha=1),
    "AdaBoost":      AdaBoostClassifier(),
    "Naive Bayes":   GaussianNB(),
    "QDA":           QuadraticDiscriminantAnalysis(),
    "Linear SVC":    LinearSVC(),
    "Linear SVM":    SVC(kernel="linear"),
    "Gaussian Proc": GaussianProcessClassifier(1.0 * RBF(1.0)),
}

from time import time

nfast = 10  # Run the first nfast learners; don't run the very slow ones at the end
head = list(classifiers.items())[:nfast]
docs = corpus.split('\n')
X, y = [], []
for doc in docs:
    i, l = doc.split(':')
    X.append(i.strip())
    y.append(l.strip())

# Structure input data
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_X = vec.fit_transform(X)

# Applying K-Nearest Neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3, algorithm='brute', weights='distance')
knn.fit(matrix_X[:5], y[:5])
print('KNN Classifier, Label: ' + str(knn.predict(matrix_X[5])))
print('KNN Classifier, prob.' + str(knn.predict_proba(matrix_X[5])))

# Applying Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nbc = MultinomialNB(alpha=0.2, fit_prior=False, class_prior=[0.6, 0.4])
nbc.fit(matrix_X[:5], y[:5])
print('Naive Bayes Classifier, Label: ' + str(nbc.predict(matrix_X[5])))
print('Naive Bayes Classifier, prob.' + str(nbc.predict_proba(matrix_X[5])))

# Applying Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=2)
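# A short sketch of scoring genuinely unseen text with the same pipeline: new
# documents must go through vec.transform (not fit_transform) so they share
# the training vocabulary. The sample sentence is illustrative.
new_doc = vec.transform(['an unseen sentence to classify'])
print('KNN on new text: ' + str(knn.predict(new_doc)))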
import numpy as np
# cross_validation was removed in modern scikit-learn; use model_selection
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    SelectPercentile(percentile=99, score_func=f_classif),
    KNeighborsClassifier(n_neighbors=2, weights="uniform"))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
cm = confusion_matrix(y_test, svm_predictions)
print(cm)
print(classification_report(y_test, svm_predictions, target_names=targets))
plot_confusion_matrix(svm_model_linear, X_test, y_test, normalize='true',
                      display_labels=targets, xticks_rotation=45)
plt.title('Electra Query Type Classification using linear SVM')
plt.savefig('Electra Query Type Classification using linear SVM.jpg')

# In[8]:

# training a KNN classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# accuracy on X_test
accuracy = knn.score(X_test, y_test)
print(accuracy)

# creating a confusion matrix
knn_predictions = knn.predict(X_test)
cm = confusion_matrix(y_test, knn_predictions)
print(cm)
print(classification_report(y_test, knn_predictions, target_names=targets))
plot_confusion_matrix(knn, X_test, y_test, normalize='true',
                      display_labels=targets, xticks_rotation=45)
plt.title('Electra Query Type Classification using KNN classifier')
plt.savefig('Electra Query Type Classification using KNN classifier.jpg')
def main():
    # choose which dataset(s) we are going to use ('gym' or 'song' prediction)
    for prob_name in ['gym', 'song']:
        p = global_params[prob_name]
        folds = 5
        print('\n##### Learning on {0} dataset #####\n'.format(prob_name))
        x_train, y_train, x_test, y_test = PreProcess(prob_name, False)

        # Each entry: (estimator, normalize inputs?, grid-search params,
        #              chosen params)
        learning_algos = [
            (DecisionTreeClassifier(max_depth=p['max_depth'],
                                    min_samples_split=p['min_samples_split']),
             True,
             {'max_depth': range(1, 30, 2),
              'min_samples_split': range(2, 400, 20)},
             dict(max_depth=p['max_depth'],
                  min_samples_split=p['min_samples_split'])),
            (KNeighborsClassifier(n_neighbors=p['n_neighbors'],
                                  weights=p['weights']),
             True,
             {'n_neighbors': [1] + [x * 5 for x in range(1, 9)],
              'weights': ['distance', 'uniform']},
             dict(n_neighbors=p['n_neighbors'], weights=p['weights'])),
            (AdaBoostClassifier(n_estimators=p['n_estimators'],
                                learning_rate=p['learning_rate']),
             True,
             {'n_estimators': [100, 250, 400, 550, 700],
              'learning_rate': np.logspace(-5, 0, 6)},
             dict(n_estimators=p['n_estimators'],
                  learning_rate=p['learning_rate'])),
            (MLPClassifier(alpha=p['alpha'],
                           hidden_layer_sizes=p['hidden_layer_sizes'],
                           random_state=p['random_state']),
             True,
             {'hidden_layer_sizes':
                  [(x * 10,) for x in range(2, 7)] +
                  [(x * 10, y * 10) for x in range(2, 7)
                   for y in range(2, 7)] +
                  [(x * 10, y * 10, z * 10) for x in range(2, 7)
                   for y in range(2, 7) for z in range(2, 7)],
              'alpha': [1e-9, 5e-8, 1e-6, 1e-5, 1e-4, 1e-3]},
             dict(alpha=p['alpha'],
                  hidden_layer_sizes=p['hidden_layer_sizes'],
                  random_state=p['random_state'])),
            (svm.SVC(C=p['C'], gamma=p['gamma'], kernel=p['kernel']),
             True,
             [{'kernel': ['rbf'],
               'C': np.logspace(-3, 3, 7),
               'gamma': np.logspace(-3, 3, 7)},
              {'kernel': ['linear'],
               'C': np.logspace(-3, 3, 7)}],
             dict(C=p['C'], gamma=p['gamma'], kernel=p['kernel']))
        ]
        learning_algos_chosen = {
            'Decision Tree': learning_algos[0],
            'K-Nearest Neighbors': learning_algos[1],
            'Boosting': learning_algos[2],
            'Neural Network': learning_algos[3],
            'Support Vector Machine': learning_algos[4]
        }

        skf = StratifiedKFold(n_splits=folds)
        plotting_learning_curve, grid_search, run_cv, testing = (False, False,
                                                                 False, False)
        testing = True

        for algoName, (estimator, normalize_data, param_grid,
                       params) in learning_algos_chosen.items():
            print('\n{0} Performance\n'.format(algoName))

            if normalize_data:
                # Normalize for less sensitivity:
                # https://www.springboard.com/blog/beginners-guide-neural-network-in-python-scikit-learn-0-18/
                scaler = StandardScaler()
                scaler.fit(x_train)
                x_train = scaler.transform(x_train)
                x_test = scaler.transform(x_test)

            if plotting_learning_curve:
                plot_learning_curve(estimator,
                                    '{0} - {1}'.format(algoName,
                                                       prob_name.upper()),
                                    x_train, y_train,
                                    train_sizes=np.linspace(.1, 1.0, 10),
                                    cv=skf)
                plt.ion()
                plt.savefig('{0}_{1}.png'.format(algoName, prob_name.upper()))
                plt.pause(0.001)
                plt.show()

            if grid_search:
                clf = None
                grid_pickle_loc = './Grid_Search_{0}_{1}'.format(prob_name,
                                                                 algoName)
                if not os.path.exists(grid_pickle_loc):
                    print('generating grid search results')
                    # grid search and then pickle the cv_results
                    clf = GridSearchCV(estimator, param_grid, verbose=3,
                                       cv=folds)
                    clf.fit(x_train, y_train)
                    results = clf
                    pickle_out = open(grid_pickle_loc, "wb")
                    pickle.dump(results, pickle_out)
                    pickle_out.close()
                else:
                    print('loading pickled grid search results')
                    clf = pickle.load(open(grid_pickle_loc, "rb"))
                import ipdb
                ipdb.set_trace()

            if run_cv:
                # cross validation
                print(params)
                scores = cross_val_score(estimator, x_train, y_train,
                                         cv=folds, verbose=3)
                print("\tCross Validation Accuracy: %0.2f (+/- %0.2f)" %
                      (scores.mean(), scores.std() * 2))

            if testing:
                # testing on held out test set
                print(params)
                t1 = time.time()
                estimator.fit(x_train, y_train)
                t2 = time.time()
                avg_runtime = str(
                    datetime.timedelta(seconds=((t2 - t1) / folds)))
                print('Wall Clock Time: {0}'.format(avg_runtime))
                y_predict = estimator.predict(x_test)
                print("\tTest Set Accuracy:{0}".format(
                    np.count_nonzero(y_predict == y_test) / len(y_test)))
def knn_classifier(train_x, train_y):
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier()
    model.fit(train_x, train_y)
    return model
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

#1. Gaussian Naive Bayes
#Create a GaussianNB object
gnb = GaussianNB()
pred = gnb.fit(X, Y).predict(x)
print("Naive-Bayes accuracy: ", accuracy_score(y, pred, normalize=True))

#2. Linear Support Vector Classifier
from sklearn.svm import LinearSVC
svc_model = LinearSVC(random_state=0)
pred = svc_model.fit(X, Y).predict(x)
print("Linear SVC accuracy: ", accuracy_score(y, pred, normalize=True))

#3. k-Nearest-Neighbours classifier
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, Y)
pred = neigh.predict(x)
print("k-Nearest-Neighbours score: ", accuracy_score(y, pred))

#4. Decision trees
from sklearn import tree
#Create a tree
clf = tree.DecisionTreeClassifier()
clf.fit(X, Y)
preds = clf.predict(x)
print("Decision tree score: ", accuracy_score(y, preds))

#5. Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
forClf = RandomForestClassifier(n_estimators=10)
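# The snippet ends before the random forest is evaluated; this is a sketch of
# the obvious continuation, mirroring steps 1-4 above (assumed, not from the
# original).
forClf.fit(X, Y)
forPreds = forClf.predict(x)
print("Random forest score: ", accuracy_score(y, forPreds))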
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.7339031339031339
exported_pipeline = make_pipeline(
    make_union(
        make_union(
            FunctionTransformer(copy),
            RFE(estimator=ExtraTreesClassifier(criterion="gini",
                                               max_features=0.45,
                                               n_estimators=100),
                step=0.4)
        ),
        FunctionTransformer(copy)
    ),
    KNeighborsClassifier(n_neighbors=16, p=1, weights="distance")
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)