def main(path, filename):
    """Train a voting ensemble on precomputed feature batches and pickle it.

    Labels are read from the ``'clases'`` batch and flattened by one level.
    Features are gathered by calling ``load_batch`` once per batch name with
    the same list, which ``load_batch`` is expected to fill in place.  The
    fitted ensemble is written to ``clf_grande.p`` in the working directory.

    Args:
        path: Forwarded unchanged to ``load_batch``.
        filename: Forwarded unchanged to ``load_batch``.

    Returns:
        None.
    """
    # Order is kept exactly as before in case load_batch is order-sensitive.
    batchs = [
        'histogramaByN',
        'histogramaColor',
        'patronesCirculaesByN_2_5',
        'patrones2x2ByN',
        'patrones3x3ByN',
        'patronesCirculaesByN_2_9',
        'patronesCirculaesByN_3_9',
        'patronesCirculaesByN_5_9',
        'patronesCirculaesByN_3_5',
        'patronesCirculaesByN_6_12',
        'patronesCirculaesByN_8_12',
    ]

    X = []
    y = []
    load_batch(y, path, 'clases', filename)
    y = [label for group in y for label in group]
    for batch in batchs:
        load_batch(X, path, batch, filename)

    # NOTE(review): RandomForest/Boosting/Gradient/SVM are assumed to return
    # (name, estimator) pairs, as VotingClassifier requires -- confirm.
    est = [RandomForest(), Boosting()]
    # range() instead of xrange() so the loops run on Python 2 and 3 alike.
    for i in range(0, 15):
        est.append(Gradient(i))
    for i in range(0, 4):
        est.append(SVM(i))

    clf = VotingClassifier(estimators=est)
    clf.fit(X, y)

    # Context manager guarantees the file is flushed and closed even if
    # pickling fails part-way through.
    with open("clf_grande.p", "wb") as model_file:
        pickle.dump(clf, model_file)
    return
def vclas(w1, w2, w3, w4, w5):
    """Fit five classifiers plus a soft-voting ensemble and chart their scores.

    The module-level ``trainX`` / ``trainY`` data is split 60/40.  Every base
    classifier and the weighted ensemble are fitted on the training part, and
    a bar chart of training (blue) and test (red) scores is shown.

    Args:
        w1, w2, w3, w4, w5: Voting weights for logistic regression, Gaussian
            naive Bayes, random forest, extra trees and gradient boosting,
            in that order.
    """
    Xtrain, Xtest, ytrain, ytest = cv.train_test_split(
        trainX, trainY, test_size=0.4)

    named_models = [
        ('lr', LogisticRegression()),
        ('gnb', GaussianNB()),
        ('rf', RandomForestClassifier(n_estimators=10, bootstrap=True)),
        ('et', ExtraTreesClassifier(n_estimators=10, bootstrap=True)),
        ('gb', GradientBoostingClassifier(n_estimators=10)),
    ]
    base_models = [model for _, model in named_models]
    ensemble = VotingClassifier(estimators=named_models, voting='soft',
                                weights=[w1, w2, w3, w4, w5])

    # Base models first, ensemble last -- same order as the fitting tuple.
    for model in base_models + [ensemble]:
        model.fit(Xtrain, ytrain)

    bar_width = 0.3
    tick_positions = np.arange(6)
    fig, ax = plt.subplots()

    for position, model in enumerate(base_models):
        print(model, position)
        train_bar = ax.bar(position, model.score(Xtrain, ytrain),
                           width=bar_width, color="blue", alpha=0.5)
        test_bar = ax.bar(position + bar_width, model.score(Xtest, ytest),
                          width=bar_width, color="red", alpha=0.5)

    # The ensemble pair is drawn to the right of the last base model.
    ax.bar(len(base_models) + bar_width, ensemble.score(Xtrain, ytrain),
           width=bar_width, color="blue", alpha=0.5)
    ax.bar(len(base_models) + bar_width * 2, ensemble.score(Xtest, ytest),
           width=bar_width, color="red", alpha=0.5)

    plt.axvline(4.8, color='k', linestyle='dashed')
    ax.set_xticks(tick_positions + bar_width)
    ax.set_xticklabels(
        ['LogisticRegression', 'GaussianNB', 'RandomForestClassifier',
         'ExtraTrees', 'GradientBoosting', 'VotingClassifier'],
        rotation=40, ha='right')
    plt.title('Training and Test Score for Different Classifiers')
    plt.legend([train_bar[0], test_bar[0]], ['training', 'test'],
               loc='lower left')
    plt.show()
def run_voting(training_set, train_set_labels, validation_set, validation_set_labels):
    """Fit a hard-voting KNN / logistic-regression / SVM ensemble and print its score.

    Both input sets are passed through ``standard_data`` before use, and the
    label arrays are flattened with ``ravel()``.

    Args:
        training_set: Training inputs.
        train_set_labels: Training labels (must support ``ravel()``).
        validation_set: Validation inputs.
        validation_set_labels: Validation labels (must support ``ravel()``).
    """
    from sklearn.ensemble import VotingClassifier

    train_inputs = standard_data(training_set)
    valid_inputs = standard_data(validation_set)

    knn_member = KNeighborsClassifier(weights='uniform', n_neighbors=5)
    logistic_member = sklearn.linear_model.LogisticRegression(
        penalty='l2',
        dual=False,
        tol=0.01,
        C=1.0,
        fit_intercept=True,
        intercept_scaling=1,
        class_weight=None,
        random_state=None,
        solver='newton-cg',
        max_iter=100,
        multi_class='ovr',
        verbose=0,
        warm_start=False,
        n_jobs=2)
    svm_member = svm.SVC(decision_function_shape='ovo', tol=0.001)

    members = [
        ('knn', knn_member),
        ('lr', logistic_member),
        ('svm', svm_member),
    ]
    ensemble = VotingClassifier(estimators=members, voting='hard')
    ensemble.fit(train_inputs, train_set_labels.ravel())

    accuracy = ensemble.score(valid_inputs, validation_set_labels.ravel())
    # Single-argument print: same output as a print statement or a print call.
    print(accuracy)
def do_ml(ticker):
    """Train a three-member voting ensemble for ``ticker`` and report accuracy.

    Features and labels come from ``extract_featuresets``; 25% of the rows
    are held out for scoring.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the held-out rows.
    """
    features, labels, _ = extract_featuresets(ticker)
    split = cross_validation.train_test_split(features, labels, test_size=0.25)
    train_features, test_features, train_labels, test_labels = split

    members = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    voter = VotingClassifier(members)
    voter.fit(train_features, train_labels)

    confidence = voter.score(test_features, test_labels)
    print('Accuracy', confidence)

    # Tally how often each class was predicted on the held-out rows.
    print('Predicted spread:', Counter(voter.predict(test_features)))
    return confidence
def do_ml(ticker):
    """Fit a LinearSVC / KNN / random-forest voting ensemble for ``ticker``.

    A quarter of the rows returned by ``extract_featuresets`` is held out
    and used to score the fitted ensemble.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the held-out rows.
    """
    X, y, df = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.25)

    # The stand-alone KNeighborsClassifier that used to be assigned here was
    # overwritten before use; only the ensemble is constructed now.
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf.fit(X_train, y_train)

    confidence = clf.score(X_test, y_test)
    print('Accuracy:', confidence)

    predictions = clf.predict(X_test)
    print('Predicted spread:', Counter(predictions))
    return confidence
def ensemble_voting(X_train, X_test, y_train, y_test):
    """Fit three column-restricted pipelines and a hard-voting ensemble of them.

    Each pipeline selects a column range with ``ColumnExtractor`` and feeds
    it to its own classifier (SVC, k-nearest neighbours, random forest).
    Every pipeline is fitted and scored on its own as a sanity check, then
    all three are combined in a ``VotingClassifier`` whose test score is
    printed last.

    NOTE(review): the ranges are 0-33, 35-46 and 48-94, so columns 34 and 47
    are used by no member -- confirm that this is intended.

    Args:
        X_train, X_test: Feature matrices.
        y_train, y_test: Label arrays (flattened with ``ravel()``).
    """
    y_train = y_train.ravel()
    y_test = y_test.ravel()

    C_value, gamma_value, kernel_type = svc_param_selection(X_train, y_train, 5)

    def fitted_branch(columns, model):
        # Build, fit and sanity-check one column-restricted pipeline.
        branch = Pipeline([
            ('col_extract', ColumnExtractor(cols=columns)),
            ('clf', model),
        ])
        branch.fit(X_train, y_train)
        print(' Sanity check')
        print(branch.score(X_test, y_test))
        return branch

    # Columns 0-33 -> SVC with the selected hyper-parameters.
    svm_branch = fitted_branch(
        range(0, 34),
        SVC(C=C_value, kernel=kernel_type, gamma=gamma_value))

    # Columns 35-46 -> k-nearest neighbours.
    knn_branch = fitted_branch(range(35, 47), KNeighborsClassifier())

    # Columns 48-94 -> random forest.
    forest_branch = fitted_branch(
        range(48, 95),
        RandomForestClassifier(n_estimators=20, random_state=0,
                               criterion='entropy'))

    members = [
        ('MIR-SVM', svm_branch),
        ('SPO-kNN', knn_branch),
        ('LYR-RF', forest_branch),
    ]
    combined = VotingClassifier(estimators=members, voting='hard')
    combined.fit(X_train, y_train)
    print(combined.score(X_test, y_test))
def do_ml(ticker):
    """Compare KNN, decision tree, random forest and a voting ensemble.

    All four models are fitted on the same 75/25 split of the data returned
    by ``extract_featuresets``; parameters, accuracy and the spread of
    predicted classes are printed for each.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The voting ensemble's score on the held-out rows.
    """
    features, labels, _ = extract_featuresets(ticker)
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.25)

    # --- distance-weighted k-nearest neighbours ---
    knn = neighbors.KNeighborsClassifier(weights='distance')
    knn.fit(train_x, train_y)
    print("\n\n")
    print("Parameters of Kneighbors", knn.get_params())
    confidence = knn.score(test_x, test_y)
    print("Accuracy of Kneighbors", confidence)
    spread = knn.predict(test_x)
    print("Predicted Spread of Kneighbors:", Counter(spread))

    # --- shallow decision tree ---
    print("\n\n")
    print("Decision Tree")
    tree_model = DecisionTreeClassifier(max_depth=4)
    tree_model.fit(train_x, train_y)
    print("Parameters of Decision Tree", tree_model.get_params())
    print("Accuracy of Decision Tree", tree_model.score(test_x, test_y))
    print("Predicted Spread of Decision Tree",
          Counter(tree_model.predict(test_x)))

    # --- random forest ---
    print("\n\n")
    print("RandomForest")
    forest = RandomForestClassifier()
    forest.fit(train_x, train_y)
    print("Parameters of RandomForest", forest.get_params())
    print("Accuracy of RandomForest", forest.score(test_x, test_y))
    print("Predicted Spread of RandomForest", Counter(forest.predict(test_x)))

    # --- voting ensemble (fresh, unfitted members) ---
    print("Ensemble")
    voter = VotingClassifier([
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ])
    voter.fit(train_x, train_y)
    confidence = voter.score(test_x, test_y)
    print("Accuracy of Ensembles", confidence)
    spread = voter.predict(test_x)
    print("Predicted Spread of ensembles:", Counter(spread))
    return confidence
def mlearn(ticker):
    """Fit a LinearSVC / KNN / random-forest voting ensemble for ``ticker``.

    Per the original author's note, X holds percent changes and y is the
    target classification (1, -1 or 0).  25% of the rows are held out for
    scoring.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the held-out rows.
    """
    X, y, df = extract_featuresets(ticker)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.25)

    # A lone KNeighborsClassifier used to be created here and immediately
    # replaced; that dead assignment has been removed.
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf.fit(X_train, y_train)

    confidence = clf.score(X_test, y_test)
    print('Accuracy', confidence)

    predictions = clf.predict(X_test)
    print('Predicted spread:', Counter(predictions))
    return confidence
def train(x, y):
    """Fit a voting ensemble on the tail of the data and score it on the head.

    The first 20% of the rows (``int(len(y) * 0.2)``) form the test set and
    the remainder the training set.  The split is a plain slice: nothing is
    shuffled, so row order determines which samples are held out.

    Args:
        x: Feature array; must expose ``.shape`` and support slicing.
        y: Label array; must expose ``.shape``, ``len()`` and slicing.

    Returns:
        Tuple ``(confidence, clf)``: the score on the held-out rows and the
        fitted ``VotingClassifier``.
    """
    # Lazy %-style arguments: the message is only built when DEBUG is on, and
    # no line-continuation backslash leaks into the logged text.
    logging.debug("X sample: %s", len(x.shape))
    logging.debug("y sample: %s", len(y.shape))

    # Deterministic hold-out split (leading 20% = test, rest = train).
    test_size = int(len(y) * 0.2)
    x_train, x_test = x[test_size:], x[:test_size]
    y_train, y_test = y[test_size:], y[:test_size]

    # Combine the predictions of several base estimators.
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf.fit(x_train, y_train)

    # NOTE: this changes NumPy's global print options for the whole process.
    np.set_printoptions(precision=2)
    confidence = clf.score(x_test, y_test)
    print('accuracy:', confidence)
    return confidence, clf
def do_ml(ticker):
    """Fit the module-global voting ensemble ``clf`` for ``ticker`` and plot the test data.

    The fitted classifier is stored in the global ``clf``.  After scoring on
    a 25% hold-out, the held-out features are plotted against their labels
    and the predicted class counts are printed.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the held-out rows.
    """
    global clf

    features, labels, _ = extract_featuresets(ticker)
    parts = cross_validation.train_test_split(features, labels, test_size=0.25)
    train_x, test_x, train_y, test_y = parts

    voters = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    clf = VotingClassifier(voters)
    clf.fit(train_x, train_y)

    confidence = clf.score(test_x, test_y)
    print('accuracy:', confidence)

    class_predictions = clf.predict(test_x)
    print(np.shape(test_x))

    # Red line plot of held-out features versus their true labels.
    plt.plot(test_x, test_y, '-r')
    plt.show()

    print('predicted class counts:', Counter(class_predictions))
    print()
    return confidence
def do_ml(ticker):
    """Train and score a voting ensemble on the feature sets for ``ticker``.

    ``VotingClassifier`` takes a list of ``(name, classifier)`` tuples; here
    the three members are a linear SVC, k-nearest neighbours and a random
    forest.  25% of the rows are held out for scoring.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the held-out rows.
    """
    inputs, targets, _ = extract_featuresets(ticker)
    inputs_train, inputs_test, targets_train, targets_test = train_test_split(
        inputs, targets, test_size=0.25)

    ensemble = VotingClassifier([
        ('lsvc', LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ])
    ensemble.fit(inputs_train, targets_train)

    confidence = ensemble.score(inputs_test, targets_test)
    print('Accuracy', confidence)

    predicted = ensemble.predict(inputs_test)
    print('Predicted spread:', Counter(predicted))
    return confidence
def train_test(ticker):
    """Fit a voting ensemble for ``ticker`` and return its last prediction.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        Tuple ``(last_prediction, confidence)``: the final element of the
        predictions made on the 25% hold-out, and the hold-out score.
    """
    features, labels, _ = extract_featuresets(ticker)
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.25)

    member_specs = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    model = VotingClassifier(member_specs)
    model.fit(train_x, train_y)

    confidence = model.score(test_x, test_y)
    print('accuracy:', confidence)

    predicted = model.predict(test_x)
    print('predicted class counts:', Counter(predicted))

    last_prediction = predicted[-1]
    return last_prediction, confidence
def do_ml(ticker):
    """Fit a voting ensemble for ``ticker``; print prediction spread, then accuracy.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the 25% hold-out.
    """
    samples, targets, _ = extract_featuresets(ticker)
    train_samples, test_samples, train_targets, test_targets = (
        cross_validation.train_test_split(samples, targets, test_size=0.25))

    committee = VotingClassifier([
        ("lsvc", svm.LinearSVC()),
        ("knn", neighbors.KNeighborsClassifier()),
        ("rfor", RandomForestClassifier()),
    ])
    committee.fit(train_samples, train_targets)

    # Score first, then predict, then report -- spread before accuracy.
    confidence = committee.score(test_samples, test_targets)
    spread = Counter(committee.predict(test_samples))
    print("Predicted Spread", spread)
    print("Predicted Accuracy", confidence)
    return confidence
def machining_the_data(stock):
    """Fit a voting ensemble on percent-change data and predict three fixed samples.

    The frame returned by ``precent_change`` is split into features (every
    column except ``'label'``) and the ``'label'`` target.  The hold-out
    accuracy, the held-out features and the predictions for three
    hard-coded sample rows are printed.

    Args:
        stock: Passed unchanged to ``precent_change``.

    Returns:
        None.
    """
    stock_data = precent_change(stock)

    # axis is passed by keyword: the positional form (drop(labels, 1)) was
    # deprecated and then removed in pandas 2.0.
    X = np.array(stock_data.drop(['label'], axis=1))
    y = np.array(stock_data['label'])

    # NOTE(review): test_size=.95 trains on only 5% of the rows -- confirm
    # this is intended and not a swapped train/test proportion.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.95)

    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])
    clf.fit(X_train, y_train)

    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)
    print(X_test)

    predictions = clf.predict(np.array([
        [1.44231225, -9.70757936, -1.00000000],
        [2.96192498, -2.77678573, 1.00000000],
        [2.50403844, 6.29763054, -1.00000000],
    ]))
    print(predictions)
class VotingClassifier3(AlgorithmInterface):
    """Voting ensemble built from three already-configured algorithm wrappers."""

    def __init__(self, rfa, svma, lra):
        """Combine the ``.classifier`` of each wrapper into one ``VotingClassifier``.

        Args:
            rfa: Wrapper whose ``.classifier`` is registered as ``'rfa'``.
            svma: Wrapper whose ``.classifier`` is registered as ``'svma'``.
            lra: Wrapper whose ``.classifier`` is registered as ``'lra'``.
        """
        super(VotingClassifier3, self).__init__()
        self.accuracy_score = 0
        members = [
            ('rfa', rfa.classifier),
            ('svma', svma.classifier),
            ('lra', lra.classifier),
        ]
        self.classifier = VotingClassifier(estimators=members)

    def feature_engineering(self):
        """Delegate to ``convert_symbolic_feature_into_continuous``."""
        self.convert_symbolic_feature_into_continuous()

    def train_phase(self):
        """Fit the ensemble.

        NOTE(review): this fits on ``test_data`` / ``test_label``, the same
        data ``test_phase`` scores on -- looks like it should use the
        training split; confirm against ``AlgorithmInterface``.
        """
        features, labels = self.test_data, self.test_label
        self.classifier.fit(features, labels)

    def test_phase(self):
        """Score the ensemble on the test split, store and print the accuracy."""
        self.accuracy_score = self.classifier.score(
            self.test_data, self.test_label)
        print("准确度: %f" % self.accuracy_score)
def do_ml(ticker):
    """Fit and score a three-classifier voting ensemble for ``ticker``.

    The ensemble combines the predictions of a linear SVC, k-nearest
    neighbours and a random forest.  25% of the rows are held out.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the held-out rows.
    """
    data, target, _ = extract_featuresets(ticker)
    data_train, data_test, target_train, target_test = (
        cross_validation.train_test_split(data, target, test_size=0.25))

    estimators = []
    estimators.append(('lsvc', svm.LinearSVC()))
    estimators.append(('knn', neighbors.KNeighborsClassifier()))
    estimators.append(('rfor', RandomForestClassifier()))

    ensemble = VotingClassifier(estimators)
    ensemble.fit(data_train, target_train)

    confidence = ensemble.score(data_test, target_test)
    print('Accuracy:', confidence)

    # Distribution of the classes predicted for the held-out rows.
    outcome_counts = Counter(ensemble.predict(data_test))
    print('Predicted spread:', outcome_counts)
    return confidence
def ml_operations(ticker):
    """Train a voting classifier for ``ticker`` and report hold-out accuracy.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The classifier's score on the 25% hold-out.
    """
    features, labels, _ = extract_featuresets(ticker)
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.25)

    # Member order (SVC, forest, KNN) is kept as in the original definition.
    voting_members = [
        ('linearsvc', svm.LinearSVC()),
        ('randforest', RandomForestClassifier()),
        ('knearest', neighbors.KNeighborsClassifier()),
    ]
    voting_model = VotingClassifier(voting_members)
    voting_model.fit(train_features, train_labels)

    confidence = voting_model.score(test_features, test_labels)
    print('Accuracy', confidence)

    predicted_labels = voting_model.predict(test_features)
    print('Predicted spread', Counter(predicted_labels))
    return confidence
def voting(X_train, y_train, X_test, y_test):
    """Predict dropouts with a hard-voting ensemble and print the results.

    The ensemble combines ``RandomForestClassifier``, ``LogisticRegression``
    and ``XGBClassifier``.

    Args:
        X_train: Training feature vectors.
        y_train: Training label vector.
        X_test: Testing feature vectors.
        y_test: Testing label vector.

    Returns:
        None; the classifier, classification report, confusion matrix, test
        score and elapsed time are printed.
    """
    started_at = time.time()

    forest = RandomForestClassifier(
        n_estimators=200, max_depth=12, random_state=0,
        min_samples_split=3, n_jobs=-1)
    logistic = LogisticRegression(tol=1e-3, C=1.5, random_state=0)
    boosted = XGBClassifier(
        n_estimators=200, max_depth=6, learning_rate=0.05,
        min_child_weight=2, n_jobs=-1, max_delta_step=1,
        objective='binary:logistic', gamma=3, subsample=1)

    members = [('rf', forest), ('lr', logistic), ('xgb', boosted)]
    ensemble = VotingClassifier(estimators=members, voting='hard')
    ensemble = ensemble.fit(X_train, y_train)

    expected = y_test
    predicted = ensemble.predict(X_test)

    report = metrics.classification_report(expected, predicted)
    print('Classifier: %s\n' % (ensemble, ))
    print('Classification report: \n %s \n' % (report, ))
    print('Confusion matrix:\n%s\n' % metrics.confusion_matrix(expected, predicted))
    print('Testing Score: %f' % ensemble.score(X_test, y_test))
    print('Time: %f seconds \n' % (time.time() - started_at))
def do_ml(ticker):
    """Fit a voting ensemble mapping percent changes to 1 / 0 / -1 targets.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's accuracy score on the 25% hold-out.
    """
    pct_changes, targets, _ = extract_featuresets(ticker)
    split = cross_validation.train_test_split(
        pct_changes, targets, test_size=0.25)
    fit_inputs, eval_inputs, fit_targets, eval_targets = split

    # Classifiers that take part in the vote.
    participants = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    voter = VotingClassifier(participants)
    voter.fit(fit_inputs, fit_targets)

    confidence = voter.score(eval_inputs, eval_targets)
    print('accuracy:', confidence)

    class_counts = Counter(voter.predict(eval_inputs))
    print('predicted class counts:', class_counts)

    # Two blank lines separate this ticker's report from the next one.
    print()
    print()
    return confidence
def evaluate(self):
    """Train and score a fresh voting ensemble for every ticker in ``self.tickers``.

    For each ticker the instance's current ticker is updated, label data and
    features are prepared through the instance helpers, a new ensemble is
    fitted on a 75/25 split of ``self.features`` / ``self.label``, and the
    hold-out score and predictions are passed to ``self.result_output``.
    """
    for symbol in self.tickers:
        self.ticker = symbol
        self.add_data_for_label_to_stock_table()
        self.create_features_and_label()

        split = train_test_split(self.features, self.label, test_size=0.25)
        train_features, test_features, train_label, test_label = split

        members = [
            ('lsvc', svm.LinearSVC()),
            ('knn', neighbors.KNeighborsClassifier()),
            ('rfor', RandomForestClassifier(n_estimators=10)),
        ]
        voter = VotingClassifier(members)
        voter.fit(train_features, train_label)

        score = voter.score(test_features, test_label)
        predicted = voter.predict(test_features)
        self.result_output(score, predicted)
def _ensemble_model(rf_model, knn_model, X_train, y_train, X_test, y_test):
    """Combine a KNN and a random-forest model in a hard-voting ensemble.

    The ensemble is fitted on the training data; its test score, the
    classification report and the confusion matrix are printed.

    Args:
        rf_model: Estimator registered under the name ``'rf'``.
        knn_model: Estimator registered under the name ``'knn'``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        The fitted ``VotingClassifier``.
    """
    # (name, model) pairs -- a list of tuples, in KNN-then-forest order.
    named_models = [('knn', knn_model), ('rf', rf_model)]

    voter = VotingClassifier(named_models, voting='hard')
    voter.fit(X_train, y_train)

    print(voter.score(X_test, y_test))

    predicted = voter.predict(X_test)
    print(classification_report(y_test, predicted))
    print(confusion_matrix(y_test, predicted))
    return voter
def mlClassifiers(company):
    """Fit a voting ensemble on integer-cast data for ``company``, then backtest.

    The training and test parts of an 80/20 split are cast to ``int`` before
    fitting and scoring.  After reporting accuracy and the prediction spread,
    ``backtest`` and ``backtest_result`` are invoked.

    Args:
        company: Passed to ``get_train_data`` and to ``backtest``.

    Returns:
        The ensemble's score on the held-out rows.
    """
    x, y, df_final, symbols = get_train_data(company)
    raw_train_x, raw_test_x, raw_train_y, raw_test_y = (
        cross_validation.train_test_split(x, y, test_size=0.2))

    # Cast in the original order: train X, train y, test X, test y.
    train_x = raw_train_x.astype(int)
    train_y = raw_train_y.astype(int)
    test_x = raw_test_x.astype(int)
    test_y = raw_test_y.astype(int)

    voter = VotingClassifier([
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rf', RandomForestClassifier()),
    ])
    voter.fit(train_x, train_y)

    accuracy = voter.score(test_x, test_y)
    print('Accuracy', accuracy)

    predicted = voter.predict(test_x)
    print('prediction spread:', Counter(predicted))

    backtest(df_final, company)
    backtest_result()
    return accuracy
def do_ml(ticker):
    """Train a voting ensemble for ``ticker`` and return its hold-out accuracy.

    Members: ``lsvc`` (linear support vector classifier), ``knn``
    (k-nearest neighbours) and ``rfor`` (random forest).

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the 25% hold-out.
    """
    features, targets, _ = extract_featuresets(ticker)
    train_part, test_part, train_targets, test_targets = (
        model_selection.train_test_split(features, targets, test_size=0.25))

    lsvc = svm.LinearSVC()
    knn = neighbors.KNeighborsClassifier()
    rfor = RandomForestClassifier()
    model = VotingClassifier([('lsvc', lsvc), ('knn', knn), ('rfor', rfor)])

    model.fit(train_part, train_targets)
    confidence = model.score(test_part, test_targets)
    print('Accuracy:', confidence)

    spread = Counter(model.predict(test_part))
    print('Predicted spread:', spread)
    return confidence
def doML(ticker):
    """Fit a LinearSVC / KNN / random-forest voting ensemble for ``ticker``.

    Args:
        ticker: Passed unchanged to ``extractFeaturesets``.

    Returns:
        The ensemble's score on the 25% hold-out.
    """
    X, y, df = extractFeaturesets(ticker)
    XTrain, XTest, yTrain, yTest = cross_validation.train_test_split(
        X, y, test_size=0.25)

    # The plain KNeighborsClassifier previously assigned to ``clf`` here was
    # overwritten before use, so that dead assignment has been dropped.
    clf = VotingClassifier([("lsvc", svm.LinearSVC()),
                            ("Knn", neighbors.KNeighborsClassifier()),
                            ("Rfor", RandomForestClassifier())])
    clf.fit(XTrain, yTrain)

    confidence = clf.score(XTest, yTest)
    print("Accuracy: ", confidence)

    predictions = clf.predict(XTest)
    print("Predicted spread: ", Counter(predictions))
    return confidence
def ensemble_(feat, tar, split):
    """Compare voting, bagging and stacking ensembles on min-max-scaled data.

    The data is split with shuffling, scaled with a ``MinMaxScaler`` fitted
    on the training part only, and used to grid-search a KNN and a random
    forest (5-fold CV).  Those two plus a logistic regression are scored
    individually and then combined in voting, bagging and stacking
    ensembles, whose test scores are printed.

    Args:
        feat: Feature matrix.
        tar: Target vector.
        split: Value forwarded as ``test_size`` to ``train_test_split``.

    Returns:
        None.
    """
    scaler = MinMaxScaler()
    x_tr, x_te, y_tr, y_te = train_test_split(
        feat, tar, test_size=split, shuffle=True)

    # Fit the scaler on training rows only so test data does not leak in.
    scaler.fit(x_tr)
    x_tr = scaler.transform(x_tr)
    x_te = scaler.transform(x_te)

    knn = KNeighborsClassifier()
    params_knn = {'n_neighbors': np.arange(1, 25)}
    knn_gs = GridSearchCV(knn, params_knn, cv=5)
    knn_gs.fit(x_tr, y_tr)
    knn_best = knn_gs.best_estimator_
    print(knn_gs.best_params_)

    rf = RandomForestClassifier()
    params_rf = {'n_estimators': [50, 100, 200, 300, 400]}
    rf_gs = GridSearchCV(rf, params_rf, cv=5)
    rf_gs.fit(x_tr, y_tr)
    rf_best = rf_gs.best_estimator_
    print(rf_gs.best_params_)

    log_reg = LogisticRegression()
    log_reg.fit(x_tr, y_tr)

    print('knn: {}'.format(knn_best.score(x_te, y_te)))
    print('rf: {}'.format(rf_best.score(x_te, y_te)))
    print('log_reg: {}'.format(log_reg.score(x_te, y_te)))

    estimators = [('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg)]

    ensemble = VotingClassifier(estimators, voting='hard')
    ensemble.fit(x_tr, y_tr)
    print("ensemble voting score: ", str(ensemble.score(x_te, y_te)))

    # The base estimator is passed positionally: its keyword was renamed from
    # ``base_estimator`` to ``estimator`` in newer scikit-learn, while the
    # first positional slot works on both old and new releases.
    ensemble_bagging = BaggingClassifier(RandomForestClassifier(),
                                         n_estimators=10)
    ensemble_bagging.fit(x_tr, y_tr)
    print("ensemble bagging score: ", str(ensemble_bagging.score(x_te, y_te)))

    ensemble_stacking = StackingClassifier(estimators, LogisticRegression())
    ensemble_stacking.fit(x_tr, y_tr)
    print("ensemble stacking score: ", str(ensemble_stacking.score(x_te, y_te)))
def main():
    """Train a hard-voting ensemble on the CSV data and write a submission file.

    Reads ``../data/train.csv`` and ``../data/test.csv``, preprocesses them
    with ``preprocessor``, holds out 20% of the training rows for
    validation, fits the ensemble, prints train/validation scores and writes
    the test predictions to ``../submission/vc_advanced.csv``.
    """
    train_dataset = pd.read_csv("../data/train.csv")
    test_dataset = pd.read_csv("../data/test.csv")

    # Pre-processing; PassengerId is kept here and dropped just before use.
    X_train_processed, Y_train_processed, test_processed = preprocessor(
        train_dataset,
        test_dataset,
        fill_age_with='advanced_median_1',
        fill_cabin_with='X',
        dropPassengerID=False,
        dropName=True,
        dropTicket=True)

    # random_state=None gives a different split on every run.  The previous
    # ``np.random.seed()`` argument also evaluated to None, so the split
    # behaviour is unchanged -- it is just stated directly now.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_processed.drop(['PassengerId'], axis=1),
        Y_train_processed,
        test_size=0.2,
        random_state=None)

    rnd_clf = RandomForestClassifier(random_state=42)
    svm_clf = SVC(random_state=42)
    gbm_clf = GradientBoostingClassifier(random_state=42)
    xg_clf = XGBClassifier(random_state=42)

    voting_clf = VotingClassifier(
        estimators=[
            ('gbm', gbm_clf),
            ('rnd', rnd_clf),
            ('svm', svm_clf),
            ('xg', xg_clf),
        ],
        voting='hard')
    voting_clf.fit(X_train, y_train)

    # The old "{0.2f}" template was never formatted and was printed verbatim;
    # "{0:.2f}" with .format() actually renders the score.
    print("Train score: {0:.2f}".format(voting_clf.score(X_train, y_train)))
    print("Valid score: {0:.2f}".format(voting_clf.score(X_valid, y_valid)))

    v = voting_clf.predict(test_processed.drop('PassengerId', axis=1))
    V = pd.DataFrame({
        'PassengerId': test_dataset['PassengerId'],
        'Survived': v
    })
    V.to_csv('../submission/vc_advanced.csv', index=False)
def algo(lr, dt, rf, gnb):
    """Fit a soft-voting logistic-regression / naive-Bayes ensemble on ``iris``.

    The four arguments are only echoed to stdout; the ensemble always uses
    equal weights ``[1, 1]``.  Accuracy and the elapsed time (fit plus
    score) are computed into the locals ``Ac`` and ``Tm``.

    Args:
        lr, dt, rf, gnb: Values printed on the first line of output.
    """
    print('{} {} {} {}'.format(lr, dt, rf, gnb))

    samples, targets = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        samples, targets, stratify=targets, random_state=42)

    logistic = LogisticRegression()
    naive_bayes = GaussianNB()
    voter = VotingClassifier(
        estimators=[('lr', logistic), ('gnb', naive_bayes)],
        voting='soft',
        weights=[1, 1])

    # The timer brackets both fitting and scoring.
    t0 = time()
    voter.fit(X_train, y_train)
    Ac = voter.score(X_test, y_test)
    Tm = time() - t0
def do_time_ml(ticker):
    """Evaluate a six-member voting ensemble with a 3-way time-series split.

    For every split produced by ``TimeSeriesSplit(n_splits=3)`` the same
    ensemble object is re-fitted on the training indices and scored on the
    test indices; indices, accuracy and predicted spread are printed.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.
    """
    X, y = extract_featuresets(ticker)
    splitter = TimeSeriesSplit(n_splits=3)

    # Every member must be an instance (note the call parentheses), not the
    # bare class, otherwise get_params() fails for lack of ``self``.
    members = [
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
        ('gap', GaussianProcessClassifier()),
        ('bag', BaggingClassifier()),
        ('nn', MLPClassifier(max_iter=2000)),
    ]
    voter = VotingClassifier(members)

    for train_idx, test_idx in splitter.split(X):
        print(train_idx, test_idx)
        fold_train_x, fold_test_x = X[train_idx], X[test_idx]
        fold_train_y, fold_test_y = y[train_idx], y[test_idx]

        voter.fit(fold_train_x, fold_train_y)
        confidence = voter.score(fold_test_x, fold_test_y)
        fold_predictions = voter.predict(fold_test_x)

        print('Accuracy:', confidence)
        print("Predicted Spread:", Counter(fold_predictions))
def do_ml(ticker):
    """Fit a voting ensemble for ``ticker``; targets are 0, -1 or 1.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the 25% hold-out.
    """
    feature_rows, target_rows, _ = extract_featuresets(ticker)
    train_rows, test_rows, train_targets, test_targets = (
        cross_validation.train_test_split(
            feature_rows, target_rows, test_size=0.25))

    voters = (
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    )
    # VotingClassifier is handed a list, as in the original call.
    ensemble = VotingClassifier(list(voters))
    ensemble.fit(train_rows, train_targets)

    confidence = ensemble.score(test_rows, test_targets)
    print('acc: ', confidence)

    tally = Counter(ensemble.predict(test_rows))
    print('predictions:', tally)
    return confidence
def do_ml_vote(ticker):
    """Fit a voting ensemble on a stratified split of the data for ``ticker``.

    The 75/25 split is stratified on the target.  Per the original note,
    the training features are percent changes.

    Args:
        ticker: Passed unchanged to ``extract_featuresets``.

    Returns:
        The ensemble's score on the held-out rows.
    """
    features, target, _ = extract_featuresets(ticker)
    split_parts = train_test_split(
        features, target, test_size=0.25, stratify=target)
    fit_x, eval_x, fit_y, eval_y = split_parts

    # The first member's name ('lvsc') is kept verbatim: it is a runtime key.
    member_list = [
        ('lvsc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ]
    voter = VotingClassifier(member_list)
    voter.fit(fit_x, fit_y)

    confidence = voter.score(eval_x, eval_y)
    print('Accuracy', confidence)

    predicted = voter.predict(eval_x)
    print('Prediction Spread: ', Counter(predicted))
    return confidence
def analysis_stock(tickers, df, start, end):
    """Print a recommendation for every ticker whose ensemble scores above 0.6.

    For each ticker a fresh voting ensemble is fitted on a 75/25 split.  If
    the hold-out score exceeds 0.6, the score, the predicted class counts, a
    recommendation line and the prediction for the last feature row are
    printed.

    Args:
        tickers: Iterable of tickers to analyse.
        df: Passed to ``extract_featuresets``; it is rebound to the frame
            that call returns, so later tickers receive the returned frame.
        start: Unused in this function.
        end: Unused in this function.
    """
    for ticker in tickers:
        X, y, df = extract_featuresets(df, ticker)
        train_x, test_x, train_y, test_y = train_test_split(
            X, y, test_size=0.25)

        voter = VotingClassifier([
            ('lsvc', svm.LinearSVC()),
            ('knn', neighbors.KNeighborsClassifier()),
            ('rfor', RandomForestClassifier()),
        ])
        voter.fit(train_x, train_y)

        confidence = voter.score(test_x, test_y)
        held_out_predictions = voter.predict(test_x)

        # Guard clause: nothing is reported unless the score exceeds 0.6.
        if not (confidence > 0.6):
            continue

        print('accuracy:', confidence)
        print('predicted class counts:', Counter(held_out_predictions))
        print(' Recommend invesment for next 5-7 days:', ticker)
        print('Predictions for next 5-7 days: ', voter.predict(X[-1:]))
def classifier5():
    """Time the training of a hard-voting LinearSVC / logistic-regression ensemble.

    Uses the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.  Training time and test accuracy are printed.

    Results recorded by the original author:
        46.21 Seconds to train SVC...
        Test Accuracy of SVC = 0.9851
    """
    logistic = LogisticRegression(random_state=1)
    # Constructed but not part of the ensemble below.
    clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=1)
    linear_svc = LinearSVC(C=0.1)
    linear_svc.probability = True

    members = [('svc', linear_svc), ('clf1', logistic)]
    voter = VotingClassifier(estimators=members, voting='hard')

    # Training time.
    t = time.time()
    voter.fit(X_train, y_train)
    t2 = time.time()
    print(round(t2 - t, 2), 'Seconds to train SVC...')

    # Test-set accuracy.
    acc = voter.score(X_test, y_test)
    print('Test Accuracy of SVC = ', round(acc, 4))

    # Start of the single-sample prediction timing.
    t = time.time()
def use_std_vote_clf(X, y):
    """Score scikit-learn's hard-voting classifier on a train/test split.

    ``estimators`` is given a list of ``("name", model)`` tuples.

    Args:
        X: Feature data, forwarded to ``get_train_test``.
        y: Labels, forwarded to ``get_train_test``.
    """
    X_train, X_test, y_train, y_test = get_train_test(X, y)

    named_estimators = [
        ("log_clf", LogisticRegression()),
        ("svm_clf", SVC()),
        ("dt_clf", DecisionTreeClassifier()),
    ]
    voter = VotingClassifier(estimators=named_estimators, voting="hard")
    voter.fit(X_train, y_train)

    score = voter.score(X_test, y_test)
    print("sklearn_pipe_voting_classifier_score=", score)
def do_ml(ticker):
    """Run three machine-learning algorithms inside a voting classifier.

    The model learns when to buy, sell or hold.

    :param ticker: Ticker of the cryptocurrency to run this process on
    :return: the confidence (hold-out score) of the model
    """
    features, labels, _ = extract_features_sets(ticker)
    pieces = cross_validation.train_test_split(features, labels, test_size=0.25)
    train_features, test_features, train_labels, test_labels = pieces

    voting_model = VotingClassifier([
        ('lsvc', svm.LinearSVC()),
        ('knn', neighbors.KNeighborsClassifier()),
        ('rfor', RandomForestClassifier()),
    ])
    voting_model.fit(train_features, train_labels)

    # Score and predict first; the prediction tally is printed before accuracy.
    confidence = voting_model.score(test_features, test_labels)
    predicted = voting_model.predict(test_features)

    print('Prediction:', Counter(predicted))
    print('Accuracy', confidence)
    return confidence
# Fit the GBC model on the training split and print its test score.
tree6 = GBC()
tree6.fit(xtrain,ytrain1)
print(tree6.score(xtest,ytest1))
# TODO: tune n_estimators, and try enabling warm start alongside it.

# In[31]:

# A VotingClassifier combines several different classifiers. No `voting`
# argument is passed below, so the library default applies.
clff1 = SVC()
clff2 = RFC(bootstrap=False)
clff3 = ETC()
clff4 = neighbors.KNeighborsClassifier()
clff5 = quadda()
from sklearn.ensemble import VotingClassifier
from sklearn import cross_validation
eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
eclf = eclf.fit(xtrain,ytrain1)
print(eclf.score(xtest,ytest1))
# Unfinished per-classifier cross-validation sketch, left disabled:
# for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
# cla
# scores = crossvalidation.cross_val_score(claf,xtrain,ytrain1,scoring='accuracy')
# print ()

# In[ ]:
model = runModel(model=model, trainX=X_train[0:30000], trainY=y_train[0:30000], optimize=False, parameters=None, scoring='roc_auc') print "Applying Model ..." start = time() y_pred = model.predict(X_test) print("Model took %.2f seconds to predict vals" % (time() - start)) ### Evaluation print "Scoring Classifier..." start = time() score = model.score(X_test, y_test) recall = metrics.recall_score(y_test, y_pred, average='binary') auc = metrics.roc_auc_score(y_test, y_pred, average='macro') confusion = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1]) print "Score: \t \t Recall: \t AUC:\n", score, recall, auc print("Model took %.2f seconds to score" % (time() - start)) if plot_roc: fpr, tpr, thrsh = metrics.roc_curve(y_test, y_pred, pos_label=1) plt.figure() plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0])
SelectFwe(score_func=f_classif, alpha=0.04), RandomForestClassifier(criterion="entropy", max_features=0.6000000000000001, min_samples_split=5, n_estimators=100) ) # 0.82 #clf4 = exported_pipeline = make_pipeline( # StackingEstimator(estimator=LogisticRegression(C=1.0, dual=True)), # RandomForestClassifier(max_features=0.6000000000000001, min_samples_leaf=20, min_samples_split=18) #) #eclf1 = VotingClassifier(estimators=[ # ('lr', clf1), ('rf', clf2), ('gnb', clf3), ('rnd', clf4)], voting='hard') eclf1 = VotingClassifier(estimators=[ ('lr', clf1), ('gnb', clf2), ('rnd', clf3)], voting='hard') eclf1 = eclf1.fit(X_train, y_train) print(eclf1.score(X_test, y_test)) model1 = clf1.fit(X_train, y_train) print(model1.score(X_test, y_test)) model2 = clf2.fit(X_train, y_train) print(model2.score(X_test, y_test)) model3 = clf3.fit(X_train, y_train) print(model3.score(X_test, y_test)) #model4 = clf4.fit(X_train, y_train) #print(model4.score(X_test, y_test)) #tpot = TPOTClassifier(generations=20, population_size=50, verbosity=2) #tpot.fit(X_train, y_train)
"orig_destination_distance", "srch_ci", "srch_co"] features = [column for column in features if column not in removelist] print("The features considered are:") print(features) start_time = timeit.default_timer() # Create and fit a decision tree to the set of data in those features y = trainFull["hotel_cluster"] X = trainFull[features] rf = RandomForestClassifier(n_estimators=20, n_jobs=-1, max_features=None, min_samples_split=250) ovr = OneVsRestClassifier(RandomForestClassifier(n_estimators=10, n_jobs=-1, max_features=None, min_samples_split=250), n_jobs=-1) dt = DecisionTreeClassifier(min_samples_split=250, criterion="entropy") vc = VotingClassifier(estimators=[('rf', rf), ('ovr', ovr), ('dt', dt)], voting='hard') vc.fit(X, y) # Measure ability to predict the right hotel clust for a new subset testX = test_set[features] testy = test_set["hotel_cluster"] prediction = vc.predict(testX) report = classification_report(testy, prediction, digits=5) print(report) elapsed = timeit.default_timer() - start_time print(elapsed) score = vc.score(testX, testy) print("Score is " + str(score))
def myclassify(numfiers,xtrain,ytrain,xtest,ytest):
    """Fit up to ``numfiers`` classifiers and print each one's test score.

    Candidates are tried in a fixed order (see ``candidates`` below). The
    first ``numfiers`` of them are fitted on ``(xtrain, ytrain)`` and scored
    with ``estimator.score(xtest, ytest)``. A progress line is printed after
    every fit, and one summary line per trained classifier is printed at the
    end.

    Parameters
    ----------
    numfiers : number
        How many classifiers to train. Must be positive. Values larger than
        the number of candidates simply train all of them.
    xtrain, ytrain
        Training features and labels; ``ytrain`` is flattened with
        ``np.ravel`` before use.
    xtest, ytest
        Evaluation features and labels; ``ytest`` is flattened with
        ``np.ravel`` before use.

    Returns
    -------
    None
        Results are only printed.

    Raises
    ------
    ValueError
        If ``numfiers`` is zero or negative. Zero previously failed with a
        ZeroDivisionError after the first model had already been trained.
    """
    if numfiers <= 0:
        raise ValueError("numfiers must be positive, got %r" % (numfiers,))

    print(numfiers)
    ytrain = np.ravel(ytrain)
    ytest = np.ravel(ytest)

    def build_voter():
        # VotingClassifier combines completely different machine learning
        # classifiers and uses a majority vote.
        return VotingClassifier(estimators = [
            ('svc', SVC()),
            ('rfc', RFC(bootstrap=False)),
            ('etc', ETC()),
            ('knn', neighbors.KNeighborsClassifier()),
            ('qda', quadda()),
        ])

    # (summary label, zero-argument factory) in training order. Factories
    # are called lazily so that only the requested estimators are built.
    candidates = [
        ("BaggingETC (with bootstraps set to false)",
         lambda: BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)),
        ("ETC", ETC),
        ("BaggingETC", lambda: BaggingClassifier(ETC())),
        ("Voting Classifier", build_voter),
        ("svm", SVC),
        # Quadratic discriminant analysis - quadratic decision boundary.
        ("QDA", quadda),
        ("DTC", DTC),
        # Classifies based on the k nearest neighbors.
        ("KNN (default)", neighbors.KNeighborsClassifier),
        # Linear discriminant analysis - linear decision boundary.
        ("LDA", linda),
        ("RFC", RFC),
        ("BaggingRFC (with bootstraps set to false)",
         lambda: BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)),
        ("BaggingSVC (with bootstraps set to false)",
         lambda: BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)),
        ("RFC (bootstrap false)", lambda: RFC(bootstrap=False)),
        ("GBC", GBC),
        ("knn (n_neighbors = 10)",
         lambda: neighbors.KNeighborsClassifier(n_neighbors = 10)),
        ("knn (n_neighbors = 3)",
         lambda: neighbors.KNeighborsClassifier(n_neighbors = 3)),
        ("knn (ball tree algorithm)",
         lambda: neighbors.KNeighborsClassifier(algorithm = 'ball_tree')),
        ("knn (kd_tree algorithm)",
         lambda: neighbors.KNeighborsClassifier(algorithm = 'kd_tree')),
        ("Nearest Centroid", NearestCentroid),
        # Nearest shrunken centroid. Only one score is reported for it, and
        # 0.5 is the threshold whose fit was actually scored; the smaller
        # thresholds of the former sweep were fitted and then discarded.
        ("Shrunken Centroid?", lambda: NearestCentroid(shrink_threshold = 0.5)),
        ("ABC", ABC),
    ]

    trained_labels = []
    scores = []
    final_position = len(candidates) - 1
    for position, (label, build) in enumerate(candidates):
        # Stop once the requested number of classifiers has been trained.
        if position >= numfiers:
            break

        model = build()
        model.fit(xtrain,ytrain)
        trained_labels.append(label)
        scores.append(model.score(xtest,ytest))

        count = position + 1
        progress = ("percentage classifcation complete: %s"
                    % str(round(100*(float(count)/numfiers)))) + "%"
        # The first and the final candidate have always reported only the
        # percentage; the output text is kept exactly as it was.
        if 0 < position < final_position:
            progress += " " + str(numfiers-count) + "classifiers left to train"
        print(progress)

    for label, score in zip(trained_labels, scores):
        print("{} classifier has percent correct {}".format(label, score))
# Remaining members of the voting ensemble. cl1 and cl2 are expected to be
# defined before this point. Every model below is fitted on (X, Y) and
# scored on (P, Q).
cl3 = GradientBoostingClassifier(n_estimators=1000, learning_rate=1,
                                 max_depth=10, random_state=0,
                                 min_samples_split=5)
cl4 = GaussianNB()
# MLPClassifier's optimizer keyword is ``solver``; ``algorithm`` was a
# pre-release name that released scikit-learn versions reject.
cl5 = MLPClassifier(solver='adam', alpha=0.01, max_iter=500,
                    learning_rate='constant', hidden_layer_sizes=(400,),
                    random_state=0, learning_rate_init=1e-2,
                    activation='logistic')

# Flatten the label frame once; every fit below uses the same 1-D target.
y_train_flat = Y.values.ravel()

# Hard voting: each estimator casts one vote per sample.
eclf1 = VotingClassifier(estimators=[
    ('rf', cl1), ('svc', cl2), ('gbc', cl3), ('gnb', cl4), ('mlp', cl5)
], voting='hard')
eclf1 = eclf1.fit(X, y_train_flat)
print ("Accuracy of Voting Ensemble: "+str(eclf1.score(P,Q)))

clf5 = SGDClassifier(loss="perceptron", penalty="elasticnet",
                     random_state=0).fit(X, y_train_flat)
print ("Accuracy of SGDClassifier: "+str(clf5.score(P,Q)))

gbc = GradientBoostingClassifier(loss='exponential').fit(X, y_train_flat)
adaboost = AdaBoostClassifier(n_estimators=10000,
                              learning_rate=100).fit(X, y_train_flat)
print ("Accuracy of GBC: "+str(gbc.score(P,Q)))
print ("Accuracy of Adaboost: "+str(adaboost.score(P,Q)))

### Calculate MSE of different models
# NOTE(review): ``clf`` is not defined in this section; presumably an
# estimator fitted earlier in the script -- confirm.
rf = clf.predict(P)