def main():
    """Run phase 1 (training via the project Trainer); phase 2 is not reached.

    Phase 1 shuffles the dataset, builds train/test ``SigData`` objects that
    share one ``LabelBinarizer``, and trains for 60 epochs. The function then
    calls ``exit()``, so the phase-2 classifier construction below it is
    currently unreachable.
    """
    ### PHASE 1 ###
    num_classes = 21
    name_encoder = preprocessing.LabelBinarizer()

    shuffler.shuffleData()  # shuffle dataset
    trainData = dataset.SigData("data/train_data.csv", name_encoder)
    testData = dataset.SigData("data/test_data.csv", name_encoder)
    print("created datasets.")

    mytrainer = trainer.Trainer(
        trainData, testData,
        num_users=num_classes)  # number of people in the training set
    print("training.")
    mytrainer.train(num_epochs=60)
    exit()

    # Load checkpoint as trained weights for CNN
    ### PHASE 2 ###
    # new_dataset for a user
    # features = mytrainer.model.get_feature_vectors(new_dataset)
    # NOTE(review): assumes `svm` is sklearn.svm, whose classifier class is
    # spelled `SVC`; `svm.svc` would raise AttributeError.
    forg_classifier = svm.SVC(class_weight='balanced')
    forg_classifier_rbf = svm.SVC(kernel='rbf', class_weight='balanced')
def get_model(choice='lr', class_weight=None):
    """Return an unfitted estimator selected by a short string key.

    Args:
        choice: one of 'svc', 'lsvc', 'knn', 'msvm', 'gnb', 'gpc', 'sgdc',
            'rf'; any other value selects the ``lr`` estimator.
        class_weight: forwarded to every estimator except 'knn' and 'msvm'.

    Returns:
        The newly constructed estimator.

    NOTE(review): if these aliases are the scikit-learn classes, several of
    the keyword arguments (``n_jobs`` on the SVMs, ``class_weight`` on the
    naive-Bayes / Gaussian-process models) may not be accepted - confirm
    against the file's imports.
    """
    # Builders are lazy so that only the requested estimator is constructed.
    builders = (
        ('svc', lambda: svc(verbose=1, class_weight=class_weight, n_jobs=-1)),
        ('lsvc', lambda: lsvc(class_weight=class_weight, n_jobs=-1)),
        ('knn', lambda: KNeighborsClassifier()),
        ('msvm', lambda: MulticlassSVM(C=0.1, tol=0.01, max_iter=100,
                                       random_state=0, verbose=1)),
        ('gnb', lambda: gnb(class_weight=class_weight)),
        ('gpc', lambda: gpc(class_weight=class_weight)),
        ('sgdc', lambda: sgdc(class_weight=class_weight)),
        ('rf', lambda: rf(class_weight=class_weight)),
    )
    for key, build in builders:
        if choice == key:
            return build()
    # Fallback for 'lr' and for every unrecognised key.
    return lr(class_weight=class_weight)
def quick_ml(df, results, filter_slice, target_df, loss_df, target, loss):
    """Fit an MLP on an 80/20 ordered split and print a classification report.

    The binary label is 1 where ``target_df[target] < loss_df[loss]`` for the
    rows selected through ``df['indexes'].loc[filter_slice]``.

    Args:
        df: frame whose 'indexes' column holds the row labels to use.
        results: feature frame, indexed by those row labels.
        filter_slice: label used to pick the entry of ``df['indexes']``.
        target_df, loss_df: frames providing the two columns to compare.
        target, loss: column names in ``target_df`` / ``loss_df``.

    Returns:
        None; the report is printed.
    """
    # Create x and y
    ind = df['indexes'].loc[filter_slice][0].values
    x = results.loc[ind].copy()
    y = (target_df.loc[ind, target] < loss_df.loc[ind, loss]).astype(int)

    # Create sets. Positional slicing is used on purpose: label slicing with
    # .loc is end-inclusive, which put row `row` in both train and test.
    x.reset_index(inplace=True, drop=True)
    y.index = x.index
    row = int(x.shape[0] * .8)
    x_train, x_test = x.iloc[:row], x.iloc[row:]
    y_train, y_test = y.iloc[:row], y.iloc[row:]

    # Scale values using statistics from the training part only.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Model. Only the MLP was ever fitted; estimators that were constructed
    # and immediately overwritten have been removed.
    model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                          hidden_layer_sizes=(500, 4), random_state=1)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print(classification_report(y_test, predictions))
    return None
def train(self, model_id):
    """Train the model container once and attach the pickled classifier.

    Returns:
        The updated ``model_cont`` after training, or ``False`` when its
        'train_status' is already "trained".

    NOTE(review): ``model_cont``, ``user_data_path``, ``params``,
    ``features`` and ``labels`` are not defined in this function; they must
    come from an enclosing scope - confirm.
    """
    # Nothing to do when the container was trained before.
    if model_cont['train_status'] == "trained":
        print("Already trained")
        return False

    # Load the data to train on.
    loader = DataLoader()
    dataset = loader.load_user_data(user_data_path)

    # TODO: load model specific parameters
    if "train_test_split" in params.keys() and params["train_test_split"]:
        data_split = params['train_test_split']
        trainset = DataProcessor().get_trainset(features, labels, data_split)

    # Train the model on the full loaded dataset.
    clf = svc()
    clf.fit(dataset['features'], dataset['labels'])
    pickled = pdumps(clf)

    # Update the model object with the results of training.
    model_cont['learned_model'] = Binary(pickled)
    model_cont['train_status'] = "trained"
    return model_cont
def classifier(data, y, model="forest"):
    """Fit and return a scikit-learn classifier chosen by name.

    Args:
        data: training features, passed to ``est.fit``.
        y: training labels; for ``model="logistic"`` it must provide
            ``nunique()`` (e.g. a pandas Series).
        model: one of "forest", "tree", "extra", "logistic", "svm",
            "boost", "neural".

    Returns:
        The fitted estimator.

    Raises:
        ValueError: if ``model`` is not a supported name (previously this
            surfaced as an UnboundLocalError on ``est``).
    """
    # Imports are local so only the selected estimator's module is loaded.
    if model == "forest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        est = rfc(n_estimators=10, n_jobs=-1)
    elif model == "tree":
        from sklearn.tree import DecisionTreeClassifier as dtc
        est = dtc()
    elif model == "extra":
        from sklearn.ensemble import ExtraTreesClassifier as etc
        est = etc(n_estimators=10, n_jobs=-1)
    elif model == "logistic":
        from sklearn.linear_model import LogisticRegression as lr
        cases = y.nunique()
        if cases > 2:
            est = lr(solver="newton-cg", multi_class="multinomial")
        else:
            est = lr(n_jobs=-1)
    elif model == "svm":
        from sklearn.svm import SVC as svc
        est = svc()
    elif model == "boost":
        from sklearn.ensemble import GradientBoostingClassifier as gbc
        est = gbc(n_estimators=10)
    elif model == "neural":
        from sklearn.neural_network import MLPClassifier as nnc
        est = nnc(max_iter=10, learning_rate_init=1)
    else:
        raise ValueError("unknown classifier model: %r" % (model,))
    est.fit(data, y)
    return est
def regression(data, y, model="forest"):
    """Fit and return a scikit-learn regressor chosen by name.

    Args:
        data: training features, passed to ``est.fit``.
        y: training targets.
        model: one of "forest", "tree", "extra", "linear", "svm", "boost",
            "neural".

    Returns:
        The fitted estimator.

    Raises:
        ValueError: if ``model`` is not a supported name (previously this
            surfaced as an UnboundLocalError on ``est``).
    """
    # Imports are local so only the selected estimator's module is loaded.
    if model == "forest":
        from sklearn.ensemble import RandomForestRegressor as rfc
        est = rfc(n_estimators=10, n_jobs=-1)
    elif model == "tree":
        from sklearn.tree import DecisionTreeRegressor as dtc
        est = dtc()
    elif model == "extra":
        from sklearn.ensemble import ExtraTreesRegressor as etc
        est = etc(n_estimators=10, n_jobs=-1)
    elif model == "linear":
        from sklearn.linear_model import LinearRegression as lr
        est = lr(n_jobs=-1)
    elif model == "svm":
        from sklearn.svm import SVR as svc
        est = svc()
    elif model == "boost":
        from sklearn.ensemble import GradientBoostingRegressor as gbc
        est = gbc(n_estimators=10)
    elif model == "neural":
        from sklearn.neural_network import MLPRegressor as nnc
        est = nnc(max_iter=10, learning_rate_init=1)
    else:
        raise ValueError("unknown regression model: %r" % (model,))
    est.fit(data, y)
    return est
def classification(self, metric, folds, alphas, graph):
    """Cross-validate a fixed set of baseline classifiers and report scores.

    Args:
        metric: scoring name or callable passed to ``cross_val_score``.
        folds: number of stratified, shuffled CV folds.
        alphas: unused in this method.
        graph: when true, show a box plot of the per-fold scores.

    Side effects:
        Stores the estimator dict on ``self.models``, prints a banner and
        displays the report frame.

    Returns:
        None.
    """
    size = 1.3 * self.report_width // 10
    models = {}
    models["K nearest neighbors classifier K2"] = knnc(n_neighbors=2)
    models["K nearest neighbors classifier K5"] = knnc(n_neighbors=5)
    models["K nearest neighbors classifier K10"] = knnc(n_neighbors=10)
    models["Decision tree classifier"] = dtc()
    models["Logistic classifier"] = logitc()
    models["SVM classifier with RBF kernel"] = svc(gamma='scale')
    models["SVM classifier with linear kernel"] = svc(kernel='linear')
    models["Gaussian naive bayes"] = gnbc()
    models["Bernoulli naive bayes"] = bnbc()
    models["SGD classifier"] = sgdc(max_iter=10000)
    models["Random forest classifier"] = rfc(n_estimators=100)
    models["Gradient boosting classifier"] = gbc()
    self.models = models

    print('\n')
    print(self.report_width * '*', '\n*')
    print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')

    kf = StratifiedKFold(n_splits=folds, shuffle=True)
    results = []
    names = []
    for model_name in models:
        # `metric` was previously accepted but never used, so every model was
        # scored with its default scorer regardless of the caller's request.
        cv_scores = cross_val_score(models[model_name], self.Xt_train,
                                    self.yt_train.values.ravel(), cv=kf,
                                    scoring=metric, error_score=np.nan)
        results.append(cv_scores)
        names.append(model_name)
    print(self.report_width * '*', '')

    report = pd.DataFrame({'Classifier': names, 'Score': results})
    report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
    report['Score (std)'] = report.Score.apply(lambda x: x.std())
    # Coefficient of variation of the fold scores, in percent.
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True, ascending=False)
    report.drop('Score', axis=1, inplace=True)
    display(report)
    print('\n')

    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Classifier Comparison')
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0)
        plt.show()
    return None
def makeSVM(self,**kwargs):
    """Fit the completeness classifier on the scaled catalogue data.

    ``self.FitScale(self.data)`` supplies three scaled columns, stored
    stacked in ``self.scaled_data``. The classifier (built with
    ``probability=True`` plus ``kwargs``) is stored in ``self.completeness``
    and fitted on all three columns when ``self.spatial`` is truthy,
    otherwise on the first column only. Labels are ``self.det.ravel()``.
    """
    mag, x_pos, y_pos = self.FitScale(self.data)
    self.scaled_data = np.c_[mag, x_pos, y_pos]
    self.completeness = svc(probability=True, **kwargs)
    # Non-spatial mode ignores the two position columns.
    features = self.scaled_data if self.spatial else np.c_[mag]
    self.completeness.fit(features, self.det.ravel())
def makeSVM(self, **kwargs):
    """Build ``self.completeness`` and fit it to the detection flags.

    Args:
        **kwargs: extra keyword arguments for the classifier constructor
            (``probability=True`` is always set).

    The three arrays returned by ``self.FitScale(self.data)`` are stacked
    column-wise into ``self.scaled_data``. With ``self.spatial`` set the
    classifier is trained on that full matrix; otherwise only the first
    array is used as a single feature column.
    """
    scaled_mag, scaled_x, scaled_y = self.FitScale(self.data)
    self.scaled_data = np.c_[scaled_mag, scaled_x, scaled_y]
    self.completeness = svc(probability=True, **kwargs)

    if not self.spatial:
        self.completeness.fit(np.c_[scaled_mag], self.det.ravel())
        return
    self.completeness.fit(self.scaled_data, self.det.ravel())
def support_vector_machine():
    """Fit an SVM classifier on ``X``/``y`` and predict ``x_test``.

    ``X`` (predictors), ``y`` (targets) and ``x_test`` are not parameters;
    they are expected to exist in the enclosing/global scope. The score and
    predictions are computed but not returned.
    """
    # Import Library
    from sklearn import svm

    # Create SVM classification object. The class is spelled `SVC`;
    # `svm.svc` raises AttributeError. Many more options are available, this
    # is the simplest classification setup.
    model = svm.SVC()

    # Train the model using the training sets and check score
    model.fit(X, y)
    # Mean accuracy on the training data (not R^2: this is a classifier).
    model.score(X, y)

    # Predict Output
    predicted = model.predict(x_test)
def SVC(test_data, test_label, train_data, train_label, d):
    """Train a polynomial-kernel SVM and count its misclassifications.

    Args:
        test_data, test_label: evaluation samples and labels.
        train_data, train_label: training samples and labels.
        d: degree of the polynomial kernel.

    Returns:
        Tuple ``(predictions on test_data, train error count, test error
        count)``, where each count is ``(1 - score) * number of samples``.
    """
    # kernel must be one of 'linear', 'poly', 'rbf', 'sigmoid',
    # 'precomputed' or a callable.
    model = svc(kernel='poly', degree=d, C=5, gamma=0.05, probability=True)
    fitted = model.fit(train_data, train_label)

    train_score = fitted.score(train_data, train_label)
    test_score = model.score(test_data, test_label)
    predicted = model.predict(test_data)

    train_errors = (1 - train_score) * len(train_data)
    test_errors = (1 - test_score) * len(test_data)
    return predicted, train_errors, test_errors
def CrossValidate(self,c_range=np.logspace(-3.0,3.0,7),gamma_range=np.logspace(-3.0,3.0,7)):
    """Grid-search ``C`` and ``gamma`` for an RBF classifier and print the best.

    Args:
        c_range: candidate values for ``C``.
        gamma_range: candidate values for ``gamma``.

    Features come from ``self.Scale(self.data)``: all three returned arrays
    when ``self.spatial`` is truthy, otherwise only the first. Labels are
    ``self.det.ravel()``.

    NOTE(review): ``StratifiedShuffleSplit(labels, n_iter=...)`` is the
    signature of the legacy ``sklearn.cross_validation`` module - confirm
    which module the file imports it from.
    """
    search_space = {'C': c_range, 'gamma': gamma_range}
    splitter = StratifiedShuffleSplit(self.det, n_iter=5, test_size=0.2)
    search = GridSearchCV(svc(kernel='rbf', cache_size=1000),
                          param_grid=search_space, cv=splitter)

    mag, x_pos, y_pos = self.Scale(self.data)
    features = np.c_[mag, x_pos, y_pos] if self.spatial else np.c_[mag]
    search.fit(features, self.det.ravel())

    print("The best parameters are %s with a score of %0.2f"
          % (search.best_params_, search.best_score_))
def train_model(train_X, train_Y, valid_X, valid_Y, hyper_param1):
    """Train an RBF-kernel SVM and report its validation accuracy.

    Args:
        train_X, train_Y: training samples and labels.
        valid_X, valid_Y: validation samples and labels.
        hyper_param1: currently unused.

    Returns:
        Tuple ``(fitted classifier, validation accuracy in percent)``.
    """
    # Choose a classifier (here, an RBF-kernel SVM)
    clf = svc(C=1.0, kernel='rbf', max_iter=1000)

    # train
    clf.fit(train_X, train_Y)

    # validation: share of correct predictions. The denominator was
    # hard-coded to 200, which is only right for a 200-sample validation set.
    valid_Y_hat = clf.predict(valid_X)
    accuracy = np.mean(valid_Y_hat == valid_Y) * 100.0
    print('validation accuracy = ' + str(accuracy) + ' %')

    return clf, accuracy
def train(self, model_id):
    """Train the stored model ``model_id`` and mark it trained in MongoDB.

    Looks the model container up by ``_id``; unless its 'train_status' is
    "training", loads the user data, fits a classifier, saves ``clf.coef_``
    and writes the container back with status "trained".

    NOTE(review): ``params``, ``data_path``, ``features`` and ``labels`` are
    not defined in this function - confirm where they come from.
    NOTE(review): if ``svc`` is scikit-learn's SVC, ``coef_`` only exists for
    a linear kernel, so the default constructor may fail here - confirm.
    """
    try:
        conn = MongoClient()
        print("\nConnection Successful")

        # Connect to the database and load the collection.
        mlaas_db = conn[DB_NAME]
        mlaas_db.authenticate(USERNAME, PASS)
        models = mlaas_db[COLL_NAME]

        # Get the model corresponding to the id provided.
        model_cont = models.find_one({'_id': ObjectId(model_id)})
        assert model_cont, "Invalid model ID"

        # Block if the model is being trained currently.
        currently_training = not (model_cont['train_status'] != "training")
        if not currently_training:
            # Init the data loader and data processor.
            data_loader = DataLoader()
            data_processor = DataProcessor()

            dataset = data_loader.load_user_data(data_path, USER_DATA_FNAME)
            if 'train_test_split' in params.keys():
                data_split = params['train_test_split']
                # TODO: implement this function
                trainset = data_processor.get_trainset(
                    features, labels, data_split)

            clf = svc()
            clf.fit(dataset['features'], dataset['labels'])
            save_pkl(clf.coef_, data_path, WEIGHTS_FNAME)

            model_cont['train_status'] = "trained"
            models.update({'_id': ObjectId(model_id)},
                          {'$set': model_cont},
                          upsert=False)
    # TODO: figure out why this form of catch clause does not work
    except ConnectionFailure as conn_e:
        print("\nCould not connect to server. \
 Raised the following exception:\n{}".format(conn_e))
def CrossValidate(self, c_range=np.logspace(-3.0, 3.0, 7),
                  gamma_range=np.logspace(-3.0, 3.0, 7)):
    """Search the ``C`` x ``gamma`` grid for the RBF classifier.

    Runs ``GridSearchCV`` with a 5-iteration stratified shuffle split
    (20 % held out) over the given ranges and prints the best parameters
    and score. Nothing is returned or stored on ``self``.

    NOTE(review): passing the labels and ``n_iter`` to
    ``StratifiedShuffleSplit`` matches the legacy
    ``sklearn.cross_validation`` API - confirm the import.
    """
    param_grid = dict(C=c_range, gamma=gamma_range)
    cv = StratifiedShuffleSplit(self.det, n_iter=5, test_size=0.2)
    estimator = svc(kernel='rbf', cache_size=1000)
    grid = GridSearchCV(estimator, param_grid=param_grid, cv=cv)

    scaled_mag, scaled_x, scaled_y = self.Scale(self.data)
    # Position columns are only used in spatial mode.
    if self.spatial:
        design = np.c_[scaled_mag, scaled_x, scaled_y]
    else:
        design = np.c_[scaled_mag]
    grid.fit(design, self.det.ravel())

    summary = (grid.best_params_, grid.best_score_)
    print("The best parameters are %s with a score of %0.2f" % summary)
def __fitsvm__(self, **args):
    """Fit the classifier on the images in ``args['train_object_folder']``.

    Args (read from ``**args``):
        train_object_folder: passed to ``returnFiles``.
        train_object_labels: labels, expanded per object via ``hlp``.
        is_save: when truthy, the private save routine is called.

    Side effects:
        Sets ``self.data`` to the result of ``fit`` and ``self.threshold``
        to 0.5.

    NOTE(review): ``getargspec`` is deprecated and fails on keyword-only
    signatures - confirm it works with the installed library versions.
    """
    image_files = returnFiles(args['train_object_folder'])
    train_objects = preproc(returnImages(image_files))

    # Map each constructor argument after `self` to its default value.
    spec = getargspec(svc.__init__)
    svc_args = dict(zip(spec[0][1:], spec[3]))
    classifier_raw = svc(**hlp.chooseArgs(svc_args, self.args))

    number_of_objects = hlp.listLengths(train_objects)

    # Merge the first two axes and the next two, giving one row per image.
    shp = np.shape(train_objects)
    flat_shape = (shp[0] * shp[1], shp[2] * shp[3])
    train_objects = np.reshape(train_objects, flat_shape)

    train_object_labels_expended = hlp.listExpend(
        args['train_object_labels'], number_of_objects)
    self.data = classifier_raw.fit(train_objects.tolist(),
                                   train_object_labels_expended)
    self.threshold = 0.5
    if args['is_save']:
        self.__save()
def svc(x_train, y_train, C=1, kernel='rbf', x_test=False):
    '''
    C-Support Vector Classification, based on libsvm.

    Fit time is more than quadratic in the number of samples, so this is
    hard to scale beyond a few tens of thousands of samples.

    :param x_train: training samples
    :param y_train: training labels
    :param C: regularisation parameter passed to the classifier
    :param kernel: one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
        or a callable. A callable is used to pre-compute the kernel matrix
        from the data matrices; that matrix should have shape
        (n_samples, n_samples).
    :param x_test: optional samples to predict; ``False`` (default) or
        ``None`` means "no test set"
    :return: the fitted model, or ``(model, predictions)`` when ``x_test``
        is given
    '''
    # The sklearn class is `SVC`; `svm.svc` raises AttributeError.
    model = svm.SVC(C=C, kernel=kernel)
    model.fit(x_train, y_train)

    # Identity test on purpose: `x_test == False` is element-wise (and then
    # ambiguous in a boolean context) when x_test is an array.
    if x_test is False or x_test is None:
        print('Score:', model.score(x_train, y_train))
        return model
    return model, model.predict(x_test)
def predictedSVC():
    """Grid-search a vectorizer + SVC pipeline, report results, save the model.

    Uses the module-level ``vectorizer``, ``docs_train``/``y_train``,
    ``docs_test``/``y_test`` and ``dataset``. Prints the best parameters,
    elapsed time, classification report and confusion matrix, draws the
    matrix, and dumps the fitted search object to "svc_model.plk".
    """
    start = time.time()
    clf = Pipeline([('vect', vectorizer),
                    ('clf', svc(tol=1e-3, verbose=0, random_state=42,
                                C=1.0, max_iter=-1, gamma='scale',
                                probability=True))],
                   verbose=True)
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
        'clf__tol': (1e-3, 1e-2, 5e-3, 2e-3, 3e-3, 4e-3),
        'clf__gamma': ('auto', 'scale'),
        'clf__C': (1.0, .1, .2, .3, .4, 0.5, 0.6, 0.7, 0.8, 0.9)
    }
    gs_clf = GridSearchCV(clf, parameters, cv=5, iid=False, n_jobs=-1)
    gs_clf.fit(docs_train, y_train)
    y_predicted = gs_clf.predict(docs_test)
    print(gs_clf.best_params_)
    # Elapsed time is now - start; the operands were swapped, which printed
    # a negative duration.
    print("End.......... total=%.2f s" % (time.time() - start))

    # Print the classification report
    print(
        metrics.classification_report(y_test, y_predicted,
                                      target_names=dataset.target_names))
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)
    plt.matshow(cm, cmap=plt.cm.jet)
    joblib.dump(gs_clf, "svc_model.plk")
def score(params):
    """Objective function: fit an SVC with ``params`` and return its loss.

    Args:
        params: keyword arguments for the classifier, plus an optional
            'scale' entry; ``scale == 1`` standardises the features first.
            The caller's dict is not modified.

    Returns:
        ``{'loss': <Scorer result>, 'status': STATUS_OK}``.

    Uses the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``ou_test``.
    """
    print("Training with params : ")
    print(params)

    # Work on a copy so the 'scale' entry is not deleted from the caller's dict.
    svc_params = dict(params)
    features_train, features_test = X_train, X_test
    if svc_params.pop('scale', None) == 1:
        # The scaled arrays used to be computed and thrown away, so the flag
        # had no effect. Only features are scaled; labels are left alone.
        scaler = preprocessing.StandardScaler().fit(X_train)
        features_train = scaler.transform(X_train)
        features_test = scaler.transform(X_test)

    # The sklearn class is `SVC`; `svm.svc` raises AttributeError.
    clf = svm.SVC(**svc_params)
    clf.fit(features_train, y_train)
    preds = clf.predict(features_test)
    print(preds)
    loss = Scorer(preds, y_test, ou_test)
    print(loss)
    return {'loss': loss, 'status': STATUS_OK}
print() print("Test Set Size") print(Y_test.shape) print() print("Classes:") print(target_names) print("----------------------") ## Vectorization object vectorizer = TfidfVectorizer( strip_accents=None, preprocessor=None, ) ## classifier svm = svc() ## With a Pipeline object we can assemble several steps ## that can be cross-validated together while setting different parameters. pipeline = Pipeline([ ('vect', vectorizer), ('svm', svm), ]) ## Setting parameters. ## Dictionary in which: ## Keys are parameters of objects in the pipeline. ## Values are set of values to try for a particular parameter. parameters = { 'vect__tokenizer': [None, stemming_tokenizer],
# Import Library
from sklearn import svm

# Training data: X holds the predictors, y one class label per sample.
# x_test (predictors of the test set) is assumed to be defined elsewhere.
X = [[0, 0], [2, 2]]
# Class labels must be discrete; the float targets used before are rejected
# by a classifier.
y = [0, 1]

# Create SVM (Support Vector Machine) classification object.
# The class is spelled `SVC`; `svm.svc` raises AttributeError.
model = svm.SVC(gamma='scale')

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)
print(svmP.w)

# In[9]:

# Confusion matrix and learned parameters of the svmD model.
test2 = svmD.predict(x_test)
res2 = cm(y_test, test2)
print(res2)
print(svmD.b)
print(svmD.w)
print(svmD.a)

# In[10]:

# Library model, primal formulation.
libP = svc()
libP.set_params(dual=False, C=soft)
libP.fit(x_train, y_train)
test3 = libP.predict(x_test)
print(cm(y_test, test3))
print(libP.coef_)

# In[50]:

# Library model, dual formulation.
libD = svc()
libD.set_params(dual=True, C=soft, max_iter=100000)
print 'f1 macro:', res print # color = cm(1. * i / NUM_COLORS) # color will now be an RGBA tuple # cm = plt.get_cmap('gist_rainbow') # fig = plt.figure(figsize=(8.0, 5.0)) # ax = fig.add_subplot(111) # # ax.set_color_cycle([cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)]) # ax.plot(range(len(scores)), scores, label=str(threshold)) # ax.text(len(scores) - 1, scores[len(scores) - 1], threshold, fontsize='smaller') # plt.show() print name return res vec_list = [tf(), cv()] clf_list = [svc(), lr()] threshold_list = np.arange(0.5, 3, 0.5) print len(threshold_list) # results_size = (len(vec_list), len(clf_list),len(threshold_list)) # results = np.zeros(results_size, dtype = np.float) # a, b, c = range(3), range(3), range(3) # def my_func(x, y, z): # return (x + y + z) / 3.0, x * y * z, max(x, y, z) grids = np.vectorize(run)(*np.ix_(threshold_list, vec_list, clf_list)) # mean_grid, product_grid, max_grid = grids print len(grids) try: print grids.shape except: print type(grids)
sex = 'F' scaler = StandardScaler() data_partial = data[data['Sex'] == sex].drop('Sex', axis=1) # corr_matrix_f, corr_matrix_m = data_f.corr(), data_m.corr() # plot_corr_matrices(corr_matrix_f, corr_matrix_m) y = data_partial['EmoState'] X = scaler.fit_transform(data_partial.drop('EmoState', axis=1)) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=71) models = (('DTC', dtc()), ('SVM', svc(C=10)), ('KNN', knc(n_neighbors=10)), ('SGDC', sgdc()), ('GNBC', gnbc()), ('MLPC', mlpc(max_iter=1000, learning_rate='adaptive'))) results = [] names = [] seed = 13 scoring = 'accuracy' for name, model in models: kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True) cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
# Split label column 'PFLAG' from the features: (x, y) is the training
# frame, (X, Y) the evaluation frame.
y = x['PFLAG'].values
x = x.drop(['PFLAG'], axis=1).values
Y = X['PFLAG'].values
X = X.drop(['PFLAG'], axis=1).values

# Reduce to 50 principal components, fitted on the training features only.
pca = PCA(n_components=50)
x = pca.fit_transform(x)
X = pca.transform(X)
print(x.shape)

# Modeling. Several estimators were assigned to `clf` in sequence and only
# the last one (AdaBoost over depth-1 trees) was ever fitted, so only that
# one is built here.
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm='SAMME', n_estimators=200)
clf.fit(x, y)
pred = clf.predict(X)
accuracy = np.mean(pred == Y)
print('Accuracy:' + str(accuracy * 100))

# ROC curve
Y = Y.tolist()
pred = pred.tolist()
fpr, tpr, thresholds = roc_curve(Y, pred)
plt.plot(fpr, tpr)
plt.show()
print 'test samples:', len(y_test) vec = cPickle.load(open('AE_unlabeled_vec2014-11-17 11:56:27.965705')) # vec = tf(vocabulary=old_vec.vocabulary_) X_train = vec.transform(train_data) X_test = vec.transform(test_data) print X_train.shape print X_test.shape # load params W = cPickle.load(open('W_corr0.3_batchsize20_epochs100')) b = cPickle.load(open('b_corr0.3_batchsize20_epochs100')) print 'W:', W.shape, 'b:', b.shape X_train = get_rep(X_train) X_test = get_rep(X_test) clf = svc() clf.fit(X_train, y_train) y_pred = clf.predict(X_test) print_top_features(vec, clf) print classification_report(y_train, clf.predict(X_train)) print confusion_matrix(y_test, y_pred) print classification_report(y_test, y_pred) print f1_score(y_test, y_pred, pos_label=None, average='macro') scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1') print scores
import numpy as np
import pandas as pd
from sklearn import svm

# Training inputs. The result of apply() must be assigned: it returns a new
# frame, so the bare call did nothing.
file = 'inputtrain.xlsx'
x1 = pd.ExcelFile(file)
df1 = x1.parse('Sheet1')
df1 = df1.apply(pd.to_numeric, errors='ignore')
# Min-max scale to [0, 1]; a constant column yields NaN (0 / 0).
df1max, df1min = df1.max(), df1.min()
df = (df1 - df1min) / (df1max - df1min)

# Training targets.
file = 'targettrain.xlsx'
x2 = pd.ExcelFile(file)
df2 = x2.parse('Sheet1')
df2 = df2.apply(pd.to_numeric, errors='ignore')

# Test inputs, scaled with the TRAINING min/max so both sets share one
# feature scale (they used to be scaled with their own statistics).
file = 'inputtest.xlsx'
x3 = pd.ExcelFile(file)
df3 = x3.parse('Sheet1')
df3 = df3.apply(pd.to_numeric, errors='ignore')
df_test = (df3 - df1min) / (df1max - df1min)

df = np.array(df)
# Flatten the single target column to the 1-D shape fit() expects.
df2 = np.array(df2).ravel()
df_test = np.array(df_test)

# Create SVM classification object. The class is `SVC` and the
# regularisation parameter is upper-case `C`.
model = svm.SVC(kernel='linear', C=1, gamma=1)

# Train the model using the training sets and check score
model.fit(df, df2)
model.score(df, df2)

# Predict Output
predicted = model.predict(df_test)
from pprint import pprint

from sklearn import datasets
from sklearn import svm

# The class is `SVC` and the regularisation parameter is upper-case `C`.
clf = svm.SVC(gamma=0.001, C=100.)
digits = datasets.load_digits()

# Train on every sample except the last one.
clf.fit(digits.data[:-1], digits.target[:-1])

# predict() needs a 2-D array, so the held-out sample is sliced ([-1:])
# rather than indexed ([-1]).
result = clf.predict(digits.data[-1:])
pprint(result)
# Features for the selected rows; label is 1 where down['d6'] < up['u1'].
x = results.loc[ind].copy()
y = (down.loc[ind, 'd6'] < up.loc[ind, 'u1']).astype(int)

x.reset_index(inplace=True, drop=True)
y.index = x.index

# Ordered 80/20 split. Positional slicing is used on purpose: label slicing
# with .loc is end-inclusive, which put row `row` in both train and test.
row = int(x.shape[0] * .8)
x_train = x.iloc[:row]
x_test = x.iloc[row:]
y_train = y.iloc[:row]
y_test = y.iloc[row:]

# Standardise with statistics from the training part only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Keyword arguments: recent scikit-learn releases reject the positional form.
weights = sklearn.utils.class_weight.compute_class_weight(
    'balanced', classes=np.array([0, 1]), y=y_train)

# NOTE: the weighted logistic regression is immediately replaced by the MLP,
# which is the model actually fitted and reported.
logreg = LogisticRegression(class_weight={0: weights[0], 1: weights[1]})
logreg = MLPClassifier(solver='lbfgs', alpha=1e-5,
                       hidden_layer_sizes=(5, 2), random_state=1)
logreg.fit(x_train, y_train)
predictions = logreg.predict(x_test)
print(classification_report(y_test, predictions))

# Alternative estimators; only the last assignment survives and it is not
# fitted here.
logreg = svc()
logreg = KNeighborsClassifier(n_neighbors=2)
logreg = DecisionTreeClassifier(random_state=0)
def classification(self, metric, folds, printt=True, graph=False):
    """Cross-validate a catalogue of baseline classifiers and rank them.

    Args:
        metric: scoring passed to ``cross_val_score``.
        folds: number of stratified, shuffled CV folds.
        printt: print the ranked report when true.
        graph: draw, store and show a box plot of fold scores when true.

    Side effects:
        Sets ``self.models`` and ``self.report_performance``; appends the
        figure to ``self.graphs_model`` when ``graph`` is true.

    Returns:
        None.
    """
    size = self.graph_width
    n_classes = len(self.y.iloc[:,0].unique())
    struct = 'multiclass' if n_classes > 2 else 'binary'

    # Significantly different setups of one algorithm are listed as
    # separate models.
    models = {
        "Linear discriminant analysis": ldac(),
        "Nearest centroid classifier euclidian": ncc(metric='euclidean'),
        "Nearest centroid classifier manhattan": ncc(metric='manhattan'),
        "K nearest neighbors classifier K2": knnc(n_neighbors=2),
        "K nearest neighbors classifier K5": knnc(n_neighbors=5),
        "K nearest neighbors classifier K10": knnc(n_neighbors=10),
        "Decision tree classifier": dtc(),
        "Gaussian naive bayes": gnbc(),
        "Bernoulli naive bayes": bnbc(binarize=0.5),
        "Multinomial naive bayes": mnbc(),
        "SGD classifier": sgdc(max_iter=10000),
        "Ridge classifier": rc(),
    }
    # SVMs are only tried on training sets below 10000 rows.
    if len(self.Xt_train) < 10000:
        models["SVM classifier RBF"] = svc(gamma='scale')
        models["SVM classifier Linear"] = svc(kernel='linear')
        models["SVM classifier Poly"] = svc(kernel='poly')
    if self.Xt_train.shape[0] < 10000 or self.Xt_train.shape[1] < 5:
        models["Gradient boosting classifier"] = gbc()
    models["Random forest classifier"] = rfc(n_estimators=100)
    if struct == 'multiclass':
        models["Logistic classifier multinomial"] = logitc(multi_class='multinomial', solver='lbfgs')
        models["Logistic classifier auto"] = logitc(multi_class='auto')
        models["Logistic One vs Rest"] = ovrc(logitc())
        models["Logistic One vs One"] = ovoc(logitc())
    if struct == 'binary':
        models["Logistic classifier"] = logitc(max_iter=2000)
    self.models = models

    kf = StratifiedKFold(n_splits=folds, shuffle=True)
    results, names, et = [], [], []
    for model_name, estimator in models.items():
        start = time.time()
        cv_scores = cross_val_score(estimator, self.Xt_train, self.yt_train,
                                    cv=kf, scoring=metric,
                                    error_score=np.nan)
        results.append(cv_scores)
        names.append(model_name)
        et.append((time.time() - start))

    report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
    report['Score (avg)'] = report['Score'].apply(lambda scores: scores.mean())
    report['Score (std)'] = report['Score'].apply(lambda scores: scores.std())
    # Coefficient of variation of the fold scores, in percent.
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report = report.sort_values(by='Score (avg)', ascending=False)
    report = report.drop('Score', axis=1)
    report = report.reset_index(drop=True)
    self.report_performance = report

    if printt:
        print('\n')
        print(self.report_width * '*', '\n*')
        print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        print(self.report_width * '*', '')
        print(report)
        print('\n')

    if not graph:
        return None

    fig, ax = plt.subplots(figsize=(size, 0.5 * size))
    plt.title('Classifier Comparison')
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.xticks(rotation=45)
    plt.subplots_adjust(hspace=0.0, bottom=0.25)
    self.graphs_model.append(fig)
    plt.show()
    return None
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)

# Import Library
from sklearn import svm

# Assumed you have X (predictor) and y (target) for the training data set
# and x_test (predictor) of the test data set.

# Create SVM classification object. The class is spelled `SVC`; `svm.svc`
# raises AttributeError. Many more options exist; this is the simplest
# classification setup.
model = svm.SVC()

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)

# Import Library
from sklearn.naive_bayes import GaussianNB
# train
x_train = x[split:]
y_train = Y[split:]

# test
x_test = x[:split]
y_test = Y[:split]

# In[ ]:

# svc: the class is spelled `SVC`; `from sklearn.svm import svc` fails.
from sklearn.svm import SVC
# NOTE(review): this cell referenced an upper-case `X` that is not defined
# in the visible code; `x` is assumed to be the intended feature matrix.
SVC().fit(x, Y)

# In[ ]:

# support vector classifier, fitted on the training part only
cls = SVC().fit(x_train, y_train)

# In[ ]:

# classifier accuracy on the held-out part
from sklearn.metrics import accuracy_score
y_predicted = cls.predict(x_test)
accuracy_score(y_test, y_predicted)
"""Use a Support Vector Machine (SVM) model to predict the competition
test target values.
"""
import pandas as pd
# `sklearn.cross_validation` has been removed from scikit-learn;
# train_test_split lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC as svc
from sklearn.metrics import accuracy_score

training_data = pd.read_csv('../datasets/numerai_training_data.csv')
tournament_data = pd.read_csv('../datasets/numerai_tournament_data.csv')

# Returns four arrays in the order: features_train, features_test,
# labels_train, labels_test. The first 21 columns are used as features.
features_train, features_test, labels_train, labels_test = train_test_split(
    training_data.iloc[:, 0:21],
    training_data['target'],
    test_size=0.3,
    random_state=0)

clf = svc(C=1.0).fit(features_train, labels_train)

# Predict the target for the held-out 30% of the training data.
predictions = clf.predict(features_test)
print(predictions)

# accuracy_score expects (y_true, y_pred).
accuracy = accuracy_score(labels_test, predictions)
print(accuracy)

# C = 1.0   -> 0.514361849391
# C = 100.0 -> 0.518133997785
from sklearn import svm
import pickle
import pandas

#dataframe = pandas.read_csv("file name")
#array = dataframe.values

# kernel = 'rbf'
# The class is `SVC` and the regularisation parameter is upper-case `C`;
# `svm.svc(c=1)` fails on both counts.
model = svm.SVC(kernel='linear', C=1, gamma=1)
model.fit(X_train, y_train)
model.score(X_train, y_train)

# predict
predicted = model.predict(x_test)
# Import Library
from sklearn import svm

# Assumed you have X (predictor) and y (target) for the training data set
# and x_test (predictor) of the test data set.

# Create SVM classification object. The class is spelled `SVC`; `svm.svc`
# raises AttributeError. Many more options exist; this is the simplest
# classification setup.
model = svm.SVC()

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
def make_model(col_labels = None, year = 2017, model_type = None):
    """Make and run an upset-prediction model for one tournament year.

    Trains on every year except ``year`` and 2018, predicts the games of
    ``year`` and prints them sorted by predicted upset probability.

    Args:
        col_labels: feature columns to use; defaults to the ten Top*/Bot*
            statistics below. The caller's list is not modified.
        year: the held-out year to predict.
        model_type: "forest", "gbc" or "svc"; anything else selects
            logistic regression.

    Returns:
        None; the result frame is printed.
    """
    data = pd.read_csv('NCAA2001_2017.csv')
    data_2018 = pd.read_csv('NCAA2018.csv')
    data_2018['year'] = 2018
    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    data = pd.concat([data, data_2018])

    # data to pull from the data frame
    if col_labels is None:
        col_labels = [
            'TopEFGPer',  # effective field goal percentage
            'TopFTR',     # free throw rate
            'TopTOPer',   # turnover percentage
            'TopDRTG',    # defensive rating
            'TopSOS',     # strength of schedule
            'BotEFGPer', 'BotFTR', 'BotTOPer', 'BotDRTG', 'BotSOS'
        ]

    # Don't scale SeedType; it is moved to the front because the encoder
    # below expects it in column 0.
    has_seed_type = 'SeedType' in col_labels
    numeric_labels = [label for label in col_labels if label != 'SeedType']
    if numeric_labels:
        data[numeric_labels] = scale(data[numeric_labels])
    col_labels = (['SeedType'] if has_seed_type else []) + numeric_labels

    # change SeedTypes to integers in case need to encode later
    data = data.replace(
        ['OneSixteen', 'TwoFifteen', 'ThreeFourteen', 'FourThirteen',
         'FiveTwelve', 'SixEleven', 'SevenTen', 'EightNine'],
        [1, 2, 3, 4, 5, 6, 7, 8])

    in_training = (data['year'] != year) & (data['year'] != 2018)
    in_test = data['year'] == year
    train = data.loc[in_training][col_labels]
    train_results = data.loc[in_training]['Upset']  # not a df
    test = data.loc[in_test][col_labels]
    results_columns = ['SeedType', 'TopSeed', 'BotSeed', 'Upset']
    # Copy so the new column below is not written to a view of `data`.
    test_results = data.loc[in_test][results_columns].copy()

    # have to one-hot the seeding type if that's in there
    if has_seed_type:
        # NOTE(review): `categorical_features` only exists in older
        # scikit-learn releases - confirm the pinned version.
        enc = OneHotEncoder(categorical_features = [0])  # must be first
        train = enc.fit_transform(train).toarray()
        # Reuse the encoding learned on the training data; re-fitting on
        # the test set could produce a different column layout.
        test = enc.transform(test).toarray()
    else:
        train = train.values
        test = test.values

    # making the model
    if model_type == "forest":
        model = rf()
    elif model_type == "gbc":
        model = gbc()
    elif model_type == "svc":
        model = svc(probability = True)
    else:
        model = lm.LogisticRegression()

    model.fit(train, train_results.values)
    predictions = model.predict_proba(test)

    # second column is upset percentage
    test_results['UpsetProba'] = [row[1] for row in predictions]
    test_results = test_results.sort_values('UpsetProba', ascending = False)

    print(test_results)
# For classification the split criterion can be 'gini' (the default) or
# 'entropy' (information gain).
model = tree.DecisionTreeClassifier(criterion='gini')
# model = tree.DecisionTreeRegressor() for regression

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)

# Support Vector Machine
# Import Library
from sklearn import svm

# Assumed you have X (predictor) and y (target) for the training data set
# and x_test (predictor) of the test data set.

# Create SVM classification object. The class is spelled `SVC`; `svm.svc`
# raises AttributeError. Many more options exist; this is the simplest
# classification setup.
model = svm.SVC()

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)

# Naive Bayes
# Import Library
from sklearn.naive_bayes import GaussianNB

# Create Gaussian Naive Bayes classification object. Other distributions
# exist for other data, e.g. Bernoulli or multinomial Naive Bayes.
model = GaussianNB()

# Train the model using the training sets
model.fit(X, y)

# Predict Output
predicted = model.predict(x_test)
#!/usr/bin/env python3
# Import Library
from sklearn import svm

# Assumed you have X (predictor) and y (target) for the training data set
# and x_test (predictor) of the test data set.

# Create SVM classification object. The class is spelled `SVC`; `svm.svc`
# raises AttributeError. Many more options exist; this is the simplest
# classification setup.
model = svm.SVC()

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)