Example #1
def main():
    ### PHASE 1 ###

    num_classes = 21
    name_encoder = preprocessing.LabelBinarizer()

    shuffler.shuffleData()  # shuffle dataset
    trainData = dataset.SigData("data/train_data.csv", name_encoder)
    testData = dataset.SigData("data/test_data.csv", name_encoder)
    print("created datasets.")
    mytrainer = trainer.Trainer(
        trainData, testData,
        num_users=num_classes)  # number of people in the training set
    print("training.")
    mytrainer.train(num_epochs=60)

    exit()

    # Load checkpoint as trained weights for CNN

    ### PHASE 2 ###

    # new_dataset for a user
    # features = mytrainer.model.get_feature_vectors(new_dataset)
    forg_classifier = svm.SVC(class_weight='balanced')
    forg_classifier_rbf = svm.SVC(kernel='rbf', class_weight='balanced')
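# Sketch of how PHASE 2 could continue, assuming get_feature_vectors() returns an
# (n_samples, n_features) array and binary forgery labels are available; the synthetic
# arrays below are stand-ins, not part of the original project.
import numpy as np
from sklearn import svm

rng = np.random.default_rng(0)
features = rng.normal(size=(100, 128))    # stand-in for CNN feature vectors
labels = np.array([0, 1] * 50)            # 1 = forgery, 0 = genuine (assumed)

forg_classifier = svm.SVC(class_weight='balanced')
forg_classifier_rbf = svm.SVC(kernel='rbf', class_weight='balanced')
forg_classifier.fit(features, labels)
forg_classifier_rbf.fit(features, labels)
print("training accuracy:", forg_classifier.score(features, labels))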
Example #2
def get_model(choice='lr', class_weight=None):
    if choice == 'svc':
        model = svc(verbose=1, class_weight=class_weight)  # SVC has no n_jobs parameter

    elif choice == 'lsvc':
        model = lsvc(class_weight=class_weight)  # LinearSVC has no n_jobs parameter
    elif choice == 'knn':
        model = KNeighborsClassifier()
    elif choice == 'msvm':
        model = MulticlassSVM(C=0.1,
                              tol=0.01,
                              max_iter=100,
                              random_state=0,
                              verbose=1)

    elif choice == 'gnb':
        model = gnb()  # GaussianNB does not accept class_weight

    elif choice == 'gpc':
        model = gpc()  # GaussianProcessClassifier does not accept class_weight
    elif choice == 'sgdc':
        model = sgdc(class_weight=class_weight)

    elif choice == 'rf':
        model = rf(class_weight=class_weight)
#   elif choice == 'vw':
#         model = vw()
    else:
        model = lr(class_weight=class_weight)
    return model
Example #3
def quick_ml(df, results, filter_slice, target_df, loss_df, target, loss):
    # Create x and y
    ind = df['indexes'].loc[filter_slice][0].values
    x = results.loc[ind].copy()
    y = (target_df.loc[ind, target] < loss_df.loc[ind, loss]).astype(int)
    # create sets
    x.reset_index(inplace=True, drop=True)
    y.index = x.index
    row = int(x.shape[0] * .8)
    x_train = x.loc[:row]
    x_test = x.loc[row:]
    y_train = y.loc[:row]
    y_test = y.loc[row:]
    # scale values
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    # class weights (balanced)
    weights = sklearn.utils.class_weight.compute_class_weight(
        'balanced', classes=np.array([0, 1]), y=y_train)
    # model (note: the MLPClassifier below overwrites this LogisticRegression)
    logreg = LogisticRegression(class_weight={0: weights[0], 1: weights[1]})
    logreg = MLPClassifier(solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=(500, 4),
                           random_state=1)
    logreg.fit(x_train, y_train)
    predictions = logreg.predict(x_test)
    print(classification_report(y_test, predictions))
    # alternative models left over from experimentation; these reassignments are never fitted
    logreg = svc()
    logreg = KNeighborsClassifier(n_neighbors=2)
    logreg = DecisionTreeClassifier(random_state=0)
    return None
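# Note (sketch, not part of the original function): LogisticRegression can compute the
# same 'balanced' weighting internally via class_weight='balanced'; a small self-contained
# comparison on toy data.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

X_demo = np.random.randn(100, 3)
y_demo = np.array([0] * 90 + [1] * 10)                      # imbalanced toy labels
w = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=y_demo)
clf_manual = LogisticRegression(class_weight={0: w[0], 1: w[1]}).fit(X_demo, y_demo)
clf_auto = LogisticRegression(class_weight='balanced').fit(X_demo, y_demo)  # equivalent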
Example #4
    def train(self, model_id):

        #get training status of model container
        train_status = model_cont['train_status'] 

        if train_status != "trained":
            
            #load the data to train
            data_loader = DataLoader()
            dataset = data_loader.load_user_data(user_data_path)

            #load model specific parameters
            #TODO

            if "train_test_split" in params.keys() and params["train_test_split"]:
                data_split = params['train_test_split']
                trainset = DataProcessor().get_trainset(features, labels, data_split)

            #train the model
            clf = svc()
            clf.fit(dataset['features'], dataset['labels'])
            pkl_file = pdumps(clf)
            
            #update the model object with the results of training
            model_cont['learned_model']=Binary(pkl_file)
            model_cont['train_status'] = "trained"
            
            return model_cont

        else:
            print("Already trained")
            return False     
Example #5
def classifier(data, y, model="forest"):
    if model == "forest":
        from sklearn.ensemble import RandomForestClassifier as rfc
        est = rfc(n_estimators=10, n_jobs=-1)

    elif model == "tree":
        from sklearn.tree import DecisionTreeClassifier as dtc
        est = dtc()

    elif model == "extra":
        from sklearn.ensemble import ExtraTreesClassifier as etc
        est = etc(n_estimators=10, n_jobs=-1)

    elif model == "logistic":
        from sklearn.linear_model import LogisticRegression as lr
        cases = y.nunique()
        if cases > 2: est = lr(solver="newton-cg", multi_class="multinomial")
        else: est = lr(n_jobs=-1)

    elif model == "svm":
        from sklearn.svm import SVC as svc
        est = svc()

    elif model == "boost":
        from sklearn.ensemble import GradientBoostingClassifier as gbc
        est = gbc(n_estimators=10)

    elif model == "neural":
        from sklearn.neural_network import MLPClassifier as nnc
        est = nnc(max_iter=10, learning_rate_init=1)

    else:
        raise ValueError("unknown model: %s" % model)

    est.fit(data, y)
    return est
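# Usage sketch (assumed, not from the original source): y must support .nunique(), so a
# pandas Series is passed for the labels.
import pandas as pd
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, n_features=10, random_state=0)
est = classifier(pd.DataFrame(X_demo), pd.Series(y_demo), model="svm")
print(est.score(pd.DataFrame(X_demo), pd.Series(y_demo)))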
Example #6
def regression(data, y, model="forest"):
    if model == "forest":
        from sklearn.ensemble import RandomForestRegressor as rfc
        est = rfc(n_estimators=10, n_jobs=-1)

    elif model == "tree":
        from sklearn.tree import DecisionTreeRegressor as dtc
        est = dtc()

    elif model == "extra":
        from sklearn.ensemble import ExtraTreesRegressor as etc
        est = etc(n_estimators=10, n_jobs=-1)

    elif model == "linear":
        from sklearn.linear_model import LinearRegression as lr
        est = lr(n_jobs=-1)

    elif model == "svm":
        from sklearn.svm import SVR as svc
        est = svc()

    elif model == "boost":
        from sklearn.ensemble import GradientBoostingRegressor as gbc
        est = gbc(n_estimators=10)

    elif model == "neural":
        from sklearn.neural_network import MLPRegressor as nnc
        est = nnc(max_iter=10, learning_rate_init=1)

    else:
        raise ValueError("unknown model: %s" % model)

    est.fit(data, y)
    return est
Example #7
    def classification(self, metric, folds, alphas, graph):
        size = 1.3 * self.report_width // 10

        models = {}
        models["K nearest neighbors classifier K2"]  = knnc(n_neighbors=2)
        models["K nearest neighbors classifier K5"]  = knnc(n_neighbors=5)
        models["K nearest neighbors classifier K10"] = knnc(n_neighbors=10)        
        models["Decision tree classifier"]           = dtc()
        models["Logistic classifier"]                = logitc()
        models["SVM classifier with RBF kernel"]     = svc(gamma='scale')
        models["SVM classifier with linear kernel"]  = svc(kernel='linear')
        models["Gaussian naive bayes"]               = gnbc()
        models["Bernoulli naive bayes"]              = bnbc()
        models["SGD classifier"]                     = sgdc(max_iter=10000)
        models["Random forest classifier"]           = rfc(n_estimators=100)
        models["Gradient boosting classifier"]       = gbc()
        self.models = models

        print('\n')
        print(self.report_width * '*', '\n*')
        print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
        kf = StratifiedKFold(n_splits=folds, shuffle=True)
        results = []
        names = []
        for model_name in models:
            cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train.values.ravel(), cv=kf, scoring=metric, error_score=np.nan)  
            results.append(cv_scores)
            names.append(model_name)
        print(self.report_width * '*', '')
        report = pd.DataFrame({'Classifier': names, 'Score': results})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True, ascending=False)
        report.drop('Score', axis=1, inplace=True)
        display(report)
        print('\n')
        if graph:
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Classifier Comparison')
            #ax = fig.add_subplot(111)
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0)
            plt.show()             
        return None
Example #8
        def makeSVM(self,**kwargs):
                scale_mag,scale_x,scale_y = self.FitScale(self.data)

                self.scaled_data = np.c_[scale_mag,scale_x,scale_y]

                self.completeness = svc(probability=True,**kwargs)

                if self.spatial:
                        self.completeness.fit(self.scaled_data,self.det.ravel())
                else:
                        self.completeness.fit(np.c_[scale_mag],self.det.ravel())
Example #9
    def makeSVM(self, **kwargs):
        scale_mag, scale_x, scale_y = self.FitScale(self.data)

        self.scaled_data = np.c_[scale_mag, scale_x, scale_y]

        self.completeness = svc(probability=True, **kwargs)

        if self.spatial:
            self.completeness.fit(self.scaled_data, self.det.ravel())
        else:
            self.completeness.fit(np.c_[scale_mag], self.det.ravel())
Example #10
def support_vector_machine():
    # Import library
    from sklearn import svm
    # Assumed you have X (predictor) and Y (target) for the training data set and x_test (predictor) of the test dataset
    # Create SVM classification object
    model = svm.SVC()  # there are various options associated with it; this is the simple classification form
    # Train the model using the training sets and check score
    model.fit(X, y)
    # mean accuracy (not R^2, since this is a classifier)
    model.score(X, y)
    # Predict output
    predicted = model.predict(x_test)
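# A runnable variant of the snippet above (sketch): the iris dataset stands in for the
# assumed X / y / x_test arrays.
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X_iris, y_iris = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X_iris, y_iris, random_state=0)
model = svm.SVC(gamma='scale')
model.fit(X_tr, y_tr)
print("mean accuracy:", model.score(X_te, y_te))
print(model.predict(X_te[:5]))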
Example #11
def SVC(test_data, test_label, train_data, train_label, d):
    svm_model_poly_classifier = svc(kernel='poly',
                                    degree=d,
                                    C=5,
                                    gamma=0.05,
                                    probability=True)
    #It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable.
    svm_train_accuracy = svm_model_poly_classifier.fit(
        train_data, train_label).score(train_data, train_label)
    svm_test_accuracy = svm_model_poly_classifier.score(test_data, test_label)
    y_predict = svm_model_poly_classifier.predict(test_data)
    # return the predictions plus the number of training and test errors
    return y_predict, (1 - svm_train_accuracy) * len(train_data), (
        1 - svm_test_accuracy) * len(test_data)
Example #12
        def CrossValidate(self,c_range=np.logspace(-3.0,3.0,7),gamma_range=np.logspace(-3.0,3.0,7)):
                param_grid = dict(C=c_range,gamma=gamma_range)
                cv = StratifiedShuffleSplit(self.det,n_iter=5,test_size=0.2)
                grid = GridSearchCV(svc(kernel='rbf',cache_size=1000),param_grid=param_grid,cv=cv)
                scale_mag,scale_x,scale_y = self.Scale(self.data)

                if self.spatial:
                        grid.fit(np.c_[scale_mag,scale_x,scale_y],self.det.ravel())

                else:
                        grid.fit(np.c_[scale_mag],self.det.ravel())

                print("The best parameters are %s with a score of %0.2f"
                          % (grid.best_params_, grid.best_score_))
Example #13
def train_model(train_X, train_Y, valid_X, valid_Y, hyper_param1):

    # Choose a classifier (here, an SVM with an RBF kernel)
    clf = svc(C=1.0, kernel='rbf', max_iter=1000)

    # train
    clf.fit(train_X, train_Y)

    # validation
    valid_Y_hat = clf.predict(valid_X)

    accuracy = np.sum(valid_Y_hat == valid_Y) / float(len(valid_Y)) * 100.0
    print('validation accuracy = ' + str(accuracy) + ' %')

    return clf, accuracy
Example #14
    def train(self, model_id):

        try:
            conn = MongoClient()
            print("\nConnection Successful")

            mlaas_db = conn[DB_NAME]  #connect to db
            mlaas_db.authenticate(USERNAME, PASS)
            models = mlaas_db[COLL_NAME]  #load the collection

            #get the model corresponding to the id provided
            model_cont = models.find_one({'_id': ObjectId(model_id)})
            assert model_cont, "Invalid model ID"

            #get training status of model container
            train_status = model_cont['train_status']

            #block if model is being trained currently
            if train_status != "training":
                #init the data loader and data processor
                data_loader = DataLoader()
                data_processor = DataProcessor()
                #params = model['parameters']

                dataset = data_loader.load_user_data(data_path,
                                                     USER_DATA_FNAME)

                if 'train_test_split' in params.keys():
                    data_split = params['train_test_split']
                    #TODO: implement this function
                    trainset = data_processor.get_trainset(
                        features, labels, data_split)

                clf = svc(kernel='linear')  # coef_ below exists only for a linear kernel
                clf.fit(dataset['features'], dataset['labels'])

                save_pkl(clf.coef_, data_path, WEIGHTS_FNAME)

                #model_cont['path_to_weights'] = PATH_TO_WEIGHTS
                model_cont['train_status'] = "trained"

                models.update({'_id': ObjectId(model_id)},
                              {'$set': model_cont},
                              upsert=False)
        #TODO: figure out why this form of catch clause does not work
        except ConnectionFailure as conn_e:
            print("\nCould not connect to server. \
                    Raised the following exception:\n{}".format(conn_e))
Example #15
    def CrossValidate(self,
                      c_range=np.logspace(-3.0, 3.0, 7),
                      gamma_range=np.logspace(-3.0, 3.0, 7)):
        param_grid = dict(C=c_range, gamma=gamma_range)
        cv = StratifiedShuffleSplit(self.det, n_iter=5, test_size=0.2)
        grid = GridSearchCV(svc(kernel='rbf', cache_size=1000),
                            param_grid=param_grid,
                            cv=cv)
        scale_mag, scale_x, scale_y = self.Scale(self.data)

        if self.spatial:
            grid.fit(np.c_[scale_mag, scale_x, scale_y], self.det.ravel())

        else:
            grid.fit(np.c_[scale_mag], self.det.ravel())

        print("The best parameters are %s with a score of %0.2f" %
              (grid.best_params_, grid.best_score_))
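    # Equivalent setup with the current scikit-learn API (sketch; CrossValidateModern is an
    # illustrative name): StratifiedShuffleSplit now takes n_splits instead of n_iter and
    # receives the labels in fit() rather than in its constructor.
    def CrossValidateModern(self,
                            c_range=np.logspace(-3.0, 3.0, 7),
                            gamma_range=np.logspace(-3.0, 3.0, 7)):
        from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
        param_grid = dict(C=c_range, gamma=gamma_range)
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
        grid = GridSearchCV(svc(kernel='rbf', cache_size=1000),
                            param_grid=param_grid, cv=cv)
        scale_mag, scale_x, scale_y = self.Scale(self.data)
        X = np.c_[scale_mag, scale_x, scale_y] if self.spatial else np.c_[scale_mag]
        grid.fit(X, self.det.ravel())
        print("The best parameters are %s with a score of %0.2f" %
              (grid.best_params_, grid.best_score_))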
Example #16
    def __fitsvm__(self, **args):
        train_objects_files = returnFiles(args['train_object_folder'])
        train_objects = returnImages(train_objects_files)
        train_objects = preproc(train_objects)
        svc_args = dict(
            zip(getargspec(svc.__init__)[0][1:],
                getargspec(svc.__init__)[3]))
        classifier_raw = svc(**hlp.chooseArgs(svc_args, self.args))
        number_of_objects = hlp.listLengths(train_objects)
        shp = np.shape(train_objects)
        train_objects = np.reshape(train_objects,
                                   (shp[0] * shp[1], shp[2] * shp[3]))
        train_object_labels_expended = hlp.listExpend(
            args['train_object_labels'], number_of_objects)
        self.data = classifier_raw.fit(train_objects.tolist(),
                                       train_object_labels_expended)
        self.threshold = 0.5
        if args['is_save']:
            self.__save()
Example #17
def svc(x_train, y_train, C=1, kernel='rbf', x_test=False):
    '''
    C-Support Vector Classification, based on libsvm. Training complexity is more than
    quadratic, so it is hard to scale to datasets with more than a couple of 10000 samples.
    The kernel must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable.
    If a callable is given, it is used to pre-compute the kernel matrix from data matrices;
    that matrix should be an array of shape (n_samples, n_samples).
    :param x_train:
    :param y_train:
    :param x_test:
    :return:
    '''
    model = svm.SVC(C=C, kernel=kernel)
    model.fit(x_train, y_train)
    if x_test is False:
        print('Score:', model.score(x_train, y_train))
        return model
    else:
        return model, model.predict(x_test)
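# Sketch of the 'precomputed' kernel option described in the docstring above (assumed
# standalone usage, not from the original repo): the Gram matrix must have shape
# (n_samples, n_samples).
import numpy as np
from sklearn import svm
from sklearn.metrics.pairwise import rbf_kernel

X_demo = np.random.randn(50, 4)
y_demo = np.random.randint(0, 2, size=50)
gram = rbf_kernel(X_demo, X_demo)                     # (n_samples, n_samples)
clf = svm.SVC(kernel='precomputed').fit(gram, y_demo)
print(clf.score(gram, y_demo))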
Example #18
def predictedSVC():
    start = time.time()

    clf = Pipeline([('vect', vectorizer),
                    ('clf',
                     svc(tol=1e-3,
                         verbose=0,
                         random_state=42,
                         C=1.0,
                         max_iter=-1,
                         gamma='scale',
                         probability=True))],
                   verbose=True)

    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
        'clf__tol': (1e-3, 1e-2, 5e-3, 2e-3, 3e-3, 4e-3),
        'clf__gamma': ('auto', 'scale'),
        'clf__C': (1.0, .1, .2, .3, .4, 0.5, 0.6, 0.7, 0.8, 0.9)
    }

    gs_clf = GridSearchCV(clf, parameters, cv=5, iid=False, n_jobs=-1)
    gs_clf.fit(docs_train, y_train)

    y_predicted = gs_clf.predict(docs_test)

    print(gs_clf.best_params_)

    print("End.......... total=%.2f s" % (time.time() - start))

    # Print the classification report
    print(
        metrics.classification_report(y_test,
                                      y_predicted,
                                      target_names=dataset.target_names))

    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)

    plt.matshow(cm, cmap=plt.cm.jet)
    #plt.show()

    joblib.dump(gs_clf, "svc_model.pkl")
Example #19
def score(params):
    print("Training with params : ")
    print(params)

    if params.pop('scale', 0) == 1:
        # preprocessing.scale() returns new arrays; keep the scaled copies
        X_tr, X_te = preprocessing.scale(X_train), preprocessing.scale(X_test)
        y_tr, y_te = preprocessing.scale(y_train), preprocessing.scale(y_test)
        ou_te = preprocessing.scale(ou_test)
    else:
        X_tr, X_te, y_tr, y_te, ou_te = X_train, X_test, y_train, y_test, ou_test

    # watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    clf = svm.SVC(**params)
    clf.fit(X_tr, y_tr)
    preds = clf.predict(X_te)
    print(preds)
    score = Scorer(preds, y_te, ou_te)
    print(score)
    return {'loss': score, 'status': STATUS_OK}
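# How a score() function like this is typically wired into hyperopt (sketch; the search
# space below is an assumption, not taken from the original script).
from hyperopt import fmin, tpe, hp

space = {
    'C': hp.loguniform('C', -3, 3),
    'gamma': hp.loguniform('gamma', -3, 3),
    'kernel': hp.choice('kernel', ['rbf', 'linear']),
    'scale': hp.choice('scale', [0, 1]),
}
best = fmin(fn=score, space=space, algo=tpe.suggest, max_evals=50)
print(best)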
Example #20
print()
print("Test Set Size")
print(Y_test.shape)
print()
print("Classes:")
print(target_names)
print("----------------------")

## Vectorization object
vectorizer = TfidfVectorizer(
    strip_accents=None,
    preprocessor=None,
)

## classifier
svm = svc()

## With a Pipeline object we can assemble several steps
## that can be cross-validated together while setting different parameters.

pipeline = Pipeline([
    ('vect', vectorizer),
    ('svm', svm),
])

## Setting parameters.
## Dictionary in which:
##  Keys are parameters of objects in the pipeline.
##  Values are set of values to try for a particular parameter.
parameters = {
    'vect__tokenizer': [None, stemming_tokenizer],
Example #21
# Import Library
from sklearn import svm

# Assumed you have X (predictor) and y (target) for the training data set and x_test (predictor) of the test dataset
X = [[0, 0], [2, 2]]
y = [0, 1]  # class labels (SVC is a classifier; continuous targets would need svm.SVR)

# Create SVM (Support Vector Machine) classification object
model = svm.SVC(gamma='scale')

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)
Example #22
print(svmP.w)

# In[9]:

# output primal-form results
test2 = svmD.predict(x_test)
res2 = cm(y_test, test2)
print(res2)
print(svmD.b)
print(svmD.w)
print(svmD.a)

# In[10]:

# library, primal form (note: set_params(dual=...) and coef_ below imply that svc
# here is LinearSVC, since sklearn.svm.SVC has no 'dual' parameter)
libP = svc()
libP.set_params(dual=False)
libP.set_params(C=soft)
# svc.set_params(max_iter=10000)
libP.fit(x_train, y_train)
test3 = libP.predict(x_test)
print(cm(y_test, test3))
print(libP.coef_)

# In[50]:

# library, dual form
libD = svc()
libD.set_params(dual=True)
libD.set_params(C=soft)
libD.set_params(max_iter=100000)
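# Sketch of the same fit with an explicit LinearSVC import (assumed equivalent of the
# 'svc' alias above, given the dual= and coef_ usage); libP2 is an illustrative name.
from sklearn.svm import LinearSVC

libP2 = LinearSVC(dual=False, C=soft, max_iter=100000)
libP2.fit(x_train, y_train)
test4 = libP2.predict(x_test)
print(cm(y_test, test4))
print(libP2.coef_)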
Example #23
    print('f1 macro:', res)
    print()
    # color = cm(1. * i / NUM_COLORS)  # color will now be an RGBA tuple
    # cm = plt.get_cmap('gist_rainbow')
    # fig = plt.figure(figsize=(8.0, 5.0))
    # ax = fig.add_subplot(111)
    # # ax.set_color_cycle([cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
    # ax.plot(range(len(scores)), scores, label=str(threshold))
    # ax.text(len(scores) - 1, scores[len(scores) - 1], threshold, fontsize='smaller')
    # plt.show()
    print(name)
    return res


vec_list = [tf(), cv()]
clf_list = [svc(), lr()]
threshold_list = np.arange(0.5, 3, 0.5)
print(len(threshold_list))
# results_size = (len(vec_list), len(clf_list),len(threshold_list))
# results = np.zeros(results_size, dtype = np.float)
# a, b, c = range(3), range(3), range(3)
# def my_func(x, y, z):
#     return (x + y + z) / 3.0, x * y * z, max(x, y, z)

grids = np.vectorize(run)(*np.ix_(threshold_list, vec_list, clf_list))
# mean_grid, product_grid, max_grid = grids
print(len(grids))
try:
    print(grids.shape)
except:
    print(type(grids))
Example #24
sex = 'F'
scaler = StandardScaler()

data_partial = data[data['Sex'] == sex].drop('Sex', axis=1)
# corr_matrix_f, corr_matrix_m = data_f.corr(), data_m.corr()
# plot_corr_matrices(corr_matrix_f, corr_matrix_m)

y = data_partial['EmoState']
X = scaler.fit_transform(data_partial.drop('EmoState', axis=1))
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=71)

models = (('DTC', dtc()), ('SVM', svc(C=10)), ('KNN', knc(n_neighbors=10)),
          ('SGDC', sgdc()), ('GNBC', gnbc()), ('MLPC',
                                               mlpc(max_iter=1000,
                                                    learning_rate='adaptive')))
results = []
names = []
seed = 13
scoring = 'accuracy'

for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = model_selection.cross_val_score(model,
                                                 X_train,
                                                 y_train,
                                                 cv=kfold,
                                                 scoring=scoring)
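    # (sketch) collect the fold scores and print a short summary for each model; this
    # continuation is assumed, following the usual cross_val_score reporting pattern.
    results.append(cv_results)
    names.append(name)
    print('%s: %.3f (+/- %.3f)' % (name, cv_results.mean(), cv_results.std()))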
Example #25
y = x['PFLAG'].values
x = x.drop(['PFLAG'], axis=1).values
Y = X['PFLAG'].values
X = X.drop(['PFLAG'], axis=1).values

pca = PCA(n_components=50)
x = pca.fit_transform(x)
X = pca.transform(X)

print(x.shape)

# modeling (each assignment below overwrites the previous one; only the AdaBoost
# classifier is actually fitted)
clf = RandomForestClassifier(n_estimators=400, max_depth=4)
clf = LogisticRegression(random_state=0, solver='lbfgs')
clf = svm.SVC(kernel='linear')  # rbf, linear, poly, sigmoid
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm='SAMME', n_estimators=200)

clf.fit(x, y)
pred = np.matrix(clf.predict(X))
correct = pred == Y
accuracy = np.sum(correct) / correct.shape[1]
print('Accuracy: ' + str(accuracy * 100))

# logistic ROC curve and confusion matrix
Y = Y.tolist()
pred = pred.T.tolist()

fpr, tpr, thresholds = roc_curve(Y, pred)
plt.plot(fpr, tpr)
plt.show()
Example #26
print('test samples:', len(y_test))

vec = cPickle.load(open('AE_unlabeled_vec2014-11-17 11:56:27.965705'))

# vec = tf(vocabulary=old_vec.vocabulary_)
X_train = vec.transform(train_data)
X_test = vec.transform(test_data)
print(X_train.shape)
print(X_test.shape)

# load params
W = cPickle.load(open('W_corr0.3_batchsize20_epochs100'))
b = cPickle.load(open('b_corr0.3_batchsize20_epochs100'))
print('W:', W.shape, 'b:', b.shape)

X_train = get_rep(X_train)
X_test = get_rep(X_test)

clf = svc()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print_top_features(vec, clf)
print(classification_report(y_train, clf.predict(X_train)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, pos_label=None, average='macro'))

scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1')

print(scores)
Example #27
import numpy as np
import pandas as pd
from sklearn import svm

# normalize the training inputs to [0, 1]
file = 'inputtrain.xlsx'
x1 = pd.ExcelFile(file)
df1 = x1.parse('Sheet1')
df1 = df1.apply(pd.to_numeric, errors='ignore')  # apply() returns a new frame
df1max, df1min = df1.max(), df1.min()
df = (df1 - df1min) / (df1max - df1min)

# training targets
file = 'targettrain.xlsx'
x2 = pd.ExcelFile(file)
df2 = x2.parse('Sheet1')
df2 = df2.apply(pd.to_numeric, errors='ignore')

# normalize the test inputs
file = 'inputtest.xlsx'
x3 = pd.ExcelFile(file)
df3 = x3.parse('Sheet1')
df3 = df3.apply(pd.to_numeric, errors='ignore')
df3max, df3min = df3.max(), df3.min()
df_test = (df3 - df3min) / (df3max - df3min)

df = np.array(df)
df2 = np.array(df2)
df_test = np.array(df_test)

# Create the SVM classification object; kernel, gamma and C can be tuned
model = svm.SVC(kernel='linear', C=1, gamma=1)
# Train the model using the training sets and check the score
model.fit(df, df2)
model.score(df, df2)
# Predict output
predicted = model.predict(df_test)
Example #28
from pprint import pprint

from sklearn import datasets
from sklearn import svm

clf = svm.SVC(gamma=0.001, C=100.)
digits = datasets.load_digits()
clf.fit(digits.data[:-1], digits.target[:-1])
result = clf.predict(digits.data[-1:])  # predict() expects a 2-D array
pprint(result)
Example #29
x = results.loc[ind].copy()
y = (down.loc[ind, 'd6'] < up.loc[ind, 'u1']).astype(int)

x.reset_index(inplace=True, drop=True)
y.index = x.index
row = int(x.shape[0] * .8)
x_train = x.loc[:row]
x_test = x.loc[row:]
y_train = y.loc[:row]
y_test = y.loc[row:]

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

weights = sklearn.utils.class_weight.compute_class_weight(
    'balanced', classes=np.array([0, 1]), y=y_train)

# note: the MLPClassifier below overwrites this LogisticRegression
logreg = LogisticRegression(class_weight={0: weights[0], 1: weights[1]})
logreg = MLPClassifier(solver='lbfgs',
                       alpha=1e-5,
                       hidden_layer_sizes=(5, 2),
                       random_state=1)
logreg.fit(x_train, y_train)
predictions = logreg.predict(x_test)
print(classification_report(y_test, predictions))

# alternative models left over from experimentation; never fitted
logreg = svc()
logreg = KNeighborsClassifier(n_neighbors=2)
logreg = DecisionTreeClassifier(random_state=0)
Example #30
    def classification(self, metric, folds, printt=True, graph=False):
        size = self.graph_width

        if len(self.y.iloc[:,0].unique()) > 2:
            struct = 'multiclass'
        else:
            struct = 'binary'

        # significant model setup differences should be listed as different models
        models = {}
        models["Linear discriminant analysis"]          = ldac()
        models["Nearest centroid classifier euclidian"] = ncc(metric='euclidean')
        models["Nearest centroid classifier manhattan"] = ncc(metric='manhattan')
        models["K nearest neighbors classifier K2"]     = knnc(n_neighbors=2)
        models["K nearest neighbors classifier K5"]     = knnc(n_neighbors=5)
        models["K nearest neighbors classifier K10"]    = knnc(n_neighbors=10)        
        models["Decision tree classifier"]              = dtc()
        models["Gaussian naive bayes"]                  = gnbc()
        models["Bernoulli naive bayes"]                 = bnbc(binarize=0.5)
        models["Multinomial naive bayes"]               = mnbc()
        models["SGD classifier"]                        = sgdc(max_iter=10000)
        models["Ridge classifier"]                      = rc()

        if len(self.Xt_train) < 10000:
            models["SVM classifier RBF"]                = svc(gamma='scale')
            models["SVM classifier Linear"]             = svc(kernel='linear')
            models["SVM classifier Poly"]               = svc(kernel='poly')

        if self.Xt_train.shape[0] < 10000 or self.Xt_train.shape[1] < 5:
            models["Gradient boosting classifier"]      = gbc()
            models["Random forest classifier"]          = rfc(n_estimators=100)

        if struct == 'multiclass':
            models["Logistic classifier multinomial"]   = logitc(multi_class='multinomial', solver='lbfgs')
            models["Logistic classifier auto"]          = logitc(multi_class='auto')
            models["Logistic One vs Rest"]              = ovrc(logitc())
            models["Logistic One vs One"]               = ovoc(logitc())

        if struct == 'binary':
            models["Logistic classifier"]               = logitc(max_iter=2000)

        self.models = models

        kf = StratifiedKFold(n_splits=folds, shuffle=True)
        results = []
        names = []
        et = []
        for model_name in models:
            start = time.time()
            cv_scores = cross_val_score(models[model_name], self.Xt_train, self.yt_train, cv=kf, scoring=metric, error_score=np.nan)  
            results.append(cv_scores)
            names.append(model_name)
            et.append((time.time() - start))
            #print(model_name, time.time() - start)
        report = pd.DataFrame({'Model': names, 'Score': results, 'Elapsed Time': et})
        report['Score (avg)'] = report.Score.apply(lambda x: x.mean())
        report['Score (std)'] = report.Score.apply(lambda x: x.std())
        report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
        report.sort_values(by='Score (avg)', inplace=True, ascending=False)
        report.drop('Score', axis=1, inplace=True)
        report.reset_index(inplace=True, drop=True)
        self.report_performance = report

        if printt:
            print('\n')
            print(self.report_width * '*', '\n*')
            print('* CLASSIFICATION RESULTS - BEFORE PARAMETERS BOOSTING \n*')
            print(self.report_width * '*', '')
            print(report)
            print('\n')

        if graph:
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Classifier Comparison')
            #ax = fig.add_subplot(111)
            plt.boxplot(results)
            ax.set_xticklabels(names)
            plt.xticks(rotation=45)
            plt.subplots_adjust(hspace=0.0, bottom=0.25)
            self.graphs_model.append(fig)
            plt.show()             
        return None
Example #31
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)
 
#Predict Output
predicted= model.predict(x_test)



#Import Library
from sklearn import svm
 
# Assumed you have X (predictor) and Y (target) for the training data set and x_test (predictor) of the test dataset
# Create SVM classification object
model = svm.SVC()  # there are various options associated with it; this is the simple classification form
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)





#Import Library
from sklearn.naive_bayes import GaussianNB
 
Example #32
# train
x_train = x[split:]
y_train = Y[split:]

# test
x_test = x[:split]
y_test = Y[:split]


# In[ ]:


# SVC (sklearn.svm has no lowercase svc)
from sklearn.svm import SVC
SVC().fit(X, Y)


# In[ ]:


# support vector classifier
cls = SVC().fit(X_train, y_train)


# In[ ]:


# classifier accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_predicted)
Example #33
"""Using Support Vector Machine (SVM) model to prdict the competition test target values 
"""
import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
from sklearn.svm import SVC as svc
from sklearn.metrics import accuracy_score

training_data = pd.read_csv('../datasets/numerai_training_data.csv')
tournament_data = pd.read_csv('../datasets/numerai_tournament_data.csv')

# this returns four arrays, in the order features_train, features_test, labels_train, labels_test
features_train, features_test, labels_train, labels_test = train_test_split(
    training_data.iloc[:, 0:21],
    training_data['target'],
    test_size=0.3,
    random_state=0)

clf = svc(C=1.0).fit(features_train, labels_train)

# predicting the target value with the 30% remnant of the training_data
predictions = clf.predict(features_test)
print(predictions)

accuracy = accuracy_score(predictions, labels_test)
print(accuracy)
#c = 1.0 -> 0.514361849391
#c = 100.0 -> 0.518133997785
Example #34
from sklearn import svm
import pickle
import pandas

#dataframe = pandas.read_csv("file name")

#array = dataframe.values

# kernel = 'rbf'

model = svm.SVC(kernel='linear', C=1, gamma=1)
model.fit(X_train, y_train)
model.score(X_train, y_train)

#predict
predicted = model.predict(x_test)
Example #35
# Import library
from sklearn import svm

# Assumed you have X (predictor) and Y (target) for the training data set and x_test (predictor) of the test dataset

# Create SVM classification object
model = svm.SVC()  # there are various options associated with it; this is the simple classification form

# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
Example #36
def make_model(col_labels = None, year = 2017, model_type = None):
    """make and run model"""

    data = pd.read_csv('NCAA2001_2017.csv')
    data_2018 = pd.read_csv('NCAA2018.csv')
    data_2018['year'] = 2018
    data = data.append(data_2018)

    # data to pull from the data frame
    if col_labels is None:
        col_labels = [
                'TopEFGPer', # effective field goal percentage
                'TopFTR', # free throw rate
                'TopTOPer', # turnover percentage
                'TopDRTG', # defensive rating
                'TopSOS', # strength of schedule
                'BotEFGPer',
                'BotFTR',
                'BotTOPer',
                'BotDRTG',
                'BotSOS'
                ]

    # don't scale SeedType
    if 'SeedType' in col_labels:
        col_labels.remove('SeedType')
        if len(col_labels) != 0:
            data[col_labels] = scale(data[col_labels])
        col_labels.insert(0, 'SeedType')
        
    else:
        data[col_labels] = scale(data[col_labels])

    # change SeedTypes to integers in case need to encode later
    data = data.replace(
            ['OneSixteen', 'TwoFifteen', 'ThreeFourteen',
                'FourThirteen', 'FiveTwelve', 'SixEleven',
                'SevenTen', 'EightNine'],
            [1, 2, 3, 4, 5, 6, 7, 8])

    train = data.loc[(data['year'] != year) & \
            (data['year'] != 2018)][col_labels]
    train_results = data.loc[(data['year'] != year) & \
            (data['year'] != 2018)]['Upset'] # not a df

    test = data.loc[data['year'] == year][col_labels]
    results_columns = ['SeedType', 'TopSeed', 'BotSeed', 'Upset']
    test_results = data.loc[data['year'] == year][results_columns]

    # have to one-hot the seeding type if that's in there
    if 'SeedType' in col_labels:
        enc = OneHotEncoder(categorical_features = [0]) # must be first
        train = enc.fit_transform(train).toarray()
        test = enc.transform(test).toarray()  # reuse the encoder fitted on the training data
    else:
        train = train.values
        test = test.values

    # making the model
    if model_type == "forest":
        model = rf()
    elif model_type == "gbc":
        model = gbc()
    elif model_type == "svc":
        model = svc(probability = True)
    else:
        model = lm.LogisticRegression()
    model.fit(train, train_results.values)

    predictions = model.predict_proba(test)
    proba = []
    for i in range(len(predictions)):
        proba.append(predictions[i][1]) # second column is upset percentage

    test_results['UpsetProba'] = proba
    test_results = test_results.sort_values('UpsetProba', ascending=False)

    print(test_results)
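# Note (sketch): OneHotEncoder(categorical_features=[0]) was removed in newer scikit-learn;
# the modern equivalent one-hot encodes the first column with a ColumnTransformer. The two
# commented lines show where it would slot into make_model above.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

enc = ColumnTransformer([('seed', OneHotEncoder(), [0])], remainder='passthrough')
# train = enc.fit_transform(train)
# test = enc.transform(test)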
Example #37
model = tree.DecisionTreeClassifier(
    criterion='gini'
)  # for classification; the split criterion can be 'gini' or 'entropy' (information gain), the default is 'gini'
# model = tree.DecisionTreeRegressor() for regression
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)
#Predict Output
predicted = model.predict(x_test)

#Support Vector Machine
#Import Library
from sklearn import svm
#Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
# Create SVM classification object
model = svm.SVC()  # there are various options associated with it; this is the simple classification form
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)
#Predict Output
predicted = model.predict(x_test)

#Naive Bayes
#Import Library
from sklearn.naive_bayes import GaussianNB
#Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
# Create the naive Bayes classification object (there are other variants for multinomial classes, e.g. Bernoulli naive Bayes)
model = GaussianNB()
# Train the model using the training sets and check score
model.fit(X, y)
#Predict Output
predicted = model.predict(x_test)
Example #38
#!/usr/bin/env python3

#Import Library
from sklearn import svm
 
# Assumed you have X (predictor) and Y (target) for the training data set and x_test (predictor) of the test dataset
# Create SVM classification object
model = svm.SVC()  # there are various options associated with it; this is the simple classification form
# Train the model using the training sets and check score
model.fit(X, y)
model.score(X, y)

# Predict Output
predicted = model.predict(x_test)