Example No. 1
    def _sort_applicable_ngrams(self, list_of_ngrams, sentences, labels, spacy_nlp):
        """Given an intent classification problem and a list of ngrams, creates ordered list of most useful ngrams."""

        if list_of_ngrams:
            from sklearn import linear_model, preprocessing
            import numpy as np

            usable_labels = []
            for label in np.unique(labels):
                lab_sents = np.array(sentences)[np.array(labels) == label]
                if len(lab_sents) < self.min_intent_examples_for_ngram_classification:
                    continue
                usable_labels.append(label)

            mask = [label in usable_labels for label in labels]
            sentences = np.array(sentences)[mask]
            labels = np.array(labels)[mask]

            X = np.array(self._ngrams_in_sentences(sentences, spacy_nlp, list_of_ngrams))
            intent_encoder = preprocessing.LabelEncoder()
            intent_encoder.fit(labels)
            y = intent_encoder.transform(labels)

            clf = linear_model.RandomizedLogisticRegression(C=1)
            clf.fit(X, y)
            scores = clf.scores_
            sort_idx = [i[0] for i in sorted(enumerate(scores), key=lambda x: -1 * x[1])]

            return np.array(list_of_ngrams)[sort_idx]
        else:
            return []
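Note: RandomizedLogisticRegression implements stability selection; it was deprecated in scikit-learn 0.19 and removed in 0.21 (Example No. 10 below mentions this). As a rough, subsampling-only sketch of the same ranking idea on current scikit-learn versions — the original estimator also randomly rescales the per-feature penalty, which this omits — one could score each ngram column by how often an L1-penalized logistic regression keeps it across resamples. X and y are the matrix and encoded labels built above; the helper name is ours:

import numpy as np
from sklearn.linear_model import LogisticRegression

def stability_scores(X, y, n_resampling=200, sample_fraction=0.75, C=1.0, seed=0):
    """Fraction of resampled L1-logistic fits in which each column gets a nonzero weight."""
    rng = np.random.RandomState(seed)
    n_samples, n_features = X.shape
    counts = np.zeros(n_features)
    for _ in range(n_resampling):
        idx = rng.choice(n_samples, int(sample_fraction * n_samples), replace=False)
        clf = LogisticRegression(penalty="l1", solver="liblinear", C=C)
        clf.fit(X[idx], y[idx])  # may raise ValueError if a resample has one class, cf. Example No. 5
        counts += np.abs(clf.coef_).max(axis=0) > 1e-9
    return counts / n_resampling

# scores = stability_scores(X, y)
# sort_idx = np.argsort(-scores)  # highest stability score first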
Example No. 2
def RandomizedLogisticRegression(np_X, np_y):
    from scipy.sparse import coo_matrix
    from sklearn import linear_model
    from sklearn.utils import shuffle

    X = np_X
    y = np_y
    X_sparse = coo_matrix(X)
    X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
    estimator = linear_model.RandomizedLogisticRegression(n_jobs=1,
                                                          n_resampling=500)
    estimator.fit(X, y)

    return estimator.scores_
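A quick usage sketch for the wrapper above, on synthetic data (variable names are illustrative; this only runs against scikit-learn < 0.21, where the estimator still exists):

import numpy as np

X_demo = np.random.RandomState(0).rand(100, 5)
y_demo = (X_demo[:, 0] > 0.5).astype(int)
scores = RandomizedLogisticRegression(X_demo, y_demo)
print(scores)  # one stability score per column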
Example No. 3
    def save_features(self, X, y):
        import numpy as np
        from scipy import sparse
        from sklearn import linear_model, svm
        from sklearn.ensemble import ExtraTreesClassifier
        from sklearn.externals import joblib
        from sklearn.feature_selection import SelectKBest, f_classif

        feats = dict()

        print("univariate feature selectors")
        selector_clf = SelectKBest(score_func=f_classif, k='all')
        selector_clf.fit(X, y)
        pvalues_clf = selector_clf.pvalues_
        pvalues_clf[np.isnan(pvalues_clf)] = 1

        #put feature vectors into dictionary
        feats['univ_sub01'] = (pvalues_clf < 0.1)
        feats['univ_sub005'] = (pvalues_clf < 0.05)
        feats['univ_clf_sub005'] = (pvalues_clf < 0.05)

        print "randomized logistic regression feature selector"
        sel_log = linear_model.RandomizedLogisticRegression(random_state=42,
                                                            n_jobs=4).fit(
                                                                X, y)
        #put rand_lasso feats into feature dict
        feats['rand_logreg'] = sel_log.get_support()

        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(X)
        sel_svc = svm.LinearSVC(C=0.1,
                                penalty="l1",
                                dual=False,
                                random_state=42).fit(X, y)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0)
        sel_log = linear_model.LogisticRegression(C=0.01, random_state=42).fit(
            X_sp, y)
        feats['LogReg'] = np.ravel(sel_log.coef_ > 0)

        tree_max_features = 20
        print "ExtraTrees feature selectors (%s)" % tree_max_features
        feats['tree'] = np.zeros(len(feats['LogReg']))
        tree = ExtraTreesClassifier(n_estimators=250,
                                    max_features=tree_max_features)
        tree.fit(X, y)
        feature_importance = tree.feature_importances_
        feature_importance = 100.0 * (feature_importance /
                                      feature_importance.max())
        sorted_idx = np.argsort(feature_importance)[::-1]
        for i in range(tree_max_features):
            feats['tree'][sorted_idx[i]] = 1

        feat_sums = np.zeros(len(feats['LogReg']))
        for key in feats:
            feat_sums += feats[key].astype(int)
        # take the features which get 4 or more of the 7 selectors' votes
        feats['ensemble'] = feat_sums >= 4
        joblib.dump(feats, 'features/feats.pkl', compress=3)
        return feats
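Because save_features persists the masks with joblib, a later run can reload them and apply any of the votes; a minimal sketch, assuming the same X and the path used above:

from sklearn.externals import joblib  # plain `import joblib` on modern stacks

feats = joblib.load('features/feats.pkl')
X_selected = X[:, feats['ensemble']]  # keep only the columns with >= 4 votes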
Example No. 4
def find_better_features(data,
                         truth,
                         regularization=1e5,
                         number_renor_models=200):
    '''Resample the train data and compute a Logistic Regression on each resampling
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html'''
    from sklearn import linear_model

    model = linear_model.RandomizedLogisticRegression(
        C=regularization, n_resampling=number_renor_models)

    model = model.fit(data, truth)

    return model
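An illustrative way to consume the fitted model (data and truth being any feature matrix and label vector; again this requires scikit-learn < 0.21):

model = find_better_features(data, truth)
print(model.scores_)                    # stability score per column
print(model.get_support(indices=True))  # columns above the selection threshold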
Example No. 5
    def _sort_applicable_ngrams(self, list_of_ngrams, sentences, labels,
                                spacy_nlp):
        """Given an intent classification problem and a list of ngrams, creates ordered list of most useful ngrams."""

        if list_of_ngrams:
            from sklearn import linear_model, preprocessing
            import numpy as np

            # filter examples where we do not have enough labeled instances for cv
            usable_labels = []
            for label in np.unique(labels):
                lab_sents = np.array(sentences)[np.array(labels) == label]
                if len(lab_sents) < self.min_intent_examples_for_ngram_classification:
                    continue
                usable_labels.append(label)

            mask = [label in usable_labels for label in labels]
            if any(mask) and len(usable_labels) >= 2:
                try:
                    sentences = np.array(sentences)[mask]
                    labels = np.array(labels)[mask]

                    X = np.array(
                        self._ngrams_in_sentences(sentences, spacy_nlp,
                                                  list_of_ngrams))
                    intent_encoder = preprocessing.LabelEncoder()
                    intent_encoder.fit(labels)
                    y = intent_encoder.transform(labels)

                    clf = linear_model.RandomizedLogisticRegression(C=1)
                    clf.fit(X, y)
                    scores = clf.scores_
                    sort_idx = [
                        i[0] for i in sorted(enumerate(scores),
                                             key=lambda x: -1 * x[1])
                    ]

                    return np.array(list_of_ngrams)[sort_idx]
                except ValueError as e:
                    if "needs samples of at least 2 classes" in str(e):
                        # we got unlucky during the random sampling :( and selected a slice that only contains one class
                        return []
                    else:
                        raise e
            else:
                # there is no example we can use for the cross validation
                return []
        else:
            return []
Example No. 6
    def _rank_ngrams_using_cv(self, examples, labels, list_of_ngrams) -> list:
        from sklearn import linear_model
        import numpy as np

        X = np.array(self._ngrams_in_sentences(examples, list_of_ngrams))
        y = self.encode_labels(labels)

        clf = linear_model.RandomizedLogisticRegression(C=1)
        clf.fit(X, y)

        # sort the ngrams according to the classification score
        scores = clf.scores_
        sorted_idxs = sorted(enumerate(scores), key=lambda x: -1 * x[1])
        sorted_ngrams = [list_of_ngrams[i[0]] for i in sorted_idxs]

        return sorted_ngrams
Example No. 7
def run_regression(data_file, lead, lag):
    import os
    import time

    import numpy as np
    from sklearn import linear_model

    import flatten_featureset  # project-local feature builder used below

    start_time = time.time()
    intermediate_file = "prediction/data/tmp.csv"

    flatten_featureset.create_features(intermediate_file, data_file, lead, lag)
    train_data = np.genfromtxt(intermediate_file, delimiter=',', skip_header=1)
    os.remove(intermediate_file)

    X_train = train_data[:, 1:]  # file format is [label list_of_features]
    Y_train = train_data[:, 0]

    logreg = linear_model.RandomizedLogisticRegression()  # n_jobs=12
    logreg.fit(X_train, Y_train)

    return logreg.scores_
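scores_ holds one stability score per feature column, so the return value can be turned into a ranked report; a sketch with hypothetical arguments:

import numpy as np

scores = run_regression('somedata.csv', lead=1, lag=3)  # file name and lead/lag are placeholders
for col_idx in np.argsort(-scores)[:10]:
    print("feature %d: %.3f" % (col_idx, scores[col_idx]))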
Example No. 8
def trainModel(pairs, classes, train, drugFeatures, diseaseFeatures, drugFeatureNames, diseaseFeatureNames, model_type, model_fun, n_seed):
    import numpy
    from sklearn import linear_model
    from sklearn.externals import joblib

    clf = get_classification_model(model_type, model_fun, n_seed)
    classes = numpy.array(classes)
    pairs = numpy.array(pairs)
    pairs_train = pairs[train]
    classes_train = classes[train]
    X_train = createFeatureMat(pairs_train, classes_train, drugFeatures, diseaseFeatures, drugFeatureNames, diseaseFeatureNames,  featureMatfile=None)
    #sel = VarianceThreshold()
    #print X_train.shape
    #X_train=sel.fit_transform(X_train)
    #print X_train.shape
    randomlr = linear_model.RandomizedLogisticRegression( C=1, random_state=n_seed, selection_threshold=0.1)
    #sfm = SelectFromModel(clf)
    randomlr.fit(X_train, classes_train)
    X_train = randomlr.transform(X_train)
    print("number of selected features", X_train.shape[1])
    joblib.dump(randomlr, "../data/models/randomlr.pkl")
    selectedFeatures = randomlr.get_support(indices=True)
    print(selectedFeatures)
    saveSelectedFeatures(drugFeatureNames, diseaseFeatureNames, selectedFeatures)

    y_train = numpy.array(classes_train)
    clf.fit(X_train, y_train)
    return clf
Example No. 9
def main():
    import numpy as np
    import pandas as p
    from sklearn import cross_validation
    from sklearn import linear_model as lm
    from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

    print("loading data..")
    traindata = (p.read_table('train.tsv'))
    tr_title, tr_body, tr_url = convert_text(traindata, 'boilerplate')

    testdata = p.read_table('test.tsv')
    ts_title, ts_body, ts_url = convert_text(testdata, 'boilerplate')

    y = np.array(p.read_table('train.tsv'))[:, -1]

    internetStopWords = [
        'http', 'www', 'online', 'com', 'jpg', 'static', 'link', 'terminal01',
        'user', 'null', 'div', 'span', 'font', 'timestamp', 'content', 'blog'
    ]
    stopwords = ENGLISH_STOP_WORDS
    stopwords = list(stopwords)
    stopwords = stopwords + internetStopWords

    X_all = tr_body + ts_body + tr_title + ts_title

    #use for dummy variables
    urls = getURL(traindata['url'])

    #building the model
    tfv = TfidfVectorizer(min_df=3,
                          stop_words=stopwords,
                          max_features=None,
                          strip_accents='unicode',
                          analyzer='word',
                          token_pattern=r'\w{1,}',
                          ngram_range=(1, 3),
                          use_idf=1,
                          smooth_idf=1,
                          sublinear_tf=1)
    lentrain = len(traindata)  # moved up: it was defined below its first use
    tfdif = tfv.fit_transform(X_all)  # original used `wordTFIDF` / `corpus`, both undefined
    Xt = tfdif[:lentrain]

    rnd = lm.RandomizedLogisticRegression()
    xrnd = rnd.fit_transform(Xt, y)  # original used `y_train`, which is never defined

    # not working :
    #X_all = hstack( (xrnd,url) )
    # tfv.build_analyzer()
    rd = lm.LogisticRegression(penalty='l2',
                               dual=True,
                               tol=0.0001,
                               C=1,
                               fit_intercept=True,
                               intercept_scaling=1.0,
                               class_weight=None,
                               random_state=None)


    print "fitting pipeline"
    tfv.fit(X_all)
    print "transforming data"
    X_all = tfv.transform(X_all)

    X = X_all[:lentrain]
    X_test = X_all[lentrain:]

    print "20 Fold CV Score: ", np.mean(
        cross_validation.cross_val_score(rd, X, y, cv=20, scoring='roc_auc'))
Example No. 10
    file_path = './data/bankloan.xls'
    bank_data = pd.read_excel(file_path)
    # print(bank_data.head())
    """
       年龄  教育  工龄  地址   收入   负债率      信用卡负债      其他负债  违约
       (age  edu  employ  address  income  debt-ratio  credit-card-debt  other-debt  default)
    0  41   3  17  12  176   9.3  11.359392  5.008608   1
    1  27   1  10   6   31  17.3   1.362202  4.000798   0
    2  40   1  15  14   55   5.5   0.856075  2.168925   0
    3  41   1  15  14  120   2.9   2.658720  0.821280   0
    4  24   2   2   0   28  17.3   1.787436  3.056564   1

    """
    X = bank_data.iloc[:, :8]
    y = bank_data.iloc[:, 8]
    # Build a randomized logistic regression model to screen variables;
    # RandomizedLogisticRegression is deprecated and will be dropped in 0.21
    rlr = linear_model.RandomizedLogisticRegression()
    # train the model
    rlr.fit(X, y)
    print(u'Selected features: %s' % ','.join(bank_data.columns[0:-1][rlr.get_support()]))

    # Build the logistic regression model
    # penalty='l2', n_jobs=-1, solver='sag'
    lr = linear_model.LogisticRegression(solver='lbfgs')
    X = bank_data[bank_data.columns[0:-1][rlr.get_support()]]
    lr.fit(X, y)
    # (700, 4)
    print(X.shape)
    # mean accuracy: 0.8142857142857143
    print(u'Mean accuracy of the model: %s' % lr.score(X, y))
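On scikit-learn >= 0.21, where RandomizedLogisticRegression is gone, a rough substitute for this screening step is SelectFromModel over an L1-penalized LogisticRegression. This is a sketch, not a drop-in equivalent: it fits once instead of aggregating over resamplings:

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

selector = SelectFromModel(
    LogisticRegression(penalty='l1', solver='liblinear', C=1.0))
selector.fit(bank_data.iloc[:, :8], bank_data.iloc[:, 8])
print('Selected features: %s' %
      ','.join(bank_data.columns[0:-1][selector.get_support()]))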

Example No. 11
            try:
                tokens[vocabulary[w]] += 1
            except KeyError:
                pass

        samples.append(tokens)

train_samples = samples[:len(samples) - 1000]
train_labels = labels[:len(labels) - 1000]
test_samples = samples[-1000:]
test_labels = labels[-1000:]

print(len(labels), len(samples), len(samples[0]))

if use_randomlogreg:
    logreg = linear_model.RandomizedLogisticRegression(n_resampling=150)
    logreg.fit(train_samples, train_labels)

    indices = logreg.get_support(indices=True)
    swapped_vocab = dict((v, k) for k, v in vocabulary.items())

    print(", ".join([swapped_vocab[i] for i in indices]))
    exit()
else:
    logreg = linear_model.LogisticRegression()
    logreg.fit(train_samples, train_labels)

print "Done"

predictions = logreg.predict(test_samples)
Example No. 12
cols = list(df_logit.columns.values)
cols.remove('SUBJECT')
cols.remove('CLASS')
cols.remove('AGE')

X = df_logit[cols]
y = df_logit.CLASS

# In[67]:

import warnings  # sklearn is using a deprecated rand function here,
with warnings.catch_warnings():  # and warnings clutter output
    warnings.simplefilter("ignore")
    resamplings = 2000
    rlogit = linear_model.RandomizedLogisticRegression(
        n_resampling=resamplings)
    rlogit.fit(X, y)
    print("Features sorted by score, using {} resamplings: ".format(resamplings))
    feature_list = sorted(zip(map(lambda x: round(x, 4), rlogit.scores_), cols),
                          reverse=True)
    for f in feature_list[0:25]:  # Adjust this if last feature output is nonzero
        print("{}:\t\t\t{:.2f}".format(f[1], f[0]))

# ### Entire dataset, LASSO for age as interest variable.

# In[68]:

X, y = df[cols], df.AGE
Example No. 13
            pass

print "Finished loading samples"

l1 = int(len(samples) * test_size)

# train_samples = samples[:l1]
# train_labels = labels[:l1]
# test_samples = samples[-l1:]
# test_labels = labels[-l1:]

print(len(labels), len(samples), len(samples[0]))

if use_randomlogreg:
    print("Running ranlogreg")
    logreg = linear_model.RandomizedLogisticRegression(
        n_resampling=200, selection_threshold=0.25)
    print("Fitting")
    logreg.fit(samples, labels)

    samples = None
    labels = None

    print "Swapping vocab"
    indices = logreg.get_support(indices=True)
    swapped_vocab = dict((v, k) for k, v in vocabulary.iteritems())

    print ", ".join([swapped_vocab[i] for i in indices])
    exit()
else:
    logreg = linear_model.LogisticRegression()
    # note: train_samples / train_labels only exist in the commented-out split above
    logreg.fit(train_samples, train_labels)
Example No. 14
def bag_feature_selection(X, y, feature_names, CVobj, K_best=None, pct_best=None, percentiles=np.linspace(10,100,10),
                          Cs_l1=np.logspace(-4, -1, 7), C_l2=0.01, rand_L1_params=None, RF_params=None, GBM_params=None,
                          scaling=None, rand_seed=1234, plot=True, save_fig=False, fig_names=None, show=True):
    """
    Bag four different types of feature selection process. 
    1. Univariate feature selection (f-score based ranking)
    2. Recursive feature elimination
    3. L1-based feature selection 
    4. Tree-based feature selection
    For 1,2,3, logistic regression and linear SVM are used as base models
    For 4, random forest and gradient tree boosting are used as base models
    For each process, a plot of (cross-validation) accuracy vs. number of selected features 
    (or regularization parameter for L1-based methods) are given 
    
    Input: 
    CVobj: cross validation object
    K_best: K best features to keep 
    pct_best: Percent of features to keep (If both K_best and pct_best are provided, pct_best will be suppressed.)
    percentiles: a list of percents of features to keep and search over
    Cs_l1: a list of C values (1/regularization parameters) to be searched over
    C_l2, RF_params, GBM_param: default parameter for base models (logistic, svm, forest)
    fig_names: a dict where keys and items are figure category and the corresponding filenames to save
        
    Return a dict containing ranked features from univariate feature selection,
    recursive feature elimination (a list where each element corresponds to a model), 
    and tree-based (a list where each element corresponds to a model)
    """
    
    p = X.shape[1]
    if K_best is not None:
        n_selected = K_best
    elif pct_best is not None:    
        n_selected = int(pct_best*0.01*p)
    else:
        n_selected = p
      
    if scaling is not None:
        if scaling=='standard':
            X = preprocessing.StandardScaler().fit_transform(X)
        elif scaling=='minmax':
            X = preprocessing.MinMaxScaler().fit_transform(X)    
        
        
    ranked_features = {}    
    feature_ranks = {}
    # Note: The combination of penalty='l2' and loss='hinge' are not supported when dual=False
#    clfs = [linear_model.LogisticRegression(C=C_l2, penalty='l2'), svm.LinearSVC(C=C_l2, penalty='l2', loss='squared_hinge')]
#    clf_names = ['logistic regression','linear SVM']
    clfs = [linear_model.LogisticRegression(C=C_l2, penalty='l2')]
    clf_names = ['logistic regression']

    # ******************* Univariate feature selection ******************* #    
    univar_scores= fs.univar_score(X, y, feature_names, K_best=n_selected, criterion='f_score',\
                                   plot=plot, save_fig=save_fig, fig_name=fig_names['univar_rank'],show=show)    
    ranked_features['univar'], feature_ranks['univar'] = univar_scores['ranked_features'], univar_scores['ranks']
    fs.univar_FS_cv(X, y, clfs, clf_names, CVobj, percentiles=percentiles, criterion='f_score',\
                    plot=plot, save_fig=save_fig, fig_name=fig_names['univar_CV'], show=show);  
      
    # ******************* Recursive feature elimination ******************* #
    RFE_scores = fs.RFE_rank(X, y, clfs, feature_names, K_best=n_selected) 
    ranked_features['RFE'], feature_ranks['RFE'] = RFE_scores['ranked_features'], RFE_scores['ranks']
#    # The following code takes very long time to run
#    n_keep = np.round(percentiles*0.01*p).astype(int)   
#    fs.RFE_FS_cv(X, y, clfs, clf_names, CVobj, rm_per_step=1, n_keep=n_keep,\
#                 plot=plot, save_fig=save_fig, fig_name=fig_names['RFE_CV'],show=show);
             
    #******************* L1-based feature selection *************************#    
    if rand_L1_params is None:
        rand_L1_params = dict(C=Cs_l1, scaling=0.5, sample_fraction=0.75, n_resampling=100, 
                              selection_threshold=0.25, random_state=rand_seed, n_jobs=1)
        
    rand_L1 = linear_model.RandomizedLogisticRegression(**rand_L1_params)
    L1_scores = fs.L1_score(X, y, [rand_L1], ['randomized sparse model'], feature_names, K_best=n_selected, criterion='mean',\
                            plot=plot, save_fig=save_fig, fig_name=fig_names['L1_rank'], show=show)
    ranked_features['L1'], feature_ranks['L1'] = L1_scores['ranked_features'], L1_scores['ranks']

    # NOTE: For L1 logistic regression, 'newton-cg', 'lbfgs' and 'sag' only handle the L2 penalty.
    clfs[0].set_params(**{'penalty':'l1','solver':'liblinear'}) 
#    clfs[1].set_params(**{'penalty':'l1','dual':False})
            
    fs.plot_L1_path(X, y, clfs, clf_names, Cs_l1, save_fig=save_fig, fig_name=fig_names['L1_path'], show=show);    
    fs.L1_FS_cv(X, y, clfs, clf_names, CVobj, Cs_l1, \
                plot=plot, save_fig=save_fig, fig_name=fig_names['L1_CV'], show=show);    
          
    # **************** Tree-based feature selection ************************ #
    # For tree-based methods, raw features values could be used 
    if RF_params is None:
        RF_params = {'n_estimators': 1000,'max_features': 'auto','min_samples_split': 1, 
                     'bootstrap': True, 'oob_score': True, 'random_state': rand_seed, 'n_jobs': -1}
    if GBM_params is None:
        GBM_params = {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05, 'subsample': 1,
                      'max_features': 'auto', 'min_samples_leaf': 1, 'random_state': rand_seed}
                  
    RF = ensemble.RandomForestClassifier(**RF_params)              
    GBM = ensemble.GradientBoostingClassifier(**GBM_params)
    clfs = [RF, GBM]
    clf_names = ['Random Forest','Gradient Boosting Machine']

    tree_scores = fs.tree_score(X, y, clfs, clf_names, feature_names, K_best=n_selected, plot=plot,\
                                save_fig=save_fig, fig_name=fig_names['feature_imp'], show=show)
    ranked_features['tree'], feature_ranks['tree'] = tree_scores['ranked_features'], tree_scores['ranks']
#    # DO NOT DO THIS. See docstring of function fs.tree_FS_cv
#    fs.tree_FS_cv(X, y, clfs, clf_names, CVobj, percentiles, plot=plot,\
#                  save_fig=save_fig, fig_name=fig_names['tree_CV'], show=show)
    
    return ranked_features, feature_ranks
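Note that fig_names is indexed unconditionally inside the function, so callers must supply a dict with all of the figure keys even when save_fig=False. An illustrative call (it assumes the project's fs helper module is importable and that cv_splitter is some cross-validation object):

fig_names = {k: k + '.png' for k in
             ['univar_rank', 'univar_CV', 'RFE_CV', 'L1_rank',
              'L1_path', 'L1_CV', 'feature_imp', 'tree_CV']}
ranked_features, feature_ranks = bag_feature_selection(
    X, y, feature_names, CVobj=cv_splitter, K_best=20,
    scaling='standard', plot=False, fig_names=fig_names)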
Example No. 15
def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y, model_type, out_dir=None, class_weight=None):
    from time import time

    import numpy as np
    from sklearn import datasets, neighbors, linear_model, svm
    from sklearn.metrics import accuracy_score

    totalTime = 0

    startTrainTime = time()
    logger.info("Start training...")
    if model_type == 'ARDRegression':
        model = linear_model.ARDRegression().fit(train_x, train_y)
    elif model_type == 'BayesianRidge':
        model = linear_model.BayesianRidge().fit(train_x, train_y)
    elif model_type == 'ElasticNet':
        model = linear_model.ElasticNet().fit(train_x, train_y)
    elif model_type == 'ElasticNetCV':
        model = linear_model.ElasticNetCV().fit(train_x, train_y)
    elif model_type == 'HuberRegressor':
        model = linear_model.HuberRegressor().fit(train_x, train_y)
    elif model_type == 'Lars':
        model = linear_model.Lars().fit(train_x, train_y)
    elif model_type == 'LarsCV':
        model = linear_model.LarsCV().fit(train_x, train_y)
    elif model_type == 'Lasso':
        model = linear_model.Lasso().fit(train_x, train_y)
    elif model_type == 'LassoCV':
        model = linear_model.LassoCV().fit(train_x, train_y)
    elif model_type == 'LassoLars':
        model = linear_model.LassoLars().fit(train_x, train_y)
    elif model_type == 'LassoLarsCV':
        model = linear_model.LassoLarsCV().fit(train_x, train_y)
    elif model_type == 'LassoLarsIC':
        model = linear_model.LassoLarsIC().fit(train_x, train_y)
    elif model_type == 'LinearRegression':
        model = linear_model.LinearRegression().fit(train_x, train_y)
    elif model_type == 'LogisticRegression':
        model = linear_model.LogisticRegression(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'LogisticRegressionCV':
        model = linear_model.LogisticRegressionCV(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'MultiTaskLasso':
        model = linear_model.MultiTaskLasso().fit(train_x, train_y)
    elif model_type == 'MultiTaskElasticNet':
        model = linear_model.MultiTaskElasticNet().fit(train_x, train_y)
    elif model_type == 'MultiTaskLassoCV':
        model = linear_model.MultiTaskLassoCV().fit(train_x, train_y)
    elif model_type == 'MultiTaskElasticNetCV':
        model = linear_model.MultiTaskElasticNetCV().fit(train_x, train_y)
    elif model_type == 'OrthogonalMatchingPursuit':
        model = linear_model.OrthogonalMatchingPursuit().fit(train_x, train_y)
    elif model_type == 'OrthogonalMatchingPursuitCV':
        model = linear_model.OrthogonalMatchingPursuitCV().fit(train_x, train_y)
    elif model_type == 'PassiveAggressiveClassifier':
        model = linear_model.PassiveAggressiveClassifier(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'PassiveAggressiveRegressor':
        model = linear_model.PassiveAggressiveRegressor().fit(train_x, train_y)
    elif model_type == 'Perceptron':
        model = linear_model.Perceptron(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'RandomizedLasso':
        model = linear_model.RandomizedLasso().fit(train_x, train_y)
    elif model_type == 'RandomizedLogisticRegression':
        model = linear_model.RandomizedLogisticRegression().fit(train_x, train_y)
    elif model_type == 'RANSACRegressor':
        model = linear_model.RANSACRegressor().fit(train_x, train_y)
    elif model_type == 'Ridge':
        model = linear_model.Ridge().fit(train_x, train_y)
    elif model_type == 'RidgeClassifier':
        model = linear_model.RidgeClassifier(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'RidgeClassifierCV':
        model = linear_model.RidgeClassifierCV(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'RidgeCV':
        model = linear_model.RidgeCV().fit(train_x, train_y)
    elif model_type == 'SGDClassifier':
        model = linear_model.SGDClassifier(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'SGDRegressor':
        model = linear_model.SGDRegressor().fit(train_x, train_y)
    elif model_type == 'TheilSenRegressor':
        model = linear_model.TheilSenRegressor().fit(train_x, train_y)
    # NOTE: the following six branches are broken in the original code: lars_path, lasso_path,
    # lasso_stability_path, logistic_regression_path, orthogonal_mp and orthogonal_mp_gram are
    # functions that return arrays, not estimator classes, so calling .fit() on them will fail.
    elif model_type == 'lars_path':
        model = linear_model.lars_path().fit(train_x, train_y)
    elif model_type == 'lasso_path':
        model = linear_model.lasso_path().fit(train_x, train_y)
    elif model_type == 'lasso_stability_path':
        model = linear_model.lasso_stability_path().fit(train_x, train_y)
    elif model_type == 'logistic_regression_path':
        model = linear_model.logistic_regression_path(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'orthogonal_mp':
        model = linear_model.orthogonal_mp().fit(train_x, train_y)
    elif model_type == 'orthogonal_mp_gram':
        model = linear_model.orthogonal_mp_gram().fit(train_x, train_y)
    elif model_type == 'LinearSVC':
        model = svm.LinearSVC(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'SVC':
        model = svm.SVC(class_weight=class_weight, degree=3).fit(train_x, train_y)
    else:
        raise NotImplementedError('Model not implemented')

        
    logger.info("Finished training.")
    endTrainTime = time()
    trainTime = endTrainTime - startTrainTime
    logger.info("Training time : %d seconds" % trainTime)


    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")
    endTestTime = time()
    testTime = endTestTime - endTrainTime
    logger.info("Testing time : %d seconds" % testTime)
    totalTime += trainTime + testTime

    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)

    np.savetxt(out_dir + '/preds/best_test_pred' + '.txt', test_pred_y, fmt='%i')

    logger.info('[TRAIN] Acc: %.3f' % (accuracy_score(train_y, train_pred_y)))
    logger.info('[TEST]  Acc: %.3f' % (accuracy_score(test_y, test_pred_y)))

    return accuracy_score(test_y, test_pred_y)
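run_simple_model expects a module-level logger and an out_dir that already contains a preds/ subdirectory; an illustrative driver on synthetic data:

import logging
import os

import numpy as np

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

rng = np.random.RandomState(0)
X = rng.rand(200, 10)
y = (X[:, 0] > 0.5).astype(int)

os.makedirs('out/preds', exist_ok=True)
acc = run_simple_model(X[:100], y[:100], None, None, X[100:], y[100:],
                       'LogisticRegression', out_dir='out')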