# Example #1 (score: 0)
def test_lasso():
    alphaNum = 6
    print '*' * 80
    inputData = pd.read_hdf(
        './rise_DM_fraud/dev1/preprocessing/preprocessing_result.h5')
    target = 'fpd'
    Y = inputData[target]
    X = inputData.drop(target, axis=1)
    X.fillna(-999, inplace=True)
    lars_cv = linear_model.LassoLarsCV(cv=6).fit(X, Y)
    skf = cv.StratifiedKFold(y=Y, n_folds=5)
    for i, (_, test_index) in enumerate(skf):
        print 'Fold', i
        test_X = X.iloc[test_index, :]
        test_Y = Y[test_index]
        alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0],
                             alphaNum)
        clf = linear_model.RandomizedLasso(alphas, random_state=33,
                                           n_jobs=1).fit(test_X, test_Y)
        featureImportance = pd.DataFrame(sorted(zip(
            map(lambda x: round(x, 4), clf.scores_), X.columns),
                                                reverse=True),
                                         columns=['importance', 'name'])
        featureImportance.to_csv(
            './rise_DM_fraud/dev1/feature_ranking/feature_importance_lasso_fold_%d.csv'
            % (i + 1),
            index=False)
# Example #2 (score: 0)
def Random_Lasso_reg(X, y, alpha):
    """Fit a RandomizedLasso at the given alpha and return its
    stability-selection feature scores.

    TODO: switch to a GridSearchCV-driven alpha choice; see
    https://stackoverflow.com/questions/45857274/interpreting-ridge-regression-in-gridsearchcv
    """
    model = linear_model.RandomizedLasso(alpha=alpha)
    model.fit(X, y)
    return model.scores_
def perform_randomizedLasso(df, target):
    """Run RandomizedLasso stability selection over a log-spaced alpha
    grid (1e-3 .. 1e3) and return the per-feature scores."""
    alpha_grid = np.logspace(-3, 3, 100)
    selector = linear_model.RandomizedLasso(alpha=alpha_grid,
                                            sample_fraction=0.5,
                                            n_resampling=500,
                                            normalize=False,
                                            random_state=36,
                                            scaling=0.5)
    selector.fit(df, target)
    # Per-alpha scores (all_scores_) are intentionally not returned.
    return selector.scores_
def RandomizedLassoRegression(np_X, np_y):
    """Shuffle the samples (keeping dense and sparse views aligned) and
    return RandomizedLasso stability-selection scores."""
    sparse_view = coo_matrix(np_X)
    dense, sparse_view, targets = shuffle(np_X, sparse_view, np_y,
                                          random_state=0)
    selector = linear_model.RandomizedLasso(n_jobs=1, n_resampling=500)
    selector.fit(dense, targets)
    return selector.scores_
    def rLasso(X_scaled, Y, labels, X_test):
        """Average RandomizedLasso stability scores over several alphas and
        drop low-scoring columns from X_scaled / X_test (and their labels).

        Returns the possibly reduced (X_scaled, Y, labels, X_test) tuple.
        NOTE(review): relies on module-level `debug` and `printSizes`;
        Y appears to be 2-D with the target in column 1 -- confirm.
        """
        print "Features sorted by their score for Randomized Lasso:"
        scores = np.zeros(X_scaled.shape[1])
        alphas = [0.003, 0.002]  #, 0.001]
        for i in alphas:
            a = i
            print "Trying alpha %f" % (a)
            randomized_lasso = linear_model.RandomizedLasso(
                n_jobs=1, alpha=a, sample_fraction=0.25, verbose=True)
            printSizes('rlasso', X_scaled, Y, X_test)
            # Fit against column 1 of Y (see NOTE above).
            randomized_lasso.fit(X_scaled, Y[:, 1])
            # Accumulate scores across alphas; averaged after the loop.
            scores = scores + randomized_lasso.scores_
            if debug:
                for score, label in sorted(zip(
                        map(lambda x: round(x, 6), randomized_lasso.scores_),
                        labels),
                                           reverse=True):
                    if score > 0.015:
                        print "%s: %f" % (label, score)

        scores = scores / len(alphas)  # get mean values
        meanImportance = np.mean(scores)
        print "Average score for variable = %f" % (meanImportance)
        if meanImportance > 0.00001:
            # NOTE(review): both branches assign the same value, so the
            # shape[1] > 100 test has no effect -- likely a leftover
            # tuning knob; confirm the intended thresholds.
            if X_scaled.shape[1] > 100:
                thresh = 1.0
            else:
                thresh = 1.0
            # Keep only columns scoring above thresh * mean score.
            keptIndices = np.where(scores > thresh * meanImportance)
            print "Top Scores for Random Lasso"
            if debug:
                # Python 2-only tuple-parameter lambda in the sort key.
                for (score, label) in sorted(zip(scores, labels),
                                             key=lambda (score, label): score,
                                             reverse=True):
                    if score > meanImportance:
                        print "%s: %f" % (label, score)

            printSizes('rlassoBeforeCut', X_scaled, Y, X_test)
            labels = labels[keptIndices]
            X_scaled = np.squeeze(X_scaled[:, keptIndices])
            X_test = np.squeeze(X_test[:, keptIndices])
            printSizes('rlassoAfterCut', X_scaled, Y, X_test)
        else:
            print "Not useful, aborting"
        print "New size of X"
        print X_scaled.shape
        return (X_scaled, Y, labels, X_test)
# Example #6 (score: 0)
# Synthetic regression benchmark (Friedman #1-style) comparing feature
# importance / selection methods: linear, ridge, lasso, RandomizedLasso,
# RFE, random forest, and univariate F-test.
size = 750
X = np.random.uniform(0, 1, (size, 14))

print(X[:, 1])

# Target depends only on the first five features.  NOTE(review): the noise
# term is a single scalar draw added to every sample, not per-sample noise
# -- confirm that is intended.
Y = (10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - .5)**2 +
     10 * X[:, 3] + 5 * X[:, 4]**5 + np.random.normal(0, 1))
# Columns 10-13 become noisy copies of columns 0-3 (correlated distractors).
X[:, 10:] = X[:, :4] + np.random.normal(0, .025, (size, 4))

lin = linear_model.LinearRegression()
lin.fit(X, Y)
ridge = Ridge()  # alpha=0.1
ridge.fit(X, Y)
lasso = linear_model.Lasso()  # alpha=0.1
lasso.fit(X, Y)
randLasso = linear_model.RandomizedLasso()
randLasso.fit(X, Y)
rfe = feature_selection.RFE(estimator=linear_model.LinearRegression())
rfe.fit(X=X, y=Y)

rfr = RandomForestRegressor()
rfr.fit(X, Y)
freg = feature_selection.f_regression(X, Y)

# Normalize each model's absolute coefficients to [0, 1] by their maximum.
ans_lin = abs(lin.coef_)
mx = [max(ans_lin)] * 14
ans_lin = ans_lin / mx
ans_ridge = abs(ridge.coef_)
mx = [max(ans_ridge)] * 14
ans_ridge = ans_ridge / mx
ans_lasso = abs(lasso.coef_)
# Example #7 (score: 0)
def feature_select(feature_ranking_choice,
                   ranking_method,
                   X_train,
                   Y_train,
                   targetName,
                   featureRank_folder,
                   featureNum,
                   fill_missing,
                   stable_test_rf=False):
    """Rank features and return X_train reduced to the top featureNum.

    feature_ranking_choice -- 0: no selection (X_train returned as-is);
        1: compute a fresh ranking and persist it to CSV;
        2: reuse the previously saved ranking CSV.
    ranking_method -- 'lasso' (RandomizedLasso stability selection) or
        'rf' (random-forest importances via get_feature_importance_rf).
    fill_missing -- if True, returned columns come from the -999-filled
        copy; otherwise from the original (possibly NaN) X_train.
    stable_test_rf -- if True, additionally run test_rf() as a stability
        check before ranking (rf method only).
    """
    # lasso configuration
    alphaNum = 6
    # random forest configuration
    nTrees = 1000
    njobs = 4
    maxFeaturePercent = 0.1
    nFeaturePlot = 30

    featureRanking_la = featureRank_folder + '/feature_importance_lasso.csv'
    afterSelectData_la = featureRank_folder + '/dataAfterSelect_lasso.h5'
    featureRanking_rf = featureRank_folder + '/feature_importance_rf.csv'
    afterSelectData_rf = featureRank_folder + '/dataAfterSelect_rf.h5'

    print '*' * 80
    print 'running feature_selection.py'
    if feature_ranking_choice == 0:
        print 'no feature ranking or selection!'
        return X_train
    elif feature_ranking_choice == 2:
        print 'previous feature ranking and selection result is loaded!'
        if ranking_method == 'lasso':
            featureNames = pd.read_csv(featureRanking_la)['name'][:featureNum]
        elif ranking_method == 'rf':
            featureNames = pd.read_csv(featureRanking_rf)['name'][:featureNum]
        return X_train[featureNames]
    elif feature_ranking_choice == 1:
        # Work on a copy so the caller's X_train keeps its NaNs.
        X_train_temp = X_train.copy()
        if X_train_temp.isnull().sum().sum() > 0:
            X_train_temp.fillna(-999, inplace=True)
            print 'missing data is temporarily filled by -999 in the feature selection process!'
        #### stability selection: L1-based feature selection
        if ranking_method == 'lasso':
            ## find best alpha through cross-valiation
            #            lars_cv = linear_model.LassoLarsCV(cv=6).fit(X_train_temp,Y_train)
            ## choose the alpha candidates
            #            alphas = np.linspace(lars_cv.alphas_[0], .1*lars_cv.alphas_[0], alphaNum)
            ## obtain scores of features coming with different alphas and combine them, max() used across all alphas's score
            #            clf1 = linear_model.RandomizedLasso(alpha=alphas, random_state=42,n_jobs=1).fit(X_train_temp,Y_train)
            # AIC-chosen alpha replaces the cross-validated grid above.
            clf1 = linear_model.RandomizedLasso(alpha='aic',
                                                random_state=33,
                                                n_jobs=1,
                                                verbose=True).fit(
                                                    X_train_temp, Y_train)
            ## sort the scores of features
            featureImportance = pd.DataFrame(zip(
                X_train_temp.columns, map(lambda x: round(x, 4),
                                          clf1.scores_)),
                                             columns=['name', 'importance'])
            featureImportance.sort_values(by='importance',
                                          ascending=False,
                                          inplace=True)
            featureImportance.index = range(featureImportance.shape[0])
            featureImportance.to_csv(featureRanking_la, index=False)
            # .ix is deprecated pandas indexing; column 0 below is 'name'.
            if fill_missing == True:
                returnData = pd.concat([
                    Y_train,
                    X_train_temp.ix[:, featureImportance.iloc[:featureNum, 0]]
                ],
                                       axis=1)
            else:
                returnData = pd.concat([
                    Y_train, X_train.ix[:, featureImportance.iloc[:featureNum,
                                                                  0]]
                ],
                                       axis=1)
            print 'Lasso feature ranking finish!'
        elif ranking_method == 'rf':
            if stable_test_rf == True:
                test_rf(X_train_temp, Y_train, nTrees, njobs,
                        maxFeaturePercent, featureRank_folder, nFeaturePlot)
            featureImportanceAndName = get_feature_importance_rf(
                X_train_temp, Y_train, nTrees, njobs, maxFeaturePercent)
            featureImportanceAndName.sort_values(by='importance',
                                                 ascending=False,
                                                 inplace=True)
            featureImportanceAndName.to_csv(featureRanking_rf, index=False)
            if fill_missing == True:
                returnData = pd.concat([
                    Y_train, X_train_temp.
                    ix[:, featureImportanceAndName['name'][:featureNum]]
                ],
                                       axis=1)
            else:
                returnData = pd.concat([
                    Y_train,
                    X_train.ix[:,
                               featureImportanceAndName['name'][:featureNum]]
                ],
                                       axis=1)
            print 'RF feature ranking finish!'
        print 'feature ranking done!'
        # Drop the target column so only selected features are returned.
        returnX = returnData.drop(targetName, axis=1, inplace=False)
        return returnX
# Feature scaling + model-based feature selection with LARS / lasso /
# RandomizedLasso.  `linearmodels` presumably aliases sklearn.linear_model;
# imports live outside this chunk.
#features.drop('zipcode',1,inplace=True)
#features.drop('lat',1,inplace=True)
#features.drop('long',1,inplace=True)

# Only the standard scaler is actually applied; the L2 normalizer is unused.
scalerNorm = Normalizer(norm='l2')
scalerStandard = StandardScaler().fit(features)
#scalerX.fit(features)
#features = scalerX.transform(features)
features = scalerStandard.transform(features)

print(features.shape)

# Fit LARS and lasso with 6-fold CV, then a RandomizedLasso over an alpha
# grid spanning [max_alpha, 0.1 * max_alpha].
Lars_cv = linearmodels.LarsCV(cv=6).fit(features, y)
Lasso_cv = linearmodels.LassoCV(cv=6).fit(features, y)
alphas = np.linspace(Lars_cv.alphas_[0], .1 * Lars_cv.alphas_[0], 6)
Randomized_lasso = linearmodels.RandomizedLasso(alpha=alphas, random_state=42)

linear_regression = linearmodels.LinearRegression()
linear_SVR = LinearSVR(loss='squared_epsilon_insensitive')

featureselector_Lars = feature_selection.SelectFromModel(Lars_cv, prefit=True)
featureselector_Lasso = feature_selection.SelectFromModel(Lasso_cv,
                                                          prefit=True)
featureselector_RLasso = Randomized_lasso.fit(features, y)

print(Lars_cv.coef_)
print(Lasso_cv.coef_)
print(Randomized_lasso.scores_)

# NOTE(review): snippet is truncated here -- the pd.DataFrame(...) call
# below is never closed in the scraped source.
scoreoffeature = pd.DataFrame(
    [Lars_cv.coef_, Lasso_cv.coef_, Randomized_lasso.scores_],

# L2-normalize X, pick an alpha grid from a LassoLarsCV fit, and build a
# SelectFromModel feature selector.  NOTE(review): `py` appears to alias
# numpy and `preprocessing` sklearn.preprocessing -- imports are outside
# this chunk; confirm.
scalerX = preprocessing.Normalizer(norm='l2')
standardScalerX = preprocessing.StandardScaler()
#scalerYa = preprocessing.Normalizer(norm='l2')
#scalerYg = preprocessing.Normalizer(norm='l2')
#scalerYf = preprocessing.Normalizer(norm='l2')

scalerX.fit(X)
X=scalerX.transform(X)
#X=standardScalerX.fit_transform(X)
print(X.shape)

lars_cv = linear_model.LassoLarsCV(cv=6).fit(X, yf)
# Alpha grid from the largest LARS alpha down to a tenth of it.
alphas = py.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
treeScore = linear_model.RandomizedLasso(alpha=alphas,random_state=42)
#treeScore.fit(X, yf)


#X=selector.fit_transform(X,yf)

#trees = ensemble.ExtraTreesRegressor(100).fit(X, yf)
print(lars_cv.coef_)
#lars_cv.coef_ = py.abs(lars_cv.coef_)

# Select features from the already-fitted LassoLarsCV model.
treeSelector = feature_selection.SelectFromModel(lars_cv,prefit=True)

#treeSelector = feature_selection.SelectFromModel(treeScoreSaver,prefit=True,threshold=0.5)

print(treeSelector.get_params())
# Example #10 (score: 0)
        "Features sorted by score, using {} resamplings: ".format(resamplings))
    feature_list = sorted(zip(map(lambda x: round(x, 4), rlogit.scores_),
                              cols),
                          reverse=True)
    for f in feature_list[
            0:25]:  # Adjust this if last feature output is nonzero
        print("{}:\t\t\t{:.2f}".format(f[1], f[0]))

# ### Entire dataset, LASSO for age as interest variable.

# In[68]:

# Notebook cell: score features for predicting AGE with RandomizedLasso.
X, y = df[cols], df.AGE

import warnings  # sklearn is using a deprecated rand function here,
with warnings.catch_warnings():  # and warnings clutter output
    warnings.simplefilter("ignore")
    # Many resamplings for a stable stability-selection score estimate.
    resamplings = 2000
    rlasso = linear_model.RandomizedLasso(n_resampling=resamplings)
    rlasso.fit(X, y)
    print(
        "Features sorted by score, using {} resamplings: ".format(resamplings))
    # Pair rounded scores with column names, highest score first.
    feature_list = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),
                              cols),
                          reverse=True)
    for f in feature_list[
            0:50]:  # Adjust this if last feature output is nonzero
        print("{}:\t\t\t{:.2f}".format(f[1], f[0]))

# In[ ]:
# Example #11 (score: 0)
# Load the pipeline output and score features for predicting AGE with
# RandomizedLasso stability selection.
data = pd.read_csv('../data/pipeline-full/ya-oa-full-linreg-02-24.csv')

# Use every column except identifiers / demographics as predictors.
cols = list(data.columns.values)
cols.remove('SUBJECT')
cols.remove('CLASS')
cols.remove('AGE')
cols.remove('SEX')

X = data[cols]
y = data.AGE

# NOTE(review): a negative alpha is not a valid lasso penalty -- presumably
# this was meant to be a log-scale value (e.g. exp(-15.4)); verify upstream.
alpha = -15.4
resamplings = 8

rlasso = linear_model.RandomizedLasso(alpha=alpha, n_resampling=resamplings)
rlasso.fit(X, y)

print("Features sorted by score, using {} resamplings: ".format(resamplings))
feature_list = sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), cols), reverse=True)
for f in feature_list:
	print(f)








# Example #12 (score: 0)
def feature_select(feature_ranking_choice, ranking_method, inputData,
                   targetName, featureRank_folder, featureNum):
    alphaNum = 6
    nTrees = 1000
    featureRanking_la = featureRank_folder + '/feature_importance_lasso.csv'
    afterSelectData_la = featureRank_folder + '/dataAfterSelect_lasso.h5'
    featureRanking_rf = featureRank_folder + '/feature_importance_rf.csv'
    afterSelectData_rf = featureRank_folder + '/dataAfterSelect_rf.h5'

    print '*' * 80
    if feature_ranking_choice == 0:
        print 'no feature ranking or selection!'
        return inputData
    elif feature_ranking_choice == 2:
        print 'previous feature ranking and selection result is loaded!'
        if ranking_method == 'lasso':
            return pd.read_hdf(afterSelectData_la, 'dataAfterSelect')
        elif ranking_method == 'rf':
            return pd.read_hdf(afterSelectData_rf, 'dataAfterSelect')
    elif feature_ranking_choice == 1:
        if inputData.isnull().sum().sum() > 0:
            inputData.fillna(-999, inplace=True)
            print 'missing data is temporarily filled by -999 in the feature selection process!'
        Y = inputData[targetName]
        X = inputData.drop([targetName], axis=1, inplace=False)
        #### L1-based feature selection
        if ranking_method == 'lasso':
            ## find best alpha through cross-valiation
            lars_cv = linear_model.LassoLarsCV(cv=6).fit(X, Y)
            ## choose the alpha candidates
            alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0],
                                 alphaNum)
            ## obtain scores of features coming with different alphas and combine them, max() used across all alphas's score
            clf1 = linear_model.RandomizedLasso(alpha=alphas,
                                                random_state=42,
                                                n_jobs=1).fit(X, Y)
            ## sort the scores of features
            featureImportance = pd.DataFrame(
                sorted(zip(map(lambda x: round(x, 4), clf1.scores_),
                           X.columns),
                       reverse=True))
            featureImportance.to_csv(featureRanking_la, index=False)
            store = pd.HDFStore(afterSelectData_la)
            store['dataAfterSelect'] = pd.concat(
                [Y, X.ix[:featureImportance.iloc[:featureNum, 1]]], axis=1)
            print 'Lasso feature ranking finish!'
        elif ranking_method == 'rf':
            rf1 = ensemble.RandomForestClassifier(n_estimators=nTrees,
                                                  criterion='gini',
                                                  max_features=0.1,
                                                  max_depth=3,
                                                  n_jobs=4,
                                                  verbose=1)
            rf2 = ensemble.RandomForestClassifier(n_estimators=nTrees,
                                                  criterion='gini',
                                                  max_features=0.1,
                                                  max_depth=5,
                                                  n_jobs=4,
                                                  verbose=1)
            rf3 = ensemble.RandomForestClassifier(n_estimators=nTrees,
                                                  criterion='gini',
                                                  max_features=0.1,
                                                  max_depth=7,
                                                  n_jobs=4,
                                                  verbose=1)
            rf4 = ensemble.RandomForestClassifier(n_estimators=nTrees,
                                                  criterion='entropy',
                                                  max_features=0.1,
                                                  max_depth=3,
                                                  n_jobs=4,
                                                  verbose=1)
            rf5 = ensemble.RandomForestClassifier(n_estimators=nTrees,
                                                  criterion='entropy',
                                                  max_features=0.1,
                                                  max_depth=5,
                                                  n_jobs=4,
                                                  verbose=1)
            rf6 = ensemble.RandomForestClassifier(n_estimators=nTrees,
                                                  criterion='entropy',
                                                  max_features=0.1,
                                                  max_depth=7,
                                                  n_jobs=4,
                                                  verbose=1)
            ## train random forest model
            rf1.fit(X, Y)
            rf2.fit(X, Y)
            rf3.fit(X, Y)
            rf4.fit(X, Y)
            rf5.fit(X, Y)
            rf6.fit(X, Y)
            ## note down the ranking of features based on the importances in different split criteria and max depth
            featureImportanceAverage = (
                rf1.feature_importances_ + rf2.feature_importances_ +
                rf3.feature_importances_ + rf4.feature_importances_ +
                rf5.feature_importances_ + rf6.feature_importances_) / 6
            sortedFeatureImportance = pd.DataFrame(
                featureImportanceAverage).sort_values(by=0, ascending=False)
            sortedFeatureNames = X.columns[sortedFeatureImportance.index]
            sortedFeatureImportance.index = range(X.shape[1])
            featureImportance = pd.concat([
                pd.DataFrame(sortedFeatureImportance),
                pd.DataFrame(sortedFeatureNames)
            ],
                                          axis=1)
            featureImportance.to_csv(featureRanking_rf, index=False)
            store = pd.HDFStore(afterSelectData_rf)
            store['dataAfterSelect'] = pd.concat(
                [Y, X.ix[:, featureImportance.iloc[:featureNum, 1]]], axis=1)
            print 'RF feature ranking finish!'
        return pd.concat([Y, X.ix[:, featureImportance.iloc[:featureNum, 1]]],
                         axis=1)
        print 'feature ranking done!'
# Example #13 (score: 0)
def run_simple_model(train_x, train_y, dev_x, dev_y, test_x, test_y, model_type, out_dir=None, class_weight=None):
    """Train the sklearn estimator named by `model_type`, log timings and
    train/test accuracy (predictions rounded to integer labels), and
    return the test accuracy.

    dev_x / dev_y are accepted for interface compatibility but unused.
    out_dir -- directory for the prediction dump; if None, the dump is
        skipped (previously this crashed with a TypeError).
    class_weight -- forwarded to classifiers that accept it.
    Raises NotImplementedError for an unknown model_type.
    """
    from sklearn import datasets, neighbors, linear_model, svm

    totalTime = 0

    startTrainTime = time()
    logger.info("Start training...")

    # Names resolved lazily with getattr so that a missing attribute on
    # this sklearn version only fails for the branch actually requested
    # (mirrors the original if/elif chain's behavior).
    _weighted_linear = (
        'LogisticRegression', 'LogisticRegressionCV',
        'PassiveAggressiveClassifier', 'Perceptron',
        'RidgeClassifier', 'RidgeClassifierCV', 'SGDClassifier',
        'logistic_regression_path',
    )
    _plain_linear = (
        'ARDRegression', 'BayesianRidge', 'ElasticNet', 'ElasticNetCV',
        'HuberRegressor', 'Lars', 'LarsCV', 'Lasso', 'LassoCV',
        'LassoLars', 'LassoLarsCV', 'LassoLarsIC', 'LinearRegression',
        'MultiTaskLasso', 'MultiTaskElasticNet', 'MultiTaskLassoCV',
        'MultiTaskElasticNetCV', 'OrthogonalMatchingPursuit',
        'OrthogonalMatchingPursuitCV', 'PassiveAggressiveRegressor',
        'RandomizedLasso', 'RandomizedLogisticRegression',
        'RANSACRegressor', 'Ridge', 'RidgeCV', 'SGDRegressor',
        'TheilSenRegressor', 'lars_path', 'lasso_path',
        'lasso_stability_path', 'orthogonal_mp', 'orthogonal_mp_gram',
    )
    if model_type in _weighted_linear:
        model = getattr(linear_model, model_type)(
            class_weight=class_weight).fit(train_x, train_y)
    elif model_type in _plain_linear:
        model = getattr(linear_model, model_type)().fit(train_x, train_y)
    elif model_type == 'LinearSVC':
        model = svm.LinearSVC(class_weight=class_weight).fit(train_x, train_y)
    elif model_type == 'SVC':
        model = svm.SVC(class_weight=class_weight, degree=3).fit(train_x, train_y)
    else:
        raise NotImplementedError('Model not implemented')

    logger.info("Finished training.")
    endTrainTime = time()
    trainTime = endTrainTime - startTrainTime
    logger.info("Training time : %d seconds" % trainTime)

    logger.info("Start predicting train set...")
    train_pred_y = model.predict(train_x)
    logger.info("Finished predicting train set.")
    logger.info("Start predicting test set...")
    test_pred_y = model.predict(test_x)
    logger.info("Finished predicting test set.")
    endTestTime = time()
    testTime = endTestTime - endTrainTime
    logger.info("Testing time : %d seconds" % testTime)
    totalTime += trainTime + testTime

    # Round so regression outputs can be scored as integer class labels.
    train_pred_y = np.round(train_pred_y)
    test_pred_y = np.round(test_pred_y)

    if out_dir is not None:
        # BUG FIX: with the default out_dir=None this concatenation used
        # to raise TypeError before any accuracy was reported.
        np.savetxt(out_dir + '/preds/best_test_pred' + '.txt', test_pred_y, fmt='%i')

    logger.info('[TRAIN] Acc: %.3f' % (accuracy_score(train_y, train_pred_y)))
    logger.info('[TEST]  Acc: %.3f' % (accuracy_score(test_y, test_pred_y)))

    return accuracy_score(test_y, test_pred_y)