Example No. 1
# df_test = pd.read_csv(data_path('test.csv'))
#
# labels = df['TARGET']
# df_test_id = df_test['ID']
#
# colls = ['saldo_var30', 'var15', 'saldo_var5', 'ind_var30', 'var38', 'saldo_medio_var5_ult3', 'num_meses_var5_ult3', 'saldo_medio_var5_hace3', 'var36', 'num_meses_var39_vig_ult3', 'num_var30', 'num_var5', 'num_var4', 'num_var45_hace2']
# print(sorted(colls))
#
# df = df[colls]
# df_test = df_test[colls]

# poly = PolynomialFeatures(2)
# df = poly.fit_transform(df)
# df_test = poly.transform(df_test)

clf = GradientBoostingClassifier(verbose=3)

# clf = RandomForestClassifier()
clf.fit(df, labels)

scores = cross_validation.cross_val_score(clf,
                                          df,
                                          labels,
                                          cv=5,
                                          scoring='roc_auc')
print(scores.mean(), scores)

from src.submission import make_submission
make_submission('gradient_boosting.csv', df_test_id,
                clf.predict_proba(df_test))
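# Note: make_submission comes from this project's own src.submission module.
# A minimal stand-in, assuming it just writes an ID/TARGET CSV from the
# positive-class probabilities (the column names mirror the ones used above):
#
# def make_submission(filename, ids, proba):
#     pd.DataFrame({'ID': ids, 'TARGET': proba[:, 1]}).to_csv(filename, index=False)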
Example No. 2
    #Let's print the result
    percentage_hist = 0
    print("Proportion of selected features from each histone marker")
    for i in range(5):
        value = histone_marker[i][1] / 2.5
        print("%s : %f" % (histone_marker[i][0], value))
        percentage_hist += value
    print("Total  : %f " % percentage_hist)
    """ V. Improvement of the accuracy with different classifiers"""

    # First step: training the classifiers seen in class
    classifiers1 = [(RandomForestClassifier(), "Random Forest"),
                    (ExtraTreesClassifier(), "Extra-Trees"),
                    (AdaBoostClassifier(), "AdaBoost"),
                    (GradientBoostingClassifier(), "GB-Trees")]
    classifiers3 = [(KNeighborsClassifier(), "KNeighbors")]
    Results = []
    Predicted_data = []
    counter = 0
    # splitting the training data in two in order to test the accuracy
    X_train2, X_test2, y_train2, y_test2 = train_test_split(x_train,
                                                            y_train,
                                                            test_size=0.2)

    #implementing the 1st group of classifiers
    estimators = [100, 500, 1000]
    for clf, name in classifiers1:
        for est_val in estimators:
            clf.n_estimators = est_val
            #clf.n_jobs = -1
Example No. 3
joblib.dump([elo_bins, mg_quants], blundermodel_dir + 'groups.p')
features = [
    'side', 'halfply', 'moverscore', 'bestmove_is_capture',
    'bestmove_is_check', 'depth', 'seldepth', 'num_bestmoves',
    'num_bestmove_changes', 'bestmove_depths_agreeing', 'deepest_change'
]

modelnum = 0
for elo_name, elo_df in train_df.groupby(train_df['elo_groups']):
    msg('working on elo group %s, of size %i' % (elo_name, elo_df.shape[0]))

    msg('computing perfect-move model')
    gbc = GradientBoostingClassifier(min_samples_split=500,
                                     min_samples_leaf=300,
                                     n_estimators=NUM_ESTIMATORS,
                                     verbose=1,
                                     subsample=0.5,
                                     learning_rate=0.2)
    X = elo_df[features]
    y = (elo_df['clipped_movergain'] == 0)
    gbc.fit(X, y)
    joblib.dump([elo_name, 1.0, gbc], '%s%i.p' % (blundermodel_dir, modelnum))
    modelnum = modelnum + 1

    for mg_quant in mg_quants:
        msg('computing mg_quant %f' % mg_quant)
        gbr = GradientBoostingRegressor(loss='quantile',
                                        alpha=mg_quant,
                                        min_samples_split=500,
                                        min_samples_leaf=300,
                                        n_estimators=NUM_ESTIMATORS,
Example No. 4
    'U_behaviors_sum10', 'Item_sale10', 'Item_sale5', 'Item_sale3',
    'Item_sale1', 'car5', 'car4', 'car3', 'car2', 'car1', 'buy5', 'buy4',
    'buy3', 'buy2', 'buy1', 'I_order10', 'I_order5', 'I_order3', 'I_order1',
    'I_buyer10', 'I_buyer5', 'I_buyer3', 'I_buyer1', 'behav1', 'behav2',
    'behav3', 'behav4', 'last_time'
]

df_train = pd.read_csv("train_feature.csv")
df_validation = pd.read_csv("validation_feature.csv")
# Data normalization

ui = df_train[["user_id", "item_id"]]
samples = df_train[features]
target = df_train["tag"]
classifier = GradientBoostingClassifier(n_estimators=200,
                                        learning_rate=1.0,
                                        max_depth=5,
                                        random_state=0)
classifier.fit(samples, target)  # learn from the training data; no return value needed

validation_feature = df_validation[features]
x = classifier.predict(validation_feature)  # predict labels for the validation data
print(x)
validation_ui = df_validation[["user_id", "item_id"]]
validation_ui["tag"] = x
validation_result = validation_ui[validation_ui.tag == 1][[
    "user_id", "item_id"
]]
os.chdir('..')
validation_result.to_csv("predict_v_Gbrt.csv", index=False)
Example No. 5
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from plot_confusion_matrix import gen_confusion_matrix_figure

X, y = load_data()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1)

xgb = xgboost.XGBClassifier(objective="multi:softprob",
                            nthread=-1,
                            reg_alpha=0.7,
                            reg_lambda=0.05,
                            subsample=0.9)
gbrt = GradientBoostingClassifier(random_state=0)
forest = RandomForestClassifier(n_jobs=-1, random_state=0)
lr = LogisticRegression(C=0.03)
eclf = VotingClassifier(estimators=[('xgboost', xgb), ('gbrt', gbrt),
                                    ('forest', forest),
                                    ('logistic regression', lr)],
                        voting='soft',
                        weights=None)

classifier_list = [xgb, gbrt, forest, lr, eclf]

for clf in classifier_list:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    y_train_pred = clf.fit(X_train, y_train).predict(X_train)

    # Compute confusion matrix
    'dateOfBirth', 'popularity'
]]
got_target = got.loc[:, 'isAlive']

X_train, X_test, y_train, y_test = train_test_split(got_data,
                                                    got_target.values.ravel(),
                                                    test_size=0.1,
                                                    random_state=508,
                                                    stratify=got_target)

# Building a gbm
gbm = GradientBoostingClassifier(
    loss='deviance',
    learning_rate=1.5,
    n_estimators=100,
    max_depth=3,
    criterion='friedman_mse',
    warm_start=False,
    random_state=508,
)

gbm_basic_fit = gbm.fit(X_train, y_train)

gbm_basic_predict = gbm_basic_fit.predict(X_test)

# Training and Testing Scores
print('Training Score:', gbm_basic_fit.score(X_train, y_train).round(4))
print('Testing Score:', gbm_basic_fit.score(X_test, y_test).round(4))

cv_lr_3 = cross_val_score(gbm, got_data, got_target, cv=3, scoring='roc_auc')
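# The 3-fold AUC scores above are computed but never reported; a small follow-up
# to summarize them:
print('CV AUC mean:', cv_lr_3.mean().round(4), 'std:', cv_lr_3.std().round(4))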
Example No. 7
 '''
 This one almost always gets the best model. In v2, we switched to early stopping, which
 amounts to setting n_estimators to a very high value (1000) and fixing both
 validation_fraction and n_iter_no_change to the results derived from Bayes testing.
 max_depth and subsample were also always fixed in Bayes-mode testing. Learning rate
 and splitting still show some variability depending on the data type, so we left a
 couple of options in GridSearch.

 For some reason, it does seem to struggle specifically with homozygous SNVs. I wonder
 if the frequency of FP is just low enough to make it a challenge for this model type.
 '''
 #" Most data scientist see number of trees, tree depth and the learning rate as most crucial parameters" - https://www.datacareer.de/blog/parameter-tuning-in-gradient-boosting-gbm/
 #'''
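 # A minimal sketch of the early-stopping setup described above (the values here
 # are illustrative assumptions, not the tuned results): with n_iter_no_change set,
 # sklearn holds out validation_fraction of the training data and stops adding
 # trees once that score stops improving, so n_estimators acts only as an upper bound.
 #
 # early_stopping_gb = GradientBoostingClassifier(random_state=0,
 #                                                n_estimators=1000,
 #                                                validation_fraction=0.1,
 #                                                n_iter_no_change=10,
 #                                                tol=1e-4)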
 CLASSIFIERS.append((
     'GradientBoosting',
     GradientBoostingClassifier(random_state=0,
                                learning_rate=0.1,
                                loss='exponential',
                                max_depth=4,
                                max_features='sqrt',
                                n_estimators=200),
     {
         'random_state': [0],
         'n_estimators': [
             1000
         ],  #prior tests: 100, 200; OBSOLETE: since adding n_iter_no_change, just set to a big number
         'max_depth': [6],  #prior tests: 3, 4
         'learning_rate': [
             0.05, 0.1, 0.5
         ],  #prior tests: 0.01, 0.2; from bayes mode, all results were in the 0.04-0.2 range with the occasional "high" rate near 0.5
         'loss': ['exponential'],  #prior tests: 'deviance'
         'max_features': ['sqrt'],
         'min_samples_split': [
             2, 15, 50
Example No. 8
def gradient_boosting_classifier(X_train, y_train):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200, random_state=2)
    model.fit(X_train, y_train)
    return model
Example No. 9
File: main.py Project: wsqat/DM
def gradient_boosting_classifier(train_x, train_y):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(train_x, train_y)
    print(model.feature_importances_)  # show each feature's importance score; larger means more important
    return model
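# A usage sketch for the feature importances printed above (assuming train_x is a
# pandas DataFrame whose columns can be paired with the scores):
#
# model = gradient_boosting_classifier(train_x, train_y)
# for name, imp in sorted(zip(train_x.columns, model.feature_importances_),
#                         key=lambda t: t[1], reverse=True):
#     print('%s: %.4f' % (name, imp))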
def main():
    experiment_config = {
        'comment': 'Keel run',
        'experiment_repetitions': 5,
        'n_splits': 5,
        'random_seed': int(os.urandom(1)[0] / 255 * (2**32)),
    }

    classifiers = [('LR', LogisticRegression()),
                   ('GBM', GradientBoostingClassifier(), [{
                       'n_estimators': [50, 100, 200]
                   }]),
                   ('KNN', KNeighborsClassifier(), [{
                       'n_neighbors': [3, 5, 8]
                   }])]
    oversampling_methods = [
        ('None', None),
        ('RandomOverSampler', RandomOverSampler()),
        ('SMOTE', SMOTE(), [{
            'k_neighbors': [3, 5, 20]
        }]),
        ('B1-SMOTE', SMOTE(kind='borderline1'), [{
            'k_neighbors': [3, 5, 20]
        }]),
        ('B2-SMOTE', SMOTE(kind='borderline2'), [{
            'k_neighbors': [3, 5, 20]
        }]),
        (
            'KMeansSMOTE',
            KMeansSMOTE(),
            [
                {
                    'imbalance_ratio_threshold': [1, float('Inf')],
                    'density_power': [0, 2,
                                      None],  # None corresponds to n_features
                    'smote_args': [{
                        'k_neighbors': 3
                    }, {
                        'k_neighbors': 5
                    }, {
                        'k_neighbors': 20
                    }, {
                        'k_neighbors': float('Inf')
                    }],
                    'kmeans_args': [{
                        'n_clusters': 2
                    }, {
                        'n_clusters': 20
                    }, {
                        'n_clusters': 50
                    }, {
                        'n_clusters': 100
                    }, {
                        'n_clusters': 250
                    }, {
                        'n_clusters': 500
                    }],
                    'use_minibatch_kmeans': [True],
                    'n_jobs': [-1]
                },
                # SMOTE Limit Case
                {
                    'imbalance_ratio_threshold': [float('Inf')],
                    'kmeans_args': [{
                        'n_clusters': 1
                    }],
                    'smote_args': [{
                        'k_neighbors': 3
                    }, {
                        'k_neighbors': 5
                    }],
                    'use_minibatch_kmeans': [True],
                    'n_jobs': [-1]
                }
            ])
    ]

    datasets = read_csv_dir(cfg['dataset_dir'])
    experiment = BinaryExperiment(
        datasets,
        classifiers,
        oversampling_methods,
        n_jobs=-1,
        experiment_repetitions=experiment_config['experiment_repetitions'],
        random_state=experiment_config['random_seed'],
        n_splits=experiment_config['n_splits'],
        scoring=[
            'geometric_mean_score', 'average_precision', 'roc_auc', 'f1', 'fp',
            'fn', 'tp', 'tn'
        ])

    with warnings.catch_warnings():
        warnings.filterwarnings(action='ignore',
                                message='Adapting smote_args\.k_neighbors')
        experiment.run()

    path = cfg['results_dir']
    if 'session_id' not in globals():
        session_id = (datetime.utcnow() +
                      timedelta(hours=2, minutes=0)).strftime("%Y-%m-%d %Hh%M")

    os.makedirs('{}/{}'.format(path, session_id))

    experiment.save('{}/{}/experiment.p'.format(path, session_id))

    # stringify oversampling methods
    experiment_config['oversampling_methods'] = re.sub(
        '\\n *', ' ', str(oversampling_methods))
    # save experiment config
    pd.Series(experiment_config).to_csv('{}/{}/experiment_config.csv'.format(
        path, session_id))
Example No. 11
    def feature_selection(self, X, y, method):
        """
        purpose:    select feature
        input:  X:train data
                y:lable
                method: uesed method
        return:
        """
        X_indices = np.arange(X.shape[-1])

        score = []

        # Removing features with low variance

        # correlation coefficient
        # SelectKBest(lambda X,Y: np.array(map(lambda x: pearsonr(x, Y), X.T)).T, k=2).fit_transform(data, target)

        # mutual information
        # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(data, target)
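        # A Python 3-safe variant of the commented-out correlation-based selection
        # above (a sketch, assuming scipy.stats.pearsonr has been imported):
        # SelectKBest accepts any score_func that returns one score per feature.
        #
        # def abs_pearson_scores(data, target):
        #     return np.array([abs(pearsonr(data[:, i], target)[0])
        #                      for i in range(data.shape[1])])
        #
        # SelectKBest(abs_pearson_scores, k=2).fit_transform(X, y)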

        # Univariate feature selection (for classification)
        if method == 'chi-squared':
            skb = SelectKBest(chi2)
            skb.fit_transform(X, y)
            score = skb.scores_

        # Univariate feature selection (for regression)
        if method == 'f_regression':
            skb = SelectKBest(f_regression)
            skb.fit_transform(X, y)
            score = skb.scores_

        # L1-based feature selection (for classification)
        if method == 'LinearSVC':
            lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
            sfm = SelectFromModel(lsvc, prefit=True)
            X_new = sfm.transform(X)

        # L1-based feature selection (for regression)
        elif method == 'LassoCV':
            lasso = LassoCV().fit(X, y)
            score = lasso.coef_
            sfm = SelectFromModel(lasso, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for classification)
        elif method == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier()
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            print(clf.feature_importances_)
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for regression)
        elif method == 'ExtraTreesRegressor':
            clf = ExtraTreesRegressor()
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for classifier)
        elif method == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(learning_rate=0.01)
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for regression)
        elif method == 'GradientBoostingRegressor':
            clf = GradientBoostingRegressor(learning_rate=0.01)
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Print the feature ranking
        indices = np.argsort(score)[::-1]
        print("Feature ranking:")
        for f in X_indices:
            print("feature %d: %s  (%f)" %
                  (indices[f], self.columns[indices[f]], score[indices[f]]))

        #draw plot
        plt.figure()
        # plt.bar(indices, score, width=0.2, color='r')
        plt.barh(indices, score, height=0.2, color='r')
        plt.title(method)
        plt.xlabel("score")
        plt.ylabel("feature")
        plt.grid(axis='x')
        plt.show()

        pass
Example No. 12
def gbc_y():
    from sklearn.ensemble import GradientBoostingClassifier
    regressor_gb = GradientBoostingClassifier()
    regressor_gb.fit(X_train, y_train)
    y_pred_gb = regressor_gb.predict_proba(X_valid)
    return y_pred_gb[:, 1]
Example No. 13
def return_model(mode, **kwargs):
    
    
    if inspect.isclass(mode):
        assert getattr(mode, 'fit', None) is not None, 'Custom model family should have a fit() method'
        model = mode(**kwargs)
    elif mode=='logistic':
        solver = kwargs.get('solver', 'liblinear')
        n_jobs = kwargs.get('n_jobs', None)
        max_iter = kwargs.get('max_iter', 5000)
        model = LogisticRegression(solver=solver, n_jobs=n_jobs, 
                                 max_iter=max_iter, random_state=666)
    elif mode=='Tree':
        model = DecisionTreeClassifier(random_state=666)
    elif mode=='RandomForest':
        n_estimators = kwargs.get('n_estimators', 50)
        model = RandomForestClassifier(n_estimators=n_estimators, random_state=666)
    elif mode=='GB':
        n_estimators = kwargs.get('n_estimators', 50)
        model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=666)
    elif mode=='AdaBoost':
        n_estimators = kwargs.get('n_estimators', 50)
        model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666)
    elif mode=='SVC':
        kernel = kwargs.get('kernel', 'rbf')
        model = SVC(kernel=kernel, random_state=666)
    elif mode=='LinearSVC':
        model = LinearSVC(loss='hinge', random_state=666)
    elif mode=='GP':
        model = GaussianProcessClassifier(random_state=666)
    elif mode=='KNN':
        n_neighbors = kwargs.get('n_neighbors', 5)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif mode=='NB':
        model = MultinomialNB()
    elif mode=='linear':
        model = LinearRegression()  # LinearRegression has no random_state parameter
    elif mode=='ridge':
        alpha = kwargs.get('alpha', 1.0)
        model = Ridge(alpha=alpha, random_state=666)
    elif 'conv' in mode:
        tf.reset_default_graph()
        address = kwargs.get('address', 'weights/conv')
        hidden_units = kwargs.get('hidden_layer_sizes', [20])
        activation = kwargs.get('activation', 'relu')
        weight_decay = kwargs.get('weight_decay', 1e-4)
        learning_rate = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 1000)
        early_stopping= kwargs.get('early_stopping', 10)
        warm_start = kwargs.get('warm_start', False)
        batch_size = kwargs.get('batch_size', 256)
        kernel_sizes = kwargs.get('kernel_sizes', [5])
        strides = kwargs.get('strides', [5])
        channels = kwargs.get('channels', [1])
        validation_fraction = kwargs.get('validation_fraction', 0.)
        global_averaging = kwargs.get('global_averaging', 0.)
        optimizer = kwargs.get('optimizer', 'sgd')
        if mode=='conv':
            model = CShapNN(mode='classification', batch_size=batch_size, max_epochs=max_iter,
                          learning_rate=learning_rate, 
                          weight_decay=weight_decay, validation_fraction=validation_fraction,
                          early_stopping=early_stopping,
                         optimizer=optimizer, warm_start=warm_start, address=address,
                          hidden_units=hidden_units,
                          strides=strides, global_averaging=global_averaging,
                         kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
        elif mode=='conv_reg':
            model = CShapNN(mode='regression', batch_size=batch_size, max_epochs=max_iter,
                          learning_rate=learning_rate, 
                          weight_decay=weight_decay, validation_fraction=validation_fraction,
                          early_stopping=early_stopping,
                         optimizer=optimizer, warm_start=warm_start, address=address,
                          hidden_units=hidden_units,
                          strides=strides, global_averaging=global_averaging,
                         kernel_sizes=kernel_sizes, channels=channels, random_seed=666)
    elif 'NN' in mode:
        solver = kwargs.get('solver', 'adam')
        hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20,))
        if isinstance(hidden_layer_sizes, list):
            hidden_layer_sizes = list(hidden_layer_sizes)
        activation = kwargs.get('activation', 'relu')
        learning_rate_init = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 5000)
        early_stopping= kwargs.get('early_stopping', False)
        warm_start = kwargs.get('warm_start', False)
        if mode=='NN':
            model = MLPClassifier(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                activation=activation, learning_rate_init=learning_rate_init,
                                warm_start = warm_start, max_iter=max_iter,
                                early_stopping=early_stopping)
        if mode=='NN_reg':
            model = MLPRegressor(solver=solver, hidden_layer_sizes=hidden_layer_sizes,
                                activation=activation, learning_rate_init=learning_rate_init,
                                warm_start = warm_start, max_iter=max_iter, early_stopping=early_stopping)
    else:
        raise ValueError("Invalid mode!")
    return model
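# A usage sketch for the factory above (the keyword argument shown is the one the
# 'GB' branch reads; the data names are placeholders):
#
# model = return_model('GB', n_estimators=100)
# model.fit(X_train, y_train)
# print(model.score(X_test, y_test))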
Example No. 14
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

data_loc = 'data/uci_data/poker_hand/poker-hand-training-true.data.txt'
col_names = [
    'S1', 'C1', 'S2', 'C2', 'S3', 'C3', 'S4', 'C4', 'S5', 'C5', 'hand'
]
df = pd.read_csv(data_loc, names=col_names)
df_x = df.drop(col_names[-1], axis=1)
df_y = df[col_names[-1]]
'''
    We lower the learning rate and increase the number of estimators proportionally.
    The parameters tuned here may not be the optimal values, but they give a good
    benchmark (a grid-search sketch follows this example).
    '''

gb_tuned = GradientBoostingClassifier(random_state=1,
                                      learning_rate=0.05,
                                      n_estimators=500,
                                      max_depth=7,
                                      min_samples_split=2,
                                      min_samples_leaf=1,
                                      max_features=10,
                                      subsample=1)
scores = cross_val_score(gb_tuned, df_x, df_y, cv=5, scoring='accuracy')
print(scores)
print("mean: %0.6f, std: %0.6f" % (scores.mean(), scores.std()))
Example No. 15
# In[58]:


# Boosting


# In[69]:


#Boosting on oversampled data
from sklearn.ensemble import GradientBoostingClassifier
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_t, y_t)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_t, y_t)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))


# In[60]:


gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=0.75, max_features=2, max_depth=2, random_state=0)
gb_clf.fit(X_t, y_t)

print("Learning rate: ", learning_rate)
print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_t, y_t)))
Example No. 16
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=4)

# Average CV score on the training set was:0.7802373007044865
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
    Nystroem(gamma=0.2, kernel="cosine", n_components=10),
    GradientBoostingClassifier(learning_rate=0.5,
                               max_depth=4,
                               max_features=0.45,
                               min_samples_leaf=4,
                               min_samples_split=12,
                               n_estimators=100,
                               subsample=0.8500000000000001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example No. 17
def define_clfs_params():
    clfs = {
        'BG':
        BaggingClassifier(n_estimators=10),
        'RF':
        RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'LR':
        LogisticRegression(penalty='l1', C=1e5),
        'SVM':
        svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB':
        GradientBoostingClassifier(learning_rate=0.05,
                                   subsample=0.5,
                                   max_depth=6,
                                   n_estimators=10),
        'AB':
        AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                           algorithm="SAMME",
                           n_estimators=200),
        'DT':
        DecisionTreeClassifier(),
        'KNN':
        KNeighborsClassifier(n_neighbors=3),
        'NB':
        GaussianNB()
    }
    grid = {
        'BG': {
            'n_estimators': [10, 50]
        },
        'RF': {
            'n_estimators': [1, 10, 100, 1000, 10000],
            'max_depth': [1, 5, 10, 20, 50, 100],
            'max_features': ['sqrt', 'log2'],
            'min_samples_split': [2, 5, 10],
            'n_jobs': [-1]
        },
        'LR': {
            'penalty': ['l1', 'l2'],
            'C': [0.00001, 0.001, 0.1, 1, 10]
        },
        'SVM': {
            'C': [0.01, 0.1, 1],
            'kernel': ['linear']
        },
        'GB': {
            'n_estimators': [1, 10, 100],
            'learning_rate': [0.01, 0.1, 0.5],
            'subsample': [0.1, 0.5, 1.0],
            'max_depth': [1, 5, 20]
        },
        'AB': {
            'algorithm': ['SAMME', 'SAMME.R'],
            'n_estimators': [1, 10, 100]
        },
        'DT': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [1, 5, 10, 20, 50, 100],
            'min_samples_split': [2, 5, 10]
        },
        'KNN': {
            'n_neighbors': [1, 5, 10, 25, 50, 100],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree']
        },
        'NB': {}
    }
    return clfs, grid
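# A consumption sketch for the clfs/grid pairs above (a minimal example, assuming
# GridSearchCV is imported and X_train/y_train exist):
#
# clfs, grid = define_clfs_params()
# for key in ('RF', 'GB'):
#     search = GridSearchCV(clfs[key], grid[key], cv=3, scoring='roc_auc')
#     search.fit(X_train, y_train)
#     print(key, search.best_params_, round(search.best_score_, 4))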
Example No. 18
b_data=Binarizer(threshold=0.5).fit_transform(boston.data)
print(b_data[0:5,:])


# One-hot encode the boston target values; returns the encoded data
o_target=OneHotEncoder().fit_transform(boston.target.reshape(-1, 1))
print(o_target[0:5])



### Feature selection ###


#Variance-threshold selection; returns the data after feature selection
#The threshold parameter is the variance cutoff
VarianceThreshold(threshold=3).fit_transform(iris.data)

# Chi-squared test: select the K best features and return the data after selection
select_data=SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)

# Recursive feature elimination; returns the data after feature selection
# The estimator parameter is the base model
# n_features_to_select is the number of features to keep
RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(iris.data, iris.target)

#Feature selection with L1-penalized logistic regression as the base model
SelectFromModel(LogisticRegression(penalty="l1", C=0.1)).fit_transform(iris.data, iris.target)

#Feature selection with GBDT as the base model
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)
## Explore results

# Scikit-learn classification

## Step 1: Create and fit gradient boosting classifier

parameters = {'n_estimators': 120,
              'learning_rate': 0.12,
              'min_samples_split': 3,
              'min_samples_leaf': 2}

from sklearn.datasets import load_digits
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

gbc = GradientBoostingClassifier(**parameters)

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=28743)

gbc.fit(X_train, y_train)

## Step 2: Initialize Neptune

import neptune

neptune.init('shared/sklearn-integration', api_token='ANONYMOUS')

## Step 3: Create an Experiment

neptune.create_experiment(params=parameters,
    X1_re, y1_re, test_size=0.2, train_size=300)  #, random_state = 0)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2_re, y2_re, test_size=0.2, train_size=300)  #, random_state = 0)

sc = StandardScaler()
X1_train = sc.fit_transform(X1_train)
X2_train = sc.fit_transform(X2_train)

PCA1_train = PCA(n_components=8).fit(X1_train).transform(X1_train)
PCA2_train = PCA(n_components=8).fit(X2_train).transform(X2_train)

y1_train = np.array(y1_train).ravel()
y2_train = np.array(y2_train).ravel()

forest = RandomForestClassifier(max_depth=None)
forestBoost = GradientBoostingClassifier(max_depth=None)
MLP = MLPClassifier()
svm = SVC()
knn = KNeighborsClassifier()

Names = np.vstack((Names1, Names2))
Names = pd.DataFrame(Names, columns=["Name", "ExName"])
mydict = dict(zip(Names.Name, Names.ExName))

X1_columns = pd.DataFrame(X1_column_names, columns=["Name"])
X1_columns = X1_columns.replace(mydict)
X1_column_names = pd.DataFrame(X1_column_names)
Named = np.hstack((X1_column_names, X1_columns))

Named = pd.DataFrame(Named, columns=["Name", "ExName"])
Example No. 21
pred = pipe.predict(X_test)

# In[35]:

pipe.predict(['I think I am going to love it here!'])

# In[37]:

accuracy_score(y_test, pred)

# In[64]:

abc = AdaBoostClassifier()
bag = BaggingClassifier()
gbc = GradientBoostingClassifier()
rfc = RandomForestClassifier()
lr = LogisticRegression()

# In[65]:

lr.fit(X_train, y_train)

# In[68]:

lr_pred = lr.predict(X_test)

# In[69]:

abc.fit(X_train, y_train)
Example No. 22
clf_RF = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
y_predict = clf_RF.predict(X_test)
fpr_RF, tpr_RF, thr_RF = roc_curve(y_test, y_predict)
pr_RF, rec_RF, thr_RF = precision_recall_curve(y_test, y_predict, pos_label=1)

delta_RF = datetime.now() - startTime
i = list(thr_RF).index(1)
print('\tPrecision: {0}'.format(pr_RF[i]))
print('\tRecall: {0}'.format(rec_RF[i]))
print('\tTime: {0}'.format(delta_RF))

# Run Gradient Boosting
print('Running Gradient Boosting...')
startTime = datetime.now()

clf_GB = GradientBoostingClassifier(n_estimators=100).fit(X_train, y_train)
y_predict = clf_GB.predict(X_test)
fpr_GB, tpr_GB, _ = roc_curve(y_test, y_predict)
pr_GB, rec_GB, thr_GB = precision_recall_curve(y_test, y_predict)

delta_GB = datetime.now() - startTime
i = list(thr_GB).index(1)
print('\tPrecision: {0}'.format(pr_GB[i]))
print('\tRecall: {0}'.format(rec_GB[i]))
print('\tTime: {0}'.format(delta_GB))

# ===========================================================================
# Repeat process with an oversampled balanced dataset
print('Running models with oversampled balanced dataset...')
# Count number of fraud and non-fraud data points
min_idx = y_train == 1
def train_model(datasetvar, dataset):
    x = datasetvar
    y = dataset['Churn'].values
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    print(sss)
    print('Number of train/test splits:', sss.get_n_splits(x, y))

    # Build the training and test sets
    for train_index, test_index in sss.split(x, y):
        print('train:', train_index, 'test:', test_index)
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]  # y is a NumPy array, so index it directly

        print('Original feature shape:', x.shape, 'training feature shape:',
              x_train.shape, 'test feature shape:', x_test.shape)

        print('Original label shape:', y.shape, 'training label shape:',
              y_train.shape, 'test label shape:', y_test.shape)

    # Use classification algorithms; ten classifiers are tried here
    Classifier = [['Random Forest', RandomForestClassifier()],
                  ['Support Vector Machine', SVC()],
                  ['LogisticRegression',
                   LogisticRegression()],
                  ['KNN', KNeighborsClassifier(n_neighbors=5)],
                  ['Navie Bayes', GaussianNB()],
                  ['Decision Tree', DecisionTreeClassifier()],
                  ['AdaBoostClassifier',
                   AdaBoostClassifier()],
                  ['GradientBoostingClassifier',
                   GradientBoostingClassifier()], ['XGB',
                                                   XGBClassifier()],
                  ['CatBoost',
                   CatBoostClassifier(logging_level='Silent')]]

    # Train the models
    Classify_result = []
    names = []
    prediction = []
    for name, classifier in Classifier:
        classifier = classifier
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        class_eva = pd.DataFrame([recall, precision])
        Classify_result.append(class_eva)
        name = pd.Series(name)
        names.append(name)
        y_pred = pd.Series(y_pred)
        prediction.append(y_pred)

    # Collate the evaluation results
    names = pd.DataFrame(names)
    names = names[0].tolist()
    result = pd.concat(Classify_result, axis=1)
    result.columns = names
    result.index = ['recall', 'precision']  # two metrics are collected per model
    print(result)

    # Apply the model
    pred_x = datasetvar.tail(10)

    # Extract customerID
    pred_id = telcom_id.tail(10)

    # Use Naive Bayes to predict churn for the prediction dataset
    model = GaussianNB()
    model.fit(x_train, y_train)
    pred_y = model.predict(pred_x)

    # Prediction results
    predDf = pd.DataFrame({'customerID': pred_id, 'Churn': pred_y})
    print(predDf)
# stack base predicts for training meta model
#stacked_predictions = np.column_stack((rf_fit.predict(x_train),et_fit.predict(x_train),ada_fit.predict(x_train),gb_fit.predict(x_train),svc_fit.predict(x_train)))


# train meta model
from sklearn.linear_model import LinearRegression
#meta_model = LinearRegression()
#meta_model.fit(stacked_predictions, t_train)
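# A runnable version of the stacking idea sketched in the comments above (assuming
# rf_fit, et_fit, ada_fit, gb_fit and svc_fit are already-fitted base models and
# x_train/t_train are the training data):
#
# import numpy as np
# stacked_predictions = np.column_stack(
#     [m.predict(x_train) for m in (rf_fit, et_fit, ada_fit, gb_fit, svc_fit)])
# meta_model = LinearRegression()
# meta_model.fit(stacked_predictions, t_train)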
from sklearn import preprocessing
satsuki = pd.read_csv('haruten.csv', index_col=0)
mm = preprocessing.MinMaxScaler()  # create the scaler instance
satsuki_seiki = mm.fit_transform(satsuki)
arima = pd.read_csv('arima.csv', index_col=0)
from sklearn.ensemble import VotingClassifier

estimators = [
    ('svc', SVC()),
    ('rf', RandomForestClassifier()),
    ('et', ExtraTreesClassifier()),
    ('ada', AdaBoostClassifier()),
    ('gb', GradientBoostingClassifier()),
]

sum = 0
buy = 0
voting = VotingClassifier(estimators)
voting.fit(x, t)
print(voting.predict(satsuki_seiki))
    return df


df = pd.read_csv('drugsCom_raw/drugsComTrain_raw.tsv', sep='\t', index_col=0)
df['date'] = pd.to_datetime(df['date'])
df = rm_sym(df)
df_tem2 = df.sample(20000)
#df_tem2.groupby('rating_cate').size() / df_tem2.groupby('rating_cate').size().sum()

## Generate table of words with their counts
con_vec = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
X_train = con_vec.fit_transform(df_tem2['review'])
y_train = df_tem2['rating_cate']

## test set
test = pd.read_csv("drugsCom_raw/drugsComTest_raw.tsv", sep='\t', index_col=0)
test = rm_sym(test)
X_test = con_vec.transform(test['review'])
y_test = test['rating_cate']

pickle.dump(con_vec, open("gbc_20000_600_tfidf.sav", 'wb'))

gbc = GradientBoostingClassifier(n_estimators=600)
gbc.fit(X_train, y_train)
y_test_predict = gbc.predict(X_test)
acc = accuracy_score(y_test, y_test_predict)
with open("gbc_20000_600_accuracy.txt", 'w') as outfile:
    outfile.write(str(acc))

pickle.dump(gbc, open("gbc_20000_600_gbc.sav", 'wb'))
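# A later-use sketch for the two artifacts pickled above: reload the vectorizer and
# the model together so that new reviews can be scored (file names match the dumps):
#
# con_vec = pickle.load(open("gbc_20000_600_tfidf.sav", 'rb'))
# gbc = pickle.load(open("gbc_20000_600_gbc.sav", 'rb'))
# print(gbc.predict(con_vec.transform(["worked well with no side effects"])))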
Example No. 26
param['bst:eta'] = 0.1
param['bst:max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 4

plst = list(param.items()) + [('eval_metric', 'ams@0.15')]

watchlist = [(xgmat, 'train')]
# boost 10 trees
num_round = 10
print('loading data end, start to boost trees')
print("training GBM from sklearn")
tmp = time.time()
gbm = GradientBoostingClassifier(n_estimators=num_round,
                                 max_depth=6,
                                 verbose=2)
gbm.fit(data, label)
print("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
#raw_input()
print("training xgboost")
threads = [1, 2, 4, 16]
for i in threads:
    param['nthread'] = i
    tmp = time.time()
    plst = list(param.items()) + [('eval_metric', 'ams@0.15')]
    bst = xgb.train(plst, xgmat, num_round, watchlist)
    print("XGBoost with %d thread costs: %s seconds" %
          (i, str(time.time() - tmp)))

print('finish training')
from General.Paths import Gitlab_Path
import pandas as pd
from Scoring.scoring_func import f1_scores_plot
import numpy as np
from time import time

fold1_df = load_dataframe(filename='fold1_NA_features.dat')
fold2_df = load_dataframe(filename='fold2_NA_features.dat')

del fold1_df['id']
del fold2_df['id']

n_features = int(len(fold1_df.columns) / 4)
p0 = time()
clf = GradientBoostingClassifier('deviance',
                                 learning_rate=0.05,
                                 n_estimators=100,
                                 max_features=n_features)

clf.fit(fold1_df.iloc[:, 1:], fold1_df.iloc[:, 0])
preds_ens = clf.predict_proba(fold2_df.iloc[:, 1:])[:, 1]
print(time() - p0)

## Ensemble the predictions
true_values = fold2_df['label']
df, best_index = f1_scores_plot(preds_ens, true_values)
df['f1_score'][best_index]  #Li

### Check perfomance on fold3
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']
dw_cols = [x for x in fold1_df.columns if x[-2:] == 'dw' and x[:3] == 'pca']
Example No. 28
for i in corr_mat:
    for j in corr_mat:
        if (i == j):
            continue

        else:
            if (corr_mat[i][j] > 0.2):
                a.add(i)
print(a)

sve = SVC()
sve.fit(data_pd, Y_train)
print(sve.score(data_pd1, Y_test))
print(sve.score(data_pd, Y_train))

grb = GradientBoostingClassifier()
grb.fit(data_pd, Y_train)
print(grb.score(data_pd1, Y_test))
print(grb.score(data_pd, Y_train))

cor_matt = data_pd.corr()
eig_vals, eig_vecs = np.linalg.eig(cor_matt)
#print(eig_vals)
#print('sdaddddddddddddddd')
#print(eig_vecs)
'''fitting and transforming PCA'''
pca = PCA(n_components=9)
train_features = pca.fit_transform(data_pd)
test_features = pca.transform(data_pd1)

sve1 = SVC()
Example No. 29
# List of comments
comments = []

# https://stackoverflow.com/questions/49100615/nltk-detecting-whether-a-sentence-is-interogative-or-not
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()
posts_text = [post.text for post in posts]
#divide train and test in 80 20
train_text = posts_text[:int(len(posts_text) * 0.8)]
test_text = posts_text[int(len(posts_text) * 0.8):]  # last 20% for testing
#Get TFIDF features
vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             min_df=0.001,
                             max_df=0.7,
                             analyzer='word')
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)
y = [post.get('class') for post in posts]
y_train = y[:int(len(posts_text) * 0.8)]
y_test = y[int(len(posts_text) * 0.8):]
gb = GradientBoostingClassifier(n_estimators=400, random_state=0)
gb.fit(X_train, y_train)

question_comments = []
for comment in comments:
    type_of_comment = gb.predict(vectorizer.transform([comment]))
    if (type_of_comment == 'ynQuestion' or type_of_comment == 'whQuestion'
            or '?' in comment):
        question_comments.append(comment)
question_comments
plt.savefig('plt_heatmap_svc.png', bbox_inches='tight')
plt.show() 
         
sns.heatmap(table, mask=mask, vmax=.65, square=True, cmap="RdBu_r")       




"""
    Gradient Boosting ---------------------------------------------------------

"""

from sklearn.ensemble import GradientBoostingClassifier

clf_fb = GradientBoostingClassifier(n_estimators = 200) #0.001,1000
clf_fb.fit(regressors_train_pca, target_train_bin)           

target_validation_bin_predicted_gb = clf_fb.predict(regressors_validation_pca)

# Accuracy of Predictions
accuracy_score(target_validation_bin, target_validation_bin_predicted_gb)
# Confusion Matrix
print(confusion_matrix(target_test.Box_Office_Range_Bins, target_validation_bin_predicted_gb))




"""

    Neural Network --------------------------------------------------------------