Example #1
from hyperopt import hp
from hpsklearn import (ada_boost, gaussian_nb, knn,
                       linear_discriminant_analysis, random_forest, sgd,
                       xgboost_classification)

try:  # xgboost is an optional dependency
    import xgboost
except ImportError:
    xgboost = None


def bench_classifiers(name):
    # Build a hyperopt search space over several hpsklearn classifiers.
    classifiers = [
        ada_boost(name + '.ada_boost'),
        gaussian_nb(name + '.gaussian_nb'),
        knn(name + '.knn', sparse_data=True),
        linear_discriminant_analysis(name + '.linear_discriminant_analysis',
                                     n_components=1),
        random_forest(name + '.random_forest'),
        sgd(name + '.sgd')
    ]
    if xgboost:
        classifiers.append(xgboost_classification(name + '.xgboost'))
    return hp.choice(name, classifiers)
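bench_classifiers only defines a search space; nothing is fitted yet. A minimal usage sketch (assuming hpsklearn and hyperopt are installed; X_train and y_train are hypothetical arrays, not part of the original snippet):

from hpsklearn import HyperoptEstimator
from hyperopt import tpe

# Let TPE search over every classifier in the combined space.
estim = HyperoptEstimator(classifier=bench_classifiers('clf'),
                          algo=tpe.suggest,
                          max_evals=50)
estim.fit(X_train, y_train)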
Example #2
import pandas as pd
from hpsklearn import HyperoptEstimator, xgboost_classification
from hyperopt import tpe


def main():

    # column 0 holds the label; columns 2 onward hold the features
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    estim = HyperoptEstimator(classifier=xgboost_classification('myXG'),
                              algo=tpe.suggest,
                              max_evals=100,
                              trial_timeout=120,
                              verbose=True)

    estim.fit(X_train, y_train)

    print("\n\n{}\n\n".format(estim.score(X_test, y_test)))
    print("\n\n{}\n\n".format(estim.best_model()))
Example #3
def makeXGB(myaeid):
    try:
        # output print statements into file
        os.makedirs("/home/rlougee/Desktop/xgb_results/" + str(myaeid),
                    exist_ok=True)  # don't fail if the directory already exists
        sys.stdout = open(
            '/home/rlougee/Desktop/xgb_results/{}/{}_output.txt'.format(
                myaeid, myaeid), 'w')

        # prep data
        df = mc5_table[mc5_table['aeid'] == myaeid]
        df = df[['dsstox_compound_id', 'hitc']]
        df = handle_duplicates(df, 3)
        print("duplicates passed")
        df = fillfp(df, 1445)
        print("fillfp passed")
        # get file name for outputs
        name = myaeid
        print(name)

        # print(df)
        # declare variables
        y = np.array(df['hitc'])
        print(y)
        df = df.drop(['hitc', 'dsstox_compound_id'], axis=1)  # keyword axis: the positional form was removed in pandas 2.0
        X = np.array(df.values)
        print(X.shape)
        print(X)
        # X = X[:,~np.all(np.isnan(X), axis=0)]
        # print(X)

        # make test and train data
        X_, X_test, y_, y_test = train_test_split(X,
                                                  y,
                                                  test_size=0.2,
                                                  stratify=y)

        print('##########################')

        # calculate base parameters
        spw = len(y[y == 0]) / len(y[y == 1])  # scale_pos_weight: negatives per positive
        max_delta_step = 0  # hp.quniform('max_delta_step', 0, 1, .1)
        min_child_weight = hp.quniform('min_child', 50, 60, .1)
        # subsample = hp.quniform('subsample',0,1,0.1)
        colsample_bytree = 1  #hp.quniform('colsample_bytree',0.1,1,.1)
        # max_delta_step = hp.quniform('max_delta_step',0,10,1)
        gamma = hp.quniform('gamma', 0.5, 1, .01)
        learning_rate = hp.quniform('learning_rate', .01, .2, .001)
        n_estimators = sample(
            scope.int(hp.quniform('n_estimators', 3000, 4000,
                                  100)))  # sample() draws one concrete value
        max_depth = 100
        n_jobs = 30

        model = HyperoptEstimator(classifier=xgboost_classification(
            'my_clf',
            min_child_weight=min_child_weight,
            colsample_bytree=colsample_bytree,
            max_delta_step=max_delta_step,
            gamma=gamma,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            scale_pos_weight=spw,
            reg_alpha=0,
            reg_lambda=1,
            colsample_bylevel=1,
            subsample=1))

        model.fit(X_, y_)
        # Fragile hack: convert the tuned XGBClassifier's repr() into JSON via
        # string surgery, stripping keys (missing, nthread, silent, base_score)
        # that json.loads cannot parse.
        a = str(model.best_model()['learner']).replace('=', '":').replace(
            'XGBClassifier(',
            '{"').replace(', ', ', "').replace(',\n       ', ', "').replace(
                ')',
                '}').replace("'", '"').replace(' "missing":nan,', '').replace(
                    ' "nthread":None,',
                    '').replace(' "silent":True,',
                                '').replace('"base_score":0.5, ', '')
        print(a)

        params_final = json.loads(a)

        ###############################################
        final_model = xgb.XGBClassifier(**params_final)

        final_model.fit(X_, y_, verbose=False)

        # Performance on the train set (note: ROC AUC here is computed from
        # hard class predictions; predict_proba would give a smoother estimate)
        train_y_pred = final_model.predict(X_)
        auc = roc_auc_score(y_, train_y_pred)
        print("Performance train : ", auc)

        # Performance on the test set
        test_y_pred = final_model.predict(X_test)
        auc = roc_auc_score(y_test, test_y_pred)
        print("Performance test : ", auc)

        # Confusion-matrix metrics on the train set
        tn, fp, fn, tp = confusion_matrix(y_, final_model.predict(X_)).ravel()
        # Error rate
        err_rate = (fp + fn) / (tp + tn + fn + fp)
        print("Error rate on train set : ", err_rate)
        # Accuracy
        acc_ = (tp + tn) / (tp + tn + fn + fp)
        print("Accuracy on train set : ", acc_)
        # Sensitivity (true positive rate)
        sens_ = tp / (tp + fn)
        print("Sensitivity : ", sens_)
        # Specificity (true negative rate)
        sp_ = tn / (tn + fp)
        print("Specificity : ", sp_)
        # False positive rate (FPR)
        FPR = fp / (tn + fp)
        print("False positive rate : ", FPR)

        # Confusion-matrix metrics on the test set
        tn, fp, fn, tp = confusion_matrix(y_test,
                                          final_model.predict(X_test)).ravel()
        # Error rate
        err_rate = (fp + fn) / (tp + tn + fn + fp)
        print("Error rate on test set : ", err_rate)
        # Accuracy
        acc_ = (tp + tn) / (tp + tn + fn + fp)
        print("Accuracy on test set : ", acc_)

        # os.makedirs("/home/rlougee/Desktop/xgb_results/" + myaeid)
        # print(list(zip(X.columns[2:], final_model.feature_importances_)))
        # xgb.plot_tree(final_model, rankdir='LR')
        # plt.savefig("/home/rlougee/Desktop/xgb_results/{}/{}_treeplot".format(name, name), dpi=1200)
        # xgb.plot_importance(final_model, importance_type="weight")  # importance type: weight, gain, cover
        # plt.tight_layout()
        # plt.savefig("/home/rlougee/Desktop/xgb_results/{}/{}_featureimportance".format(name, name), dpi=1200)
        pickle.dump(
            final_model,
            open(
                "/home/rlougee/Desktop/xgb_results/{}/{}_model".format(
                    name, name), 'wb'))
        # loaded_model = pickle.load(open(file_name, 'rb'))
    except Exception as e:  # a bare except would silently swallow every error
        print('FAILURE: {}: {}'.format(myaeid, e))
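The repr-parsing step in this example is brittle. Since the tuned learner is an sklearn-compatible estimator, its hyperparameters can be read directly; a safer sketch (same model object as above, not part of the original code):

best_learner = model.best_model()['learner']     # the tuned XGBClassifier
params_final = best_learner.get_params()         # plain dict of hyperparameters
final_model = xgb.XGBClassifier(**params_final)  # rebuild without JSON surgery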
Example #4
        n_job = 6
        select_classes = [0, 1, 2, 3, 4, 5]
        val_dist = X_val_mini.shape[0] / X_train_mini.shape[0]
        name = 'my_est_oVa'

        tic_mod_all = time.time()
        select_alg = [
            ada_boost(name + '.ada_boost'),
            gaussian_nb(name + '.gaussian_nb'),
            knn(name + '.knn', sparse_data=True),
            linear_discriminant_analysis(name +
                                         '.linear_discriminant_analysis',
                                         n_components=1),
            random_forest(name + '.random_forest'),
            sgd(name + '.sgd'),
            xgboost_classification(name + '.xgboost')
        ]

        # fitting models
        estim_one_vs_rest = dict()
        # scoring models
        algo_scoring = dict()
        save_score_path = r'C:/Users/anden/PycharmProjects/NovelEEG/results'
        for alg in [select_alg[args.index]]:  # run only the classifier selected by a CLI index argument
            tic_mod = time.time()
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
                  "running on %s" % (alg.name + '.one_V_all'),
                  "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            clf_method = one_vs_rest(str(alg.name + '.one_V_all'),
                                     estimator=alg,
                                     n_jobs=1)
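The snippet is cut off after building clf_method. Going by the pattern in the other examples, the loop body would plausibly continue roughly as below; this is a hedged sketch, and y_train_mini is a hypothetical name, not from the original code:

# Hypothetical continuation: fit an estimator over the one-vs-rest search space
estim_one_vs_rest[alg.name] = HyperoptEstimator(classifier=clf_method,
                                                algo=tpe.suggest,
                                                trial_timeout=120)
estim_one_vs_rest[alg.name].fit(X_train_mini, y_train_mini)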
Example #5
            y_test = Y[test]
            y_train = Y[train]

        if num_feats != 0:
            sk_obj = SelectKBest(f_classif, k=num_feats)
            x_train = sk_obj.fit_transform(x_train, y_train)
            x_test = sk_obj.transform(x_test)

        if model_type == 'XGB':
            if num_classes == 2:
                objective = 'binary:logistic'
            else:
                objective = 'multi:softmax'
            if hyper_param:
                model = HyperoptEstimator(
                    classifier=xgboost_classification('xbc'),
                    preprocessing=[],
                    algo=tpe.suggest,
                    trial_timeout=200)
            else:
                model = XGBClassifier(learning_rate=1,
                                      n_estimators=10,
                                      objective=objective,
                                      silent=True,  # 'silent'/'nthread' are deprecated in xgboost >= 1.0
                                      nthread=num_threads)
            model.fit(x_train, y_train)
        elif model_type == 'SVM':
            from sklearn import svm
            if hyper_param:
                model = HyperoptEstimator(classifier=svc("mySVC"),
                                          preprocessing=[],
Example #6
    print('##########################')

    # calculate base parameters
    spw = len(y[y == 0]) / len(y[y == 1])  # scale_pos_weight: negatives per positive
    max_delta_step = 0  # hp.quniform('max_delta_step', 0, 1, .1)
    min_child_weight = hp.quniform('min_child', 50, 60, .1)
    # subsample = hp.quniform('subsample', 0, 1, 0.1)
    colsample_bytree = 1  # hp.quniform('colsample_bytree', 0.1, 1, .1)
    # max_delta_step = hp.quniform('max_delta_step', 0, 10, 1)
    gamma = hp.quniform('gamma', 0.5, 1, .01)
    learning_rate = hp.quniform('learning_rate', .01, .2, .001)
    n_estimators = sample(
        scope.int(hp.quniform('n_estimators', 3000, 4000, 100)))
    max_depth = 100
    n_jobs = 30

    model = HyperoptEstimator(classifier=xgboost_classification(
        'my_clf',
        min_child_weight=min_child_weight,
        colsample_bytree=colsample_bytree,
        max_delta_step=max_delta_step,
        gamma=gamma,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        scale_pos_weight=spw,
        reg_alpha=0,
        reg_lambda=1,
        colsample_bylevel=1,
        subsample=1))

    model.fit(X_, y_)
    # Same fragile repr()-to-JSON hack as in Example #3; reading
    # model.best_model()['learner'].get_params() would be safer.
    a = str(model.best_model()['learner']).replace('=', '":').replace('XGBClassifier(', '{"').replace(', ', ', "').replace(',\n       ', ', "').replace(')', '}').replace("'", '"').replace(' "missing":nan,', '').replace(' "nthread":None,', '').replace(' "silent":True,', '').replace('"base_score":0.5, ', '')
    print(a)
    import json
    params_final = json.loads(a)

    # print('##########################')
    #
    # cv1_params = {'min_child_weight':[3, 4, 5], 'subsample':[0, 0.5, 0.6, 1], 'colsample_bytree':[.6, .7, .8, .9], 'max_delta_step': [0],  'gamma':[x/10.0 for x in range(0, 5)]} #params to be tried in the grid search
    # fix_params = { 'max_depth': 14, 'n_estimators': 100, 'objective': 'binary:logistic', 'learning_rate':0.4, 'scale_pos_weight':spw}
    # csv1 = GridSearchCV(xgb.XGBClassifier(**fix_params), cv1_params, scoring ='roc_auc', cv=5, n_jobs=30)
    # csv1.fit(X_, y_)
    # # csv.grid_scores_
    # # print(csv1.cv_results_)