Example #1
propconfmat = svc_confmat.astype(float)  # float copy so the row proportions are not truncated to ints
for i in range(propconfmat.shape[0]):
    propconfmat[i] = 100 * propconfmat[i] / confsumh[i]
svc_ypredconfprob_all = (propconfmat / 100)
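
# For reference, newer scikit-learn (>= 0.22) can produce the same row-normalized
# matrix directly; a minimal equivalent of the loop above (the name
# svc_ypredconfprob_check is only for illustration):
svc_ypredconfprob_check = m.confusion_matrix(ytest, svc_ypred, normalize='true')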

print("SVM ====> original score, ", m.accuracy_score(ytest, svc_ypred))

#===============================================================
#===============================================================
#===============================================================
#===============================================================

#train xgb model
import xgboost as xgb

xgbclf = xgb.XGBClassifier(random_state=seed, n_estimators=1)
xgbclf.fit(xtrain, ytrain)
xgb_ypred = xgbclf.predict(xtest)
xgb_confmat = m.confusion_matrix(ytest, xgb_ypred)
xgb_ypredprob = xgbclf.predict_proba(xtest)

confsumh = np.sum(xgb_confmat, axis=1)
propconfmat = xgb_confmat.astype(float)  # float copy so the row proportions are not truncated to ints
for i in range(propconfmat.shape[0]):
    propconfmat[i] = 100 * propconfmat[i] / confsumh[i]
xgb_ypredconfprob_all = (propconfmat / 100)

print("xgb ====> original score, ", m.accuracy_score(ytest, xgb_ypred))

#===============================================================
#XG <- xgboost(data = train, label = result, params =param, nrounds = 100)


from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


#clf1 = SVC(C = 0.99, kernel = 'linear', probability = True, verbose = 1) 
# SVC is slow
clf2 = RandomForestClassifier(random_state = 13, n_estimators = 200, verbose = 1)
clf3 = LogisticRegression(random_state = 13, verbose = 1)
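
# The XGBoost classifier below weights the positive class by sumneg / sumpos.
# A hedged sketch of how those counts could be derived from the training labels,
# assuming binary 0/1 labels in y_train (skip if they are already defined upstream):
sumpos = int((y_train == 1).sum())
sumneg = int((y_train == 0).sum())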
#clf4 = MultinomialNB(alpha = 0.1)
clf4 = xgb.XGBClassifier(max_depth = 25, learning_rate = 0.1, objective = "binary:logistic",
                    scale_pos_weight = sumneg / sumpos, silent = 0, nthread = 16,
                    max_delta_step = 4, subsample = 0.8, min_child_weight = 2, seed = 13,
                    n_estimators = 100, verbose = 1) 

#eclf_hard = VotingClassifier(estimators=[('SVC', clf1), ('rf', clf2), ('lr', clf3), ('xgb', clf4)], voting = 'hard')
#eclf_soft = VotingClassifier(estimators=[('SVC', clf1), ('rf', clf2), ('lr', clf3), ('xgb', clf4)], voting = 'soft')

eclf_hard = VotingClassifier(estimators=[('rf', clf2), ('lr', clf3), ('xgb', clf4)], voting = 'hard')
eclf_soft = VotingClassifier(estimators=[('rf', clf2), ('lr', clf3), ('xgb', clf4)], voting = 'soft')


result = pd.DataFrame()
#for clf, label in zip([clf1, clf2, clf3, clf4, eclf_hard, eclf_soft], ['SVC', 'Random Forest', 'Logistic Reg', 'XGBBoosting', 'Ensemble_hard', 'Ensemble_soft']):
for clf, label in zip([clf2, clf3, clf4, eclf_hard, eclf_soft], ['Random Forest', 'Logistic Regression', 'XGBoost', 'Hard Voting', 'Soft Voting']):
    print(label)
    clf.fit(x_train, y_train)
    if label == 'Hard Voting':
Example #3
    # eq_data_menor += variables_per_class[2]
    # eq_data_menor += variables_per_class[3]
    # eq_data_menor += variables_per_class[4]
    # eq_data_menor += variables_per_class[5]
    # eq_data_menor += variables_per_class[6]
    # eq_data_menor = np.array(eq_data_menor)

    X_train_menor, X_test_menor, y_train_menor, y_test_menor = train_test_split(
        data[:, 1:55], data[:, 55], test_size=test_avg)
    normalizer = Normalizer().fit(X_train_menor)
    X_train_menor = normalizer.transform(X_train_menor)
    X_test_menor = normalizer.transform(X_test_menor)
    ##

    ##
    bst = xgb.XGBClassifier()
    bst.fit(X_train_menor, y_train_menor)
    y_pred = bst.predict(X_test_menor)
    # print(confusion_matrix(y_test_menor, y_pred))
    print(classification_report(y_test_menor, y_pred))
    print('{}_XGB_m - {}'.format(iteration,
                                 accuracy_score(y_test_menor, y_pred)))
    final_predictions = bst.predict(predict[:, 1:])
    with open(r'Output\torrent\torrent_0{}_XGB_m.txt'.format(iteration),
              'w') as file:
        with open(r'Data\Estimar_UH2020.csv', 'r') as read:
            for i in range(len(final_predictions)):
                line = read.readline().split('|')
                file.write('{}|{}'.format(line[0],
                                          dictio_i[final_predictions[i]]))
                if line[0] not in avg_prediction:
Example #4
    # Per-class counts (based on the Target column)
    print("\nDataset shape: ", X.shape, " Number of features: ", X.shape[1])
    num_categories = np.unique(y).size
    sum_y = np.asarray(np.unique(y.astype(int), return_counts=True))
    df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
    print('\n', df_sum_y)

    # Initialize the classifier dictionary
    clf = {
        'lgb':
        lgb.LGBMClassifier(random_state=args.randomseed,
                           n_jobs=-1,
                           boosting_type='gbdt'),
        'xgb':
        xgb.XGBClassifier(booster='gblinear',
                          objective='binary:logistic',
                          n_jobs=-1,
                          random_state=args.randomseed),
        'gdbt':
        GradientBoostingClassifier(random_state=args.randomseed),
        'rf':
        RandomForestClassifier(n_jobs=-1, random_state=args.randomseed),
        'ext':
        ExtraTreesClassifier(n_estimators=100,
                             n_jobs=-1,
                             random_state=args.randomseed),
        'knn':
        KNeighborsClassifier(n_jobs=-1),
        'nb':
        GaussianNB(),
        'svm':
        SVC(kernel='sigmoid', random_state=args.randomseed)
    }

Example #5

lModels.append(('KNN-Clf', KNeighborsClassifier()))
lModels.append(('LogRegr', LogisticRegression(random_state=707)))
lModels.append(('DecTree', DecisionTreeClassifier(random_state=707)))
lModels.append(('GNBayes', GaussianNB()))
lModels.append(
    ('RandomForestClassifier', RandomForestClassifier(random_state=707)))
lModels.append(('GradientBoostingClassifier',
                GradientBoostingClassifier(n_estimators=100,
                                           learning_rate=1.0,
                                           max_depth=1,
                                           random_state=707)))
lModels.append(('AdaBoostClassifier',
                AdaBoostClassifier(n_estimators=100, random_state=707)))
lModels.append(('XGBoostClassifier',
                xgb.XGBClassifier(booster='gbtree',
                                  objective='multi:softprob',
                                  verbosity=0,
                                  seed=707)))
for vModel in lModels:
    print(vModel)
print("Done ...")

################################
# Classification - cross validation
###############################

# blank list to store results
print("\n*** Cross Validation Init ***")
xvModNames = []
xvAccuracy = []
xvSDScores = []
print("Done ...")
Example #6
def xgboost_forecast(input_covariates,
                     training_window_end,
                     num_forecast_steps,
                     smooth_coef = 1.0,
                     max_num_training_samples = 100000,
                     max_num_val_samples = 10000,
                     num_threads = 1):
  """Forecast covariates using XGBoost classification or regression.

  Args:
    input_covariates: Array of past covariate values, with one row per covariate
      series and one column per timestep.
    training_window_end: The number of points in the training data.
    num_forecast_steps: Number of forecasting steps.
    smooth_coef: Smoothing coefficient.
    max_num_training_samples: Maximum number of training samples.
    max_num_val_samples: Maximum number of validation samples.
    num_threads: The number of parallel threads with which XGBoost should run.

  Returns:
    An array of covariates for the input feature.
  """
  # Hyperparameter search space
  lr_candidate = [0.05, 0.1]
  n_estimators_candidate = [20, 100]
  max_depth_candidate = [5]
  subsample_candidate = [1.0]
  lambda_candidate = [1.0, 0.01]

  # Decide whether to treat the problem as classification or
  # regression based on the number of categories for labels.
  unique = np.unique(input_covariates.flatten())
  num_unique_values = unique.size
  if num_unique_values > 10:
    problem_type = "regression"
    # Note that the range may
    # not be in [0, 1] and with increasing and decreasing trends, covariate
    # values may go beyond the training data range.
    min_value = 0.0
    max_value = 1.0
  elif num_unique_values > 1:
    problem_type = "classification"
    le = preprocessing.LabelEncoder()
    le.fit(input_covariates.flatten())
  else:
    # If there is only one value return that value
    return (np.ones(
        (input_covariates.shape[0], num_forecast_steps)) * unique[0])

  if problem_type == "regression":
    # Smoothing
    input_covariates_smoothed = np.copy(input_covariates)
    for t in range(1, input_covariates.shape[1] - 1):
      input_covariates_smoothed[:, t] = (
          smooth_coef * input_covariates_smoothed[:, t] + (1.0 - smooth_coef) *
          (input_covariates_smoothed[:, t - 1]))
  else:
    input_covariates_smoothed = input_covariates

  # Construct datasets.

  # Full-length runs vs. short-term integration tests with little data.
  if training_window_end > 5 * num_forecast_steps:
    lags = [1, 2, 3, 4, 5, 6, 7, 14, 21, 28]
    averages = [3, 5, 7, 14, 21, 28]
    max_windows = [3, 5, 7, 14, 21, 28]
    if problem_type == "classification":
      training_window_beginning = num_forecast_steps
    else:
      training_window_beginning = max(
          training_window_end - 5 * num_forecast_steps, 0)
  else:
    lags = [1, 2, 3, 4, 5, 6, 7]
    averages = [3, 5, 7]
    max_windows = [3, 5, 7]
    training_window_beginning = 8

  # Train
  train_range_end = max(training_window_end - 2 * num_forecast_steps,
                        training_window_beginning + 1)
  if input_covariates.shape[1] - train_range_end <= num_forecast_steps:
    raise ValueError(
        f"Cannot forecast covariates {num_forecast_steps} days with only "
        f"{input_covariates.shape[1]} days of data")
  train_features, train_labels = generate_prediction_features(
      input_covariates,
      input_covariates_smoothed,
      range(training_window_beginning, train_range_end),
      num_forecast_steps,
      is_training=True,
      lags=lags,
      averages=averages,
      max_windows=max_windows)

  num_training_points, label_dim = train_labels.shape
  if num_training_points > max_num_training_samples:
    np.random.seed(seed=1)
    indices = np.random.choice(train_features.shape[0],
                               max_num_training_samples)
    train_features = train_features[indices, :]
    train_labels = train_labels[indices, :]
    num_training_points = max_num_training_samples

  # Validation
  val_features, val_labels = generate_prediction_features(
      input_covariates,
      input_covariates_smoothed,
      range(train_range_end,
            max(training_window_end - num_forecast_steps, train_range_end + 1)),
      num_forecast_steps,
      is_training=True,
      lags=lags,
      averages=averages,
      max_windows=max_windows)

  num_val_points = val_features.shape[0]
  if num_val_points > max_num_val_samples:
    np.random.seed(seed=1)
    indices = np.random.choice(val_features.shape[0], max_num_val_samples)
    val_features = val_features[indices, :]
    val_labels = val_labels[indices, :]
    num_val_points = max_num_val_samples

  # Test
  test_features, _ = generate_prediction_features(
      input_covariates,
      input_covariates_smoothed,
      range(training_window_end, training_window_end + 1),
      num_forecast_steps,
      is_training=False,
      lags=lags,
      averages=averages,
      max_windows=max_windows)
  num_test_points = test_features.shape[0]

  optimal_err = 1e128
  for lr in lr_candidate:
    for n_estimators in n_estimators_candidate:
      for max_depth in max_depth_candidate:
        for subsample in subsample_candidate:
          for lambda_v in lambda_candidate:

            if problem_type == "regression":
              multioutputpredictor = MultiOutputRegressor(
                  xgboost.XGBRegressor(
                      learning_rate=lr,
                      n_estimators=n_estimators,
                      max_depth=max_depth,
                      subsample=subsample,
                      reg_lambda=lambda_v,
                      objective="reg:squarederror",
                      n_jobs=num_threads)).fit(train_features, train_labels)
            elif problem_type == "classification":
              multioutputpredictor = MultiOutputClassifier(
                  xgboost.XGBClassifier(
                      learning_rate=lr,
                      n_estimators=n_estimators,
                      max_depth=max_depth,
                      subsample=subsample,
                      reg_lambda=lambda_v,
                      n_jobs=num_threads)).fit(
                          train_features,
                          le.transform(
                              train_labels.reshape(
                                  (num_training_points * label_dim))).reshape(
                                      (num_training_points, label_dim)))

            val_predicted = multioutputpredictor.predict(val_features)
            if problem_type == "classification":
              val_predicted = le.inverse_transform(
                  val_predicted.reshape((num_val_points * label_dim))).reshape(
                      (num_val_points, label_dim))

            val_err = np.mean((val_predicted - val_labels)**2)

            if val_err < optimal_err:
              multioutputpredictor_opt = multioutputpredictor
              optimal_err = val_err

  # Generate forecasts
  test_predicted = multioutputpredictor_opt.predict(test_features)
  if problem_type == "classification":
    test_predicted = le.inverse_transform(
        test_predicted.reshape((num_test_points * label_dim))).reshape(
            (num_test_points, label_dim))

  else:
    # Clip to be in the range.
    test_predicted = np.clip(test_predicted, min_value, max_value)

  return test_predicted
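
# A minimal usage sketch (hedged): `covariates` is a hypothetical array with one row
# per covariate series and one column per timestep; the horizon here is 14 steps.
covariates = np.random.rand(3, 200)
forecasts = xgboost_forecast(covariates,
                             training_window_end=170,
                             num_forecast_steps=14,
                             num_threads=2)
print(forecasts.shape)  # expected: (number of series, num_forecast_steps)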
Example #7
X = np.load('data/X51.npy')
Y = np.load('data/y51.npy')

# fixes errors with NaN data
# (preprocessing.Imputer was removed in newer scikit-learn; sklearn.impute.SimpleImputer
#  is the drop-in replacement there)
X = preprocessing.Imputer().fit_transform(X)
print(X.shape, Y.shape)

# Recursive oversampling and undersampling
adsn = ADASYN(imb_threshold=0.5, ratio=0.7)
X, Y = adsn.fit_transform(X, Y)
X, Y = deleteClass(X, Y, 100, 2)
print(int(np.sqrt(X.shape[1])))

# Create the RFE object and compute a cross-validated score.
rf = RandomForestClassifier(n_jobs=-1)
gbm = xgb.XGBClassifier(n_estimators=300)
# The "accuracy" scoring is proportional to the number of correct
# classifications

param_dist = {
    "n_estimators": [10, 50, 100, 150, 300],
    "criterion": ['gini', 'entropy'],
    "bootstrap": [True, False],
    "max_features": [10, 20, 30, 40, 45, 48],
    "class_weight": ['auto']
}
param_dist_xgb = {
    "max_depth": [5, 10, 15, 25, 30],
    "learning_rate": [0.001, 0.01, 0.2, 0.5, 0.7],
    "subsample": [0.3, 0.5, 0.9, 1],
    "gamma": [0.001, 0.01, 0.2, 0.7, 2],
Example #8
File: models.py  Project: xieliaing/shap
def cric__gbm():
    """ Gradient Boosting Machines
    """
    import xgboost
    return xgboost.XGBClassifier(n_estimators=500, learning_rate=0.01, subsample=0.2, n_jobs=8, random_state=0)
Example #9
File: clf.py  Project: wwagner4/py-tryout02
            normalized=True
            ),
    ClfConf(id="knn",
            clf=lambda: KNeighborsClassifier(n_neighbors=3),
            normalized=False
            ),
    ClfConf(id="nm_g",
            clf=lambda: GaussianNB(),
            normalized=False
            ),
    ClfConf(id="rf",
            clf=lambda: RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
            normalized=False
            ),
    ClfConf(id="xgb",
            clf=lambda: xgb.XGBClassifier(),
            normalized=False
            ),
    ClfConf(id="gb",
            clf=lambda: GradientBoostingClassifier(
                loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse',
                min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
                min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None,
                verbose=0, max_leaf_nodes=None),
            normalized=False
            ),
]

feature_groups = ["mean", "se", "worst"]

feature_selections: Dict[str, List[str]] = {
Example #10
# It's interesting to compare the parameters that hyperopt found for random forest and XGBoost. Random forest ended up with 375 trees of depth 7, where XGBoost has 250 of depth 5. This fits the theory that random forest averages many complex (independently trained) trees to get good results, whereas xgboost & lightgbm (boosting) add up many simple trees (trained on residuals).
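
# A hedged sketch (not the notebook's original search) of how hyperopt can pick such
# parameters for an XGBoost classifier; X_train / y_train are hypothetical arrays here.
from hyperopt import fmin, tpe, hp, Trials
from sklearn.model_selection import cross_val_score

space = {
    'n_estimators': hp.quniform('n_estimators', 50, 500, 25),
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
}

def hyperopt_objective(params):
    clf = xgb.XGBClassifier(n_estimators=int(params['n_estimators']),
                            max_depth=int(params['max_depth']),
                            learning_rate=params['learning_rate'],
                            n_jobs=4)
    # minimize 1 - mean cross-validated accuracy
    return 1.0 - cross_val_score(clf, X_train, y_train, cv=3).mean()

best = fmin(fn=hyperopt_objective, space=space, algo=tpe.suggest,
            max_evals=30, trials=Trials())
print(best)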

# ## Comparing the models
#
# Now let's see how the models perform - if hyperopt has determined a sensible set of parameters for us...

# In[ ]:

rf_model = RandomForestClassifier(n_jobs=4,
                                  class_weight='balanced',
                                  n_estimators=325,
                                  max_depth=5)

xgb_model = xgb.XGBClassifier(n_estimators=250,
                              learning_rate=0.05,
                              n_jobs=4,
                              max_depth=2,
                              colsample_bytree=0.7,
                              gamma=0.15)

lgbm_model = lgbm.LGBMClassifier(n_estimators=500,
                                 learning_rate=0.01,
                                 num_leaves=16,
                                 colsample_bytree=0.7)

models = [
    ('Random Forest', rf_model),
    ('XGBoost', xgb_model),
    ('LightGBM', lgbm_model),
]

for label, model in models:
Example #11
        true_data.append(line[1:54] + categorical_encoder_catastral[line[54]] +
                         [categorical_encoder_class[line[55]]])

# Finally, convert the preprocessed samples to a matrix
data = np.array(data).astype('float32')
true_data = np.array(true_data).astype('float32')

# Value in the range (0.0 - 1.0) giving the fraction of samples used for validation
test_avg = 0.2

# X -> preprocessed data without the target
# Y -> target to predict
last_position = len(data[0]) - 1
X, Y = (data[:, :last_position], data[:, last_position])

model = xgb.XGBClassifier(objective='binary:hinge', )

sss = StratifiedShuffleSplit(
    n_splits=1,  # only one split
    test_size=0.2,  # 80/20 split
)

# The splitter is an iterator (we must iterate, even if only once)
for train_index, test_index in sss.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
Example #12
            new_y.append(label)
            continue
        if ensg_to_name[gene] not in pnas_d:
            new_y.append(label)
            continue

        #if the prior of the gene is over the cutoff, set label to zero
        if pnas_d[ensg_to_name[gene]] >= cutoff:
            new_y.append(0)

        else:
            new_y.append(label)

    new_y = np.array(new_y)
    print(np.bincount(new_y))

    count = 0
    while count < 25:
        clf = xgboost.XGBClassifier(n_estimators=100, n_jobs=12)

        # StratifiedKFold from sklearn.model_selection (successor of the removed
        # sklearn.cross_validation API)
        skf = StratifiedKFold(n_splits=4, shuffle=True)
        for i, (train, test) in enumerate(skf.split(X_new, new_y)):
            preds = clf.fit(X_new[train], new_y[train],
                            eval_metric='auc').predict_proba(X_new[test])
            auroc = roc_auc_score(new_y[test], preds[:, 1])
            print(auroc)
            mr[cutoff].append(auroc)
        count += 1

joblib.dump(mr, './HUVEC_AUROC_removal_titration_results_randomprobs_nohic')
Example #13
a = a.set_index('Unnamed: 0')
b = b.set_index('Unnamed: 0')
train_data = pd.concat([train_data, a], axis=1)
test_data = pd.concat([test_data, b], axis=1)
del all_data

test_result = pd.DataFrame(EID.values, columns=["EID"])
depth = [6, 7, 8]
nround = [2250, 2000, 1750]
for i in range(10):
    model_xgb = xgb.XGBClassifier(learning_rate=0.02,
                                  n_estimators=nround[i % 3],
                                  min_child_weight=2,
                                  max_depth=depth[i % 3],
                                  gamma=0.0,
                                  subsample=0.8,
                                  colsample_bytree=0.6,
                                  objective='binary:logistic',
                                  nthread=12,
                                  scale_pos_weight=1,
                                  seed=27 * i)
    model_xgb.fit(train_data, y_train)
    test_y_prob = model_xgb.predict_proba(test_data)[:, 1]
    test_result['PROB_' + str(i)] = test_y_prob
test_result.to_csv('C:/Luoshichao/result/test_result.csv',
                   index=None,
                   encoding='utf-8')
result = test_result
scr_test = result[['PROB_' + str(i) for i in range(10)]].mean(axis=1)
result['PROB'] = scr_test
df = result[['EID', 'PROB']]
Example #14

def build_custom_model(n_estimators=100, max_depth=3, learning_rate=1.e-3):
    model = xgb.XGBClassifier(max_depth=max_depth,
                              n_estimators=n_estimators,
                              learning_rate=learning_rate)
    return model
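
# A minimal usage sketch (hedged): X_train / y_train are hypothetical feature and label arrays.
model = build_custom_model(n_estimators=200, max_depth=4, learning_rate=0.1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))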
Example #15
    def __init__(self,
                 lemmatization=False,
                 granularity="label",
                 failures_skip=None):
        Model.__init__(self, lemmatization)

        self.granularity = granularity
        self.failures_skip = failures_skip

        self.training_dbs = [repository.COMMITS_DB]
        self.eval_dbs[repository.COMMITS_DB] = (
            repository.COMMITS_DB,
            repository.COMMIT_EXPERIENCES_DB,
        )
        if granularity == "label":
            self.training_dbs.append(test_scheduling.TEST_LABEL_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_LABEL_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_LABEL_DB,
                test_scheduling.FAILING_TOGETHER_LABEL_DB,
            )
        elif granularity == "group":
            self.training_dbs.append(test_scheduling.TEST_GROUP_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_GROUP_DB,
                test_scheduling.TOUCHED_TOGETHER_DB,
            )
            self.eval_dbs[test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB] = (
                test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB, )
        elif granularity == "config_group":
            self.training_dbs.append(
                test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB)
            self.eval_dbs[test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB] = (
                test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB,
                test_scheduling.TOUCHED_TOGETHER_DB,
            )

        self.cross_validation_enabled = False

        self.entire_dataset_training = True

        self.sampler = RandomUnderSampler(random_state=0)

        feature_extractors = [
            test_scheduling_features.prev_failures(),
        ]

        if granularity == "label":
            feature_extractors += [
                test_scheduling_features.platform(),
                # test_scheduling_features.chunk(),
                test_scheduling_features.suite(),
            ]
        elif granularity in ("group", "config_group"):
            feature_extractors += [
                test_scheduling_features.path_distance(),
                test_scheduling_features.common_path_components(),
                test_scheduling_features.touched_together(),
            ]

        self.extraction_pipeline = Pipeline([
            (
                "commit_extractor",
                commit_features.CommitExtractor(feature_extractors, []),
            ),
            ("union", ColumnTransformer([("data", DictVectorizer(), "data")])),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        self.clf.set_params(predictor="cpu_predictor")
Example #16
#y_test = mdf.iloc[(end+7)*24*180:(end+14)*24*180,-1]
#X_eval = mdf.iloc[-14*24*180:-7*24*180,:-1]
#y_eval = mdf.iloc[-14*24*180:-7*24*180,-1]
X_test = mdf.iloc[-14 * 24 * 180:, :-1]
y_test = mdf.iloc[-14 * 24 * 180:, -1]

print(y[y == 0].count() / y[y == 1].count())

# ### Initial XGBoost attempt

# In[ ]:

xgbc = xgb.XGBClassifier(max_depth=4,
                         n_estimators=5,
                         subsample=0.5,
                         eval_metric='logloss',
                         colsample_bytree=0.8,
                         min_child_weight=100,
                         gamma=50)

xgbc.fit(X, y)
print(xgbc.feature_importances_)
print(xgbc.score(X, y))

# In[ ]:

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
preds = xgbc.predict(X_test)
results = confusion_matrix(y_test, preds)
print('Test Set Results')
print('Confusion Matrix :')
Example #17
 def XGB(self):
     self.xgb = xgb.XGBClassifier()
     self.xgb.fit(self.X, self.y)

Example #18

xgb_params = {
    "objective": "multi:softprob",
    "max_depth": 5,
    "learning_rate": 0.05,
    "silent": 1,
    "n_estimators": 2000,
    "subsample": 0.9,
    "seed": 123451,
    "n_jobs":10,
}



xgb_clf_odd = xgb.XGBClassifier(**xgb_params)
xgb_clf_odd.fit(
    X_odd,
    y_odd,
    sample_weight=w_odd,
    early_stopping_rounds=50,
    eval_set=[(X_odd, y_odd, w_odd), (X_even, y_even, w_even)],
    eval_metric = "mlogloss",
    verbose=True,
)

xgb_clf_odd.get_booster().save_model('mvadm_inclusive_2fold_applytoeven.model')
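
# A hedged sketch of restoring the saved booster later (the variable name is illustrative):
booster_reloaded = xgb.Booster()
booster_reloaded.load_model('mvadm_inclusive_2fold_applytoeven.model')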


with open ("mvadm_inclusive_2fold_applytoeven.pkl",'w') as f:
    pickle.dump(xgb_clf_odd,f)
Example #19

def xgboost_model(data):

    model = {'data': data}

    print("\nbuilding corpus vector space...\n")

    model['bm25'] = BM25Transformer()
    model['vectorizer'] = TfidfVectorizer()
    model['vectorizer'].fit(data['TrainData'])
    #data['vectorizer'].fit(ValidToken)

    TrainTf = model['vectorizer'].transform(tqdm(data['TrainData']))

    print("fitting bm25...", end='')
    sys.stdout.flush()
    model['bm25'].fit(TrainTf)
    #data['bm25'].fit(ValidTf)
    print("ok")

    print("transforming...", end='')
    sys.stdout.flush()
    data['TrainData'] = model['bm25'].transform(TrainTf)
    print("ok")
    print('TrainTf.shape:', TrainTf.shape)

    ytrain = data['TrainLabel']
    xtrain_tfv = data['TrainData']

    svd = decomposition.TruncatedSVD(n_components=120)

    svd.fit(xtrain_tfv)
    xtrain_svd = svd.transform(xtrain_tfv)

    scl = preprocessing.StandardScaler()
    scl.fit(xtrain_svd)
    xtrain_svd_scl = scl.transform(xtrain_svd)

    clf = xgb.XGBClassifier(max_depth=7,
                            n_estimators=200,
                            colsample_bytree=0.8,
                            subsample=0.8,
                            nthread=10,
                            learning_rate=0.1)
    clf.fit(xtrain_svd, ytrain)

    def documents_to_bm25(tokens):
        tf = model['vectorizer'].transform(tqdm(tokens))
        print("doing the valid set transformation...", end='')
        sys.stdout.flush()
        DocData = model['bm25'].transform(tf)
        print("ok")
        print('ValidTf.shape:', tf.shape)
        return DocData

    def validate(documents, show_loss=False):
        xvalid_tfv = documents_to_bm25(documents)
        xvalid_svd = svd.transform(xvalid_tfv)
        xvalid_svd_scl = scl.transform(xvalid_svd)

        if show_loss:
            predictions = clf.predict_proba(xvalid_svd)
            print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

        predictions = clf.predict(xvalid_svd)
        return predictions

    def predict(doc, show_loss=False):
        return validate([data['config']['tokenize'](doc)], show_loss)[0]

    model['validate'] = validate
    model['predict'] = predict

    return model
Example #20

 def __init__(self):
     self.model = xgb.XGBClassifier()
Example #21

if __name__ == '__main__':

    train = df_of_images('train')
    test = pd.DataFrame({'image_path': [os.path.join(p, 'test', i) for i in
                                        os.listdir('../249_plant/data/test/')]})
    base_model = InceptionV3()
    model = Model(inputs=base_model.input, outputs=base_model.get_layer('avg_pool').output)

    train['image_features'] = train.image_path.apply(lambda x: extract_features_keras(x, base_model))
    test['image_features'] = test.image_path.apply(lambda x: extract_features_keras(x, base_model))

    train_, test_ = train_test_split(train, test_size=0.33, random_state=42, stratify=train.label)

    print('train:', train_.label.value_counts() / len(train_), 'test:', test_.label.value_counts() / len(test_))

    xgc = xgb.XGBClassifier(objective='multi:softmax', num_class=train.label.nunique())
    xgc.fit(pd.DataFrame(train_['image_features'].values.tolist()), train_.label)

    results = test_.copy()
    results['y_pred'] = xgc.predict(pd.DataFrame(test_['image_features'].values.tolist()))

    label_map = {x.lower().strip().replace(' ', '_').replace('-', '_'): x for x in os.listdir(os.path.join(p, 'train'))}

    results = test.copy()
    results['species'] = xgc.predict(pd.DataFrame(test['image_features'].values.tolist()))
    results['species'] = results['species'].replace(label_map)
    results['file'] = results.image_path.apply(lambda x: x.split('/')[-1])

    results[['file', 'species']].to_csv('submission.csv', index=False)

Example #22
            1: 4
        }),
        n_estimators=5)
    clf_RandomForest = RandomForestClassifier(class_weight={0: 1, 1: 11})
    clf_LogisticReg = LogisticRegression(class_weight={
        0: 1,
        1: 9
    },
                                         warm_start=True)

    clf_AdaBoost.fit(train_features, train_labels)
    clf_RandomForest.fit(train_features, train_labels)

    clf_GradientBooster = xgb.XGBClassifier(max_depth=10,
                                            n_estimators=500,
                                            learning_rate=0.1,
                                            seed=27).fit(
                                                train_features, train_labels)

    clf_LogisticReg.fit(train_features, train_labels)

    print "\nClassifier Map: 0:AdaBoost 1:RandomForest 2:Gradient Booster 3.Logistic Regression"
    classifiers_map = {
        0: clf_AdaBoost,
        1: clf_RandomForest,
        2: clf_GradientBooster,
        3: clf_LogisticReg
    }
    print "\nTrain End Time : {}".format(time.time())

    auc_list = []
Example #23
                                            ascending=False,
                                            inplace=True,
                                            axis=0)

                        df[col] = temp_df.col_idx.tolist()

                    except:
                        pass

            if col == 'feat_impo_xgb':
                xgb = xgboost.XGBClassifier(learning_rate=0.001,
                                            n_estimators=2000,
                                            verbosity=0,
                                            objective='binary:logistic',
                                            booster='gbtree',
                                            tree_method='auto',
                                            n_jobs=-1,
                                            gpu_id=0,
                                            min_child_weight=3,
                                            subsample=0.8,
                                            colsample_bytree=0.7)
                xgb.fit(x_train, y_train)

                dict_importance = {}
                for feature, importance in zip(x_train.columns,
                                               xgb.feature_importances_):
                    dict_importance[feature] = importance

                best_xgb_features = []

                for idx, w in enumerate(
Example #24
list_tree = [10, 40, 60, 100, 150]
list_max_depth = [4, 6, 8]
list_min_child_weight = [1, 2, 5]
list_gamma = [0.5, 1]
list_subsample = [0.8, 1.0]
list_colsample_bytree = [0.6, 0.8, 1.0]

score = 0

for tree in list_tree:
    for max_depth in list_max_depth:
        for min_child_weight in list_min_child_weight:
            for gamma in list_gamma:
                for subsample in list_subsample:
                    for colsample_bytree in list_colsample_bytree:
                        model = xgboost.XGBClassifier(
                            max_depth=max_depth, learning_rate=0.02, n_estimators=tree,
                            silent=True, min_child_weight=min_child_weight, gamma=gamma,
                            subsample=subsample, colsample_bytree=colsample_bytree,
                            nthread=2, reg_lambda=0, reg_alpha=1)
                        model.fit(X_train[features], y_train)
                        score_ = model.score(X_test[features], y_test)
                        
                        if score_ > score:
                            print(score_)
                            tmp_tree = tree
                            tmp_max_depth = max_depth
                            tmp_min_child_weight = min_child_weight
                            tmp_gamma = gamma
                            tmp_subsample = subsample
                            tmp_colsample_bytree = colsample_bytree
                            score = score_ 

print("results Grid Search : ")
print(tmp_tree)
Example #25
def loss_with_per_tree_stats(df, new_cols):
    features = [
        'bathrooms', 'bedrooms', 'latitude', 'longitude', 'price',
        'num_features', 'num_photos', 'word_num_in_descr', "created_month",
        "created_day", CREATED_HOUR, CREATED_MINUTE, DAY_OF_WEEK
    ]
    features += new_cols

    train_df, test_df = split_df(df, 0.7)

    train_df, test_df, new_cols = process_mngr_categ_preprocessing(
        train_df, test_df)
    train_df, test_df = shuffle_df(train_df), shuffle_df(test_df)
    features += new_cols

    train_df, test_df, new_cols = process_manager_num(train_df, test_df)
    train_df, test_df = shuffle_df(train_df), shuffle_df(test_df)
    features += new_cols

    train_df, test_df, new_cols = process_bid_categ_preprocessing(
        train_df, test_df)
    train_df, test_df = shuffle_df(train_df), shuffle_df(test_df)
    features += new_cols

    train_df, test_df, new_cols = process_bid_num(train_df, test_df)
    train_df, test_df = shuffle_df(train_df), shuffle_df(test_df)
    features += new_cols

    train_df, test_df, new_cols = process_listing_id(train_df, test_df)
    train_df, test_df = shuffle_df(train_df), shuffle_df(test_df)
    features += new_cols

    train_df, test_df, new_cols = process_nei123(train_df, test_df)
    train_df, test_df = shuffle_df(train_df), shuffle_df(test_df)
    features += new_cols

    train_df, test_df, new_cols = process_time_density(train_df, test_df)
    print(new_cols)
    train_df, test_df = shuffle_df(train_df), shuffle_df(test_df)
    features += new_cols

    train_target, test_target = train_df[TARGET].values, test_df[TARGET].values
    del train_df[TARGET]
    del test_df[TARGET]

    train_df = train_df[features]
    test_df = test_df[features]

    train_arr, test_arr = train_df.values, test_df.values
    print(features)

    estimator = xgb.XGBClassifier(n_estimators=1100,
                                  objective='multi:softprob',
                                  subsample=0.8,
                                  colsample_bytree=0.8)
    eval_set = [(train_arr, train_target), (test_arr, test_target)]
    estimator.fit(train_arr,
                  train_target,
                  eval_set=eval_set,
                  eval_metric='mlogloss',
                  verbose=False)

    # plot feature importance
    # ffs= features[:len(features)-1]+['man_id_high', 'man_id_medium', 'man_id_low', 'manager_skill']
    # sns.barplot(ffs, [x for x in estimator.feature_importances_])
    # sns.plt.show()

    # print estimator.feature_importances_
    proba = estimator.predict_proba(test_arr)

    loss = log_loss(test_target, proba)
    loss1K = get_loss_at1K(estimator)
    return loss, loss1K, xgboost_per_tree_results(
        estimator), estimator.feature_importances_
Example #26
                                  n_jobs=-1,
                                  n_estimators=1000,
                                  num_leaves=80,
                                  scale_pos_weight=0.05,
                                  verbose=2)),
              ('rf',
               RandomForestClassifier(random_state=123456,
                                      n_jobs=-1,
                                      max_depth=30,
                                      n_estimators=400,
                                      verbose=2)),
              ('xgboost',
               xgb.XGBClassifier(predictor='cpu_predictor',
                                 n_gpus=0,
                                 n_jobs=-1,
                                 n_estimators=700,
                                 eta=0.1,
                                 max_depth=10,
                                 verbose=2))]

stacking = StackingClassifier(estimators=estimators,
                              final_estimator=LogisticRegression(),
                              cv=5,
                              verbose=2)

#stacking, y_test_stacking = validacion_cruzada(stacking, X, y, skf)

# Retrain on the full dataset
# The reported score is on training data, so it will look better than on test
t = time.time()
clf = stacking
Example #27
logreg_coefs_r = StandardScaler().fit_transform(logreg_coefs_r[0].reshape(
    -1, 1))
FeatureCoefs['logreg_resample'] = logreg_coefs_r

# Logreg Summaries
print('Original Logreg AUC Score: %f' % (auc))
print('Resampled Logreg AUC Score: %f' % (auc_r))
print('Original Data')
print(classification_report(y_holdout, preds))
print('Resampled Data')
print(classification_report(y_holdout, preds_r))

# XGBoost

# Tune parameters
xgb_model = xgb.XGBClassifier()

parameters = {
    'max_depth': [7],
    'learning_rate': [0.001],
    'n_estimators': [1000],
    'objective': ['binary:logistic'],
    'gamma': [0],
    'min_child_weight': [6],
    'subsample': [0.7],
    'colsample_bytree': [0.5]
}

clf = GridSearchCV(xgb_model,
                   param_grid=parameters,
                   scoring='roc_auc',
Example #28
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


baselines_map = {
    'knn_clf': KNeighborsClassifier(n_neighbors=1, weights='distance'),
    'random_forest': RandomForestClassifier(),
    'logistic_regression': LogisticRegression(),
    'svc': SVC(gamma='auto'),
    'xgboost':
        xgb.XGBClassifier(
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=0.1,
            max_depth=6,
            min_child_weight=12,
            n_estimators=100,
            subsample=0.95
        ),
    'simple-NN': get_keras_simple_nn()
}


def get_baselines_performance(df_train, df_val, label_col='Binary', use_only=None):
    df_train = get_rdkit_features(df_train)
    df_val = get_rdkit_features(df_val)
    input_cols = [
        'BalabanJ', 'BertzCT', 'MaxAbsPartialCharge', 'MolLogP', 'MolWt', 'NumAliphaticCarbocycles',
        'NumRotatableBonds', 'RingCount', 'SlogP_VSA10', 'TPSA'
    ]
Example #29

y = data["default.payment.next.month"]

# will be using xgboost for this problem statement

# hyperparameter search space
param = {
    "learning_rate": [0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [2, 4, 6, 8, 10, 12, 14],
    "min_child_weight": [1, 3, 5, 7, 9],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8]
}

from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
import xgboost
classifier=xgboost.XGBClassifier()

random_search = RandomizedSearchCV(classifier, param_distributions=param, n_iter=5,
                                   scoring='roc_auc', n_jobs=-1, verbose=3)

# the search has to be fit (e.g. random_search.fit(X, y)) before the attributes below exist
random_search.best_estimator_

random_search.best_params_

classifier=xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4, gamma=0.1, learning_rate=0.25,
       max_delta_step=0, max_depth=3, min_child_weight=7, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)
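
# A minimal follow-up sketch (hedged): assuming the features are every column of `data`
# except the target used for y above, the tuned classifier is then fit in the usual way.
X = data.drop("default.payment.next.month", axis=1)
classifier.fit(X, y)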
Example #30
    if 'Category' in data:
        labels = label_encoder.fit_transform(data['Category'])
    return df, labels


features, labels = get_features(train)
scaler = StandardScaler()
scaler.fit(features)
features = scaler.transform(features)

print('training...', datetime.now())

# nthread = 1 is so it doesn't eat up all of the machine's resources :S
gbm = xgb.XGBClassifier(max_depth=9,
                        nthread=-1,
                        n_estimators=100,
                        learning_rate=0.3,
                        silent=True,
                        subsample=0.9)
gbm.fit(features, labels, eval_metric="mlogloss")

pickle.dump(gbm, open('xbm_model.pkl', 'wb'))

print('preparing test data...', datetime.now())

test = pd.read_csv('../data/test.csv.gz', parse_dates=True)
test_features, _ = get_features(test)

# reuse the scaler fitted on the training features; refitting on the test set
# would scale the two sets inconsistently
test_features = scaler.transform(test_features)

print('predicting...', datetime.now())