Example #1
def train_xgb(X, y, params, save_path=None, save_path_booster=None):

    # the threshold is not handled by XGB interface
    params, binary_threshold = _parse_param_and_delete(params,
                                                       'binary_threshold', .5)

    # n_jobs is handled by XGB SKL interface
    params = _parse_param_and_keep(params,
                                   name='n_jobs',
                                   default=min(max_cpu_count(), 24))

    X = np.asarray(X)
    y = np.asarray(y).flatten()

    if not tuple(np.sort(np.unique(y))) == (0, 1):
        raise NotImplementedError(
            'XGB Wrapper currently only supports binary classification.')

    # Fit the model
    model = XGBClassifier(use_label_encoder=False, )
    model = clone(model)
    model.set_params(**params)

    logging.info('Training...')
    model.fit(
        X,
        y,
        # early_stopping_rounds=10,
        verbose=True,
    )
    # Save and re-load (feature-agnostic model)
    temp_file = f'temp-{time.time()}-{random.random()}.bin'
    model.get_booster().save_model(temp_file)
    booster = Booster(model_file=temp_file)
    os.remove(temp_file)

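    # 'auto': take the sorted prediction at the index equal to the number of negative labels,
    # so the predicted positive rate matches the training label rate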
    if binary_threshold == 'auto':
        p_ = booster.predict(DMatrix(X))
        p_ = np.sort(p_)
        binary_threshold = p_[int((y == 0).sum())]

    logging.info(f'Using a binary_threshold = {binary_threshold}')

    # Wrap
    model = XGBClassifierSKLWrapper(booster,
                                    features=X.shape[1],
                                    threshold=binary_threshold)

    # Save
    if save_path is not None:
        save_pickle(model, save_path)
    if save_path_booster is not None:
        save_pickle(model.get_booster(), save_path_booster)
    return model
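
# Hypothetical usage sketch (not part of the original snippet); it assumes the project helpers
# used above (save_pickle, XGBClassifierSKLWrapper, the _parse_param_* functions, max_cpu_count)
# are importable, and the parameter values are illustrative only.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)
demo_model = train_xgb(X_demo, y_demo,
                       params={'n_estimators': 100, 'max_depth': 3,
                               'binary_threshold': 'auto'},
                       save_path='xgb_wrapper.pkl')
print(demo_model.predict(X_demo)[:10])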
Example #2
    def test_dmatrix_creator(self):

        # This function acts as a pseudo-itertools.chain()
        def row_tup_iter(data):
            pdf = pd.DataFrame(data)
            yield pdf

        # Standard testing DMatrix creation
        expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
        expected_labels = np.array([1, 0] * 100)
        expected_dmatrix = DMatrix(data=expected_features,
                                   label=expected_labels)

        data = {
            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
            "label": [1, 0] * 100,
        }
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=False,
            has_validation=False,
            has_base_margin=False,
        )
        # DMatrix objects can't be compared directly, so instead predict on the two separate
        # DMatrices with the same classifier and check that the outputs are equal
        model = XGBClassifier()
        model.fit(expected_features, expected_labels)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))

        # DMatrix creation with weights
        expected_weight = np.array([0.2, 0.8] * 100)
        expected_dmatrix = DMatrix(data=expected_features,
                                   label=expected_labels,
                                   weight=expected_weight)

        data["weight"] = [0.2, 0.8] * 100
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=True,
            has_validation=False,
            has_base_margin=False,
        )

        model.fit(expected_features,
                  expected_labels,
                  sample_weight=expected_weight)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
Example #3
    def test_predict_sklearn_pickle(self):
        x, y = build_dataset()

        kwargs = {'tree_method': 'gpu_hist',
                  'predictor': 'gpu_predictor',
                  'verbosity': 2,
                  'objective': 'binary:logistic',
                  'n_estimators': 10}

        model = XGBClassifier(**kwargs)
        model.fit(x, y)

        save_pickle(model, "model.pkl")
        del model

        # load model
        model: xgb.XGBClassifier = load_pickle("model.pkl")
        os.remove("model.pkl")

        gpu_pred = model.predict(x, output_margin=True)

        # Switch to CPU predictor
        bst = model.get_booster()
        bst.set_param({'predictor': 'cpu_predictor'})
        cpu_pred = model.predict(x, output_margin=True)
        np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
Example #4
def train_xgboost(args):
    """ Train a XGBoost model
    Args:
        args: structure with the following field:
            bucket_name, str, gcs bucket name to store trained model
            blob_name, str, gcs blob name to store trained model
            train_feature_name, str, name of the train feature csv
            train_label_name, str, name of train label csv
            no_classes, int, number of prediction classes in the model
            n_estimators, int, number of estimators (hypertune)
            max_depth, int, maximum depth of trees (hypertune)
            booster, str, type of boosters (hypertune)
    Return:
        xgboost model object
    
    """

    x_train = pd.read_csv(args.train_feature_name)
    y_train = pd.read_csv(args.train_label_name)

    # ---------------------------------------
    # Train model
    # ---------------------------------------

    params = {
        'n_estimators': args.n_estimators,
        'max_depth': args.max_depth,
        'booster': args.booster,
        'min_child_weight': 1,
        'learning_rate': 0.1,
        'gamma': 0,
        'subsample': 1,
        'colsample_bytree': 1,
        'reg_alpha': 0,
        'objective': 'multi:softprob',
        'num_class': args.no_classes,
    }
    xgb_model = XGBClassifier(**params, use_label_encoder=False)
    print(x_train.shape)
    print(y_train.shape)
    xgb_model.fit(x_train, y_train)

    # ---------------------------------------
    # Save the model to local
    # ---------------------------------------

    temp_name = 'model.bst'
    bst = xgb_model.get_booster()
    bst.save_model(temp_name)

    # ---------------------------------------
    # Move local model to gcs
    # ---------------------------------------

    subprocess.check_call(
        ['gsutil', 'cp', temp_name,
         os.path.join(args.job_dir, 'model.bst')],
        stderr=sys.stdout)

    return xgb_model
Example #5
def get_feature_importance(data, labels, display=True):
    """
    :param data: dataframe to be used for feature importance
    :param labels: cluster labels to be used for classification
    :param display: number of top features (with their importances) to print per cluster; pass a falsy value to skip.
    """
    df = pd.DataFrame(MinMaxScaler().fit_transform(data),
                      index=data.index,
                      columns=data.columns)
    imp_dict = {}
    for c in set(labels):
        print(f'cluster id = {c}')
        y = [1 if x == c else 0 for x in labels]
        X_train, X_test, y_train, y_test = train_test_split(df,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=10)
        clf = XGBClassifier(n_estimators=1000,
                            max_depth=6,
                            learning_rate=0.01,
                            objective='binary:logistic',
                            eval_metric='auc')
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print('accuracy score is ', accuracy_score(y_test, y_pred))
        imp_dict[c] = clf.get_booster().get_score(importance_type='gain')
        if display:
            feature_imp_series = pd.Series(imp_dict[c], index=data.columns)
            print(feature_imp_series.dropna().sort_values(
                ascending=False)[:display])
    return imp_dict
Example #6
 def test_xgboost_classifier_i5450(self):
     iris = load_iris()
     X, y = iris.data, iris.target
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         random_state=10)
     clr = XGBClassifier(objective="multi:softmax",
                         max_depth=1,
                         n_estimators=2)
     clr.fit(X_train,
             y_train,
             eval_set=[(X_test, y_test)],
             early_stopping_rounds=40)
     initial_type = [('float_input', FloatTensorType([None, 4]))]
     onx = convert_xgboost(clr, initial_types=initial_type)
     sess = InferenceSession(onx.SerializeToString())
     input_name = sess.get_inputs()[0].name
     label_name = sess.get_outputs()[1].name
     predict_list = [1., 20., 466., 0.]
     predict_array = np.array(predict_list).reshape(
         (1, -1)).astype(np.float32)
     pred_onx = sess.run([label_name], {input_name: predict_array})[0]
     pred_xgboost = clr.predict_proba(predict_array)
     bst = clr.get_booster()
     bst.dump_model('dump.raw.txt')
     dump_data_and_model(
         X_test.astype(np.float32) + 1e-5,
         clr,
         onx,
         allow_failure=
         "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
         basename="XGBClassifierIris")
Example #7
def get_importances(model: xgboost.XGBClassifier):
    """Возвращает важности моделей обученного xgboost-а"""
    imp = model.feature_importances_
    names = model.get_booster().feature_names
    li = list(zip(imp, names))
    li.sort(reverse=True)
    return li
Example #8
def extract_xgboost_features(model: xgboost.XGBClassifier) -> pd.DataFrame:
    bst = model.get_booster()
    df = pd.DataFrame({
        "feature_name": bst.feature_names,
        "feature_importance": model.feature_importances_,
    })
    return df
Example #9
    def test_predict_sklearn_pickle(self):
        x, y = build_dataset()

        kwargs = {
            'tree_method': 'gpu_hist',
            'predictor': 'gpu_predictor',
            'objective': 'binary:logistic',
            'n_estimators': 10
        }

        model = XGBClassifier(**kwargs)
        model.fit(x, y)

        save_pickle(model, "model.pkl")
        del model

        # load model
        model: xgb.XGBClassifier = load_pickle("model.pkl")
        os.remove("model.pkl")

        gpu_pred = model.predict(x, output_margin=True)

        # Switch to CPU predictor
        bst = model.get_booster()
        bst.set_param({'predictor': 'cpu_predictor'})
        cpu_pred = model.predict(x, output_margin=True)
        np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5)
Example #10
def plot_feature_importance(plt,fig,X,Y,header,filename_out=None):

    model = XGBClassifier()
    model.fit(X, Y)

    keys, values = [],[]

    feature_importances = model.get_booster().get_score()
    for k, v in feature_importances.items():
        keys.append(k)
        values.append(v)



    values = numpy.array(values)
    idx = numpy.argsort(-values)
    keys = numpy.array(keys)[idx]
    values = values[idx]
    header = header[idx]

    N=5
    ax = fig.gca()
    ax.pie(values[:N],  labels=header[:N], autopct='%1.1f%%',shadow=False, startangle=90)
    #plt.set_title('Feature importance')
    if filename_out is not None:
        plt.savefig(filename_out)

    return
Example #11
def myref(seed=1, plt_type='gain'):
    ### load module
    from xgboost import XGBClassifier
    ### load datasets
    if seed > 0:
        df_train = loadDataset('voice/voice_train_%d.csv' % seed)
        df_test = loadDataset('voice/voice_test_%d.csv' % seed)
    else:
        df_train = loadDataset('voice/voice.csv')
        df_test = loadDataset('voice/voice.csv')
    fixDeafults(df_train, discard=True)
    fixDeafults(df_test, discard=False)
    ### fit model for train data
    model = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=20, min_child_weight=1, gamma=0,
                          subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4,
                          scale_pos_weight=1, seed=27)
    model.fit(df_train.iloc[:,:-1], df_train.loc[:,'label'])
    ### make prediction for test data
    y_pred = model.predict(df_test.iloc[:,:-1])
    y_test = df_test.loc[:,'label'].values
    ### model evaluate
    diff = y_test - y_pred
    acc = float(diff[diff==0].size) / float(diff.size)
    print("accuarcy: %.4f%%" % (acc * 100.0))
    from xgboost import plot_importance
    imp_dict = model.get_booster().get_score(importance_type=plt_type)
    imp = pd.Series(imp_dict).sort_values(ascending=False)
    print(imp)
    fig,ax = plt.subplots(figsize=(10,15))
    plot_importance(model, height=0.5, max_num_features=64, ax=ax, importance_type=plt_type)
    plt.show()
Example #12
def xgb_inference(model, X, Y, X_test):
    x = X.values
    y = Y.values
    x_tst = X_test.values
    x_tst = np.ascontiguousarray(x_tst)
    # y_oof = np.zeros(x.shape[0])
    y_tst = np.zeros((x_tst.shape[0], len(np.unique(y))))
    acc_scores = []
    rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS,
                                   n_repeats=N_REPEATS,
                                   random_state=SEED)
    params = model.get_params()
    for i, (train_index, valid_index) in enumerate(rskf.split(x, y)):
        print(i)
        X_A, X_B = x[train_index, :], x[valid_index, :]
        y_A, y_B = y[train_index], y[valid_index]
        xgb_model = XGBClassifier(**params)
        X_A, X_B = np.ascontiguousarray(X_A), np.ascontiguousarray(X_B)
        y_A, y_B = np.ascontiguousarray(y_A), np.ascontiguousarray(y_B)
        xgb_model.fit(X_A,
                      y_A,
                      eval_set=[(X_B, y_B)],
                      early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                      verbose=0)
        best_iteration = xgb_model.get_booster().best_ntree_limit  # new
        # y_oof[valid_index] = xgb_model.predict(X_B, ntree_limit=best_iteration) # new
        tmp = xgb_model.predict(X_B, iteration_range=[0, best_iteration])
        acc_score = accuracy_score(y_B, tmp)
        acc_scores.append(acc_score)
        y_tst += xgb_model.predict_proba(x_tst,
                                         iteration_range=[0, best_iteration])
    y_tst /= N_SPLITS * N_REPEATS
    return y_tst, np.mean(acc_scores)
Example #13
File: xgboost.py  Project: wtwong316/eland
 def __init__(
     self,
     model: XGBClassifier,
     feature_names: List[str],
     classification_labels: Optional[List[str]] = None,
 ):
     super().__init__(
         model.get_booster(),
         feature_names,
         model.base_score,
         model.objective,
         classification_labels,
     )
     if model.classes_ is None:
         n_estimators = model.get_params()["n_estimators"]
         num_trees = model.get_booster().trees_to_dataframe()["Tree"].max() + 1
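          # xgboost grows n_estimators trees per class for multi-class models, so dividing the
          # total tree count by n_estimators recovers the number of classes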
         self._num_classes = num_trees // n_estimators
     else:
         self._num_classes = len(model.classes_)
Example #14
def opt_BDT(input, output, params, show, names):

    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()
    cvscores = []
    AUC = []
    X_train, X_test, y_train, y_test = train_test_split(input,
                                                        output,
                                                        test_size=0.2,
                                                        random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")
    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)

    print("Accuracy: %.2f%%; AUC = %.4f%" % (accuracy * 100, auc))
    if show:

        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)

        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
Example #15
def feature_imporance_XGB(df, idx_target):
    X, Y = preprocess(df, idx_target)

    model = XGBClassifier()
    model.fit(X, Y)
    feature_importances = model.get_booster().get_score()
    # values = numpy.array([v[1] for v in feature_importances.items()])
    values = numpy.zeros(X.shape[1])
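    # assumes get_score() returned the default 'f0', 'f1', ... feature keys, so stripping the
    # leading 'f' recovers each feature's column index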
    for v in feature_importances.items():
        values[int(v[0][1:])] = v[1]

    return numpy.array(values)
Example #16
 def __init__(
     self,
     model: XGBClassifier,
     feature_names: List[str],
     classification_labels: Optional[List[str]] = None,
 ):
     super().__init__(
         model.get_booster(),
         feature_names,
         model.base_score,
         model.objective,
         classification_labels,
     )
Example #17
def objective(trial, x_train, y_train, params=params):
    start_time = timer()
    temp_map = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005,
                                                  0.05),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 5,
                                                     1000),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2,
                                                     0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 10.0)
    }
    params.update(temp_map)
    # x_train = df.iloc[:train_rows, :].values
    # y_train = train_label.iloc[:train_rows].values
    y_oof = np.zeros((x_train.shape[0]))
    acc_scores = []
    # pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-logloss")
    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, "validation_0-auc")
    rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS,
                                   n_repeats=N_REPEATS,
                                   random_state=RANDOM_SEED)
    for i, (train_index,
            valid_index) in enumerate(rskf.split(x_train, y_train)):
        X_A, X_B = x_train[train_index, :], x_train[valid_index, :]
        y_A, y_B = y_train[train_index], y_train[valid_index]
        xgb_classifier = XGBClassifier(**params)
        xgb_classifier.fit(X_A,
                           y_A,
                           eval_set=[(X_B, y_B)],
                           early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                           verbose=0,
                           callbacks=[pruning_callback])
        best_iteration = xgb_classifier.get_booster().best_ntree_limit  # new
        y_oof[valid_index] = xgb_classifier.predict(
            X_B, ntree_limit=best_iteration
        )  # new iteration_range=[0,best_iteration]
        acc_score = accuracy_score(y_B, y_oof[valid_index])
        acc_scores.append(acc_score)
        # print(f"===== {i} fold : acc {acc_score} =====")
    trial.set_user_attr(key="best_booster", value=xgb_classifier
                        )  # NOTE update the best model in the optuna's table.
    res = np.mean(acc_scores)
    # print(f"===== {res} =====")
    timer(start_time)
    return res
Example #18
def objective(trial, x_train, y_train, params=params):
    # x_train, y_train: ndarray
    start_time = timer()
    temp_map = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        "learning_rate": trial.suggest_loguniform("learning_rate", 5e-3, 5e-2),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1,
                                                     300),  # 5, 100
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2,
                                                     0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "gamma": trial.suggest_loguniform("lambda", 1e-8, 10.0)
    }
    params.update(temp_map)

    y_oof = np.zeros(x_train.shape[0])
    acc_scores = []
    pruning_callback = optuna.integration.XGBoostPruningCallback(
        trial, "validation_0-auc"
    )  # depends on the choice of eval_metric; "validation_0-logloss"
    rskf = RepeatedStratifiedKFold(n_splits=N_SPLITS,
                                   n_repeats=N_REPEATS,
                                   random_state=SEED)

    for i, (train_index,
            valid_index) in enumerate(rskf.split(x_train, y_train)):
        X_A, X_B = x_train[train_index, :], x_train[valid_index, :]
        y_A, y_B = y_train[train_index], y_train[valid_index]
        xgb_classifier = XGBClassifier(**params)
        xgb_classifier.fit(X_A,
                           y_A,
                           eval_set=[(X_B, y_B)],
                           early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                           verbose=0,
                           callbacks=[pruning_callback])
        best_iteration = xgb_classifier.get_booster().best_ntree_limit  # new
        y_oof[valid_index] = xgb_classifier.predict(
            X_B, ntree_limit=best_iteration)  # new
        acc_scores.append(accuracy_score(y_B, y_oof[valid_index]))

    trial.set_user_attr(key="best_booster", value=xgb_classifier
                        )  # NOTE update the best model in the optuna's table.
    res = np.mean(acc_scores)

    timer(start_time)
    return res
Example #19
def XGBClassifierMalwareImportantFeature(dataset):
    malware_feature = dataset.columns
    dataset = dataset.dropna(axis=0)
    malware_feature=malware_feature.drop("Class")
    X = dataset[malware_feature]  #independent columns
    y = dataset.Class
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500,1000]
    for max_l in candidate_max_leaf_nodes :
         get_mae_XGB_Classifier(max_l, train_X, val_X, train_y, val_y)
    scores = {leaf_size: get_mae_XGB_Classifier(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
    best_tree_size = min(scores, key=scores.get)
    print('best tree size: ',best_tree_size)
    
    XGBClassifierMalware=XGBClassifier(learning_rate=0.1,max_leaf_nodes=best_tree_size,n_estimators=100)
    XGBClassifierMalware.fit(train_X, train_y, 
                 early_stopping_rounds=5, 
                 eval_set=[(val_X, val_y)], 
                 verbose=False)

    features_W = pd.Series(XGBClassifierMalware.get_booster().get_score(importance_type='weight'), index=X.columns)
    features_W.sort_values(axis=0, ascending=False).nlargest(25).plot(kind='barh').set_title('XGBClassifierMalware_weight')
    plt.show()
    
    feat_importances = pd.Series(XGBClassifierMalware.feature_importances_, index=X.columns)
    feat_importances.sort_values(axis=0, ascending=False)
    print(feat_importances.values)
    print('nico','\r\n')
    
    print(feat_importances[feat_importances.values > 0.001])
    best=feat_importances[feat_importances.values > 0.001]
    feat_importances.nlargest(20).plot(kind='barh').set_title('XGBClassifierMalware')
    plt.show()
    
    plot_importance(XGBClassifierMalware,max_num_features=22)
    pyplot.show()
    return best.index
Example #20
        'n_estimators': 100,
        'max_depth': 3,
    },
    {  # performance test
        'n_estimators': 5000,
        'max_depth': 5,
        'nthread': 4,
    },
]

X, y = make_classification(10000)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

df_test = pd.DataFrame(X_test)
df_test['target'] = y_test

prepare_test_env()

for i, kwargs in enumerate(scenarios):
    classifier = XGBClassifier(**kwargs)
    classifier.fit(X_train, y_train)
    model_dir = os.path.join(os.path.dirname(__file__), f'build/model-{i}.txt')
    classifier.get_booster().dump_model(
        os.path.join(os.path.dirname(__file__), f'build/model-{i}.xgb'))
    probas = classifier.predict_proba(X_test)
    df_test[f'p_{i}_0'] = probas[:, 0]
    df_test[f'p_{i}_1'] = probas[:, 1]

df_test.to_csv(
    os.path.join(os.path.dirname(__file__), 'build/comparison_data.csv'))
Example #21
File: main.py  Project: shavitta/FastForest
        },
        ignore_index=True)
    meta_results_with_avg.to_csv("meta_results_with_avg.csv", index=False)

    #######################################  FEATURES IMPORTANCE AND SHAP #######################################

    # First fit the model on the DF - drop the 'dataset' column and nan values
    meta_dataset.fillna(0, inplace=True)
    class_col = meta_dataset.columns.get_loc('Best AUC')
    X, y = split_to_X_and_y(meta_dataset, class_col)
    X = np.delete(X, [0], axis=1)

    xgb = XGBClassifier(booster='gbtree')
    xgb.fit(X, y)

    weight_res = xgb.get_booster().get_score(importance_type='weight')
    gain_res = xgb.get_booster().get_score(importance_type='gain')
    cover_res = xgb.get_booster().get_score(importance_type='cover')

    # Plot the 10 most important features per importance type
    weight_res = plot_xgb_importance(weight_res, 'Weight',
                                     meta_dataset.columns)
    gain_res = plot_xgb_importance(gain_res, 'Gain', meta_dataset.columns)
    cover_res = plot_xgb_importance(cover_res, 'Cover', meta_dataset.columns)

    # Save the results for all the meta-features in a csv file
    data = {
        'Weight': list(weight_res.keys()),
        'Gain': list(gain_res.keys()),
        'Cover': list(cover_res.keys())
    }
Example #22
                    # if the feature hasn't been seen yet
                    fmap[fid] = 1
                    gmap[fid] = g
                else:
                    fmap[fid] += 1
                    gmap[fid] += g

        # calculate average value (gain/cover) for each feature
        for fid in gmap:
            gmap[fid] = gmap[fid] / fmap[fid]

        return gmap

plot_importance(xgb1)

dic = (xgb1.get_booster().get_score(importance_type='weight'))
print(len(dic),dic)

def get_dic():
    data = pd.read_excel('./data/all_0.xlsx')
    columns = [column for column in data]
    columns.remove('target')

    dic = {}
    for i in range(len(columns)):
        dic['f'+str(i)] = columns[i]
    return dic

conv = get_dic()
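
# A small sketch (not in the original snippet) of the apparent intent here: map the default
# 'f0', 'f1', ... keys of the importance dict back to the real column names from the spreadsheet.
named_importance = {conv.get(k, k): v for k, v in dic.items()}
print(len(named_importance), named_importance)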

Example #23
    )  # MLPClassifier(solver='lbfgs',alpha=1e-1,hidden_layer_sizes=(10,2), random_state=1)
    details["Decade"] = decade + "s"
    details["Model"] = model.fit(X_train, y_train)
    details["Feature Importance"] = list(model.feature_importances_)

    try:
        details["Co-Efficient"] = model.coef_

    except:
        pass

    y_pred = model.predict(X_test)
    # predictions = [round(value) for value in y_pred]

    accuracy = round(100 * float(metrics.accuracy_score(y_test, y_pred)), 2)
    print(decade + "s Accuracy: ", accuracy)

    details["Accuracy"] = accuracy

    logger(details)

    #visualize(list(details["Feature Importance"]))
    header = ['danceability','energy','key',\
     'loudness','mode','speechiness','acousticness','instrumentalness',\
     'liveness','valence','tempo','duration_ms','time_signature','chorusHit','sections']

    model.get_booster().feature_names = header
    plot_importance(model.get_booster())  #.set_yticklabels(header)
    plt.show()

log.close()
Example #24
def XGB(opts):
    reDirect = False
    FOLDER = 'clean_vpn12_xgb'
    if not os.path.exists(FOLDER):
        os.mkdir(FOLDER)
    MODEL_PATH = FOLDER + '/model.h5'
    FIG_PATH = FOLDER + '/Confusion_Matrix.png'
    FIG_PATH_N = FOLDER + '/Confusion_Matrix_Norm.png'


    import sys
    if(reDirect):
        old_stdout = sys.stdout
        sys.stdout = open( FOLDER + '/log', 'w')

    X_train = np.load(opts.source_data_folder+'/X_train.npy')
    y_train = np.load(opts.source_data_folder+'/y_train.npy')
    X_train = X_train.astype('float32') 

    print('X_train:', np.shape(X_train))
    print('y_train:', np.shape(y_train))

    maxsize = 0
    print('-'*20)
    for cat in np.unique(y_train):
        size = np.shape(np.where(y_train==cat))[1]
        print(str(cat)+": "+str(np.shape(np.where(y_train==cat))[1]))
        if(size > maxsize):
            maxsize = size
    print('-'*20)

    y = y_train

    X_train = normalize(X_train, norm='l2', axis=0, copy=True, return_norm=False)
    X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)


    dim = np.shape(X_train)[1]
    print(dim)
    
    #Setting Classifier
    xgbc = XGBClassifier(max_depth=20, tree_method='exact',  n_estimators=180, n_jobs=-1)
    #training
    xgbc.fit(X_train, y_train,eval_set=[(X_train, y_train), (X_val, y_val)], early_stopping_rounds=30, verbose = True)


    results = xgbc.score(X_test, y_test)

    print('Test accuracy: ', results)

    if(reDirect):
        sys.stdout = old_stdout
    print('Test accuracy: ', results)

    xgbc.get_booster().save_model(MODEL_PATH)

    y_pred = xgbc.predict(X_test)

    #load the best model
    import xgboost as xgb
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model(MODEL_PATH)  # load data
    y_pred = bst.predict(xgb.DMatrix(X_test))  # Booster.predict expects a DMatrix
    if y_pred.ndim == 2:  # multi:softprob boosters return per-class probabilities
        y_pred = np.argmax(y_pred, axis=1)



    y_p = y_pred
    y_t = y_test
    class_names = [DIG2LABEL[i] for i in range(nclass)]
    cnf_matrix = confusion_matrix(y_t, y_p)
    np.set_printoptions(precision=2)

    # Plot non-normalized confusion matrix
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,title='Confusion matrix, without normalization')
    plt.savefig(FIG_PATH)

    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,title='Normalized confusion matrix')
    plt.savefig(FIG_PATH_N)

    print('f1-score = {}'.format(f1_score(y_t, y_p, average=None)))
    print('precision = {}'.format(precision_score(y_t, y_p, average=None)))
    print('recall = {}'.format(recall_score(y_t, y_p, average=None)))
    print('macro f1 = {}'.format(f1_score(y_t, y_p, average='macro')))
Example #25
ax.plot([0,1], [0,1], color ='k', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc="lower right")
plt.savefig('boost_comparison_roc.jpeg')
fig.show()

# Feature Importances
data_boost.plot_importance()
plt.savefig('feature_importances.jpeg')
plt.show()
'''

feature_important = boost.get_booster().get_score(importance_type='gain')
keys = list(feature_important.keys())
values = list(feature_important.values())

data = pd.DataFrame(data=values, index=keys,
                    columns=["gain"]).sort_values(by="gain", ascending=True)
data.plot(kind='barh', color='r')
plt.title('XGBoost Feature Importance')
plt.show()
"""XGBoost - Run later today"""

# Grid Search XGBoost
# Ran in EC2 instance
'''
parameter_grid = {
                    'max_depth': [3, 9],
Example #26
def opt(trial):

    global LONG_PROBA_THRESH

    global SHORT_PROBA_THRESH

    global VORARITY_THRESH

    param = {}

    if is_use_gpu:

        param['tree_method'] = 'gpu_hist'

        param['max_bin'] = 16

        param['gpu_id'] = 0

    long_prob_thresh = trial.suggest_discrete_uniform('long_prob_thresh', 0.5,
                                                      0.9, 0.05)

    short_prob_thresh = trial.suggest_discrete_uniform('short_prob_thresh',
                                                       0.1, 0.5, 0.05)

    vorarity_thresh = trial.suggest_discrete_uniform('vorarity_thresh', 0.01,
                                                     0.3, 0.02)

    eta = trial.suggest_discrete_uniform('eta', 0.05, 0.5, 0.05)

    n_estimators = trial.suggest_int('n_estimators', 0, 10000)

    #n_estimators = trial.suggest_int('n_estimators', 0, 100)

    max_depth = trial.suggest_int('max_depth', 1, 10)

    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)

    subsample = trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1)

    colsample_bytree = trial.suggest_discrete_uniform('colsample_bytree', 0.5,
                                                      0.9, 0.1)

    xgboost_tuna = XGBClassifier(
        max_depth=max_depth,
        random_state=42,
        n_estimators=n_estimators,
        min_child_weight=min_child_weight,
        subsample=subsample,  # 0.7,
        colsample_bytree=colsample_bytree,  # 0.6,
        eta=eta,
        objective='binary:logistic',
        verbosity=0,
        n_thread=WHEN_TUNE_PARAM_THREAD_NUM,
        **param)

    verbosity = True

    if is_use_gpu or is_colab_cpu:

        verbosity = False

        # optuna.logging.set_verbosity(optuna.logging.CRITICAL)

        # optuna.logging.disable_default_handler()

    xgboost_tuna.fit(tr_input_arr, tr_angle_arr, verbose=verbosity)

    booster = xgboost_tuna.get_booster()

    cur_params = {
        'long_prob_thresh': str(long_prob_thresh),
        'short_prob_thresh': str(short_prob_thresh),
        'vorarity_thresh': str(vorarity_thresh),
        'eta': str(eta),
        'n_estimators': str(n_estimators),
        'max_depth': str(max_depth),
        'min_child_weight': str(min_child_weight),
        'subsample': str(subsample),
        'colsample_bytree': str(colsample_bytree)
    }

    logfile_writeln_opt(str(cur_params))

    portfolio_rslt = run_backtest(booster=booster,
                                  long_prob_thresh=long_prob_thresh,
                                  short_prob_thresh=short_prob_thresh,
                                  vorarity_thresh=vorarity_thresh)

    logfile_writeln_opt("portfolio_rslt =" + str(portfolio_rslt))

    #tuna_pred_test = xgboost_tuna.predict(val_input_arr)

    #return (1.0 - (accuracy_score(val_angle_arr, tuna_pred_test)))

    return (1.0 - ((portfolio_rslt / 1000000.0) - 0.5))
Example #27
    plt.xlim([-1, len(features)])
    plt.savefig('vriable_importance_15032019_nTree260_endcap.png')


variable_importance(model, input_vars)

##################################################################################################################################

# convert xgboost to TMVA weights

import tempfile
feature_map = tempfile.NamedTemporaryFile(suffix=".txt")
for index, varname in enumerate(input_vars):
    print >> feature_map, index, varname, "q"

feature_map.flush()

import re

tmva_output_fname = re.sub("\\.pkl$", ".xml", model_fname)

model_dump = model.get_booster().get_dump(fmap=feature_map.name)
xgboost2tmva.convert_model(model_dump,
                           input_variables=[(input_var, 'F')
                                            for input_var in input_vars],
                           output_xml=tmva_output_fname,
                           pretty=True)

print "Wrote", tmva_output_fname
###############################################################################################################################
Example #28
def train_and_generate_model():

    #global log_fd

    global log_fd_opt

    global tr_input_arr

    global tr_angle_arr

    global val_input_arr

    global val_angle_arr

    data_len = len(exchange_rates)

    log_fd_tr = open("./train_progress_log_" +
                     dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                     mode="w")

    # inner logger function for backtest

    def logfile_writeln_tr(log_str):

        nonlocal log_fd_tr

        log_fd_tr.write(log_str + "\n")

        log_fd_tr.flush()

    print("data size of rates: " + str(data_len))

    print("num of rate datas for tarin: " +
          str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))

    print("input features sets for tarin: " + str(COMPETITION_TRAIN_DATA_NUM))

    logfile_writeln_tr("data size of rates: " + str(data_len))

    logfile_writeln_tr("num of rate datas for tarin: " +
                       str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))

    tr_input_mat = []

    tr_angle_mat = []

    is_loaded_input_mat = False

    if os.path.exists("./tr_input_mat.pickle"):

        with open('./tr_input_mat.pickle', 'rb') as f:

            tr_input_mat = pickle.load(f)

        with open('./tr_angle_mat.pickle', 'rb') as f:

            tr_angle_mat = pickle.load(f)

        is_loaded_input_mat = True

    else:

        for i in range(DATA_HEAD_ASOBI,
                       len(exchange_rates) - DATA_HEAD_ASOBI - OUTPUT_LEN,
                       SLIDE_IDX_NUM_AT_GEN_INPUTS_AND_COLLECT_LABELS):

            tr_input_mat.append([
                exchange_rates[i],
                (exchange_rates[i] - exchange_rates[i - 1]) /
                exchange_rates[i - 1],
                get_rsi(exchange_rates, i),
                get_ma(exchange_rates, i),
                get_ma_kairi(exchange_rates, i),
                get_bb_1(exchange_rates, i),
                get_bb_2(exchange_rates, i),
                get_ema(exchange_rates, i),
                get_ema_rsi(exchange_rates, i),
                get_cci(exchange_rates, i),
                get_mo(exchange_rates, i),
                get_lw(exchange_rates, i),
                get_ss(exchange_rates, i),
                get_dmi(exchange_rates, i),
                get_vorarity(exchange_rates, i),
                get_macd(exchange_rates, i),
                str(judge_chart_type(exchange_rates[i - CHART_TYPE_JDG_LEN:i]))
            ])

            tr_input_mat.append([
                reverse_exchange_rates[i],
                (reverse_exchange_rates[i] - reverse_exchange_rates[i - 1]) /
                reverse_exchange_rates[i - 1],
                get_rsi(reverse_exchange_rates, i),
                get_ma(reverse_exchange_rates, i),
                get_ma_kairi(reverse_exchange_rates, i),
                get_bb_1(reverse_exchange_rates, i),
                get_bb_2(reverse_exchange_rates, i),
                get_ema(reverse_exchange_rates, i),
                get_ema_rsi(reverse_exchange_rates, i),
                get_cci(reverse_exchange_rates, i),
                get_mo(reverse_exchange_rates, i),
                get_lw(reverse_exchange_rates, i),
                get_ss(reverse_exchange_rates, i),
                get_dmi(reverse_exchange_rates, i),
                get_vorarity(reverse_exchange_rates, i),
                get_macd(reverse_exchange_rates, i),
                str(
                    judge_chart_type(
                        reverse_exchange_rates[i - CHART_TYPE_JDG_LEN:i]))
            ])

            tmp = exchange_rates[i + OUTPUT_LEN] - exchange_rates[i]

            if tmp >= 0:

                tr_angle_mat.append(1)

            else:

                tr_angle_mat.append(0)

            tmp = reverse_exchange_rates[
                i + OUTPUT_LEN] - reverse_exchange_rates[i]

            if tmp >= 0:

                tr_angle_mat.append(1)

            else:

                tr_angle_mat.append(0)

        if is_loaded_input_mat == False:

            with open('tr_input_mat.pickle', 'wb') as f:

                pickle.dump(tr_input_mat, f)

            with open('tr_angle_mat.pickle', 'wb') as f:

                pickle.dump(tr_angle_mat, f)

    #log output for tensorboard

    #configure("logs/xgboost_trade_cpu_1")

    tr_input_arr = np.array(tr_input_mat[0:COMPETITION_TRAIN_DATA_NUM])

    tr_angle_arr = np.array(tr_angle_mat[0:COMPETITION_TRAIN_DATA_NUM])

    watchlist = None

    split_idx = COMPETITION_TRAIN_DATA_NUM + int(
        (len(tr_input_mat) - COMPETITION_TRAIN_DATA_NUM) *
        VALIDATION_DATA_RATIO)

    if VALIDATION_DATA_RATIO != 0.0:

        val_input_arr = np.array(
            tr_input_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])

        val_angle_arr = np.array(
            tr_angle_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])

        watchlist = [(tr_input_arr, tr_angle_arr),
                     (val_input_arr, val_angle_arr)]

    else:

        watchlist = [(tr_input_arr, tr_angle_arr)]

    start = time.time()

    if is_param_tune_with_optuna:

        log_fd_opt = open("./tune_progress_log_" +
                          dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                          mode="w")

        study = None

        if is_use_db_at_tune:

            study = optuna.Study(study_name='fxsystrade',
                                 storage='sqlite:///../fxsystrade.db')

        else:

            study = optuna.create_study()

        parallel_num = RAPTOP_THREAD_NUM * 2

        if is_colab_cpu or is_exec_at_mba:

            parallel_num = COLAB_CPU_AND_MBA_THREAD_NUM * 2

        if special_optuna_parallel_num != -1:

            parallel_num = special_optuna_parallel_num

        study.optimize(opt, n_trials=OPTUNA_TRIAL_NUM, n_jobs=parallel_num)

        process_time = time.time() - start

        logfile_writeln_opt("best_params: " + str(study.best_params))

        logfile_writeln_opt("best_value: " + str(study.best_value))

        logfile_writeln_opt("best_trial: " + str(study.best_trial))

        logfile_writeln_opt("excecution time of tune: " + str(process_time))

        log_fd_opt.flush()

        log_fd_opt.close()

        exit()

    param = {}

    n_thread = RAPTOP_THREAD_NUM

    if is_use_gpu:

        param['tree_method'] = 'gpu_hist'

        param['max_bin'] = 16

        param['gpu_id'] = 0

        n_thread = COLAB_CPU_AND_MBA_THREAD_NUM

    if is_colab_cpu or is_exec_at_mba:

        n_thread = COLAB_CPU_AND_MBA_THREAD_NUM

    logfile_writeln_tr("training parameters are below...")

    logfile_writeln_tr(str(param))

    eval_result_dic = {}

    logfile_writeln_tr("num_round: " + str(NUM_ROUND))

    clf = XGBClassifier(max_depth=MAX_DEPTH,
                        random_state=42,
                        n_estimators=NUM_ROUND,
                        min_child_weight=18,
                        subsample=0.9,
                        colsample_bytree=0.6,
                        eta=ETA,
                        objective='binary:logistic',
                        verbosity=0,
                        n_thread=n_thread,
                        **param)

    verbosity = True

    if is_use_gpu or is_colab_cpu:

        verbosity = False

    clf.fit(tr_input_arr, tr_angle_arr, eval_set=watchlist, verbose=verbosity)

    process_time = time.time() - start

    logfile_writeln_tr("excecution time of training: " + str(process_time))

    clf.save_model('./xgb.model')

    booster = clf.get_booster()

    booster.dump_model('./xgb_model.raw.txt')

    eval_result_dic = clf.evals_result()

    for ii in range(len(eval_result_dic['validation_0']['error'])):

        if VALIDATION_DATA_RATIO != 0.0:

            logfile_writeln_tr(
                str(ii) + "," +
                str(eval_result_dic['validation_0']['error'][ii]) + "," +
                str(eval_result_dic['validation_1']['error'][ii]))

        else:

            logfile_writeln_tr(
                str(ii) + "," +
                str(eval_result_dic['validation_0']['error'][ii]))

    # Feature Importance

    fti = clf.feature_importances_

    logfile_writeln_tr('Feature Importances:')

    for i, feat in enumerate(FEATURE_NAMES):

        logfile_writeln_tr('\t{0:20s} : {1:>.6f}'.format(feat, fti[i]))

    log_fd_tr.flush()

    log_fd_tr.close()

    print("finished training and saved model.")
Example #29
# train
clf.fit(X, y, sample_weight=w)

#save results
if options.optimize:
    with open('%s/best_params.json' % options.out_dir, 'w+') as fout:
        fout.write(json.dumps(clf.best_params_))
    pd.DataFrame(clf.cv_results_).to_hdf('%s/cv_results.hd5' % options.out_dir,
                                         key='cv_results')
    if options.refit:
        clf = clf.best_estimator_
else:
    with open('%s/best_params.json' % options.out_dir, 'w+') as fout:
        fout.write(json.dumps(options.clf_params))

if not options.optimize or (options.optimize and options.refit):
    if options.save_pickle:
        with gopen('%s/model.pkl.gz' % options.out_dir, 'w+') as fout:
            pickle.dump(clf, fout)
            fout.close()
    try:
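        # newer xgboost releases expose get_booster(); fall back to the older booster() accessor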
        model = clf.get_booster()
    except:
        model = clf.booster()
    model.save_model('%s/model.xgb' % options.out_dir)

##
##
## # train it
## clf.fit(X_train,y_train,w_train)
Example #30
#calculate the xgboost probability
import numpy as np
from xgboost import XGBClassifier

#simulate inputs for training the model
#simulate with a normal distribution N(mean=1, std=1), generating a 10x6 matrix whose elements are iid normal draws
X=np.random.normal(1,1,[10,6])
#randomly generate 10 numbers, either 1 or 0
y=np.random.randint(2,size=10)

#use xgboost to train
model=XGBClassifier(learning_rate=0.1,n_estimators=2)
model.fit(X,y)

#simulate test data
Xtest=np.random.normal(1,1,[2,6])
ytest=np.random.randint(2,size=2)
#get prediction results
model.predict_proba(Xtest)
#get tree results
model.get_booster().dump_model('output.txt')
with open('output.txt','r') as f:
    model_leaves=f.read()
print(model_leaves)

#replicate proba results with tree leaf values
#for each row, find the leaf value on each tree; there are two trees in this example
#proba = 1/(1+exp(-(tree0_leaf+tree1_leaf)))
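
#A minimal sketch (not part of the original example) of that replication: with the default
#base_score=0.5 the margin's base contribution is logit(0.5)=0, so the sigmoid of the raw
#margin (the summed leaf values) reproduces predict_proba for the positive class.
from xgboost import DMatrix
margin = model.get_booster().predict(DMatrix(Xtest), output_margin=True)
proba_from_leaves = 1 / (1 + np.exp(-margin))
print(proba_from_leaves)
print(model.predict_proba(Xtest)[:, 1])
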
def run_backtest(booster=None,
                 long_prob_thresh=None,
                 short_prob_thresh=None,
                 vorarity_thresh=None):

    LONG_PROBA_THRESH_IN = LONG_PROBA_THRESH if long_prob_thresh == None else long_prob_thresh

    SHORT_PROBA_THRESH_IN = SHORT_PROBA_THRESH if short_prob_thresh == None else short_prob_thresh

    VORARITY_THRESH_IN = VORARITY_THRESH if vorarity_thresh == None else vorarity_thresh

    data_len = len(exchange_rates)

    log_fd_bt = open("./backtest_log_" +
                     dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                     mode="w")

    # inner logger function for backtest

    def logfile_writeln_bt(log_str):

        nonlocal log_fd_bt

        log_fd_bt.write(log_str + "\n")

        log_fd_bt.flush()

    logfile_writeln_bt("start backtest...")

    t_num = RAPTOP_THREAD_NUM

    if is_colab_cpu or is_exec_at_mba:

        t_num = COLAB_CPU_AND_MBA_THREAD_NUM

    if is_param_tune_with_optuna:

        t_num = WHEN_TUNE_PARAM_THREAD_NUM

    bst = None

    if booster == None:

        clf = XGBClassifier()

        clf.load_model("./xgb.model")

        bst = clf.get_booster()

        if is_use_gpu:

            bst.set_param({
                'predictor': 'gpu_predictor',
                'tree_method': 'gpu_hist'
            })

        else:

            bst.set_param({'predictor': 'cpu_predictor', 'nthread': t_num})

        #bst.load_model("./xgb.model")

    else:

        bst = booster  # use the booster passed in as an argument

        bst.set_param({'nthread': t_num})

    portfolio = 1000000

    LONG = "LONG"

    SHORT = "SHORT"

    NOT_HAVE = "NOT_HAVE"

    pos_kind = NOT_HAVE

    HALF_SPREAD = 0.0015

    SONKIRI_RATE = 0.05

    RIKAKU_PIPS = 0.60

    positions = 0

    trade_val = -1

    pos_cont_count = 0

    won_pips = 0

    start = time.time()

    ts_input_mat = []

    is_loaded_mat = False

    # if os.path.exists("./ts_input_mat.pickle"):

    #     with open('./ts_input_mat.pickle', 'rb') as f:

    #         ts_input_mat = pickle.load(f)

    #         is_loaded_mat = True

    logfile_writeln_bt("trade parameters LONG_PROBA_THRESH=" +
                       str(LONG_PROBA_THRESH) + " SHORT_PROBA_THRESH=" +
                       str(LONG_PROBA_THRESH) + " VORARITY_THRESH=" +
                       str(VORARITY_THRESH) + " trade_trying_times=" +
                       str(data_len - COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR -
                           OUTPUT_LEN))

    # log format

    a_log_str_line = "log marker, loop count, Did Action == Sonkiri, chart_type, Did Action == skip according to chart_type, Did Action == Rieki Kakutei, Did Action == Skip according to position cointain time, voratility, Did Action == skip accordint to voratility, predicted prob, Get long position => 1 Get Short position => 2 else => 0, Did Action == Skip by chart_type at last decision"

    #logfile_writeln_bt("check_ts_input_mat,range func argument," + str(data_len - COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR - OUTPUT_LEN))

    #logfile_writeln_bt("check_ts_input_mat,current_sport start," + str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR + OUTPUT_LEN))

    for window_s in range(data_len - COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR -
                          OUTPUT_LEN):

        #current_spot = DATA_HEAD_ASOBI + window_s # for trying backtest with trained period

        current_spot = COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR + window_s + OUTPUT_LEN

        logfile_writeln_bt(a_log_str_line)

        skip_flag = False

        delay_continue_flag = False

        vorarity = -1  # default value for log output

        a_log_str_line = "log," + str(window_s)

        if pos_kind != NOT_HAVE:

            if pos_kind == LONG:

                cur_portfo = positions * (exchange_rates[current_spot] -
                                          HALF_SPREAD)

                diff = (exchange_rates[current_spot] - HALF_SPREAD) - trade_val

            elif pos_kind == SHORT:

                cur_portfo = portfolio + (
                    positions * trade_val - positions *
                    (exchange_rates[current_spot] + HALF_SPREAD))

                diff = trade_val - (exchange_rates[current_spot] + HALF_SPREAD)

            if (cur_portfo - portfolio) / portfolio < -1 * SONKIRI_RATE:

                portfolio = cur_portfo

                pos_kind = NOT_HAVE

                won_pips += diff

                logfile_writeln_bt(
                    str(diff) + "pips " + str(won_pips) + "pips")

                a_log_str_line += ",1,0,0,0,0,0,0,0,0,0"

                #continue

                delay_continue_flag = True

        long_chart_ok = False

        short_chart_ok = False

        if delay_continue_flag == False:  # or is_loaded_mat == False:

            chart_type = judge_chart_type(
                exchange_rates[current_spot - CHART_TYPE_JDG_LEN:current_spot])

            long_chart_ok = chart_type in chart_filter_type_long

            short_chart_ok = chart_type in chart_filter_type_short

            #if chart_type != 1 and chart_type != 2:

            if not (long_chart_ok or short_chart_ok):

                skip_flag = True

                if pos_kind != NOT_HAVE:

                    # if liner trend keep position

                    a_log_str_line += ",0," + str(
                        chart_type) + ",1,0,0,0,0,0,0,0"

                    #continue

                    delay_continue_flag = True

        if pos_kind != NOT_HAVE and delay_continue_flag == False:

            if pos_cont_count >= (OUTPUT_LEN - 1):

                if pos_kind == LONG:

                    pos_kind = NOT_HAVE

                    portfolio = positions * (exchange_rates[current_spot] -
                                             HALF_SPREAD)

                    diff = (exchange_rates[current_spot] -
                            HALF_SPREAD) - trade_val

                    won_pips += diff

                    logfile_writeln_bt(
                        str(diff) + "pips " + str(won_pips) + "pips")

                    logfile_writeln_bt(exchange_dates[current_spot] + " " +
                                       str(portfolio))

                    a_log_str_line += ",0," + str(
                        chart_type) + ",0,1,0,0,0,0,0,0"

                elif pos_kind == SHORT:

                    pos_kind = NOT_HAVE

                    portfolio += positions * trade_val - positions * (
                        exchange_rates[current_spot] + HALF_SPREAD)

                    diff = trade_val - (exchange_rates[current_spot] +
                                        HALF_SPREAD)

                    won_pips += diff

                    logfile_writeln_bt(
                        str(diff) + "pips " + str(won_pips) + "pips")

                    logfile_writeln_bt(exchange_dates[current_spot] + " " +
                                       str(portfolio))

                    a_log_str_line += ",0," + str(
                        chart_type) + ",0,1,0,0,0,0,0,0"

                pos_cont_count = 0

            else:

                a_log_str_line += ",0," + str(chart_type) + ",0,0,1,0,0,0,0,0"

                pos_cont_count += 1

            #continue

            delay_continue_flag = True

        if delay_continue_flag == False:  #or is_loaded_mat == False:

            vorarity = get_vorarity(exchange_rates, current_spot)

            #            if vorarity >= 0.07:

            if vorarity >= VORARITY_THRESH_IN:

                a_log_str_line += ",0," + str(chart_type) + ",0,0,0," + str(
                    vorarity) + ",1,0,0,0"

                #continue

                delay_continue_flag = True

        if skip_flag and delay_continue_flag == False:

            a_log_str_line += ",0," + str(chart_type) + ",0,0,0," + str(
                vorarity) + ",0,0,0,1"

            #continue

            delay_continue_flag = True

        if delay_continue_flag == True:

            continue

        # prediction

        ts_input_mat = []

        if is_loaded_mat == False:

            ts_input_mat.append([
                exchange_rates[current_spot],
                (exchange_rates[current_spot] -
                 exchange_rates[current_spot - 1]) /
                exchange_rates[current_spot - 1],
                get_rsi(exchange_rates, current_spot),
                get_ma(exchange_rates, current_spot),
                get_ma_kairi(exchange_rates, current_spot),
                get_bb_1(exchange_rates, current_spot),
                get_bb_2(exchange_rates, current_spot),
                get_ema(exchange_rates, current_spot),
                get_ema_rsi(exchange_rates, current_spot),
                get_cci(exchange_rates, current_spot),
                get_mo(exchange_rates, current_spot),
                get_lw(exchange_rates, current_spot),
                get_ss(exchange_rates, current_spot),
                get_dmi(exchange_rates, current_spot), vorarity,
                get_macd(exchange_rates, current_spot),
                str(chart_type)
            ])

            #logfile_writeln_bt("check_ts_input_mat,check append window_s," + str(window_s) + "\n")

        ts_input_arr = np.array(ts_input_mat)

        dtest = xgb.DMatrix(ts_input_arr)

        pred = bst.predict(dtest)

        #print(pred)

        predicted_prob = pred[0]

        if pos_kind == NOT_HAVE and skip_flag == False:

            if predicted_prob > LONG_PROBA_THRESH_IN and long_chart_ok:  #chart_type == 2:

                pos_kind = LONG

                positions = portfolio / (exchange_rates[current_spot] +
                                         HALF_SPREAD)

                trade_val = exchange_rates[current_spot] + HALF_SPREAD

                a_log_str_line += ",0," + str(chart_type) + ",0,0,0," + str(
                    vorarity) + ",1," + str(predicted_prob) + ",1,0"

            elif predicted_prob < SHORT_PROBA_THRESH_IN and short_chart_ok:  #chart_type == 1:

                pos_kind = SHORT

                positions = portfolio / (exchange_rates[current_spot] -
                                         HALF_SPREAD)

                trade_val = exchange_rates[current_spot] - HALF_SPREAD

                a_log_str_line += ",0," + str(chart_type) + ",0,0,0," + str(
                    vorarity) + ",1," + str(predicted_prob) + ",2,0"

            else:

                a_log_str_line += ",0," + str(chart_type) + ",0,0,0," + str(
                    vorarity) + ",1," + str(predicted_prob) + ",0,0"

        else:

            raise Exception("this path should not be executed!!!!")

            #a_log_str_line += "0," + str(chart_type) + ",0,0,0," + str(vorarity) + ",1,0,0,1"

    # if is_loaded_mat == False:

    #     with open('./ts_input_mat.pickle', 'wb') as f:

    #         pickle.dump(ts_input_mat, f)

    logfile_writeln_bt("finished backtest.")

    process_time = time.time() - start

    logfile_writeln_bt("excecution time of backtest: " + str(process_time))

    log_fd_bt.flush()

    log_fd_bt.close()

    return portfolio