예제 #1
0
class TestNode(unittest.TestCase):
    """Check that ``bdt2cpp.Node`` parses lines of an XGBoost text dump."""

    def setUp(self):
        """Fit a three-estimator classifier and keep its dump, split by line."""
        features, labels = make_classification(n_features=5, random_state=1)
        self.classifier = XGBClassifier(n_estimators=3)
        self.classifier.fit(features, labels)
        self.predictions = self.classifier.predict_proba(features)
        # One list of text lines per dumped tree.
        dumped_trees = self.classifier.booster().get_dump()
        self.model_dump = [dumped.split('\n') for dumped in dumped_trees]

    def test_parse_root_node(self):
        """Parse the first line of the first tree as a parentless root.

        Returns the parsed node so that other tests can attach children.
        """
        first_line = self.model_dump[0][0]
        root = bdt2cpp.Node(first_line)
        self.assertEqual(root.cut_value, -0.464102)
        self.assertFalse(root.weight)
        self.assertEqual(root.root, root)
        self.assertIsNone(root.parent)
        return root

    def test_parse_left_node(self):
        """Parse the second line of the first tree as the root's left child.

        Returns the parsed child node.
        """
        root = self.test_parse_root_node()
        second_line = self.model_dump[0][1]
        root.left = bdt2cpp.Node(second_line, parent=root)
        child = root.left
        self.assertEqual(child.parent, root)
        self.assertFalse(child.cut_value)
        self.assertEqual(child.weight, -0.184906)
        return child
def get_xgb_feature_importance_plot(best_param_, experiment_,
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):
    """Fit an XGBClassifier and save a bar plot of its feature f-scores.

    Args:
        best_param_: dict of keyword parameters applied to the classifier
            with ``set_params``. A ``'model_type'`` entry, if present, is
            ignored. The dict itself is left unmodified.
        experiment_: object whose ``get_train_data()`` returns
            ``(train_X, train_y)``.
        png_folder: not used by this function; the image is written to
            ``<Config 'data.path'>/graph/<png_fname>``.
        png_fname: file name of the saved figure.
        score_threshold: quantile of the positive scores. When more than 20
            features have a score, only those at or above this quantile are
            plotted.

    Returns:
        True once the figure has been saved.
    """
    # 1. fit the model and collect the per-feature f-scores
    train_X, train_y = experiment_.get_train_data()
    # Work on a filtered copy so the caller's parameter dict is not mutated.
    params = {key: value for key, value in best_param_.items()
              if key != 'model_type'}
    clf = XGBClassifier()
    clf.set_params(**params)
    clf.fit(train_X, train_y)

    # ``booster()`` was renamed ``get_booster()`` in later xgboost releases;
    # support both spellings.
    if hasattr(clf, 'get_booster'):
        booster = clf.get_booster()
    else:
        booster = clf.booster()
    index2feature = booster.get_fscore()

    # Materialise the dict views so the frame is built from plain lists.
    fis = pd.DataFrame({'name': list(index2feature.keys()),
                        'score': list(index2feature.values())})
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        cutoff = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        # A boolean mask avoids rounding the cutoff through '%f' formatting.
        fis = fis[fis['score'] >= cutoff]

    # 2. plot the feature importances
    sns.barplot(x='score', y='name', data=fis, color="blue")
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    # 3. save the figure
    png_fname = os.path.join(Config.get_string('data.path'), 'graph', png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
예제 #3
0
def xgb_train_offline():
    """Train an XGBoost classifier offline and report feature scores.

    Loads the data through ``get_data`` / ``fea_select``, drops the ID
    columns, fits the classifier on the train split while evaluating log-loss
    on train and validation, prints the validation log-loss, writes the
    per-feature scores to ``file_path + 'fea_score.csv'`` and shows them as a
    bar chart.
    """
    print('data process...')
    data = get_data()
    data = fea_select(data)

    id_fea = ['user_id', 'item_id', 'shop_id', 'context_id', 'context_page_id']
    data.drop(id_fea, axis=1, inplace=True)  # ID features are not used for now

    # The test split is not needed for offline training.
    train, val, _ = gen_train_val_test(data, True)

    y_train = train['is_trade']
    X_train = train.drop(['is_trade'], axis=1)

    y_val = val['is_trade']
    X_val = val.drop(['is_trade'], axis=1)

    print('start training...')
    model = XGBClassifier(objective='binary:logistic',
                          learning_rate=0.01,
                          n_estimators=10,
                          max_depth=5,
                          subsample=0.7,
                          colsample_bytree=0.7,
                          reg_lambda=0.005,
                          nthread=4,
                          seed=128,
                          silent=10)

    model.fit(X_train,
              y_train,
              eval_set=[(X_train, y_train), (X_val, y_val)],
              eval_metric='logloss',
              early_stopping_rounds=50)
    y_prob = model.predict_proba(X_val,
                                 ntree_limit=model.best_ntree_limit)[:, 1]
    print('result log_loss = {0}'.format(log_loss(y_val, y_prob)))

    # ``booster()`` was renamed ``get_booster()`` in later xgboost releases;
    # support both spellings.
    if hasattr(model, 'get_booster'):
        booster = model.get_booster()
    else:
        booster = model.booster()
    # get_fscore() only lists features that were used in a split, so look the
    # score up per column and default to 0 for the rest.
    # NOTE(review): assumes the booster's feature names equal the DataFrame
    # column names -- confirm for the xgboost version in use.
    fscore = booster.get_fscore()
    feature_names = X_train.columns.tolist()
    fea_score = pd.DataFrame()
    fea_score['feature'] = feature_names
    fea_score['score'] = [fscore.get(name, 0) for name in feature_names]
    fea_score = fea_score.sort_values(by='score',
                                      ascending=False).reset_index(drop=True)
    fea_score.to_csv(file_path + 'fea_score.csv', index=False)

    fea_map = pd.Series(data=fea_score['score'].values,
                        index=fea_score['feature'])
    fea_map.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()

    print('end...')
예제 #4
0
def XGBooster(Xtrain, Ytrain, Xtest, Ytest):
    """Fit a default XGBClassifier and evaluate it on the test set.

    Args:
        Xtrain: training features; must expose ``.columns``.
        Ytrain: training labels.
        Xtest: test features.
        Ytest: test labels.

    Returns:
        A 6-tuple ``(importances, model, yPred, predictions, accuracy, None)``.
        ``importances`` is a one-column ('importance') DataFrame indexed by
        the training columns, sorted in descending order. The trailing
        ``None`` is kept so existing callers that unpack six values keep
        working (the original returned the result of ``print`` there).
    """
    # Train
    XG = XGBClassifier()
    XG.fit(Xtrain, Ytrain)

    # Feature importances.
    # ``booster()`` was renamed ``get_booster()`` in later xgboost releases;
    # support both spellings.
    if hasattr(XG, 'get_booster'):
        booster = XG.get_booster()
    else:
        booster = XG.booster()
    # Build the frame from a Series keyed by feature name. Passing the score
    # dict to DataFrame with columns=['importance'] gave an all-NaN column.
    # Features never used in a split are missing from get_fscore(); they get 0.
    # NOTE(review): assumes the booster's feature names equal Xtrain.columns.
    fscore = pd.Series(booster.get_fscore(), dtype=float)
    XGFeature_importances = (fscore.reindex(Xtrain.columns)
                             .fillna(0.0)
                             .to_frame('importance')
                             .sort_values('importance', ascending=False))

    # Test
    yPred = XG.predict(Xtest)
    XGPredictions = [round(value) for value in yPred]
    XGAccuracy = accuracy_score(Ytest, XGPredictions)
    print("Accuracy: %.2f%%" % (XGAccuracy * 100.0))
    return XGFeature_importances, XG, yPred, XGPredictions, XGAccuracy, None
예제 #5
0
# Train the estimator on the weighted sample.
clf.fit(X, y, sample_weight=w)

# Save the parameters that were used (and the search results, if any).
best_params_path = '%s/best_params.json' % options.out_dir
if options.optimize:
    with open(best_params_path, 'w') as fout:
        fout.write(json.dumps(clf.best_params_))
    pd.DataFrame(clf.cv_results_).to_hdf('%s/cv_results.hd5' % options.out_dir,
                                         key='cv_results')
    if options.refit:
        # Continue with the refitted best estimator instead of the search object.
        clf = clf.best_estimator_
else:
    with open(best_params_path, 'w') as fout:
        fout.write(json.dumps(options.clf_params))

# A fitted model exists either when no search was run, or when the search
# refitted its best estimator. The original condition read
# `optimize.optimize`, an undefined name; `options.optimize` was meant, and
# `not A or (A and B)` reduces to `not A or B`.
if not options.optimize or options.refit:
    if options.save_pickle:
        # The context manager closes the file; no explicit close() is needed.
        with gopen('%s/model.pkl.gz' % options.out_dir, 'w+') as fout:
            pickle.dump(clf, fout)
    # ``booster()`` was renamed ``get_booster()`` in later xgboost releases;
    # fall back to the old name only when the new one is missing.
    try:
        model = clf.get_booster()
    except AttributeError:
        model = clf.booster()
    model.save_model('%s/model.xgb' % options.out_dir)

##
##
## # train it
## clf.fit(X_train,y_train,w_train)
예제 #6
0
# Append the bag-of-words columns and the zero-count feature, then run the
# preprocessing pipeline over the combined frame.
df_all = pd.concat([df_all, bow], axis=1)
df_all['num_zero'] = num_zero
df_all = pipeline.fit(df_all).transform(df_all)

# The first df_train.shape[0] rows are the training rows; the rest are test rows.
X_train = df_all.iloc[:df_train.shape[0], :]
X_test = df_all.iloc[df_train.shape[0]:, :]
y_train = df_target
ID_test = df_id

# best params so far using column/row subsampling, even longer training
learning_rate = 0.01
n_estimators = 800
max_depth = 6
subsample = 0.9
colsample_bytree = 0.85
min_child_weight = 1  # default

xgb = XGBClassifier(seed=0, learning_rate=learning_rate, n_estimators=n_estimators,
                    min_child_weight=min_child_weight, max_depth=max_depth,
                    colsample_bytree=colsample_bytree, subsample=subsample)
xgb = xgb.fit(X_train, y_train, eval_set=[(X_train, y_train)], eval_metric='auc')

# ``booster()`` was renamed ``get_booster()`` in later xgboost releases;
# support both spellings.
if hasattr(xgb, 'get_booster'):
    importances = xgb.get_booster().get_fscore()
else:
    importances = xgb.booster().get_fscore()
# Build the frame from a concrete list: on Python 3, zip() is a lazy iterator.
df_importance = pd.DataFrame(list(importances.items()),
                             columns=['feature', 'importance'])
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print(df_importance.sort_values('importance', ascending=False).reset_index(drop=True))

# Write the positive-class probability for every test ID.
y_pred = xgb.predict_proba(X_test)
submission = pd.DataFrame({'ID': ID_test, 'TARGET': y_pred[:, 1]})
submission.to_csv(filename, index=False)
print('Wrote %s' % filename)
예제 #7
0
def get_xgb_feature_importance_plot(best_param_,
                                    experiment_,
                                    png_folder,
                                    png_fname,
                                    score_threshold=0.8):
    """Fit an XGBClassifier and save a bar plot of its feature f-scores.

    Args:
        best_param_: dict of keyword parameters applied to the classifier
            with ``set_params``. A ``'model_type'`` entry, if present, is
            ignored. The dict itself is left unmodified.
        experiment_: object whose ``get_train_data()`` returns
            ``(train_X, train_y)``.
        png_folder: not used by this function; the image is written to
            ``<Config 'data.path'>/graph/<png_fname>``.
        png_fname: file name of the saved figure.
        score_threshold: quantile of the positive scores. When more than 20
            features have a score, only those at or above this quantile are
            plotted.

    Returns:
        True once the figure has been saved.
    """
    # 1. fit the model and collect the per-feature f-scores
    train_X, train_y = experiment_.get_train_data()
    # Work on a filtered copy so the caller's parameter dict is not mutated.
    params = {
        key: value
        for key, value in best_param_.items() if key != 'model_type'
    }
    clf = XGBClassifier()
    clf.set_params(**params)
    clf.fit(train_X, train_y)

    # ``booster()`` was renamed ``get_booster()`` in later xgboost releases;
    # support both spellings.
    if hasattr(clf, 'get_booster'):
        booster = clf.get_booster()
    else:
        booster = clf.booster()
    index2feature = booster.get_fscore()

    # Materialise the dict views so the frame is built from plain lists.
    fis = pd.DataFrame({
        'name': list(index2feature.keys()),
        'score': list(index2feature.values())
    })
    fis = fis.sort_values('score', ascending=False)
    if len(fis.index) > 20:
        cutoff = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        # A boolean mask avoids rounding the cutoff through '%f' formatting.
        fis = fis[fis['score'] >= cutoff]

    # 2. plot the feature importances
    sns.barplot(x='score', y='name', data=fis, color="blue")
    plt.ylabel("Feature", fontsize=10)
    plt.xlabel("Feature_Importance : f-Score", fontsize=10)

    # 3. save the figure
    png_fname = os.path.join(Config.get_string('data.path'), 'graph',
                             png_fname)
    plt.tight_layout()
    plt.savefig(png_fname)
    plt.close()

    return True
예제 #8
0
        colsample_bytree=0.8,  # 0.9
        scale_pos_weight=14,  # 10
        objective="reg:logistic",
        nthread=-1,
        seed=random_seed)

    # analyse_n_estimators(model, train_x, train_y, est_list=[8000, 9000, 10000, 11000, 12000])

    # params = {"max_depth": [3, 4, 5],
    #           "min_child_weight": [1, 10, 100],}
    # params = {"min_child_weight": [5, 7, 9, 10, 11, 13, 15]}
    # params = {"subsample": [0.7, 0.8, 0.9, 1],
    #           "colsample_bytree": [0.7, 0.8, 0.9, 1],}
    # params = {"reg_lambda": [0.1, 1, 10, 100]}
    # params = {"scale_pos_weight": [6, 8, 10, 12, 14, 16, 18]}
    # g_model = GridSearchCV(model, param_grid=params, scoring="f1", cv=5, n_jobs=-1, iid=False, verbose=0)
    # g_model.fit(train_x, train_y)
    # print(g_model.grid_scores_)
    # print(g_model.best_score_)
    # print(g_model.best_params_)
    # log.info("Best parameter is {}, with score {}".format(g_model.best_params_, g_model.best_score_))

    model.fit(train_x, train_y)
    pred = model.predict(test_x)
    test_uid = test.iloc[:, 0]
    result = pd.DataFrame(columns=["uid", "label"])
    result["uid"] = test_uid
    result["label"] = pred
    result.to_csv("result.csv", index=False)
    model.booster().save_model("2.model")
예제 #9
0
class classifier:
    """XGBoost classifier wrapper that tunes hyper-parameters one at a time.

    ``tune`` grid-searches a fixed sequence of parameters and stores each
    winner on ``self.model`` before tuning the next one. ``train`` tunes,
    fits, prints train/test accuracy and optionally plots the evaluation
    curves and feature importances.
    """

    # Parameter names accepted by para_tuning().
    _TUNABLE = ('learning_rate', 'max_depth', 'min_child_weight', 'gamma',
                'max_delta_step', 'colsample_bytree', 'reg_alpha',
                'reg_lambda')
    # Number of para_tuning() calls made by one tune() run; scales the bar.
    _TUNING_STEPS = 7
    # Width of the textual progress bar, in characters.
    _BAR_WIDTH = 58

    def __init__(self):
        self.model = XGBClassifier()
        self.progress = 0  # number of para_tuning() searches completed

    def _get_booster(self):
        """Return the fitted booster under either xgboost accessor name."""
        # ``booster()`` was renamed ``get_booster()`` in later releases.
        if hasattr(self.model, 'get_booster'):
            return self.model.get_booster()
        return self.model.booster()

    def para_tuning(self, X, y, para, grid, seed=0, verbose=False):
        """Grid-search a single hyper-parameter with 8-fold stratified CV.

        Args:
            X: training features.
            y: training labels.
            para: name of the parameter to tune (one of ``_TUNABLE``).
            grid: candidate values for that parameter.
            seed: random state of the shuffled stratified K-fold.
            verbose: 1 prints the tuning log, 2 plots the scores, 3 does
                both. ``True`` compares equal to 1 and so prints the log.

        Returns:
            The best value found, or None when ``para`` is empty or unknown
            (the latter also prints 'WRONG PARAMETER.').
        """
        # determine which parameter to tune this time
        if para == '':
            return None
        if para not in self._TUNABLE:
            print('WRONG PARAMETER.')
            return None
        param_grid = {para: grid}

        kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=seed)
        grid_search = GridSearchCV(self.model,
                                   param_grid,
                                   scoring='accuracy',
                                   n_jobs=-1,
                                   cv=kfold)
        grid_result = grid_search.fit(X, y)

        # summarize results
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']
        if verbose in (1, 3):
            for mean, stdev, param in zip(means, stds, params):
                print('{:.4f} ({:.4f}) WITH: {} = {}'.format(
                    mean, stdev, para, param[para]))
            print('-' * 63)

        # Redraw the progress bar. The fraction is capped at 1 so extra calls
        # cannot push the bar past its width.
        self.progress += 1
        done = min(self.progress / self._TUNING_STEPS, 1.0)
        progress = int(done * 100)
        progress_bar = int(done * self._BAR_WIDTH)
        print('\r' + '█' * progress_bar + ' ' *
              (self._BAR_WIDTH - progress_bar) + ' {:>3}%'.format(progress),
              end='')

        if verbose in (2, 3):
            # plot
            plt.close()
            plt.figure(figsize=(20, 10))
            plt.errorbar(grid, means, yerr=stds)
            plt.title('XGBoost {} Tuning'.format(para))
            plt.xlabel(para)
            plt.ylabel('accuracy')
            plt.show()
        return grid_result.best_params_[para]

    def tune(self, X, y, verbose=False, seed=0):
        """Tune the model's hyper-parameters sequentially on ``(X, y)``.

        Each tuned value is written to ``self.model`` before the next
        parameter is searched. Afterwards the learning rate is halved. The
        method sleeps for three seconds and then prompts on stdin whether to
        print the model.
        """
        self.model.seed = seed
        # Start every tuning run with an empty progress bar.
        self.progress = 0
        print('-' * 63)
        print('AUTO TUNING ON TRAINING DATASET.')
        self.model.n_estimators = 1024
        self.model.subsample = 0.6
        self.model.learning_rate = 0.01

        # Order matters: later searches run with the earlier winners applied.
        schedule = (
            ('max_depth', [2, 4, 6, 8]),
            ('min_child_weight', [4, 8, 12, 16]),
            ('gamma', [0, 0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8]),
            ('max_delta_step', [0, 1, 2, 4]),
            ('colsample_bytree', [0.5, 0.6, 0.7]),
            ('reg_alpha', [0, 0.001, 0.01, 0.1, 10, 100]),
            ('reg_lambda', [0, 0.001, 0.01, 0.1, 10, 100]),
        )
        for name, grid in schedule:
            best = self.para_tuning(X, y, name, grid, seed, verbose)
            setattr(self.model, name, best)
        self.model.learning_rate /= 2

        sleep(3)
        print('\rAUTO TUNING FINISHED.' + ' ' * 42)
        print('-' * 63)
        if input('MODEL REVIEWING? (Y/N) ') == 'Y':
            print(self.model)

    def train(self, data, early_stopping_rounds=None, verbose=True, seed=0):
        """Tune and fit the model, print accuracies and optionally plot.

        Args:
            data: object with ``train`` and ``test`` as ``(X, y)`` pairs and,
                for plotting, a ``features`` sequence of feature names.
            early_stopping_rounds: forwarded to ``fit``.
            verbose: when exactly ``True``, plot the evaluation curves and
                the feature importances.
            seed: forwarded to ``tune``.
        """
        X_train, y_train = data.train[0], data.train[1]
        X_test, y_test = data.test[0], data.test[1]

        # tune parameters using the training dataset
        self.tune(X_train, y_train, seed=seed)
        print('-' * 63)
        # train the model with optimized parameters
        print('MODEL TRAINING.')
        metric = ['error', 'logloss', 'auc']
        self.model.fit(X_train,
                       y_train,
                       eval_metric=metric,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=False)

        # accuracy on the training data
        predictions = [round(value) for value in self.model.predict(X_train)]
        accuracy = accuracy_score(y_train, predictions)
        print('TRAINING FINISHED.')
        print('ACCURACY TRAINING: {:.2f}%'.format(accuracy * 100))

        # accuracy on the test data
        predictions = [round(value) for value in self.model.predict(X_test)]
        accuracy = accuracy_score(y_test, predictions)
        print('ACCURACY TESTING: {:.2f}%'.format(accuracy * 100))

        if verbose is True:
            try:
                # plot boosting results: one row per metric in the left column
                results = self.model.evals_result()
                epochs = len(results['validation_0'][metric[0]])
                x_axis = range(0, epochs)
                plt.style.use('ggplot')
                plt.rcParams['font.size'] = 8
                plt.figure(figsize=(20, 10))
                for row, m in enumerate(metric):
                    ax = plt.subplot2grid((len(metric), 2), (row, 0))
                    ax.plot(x_axis, results['validation_0'][m], label='Train')
                    ax.plot(x_axis, results['validation_1'][m], label='Test')
                    ax.legend()
                    ax.set_ylabel(m)
                # plot feature importances in the right column
                features = data.features
                mapFeat = {
                    'f' + str(i): name
                    for i, name in enumerate(features)
                }
                imp = pd.Series(self._get_booster().get_fscore())
                # Keys that are not of the form 'f<i>' are kept as they are.
                imp.index = [mapFeat.get(key, key) for key in imp.index]
                ax = plt.subplot2grid((len(metric), 2), (0, 1),
                                      rowspan=len(metric))
                imp.sort_values().plot(kind='barh')
                ax.set_ylabel('importance')
                plt.show()
            except Exception as err:
                # Plotting is best-effort, but report why it failed.
                print('PLOTTING ERROR.')
                print(repr(err))
class XGBoostModel:
    """XGBoost classifier that can use a random forest's output as a feature.

    With ``use_rfc`` enabled, the probabilities of an unpickled ``RFCModel``
    are added to the data as the ``rfc_proba`` column.
    """

    def __init__(self, use_rfc=True):
        self.use_rfc = use_rfc
        if self.use_rfc:
            # Instantiate Random Forest Classifier
            self.rfc = RFCModel()
            self.rfc.unpickle()

    def _booster(self):
        """Return the fitted booster under either xgboost accessor name."""
        # ``booster()`` was renamed ``get_booster()`` in later releases.
        if hasattr(self.model, 'get_booster'):
            return self.model.get_booster()
        return self.model.booster()

    def load_train_data(self):
        """Load 'data/data.json' and store a stratified 80/20 split on self."""
        self.df, y, _ = clean_df('data/data.json', training=True)

        if self.use_rfc:
            # Include results from random forest classifier as new column
            rfc_probs = self.rfc.predict_proba_all()
            self.df['rfc_proba'] = rfc_probs

        X = self.df.values

        self.features = self.df.columns
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.20, stratify=y, random_state=42)

    def load_test_data(self):
        """Load 'data/data_point.json'; return ``(values, oid)``."""
        self.df, _, oid = clean_df('data/data_point.json', training=False)

        if self.use_rfc:
            # Include results from random forest classifier as new column
            rfc_probs = self.rfc.predict_proba('data/data_point.json')
            self.df['rfc_proba'] = rfc_probs

        return self.df.values, oid

    def load_one(self, one_json):
        """Clean a single JSON record; return ``(values, oid)``.

        ``one_json`` is wrapped in ``[...]`` before being passed to
        ``clean_df``.
        """
        self.df, _, oid = clean_df('[' + one_json + ']', training=False)

        if self.use_rfc:
            # Include results from random forest classifier as new column
            # NOTE(review): the probabilities come from
            # 'data/data_point.json', not from one_json -- confirm intended.
            rfc_probs = self.rfc.predict_proba('data/data_point.json')
            self.df['rfc_proba'] = rfc_probs

        return self.df.values, oid

    def fit(self):
        """Fit the classifier on the split stored by ``load_train_data``."""
        self.model = XGBClassifier(
            max_depth=8,
            # reg_alpha=.8,
            n_estimators=200,
            scale_pos_weight=10.13,
            learning_rate=0.1)

        self.model.fit(self.X_train, self.y_train)

    @property
    def feature_importances_(self):
        """F-scores of all booster features, normalised to sum to one.

        Features missing from ``get_fscore()`` count as 0. When every score
        is 0 the zero vector is returned instead of dividing by zero.
        """
        # Adapted from
        # https://github.com/dmlc/xgboost/commit/dd477ac903eb6f658d6fb2984763c3f8a4516389#diff-2c197a11c1b576e821f5942be9eab74c
        b = self._booster()
        fs = b.get_fscore()
        all_features = np.array([fs.get(f, 0.) for f in b.feature_names],
                                dtype=np.float32)
        total = all_features.sum()
        if total == 0:
            return all_features
        return all_features / total

    def plot_features(self, save_img_dir=None, img_name_prefix='', ext='svg'):
        '''
        Plot normalised F-scores per feature as a line plot and a bar plot.

        use ext='svg' for web!
        add save_img_dir location to save images
        save_img_dir has NO trailing slash!
        eg 'static/images'
        to keep multiple images saved add prefix string
        prefix will be added to image file name
        '''

        # this is needed to fix label clipping in saved files
        from matplotlib import rcParams
        rcParams.update({'figure.autolayout': True})

        # heavily modified from https://gist.github.com/light94/6c42df29f3232ae31e52
        b = self._booster()
        fs = b.get_fscore()
        # Map the 'f<i>' keys used here to the stored column names.
        all_features = {
            self.features[i]: float(fs.get('f' + str(i), 0.))
            for i in range(len(b.feature_names))
        }
        importance = sorted(all_features.items(), key=itemgetter(1))

        ff = pd.DataFrame(importance, columns=['feature', 'fscore'])
        ff['fscore'] = ff['fscore'] / ff['fscore'].sum()

        # plot 1: line plot of the sorted scores
        ax = ff.fscore.plot(xticks=ff.index, rot=65)
        ax.set_xticklabels(ff.feature)
        plt.title('XGBoost F-scores by feature')

        if save_img_dir is not None:
            plt.savefig('{}/{}feature_fscores.{}'.format(
                save_img_dir, img_name_prefix, ext))
        plt.show()

        # plot 2: horizontal bar chart
        ff.plot(kind='barh',
                x='feature',
                y='fscore',
                legend=False,
                figsize=(6, 10))
        plt.title('XGBoost Feature Importance')
        plt.xlabel('relative importance')
        if save_img_dir is not None:
            plt.savefig('{}/{}features_barh.{}'.format(save_img_dir,
                                                       img_name_prefix, ext))
        plt.show()
        plt.close()

    def pickle(self):
        """Save the model to 'data/XGBoostModel.pkl'."""
        _pickle(self.model, 'data/XGBoostModel.pkl')

    def unpickle(self):
        """Load the model from 'data/XGBoostModel.pkl'."""
        self.model = _unpickle('data/XGBoostModel.pkl')

    def score(self):
        """Print accuracy, F1 and the confusion matrix for the test split."""
        y_pred = self.model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)
        f1 = f1_score(self.y_test, y_pred)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        print("f1: %.2f" % f1)
        print('Confusion matrix')
        print(np.array([['TN', 'FP'], ['FN', 'TP']]))
        print(confusion_matrix(self.y_test, y_pred))

    def predict(self, X):
        """Return the model's predictions for ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return column 1 of the model's ``predict_proba`` output."""
        prob = self.model.predict_proba(X)
        return prob[:, 1]
예제 #11
0
def MLdecTree(learnData, picpath, endpoint="I9_STR_EXH", delCol=["I9_STR_SAH", "I9_SEQULAE", "I9_STR", "IX_CIRCULATORY"], corrValue=0.995, binary=True):
    """Train an XGBoost model for one endpoint and save diagnostic images.

    Args:
        learnData: DataFrame of preprocessed data. The target is the first
            column whose name contains ``endpoint`` and "nevt" (both
            case-insensitive).
        picpath: directory under which the confusion-matrix figure and the
            rendered first tree are written.
        endpoint: name fragment identifying the endpoint columns.
        delCol: regex fragments; columns matching any of them are dropped
            from the features. The default list is never mutated here.
        corrValue: absolute Spearman correlation above which a "nevt" column
            (and its "<core>_AGE" companion) is dropped.
        binary: if True use the "binary:logistic" objective, otherwise
            "reg:tweedie".

    Returns:
        ``(accuracy, model, corrDropCol)`` where ``corrDropCol`` lists the
        column names dropped because of high correlation.
    """
    learnColumn = learnData.columns

    # Correlate every "nevt" column with the endpoint columns and remember
    # the ones that are (anti-)correlated beyond corrValue.
    matching = [s for s in learnColumn if endpoint.lower() in s.lower()]
    endpointofInterest = [s for s in matching if "nevt" in s.lower()]
    corrDropCol = []
    for colName in learnColumn:
        if "nevt" not in colName.lower():
            continue
        coreName = colName.split('_NEVT')[0]
        for match in matching:
            corrCo = learnData[match].corr(learnData[colName], method='spearman')
            if (corrCo > corrValue) or (corrCo < -corrValue):
                corrDropCol.extend([colName, coreName + "_AGE"])

    # Target vector for the endpoint of interest.
    y = learnData[endpointofInterest[0]].copy().to_numpy()
    y = y.astype(int)

    # Start from the full frame so learnData1 is defined even when delCol is
    # empty (previously that left the name unbound).
    learnData1 = learnData

    # Drop all columns that are medically too closely related to the endpoint.
    mask_pattrn = '|'.join(delCol)
    if mask_pattrn:
        learnData1 = learnData[learnData.columns.drop(list(learnData.filter(regex=mask_pattrn)))]

    # Drop all strongly correlated columns (the endpoint columns are handled below).
    corrDropCol = list(set(corrDropCol) - set(matching))
    mask_pattrn = '|'.join(corrDropCol)
    if mask_pattrn:
        learnData1 = learnData1[learnData1.columns.drop(list(learnData1.filter(regex=mask_pattrn)))]

    # Separate the features from the endpoint columns.
    mask_pattrn = '|'.join(matching)
    if mask_pattrn:
        X = learnData1[learnData1.columns.drop(list(learnData1.filter(regex=mask_pattrn)))]
    else:
        X = learnData1

    # Encode the labels and split into train and test sets.
    y = pd.Series(preprocessing.LabelEncoder().fit_transform(np.array(y)))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

    if binary is True:
        # to be modified: gamma, n_jobs (threads)
        # normal weight: scale_pos_weight = (y != 0).sum() / (y == 0).sum()
        model = XGBClassifier(base_score=0.5, booster="gbtree", colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0.25, learning_rate=0.1, max_delta_step=0,
                              max_depth=6, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
                              objective="binary:logistic", random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=30, seed=None, subsample=1, verbosity=1)
        # eval_metric options: aucpr, auc, logloss
        model.fit(X_train, y_train, verbose=True, eval_metric="auc")
    else:
        model = XGBClassifier(base_score=0.5, booster="gbtree", colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
                              max_depth=6, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
                              objective="reg:tweedie", random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=(y == 0).sum() / (y != 0).sum(), seed=None, subsample=1, verbosity=1)
        # eval_metric options: rmse, tweedie-nloglik
        model.fit(X_train, y_train, verbose=True, eval_metric="tweedie-nloglik")

    # Compute and print the accuracy on the test set.
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)

    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    # Confusion-matrix plot (meaningful for the binary case).
    plot_confusion_matrix(model,
                          X_test,
                          y_test,
                          display_labels=["Have no stroke", "Have a stroke"])

    plt.savefig(picpath + '/confmatrix', format="png")

    # Render the first tree. The former `bst = model.booster()` line was
    # removed: its result was unused, and `booster` is the string passed to
    # the constructor above, so calling it fails.
    # Visual adjustments for split nodes and leaves.
    # NOTE(review): "#78cbe" has only five hex digits -- confirm the intended colour.
    node_params = {"shape": "box",
                   "style": "filled, rounded",
                   "fillcolor": "#78cbe"}
    leaf_params = {"shape": "box",
                   "style": "filled",
                   "fillcolor": "#e48038"}
    image = xgb.to_graphviz(model, num_trees=0, size="10,10",
                            condition_node_params=node_params,
                            leaf_node_params=leaf_params)

    # Set a different dpi (works only if format == 'png').
    image.graph_attr = {'dpi': '400'}
    # Save the rendered tree under picpath.
    image.render(picpath + '/modellbild1', format="png")

    return accuracy, model, corrDropCol