Example #1
    def __init__(self, model: XGBRegressor, feature_names: List[str]):
        # XGBRegressor.base_score defaults to 0.5.
        base_score = model.base_score
        if base_score is None:
            base_score = 0.5
        super().__init__(model.get_booster(), feature_names, base_score,
                         model.objective)
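The fallback above matters because, in recent xgboost releases, the scikit-learn wrapper leaves base_score as None unless it is passed explicitly; a minimal sketch of that behavior (an illustration, not part of the original wrapper):

from xgboost import XGBRegressor

model = XGBRegressor()                # base_score not passed
print(model.base_score)               # None -> the code above falls back to 0.5
model = XGBRegressor(base_score=0.3)
print(model.base_score)               # 0.3 -> used as-is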
Example #2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, plot_importance


def feature_importance():
    """Obtain the most important features using XGBoost."""
    dataset = pd.read_csv('../results/dataframe_final_project.csv',
                          index_col=0)
    dataset['Precio_Precio'] = np.log(dataset['Precio_Precio'])
    X = dataset.drop(columns=[
        'Precio_Precio', 'Precio_Open', 'Precio_Low', 'Precio_Close',
        'Precio_High', 'Fecha'
    ]).values
    y = dataset['Precio_Precio'].values

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    xgb_reg = XGBRegressor()

    xgb_reg_fit = xgb_reg.fit(X_train, y_train)
    y_hat = xgb_reg.predict(X_test)

    print('score:', r2_score(y_test, y_hat))
    print(xgb_reg.get_booster().get_score(importance_type="gain"))
    plot_importance(
        xgb_reg,
        importance_type='gain',
        max_num_features=20,
        height=0.8,
    )
    plt.savefig('../figs/feature_importance.png')
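Booster.get_score supports several importance types beyond 'gain'; a short sketch (not part of the original function) comparing them on the booster fitted above:

booster = xgb_reg.get_booster()
for imp_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
    # 'weight' counts how often a feature is split on; 'gain' averages the
    # loss reduction achieved by those splits
    print(imp_type, booster.get_score(importance_type=imp_type))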
Example #3
def PredictMetadata(ASV_table, metadata_variables, train_size, test_size,
                    seed):
    # `metadata` and `MergeTable` are assumed to be defined in the enclosing
    # scope of the original project.
    X_ASV = ASV_table
    X_ASV.columns = [''] * len(X_ASV.columns)
    X_ASV = X_ASV.to_numpy()
    metadata_list = []
    for i in metadata_variables:
        # y_CDOM = metadata.loc[:, i][:, np.newaxis]

        # split data into train and test sets
        y_meta = metadata.loc[:, i]  # requires a 1-d array
        X_train, X_test, y_train, y_test = train_test_split(
            X_ASV,
            y_meta,
            train_size=train_size,
            test_size=test_size,
            random_state=seed)

        # fit model on training data
        model = XGBRegressor(objective='reg:squarederror')
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='rmse',
                  early_stopping_rounds=100,
                  verbose=False)

        # get the best iteration by test RMSE
        XGboost_best_model_index = model.best_iteration
        XGboost_best_iteration = model.get_booster().best_ntree_limit

        # make predictions for the full dataset
        y_pred = model.predict(X_ASV, ntree_limit=XGboost_best_iteration)
        metadata_list.append(y_pred[:, np.newaxis])
    return MergeTable(metadata_list, metadata_variables)
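Note that ntree_limit and best_ntree_limit were deprecated in later XGBoost releases; a sketch of the equivalent prediction with iteration_range (assuming XGBoost >= 1.4, reusing the names from the loop above):

best_it = model.best_iteration  # set by early stopping
y_pred = model.predict(X_ASV, iteration_range=(0, best_it + 1))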
Example #4
class Prediction_xgb:
    def __init__(self, model_file):
        self.xgb_model_path = model_file
        self.param = {
            'learning_rate': 0.1,
            'max_depth': 5,
            'gamma': 0,
            'min_child_weight': 3,
            'subsample': 0.8,
            # 'colsample': 0.75,
            'colsample_bytree': 0.8,
            'scale_pos_weight': 1,
            'verbosity': 3,
            'objective': 'reg:squarederror',
            'eval_metric': 'mae',
        }
        self.model = XGBRegressor(
            slice=1,
            learning_rate=0.1,
            n_estimators=96,  # number of trees
            max_depth=5,  # maximum tree depth
            min_child_weight=3,  # minimum leaf weight
            gamma=0.,  # penalty coefficient on the number of leaf nodes
            subsample=0.8,  # build each tree on a random 80% of the samples
            # colsample = 0.75,  # randomly select 80% of the features per tree
            colsample_bytree=0.6,
            verbosity=3,
            objective='reg:squarederror',  # loss function
            # eval_metric='mae',
            # scale_pos_weight=1,  # compensates for class imbalance
            # random_state = 27,  # random seed
        )

    def cut_data(self, data_x, data_y):
        res_x = []
        res_y = []
        for i in range(data_x.shape[0]):
            if any(data_x[i][4:11]):
                res_x.append(data_x[i])
                res_y.append(data_y[i])
        return np.array(res_x), np.array(res_y)

    def train_XGBClassifier(self):
        # assumes self.train_data / self.train_y were set elsewhere
        print('---xgb start---')
        self.model.fit(self.train_data,
                       self.train_y,
                       eval_set=[(self.train_data, self.train_y)])

    def train_xgboost(self, train_data, train_y):
        print('---xgb start---')
        train_data, train_y = self.cut_data(train_data, train_y)
        train_xdf = pd.DataFrame(train_data[:-50])
        train_ydf = pd.DataFrame(train_y[:-50])
        test_xdf = pd.DataFrame(train_data[-50:])
        test_ydf = pd.DataFrame(train_y[-50:])

        dtrain = xgb.DMatrix(train_xdf, label=train_ydf)
        dtest = xgb.DMatrix(test_xdf, label=test_ydf)

        def modelfit(alg,
                     train_xdf,
                     train_ydf,
                     useTrainCV=True,
                     cv_folds=5,
                     early_stopping_rounds=20):
            if useTrainCV:
                xgb_param = alg.get_xgb_params()
                xgtrain = xgb.DMatrix(train_xdf, label=train_ydf)
                cvresult = xgb.cv(
                    xgb_param,
                    xgtrain,
                    num_boost_round=alg.get_params()['n_estimators'],
                    nfold=cv_folds,
                    metrics='mae',
                    early_stopping_rounds=early_stopping_rounds)

                alg.set_params(n_estimators=cvresult.shape[0])
                print(cvresult.shape[0])

            # Fit the algorithm on the data
            alg.fit(train_xdf, train_ydf, eval_metric='mae')
            # bst = xgb.train(alg.get_xgb_params(), xgtrain, num_boost_round=cvresult.shape[0])

            # Predict training set:
            dtrain_predictions = alg.predict(test_xdf)
            # dtrain_predictions = bst.predict(dtest)

            # Print model report:
            print("\nModel Report")
            test_y = dtest.get_label()
            print('error---',
                  np.mean(abs(dtrain_predictions - test_y) / test_y))
            print("Accuracy : %.4g" %
                  mean_absolute_error(test_y, dtrain_predictions))
            # print("Accuracy : %.4g" % np.mean(abs(dtrain_predictions - np.array(train_ydf[0])) / np.array(train_ydf[0])))

            feat_imp = pd.Series(
                alg.get_booster().get_fscore()).sort_values(ascending=False)
            feat_imp.plot(kind='bar', title='Feature Importances')
            plt.ylabel('Feature Importance Score')
            plt.show()

        # modelfit(self.model, train_xdf, train_ydf)

        param_test1 = {
            # 'max_depth': range(4, 7, 1),
            # 'min_child_weight': range(2, 4, 1)
            # 'gamma': [i / 10.0 for i in range(0, 5)]
            'subsample': [i / 10.0 for i in range(6, 10)],
            'colsample_bytree': [i / 10.0 for i in range(6, 10)]
        }
        gsearch1 = GridSearchCV(
            estimator=XGBRegressor(
                slice=1,
                learning_rate=0.01,
                n_estimators=96,  # number of trees
                max_depth=5,  # maximum tree depth
                min_child_weight=2,  # minimum leaf weight
                gamma=0.,  # penalty coefficient on the number of leaf nodes
                subsample=0.8,  # build each tree on a random 80% of the samples
                # colsample = 0.75,  # randomly select 80% of the features per tree
                colsample_bytree=0.6,
                verbosity=3,
                objective='reg:squarederror',  # loss function
                eval_metric='mae',
                scale_pos_weight=1,  # compensates for class imbalance
                # random_state = 27,  # random seed
            ),
            param_grid=param_test1,
            scoring='neg_mean_absolute_error',
            n_jobs=4,
            iid=False,  # note: the iid argument was removed in scikit-learn 0.24
            cv=5)
        # gsearch1.fit(test_xdf, test_ydf)
        # print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

        bst = xgb.train(self.param, dtrain, num_boost_round=200)
        bst.save_model(self.xgb_model_path)
        test_preds = bst.predict(dtest)
        test_y = dtest.get_label()
        print('error---', np.mean(abs(test_preds - test_y) / test_y))
        print('---')
        self.model.fit(train_xdf, train_ydf, eval_metric='mae')
        # bst = xgb.train(alg.get_xgb_params(), xgtrain, num_boost_round=cvresult.shape[0])

        # Predict training set:
        dtrain_predictions = self.model.predict(test_xdf)
        # dtrain_predictions = bst.predict(dtest)

        # Print model report:
        print("\nModel Report")
        test_y = dtest.get_label()
        print('error---', np.mean(abs(dtrain_predictions - test_y) / test_y))
        # print("Accuracy : %.4g" % mean_absolute_error(test_y, dtrain_predictions))
        feat_imp = pd.Series(
            self.model.get_booster().get_fscore()).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
        plt.show()

    def pred_xgboost(self, predict_data):
        predict_df = pd.DataFrame(predict_data)
        dpred = xgb.DMatrix(predict_df)
        bst = xgb.Booster(model_file=self.xgb_model_path)
        preds = bst.predict(dpred)
        print('preds---', preds)
        print('---')
        return preds
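The iid argument used above was removed in scikit-learn 0.24; a minimal sketch of the same subsample/colsample search against a current scikit-learn (hypothetical X, y arrays):

from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

gsearch = GridSearchCV(
    estimator=XGBRegressor(objective='reg:squarederror', n_estimators=96),
    param_grid={'subsample': [0.6, 0.7, 0.8, 0.9],
                'colsample_bytree': [0.6, 0.7, 0.8, 0.9]},
    scoring='neg_mean_absolute_error',
    n_jobs=4,
    cv=5)  # no iid parameter in scikit-learn >= 0.24
# gsearch.fit(X, y)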
Example #5
def a():
    # deliberately infinite recursion; the call below is commented out
    print("error")
    a()


# a()
print(set([(199, 198), (198, 178)]))
list_eg = [{198} & set(each) for each in [(199, 198), (198, 178)]
           if {198} & set(each)]
print(bool(list_eg))

import pandas as pd
from sklearn.datasets import load_boston, load_iris
from xgboost import XGBClassifier, XGBRegressor
boston = load_boston()
train = pd.DataFrame(boston['data'])
label = pd.Series(boston['target'], name='label')
full = pd.concat((train, label), axis=1)
model = XGBRegressor(n_estimators=3, max_depth=1, reg_lambda=0, reg_alpha=0)
model.fit(train, label)
model.predict(train)  # predict() requires the data to score
model.get_booster().trees_to_dataframe()
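trees_to_dataframe flattens the fitted ensemble into one row per node; in recent XGBoost versions the frame typically carries columns such as Tree, Node, ID, Feature, Split, Yes, No, Missing, Gain and Cover, with leaf rows reporting the leaf value under Gain:

df = model.get_booster().trees_to_dataframe()
print(df.columns.tolist())
print(df[df['Feature'] == 'Leaf'])  # leaf rows: leaf value appears in 'Gain'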
Example #6
class MetaRecommender(BaseRecommender):
    """Penn AI meta recommender.
    Recommends machine learning algorithms and parameters as follows:
    maintains an internal model of the form f_d(ML,P,MF) = E
    where 
    d is the dataset
    ML is the machine learning
    P is the ML parameters
    MF is the metafeatures associated with d
        
    to produce recommendations for dataset d, it does the following:
    E_a = f_d(ML_a,P_a,MF_d) prediction of performance of a on d
    Sort E_a for several a (sampled from ML+P options)
    recommend top E_a 

    Parameters
    ----------
    ml_type: str, 'classifier' or 'regressor'
        Recommending classifiers or regressors. Used to determine ML options.
    
    metric: str (default: accuracy for classifiers, mse for regressors)
        The metric by which to assess performance on the datasets.
    
    ml_p: Dataframe
        Contains all the machine learning / algorithm combinations available for recommendation.

    sample_size: int
        Number of ML/P combos to evaluate when making a recommendation. 

    """
    def __init__(self,
                 ml_type='classifier',
                 metric=None,
                 ml_p=None,
                 sample_size=100):
        """Initialize recommendation system."""
        if ml_type not in ['classifier', 'regressor']:
            raise ValueError('ml_type must be "classifier" or "regressor"')

        self.ml_type = ml_type

        if metric is None:
            self.metric = 'bal_accuracy' if self.ml_type == 'classifier' else 'mse'
        else:
            self.metric = metric

        # training data
        self.training_features = None
        # store metafeatures of datasets that have been seen
        # self.dataset_metafeatures = None
        # maintain a set of dataset-algorithm-parameter combinations that have already been
        # evaluated
        self.trained_dataset_models = set()
        # TODO: add option for ML estimator
        self.first_update = True

        # load ML Parameter combinations and fit an encoding to them that can be used for
        # learning a model : score = f(ml,p,dataset,metafeatures)

        self.ml_p = ml_p
        if self.ml_p is not None:
            self.ml_p = self.params_to_features(self.ml_p, init=True)
            # drop duplicates, just in case any are present
            self.ml_p = self.ml_p.drop_duplicates()

        # print('ml_p:',self.ml_p)
        self.cat_params = [
            'criterion', 'kernel', 'loss', 'max_depth', 'max_features',
            'min_weight_fraction_leaf', 'n_estimators', 'n_neighbors',
            'weights'
        ]

        self.sample_size = min(sample_size, len(self.ml_p))
        # Encoding the variables
        self.LE = defaultdict(LabelEncoder)
        # self.OHE = OneHotEncoder(sparse=False)
        # pdb.set_trace()
        self.ml_p = self.ml_p.apply(lambda x: self.LE[x.name].fit_transform(x))
        # print('ml_p after LE:',self.ml_p)
        # self.X_ml_p = self.OHE.fit_transform(self.ml_p.values)
        self.X_ml_p = self.ml_p.values
        # self.ml_p = self.ml_p.apply(lambda x: self.OHE[x.name].fit_transform(x))
        # print('X after OHE:',self.X_ml_p.shape)
        # print('self.ml_p:',self.ml_p)
        print('loaded {nalg} ml/parameter combinations with '
              '{nparams} parameters'.format(nalg=self.X_ml_p.shape[0],
                                            nparams=self.X_ml_p.shape[1] - 1))

        # our ML
        self.ml = XGBRegressor(max_depth=6, n_estimators=500)

    def params_to_features(self, df, init=False):
        """convert parameter dictionaries to dataframe columns"""
        # pdb.set_trace()
        try:
            param = df['parameters'].apply(eval)
            param = pd.DataFrame.from_records(list(param))
            param = param.applymap(str)
            # get rid of trailing .0 added to integer vals
            param = param.applymap(lambda x: x[:-2] if x[-2:] == '.0' else x)
            param = param.reset_index(drop=True)
            # print('param:',param)
            df = df.drop('parameters', axis=1).reset_index(drop=True)
            df = pd.concat([df, param], axis=1)

            if not init:  # need to add additional parameter combos for other ml
                df_tmp = pd.DataFrame(columns=self.ml_p.columns)
                df_tmp = df_tmp.append(df)
                df_tmp.fillna('nan', inplace=True)
                df = df_tmp
            # sort columns by name
            df.sort_index(axis=1, inplace=True)
            # print('df:',df)
        except Exception as e:
            print(e)
            pdb.set_trace()
        return df

    def features_to_params(self, df):
        """convert dataframe columns to parameter dictionaries"""
        param = df.to_dict('index')
        plist = []
        for k, v in param.items():
            tmp = {k1: v1 for k1, v1 in v.items() if v1 != 'nan'}
            for k1, v1 in tmp.items():
                try:
                    tmp[k1] = int(v1)
                except ValueError:
                    try:
                        tmp[k1] = float(v1)
                    except ValueError:
                        pass
            plist.append(str(tmp))

        return plist

    def update(self, results_data, results_mf):
        """Update ML / Parameter recommendations based on overall performance in results_data.

        Updates self.scores

        Parameters
        ----------
        results_data: DataFrame with columns corresponding to:
                'dataset'
                'algorithm'
                'parameters'
                self.metric
        """
        # keep track of unique dataset / parameter / classifier combos in results_data
        dap = (results_data['dataset'].values + '|' +
               results_data['algorithm'].values + '|' +
               results_data['parameters'].values)
        d_ml_p = np.unique(dap)
        self.trained_dataset_models.update(d_ml_p)
        # transform data for learning a model from it
        self.setup_training_data(results_data, results_mf)

        # update internal model
        self.update_model()

    def transform_ml_p(self, df_ml_p):
        """Encodes categorical labels and transforms them using a one hot encoding."""
        df_ml_p = self.params_to_features(df_ml_p)
        # df_tmp = pd.DataFrame(columns=self.ml_p.columns)
        # df_tmp = df_tmp.append(df_ml_p)
        # df_tmp.fillna('nan', inplace=True)
        df_ml_p = df_ml_p.apply(lambda x: self.LE[x.name].transform(x))
        # df_ml_p = df_ml_p.apply(lambda x: self.LE[x.name].transform(x))

        # print('df_ml_p after LE transform:',df_ml_p)
        # X_ml_p = self.OHE.transform(df_ml_p.values)
        X_ml_p = df_ml_p.values
        # X_ml_p = self.OHE.transform(df_ml_p.values)
        # print('df_ml_p after OHE (',X_ml_p.shape,':\n',X_ml_p)
        return X_ml_p

    def setup_training_data(self, results_data, results_mf):
        """Transforms metafeatures and results data into learnable format."""
        # join df_mf to results_data to get mf rows for each result
        df_mf = pd.merge(results_data, results_mf, on='dataset', how='inner')
        df_mf = df_mf.loc[:, df_mf.columns.isin(results_mf.columns)]
        if 'dataset' in df_mf.columns:
            df_mf = df_mf.drop('dataset', axis=1)
        # print('df_mf:',df_mf)
        # print('dataset_metafeatures:',dataset_metafeatures)
        # transform algorithms and parameters to one hot encoding
        df_ml_p = results_data.loc[:, results_data.columns.isin(
            ['algorithm', 'parameters'])]
        X_ml_p = self.transform_ml_p(df_ml_p)
        print('df_ml_p shape:', df_ml_p.shape)
        # join algorithm/parameters with dataset metafeatures
        print('df_mf shape:', df_mf.shape)
        self.training_features = np.hstack((X_ml_p, df_mf.values))
        # transform data using label encoder and one hot encoder
        self.training_y = results_data[self.metric].values
        assert (len(self.training_y) == len(self.training_features))

    def recommend(self, dataset_id=None, n_recs=1, dataset_mf=None):
        """Return a model and parameter values expected to do best on dataset.

        Parameters
        ----------
        dataset_id: string
            ID of the dataset for which the recommender is generating recommendations.
        n_recs: int (default: 1), optional
            Return a list of length n_recs in order of estimators and parameters expected to do best.
        """
        # TODO: predict scores over many variations of ML+P and pick the best
        # return ML+P for best average y
        try:
            ml_rec, p_rec, rec_score = self.best_model_prediction(
                dataset_id, n_recs, dataset_mf)

            for (m, p, r) in zip(ml_rec, p_rec, rec_score):
                print('ml_rec:', m, 'p_rec', p, 'rec_score', r)
            ml_rec = ml_rec[:n_recs]
            p_rec = p_rec[:n_recs]
            rec_score = rec_score[:n_recs]
            # # if a dataset is specified, do not make recommendations for
            # # algorithm-parameter combos that have already been run
            # if dataset_id is not None:
            #     rec = [r for r in rec if dataset_id + '|' + r not in
            #            self.trained_dataset_models]

            # ml_rec = [r.split('|')[0] for r in rec]
            # p_rec = [r.split('|')[1] for r in rec]
            # rec_score = [self.scores[r] for r in rec]
        except Exception as e:
            print('error running self.best_model_prediction for', dataset_id)
            # print('ml_rec:', ml_rec)
            # print('p_rec', p_rec)
            # print('rec_score',rec_score)
            raise e

        # update the recommender's memory with the new algorithm-parameter combos that it recommended
        # ml_rec = ml_rec[:n_recs]
        # p_rec = p_rec[:n_recs]
        # rec_score = rec_score[:n_recs]

        # if dataset_id is not None:
        #     self.trained_dataset_models.update(
        #                                 ['|'.join([dataset_id, ml, p])
        #                                 for ml, p in zip(ml_rec, p_rec)])

        return ml_rec, p_rec, rec_score

    def update_model(self):
        """Trains model on datasets and metafeatures."""
        print('updating model')
        current_model = (None if self.ml._Booster is None
                         else self.ml.get_booster())
        self.ml.fit(self.training_features,
                    self.training_y,
                    xgb_model=current_model)
        print('model updated')

    def best_model_prediction(self, dataset_id, n_recs=1, df_mf=None):
        """Predict scores over many variations of ML+P and pick the best"""
        # get dataset metafeatures
        # df_mf = self.get_metafeatures(dataset_id)
        mf = df_mf.drop('dataset', axis=1).values.flatten()
        # setup input data by sampling ml+p combinations from all possible combos
        # choices = np.random.choice(len(self.X_ml_p),size=self.sample_size,replace=False)
        X_ml_p = self.X_ml_p[np.random.choice(len(self.X_ml_p),
                                              size=self.sample_size,
                                              replace=False)]
        print('generating predictions for:')
        df_tmp = pd.DataFrame(X_ml_p, columns=self.ml_p.columns)
        print(df_tmp.apply(lambda x: self.LE[x.name].inverse_transform(x)))
        # make prediction data consisting of ml + p combinations plus metafeatures
        predict_features = np.array([np.hstack((ml_p, mf)) for ml_p in X_ml_p])

        # print('predict_features:',predict_features)
        # generate predicted scores
        predict_scores = self.ml.predict(predict_features)
        # print('predict_scores:',predict_scores)

        # grab best scores
        predict_idx = np.argsort(predict_scores)[::-1][:n_recs]
        # print('predict_idx:',predict_idx)
        # indices in X_ml_p that match best prediction scores
        predict_ml_p = X_ml_p[predict_idx]
        pred_ml_p_df = df_tmp.loc[predict_idx, :]
        # print('df_tmp[predict_idx]:',pred_ml_p_df)
        # invert the one hot encoding
        # fi = self.OHE.feature_indices_
        # predict_ml_p_le = [x[fi[i]:fi[i+1]].dot(np.arange(nv)) for i,nv in
        #                    enumerate(self.OHE.n_values_)
        #                    for x in predict_ml_p]
        predict_ml_p_le = predict_ml_p

        # df_pr_ml_p = pd.DataFrame(
        #         data=np.array(predict_ml_p_le).reshape(-1,len(self.ml_p.columns)),
        #         columns = self.ml_p.columns, dtype=np.int64)
        # # invert the label encoding
        df_pr_ml_p = df_tmp.loc[predict_idx, :]
        df_pr_ml_p = df_pr_ml_p.apply(
            lambda x: self.LE[x.name].inverse_transform(x))
        # predict_ml_p = df_pr_ml_p.values

        # grab recommendations
        ml_recs = list(df_pr_ml_p['algorithm'].values)
        p_recs = self.features_to_params(df_pr_ml_p.drop('algorithm', axis=1))
        scores = predict_scores[predict_idx]
        # pdb.set_trace()

        return ml_recs, p_recs, scores
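update_model above warm-starts training by passing the previous Booster back in as xgb_model; a self-contained sketch of that continued-training pattern on synthetic data (assumes the scikit-learn wrapper's fit accepts xgb_model, as used above):

import numpy as np
from xgboost import XGBRegressor

X1, y1 = np.random.rand(100, 5), np.random.rand(100)
X2, y2 = np.random.rand(100, 5), np.random.rand(100)

ml = XGBRegressor(max_depth=6, n_estimators=50)
ml.fit(X1, y1)                      # initial fit
booster = ml.get_booster()
ml.fit(X2, y2, xgb_model=booster)   # continue training from the fitted booster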
Example #7
    # df_train = df_train[(z < 10).all(axis=1)]

    # print(df_train.shape)

    label = df_train['NU_NOTA_MT']

    df_train.drop(['NU_NOTA_MT'], axis=1, inplace=True)

    label_y = df_test.pop('NU_NOTA_MT')

    model = XGBRegressor()

    model.fit(df_train, label)

    booster = model.get_booster()

    model_explainer = explain_weights(model, top=None)

    exp_df = formatters.format_as_dataframe(model_explainer)

    exp_df.to_csv("model_explainer.csv", index=False)

    for i in range(10):
        # , feature_names=booster.feature_names)
        individual_explainer = explain_prediction(model, df_test.iloc[i])

        df_i = formatters.format_as_dataframe(individual_explainer)

        name = df_answer['NU_INSCRICAO'].iloc[i]
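The loop above computes df_i and name but the snippet ends before using them; a plausible continuation (hypothetical file naming, not from the original project) would write one explanation file per row:

        df_i.to_csv('explainer_{}.csv'.format(name), index=False)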
Example #8
class Modelo:
    '''
    Class for lightly preprocessing the data to be used, mainly through
    scaling methods (specifically standardization and normalization), and
    for building classification and regression models based on algorithms
    from the Scikit-learn, Tensorflow, XGBoost and LightGBM libraries. It
    also evaluates those models through a series of metrics from
    Scikit-learn and, for some specific algorithms, supports visualization
    of either the training process or the importance of the features used.
    Specifically, the algorithms used in this class are:

        Classification algorithms:
        -----------------------------------------------------------------------
        (All of them support confusion-matrix visualization.)

        -> Logistic Regression: Sklearn (feature visualization)
        -> SVC: Sklearn (no feature visualization)
        -> K-Nearest Neighbors: Sklearn (no feature visualization)
        -> Random Forests: Sklearn (feature visualization)
        -> Stacking: Sklearn (no feature visualization)
        -> Neural Networks: Tensorflow (loss-function visualization)
        -> XGB Classifier: XGBoost (feature visualization)
        -> LightGBM Classifier: LightGBM (feature visualization)
        -----------------------------------------------------------------------
        Regression algorithms:
        -----------------------------------------------------------------------

        -> Linear Regression: Sklearn (feature visualization)
        -> K-Nearest Neighbors: Sklearn (no feature visualization)
        -> Gradient Boosting Regressor: Sklearn (feature visualization)
        -> Random Forests: Sklearn (feature visualization)
        -> Neural Networks: Tensorflow (loss-function visualization)
        -> XGB Regressor: XGBoost (feature visualization)
        -> LightGBM Regressor: LightGBM (feature visualization)
        -----------------------------------------------------------------------

    The following metrics are available:

        For evaluating classification models:
        -----------------------------------------------------------------------

        -> Confusion matrix
        -> Classification report (which includes the main metrics per class)
        -> Classification balance: a score from 0 to 1, defined as:

                       specificity + sensitivity
            balance =  --------------------------
                                   2
        -----------------------------------------------------------------------
        For evaluating regression models:
        -----------------------------------------------------------------------

        -> Mean absolute error
        -> Mean squared error
        -> Explained variance: a score from 0 to 1, defined as:

                       Var{y - y_pred}
            EVS = 1 -  ---------------
                           Var{y}
        -----------------------------------------------------------------------

            Parameters:
                df (pandas.DataFrame): Dataframe with the data.
                tipo (str): Whether to build a classifier or a regressor.

            Return:
                Modelo class instance.
    '''
    def __init__(self, df, tipo='Clasificador'):

        self.__data = df
        self.tipo = tipo

        if tipo == 'Clasificador':

            self.y = self.__data['Ganador']
            self.X = self.__data.drop(['Ganador', 'Diferencia'], axis=1)

        else:

            self.y = self.__data['Diferencia']
            self.X = self.__data.drop(['Ganador', 'Diferencia'], axis=1)

        self.__columns = self.X.columns

    ##### Data preprocessing #####

    def estandarizar(self):
        '''
        Standardize the dataset: rescale the data to zero mean and unit
        standard deviation.
        '''

        self.Scaler = StandardScaler()
        self.X = self.Scaler.fit_transform(self.X)
        self.X = pd.DataFrame(data=self.X, columns=self.__columns)

    def normalizar(self):
        '''
        Normalize the dataset: rescale the data so that it lies between zero
        and one.
        '''

        self.norma = MinMaxScaler()
        self.X = self.norma.fit_transform(self.X)
        self.X = pd.DataFrame(data=self.X, columns=self.__columns)

    def Split(self, size=0.2):
        '''
        Split the data into training and test sets.
        '''

        self.X_train, self.X_test, self.y_train, self.y_test = \
            train_test_split(self.X, self.y, test_size = size, random_state = 42)

    def retorno(self):
        '''
        Invert the scaling of the test data.
        '''

        # hasattr checks instead of try/except: accessing a scaler that was
        # never created raises AttributeError, not NameError, so the original
        # except clauses could never match
        if hasattr(self, 'Scaler'):
            self.X_test = self.Scaler.inverse_transform(self.X_test)
            self.X_test = pd.DataFrame(data=self.X_test,
                                       columns=self.__columns)
        elif hasattr(self, 'norma'):
            self.X_test = self.norma.inverse_transform(self.X_test)
            self.X_test = pd.DataFrame(data=self.X_test,
                                       columns=self.__columns)
        else:
            print('No rescaling methods have been applied.')

    ##### Classification models #####

    def NN_Clas_model(self,
                      neuronas=[512, 512, 256, 256, 128],
                      dropouts=[0.4, 0.4, 0.3, 0.3],
                      epochs=150,
                      split=0.2,
                      size=11640):
        '''
        Build a classification model based on a neural network, using
        tensorflow's keras module. Based on earlier trial-and-error results,
        the number of hidden layers is fixed at 5.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true class labels.
                neuronas (list of int): Number of neurons per layer, except
                    the last one, which has only one.
                dropouts (list of float): Drop-out rate per layer.
                epochs (int): Number of passes over the dataset.
                split (float): Fraction of the training set used for
                    validation.
                size (int): Batch size.

            Return:
                self.model (model): Neural network model.
                self.history (model): Trained model history.
        '''

        self.model_type = 'NN'

        tf.keras.backend.clear_session()

        self.model = models.Sequential(
            [
                layers.Dense(units=neuronas[0],
                             input_dim=self.X_train.shape[1],
                             activation='relu'),
                layers.LeakyReLU(),
                layers.BatchNormalization(),
                layers.Dropout(dropouts[0]),
                layers.Dense(units=neuronas[1], activation='relu'),
                layers.LeakyReLU(),
                layers.BatchNormalization(),
                layers.Dropout(dropouts[1]),
                layers.Dense(units=neuronas[2], activation='relu'),
                layers.LeakyReLU(),
                layers.BatchNormalization(),
                layers.Dropout(dropouts[2]),
                layers.Dense(units=neuronas[3], activation='relu'),
                layers.LeakyReLU(),
                layers.BatchNormalization(),
                layers.Dropout(dropouts[3]),
                layers.Dense(units=neuronas[4], activation='relu'),
                layers.LeakyReLU(),
                layers.Dense(units=1, activation="sigmoid"),
            ],
            name="Modelo de Clasificación con Redes Neuronales",
        )

        self.model.compile(optimizer=optimizers.Adam(),
                           loss=losses.binary_crossentropy,
                           metrics=[metrics.binary_accuracy])

        self.history = self.model.fit(self.X_train,
                                      self.y_train.tolist(),
                                      epochs=epochs,
                                      batch_size=size,
                                      validation_split=split,
                                      verbose=1)

    def RFC(self, max_depth=50, n_estimators=150):
        '''
        Build a classification model based on the RandomForestClassifier
        algorithm.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true class labels.
                max_depth (int): Maximum tree depth.
                n_estimators (int): Number of trees.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'rfc'
        self.model = RandomForestClassifier(max_depth=max_depth,
                                            n_estimators=n_estimators)
        self.model.fit(self.X_train, self.y_train)

    def SVClass(self, C=16):
        '''
        Build a model using support vector machine classification algorithms.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true class labels.
                C (int): Regularization parameter.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'svc'
        self.model = SVC(C=C)
        self.model.fit(self.X_train, self.y_train)

    def LogReg(self, C=5):
        '''
        Build a model using sklearn's logistic regression algorithm.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true class labels.
                C (int): Regularization parameter.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'logreg'
        self.model = LogisticRegression(C=C)
        self.model.fit(self.X_train, self.y_train)

    def KNN(self, n_neighbors=5):
        '''
        Build a model using sklearn's K-nearest-neighbors algorithm.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true class labels.
                n_neighbors (int): Number of neighbors.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'knn'
        self.model = KNeighborsClassifier(n_neighbors=n_neighbors)
        self.model.fit(self.X_train, self.y_train)

    def StackModel(self):
        '''
        Build a model from the SVClassifier, RandomForestClassifier and
        logistic regression algorithms, using sklearn's StackingClassifier.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true class labels.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'stack'
        estimators = [('svc', SVC()),
                      ('rf',
                       RandomForestClassifier(n_estimators=100,
                                              max_depth=50))]
        self.model = StackingClassifier(estimators=estimators,
                                        final_estimator=LogisticRegression())
        self.model.fit(self.X_train, self.y_train)

    ##### Regression models #####

    def NN_Reg_model(self,
                     neuronas=[1024, 512, 512, 256, 256, 128],
                     epochs=250,
                     dropouts=[0.4, 0.3, 0.3, 0.2, 0.2],
                     split=0.2,
                     size=11640,
                     lr=0.02,
                     decay=6e-4):
        '''
        Build a regression model based on a neural network, using
        tensorflow's keras module. Based on earlier trial-and-error results,
        the number of hidden layers is fixed at 6.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true target values.
                neuronas (list of int): Number of neurons per layer, except
                    the last one, which has only one.
                dropouts (list of float): Drop-out rate per layer.
                epochs (int): Number of passes over the dataset.
                split (float): Fraction of the training set used for
                    validation.
                size (int): Batch size.
                lr (float): Learning rate.
                decay (float): Learning-rate decay.

            Return:
                self.model (model): Neural network model.
                self.history (model): Trained model history.
        '''

        self.model_type = 'NN'

        tf.keras.backend.clear_session()

        self.model = models.Sequential(
            [
                layers.Dense(units=neuronas[0],
                             input_dim=self.X_train.shape[1]),
                layers.LeakyReLU(),
                layers.BatchNormalization(),
                layers.Dropout(dropouts[0]),
                layers.Dense(units=neuronas[1]),
                layers.LeakyReLU(),
                layers.BatchNormalization(),
                layers.Dropout(dropouts[1]),
                layers.Dense(units=neuronas[2]),
                layers.LeakyReLU(),
                layers.BatchNormalization(),
                layers.Dropout(dropouts[2]),
                layers.Dense(units=neuronas[3]),
                layers.LeakyReLU(),
                layers.BatchNormalization(),
                layers.Dropout(dropouts[3]),
                layers.Dense(units=neuronas[4]),
                layers.LeakyReLU(),
                layers.BatchNormalization(),
                layers.Dropout(dropouts[4]),
                layers.Dense(units=neuronas[5]),
                layers.LeakyReLU(),
                layers.Dense(units=1, activation="linear"),
            ],
            name="Modelo de Regresión con Redes Neuronales",
        )

        self.model.compile(optimizer=optimizers.Adam(lr=lr, decay=decay),
                           loss=losses.mae,
                           metrics=[metrics.mse])

        self.history = self.model.fit(self.X_train,
                                      self.y_train.tolist(),
                                      epochs=epochs,
                                      batch_size=size,
                                      validation_split=split,
                                      verbose=1)

    def LinReg(self):
        '''
        Build a regression model based on linear regression.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true target values.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'linreg'
        self.model = LinearRegression()
        self.model.fit(self.X_train, self.y_train)

    def GradBoost(self, n_estimators=200, max_depth=10, learning_rate=0.3):
        '''
        Build a regression model based on Gradient Boosting.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true target values.
                n_estimators (int): Number of estimators.
                max_depth (int): Maximum tree depth.
                learning_rate (float): Learning rate.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'gradboost'

        self.model = GradientBoostingRegressor(learning_rate=learning_rate,
                                               max_depth=max_depth,
                                               n_estimators=n_estimators)
        self.model.fit(self.X_train, self.y_train)

    def RFR(self, n_estimators=150, max_depth=20):
        '''
        Build a regression model based on Random Forests.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true target values.
                n_estimators (int): Number of trees.
                max_depth (int): Maximum tree depth.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'rfr'

        self.model = RandomForestRegressor(max_depth=max_depth,
                                           n_estimators=n_estimators)
        self.model.fit(self.X_train, self.y_train)

    def KNNR(self, n_neigbors=12):
        '''
        Build a regression model based on K-nearest neighbors.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true target values.
                n_neigbors (int): Number of neighbors to consider.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'knn'

        self.model = KNeighborsRegressor(n_neighbors=n_neigbors)
        self.model.fit(self.X_train, self.y_train)

    ##### LightGBM #####

    def LGBModel(self,
                 learning_rate=0.5,
                 max_depth=15,
                 n_estimators=100,
                 epoch=100):
        '''
        Build a model based on LightGBM algorithms, for both classification
        and regression.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true target values.
                learning_rate (float): Learning rate.
                max_depth (int): Maximum tree depth.
                n_estimators (int): Number of estimators.
                epoch (int): Number of boosting rounds.

            Return:
                self.model (model): lgb model.
        '''

        self.model_type = 'lgb'
        d_train = lgb.Dataset(self.X_train, label=self.y_train)

        params = {}
        params['learning_rate'] = learning_rate
        params['boosting_type'] = 'gbdt'
        params['max_depth'] = max_depth
        params['use_missing'] = False

        if self.tipo == 'Clasificador':
            params['objective'] = 'binary'
            params['metric'] = 'binary_logloss'

        else:
            params['objective'] = 'regression'
            params['n_estimators'] = n_estimators

        self.model = lgb.train(params, d_train, epoch)

    ##### XGBoost #####

    def XGBmodel(self, learning_rate=0.5, max_depth=10, n_estimators=100):
        '''
        Build prediction models based on the XGBoost library.

            Parameters:
                self.X_train (array): Training set array.
                self.y_train (array): Array of true target values.
                learning_rate (float): Learning rate.
                max_depth (int): Maximum tree depth.
                n_estimators (int): Number of trees.

            Return:
                self.model (model): Trained model.
        '''

        self.model_type = 'XGB'

        if self.tipo == 'Clasificador':
            self.model = XGBClassifier(learning_rate=learning_rate,
                                       max_depth=max_depth)
            self.model.fit(self.X_train, self.y_train)

        else:
            self.model = XGBRegressor(learning_rate=learning_rate,
                                      max_depth=max_depth,
                                      n_estimators=n_estimators)
            self.model.fit(self.X_train, self.y_train)

    ##### Prediction #####

    def pred_class(self):
        '''
        Predict classes with the classification models, rounding to the real
        class values where needed.

            Parameters:
                self.X_test (array): Test set.
                self.model (model): Trained model.
                self.model_type (str): Type of model used.

            Return:
                self.y_pred (array): Model predictions.
        '''

        if self.model_type == 'NN':
            self.y_pred = self.model.predict_classes(self.X_test).reshape(-1)

        elif self.model_type == 'lgb':
            self.y_pred = self.model.predict(self.X_test).round(0)

        else:
            self.y_pred = self.model.predict(self.X_test)

    def pred(self):
        '''
        Predict with the regression models.

            Parameters:
                self.X_test (array): Test set.
                self.model (model): Trained model.
                self.model_type (str): Type of model used.

            Return:
                self.y_pred (array): Model predictions.
        '''

        self.y_pred = self.model.predict(self.X_test)

    ##### Visualization #####

    def Graficar_Perdida(self):
        '''
        Plot the training and validation loss over the course of training a
        neural network.

            Parameters:
                self.model (model): Trained model.

            Return:
                fig (Figure): Plot.
        '''

        trace0 = go.Scatter(y=self.history.history['loss'],
                            x=self.history.epoch,
                            mode='lines',
                            marker=dict(color="blue", size=5, opacity=0.5),
                            name="Training Loss")

        trace1 = go.Scatter(y=self.history.history['val_loss'],
                            x=self.history.epoch,
                            mode='lines',
                            marker=dict(color="red", size=5, opacity=0.5),
                            name="Validation Loss")

        data = [trace0, trace1]

        fig = go.Figure(data=data,
                        layout=go.Layout(title="Curva de aprendizaje",
                                         yaxis=dict(title="Pérdida"),
                                         xaxis=dict(title="Epoch"),
                                         legend=dict(yanchor='top',
                                                     xanchor='center')))

        return fig

    def Feature_importance(self, importance_type='gain', color='green'):
        '''
        Plot the feature importances of a model. Not applicable to neural
        networks.

            Parameters:
                self.model (model): Trained model.
                importance_type (str): Type of importance to plot.
                color (str): Color used in the plot.

            Return:
                fig (Figure): Figure.
        '''

        if self.model_type == 'lgb':

            # feature values for the lgb model
            valores = dict(
                zip(self.X_train.columns,
                    self.model.feature_importance(
                        importance_type=importance_type)))

        elif self.model_type == 'XGB':

            # feature values for the XGBoost model
            valores = self.model.get_booster().get_score(
                importance_type=importance_type)

        elif self.model_type == 'logreg':

            # feature values for the logistic regression model
            valores = dict(zip(self.X_train.columns, self.model.coef_[0]))

        elif self.model_type == 'linreg':

            # feature values for the linear regression model
            valores = dict(zip(self.X_train.columns, self.model.coef_))

        else:

            # feature values for models based on sklearn algorithms
            valores = dict(
                zip(self.X_train.columns, self.model.feature_importances_))

        # sort the values by importance (ascending, so the largest bars plot on top)
        sorted_tuples = sorted(valores.items(), key=lambda item: item[1])
        valores = {k: v for k, v in sorted_tuples}

        fig = go.Figure(
            go.Bar(x=list(valores.values()),
                   y=list(valores.keys()),
                   orientation='h'))
        fig.update_traces(marker_color=color,
                          marker_line_color='black',
                          marker_line_width=1.5,
                          opacity=0.8)
        fig.update_layout(xaxis_title='Feature importance',
                          yaxis_title='Feature',
                          title='Feature importances',
                          width=900,
                          height=850)

        return fig

    def Plot_conf_mat(self, colorscale='Jet'):
        '''
        Plot the confusion matrix as a heat map.

            Parameters:
                self.y_test (array): Array of true observations.
                self.y_pred (array): Array of predictions.
                colorscale (str): Color scale.

            Return:
                fig (Figure): Figure.
        '''

        z = confusion_matrix(self.y_test, self.y_pred)
        x = ['Izquierda', 'Derecha']
        y = ['Izquierda', 'Derecha']

        z_text = [[str(y) for y in x] for x in z]
        fig = ff.create_annotated_heatmap(z,
                                          x=x,
                                          y=y,
                                          annotation_text=z_text,
                                          colorscale=colorscale)

        fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
                          xaxis_title='Predicted value',
                          yaxis_title='True value')

        fig.update_layout(margin=dict(t=50, l=200))
        fig['data'][0]['showscale'] = True

        return fig

    ##### Metrics #####

    def conf_mat(self):
        '''
        Compute the confusion matrix of a classification model using
        sklearn's confusion_matrix function.

            Parameters:
                self.y_test (array): Array of true class labels.
                self.y_pred (array): Array of model predictions.

            Return:
                conf_mat (array): Confusion matrix values, in the order
                    [TN, FP, FN, TP].
        '''

        return print(confusion_matrix(self.y_test, self.y_pred))

    def class_report(self):
        '''
        Print the classification report of the model.

            Parameters:
                self.y_test (array): Array of true class labels.
                self.y_pred (array): Array of model predictions.

            Return:
                classification_report
        '''

        return print(classification_report(self.y_test, self.y_pred))

    def balance(self):
        '''
        Print the balanced accuracy of the classification model.

            Parameters:
                self.y_test (array): Array of true class labels.
                self.y_pred (array): Array of model predictions.

            Return:
                balanced_accuracy_score
        '''

        return print(balanced_accuracy_score(self.y_test, self.y_pred))

    def mae(self):
        '''
        Print the mean absolute error of the regression model.

            Parameters:
                self.y_test (array): Array of true target values.
                self.y_pred (array): Array of model predictions.

            Return:
                mean_absolute_error
        '''

        return print(mean_absolute_error(self.y_test, self.y_pred))

    def mse(self):
        '''
        Print the mean squared error of the regression model.

            Parameters:
                self.y_test (array): Array of true target values.
                self.y_pred (array): Array of model predictions.

            Return:
                mean_squared_error
        '''

        return print(mean_squared_error(self.y_test, self.y_pred))

    def explain_variance(self):
        '''
        Print the explained variance score of the regression model.

            Parameters:
                self.y_test (array): Array of true target values.
                self.y_pred (array): Array of model predictions.

            Return:
                explained_variance_score
        '''

        return print(explained_variance_score(self.y_test, self.y_pred))
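The balance metric defined in the class docstring, (specificity + sensitivity) / 2, is exactly scikit-learn's balanced accuracy in the binary case; a tiny worked check:

from sklearn.metrics import balanced_accuracy_score

y_true = [1, 1, 1, 0, 0]
y_pred = [1, 1, 0, 0, 1]
# sensitivity = 2/3, specificity = 1/2 -> balance = (2/3 + 1/2) / 2 = 7/12
print(balanced_accuracy_score(y_true, y_pred))  # 0.5833...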
Example #9
File: xgb.py Project: paantya/lish-moa
def get_xgboost_fe(train,
                   targets,
                   test,
                   sub,
                   xgb_params,
                   importance_type='weight',
                   NFOLDS=7,
                   verbosity=0):
    """

    :param train:
    :param targets:
    :param test:
    :param sub:
    :param xgb_params:
    :param importance_type: (default: 'weight') one of ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
    :param NFOLDS:
    :param verbosity:
    :return:
    """

    train = train.iloc[:, 1:]
    test = test.iloc[:, 1:]
    train_score = targets.iloc[:, 1:]
    sample = sub

    cols = train_score.columns
    submission = sample.copy()
    submission.loc[:, train_score.columns] = 0
    # test_preds = np.zeros((test.shape[0], train_score.shape[1]))
    oof_loss = 0

    start_time = datetime.now()

    fe_dict = {}
    for column in train.columns.values:
        fe_dict[column] = 0.0
    for c, column in enumerate(tqdm(cols, 'models_one_cols'), 1):
        y = train_score[column]
        total_loss = 0

        # cv = KFold(n_splits=NFOLDS, shuffle=True).split(train)
        # shuffle=True is required when a random_state is given
        CV = MultilabelStratifiedKFold(n_splits=NFOLDS,
                                       shuffle=True,
                                       random_state=42).split(X=train,
                                                              y=targets)

        start_time_loc = datetime.now()
        for fn, (trn_idx, val_idx) in enumerate(CV):
            if verbosity > 1:
                print('\rFold: ', fn + 1, end='')
            X_train, X_val = train.iloc[trn_idx], train.iloc[val_idx]
            y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

            model = XGBRegressor(**xgb_params)

            model.fit(
                X_train,
                y_train,
            )

            importance = model.get_booster().get_score(
                importance_type=importance_type).items()
            if len(importance) < 1:
                if verbosity:
                    print(
                        f"[column {c} ({column}), CV {fn}] importance len < 1")
            else:
                for k, v in importance:
                    # print(f"{k}: {v}")
                    fe_dict[k] += v / len(cols)
            pred = model.predict(X_val)
            # pred = [n if n>0 else 0 for n in pred]

            loss = metric(y_val, pred)
            total_loss += loss
            predictions = model.predict(test)
            # predictions = [n if n>0 else 0 for n in predictions]
            submission[column] += predictions / NFOLDS

        stop_time_loc = datetime.now()
        oof_loss += total_loss / NFOLDS

        if verbosity > 1:
            print(f"\r[{stop_time_loc - start_time_loc}] Model " + str(c) +
                  ": Loss =" + str(total_loss / NFOLDS))

    stop_time = datetime.now()

    if verbosity:
        print(
            f"[{stop_time - start_time}] oof_loss/len(cols): {oof_loss/len(cols)}"
        )
    # submission.loc[test['cp_type'] == 1, train_score.columns] = 0
    return {
        k: v
        for k, v in sorted(fe_dict.items(), key=lambda kv: kv[1], reverse=True)
    }
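
# A hedged sketch of consuming the returned dict (the dict literal below is
# synthetic; in practice it comes from get_xgboost_fe above, already sorted
# descending by importance):
fe = {'g-0': 12.0, 'c-3': 7.5, 'g-7': 1.0}
top_features = list(fe)[:2]  # insertion order preserves the descending sort
print(top_features)  # ['g-0', 'c-3']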
示例#10
0
from sklearn.datasets import load_boston
from xgboost import XGBRegressor
import pandas as pd

boston = load_boston()

train = pd.DataFrame(boston['data'])
label = pd.Series(boston["target"], name='label')
full = pd.concat((train, label), axis=1)
print(full)
model = XGBRegressor(n_estimators=3, max_depth=1, reg_lambda=0, reg_alpha=0,
                     base_score=0.5)  # pin base_score: newer xgboost versions estimate it from the data
model.fit(train, label)
xgb_res = model.get_booster().trees_to_dataframe()
print(xgb_res)

# import pdb;pdb.set_trace()
# Regression task; the loss is squared error.
# The initial prediction is base_score (0.5), so the negative gradient of each
# sample is label - 0.5; this is why 0.5 is subtracted here.
full["g"] = full["label"] - 0.5
full["h"] = 1
root_score = full["g"].sum()**2 / full.shape[0]  # root node: 506 samples
left_df = full[full.iloc[:, 5] < 6.9410]
right_df = full[full.iloc[:, 5] >= 6.9410]
left_leaf = left_df["g"].sum() / left_df["h"].sum()  # left child: 430 samples
right_leaf = right_df["g"].sum() / right_df["h"].sum()  # right child: 76 samples

# left_df = full[full.iloc[:,12] < 9.725]
# right_df = full[full.iloc[:,12] >= 9.725]

left_score = left_df["g"].sum()**2 / left_df.shape[0]
right_score = right_df["g"].sum()**2 / right_df.shape[0]
print('The Gain for Root is left node score {} + right node score {} - root score {} = {},\nleft leaf: {}, right_leaf: {}'
      .format(left_score, right_score, root_score,
              left_score + right_score - root_score, left_leaf, right_leaf))
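
# A self-contained cross-check of the leaf algebra above (a sketch, not part of the
# original example): a constant feature forces a single-leaf tree, whose weight is
# sum(label - base_score) / n, scaled by the learning rate in the final prediction.
import numpy as np
from xgboost import XGBRegressor

y_toy = np.array([1.0, 2.0, 3.0, 4.0])
X_toy = np.zeros((4, 1))  # constant feature -> no split possible
m = XGBRegressor(n_estimators=1, learning_rate=0.3, base_score=0.5,
                 reg_lambda=0, reg_alpha=0)
m.fit(X_toy, y_toy)
leaf = (y_toy - 0.5).sum() / len(y_toy)     # 2.0
expected = 0.5 + 0.3 * leaf                 # base_score + eta * leaf = 1.1
assert np.allclose(m.predict(X_toy), expected)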
示例#11
0
import math
import os

import pandas as pd
import sklearn.metrics
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


def train():
    data = pd.read_csv('DATA\\data.csv', header=None, sep=' ')
    score = pd.read_csv('DATA\\score.csv', header=None, sep=' ')
    rmse = float('inf')  # best RMSE so far; fixes the NameError on the first comparison
    param_test1 = {
        'max_depth': [i for i in range(20, 30)],
        'learning_rate': [0.05 * i for i in range(1, 10)],
        'min_child_weight': [0.25 * i for i in range(1, 10)],
        'subsample': [0.5 + 0.05 * i for i in range(10)],
        'gamma': [0.002 * i for i in range(50)],
        'colsample_bytree': [0.5 + 0.02 * i for i in range(25)]
    }
    for iters in range(1):
        x_train, x_test, y_train, y_test = train_test_split(data,
                                                            score,
                                                            test_size=0.2,
                                                            random_state=0)
        print('Starting training...')
        count = 0
        for depth in param_test1['max_depth']:
            for learning_r in param_test1['learning_rate']:
                # for min_child_weigh in param_test1['min_child_weight']:
                for sub_sample in param_test1["subsample"]:
                    # for gammas in param_test1["gamma"]:
                    for colsample in param_test1['colsample_bytree']:
                        try:
                            xgb1 = XGBRegressor(
                                learning_rate=learning_r,
                                max_depth=depth,
                                # min_child_weight=min_child_weigh,
                                subsample=sub_sample,
                                colsample_bytree=colsample,
                                # gamma=gammas,
                                eval_metric='rmse',
                                nthread=4,
                                # scale_pos_weight=1,
                                n_estimators=1500)
                            xgb1.fit(x_train, y_train)
                            y_pred = xgb1.predict(x_test)
                            # predictions = [round(value,2) for value in y_pred]
                            # compute the RMSE
                            rmse_new = math.sqrt(
                                sklearn.metrics.mean_squared_error(
                                    y_test, y_pred))
                            print(rmse_new)
                            if rmse_new < rmse:
                                current_path = os.path.join(
                                    'result-xgboost', str(rmse_new))
                                os.makedirs(current_path, exist_ok=True)
                                rmse = rmse_new
                                x_test.to_csv(os.path.join(
                                    current_path, 'test_feat.csv'),
                                              index=False,
                                              header=False)
                                y_test.to_csv(os.path.join(
                                    current_path, 'test_score.csv'),
                                              index=False,
                                              header=False)
                                xgb1.get_booster().save_model(
                                    os.path.join(current_path, 'xgb.model'))
                                print("saved")
                                count += 1
                        except Exception:
                            continue
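
# The nested loops above hand-roll a grid search; a hedged alternative sketch using
# sklearn's ParameterGrid (synthetic data and a deliberately tiny grid assumed here):
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid, train_test_split
from xgboost import XGBRegressor

X_demo = np.random.rand(200, 5)
y_demo = X_demo.sum(axis=1)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo,
                                          test_size=0.2, random_state=0)

best_rmse, best_params = float('inf'), None
for params in ParameterGrid({'max_depth': [3, 5], 'learning_rate': [0.05, 0.1]}):
    reg = XGBRegressor(n_estimators=100, **params)
    reg.fit(X_tr, y_tr)
    rmse = mean_squared_error(y_te, reg.predict(X_te)) ** 0.5
    if rmse < best_rmse:
        best_rmse, best_params = rmse, params
print(best_rmse, best_params)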
示例#12
0
    ax = sns.scatterplot(x=y, y=y_predicted_xgb_cv, alpha=.3, color=colors[4])
    ax.plot(np.arange(0, max(y)), np.arange(0, max(y)), c='grey')
    ax.set_xlim(min(y), max(y))
    ax.set_xlabel('True age')
    ax.set_ylabel('Predicted age')
    plt.title('Crossvalidated predictions (XGB)')
    plt.show()

    # FEATURE IMPORTANCES #
    #######################

    feature_importances = pd.Series(data=xgb.feature_importances_,
                                    index=features.columns)
    feature_importances.to_csv(join(hlp.DATA_DIR, 'feature_importances.csv'))
    fig, ax = plt.subplots(figsize=(14, 10))
    xgb.get_booster().feature_names = list(features.columns)
    plot_importance(xgb, max_num_features=30, ax=ax, importance_type='gain')
    plt.show()

    feature_importances = feature_importances.sort_values(ascending=False)
    proportions = []
    for i in range(1, len(feature_importances)):
        temp = feature_importances[:i]
        md = len([f for f in temp.items() if f[0].startswith('md')])
        proportions.append(md / i)
    plt.plot(proportions, label='Proportion in top')
    plt.title('Importance of theory-driven features')
    plt.xlabel('Top n features')
    plt.axhline(y=23 / features.shape[1],
                color='grey',
示例#13
0
 def __init__(self, model: XGBRegressor, feature_names: List[str]):
     super().__init__(model.get_booster(), feature_names, model.base_score,
                      model.objective)
示例#14
0
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# DataGenerator and SaveFile are project-local helpers not shown in this excerpt.


def GBDT_main():
    '''GBDT main routine'''

    # Load the data
    # dataset = boston_1()
    dataset = DataGenerator()

    # x/y series for the final plot
    X, Y = [], []

    # Initialize the MSE and the cross-validation fold counter
    MSE, fold = 0, 1

    # Number of weak learners
    n_estimators = 1
    # Error threshold (one of three error-evaluation settings)
    Threshold = 70000000

    # k-fold object used to generate training and validation splits
    # (shuffle must be True for random_state to take effect)
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32)

    # Build the GBDT model
    model = XGBRegressor(
        max_depth=7,  # maximum tree depth (tunable)
        learning_rate=0.1,  # learning rate (tunable)
        n_estimators=n_estimators,  # number of trees
        objective='reg:squarederror',  # loss function ('reg:linear' is the deprecated alias)
        nthread=4,  # number of threads
        gamma=0.1,  # minimum loss reduction required to split a node (tunable)
        min_child_weight=1,  # minimum sum of instance weights in a leaf
        subsample=1.,  # fraction of samples used to build each tree
        colsample_bytree=1.,  # fraction of features sampled per tree
        reg_lambda=3,  # L2 regularization weight (tunable)
        scale_pos_weight=1.,  # compensates for label imbalance
        random_state=1000,  # random seed
    )

    while 1:

        # Residual target (target - predicted target); same length as the target
        fin_GBDT_error_target = None

        for train_data_index, cv_data_index in kf.split(dataset):
            # Select the rows for this fold
            train_data, cv_data = dataset[train_data_index], dataset[
                cv_data_index]
            # Fit on the training fold
            model.fit(X=train_data[:, :4], y=train_data[:, -1])

            # Predict on the validation fold
            pred_cv = model.predict(cv_data[:, :4])

            # Accumulate the error vector over the validation folds
            fold_error = cv_data[:, -1] - pred_cv
            fin_GBDT_error_target = fold_error if fin_GBDT_error_target is None else \
                np.hstack((fin_GBDT_error_target, fold_error))

            # Update the running mean of the validation MSE
            MSE = ((fold - 1) * MSE +
                   mean_squared_error(cv_data[:, -1], pred_cv)) / fold
            fold += 1

        print('Number of CART trees: %s, validation MSE: %s' %
              (model.n_estimators, MSE))
        X = [1] if X == [] else X + [X[-1] + 1]
        Y.append(MSE)
        if MSE < Threshold:
            break
        else:
            MSE, fold = 0, 1
            # Validation MSE is still above the threshold: add one more weak learner
            model.n_estimators += 1

    # print(fin_GBDT_error_target, fin_GBDT_error_target.shape)
    # print(X)
    ############################ adjust for your own data ##########################
    data = np.hstack((dataset[:, 4:-1], fin_GBDT_error_target[:, np.newaxis]))
    #################################################################################
    print(data.shape)
    SaveFile(data)

    # Save the model
    model.get_booster().save_model('GBDT.model')

    # Plot the validation error against the number of weak learners
    plt.plot(X, Y)
    # plot_importance(model)
    plt.show()

    # Visualize the model
    digraph = xgb.to_graphviz(model, num_trees=4)
    digraph.format = 'png'
    digraph.view('./boston_xgb')
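
# The loop above grows the ensemble one tree at a time until the validation MSE
# clears a threshold; a hedged sketch of the more common alternative, letting
# XGBoost's early stopping pick the tree count (synthetic data; constructor-level
# early_stopping_rounds assumes xgboost >= 1.6):
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

X_es = np.random.rand(500, 4)
y_es = X_es @ np.array([1.0, -2.0, 0.5, 3.0])
X_tr, X_val, y_tr, y_val = train_test_split(X_es, y_es,
                                            test_size=0.2, random_state=0)

es_model = XGBRegressor(n_estimators=1000, learning_rate=0.1,
                        early_stopping_rounds=20, eval_metric='rmse')
es_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
print('trees kept:', es_model.best_iteration + 1)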
示例#15
0
import datetime
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
import yfinance as yf
from pandas_datareader import data as pdr
from sklearn.preprocessing import StandardScaler
from talib import *  # noqa: F403  (BBANDS, EMA, MACD, the CDL* patterns, etc.)
from xgboost import XGBRegressor

yf.pdr_override()  # classic idiom routing pdr.get_data_yahoo through yfinance


def TA_screening(stock):

    #print(stocklist)

    #tic = time.perf_counter()
    index = []
    start_date = datetime.datetime.now() - datetime.timedelta(days=59)

    end_date = datetime.datetime.now()

    df = pdr.get_data_yahoo(stock,
                            start=start_date,
                            end=end_date,
                            interval="2m",
                            prepost=True)

    #df = pdr.get_data_yahoo(stock, period = "max", interval = "1d", prepost = True)

    df.index = df.index.tz_localize(None)

    #print(df.size)

    # 2-min ticker: 30 intervals = 1 hour, 195 intervals = 1 trading day (old note)
    # there are more intervals that we can use / change

    #1 interval = 1 day

    really_fast = 30
    fast = 60
    slow = 120

    # these are the overlap studies

    def add_indicators():

        upper_band, mid_band, lower_band = BBANDS(df['Adj Close'],
                                                  timeperiod=really_fast,
                                                  nbdevup=2,
                                                  nbdevdn=2,
                                                  matype=0)
        d_ema = DEMA(df['Adj Close'], timeperiod=really_fast)
        E_M_A = EMA(df['Adj Close'], timeperiod=fast)
        ht_trend = HT_TRENDLINE(df['Adj Close'])
        kama = KAMA(df['Adj Close'], timeperiod=fast)
        ma = MA(df['Adj Close'], timeperiod=fast, matype=0)
        #mama, fama = MAMA(df['Adj Close'], fastlimit=really_fast, slowlimit=slow) < this gave me issues?
        #mavp = MAVP(df['Adj Close'])
        mid = MIDPOINT(df['Adj Close'], timeperiod=fast)
        mid_price = MIDPRICE(df['High'], df['Low'], timeperiod=fast)
        sar = SAR(df['High'], df['Low'], acceleration=.02, maximum=.2)
        sarext = SAREXT(df['High'],
                        df['Low'],
                        startvalue=0,
                        offsetonreverse=0,
                        accelerationinitlong=.02,
                        accelerationlong=.02,
                        accelerationmaxlong=.2,
                        accelerationinitshort=.02,
                        accelerationshort=.02,
                        accelerationmaxshort=.2)
        sma = SMA(df['Adj Close'], timeperiod=slow)
        tema = TEMA(df['Adj Close'], timeperiod=slow)
        trima = TRIMA(df['Adj Close'], timeperiod=slow)
        wma = WMA(df['Adj Close'], timeperiod=slow)

        #this is some of the beginning stuff

        O_B_V = OBV(df['Adj Close'], df['Volume'])
        A_D_O_S_C = ADOSC(df['High'],
                          df['Low'],
                          df['Adj Close'],
                          df['Volume'],
                          fastperiod=fast,
                          slowperiod=slow)
        O_G_chaikin = AD(df['High'], df['Low'], df['Adj Close'], df['Volume'])
        HT_DCper = HT_DCPERIOD(df['Adj Close'])
        HT_DCphase = HT_DCPHASE(df['Adj Close'])
        inphase, quad = HT_PHASOR(df['Adj Close'])
        r_sin, leadsin = HT_SINE(df['Adj Close'])

        #volatility
        atr = ATR(df['High'], df['Low'], df['Adj Close'], timeperiod=slow)
        natr = NATR(df['High'], df['Low'], df['Adj Close'], timeperiod=slow)
        t_range = TRANGE(df['High'], df['Low'], df['Adj Close'])

        #below here are momentum ind

        adx = ADX(df['High'], df['Low'], df['Adj Close'], timeperiod=fast)
        adxr = ADXR(df['High'], df['Low'], df['Adj Close'], timeperiod=fast)
        apo = APO(df['Adj Close'],
                  fastperiod=really_fast,
                  slowperiod=fast,
                  matype=0)
        aroon_d, aroon_u = AROON(df['High'], df['Low'], timeperiod=fast)
        aroon_osc = AROONOSC(df['High'], df['Low'], timeperiod=fast)
        bop = BOP(df['Open'], df['High'], df['Low'], df['Adj Close'])
        cci = CCI(df['High'], df['Low'], df['Adj Close'], timeperiod=fast)
        cmo = CMO(df['Adj Close'], timeperiod=fast)
        dx = DX(df['High'], df['Low'], df['Adj Close'], timeperiod=fast)
        macd, macdsig, macdhist = MACD(df['Adj Close'],
                                       fastperiod=fast,
                                       slowperiod=slow,
                                       signalperiod=really_fast)
        macdex, macdexsig, macdexhist = MACDEXT(df['Adj Close'],
                                                fastperiod=fast,
                                                fastmatype=0,
                                                slowperiod=slow,
                                                slowmatype=0,
                                                signalperiod=really_fast,
                                                signalmatype=0)
        macdfixd, macdfixdsig, macdfixdhist = MACDFIX(df['Adj Close'],
                                                      signalperiod=really_fast)
        # more momo's

        mfi = MFI(df['High'],
                  df['Low'],
                  df['Adj Close'],
                  df['Volume'],
                  timeperiod=fast)
        min_di = MINUS_DI(df['High'],
                          df['Low'],
                          df['Adj Close'],
                          timeperiod=fast)
        min_dm = MINUS_DM(df['High'], df['Low'], timeperiod=fast)
        momo = MOM(df['Adj Close'], timeperiod=really_fast)
        plus_di = PLUS_DI(df['High'],
                          df['Low'],
                          df['Adj Close'],
                          timeperiod=fast)
        plus_dm = PLUS_DM(df['High'], df['Low'], timeperiod=fast)
        ppo = PPO(df['Adj Close'],
                  fastperiod=really_fast,
                  slowperiod=fast,
                  matype=0)
        roc = ROC(df['Adj Close'], timeperiod=fast)
        rocp = ROCP(df['Adj Close'], timeperiod=fast)
        rocr = ROCR(df['Adj Close'], timeperiod=fast)
        rocr_hund = ROCR100(df['Adj Close'], timeperiod=fast)
        rsi_fastk, rsi_fastd = STOCHRSI(df['Adj Close'],
                                        timeperiod=fast,
                                        fastk_period=slow,
                                        fastd_period=really_fast,
                                        fastd_matype=0)
        trix = TRIX(df['Adj Close'], timeperiod=slow)
        ult_osc = ULTOSC(df['High'],
                         df['Low'],
                         df['Adj Close'],
                         timeperiod1=really_fast,
                         timeperiod2=fast,
                         timeperiod3=slow)

        #old some of the first added
        R_S_I = RSI(df['Adj Close'], timeperiod=slow)
        slowk, slowd = STOCH(df['High'],
                             df['Low'],
                             df['Adj Close'],
                             fastk_period=fast,
                             slowk_period=slow,
                             slowk_matype=0,
                             slowd_period=slow,
                             slowd_matype=0)
        fastk, fastd = STOCHF(df['High'],
                              df['Low'],
                              df['Adj Close'],
                              fastk_period=fast,
                              fastd_period=really_fast,
                              fastd_matype=0)

        real = WILLR(df['High'], df['Low'], df['Adj Close'], timeperiod=slow)

        # below are the TA indicators

        two_crows = CDL2CROWS(df['Open'], df['High'], df['Low'],
                              df['Adj Close'])
        three_crows = CDL3BLACKCROWS(df['Open'], df['High'], df['Low'],
                                     df['Adj Close'])
        three_inside = CDL3INSIDE(df['Open'], df['High'], df['Low'],
                                  df['Adj Close'])
        three_line = CDL3LINESTRIKE(df['Open'], df['High'], df['Low'],
                                    df['Adj Close'])
        three_out = CDL3OUTSIDE(df['Open'], df['High'], df['Low'],
                                df['Adj Close'])
        three_stars = CDL3STARSINSOUTH(df['Open'], df['High'], df['Low'],
                                       df['Adj Close'])
        three_soldier = CDL3WHITESOLDIERS(df['Open'], df['High'], df['Low'],
                                          df['Adj Close'])
        baby = CDLABANDONEDBABY(df['Open'],
                                df['High'],
                                df['Low'],
                                df['Adj Close'],
                                penetration=0)
        adv = CDLADVANCEBLOCK(df['Open'], df['High'], df['Low'],
                              df['Adj Close'])
        belt_hold = CDLBELTHOLD(df['Open'], df['High'], df['Low'],
                                df['Adj Close'])
        breakaway = CDLBREAKAWAY(df['Open'], df['High'], df['Low'],
                                 df['Adj Close'])
        closingmara = CDLCLOSINGMARUBOZU(df['Open'], df['High'], df['Low'],
                                         df['Adj Close'])
        baby_swallow = CDLCONCEALBABYSWALL(df['Open'], df['High'], df['Low'],
                                           df['Adj Close'])

        #more TA

        counter = CDLCOUNTERATTACK(df['Open'], df['High'], df['Low'],
                                   df['Adj Close'])
        dark_cloud = CDLDARKCLOUDCOVER(df['Open'],
                                       df['High'],
                                       df['Low'],
                                       df['Adj Close'],
                                       penetration=0)
        doji = CDLDOJI(df['Open'], df['High'], df['Low'], df['Adj Close'])
        doji_star = CDLDOJISTAR(df['Open'], df['High'], df['Low'],
                                df['Adj Close'])
        dragon_doji = CDLDRAGONFLYDOJI(df['Open'], df['High'], df['Low'],
                                       df['Adj Close'])
        engulf = CDLENGULFING(df['Open'], df['High'], df['Low'],
                              df['Adj Close'])
        evening_star = CDLEVENINGSTAR(df['Open'], df['High'], df['Low'],
                                      df['Adj Close'])
        gapside = CDLGAPSIDESIDEWHITE(df['Open'], df['High'], df['Low'],
                                      df['Adj Close'])
        gravestone = CDLGRAVESTONEDOJI(df['Open'], df['High'], df['Low'],
                                       df['Adj Close'])
        hammer = CDLHAMMER(df['Open'], df['High'], df['Low'], df['Adj Close'])
        hang_man = CDLHANGINGMAN(df['Open'], df['High'], df['Low'],
                                 df['Adj Close'])
        harami = CDLHARAMI(df['Open'], df['High'], df['Low'], df['Adj Close'])
        harami_cross = CDLHARAMICROSS(df['Open'], df['High'], df['Low'],
                                      df['Adj Close'])

        #more TA

        high_wave = CDLHIGHWAVE(df['Open'], df['High'], df['Low'],
                                df['Adj Close'])
        hikkake = CDLHIKKAKE(df['Open'], df['High'], df['Low'],
                             df['Adj Close'])
        hikkake_mod = CDLHIKKAKEMOD(df['Open'], df['High'], df['Low'],
                                    df['Adj Close'])
        pidgeon = CDLHOMINGPIGEON(df['Open'], df['High'], df['Low'],
                                  df['Adj Close'])
        id_three_crows = CDLIDENTICAL3CROWS(df['Open'], df['High'], df['Low'],
                                            df['Adj Close'])
        in_neck = CDLINNECK(df['Open'], df['High'], df['Low'], df['Adj Close'])
        inv_hammer = CDLINVERTEDHAMMER(df['Open'], df['High'], df['Low'],
                                       df['Adj Close'])
        kicking = CDLKICKING(df['Open'], df['High'], df['Low'],
                             df['Adj Close'])
        kicking_len = CDLKICKINGBYLENGTH(df['Open'], df['High'], df['Low'],
                                         df['Adj Close'])
        ladder_bot = CDLLADDERBOTTOM(df['Open'], df['High'], df['Low'],
                                     df['Adj Close'])
        doji_long = CDLLONGLEGGEDDOJI(df['Open'], df['High'], df['Low'],
                                      df['Adj Close'])
        long_line = CDLLONGLINE(df['Open'], df['High'], df['Low'],
                                df['Adj Close'])
        marabozu = CDLMARUBOZU(df['Open'], df['High'], df['Low'],
                               df['Adj Close'])

        #more TA

        match_glow = CDLMATCHINGLOW(df['Open'], df['High'], df['Low'],
                                    df['Adj Close'])
        mat_hold = CDLMATHOLD(df['Open'],
                              df['High'],
                              df['Low'],
                              df['Adj Close'],
                              penetration=0)
        morning_doji = CDLMORNINGDOJISTAR(df['Open'],
                                          df['High'],
                                          df['Low'],
                                          df['Adj Close'],
                                          penetration=0)
        morning_star = CDLMORNINGSTAR(df['Open'],
                                      df['High'],
                                      df['Low'],
                                      df['Adj Close'],
                                      penetration=0)
        on_neck = CDLONNECK(df['Open'], df['High'], df['Low'], df['Adj Close'])
        pierce = CDLPIERCING(df['Open'], df['High'], df['Low'],
                             df['Adj Close'])
        rickshaw = CDLRICKSHAWMAN(df['Open'], df['High'], df['Low'],
                                  df['Adj Close'])
        rise_fall = CDLRISEFALL3METHODS(df['Open'], df['High'], df['Low'],
                                        df['Adj Close'])
        sep_line = CDLSEPARATINGLINES(df['Open'], df['High'], df['Low'],
                                      df['Adj Close'])
        shooting_star = CDLSHOOTINGSTAR(df['Open'], df['High'], df['Low'],
                                        df['Adj Close'])
        sl_candle = CDLSHORTLINE(df['Open'], df['High'], df['Low'],
                                 df['Adj Close'])
        spin_top = CDLSPINNINGTOP(df['Open'], df['High'], df['Low'],
                                  df['Adj Close'])
        stalled = CDLSTALLEDPATTERN(df['Open'], df['High'], df['Low'],
                                    df['Adj Close'])

        #more TA

        stick_sand = CDLSTICKSANDWICH(df['Open'], df['High'], df['Low'],
                                      df['Adj Close'])
        takuri = CDLTAKURI(df['Open'], df['High'], df['Low'], df['Adj Close'])
        tasuki_gap = CDLTASUKIGAP(df['Open'], df['High'], df['Low'],
                                  df['Adj Close'])
        thrust = CDLTHRUSTING(df['Open'], df['High'], df['Low'],
                              df['Adj Close'])
        tristar = CDLTRISTAR(df['Open'], df['High'], df['Low'],
                             df['Adj Close'])
        three_river = CDLUNIQUE3RIVER(df['Open'], df['High'], df['Low'],
                                      df['Adj Close'])
        ud_two_gap = CDLUPSIDEGAP2CROWS(df['Open'], df['High'], df['Low'],
                                        df['Adj Close'])
        down_three_gap = CDLXSIDEGAP3METHODS(df['Open'], df['High'], df['Low'],
                                             df['Adj Close'])

        #76 vars

        #are_all_zero = (test_TA == 0).all()
        # True if all values are 0; False if any value is non-zero

        df.drop(['Close'], axis=1, inplace=True)

        df['upper_band'] = upper_band
        df['lower_band'] = lower_band
        df['mid_band'] = mid_band
        df['d_ema'] = d_ema
        df['ht_trend'] = ht_trend
        df['kama'] = kama
        df['ma'] = ma
        #df['mama'] = mama
        df['mid'] = mid
        df['mid_price'] = mid_price

        df['sar'] = sar
        df['sarext'] = sarext
        df['sma'] = sma
        df['tema'] = tema
        df['trima'] = trima
        df['wma'] = wma
        #df['fama'] = fama

        df['EMA'] = E_M_A
        df['SlowK'] = slowk
        df['SlowD'] = slowd
        df['R_S_I'] = R_S_I
        df['FastK'] = fastk
        df['FastD'] = fastd
        df['WilliamsR'] = real

        df['atr'] = atr
        df['natr'] = natr
        df['t_range'] = t_range

        #df['na_tr'] = natr

        df['OBV'] = O_B_V
        df['ADOSC'] = A_D_O_S_C
        df['ogchaikin'] = O_G_chaikin
        df['HTDCperiod'] = HT_DCper
        df['HTDCphase'] = HT_DCphase
        df['inphase'] = inphase
        df['quad'] = quad
        df['rsin'] = r_sin
        df['leadsin'] = leadsin

        df['mfi'] = mfi
        df['min_di'] = min_di
        df['min_dm'] = min_dm
        df['momo'] = momo
        df['plus_di'] = plus_di
        df['plus_dm'] = plus_dm
        df['ppo'] = ppo
        df['roc'] = roc
        df['rocp'] = rocp

        df['rocr'] = rocr
        df['rocr_hund'] = rocr_hund
        df['rsi_fastk'] = rsi_fastk
        df['rsi_fastd'] = rsi_fastd
        df['trix'] = trix
        df['ult_osc'] = ult_osc

        df['adx'] = adx
        df['adxr'] = adxr
        df['apo'] = apo
        df['aroon_d'] = aroon_d
        df['aroon_u'] = aroon_u
        df['aroon_osc'] = aroon_osc
        df['bop'] = bop
        df['cci'] = cci
        df['cmo'] = cmo

        df['dx'] = dx
        df['macd'] = macd
        df['macdsig'] = macdsig
        df['macdhist'] = macdhist
        df['macdex'] = macdex
        df['macdexsig'] = macdexsig
        df['macdexhist'] = macdexhist
        df['macdfixd'] = macdfixd
        df['macdfixdsig'] = macdfixdsig
        df['macdfixdhist'] = macdfixdhist

        df['two_crows'] = two_crows
        df['three_crows'] = three_crows
        df['three_inside'] = three_inside
        df['three_line'] = three_line
        df['three_out'] = three_out
        df['three_stars'] = three_stars
        df['three_soldier'] = three_soldier
        df['baby'] = baby
        df['adv'] = adv
        df['belt_hold'] = belt_hold
        df['breakaway'] = breakaway
        df['closingmara'] = closingmara
        df['baby_swallow'] = baby_swallow

        df['counter'] = counter
        df['dark_cloud'] = dark_cloud
        df['doji'] = doji
        df['doji_star'] = doji_star
        df['dragon_doji'] = dragon_doji
        df['engulf'] = engulf
        df['evening_star'] = evening_star
        df['gapside'] = gapside
        df['gravestone'] = gravestone
        df['hammer'] = hammer
        df['hang_man'] = hang_man
        df['harami'] = harami
        df['harami_cross'] = harami_cross

        df['high_wave'] = high_wave
        df['hikkake'] = hikkake
        df['hikkake_mod'] = hikkake_mod
        df['pidgeon'] = pidgeon
        df['id_three_crows'] = id_three_crows
        df['in_neck'] = in_neck
        df['inv_hammer'] = inv_hammer
        df['kicking'] = kicking
        df['kicking_len'] = kicking_len
        df['ladder_bot'] = ladder_bot
        df['doji_long'] = doji_long
        df['long_line'] = long_line
        df['marabozu'] = marabozu

        #more TA

        df['match_glow'] = match_glow
        df['mat_hold'] = mat_hold
        df['morning_doji'] = morning_doji
        df['morning_star'] = morning_star
        df['on_neck'] = on_neck
        df['pierce'] = pierce
        df['rickshaw'] = rickshaw
        df['rise_fall'] = rise_fall
        df['sep_line'] = sep_line
        df['shooting_star'] = shooting_star
        df['sl_candle'] = sl_candle
        df['spin_top'] = spin_top
        df['stalled'] = stalled

        df['stick_sand'] = stick_sand
        df['takuri'] = takuri
        df['tasuki_gap'] = tasuki_gap
        df['thrust'] = thrust
        df['tristar'] = tristar
        df['three_river'] = three_river
        df['ud_two_gap'] = ud_two_gap
        df['down_three_gap'] = down_three_gap

    add_indicators()
    # Move the datetime index out into a regular column
    df.reset_index(level=0, inplace=True)

    # Change all column headings to be lower case, and remove spacing
    df.columns = [str(x).lower().replace(' ', '_') for x in df.columns]

    # Get difference between high and low of each day
    df['range_hl'] = df['high'] - df['low']
    df.drop(['high', 'low'], axis=1, inplace=True)
    # Get difference between open and close of each day
    df['range_oc'] = df['open'] - df['adj_close']
    df.drop(['open'], axis=1, inplace=True)
    # Add a column 'order_day' to indicate the order of the rows by date
    df['order_day'] = list(range(len(df)))
    # merging_keys
    merging_keys = ['order_day']

    #define shift range
    # 2 min intervals - 30 = 1hr

    N = 15

    lag_cols = [
        'ema', 'slowk', 'slowd', 'r_s_i', 'fastk', 'fastd', 'williamsr',
        'volume', 'range_hl', 'range_oc', 'adj_close', 'obv', 'adosc',
        'ogchaikin', 'htdcperiod', 'htdcphase', 'inphase', 'quad', 'rsin',
        'leadsin', 'two_crows', 'three_crows', 'three_inside', 'three_line',
        'three_out', 'three_stars', 'three_soldier', 'baby', 'adv',
        'belt_hold', 'breakaway', 'closingmara', 'baby_swallow', 'counter',
        'dark_cloud', 'doji', 'doji_star', 'dragon_doji', 'engulf',
        'evening_star', 'gapside', 'gravestone', 'hammer', 'hang_man',
        'harami', 'harami_cross', 'high_wave', 'hikkake', 'hikkake_mod',
        'pidgeon', 'id_three_crows', 'in_neck', 'inv_hammer', 'kicking',
        'kicking_len', 'ladder_bot', 'doji_long', 'long_line', 'marabozu',
        'match_glow', 'mat_hold', 'morning_doji', 'morning_star', 'on_neck',
        'pierce', 'rickshaw', 'rise_fall', 'sep_line', 'shooting_star',
        'sl_candle', 'spin_top', 'stalled', 'stick_sand', 'takuri',
        'tasuki_gap', 'thrust', 'tristar', 'three_river', 'ud_two_gap',
        'down_three_gap', 'upper_band', 'lower_band', 'mid_band', 'd_ema',
        'ht_trend', 'kama', 'ma', 'mid', 'mid_price', 'sar', 'sarext', 'sma',
        'tema', 'trima', 'wma', 'adx', 'adxr', 'apo', 'aroon_d', 'aroon_u',
        'aroon_osc', 'bop', 'cci', 'cmo', 'dx', 'macd', 'macdsig', 'macdhist',
        'macdex', 'macdexsig', 'macdexhist', 'macdfixd', 'macdfixdsig',
        'macdfixdhist', 'mfi', 'min_di', 'min_dm', 'momo', 'plus_di',
        'plus_dm', 'ppo', 'roc', 'rocp', 'rocr', 'rocr_hund', 'rsi_fastk',
        'rsi_fastd', 'trix', 'ult_osc', 'atr', 'natr', 't_range'
    ]

    shift_range = [x + 1 for x in range(N)]

    for shift in shift_range:
        train_shift = df[merging_keys + lag_cols].copy()

        # E.g. order_day of 0 becomes 1, for shift = 1.
        # So when this is merged with order_day of 1 in df, this will represent lag of 1.
        train_shift['order_day'] = train_shift['order_day'] + shift

        foo = lambda x: '{}_lag_{}'.format(x, shift) if x in lag_cols else x
        train_shift = train_shift.rename(columns=foo)

        df = pd.merge(df, train_shift, on=merging_keys,
                      how='left')  #.fillna(0)

    del train_shift

    df.fillna(0, inplace=True)
    # other ways to handle the NaN values exist (e.g. dropping the first N rows)
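
    # A minimal illustration of the shift-merge lag trick above (a hedged sketch
    # with a hypothetical 'demo' frame; not executed as part of this pipeline):
    #   demo = pd.DataFrame({'order_day': range(4), 'x': [10, 11, 12, 13]})
    #   lag1 = demo.rename(columns={'x': 'x_lag_1'})
    #   lag1['order_day'] = lag1['order_day'] + 1
    #   pd.merge(demo, lag1, on='order_day', how='left')  # row t carries x from t-1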

    #defining test and train len
    num_test = int(.05 * len(df))
    num_train = len(df) - num_test

    # Split into train, cv, and test
    train = df[:num_train]
    test = df[num_train:]

    # The base columns to scale are exactly the lag source columns defined above
    cols_to_scale = lag_cols.copy()

    # ...plus every lagged copy of those columns
    for i in range(1, N + 1):
        cols_to_scale += ['{}_lag_{}'.format(col, i) for col in lag_cols]

    #print(train.columns.tolist())

    # Do scaling for train set
    # Here we only scale the train dataset, and not the entire dataset to prevent information leak
    scaler = StandardScaler()

    scaler.fit(train[cols_to_scale])
    train_scaled = scaler.transform(train[cols_to_scale])

    # Convert the numpy array back into pandas dataframe

    #print(cols_to_scale)

    train_scaled = pd.DataFrame(train_scaled, columns=cols_to_scale)

    #duplicate_columns = train_scaled.columns[train_scaled.columns.duplicated()]

    #print(duplicate_columns)

    #df.to_csv(file_name)
    # this may be a good place to save to a .csv file and export the data to matlab
    #  / do diagnostic visualizations

    #print(train_scaled.columns.tolist())
    #train_scaled['datetime'] = train.reset_index()['datetime']
    #print("train_scaled.shape = " + str(train_scaled.shape))
    #print(train_scaled.head(5))

    #this line is needed for the PCA

    #train_scaled = train_scaled[100:]

    # Note: fitting a second scaler on the test set means train and test are scaled
    # with different statistics; reusing `scaler` from the train set would be the
    # conventional leak-free choice, but the original pipeline is kept here.
    scaler_2 = StandardScaler()
    scaler_2.fit(test[cols_to_scale])
    test_scaled = scaler_2.transform(test[cols_to_scale])

    # Convert the numpy array back into pandas dataframe

    test_scaled = pd.DataFrame(test_scaled, columns=cols_to_scale)

    target = "adj_close"
    # Copy before removing the target so cols_to_scale itself stays intact
    features = [c for c in cols_to_scale if c != target]

    # Split into X and y
    X_train_scaled = train_scaled[features]
    y_train_scaled = train_scaled[target]

    X_test_scaled = test_scaled[features]
    y_test_scaled = test_scaled[target]

    ## PCA testing needs to be done here to see what should / should not be included.

    #print(X_sample_scaled.columns.tolist())
    #print(type(X_train_scaled))
    #testing = X_train_scaled.to_numpy()
    #print(np.isnan(testing.any()))
    #print(np.isfinite(testing))
    #pca = PCA(n_components = 80).fit(X_train_scaled)
    #print(pca.explained_variance_ratio_)
    #print(pca.singular_values_)
    #X_train_scaled, y_train_scaled, X_sample_scaled, y_sample_scaled = preprocessing_data(stock = 'BB')

    ## these values can be adjusted to customize the model

    #rand = np.random.randint(low=1,high = 999)

    model = XGBRegressor(seed=100,
                         n_estimators=200,
                         max_depth=20,
                         learning_rate=0.1,
                         min_child_weight=1,
                         subsample=1,
                         colsample_bytree=1,
                         colsample_bylevel=1,
                         gamma=0.1)

    # Train the regressor

    model.fit(X_train_scaled, y_train_scaled)
    #xgb.plot_importance(model)
    #feat_list = xgb.plot_importance(model).get_yticklabels()[::-1]

    #print(feat_list)

    #xgb.plot_tree(model)

    path_out = 'C:\\Users\\Michael\\Desktop\\Python\\Stonks\\YF & modeling\\TestSP500Out\\'

    feat_save_name = path_out + stock + "features"
    tree_save_name = path_out + stock + "tree"

    xgb.plot_importance(model).figure.savefig(feat_save_name, dpi=600)
    xgb.plot_tree(model).figure.savefig(tree_save_name, dpi=600)

    feature_important = model.get_booster().get_score(importance_type='weight')
    keys = list(feature_important.keys())
    values = list(feature_important.values())

    #print(keys)
    #print(values)

    data = pd.DataFrame(data=values, index=keys,
                        columns=["score"]).sort_values(by="score",
                                                       ascending=False)

    print(data.head(5))
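
    # A hedged variant: importance_type='gain' usually ranks predictive value
    # better than raw split counts ('weight'):
    #   gain_scores = model.get_booster().get_score(importance_type='gain')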

    #data.plot(kind='barh')

    #plt.show()

    #doing predictions on model
    #print(X_train_scaled)
    test_pred = model.predict(X_test_scaled)
    #insert back into test_scaled array
    test_scaled['adj_close'] = test_pred
    # There is some consideration to be made: if we can grab the top 20-30 most
    # influential features from xgboost, we could use them to train a different model type.
    # There is also consideration to be made about exporting these models.

    # this methodology works for saving a trained model
    with open("test.model", "wb") as fh:
        pickle.dump(model, fh)
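
    # Loading it back later (a sketch; same "test.model" path as above):
    #   with open("test.model", "rb") as fh:
    #       loaded_model = pickle.load(fh)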

    #unscaling
    pred_unscaled = scaler_2.inverse_transform(test_scaled)
    plt.figure()
    #plotting
    plt.plot(test_scaled.index, test[target])
    # column 10 of cols_to_scale is 'adj_close', i.e. the predicted series
    plt.plot(test_scaled.index, pred_unscaled[:, 10])

    plt.legend(('True', 'est'), loc='upper left')

    plt.title(str(stock))

    plt.xlabel("Intervals")

    plt.ylabel('$')

    stonk_path_out = path_out + stock

    plt.savefig(stonk_path_out)

    test_true_num = test[target].iloc[-1]
    test_pred_num = pred_unscaled[-1, 10]

    is_going_up = test_pred_num > test_true_num
    print(stock)
    print(is_going_up)

    #change intervals back to date-time
    '''toc = time.perf_counter()