def __init__(self, model: XGBRegressor, feature_names: List[str]):
    # XGBRegressor.base_score defaults to 0.5.
    base_score = model.base_score
    if base_score is None:
        base_score = 0.5
    super().__init__(model.get_booster(), feature_names, base_score,
                     model.objective)
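# A minimal standalone check (my addition, assuming a recent xgboost): on an
# unfitted XGBRegressor the sklearn wrapper reports base_score as None, which
# is why the fallback above is needed.
from xgboost import XGBRegressor

m = XGBRegressor()
print(m.base_score)  # None until fit; XGBoost then applies its 0.5 default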
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, plot_importance


def feature_importance():
    """Obtains the most important features using XGBoost."""
    dataset = pd.read_csv('../results/dataframe_final_project.csv', index_col=0)
    dataset['Precio_Precio'] = np.log(dataset['Precio_Precio'])
    X = dataset.drop(columns=[
        'Precio_Precio', 'Precio_Open', 'Precio_Low', 'Precio_Close',
        'Precio_High', 'Fecha'
    ]).values
    y = dataset['Precio_Precio'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)
    xgb_reg = XGBRegressor()
    xgb_reg.fit(X_train, y_train)
    y_hat = xgb_reg.predict(X_test)
    print('score:', r2_score(y_test, y_hat))
    print(xgb_reg.get_booster().get_score(importance_type="gain"))
    plot_importance(
        xgb_reg,
        importance_type='gain',
        max_num_features=20,
        height=0.8,
    )
    plt.savefig('../figs/feature_importance.png')
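# Hedged follow-up sketch (my addition, reusing a fitted xgb_reg): get_score()
# returns a {feature: gain} dict, which is easier to inspect as a sorted
# pandas Series than as raw printed output:
#
#     gains = xgb_reg.get_booster().get_score(importance_type="gain")
#     print(pd.Series(gains).sort_values(ascending=False).head(20))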
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


def PredictMetadata(ASV_table, metadata_variables, train_size, test_size, seed):
    # Note: `metadata` and `MergeTable` are referenced but not passed in;
    # they are assumed to be defined at module level.
    X_ASV = ASV_table
    X_ASV.columns = [''] * len(X_ASV.columns)
    X_ASV = X_ASV.to_numpy()
    metadata_list = []
    for i in metadata_variables:
        # y_CDOM = metadata.loc[:, i][:, np.newaxis]
        # Split data into train and test sets.
        y_meta = metadata.loc[:, i]  # requires a 1-D array
        X_train, X_test, y_train, y_test = train_test_split(
            X_ASV, y_meta,
            train_size=train_size, test_size=test_size, random_state=seed)
        # Fit the model on the training data with early stopping.
        model = XGBRegressor(objective='reg:squarederror')
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='rmse',
                  early_stopping_rounds=100,
                  verbose=False)
        # Get the best model by test RMSE.
        XGboost_best_model_index = model.best_iteration
        XGboost_best_iteration = model.get_booster().best_ntree_limit
        # Make predictions for the full dataset using only the best trees.
        y_pred = model.predict(X_ASV, ntree_limit=XGboost_best_iteration)
        metadata_list.append(y_pred[:, np.newaxis])
    return MergeTable(metadata_list, metadata_variables)
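# Hedged note (my addition): in xgboost >= 1.4, ntree_limit/best_ntree_limit
# are deprecated in favour of best_iteration plus iteration_range, e.g.
#
#     y_pred = model.predict(X_ASV,
#                            iteration_range=(0, model.best_iteration + 1))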
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor


class Prediction_xgb:
    def __init__(self, model_file):
        self.xgb_model_path = model_file
        self.param = {
            'learning_rate': 0.1,
            'max_depth': 5,
            'gamma': 0,
            'min_child_weight': 3,
            'subsample': 0.8,
            # 'colsample': 0.75,
            'colsample_bytree': 0.8,
            'scale_pos_weight': 1,
            'verbosity': 3,
            'objective': 'reg:squarederror',
            'eval_metric': 'mae',
        }
        self.model = XGBRegressor(
            learning_rate=0.1,
            n_estimators=96,            # number of trees
            max_depth=5,                # tree depth
            min_child_weight=3,         # minimum leaf weight
            gamma=0.,                   # penalty on the number of leaf nodes
            subsample=0.8,              # sample 80% of rows per tree
            # colsample=0.75,
            colsample_bytree=0.6,       # sample 60% of features per tree
            verbosity=3,
            objective='reg:squarederror',  # loss function
            # eval_metric='mae',
            # scale_pos_weight=1,       # for class imbalance
            # random_state=27,          # random seed
        )

    def cut_data(self, data_x, data_y):
        # Keep only rows where at least one of columns 4..10 is non-zero.
        res_x = []
        res_y = []
        for i in range(data_x.shape[0]):
            if any(data_x[i][4:11]):
                res_x.append(data_x[i])
                res_y.append(data_y[i])
        return np.array(res_x), np.array(res_y)

    def train_XGBClassifier(self):
        # Assumes self.train_data and self.train_y have been set elsewhere.
        print('---xgb start---')
        self.model.fit(self.train_data, self.train_y,
                       eval_set=[(self.train_data, self.train_y)])

    def train_xgboost(self, train_data, train_y):
        print('---xgb start---')
        train_data, train_y = self.cut_data(train_data, train_y)
        train_xdf = pd.DataFrame(train_data[:-50])
        train_ydf = pd.DataFrame(train_y[:-50])
        test_xdf = pd.DataFrame(train_data[-50:])
        test_ydf = pd.DataFrame(train_y[-50:])
        dtrain = xgb.DMatrix(train_xdf, label=train_ydf)
        dtest = xgb.DMatrix(test_xdf, label=test_ydf)

        def modelfit(alg, train_xdf, train_ydf, useTrainCV=True, cv_folds=5,
                     early_stopping_rounds=20):
            if useTrainCV:
                xgb_param = alg.get_xgb_params()
                xgtrain = xgb.DMatrix(train_xdf, label=train_ydf)
                cvresult = xgb.cv(
                    xgb_param, xgtrain,
                    num_boost_round=alg.get_params()['n_estimators'],
                    nfold=cv_folds, metrics='mae',
                    early_stopping_rounds=early_stopping_rounds)
                alg.set_params(n_estimators=cvresult.shape[0])
                print(cvresult.shape[0])
            # Fit the algorithm on the data.
            alg.fit(train_xdf, train_ydf, eval_metric='mae')
            # bst = xgb.train(alg.get_xgb_params(), xgtrain,
            #                 num_boost_round=cvresult.shape[0])
            # Predict the held-out set:
            dtrain_predictions = alg.predict(test_xdf)
            # dtrain_predictions = bst.predict(dtest)
            # Print model report:
            print("\nModel Report")
            test_y = dtest.get_label()
            print('error---', np.mean(abs(dtrain_predictions - test_y) / test_y))
            print("MAE : %.4g" % mean_absolute_error(test_y, dtrain_predictions))
            feat_imp = pd.Series(
                alg.get_booster().get_fscore()).sort_values(ascending=False)
            feat_imp.plot(kind='bar', title='Feature Importances')
            plt.ylabel('Feature Importance Score')
            plt.show()

        # modelfit(self.model, train_xdf, train_ydf)

        param_test1 = {
            # 'max_depth': range(4, 7, 1),
            # 'min_child_weight': range(2, 4, 1),
            # 'gamma': [i / 10.0 for i in range(0, 5)],
            'subsample': [i / 10.0 for i in range(6, 10)],
            'colsample_bytree': [i / 10.0 for i in range(6, 10)]
        }
        gsearch1 = GridSearchCV(
            estimator=XGBRegressor(
                learning_rate=0.01,
                n_estimators=96,
                max_depth=5,
                min_child_weight=2,
                gamma=0.,
                subsample=0.8,
                colsample_bytree=0.6,
                verbosity=3,
                objective='reg:squarederror',
                eval_metric='mae',
                scale_pos_weight=1,
                # random_state=27,
            ),
            param_grid=param_test1,
            scoring='neg_mean_absolute_error',
            n_jobs=4,
            cv=5)
        # gsearch1.fit(test_xdf, test_ydf)
        # print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

        bst = xgb.train(self.param, dtrain, num_boost_round=200)
        bst.save_model(self.xgb_model_path)
        test_preds = bst.predict(dtest)
        test_y = dtest.get_label()
        print('error---', np.mean(abs(test_preds - test_y) / test_y))
        print('---')

        self.model.fit(train_xdf, train_ydf, eval_metric='mae')
        # bst = xgb.train(alg.get_xgb_params(), xgtrain,
        #                 num_boost_round=cvresult.shape[0])
        dtrain_predictions = self.model.predict(test_xdf)
        # dtrain_predictions = bst.predict(dtest)
        print("\nModel Report")
        print('error---', np.mean(abs(dtrain_predictions - test_y) / test_y))
        # print("MAE : %.4g" % mean_absolute_error(test_y, dtrain_predictions))
        feat_imp = pd.Series(
            self.model.get_booster().get_fscore()).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
        plt.show()

    def pred_xgboost(self, predict_data):
        predict_df = pd.DataFrame(predict_data)
        dpred = xgb.DMatrix(predict_df)
        bst = xgb.Booster(model_file=self.xgb_model_path)
        preds = bst.predict(dpred)
        print('preds---', preds)
        print('---')
        return preds
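# Hedged usage sketch (my addition; the feature arrays are illustrative --
# cut_data expects columns 4..10 of each row to act as a keep/drop gate):
#
#     predictor = Prediction_xgb('xgb.model')
#     predictor.train_xgboost(train_x, train_y)
#     preds = predictor.pred_xgboost(new_x)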
def a():
    print("error")


a()
# a()

print(set([(199, 198), (198, 178)]))
list_eg = [{198} & set(each) for each in [(199, 198), (198, 178)]
           if {198} & set(each)]
print(bool(list_eg))

from sklearn.datasets import load_iris, load_boston
from xgboost import XGBClassifier, XGBRegressor
import pandas as pd

boston = load_boston()
train = pd.DataFrame(boston['data'])
label = pd.Series(boston['target'], name='label')
full = pd.concat((train, label), axis=1)
model = XGBRegressor(n_estimators=3, max_depth=1, reg_lambda=0, reg_alpha=0)
model.fit(train, label)
print(model.predict(train))  # predict() requires input data
model.get_booster().trees_to_dataframe()
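# For reference (my addition): trees_to_dataframe() returns one row per node
# with columns Tree, Node, ID, Feature, Split, Yes, No, Missing, Gain and
# Cover, which makes the three depth-1 trees above easy to read.
df_trees = model.get_booster().trees_to_dataframe()
print(df_trees[['Tree', 'Feature', 'Split', 'Gain']])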
class MetaRecommender(BaseRecommender): """Penn AI meta recommender. Recommends machine learning algorithms and parameters as follows: maintains an internal model of the form f_d(ML,P,MF) = E where d is the dataset ML is the machine learning P is the ML parameters MF is the metafeatures associated with d to produce recommendations for dataset d, it does the following: E_a = f_d(ML_a,P_a,MF_d) prediction of performance of a on d Sort E_a for several a (sampled from ML+P options) recommend top E_a Parameters ---------- ml_type: str, 'classifier' or 'regressor' Recommending classifiers or regressors. Used to determine ML options. metric: str (default: accuracy for classifiers, mse for regressors) The metric by which to assess performance on the datasets. ml_p: Dataframe Contains all the machine learning / algorithm combinations available for recommendation. sample_size: int Number of ML/P combos to evaluate when making a recommendation. """ def __init__(self, ml_type='classifier', metric=None, ml_p=None, sample_size=100): """Initialize recommendation system.""" if ml_type not in ['classifier', 'regressor']: raise ValueError('ml_type must be "classifier" or "regressor"') self.ml_type = ml_type if metric is None: self.metric = 'bal_accuracy' if self.ml_type == 'classifier' else 'mse' else: self.metric = metric # training data self.training_features = None # store metafeatures of datasets that have been seen # self.dataset_metafeatures = None # maintain a set of dataset-algorithm-parameter combinations that have already been # evaluated self.trained_dataset_models = set() # TODO: add option for ML estimator self.first_update = True # load ML Parameter combinations and fit an encoding to them that can be used for # learning a model : score = f(ml,p,dataset,metafeatures) self.ml_p = ml_p if self.ml_p is not None: self.ml_p = self.params_to_features(self.ml_p, init=True) self.ml_p = self.ml_p.drop_duplicates( ) # just in case duplicates are present # print('ml_p:',self.ml_p) self.cat_params = [ 'criterion', 'kernel', 'loss', 'max_depth', 'max_features', 'min_weight_fraction_leaf', 'n_estimators', 'n_neighbors', 'weights' ] self.sample_size = min(sample_size, len(self.ml_p)) # Encoding the variables self.LE = defaultdict(LabelEncoder) # self.OHE = OneHotEncoder(sparse=False) # pdb.set_trace() self.ml_p = self.ml_p.apply(lambda x: self.LE[x.name].fit_transform(x)) # print('ml_p after LE:',self.ml_p) # self.X_ml_p = self.OHE.fit_transform(self.ml_p.values) self.X_ml_p = self.ml_p.values # self.ml_p = self.ml_p.apply(lambda x: self.OHE[x.name].fit_transform(x)) # print('X after OHE:',self.X_ml_p.shape) # print('self.ml_p:',self.ml_p) print('loaded {nalg} ml/parameter combinations with ' '{nparams} parameters'.format(nalg=self.X_ml_p.shape[0], nparams=self.X_ml_p.shape[1] - 1)) # our ML self.ml = XGBRegressor(max_depth=6, n_estimators=500) def params_to_features(self, df, init=False): """convert parameter dictionaries to dataframe columns""" # pdb.set_trace() try: param = df['parameters'].apply(eval) param = pd.DataFrame.from_records(list(param)) param = param.applymap(str) # get rid of trailing .0 added to integer vals param = param.applymap(lambda x: x[:-2] if x[-2:] == '.0' else x) param = param.reset_index(drop=True) # print('param:',param) df = df.drop('parameters', axis=1).reset_index(drop=True) df = pd.concat([df, param], axis=1) if not init: # need to add additional parameter combos for other ml df_tmp = pd.DataFrame(columns=self.ml_p.columns) df_tmp = df_tmp.append(df) df_tmp.fillna('nan', 
inplace=True) df = df_tmp # sort columns by name df.sort_index(axis=1, inplace=True) # print('df:',df) except Exception as e: print(e) pdb.set_trace() return df def features_to_params(self, df): """convert dataframe columns to parameter dictionaries""" param = df.to_dict('index') plist = [] for k, v in param.items(): tmp = {k1: v1 for k1, v1 in v.items() if v1 != 'nan'} for k1, v1 in tmp.items(): try: tmp[k1] = int(v1) except: try: tmp[k1] = float(v1) except: pass pass plist.append(str(tmp)) return plist def update(self, results_data, results_mf): """Update ML / Parameter recommendations based on overall performance in results_data. Updates self.scores Parameters ---------- results_data: DataFrame with columns corresponding to: 'dataset' 'algorithm' 'parameters' self.metric """ # keep track of unique dataset / parameter / classifier combos in results_data dap = (results_data['dataset'].values + '|' + results_data['algorithm'].values + '|' + results_data['parameters'].values) d_ml_p = np.unique(dap) self.trained_dataset_models.update(d_ml_p) # transform data for learning a model from it self.setup_training_data(results_data, results_mf) # update internal model self.update_model() def transform_ml_p(self, df_ml_p): """Encodes categorical labels and transforms them using a one hot encoding.""" df_ml_p = self.params_to_features(df_ml_p) # df_tmp = pd.DataFrame(columns=self.ml_p.columns) # df_tmp = df_tmp.append(df_ml_p) # df_tmp.fillna('nan', inplace=True) df_ml_p = df_ml_p.apply(lambda x: self.LE[x.name].transform(x)) # df_ml_p = df_ml_p.apply(lambda x: self.LE[x.name].transform(x)) # print('df_ml_p after LE transform:',df_ml_p) # X_ml_p = self.OHE.transform(df_ml_p.values) X_ml_p = df_ml_p.values # X_ml_p = self.OHE.transform(df_ml_p.values) # print('df_ml_p after OHE (',X_ml_p.shape,':\n',X_ml_p) return X_ml_p def setup_training_data(self, results_data, results_mf): """Transforms metafeatures and results data into learnable format.""" # join df_mf to results_data to get mf rows for each result df_mf = pd.merge(results_data, results_mf, on='dataset', how='inner') df_mf = df_mf.loc[:, df_mf.columns.isin(results_mf.columns)] if 'dataset' in df_mf.columns: df_mf = df_mf.drop('dataset', axis=1) # print('df_mf:',df_mf) # print('dataset_metafeatures:',dataset_metafeatures) # transform algorithms and parameters to one hot encoding df_ml_p = results_data.loc[:, results_data.columns. isin(['algorithm', 'parameters'])] X_ml_p = self.transform_ml_p(df_ml_p) print('df_ml_p shape:', df_ml_p.shape) # join algorithm/parameters with dataset metafeatures print('df_mf shape:', df_mf.shape) self.training_features = np.hstack((X_ml_p, df_mf.values)) # transform data using label encoder and one hot encoder self.training_y = results_data[self.metric].values assert (len(self.training_y) == len(self.training_features)) def recommend(self, dataset_id=None, n_recs=1, dataset_mf=None): """Return a model and parameter values expected to do best on dataset. Parameters ---------- dataset_id: string ID of the dataset for which the recommender is generating recommendations. n_recs: int (default: 1), optional Return a list of length n_recs in order of estimators and parameters expected to do best. 
""" # TODO: predict scores over many variations of ML+P and pick the best # return ML+P for best average y try: ml_rec, p_rec, rec_score = self.best_model_prediction( dataset_id, n_recs, dataset_mf) for (m, p, r) in zip(ml_rec, p_rec, rec_score): print('ml_rec:', m, 'p_rec', p, 'rec_score', r) ml_rec, p_rec, rec_score = ml_rec[: n_recs], p_rec[: n_recs], rec_score[: n_recs] # # if a dataset is specified, do not make recommendations for # # algorithm-parameter combos that have already been run # if dataset_id is not None: # rec = [r for r in rec if dataset_id + '|' + r not in # self.trained_dataset_models] # ml_rec = [r.split('|')[0] for r in rec] # p_rec = [r.split('|')[1] for r in rec] # rec_score = [self.scores[r] for r in rec] except Exception as e: print('error running self.best_model_prediction for', dataset_id) # print('ml_rec:', ml_rec) # print('p_rec', p_rec) # print('rec_score',rec_score) raise e # update the recommender's memory with the new algorithm-parameter combos that it recommended # ml_rec = ml_rec[:n_recs] # p_rec = p_rec[:n_recs] # rec_score = rec_score[:n_recs] # if dataset_id is not None: # self.trained_dataset_models.update( # ['|'.join([dataset_id, ml, p]) # for ml, p in zip(ml_rec, p_rec)]) return ml_rec, p_rec, rec_score def update_model(self): """Trains model on datasets and metafeatures.""" print('updating model') current_model = None if self.ml._Booster is None else self.ml.get_booster( ) self.ml.fit(self.training_features, self.training_y, xgb_model=current_model) print('model updated') def best_model_prediction(self, dataset_id, n_recs=1, df_mf=None): """Predict scores over many variations of ML+P and pick the best""" # get dataset metafeatures # df_mf = self.get_metafeatures(dataset_id) mf = df_mf.drop('dataset', axis=1).values.flatten() # setup input data by sampling ml+p combinations from all possible combos # choices = np.random.choice(len(self.X_ml_p),size=self.sample_size,replace=False) X_ml_p = self.X_ml_p[np.random.choice(len(self.X_ml_p), size=self.sample_size, replace=False)] print('generating predictions for:') df_tmp = pd.DataFrame(X_ml_p, columns=self.ml_p.columns) print(df_tmp.apply(lambda x: self.LE[x.name].inverse_transform(x))) # make prediction data consisting of ml + p combinations plus metafeatures predict_features = np.array([np.hstack((ml_p, mf)) for ml_p in X_ml_p]) # print('predict_features:',predict_features) # generate predicted scores predict_scores = self.ml.predict(predict_features) # print('predict_scores:',predict_scores) # grab best scores predict_idx = np.argsort(predict_scores)[::-1][:n_recs] # print('predict_idx:',predict_idx) # indices in X_ml_p that match best prediction scores predict_ml_p = X_ml_p[predict_idx] pred_ml_p_df = df_tmp.loc[predict_idx, :] # print('df_tmp[predict_idx]:',pred_ml_p_df) # invert the one hot encoding # fi = self.OHE.feature_indices_ # predict_ml_p_le = [x[fi[i]:fi[i+1]].dot(np.arange(nv)) for i,nv in # enumerate(self.OHE.n_values_) # for x in predict_ml_p] predict_ml_p_le = predict_ml_p # df_pr_ml_p = pd.DataFrame( # data=np.array(predict_ml_p_le).reshape(-1,len(self.ml_p.columns)), # columns = self.ml_p.columns, dtype=np.int64) # # invert the label encoding df_pr_ml_p = df_tmp.loc[predict_idx, :] df_pr_ml_p = df_pr_ml_p.apply( lambda x: self.LE[x.name].inverse_transform(x)) # predict_ml_p = df_pr_ml_p.values # grab recommendations ml_recs = list(df_pr_ml_p['algorithm'].values) p_recs = self.features_to_params(df_pr_ml_p.drop('algorithm', axis=1)) scores = predict_scores[predict_idx] # 
        # pdb.set_trace()
        return ml_recs, p_recs, scores
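# Hedged usage sketch (my addition; the ml_p frame and results columns follow
# the class docstring, all other names are illustrative):
#
#     rec = MetaRecommender(ml_type='regressor', metric='mse', ml_p=ml_p_df)
#     rec.update(results_df, metafeatures_df)
#     ml, params, scores = rec.recommend(dataset_id='d1', n_recs=5,
#                                        dataset_mf=mf_row)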
from eli5 import explain_weights, explain_prediction, formatters
from xgboost import XGBRegressor

# df_train = df_train[(z < 10).all(axis=1)]
# print(df_train.shape)
label = df_train['NU_NOTA_MT']
df_train.drop(['NU_NOTA_MT'], axis=1, inplace=True)
label_y = df_test.pop('NU_NOTA_MT')

model = XGBRegressor()
model.fit(df_train, label)
booster = model.get_booster()

model_explainer = explain_weights(model, top=None)
exp_df = formatters.format_as_dataframe(model_explainer)
exp_df.to_csv("model_explainer.csv", index=False)

for i in range(10):
    # , feature_names=booster.feature_names)
    individual_explainer = explain_prediction(model, df_test.iloc[i])
    df_i = formatters.format_as_dataframe(individual_explainer)
    name = df_answer['NU_INSCRICAO'].iloc[i]
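# Hedged continuation sketch (my addition): df_i holds one eli5 explanation
# per test row, so a natural next step is writing each one out keyed by the
# NU_INSCRICAO id collected above.
#
#     df_i.to_csv('explanation_{}.csv'.format(name), index=False)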
class Modelo: ''' Clase que sirve para preprocesar ligeramente los datos a emplear, principalmente a través de métodos de escalado, concretamente estandarización y normalización, y para la construcción de modelos de clasificación y regresión basados en los algoritmos de las bibliotecas Scikit-learn, Tensorflow, XGBoost y LightGBM. También permite evaluar dichos modelos a través de una serie de métricas provenientes de la librería Scikit-learn y, para algunos algoritmos concretos, permite la visualización, bien del proceso, bien de la importancia de las características empleadas. Concretamente, los algoritmos que se emplearán en esta clase serán: Algoritmos de Clasificación: ----------------------------------------------------------------------- (Todos ellos permiten visualizar matrices de confusión.) -> Regresión Logística: Sklearn (Visualización de características) -> SVC: Sklearn (No permite visualización de características) -> K-vecinos: Sklearn (No permite visualización de características) -> Bosques Aleatorios: Sklearn (Visualización de características) -> Compilación: Sklearn (No permite visualización de características) -> Redes Neuronales: Tensorflow (Visualización de función pérdida) -> Clasificador XGB: XGBoost (Visualización de características) -> Clasificador LightGBM: LightGBM (Visualización de características) ----------------------------------------------------------------------- Algoritmos de Regresión: ----------------------------------------------------------------------- -> Regresión lineal: Sklearn (Visualización de características) -> K-vecinos: Sklearn (No permite visualización de características) -> Regresor de Gradient Boosting: Sklearn (Visualización de características) -> Bosques Aleatorios: Sklearn (Visualización de características) -> Redes Neuronales: Tensorflow (Visualización de función de pérdida) -> Regresor XGB: XGBoost (Visualización de características) -> Regresor LightGBM: LightGBM (Visualización de características) ----------------------------------------------------------------------- En cuanto a las métricas, podrán emplearse las siguientes: Para evaluar modelos de clasificación: ----------------------------------------------------------------------- -> Matriz de confusión -> Reporte de Clasificación (que incluye las principales métricas para cada clase) -> Balance de la clasificación: Puntuación de 0 a 1 del modelo, definida como: especifidad + sensibilidad balance = -------------------------- 2 ----------------------------------------------------------------------- Para evaluar modelos de regresión: ----------------------------------------------------------------------- -> Error absoluto medio -> Error cuadrático medio -> Varianza explicada: Puntuación de 0 a 1 definida como: Var{y-y_pred} EVS = 1 - ------------- Var{y} ----------------------------------------------------------------------- Parámetros: df (pandas.DataFrame): Dataframe con los datos. tipo (str): Distinción entre clasificador y regresor. Return: Clase Modelo. ''' def __init__(self, df, tipo='Clasificador'): self.__data = df self.tipo = tipo if tipo == 'Clasificador': self.y = self.__data['Ganador'] self.X = self.__data.drop(['Ganador', 'Diferencia'], axis=1) else: self.y = self.__data['Diferencia'] self.X = self.__data.drop(['Ganador', 'Diferencia'], axis=1) self.__columns = self.X.columns ##### Preprocesado de datos ##### def estandarizar(self): ''' Función para estandarizar el conjunto de datos. 
Estandarización significa reescalar los datos para que tengan media de cero y desviación estándar de uno. ''' self.Scaler = StandardScaler() self.X = self.Scaler.fit_transform(self.X) self.X = pd.DataFrame(data=self.X, columns=self.__columns) def normalizar(self): ''' Función para normalizar el conjunto de datos. Normalización significa reescalar los datos para que se encuentren entre cero y uno. ''' self.norma = MinMaxScaler() self.X = self.norma.fit_transform(self.X) self.X = pd.DataFrame(data=self.X, columns=self.__columns) def Split(self, size=0.2): ''' Función para separar los datos en conjuntos de entrenamiento y prueba ''' self.X_train, self.X_test, self.y_train, self.y_test = \ train_test_split(self.X, self.y, test_size = size, random_state = 42) def retorno(self): ''' Función para invertir el escalado de los datos de prueba. ''' try: if self.Scaler: self.X_test = self.Scaler.inverse_transform(self.X_test) self.X_test = pd.DataFrame(data=self.X_test, columns=self.__columns) except NameError: if self.norma: self.X_test = self.norma.inverse_transform(self.X_test) self.X_test = pd.DataFrame(data=self.X_test, columns=self.__columns) except: print('No se han empleado métodos de reescalado.') ##### Modelos de Clasificación ##### def NN_Clas_model(self, neuronas=[512, 512, 256, 256, 128], dropouts=[0.4, 0.4, 0.3, 0.3], epochs=150, split=0.2, size=11640): ''' Función para construir un modelo de clasificación basado en una red neuronal, mediante el módulo keras de tensorflow. Se establece, por resutados anteriores de prueba y error un número de 5 capas. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. neuronas (int): Lista con el número de neuronas por capa, excepto la última, que solo tiene una. dropouts (int): Lista con la tasa de neuronas en drop-out por capa. epoch (int): Número de veces que la red recorre el dataset. split (int): Tasa de valores del conjunto de entrenamiento empleados como validación. size (int): valor del batch_size. Return: self.model (modelo): Modelo de la red neuronal. self.history (modelo): Modelo entrenado. ''' self.model_type = 'NN' tf.keras.backend.clear_session() self.model = models.Sequential( [ layers.Dense(units=neuronas[0], input_dim=self.X_train.shape[1], activation='relu'), layers.LeakyReLU(), layers.BatchNormalization(), layers.Dropout(dropouts[0]), layers.Dense(units=neuronas[1], activation='relu'), layers.LeakyReLU(), layers.BatchNormalization(), layers.Dropout(dropouts[1]), layers.Dense(units=neuronas[2], activation='relu'), layers.LeakyReLU(), layers.BatchNormalization(), layers.Dropout(dropouts[2]), layers.Dense(units=neuronas[3], activation='relu'), layers.LeakyReLU(), layers.BatchNormalization(), layers.Dropout(dropouts[3]), layers.Dense(units=neuronas[4], activation='relu'), layers.LeakyReLU(), layers.Dense(units=1, activation="sigmoid"), ], name="Modelo de Clasificación con Redes Neuronales", ) self.model.compile(optimizer=optimizers.Adam(), loss=losses.binary_crossentropy, metrics=[metrics.binary_accuracy]) self.history = self.model.fit(self.X_train, self.y_train.tolist(), epochs=epochs, batch_size=size, validation_split=split, verbose=1) def RFC(self, max_depth=50, n_estimators=150): ''' Función para construir un modelo de clasificación basado en el algoritmo RandomForestClassifier. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. max_depth (int): Profundidad máxima de los árboles. 
n_estimators (int): Número de árboles. Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'rfc' self.model = RandomForestClassifier(max_depth = max_depth, n_estimators =\ n_estimators) self.model.fit(self.X_train, self.y_train) def SVClass(self, C=16): ''' Función para construir un modelo a partir de los algorimos de clasificación de maquinas de soporte de vectores. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. C (int): Parámetro de regularización. Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'svc' self.model = SVC(C=C) self.model.fit(self.X_train, self.y_train) def LogReg(self, C=5): ''' Función para construir un modelo a partir del algoritmo de regresión logística de sklearn. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. C (int): Parámetro de regularización. Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'logreg' self.model = LogisticRegression(C=C) self.model.fit(self.X_train, self.y_train) def KNN(self, n_neighbors=5): ''' Función para construir un modelo a partir del algoritmo de K-vecinos de sklearn. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. n_neighboors (int): Número de vecinos. Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'knn' self.model = KNeighborsClassifier(n_neighbors=n_neighbors) self.model.fit(self.X_train, self.y_train) def StackModel(self): ''' Función para construir un modelo a partir de los algoritmos SVClassifier, RandomForestClassifier y Regresión logística, mediante la función Stacking Classifier de sklearn. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'stack' estimators = estimators = [('svc', SVC()), ('rf', RandomForestClassifier(n_estimators=100, max_depth=50))] self.model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()) self.model.fit(self.X_train, self.y_train) ##### Modelo de Regresión ##### def NN_Reg_model(self, neuronas=[1024, 512, 512, 256, 256, 128], epochs=250, dropouts=[0.4, 0.3, 0.3, 0.2, 0.2], split=0.2, size=11640, lr=0.02, decay=6e-4): ''' Función para construir un modelo de clasificación basado en una red neuronal, mediante el módulo keras de tensorflow. Se establece, por resutados anteriores de prueba y error un número de 5 capas. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. neuronas (int): Lista con el número de neuronas por capa, excepto la última, que solo tiene una. dropouts (float): Lista con la tasa de neuronas en drop-out por capa. epoch (int): Número de veces que la red recorre el dataset. split (float): Tasa de valores del conjunto de entrenamiento empleados como validación. size (int): valor del batch_size. lr (float): Valor del learning rate. decay (float): Valor del decaimiento del ratio de aprendizaje. Return: self.model (modelo): Modelo de la red neuronal. self.history (modelo): Modelo entrenado. 
''' self.model_type = 'NN' tf.keras.backend.clear_session() self.model = models.Sequential( [ layers.Dense(units=neuronas[0], input_dim=self.X_train.shape[1]), layers.LeakyReLU(), layers.BatchNormalization(), layers.Dropout(dropouts[0]), layers.Dense(units=neuronas[1]), layers.LeakyReLU(), layers.BatchNormalization(), layers.Dropout(dropouts[1]), layers.Dense(units=neuronas[2]), layers.LeakyReLU(), layers.BatchNormalization(), layers.Dropout(dropouts[2]), layers.Dense(units=neuronas[3]), layers.LeakyReLU(), layers.BatchNormalization(), layers.Dropout(dropouts[3]), layers.Dense(units=neuronas[4]), layers.LeakyReLU(), layers.BatchNormalization(), layers.Dropout(dropouts[4]), layers.Dense(units=neuronas[5]), layers.LeakyReLU(), layers.Dense(units=1, activation="linear"), ], name="Modelo de Regresión con Redes Neuronales", ) self.model.compile(optimizer=optimizers.Adam(lr=lr, decay=decay), loss=losses.mae, metrics=[metrics.mse]) self.history = self.model.fit(self.X_train, self.y_train.tolist(), epochs=epochs, batch_size=size, validation_split=split, verbose=1) def LinReg(self): ''' Función para construir un modelo de regresión basado en regresión lineal. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'linreg' self.model = LinearRegression() self.model.fit(self.X_train, self.y_train) def GradBoost(self, n_estimators=200, max_depth=10, learning_rate=0.3): ''' Función para construir un modelo de regresión basado en Gradient Boosting. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'gradboost' self.model = GradientBoostingRegressor(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators) self.model.fit(self.X_train, self.y_train) def RFR(self, n_estimators=150, max_depth=20): ''' Función para construir un modelo de regresión basado en Bosques aleatorios. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. n_estimators (int): Número de árboles. max_depth (int): Profundidad máxima de los árboles Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'rfr' self.model = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators) self.model.fit(self.X_train, self.y_train) def KNNR(self, n_neigbors=12): ''' Función para construir un modelo de regresión basado en K-vecinos. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. n_neigbors (int): Número de vecinos a tener en cuenta. Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'knn' self.model = KNeighborsRegressor(n_neighbors=n_neigbors) self.model.fit(self.X_train, self.y_train) ##### LigthGBM ##### def LGBModel(self, learning_rate=0.5, max_depth=15, n_estimators=100, epoch=100): ''' Función para construir un modelo basado en los algoritmos de LigthGBM, tanto clasificación como de regresión. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. learning_rate (int): Ratio de aprendizaje max_depth (int): Profundidad máxima de los árboles. n_estimators (int): Número de estimadores. epoch (int): Número de veces que se recorre el dataset. Return: self.model (modelo): Modelo de lgb. 
''' self.model_type = 'lgb' d_train = lgb.Dataset(self.X_train, label=self.y_train) params = {} params['learning_rate'] = learning_rate params['boosting_type'] = 'gbdt' params['max_depth'] = max_depth params['use_missing'] = False if self.tipo == 'Clasificador': params['objective'] = 'binary' params['metric'] = 'binary_logloss' else: params['objective'] = 'regression' params['n_estimators'] = n_estimators self.model = lgb.train(params, d_train, epoch) ##### XGBoost ##### def XGBmodel(self, learning_rate=0.5, max_depth=10, n_estimators=100): ''' Función para construir modelos de predicción basados en la librería XGBoost. Parámetros: self.X_train (array): Array del conjunto de entrenamiento. self.y_train (array): Array de la clasificación real. learning_rate (int): Ratio de aprendizaje max_depth (int): Profundidad máxima de los árboles. n_estimators (int): Número de árboles. Return: self.model (modelo): Modelo entrenado. ''' self.model_type = 'XGB' if self.tipo == 'Clasificador': self.model = XGBClassifier(learning_rate=learning_rate, max_depth=max_depth) self.model.fit(self.X_train, self.y_train) else: self.model = XGBRegressor(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators) self.model.fit(self.X_train, self.y_train) ##### Predicción ##### def pred_class(self): ''' Función para la predicción de clases para los modelos de clasificación, y su regularización respecto a los valores reales. Parámetros: self.X_test (array): Conjunto de prueba. self.model (modelo): Modelo entrenado. self.model_type (str): Tipo de modelo empleado. Return: self.y_pred (array): Predicciones del modelo. ''' if self.model_type == 'NN': self.y_pred = self.model.predict_classes(self.X_test).reshape(-1) elif self.model_type == 'lgb': self.y_pred = self.model.predict(self.X_test).round(0) else: self.y_pred = self.model.predict(self.X_test) def pred(self): ''' Función para la predicción en los modelos de regresión, y su regularización respecto a los valores reales. Parámetros: self.X_test (array): Conjunto de prueba. self.model (modelo): Modelo entrenado. self.model_type (str): Tipo de modelo empleado. Return: self.y_pred (array): Predicciones del modelo. ''' self.y_pred = self.model.predict(self.X_test) ##### Visualización ##### def Graficar_Perdida(self): ''' Función para graficar la función perdida del set de entrenamiento y el de validación durante el proceso de entrenamiento de una red neuronal. Parámetros: self.model (model): Modelo entrenado. Return: fig (Figure): Gráfica. ''' trace0 = go.Scatter(y=self.history.history['loss'], x=self.history.epoch, mode='lines', marker=dict(color="blue", size=5, opacity=0.5), name="Training Loss") trace1 = go.Scatter(y=self.history.history['val_loss'], x=self.history.epoch, mode='lines', marker=dict(color="red", size=5, opacity=0.5), name="Validation Loss") data = [trace0, trace1] fig = go.Figure(data=data, layout=go.Layout(title="Curva de aprendizaje", yaxis=dict(title="Pérdida"), xaxis=dict(title="Epoch"), legend=dict(yanchor='top', xanchor='center'))) return fig def Feature_importance(self, importance_type='gain', color='green'): ''' Función para graficar la importancia de las características de un modelo. No vale para redes neuronales. Parámetros: self.model (model): Modelo entrenado. importance_type (str): Tipo de importancia a graficar. color (str): Color de representación de la gráfica. figsize (int): Tupla con las dimensiones deseadas de la figura. Return: fig (Figure): figura. 
''' if self.model_type == 'lgb': #Valores de las características del modelo lgb. valores = dict(zip(self.X_train.columns, self.model.feature_importance(importance_type = \ importance_type))) elif self.model_type == 'XGB': #Valores de las características del modelo de XGBoost. valores = self.model.get_booster().get_score( importance_type=importance_type) elif self.model_type == 'logreg': #Valores de las características del modelo de regresión logística valores = dict(zip(self.X_train.columns, self.model.coef_[0])) elif self.model_type == 'linreg': #Valores de las características del modelo de regresión lineal valores = dict(zip(self.X_train.columns, self.model.coef_)) else: #Valores de las características de modelos basados en algoritmos de sklearn. valores = dict( zip(self.X_train.columns, self.model.feature_importances_)) #Ordenar los valores de mayor a menor. sorted_tuples = sorted(valores.items(), key=lambda item: item[1]) valores = {k: v for k, v in sorted_tuples} fig = go.Figure( go.Bar(x=list(valores.values()), y=list(valores.keys()), orientation='h')) fig.update_traces(marker_color=color, marker_line_color='black', marker_line_width=1.5, opacity=0.8) fig.update_layout(xaxis_title='Feature importance', yaxis_title='Feature', title='Importancia de las características', width=900, height=850) return fig def Plot_conf_mat(self, colorscale='Jet'): ''' Función para graficar la matriz de confusión como un mapa de calor. Parámetros: self.y_test (array): Array con las observaciones reales. self.y_pred (array): Array con las predicciones. colorscale (str): Escala de color. Return: fig (Figure): Figura. ''' z = confusion_matrix(self.y_test, self.y_pred) x = ['Izquierda', 'Derecha'] y = ['Izquierda', 'Derecha'] z_text = [[str(y) for y in x] for x in z] fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale=colorscale) fig.update_layout(title_text='<i><b>Matriz de confusión</b></i>', xaxis_title='Predicción', yaxis_title='Valor Real') fig.update_layout(margin=dict(t=50, l=200)) fig['data'][0]['showscale'] = True return fig ##### Métricas ##### def conf_mat(self): ''' Función para calcular la especifidad y sensibilidad de un modelo de clasificación a partir de la función confusion_matrix de sklearn. Parámetros: self.y_test (array): Array con los resultados reales de la clasificación. self.y_pred (array): Array con los resultados del modelo. Return: conf_mat (array): Lista con los valores de la matriz de confusión, con orden [TN, FP, FN, TN] ''' return print(confusion_matrix(self.y_test, self.y_pred)) def class_report(self): ''' Función para recibir el reporte del modelo de clasificación. Parámetros: self.y_test (array): Array con los resultados reales de la clasificación. self.y_pred (array): Array con los resultados del modelo. Return: classification_report ''' return print(classification_report(self.y_test, self.y_pred)) def balance(self): ''' Función para recibir el balance de precisión del modelo de clasificación. Parámetros: self.y_test (array): Array con los resultados reales de la clasificación. self.y_pred (array): Array con los resultados del modelo. Return: balanced_accuracy_score ''' return print(balanced_accuracy_score(self.y_test, self.y_pred)) def mae(self): ''' Función para recibir el error absoluto medio del modelo de regresión. Parámetros: self.y_test (array): Array con los resultados reales de la clasificación. self.y_pred (array): Array con los resultados del modelo. 
Return: mean_absolute_error ''' return print(mean_absolute_error(self.y_test, self.y_pred)) def mse(self): ''' Función para recibir el error cuadrático medio del modelo de regresión. Parámetros: self.y_test (array): Array con los resultados reales de la clasificación. self.y_pred (array): Array con los resultados del modelo. Return: mean_absolute_error ''' return print(mean_squared_error(self.y_test, self.y_pred)) def explain_variance(self): ''' Función para recibir la explained variance score medio del modelo de regresión. Parámetros: self.y_test (array): Array con los resultados reales de la clasificación. self.y_pred (array): Array con los resultados del modelo. Return: explained_variance_score ''' return print(explained_variance_score(self.y_test, self.y_pred))
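# Hedged usage sketch (my addition; a df with the 'Ganador' and 'Diferencia'
# columns described in the class docstring is assumed):
#
#     m = Modelo(df, tipo='Regresor')
#     m.estandarizar()
#     m.Split(size=0.2)
#     m.XGBmodel(learning_rate=0.1, max_depth=6, n_estimators=200)
#     m.pred()
#     m.mae(); m.mse(); m.explain_variance()
#     fig = m.Feature_importance(importance_type='gain')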
from datetime import datetime
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from tqdm import tqdm
from xgboost import XGBRegressor


def get_xgboost_fe(train, targets, test, sub, xgb_params,
                   importance_type='weight', NFOLDS=7, verbosity=0):
    """
    :param train:
    :param targets:
    :param test:
    :param sub:
    :param xgb_params:
    :param importance_type: (default: 'weight') one of
        ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
    :param NFOLDS:
    :param verbosity:
    :return: dict of feature -> accumulated importance, sorted descending
    """
    # Note: `metric` (the fold loss) is assumed to be defined at module level.
    train = train.iloc[:, 1:]
    test = test.iloc[:, 1:]
    train_score = targets.iloc[:, 1:]
    sample = sub
    cols = train_score.columns
    submission = sample.copy()
    submission.loc[:, train_score.columns] = 0
    # test_preds = np.zeros((test.shape[0], train_score.shape[1]))
    oof_loss = 0
    start_time = datetime.now()

    fe_dict = {column: 0.0 for column in train.columns.values}

    for c, column in enumerate(tqdm(cols, 'models_one_cols'), 1):
        y = train_score[column]
        total_loss = 0
        # cv = KFold(n_splits=NFOLDS, shuffle=True).split(train)
        # shuffle=True is required for random_state to take effect.
        CV = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True,
                                       random_state=42).split(X=train, y=targets)
        start_time_loc = datetime.now()
        for fn, (trn_idx, val_idx) in enumerate(CV):
            if verbosity > 1:
                print('\rFold: ', fn + 1, end='')
            X_train, X_val = train.iloc[trn_idx], train.iloc[val_idx]
            y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx]

            model = XGBRegressor(**xgb_params)
            model.fit(X_train, y_train)

            importance = model.get_booster().get_score(
                importance_type=importance_type).items()
            if len(importance) < 1:
                if verbosity:
                    print(f"[column {c} ({column}), CV {fn}] importance len < 1")
            else:
                for k, v in importance:
                    # print(f"{k}: {v}")
                    fe_dict[k] += v / len(cols)

            pred = model.predict(X_val)
            # pred = [n if n > 0 else 0 for n in pred]
            loss = metric(y_val, pred)
            total_loss += loss

            predictions = model.predict(test)
            # predictions = [n if n > 0 else 0 for n in predictions]
            submission[column] += predictions / NFOLDS
        stop_time_loc = datetime.now()
        # The per-fold predictions are already divided by NFOLDS above, so a
        # second division here would shrink the column NFOLDS-fold.
        oof_loss += total_loss / NFOLDS
        if verbosity > 1:
            print(f"\r[{stop_time_loc - start_time_loc}] Model " + str(c) +
                  ": Loss = " + str(total_loss / NFOLDS))
    stop_time = datetime.now()
    if verbosity:
        print(f"[{stop_time - start_time}] oof_loss/len(cols): "
              f"{oof_loss / len(cols)}")
    # submission.loc[test['cp_type'] == 1, train_score.columns] = 0
    return {k: v for k, v in
            sorted(fe_dict.items(), key=lambda kv: kv[1], reverse=True)}
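# Hedged usage sketch (my addition; the frames and params are illustrative):
#
#     params = {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05}
#     fe = get_xgboost_fe(train_df, targets_df, test_df, sub_df, params,
#                         importance_type='gain', NFOLDS=7, verbosity=1)
#     top_features = list(fe)[:50]  # the returned dict is already sorted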
from sklearn.datasets import load_iris, load_boston
from xgboost import XGBClassifier, XGBRegressor
import pandas as pd

boston = load_boston()
train = pd.DataFrame(boston['data'])
label = pd.Series(boston["target"], name='label')
full = pd.concat((train, label), axis=1)
print(full)

model = XGBRegressor(n_estimators=3, max_depth=1, reg_lambda=0, reg_alpha=0)
model.fit(train, label)
xgb_res = model.get_booster().trees_to_dataframe()
print(xgb_res)
# import pdb; pdb.set_trace()

# Regression task, squared-error loss.
full["g"] = full["label"] - 0  # why does xgboost subtract 0.5 here?
full["h"] = 1
root_score = full["g"].sum()**2 / full.shape[0]  # root node: 506 samples

left_df = full[full.iloc[:, 5] < 6.9410]
right_df = full[full.iloc[:, 5] >= 6.9410]
left_leaf = left_df["g"].sum() / left_df["h"].sum()     # left subtree: 430 samples
right_leaf = right_df["g"].sum() / right_df["h"].sum()  # right subtree: 76 samples
# left_df = full[full.iloc[:, 12] < 9.725]
# right_df = full[full.iloc[:, 12] >= 9.725]
left_score = left_df["g"].sum()**2 / left_df.shape[0]
right_score = right_df["g"].sum()**2 / right_df.shape[0]
print('The Gain for Root is left node score {} + right node score {} - '
      'root score {} = {},\nleft leaf: {}, right_leaf: {}'
      .format(left_score, right_score, root_score,
              left_score + right_score - root_score,
              left_leaf, right_leaf))
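# Hedged cross-check (my addition): compare the hand-computed split score with
# the Gain column trees_to_dataframe() reports for tree 0's root node; any gap
# traces back to base_score (the "- 0" vs. "- 0.5" question above).
root_row = xgb_res[(xgb_res['Tree'] == 0) & (xgb_res['Node'] == 0)]
print('xgboost-reported root gain:', root_row['Gain'].values[0])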
import math
import os
import pandas as pd
import sklearn.metrics
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


def train():
    data = pd.read_csv('DATA\\data.csv', header=None, sep=' ')
    score = pd.read_csv('DATA\\score.csv', header=None, sep=' ')
    param_test1 = {
        'max_depth': [i for i in range(20, 30)],
        'learning_rate': [0.05 * i for i in range(1, 10)],
        'min_child_weight': [0.25 * i for i in range(1, 10)],
        'subsample': [0.5 + 0.05 * i for i in range(10)],
        'gamma': [0.002 * i for i in range(50)],
        'colsample_bytree': [0.5 + 0.02 * i for i in range(25)]
    }
    rmse = float('inf')  # best RMSE so far; must exist before the comparison below
    for iters in range(1):
        x_train, x_test, y_train, y_test = train_test_split(
            data, score, test_size=0.2, random_state=0)
        print('Starting training...')
        count = 0
        for depth in param_test1['max_depth']:
            for learning_r in param_test1['learning_rate']:
                # for min_child_weigh in param_test1['min_child_weight']:
                for sub_sample in param_test1["subsample"]:
                    # for gammas in param_test1["gamma"]:
                    for colsample in param_test1['colsample_bytree']:
                        try:
                            xgb1 = XGBRegressor(
                                learning_rate=learning_r,
                                max_depth=depth,
                                # min_child_weight=min_child_weigh,
                                subsample=sub_sample,
                                colsample_bytree=colsample,
                                # gamma=gammas,
                                eval_metric='rmse',
                                nthread=4,
                                # scale_pos_weight=1,
                                n_estimators=1500)
                            xgb1.fit(x_train, y_train)
                            y_pred = xgb1.predict(x_test)
                            # predictions = [round(value, 2) for value in y_pred]
                            # Compute RMSE.
                            rmse_new = math.sqrt(
                                sklearn.metrics.mean_squared_error(
                                    y_test, y_pred))
                            print(rmse_new)
                            if rmse_new < rmse:
                                current_path = os.path.join(
                                    'result-xgboost', str(rmse_new))
                                os.mkdir(current_path)
                                print(rmse_new)
                                rmse = rmse_new
                                x_test.to_csv(
                                    os.path.join(current_path, 'test_feat.csv'),
                                    index=False, header=False)
                                y_test.to_csv(
                                    os.path.join(current_path, 'test_score.csv'),
                                    index=False, header=False)
                                xgb1.get_booster().save_model(
                                    os.path.join(current_path, 'xgb.model'))
                                print("saved")
                            count += 1
                        except Exception:
                            continue
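# Hedged alternative sketch (my addition): sklearn's ParameterGrid expresses
# the same exhaustive sweep without the hand-written nesting.
#
#     from sklearn.model_selection import ParameterGrid
#     grid = {k: param_test1[k] for k in ('max_depth', 'learning_rate',
#                                         'subsample', 'colsample_bytree')}
#     for params in ParameterGrid(grid):
#         xgb1 = XGBRegressor(n_estimators=1500, nthread=4,
#                             eval_metric='rmse', **params)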
ax = sns.scatterplot(x=y, y=y_predicted_xgb_cv, alpha=.3, color=colors[4])
ax.plot(np.arange(0, max(y)), np.arange(0, max(y)), c='grey')
ax.set_xlim(min(y), max(y))
ax.set_xlabel('True age')
ax.set_ylabel('Predicted age')
plt.title('Crossvalidated predictions (XGB)')
plt.show()

# FEATURE IMPORTANCES #
#######################
feature_importances = pd.Series(data=xgb.feature_importances_,
                                index=features.columns)
feature_importances.to_csv(join(hlp.DATA_DIR, 'feature_importances.csv'))

fig, ax = plt.subplots(figsize=(14, 10))
xgb.get_booster().feature_names = list(features.columns)
plot_importance(xgb, max_num_features=30, ax=ax, importance_type='gain')
plt.show()

feature_importances = feature_importances.sort_values(ascending=False)
proportions = []
for i in range(1, len(feature_importances)):
    temp = feature_importances[:i]
    total = len(temp)
    md = len([f for f in temp.items() if f[0].startswith('md')])
    proportions.append(md / i)

plt.plot(proportions, label='Proportion in top')
plt.title('Importance of theory-driven features')
plt.xlabel('Top n features')
plt.axhline(y=23 / features.shape[1], color='grey',
def __init__(self, model: XGBRegressor, feature_names: List[str]):
    # Note: model.base_score may be None on an unfitted XGBRegressor; the
    # variant earlier in this section falls back to 0.5 before delegating.
    super().__init__(model.get_booster(), feature_names, model.base_score,
                     model.objective)
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
# DataGenerator and SaveFile are project-local helpers assumed to be in scope.


def GBDT_main():
    '''GBDT main function.'''
    # Load data.
    # dataset = boston_1()
    dataset = DataGenerator()
    # x/y series for plotting.
    X, Y = [], []
    # Initialize MSE and the cross-validation fold counter.
    MSE, fold = 0, 1
    # Number of base learners.
    n_estimators = 1
    # Error threshold (three error-evaluation settings).
    Threshold = 70000000
    # KFold object for generating train / validation splits.
    # random_state has no effect when shuffle=False, and newer scikit-learn
    # versions reject the combination, so it is omitted.
    kf = model_selection.KFold(n_splits=5, shuffle=False)
    # Build the GBDT model.
    model = XGBRegressor(
        max_depth=7,                # maximum tree depth (tunable)
        learning_rate=0.1,          # learning rate (tunable)
        n_estimators=n_estimators,  # number of trees
        objective='reg:squarederror',  # loss; 'reg:linear' is the deprecated alias
        nthread=4,                  # number of threads
        gamma=0.1,                  # minimum loss reduction to split (tunable)
        min_child_weight=1,         # minimum leaf weight
        subsample=1.,               # row-sampling ratio per tree
        colsample_bytree=1.,        # feature-sampling ratio per tree
        reg_lambda=3,               # L2 regularization weight (tunable)
        scale_pos_weight=1.,        # for sample imbalance
        random_state=1000,          # random seed
    )
    while 1:
        # Final output: target - predicted target, same shape as target.
        fin_GBDT_error_target = None
        for train_data_index, cv_data_index in kf.split(dataset):
            # Index into the data.
            train_data, cv_data = dataset[train_data_index], dataset[cv_data_index]
            # Train.
            model.fit(X=train_data[:, :4], y=train_data[:, -1])
            # Predict on the validation set.
            pred_cv = model.predict(cv_data[:, :4])
            # Compute the error vector for every validation fold.
            fold_error = cv_data[:, -1] - pred_cv
            fin_GBDT_error_target = (
                fold_error if fin_GBDT_error_target is None
                else np.hstack((fin_GBDT_error_target, fold_error)))
            # Running mean of validation-set MSE.
            MSE = ((fold - 1) * MSE +
                   mean_squared_error(cv_data[:, -1], pred_cv)) / fold
            fold += 1
        print('Number of CART trees: %s, validation MSE: %s'
              % (model.n_estimators, MSE))
        X = [1] if X == [] else X + [X[-1] + 1]
        Y.append(MSE)
        if MSE < Threshold:
            break
        else:
            MSE, fold = 0, 1
            # If the validation MSE is above the threshold, grow the GBDT
            # by one weak learner.
            model.n_estimators += 1
    # print(fin_GBDT_error_target, fin_GBDT_error_target.shape)
    # print(X)
    ####################### experimental data; adjust as needed ##############
    data = np.hstack((dataset[:, 4:-1], fin_GBDT_error_target[:, np.newaxis]))
    ##########################################################################
    print(data.shape)
    SaveFile(data)
    # Save the model.
    model.get_booster().save_model('GBDT.model')
    # Plot the validation error against the number of learners.
    plt.plot(X, Y)
    # plot_importance(model)
    plt.show()
    # Visualize the model.
    digraph = xgb.to_graphviz(model, num_trees=4)
    digraph.format = 'png'
    digraph.view('./boston_xgb')
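# Hedged follow-up sketch (my addition): the booster saved above can be
# reloaded later without retraining.
#
#     bst = xgb.Booster(model_file='GBDT.model')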
def TA_screening(stock): #print(stocklist) #tic = time.perf_counter() index = [] start_date = datetime.datetime.now() - datetime.timedelta(days=59) end_date = datetime.datetime.now() df = pdr.get_data_yahoo(stock, start=start_date, end=end_date, interval="2m", prepost=True) #df = pdr.get_data_yahoo(stock, period = "max", interval = "1d", prepost = True) df.index = df.index.tz_localize(None) #print(df.size) '''#2 min ticker # 30 intervals = 1 hour # 195 intervals = trading day''' # < old # there are more intervals that we can use / change #1 interval = 1 day really_fast = 30 fast = 60 slow = 120 # these are the overlap studies def add_indicators(): upper_band, mid_band, lower_band = BBANDS(df['Adj Close'], timeperiod=really_fast, nbdevup=2, nbdevdn=2, matype=0) d_ema = DEMA(df['Adj Close'], timeperiod=really_fast) E_M_A = EMA(df['Adj Close'], timeperiod=fast) ht_trend = HT_TRENDLINE(df['Adj Close']) kama = KAMA(df['Adj Close'], timeperiod=fast) ma = MA(df['Adj Close'], timeperiod=fast, matype=0) #mama, fama = MAMA(df['Adj Close'], fastlimit=really_fast, slowlimit=slow) < this gave me issues? #mavp = MAVP(df['Adj Close']) mid = MIDPOINT(df['Adj Close'], timeperiod=fast) mid_price = MIDPRICE(df['High'], df['Low'], timeperiod=fast) sar = SAR(df['High'], df['Low'], acceleration=.02, maximum=.2) sarext = SAREXT(df['High'], df['Low'], startvalue=0, offsetonreverse=0, accelerationinitlong=.02, accelerationlong=.02, accelerationmaxlong=.2, accelerationinitshort=.02, accelerationshort=.02, accelerationmaxshort=.2) sma = SMA(df['Adj Close'], timeperiod=slow) tema = TEMA(df['Adj Close'], timeperiod=slow) trima = TRIMA(df['Adj Close'], timeperiod=slow) wma = WMA(df['Adj Close'], timeperiod=slow) #this is some of the beginning stuff O_B_V = OBV(df['Adj Close'], df['Volume']) A_D_O_S_C = ADOSC(df['High'], df['Low'], df['Adj Close'], df['Volume'], fastperiod=fast, slowperiod=slow) O_G_chaikin = AD(df['High'], df['Low'], df['Adj Close'], df['Volume']) HT_DCper = HT_DCPERIOD(df['Adj Close']) HT_DCphase = HT_DCPHASE(df['Adj Close']) inphase, quad = HT_PHASOR(df['Adj Close']) r_sin, leadsin = HT_SINE(df['Adj Close']) #volatility atr = ATR(df['High'], df['Low'], df['Adj Close'], timeperiod=slow) natr = NATR(df['High'], df['Low'], df['Adj Close'], timeperiod=slow) t_range = TRANGE(df['High'], df['Low'], df['Adj Close']) #below here are momentum ind adx = ADX(df['High'], df['Low'], df['Adj Close'], timeperiod=fast) adxr = ADXR(df['High'], df['Low'], df['Adj Close'], timeperiod=fast) apo = APO(df['Adj Close'], fastperiod=really_fast, slowperiod=fast, matype=0) aroon_d, aroon_u = AROON(df['High'], df['Low'], timeperiod=fast) aroon_osc = AROONOSC(df['High'], df['Low'], timeperiod=fast) bop = BOP(df['Open'], df['High'], df['Low'], df['Adj Close']) cci = CCI(df['High'], df['Low'], df['Adj Close'], timeperiod=fast) cmo = CMO(df['Adj Close'], timeperiod=fast) dx = DX(df['High'], df['Low'], df['Adj Close'], timeperiod=fast) macd, macdsig, macdhist = MACD(df['Adj Close'], fastperiod=fast, slowperiod=slow, signalperiod=really_fast) macdex, macdexsig, macdexhist = MACDEXT(df['Adj Close'], fastperiod=fast, fastmatype=0, slowperiod=slow, slowmatype=0, signalperiod=really_fast, signalmatype=0) macdfixd, macdfixdsig, macdfixdhist = MACDFIX(df['Adj Close'], signalperiod=really_fast) # more momo's mfi = MFI(df['High'], df['Low'], df['Adj Close'], df['Volume'], timeperiod=fast) min_di = MINUS_DI(df['High'], df['Low'], df['Adj Close'], timeperiod=fast) min_dm = MINUS_DM(df['High'], df['Low'], timeperiod=fast) momo = MOM(df['Adj 
Close'], timeperiod=really_fast) plus_di = PLUS_DI(df['High'], df['Low'], df['Adj Close'], timeperiod=fast) plus_dm = PLUS_DM(df['High'], df['Low'], timeperiod=fast) ppo = PPO(df['Adj Close'], fastperiod=really_fast, slowperiod=fast, matype=0) roc = ROC(df['Adj Close'], timeperiod=fast) rocp = ROCP(df['Adj Close'], timeperiod=fast) rocr = ROCR(df['Adj Close'], timeperiod=fast) rocr_hund = ROCR100(df['Adj Close'], timeperiod=fast) rsi_fastk, rsi_fastd = STOCHRSI(df['Adj Close'], timeperiod=fast, fastk_period=slow, fastd_period=really_fast, fastd_matype=0) trix = TRIX(df['Adj Close'], timeperiod=slow) ult_osc = ULTOSC(df['High'], df['Low'], df['Adj Close'], timeperiod1=really_fast, timeperiod2=fast, timeperiod3=slow) #old some of the first added R_S_I = RSI(df['Adj Close'], timeperiod=slow) slowk, slowd = STOCH(df['High'], df['Low'], df['Adj Close'], fastk_period=fast, slowk_period=slow, slowk_matype=0, slowd_period=slow, slowd_matype=0) fastk, fastd = STOCHF(df['High'], df['Low'], df['Adj Close'], fastk_period=fast, fastd_period=really_fast, fastd_matype=0) real = WILLR(df['High'], df['Low'], df['Adj Close'], timeperiod=slow) # below are the TA indicators two_crows = CDL2CROWS(df['Open'], df['High'], df['Low'], df['Adj Close']) three_crows = CDL3BLACKCROWS(df['Open'], df['High'], df['Low'], df['Adj Close']) three_inside = CDL3INSIDE(df['Open'], df['High'], df['Low'], df['Adj Close']) three_line = CDL3LINESTRIKE(df['Open'], df['High'], df['Low'], df['Adj Close']) three_out = CDL3OUTSIDE(df['Open'], df['High'], df['Low'], df['Adj Close']) three_stars = CDL3STARSINSOUTH(df['Open'], df['High'], df['Low'], df['Adj Close']) three_soldier = CDL3WHITESOLDIERS(df['Open'], df['High'], df['Low'], df['Adj Close']) baby = CDLABANDONEDBABY(df['Open'], df['High'], df['Low'], df['Adj Close'], penetration=0) adv = CDLADVANCEBLOCK(df['Open'], df['High'], df['Low'], df['Adj Close']) belt_hold = CDLBELTHOLD(df['Open'], df['High'], df['Low'], df['Adj Close']) breakaway = CDLBREAKAWAY(df['Open'], df['High'], df['Low'], df['Adj Close']) closingmara = CDLCLOSINGMARUBOZU(df['Open'], df['High'], df['Low'], df['Adj Close']) baby_swallow = CDLCONCEALBABYSWALL(df['Open'], df['High'], df['Low'], df['Adj Close']) #more TA counter = CDLCOUNTERATTACK(df['Open'], df['High'], df['Low'], df['Adj Close']) dark_cloud = CDLDARKCLOUDCOVER(df['Open'], df['High'], df['Low'], df['Adj Close'], penetration=0) doji = CDLDOJI(df['Open'], df['High'], df['Low'], df['Adj Close']) doji_star = CDLDOJISTAR(df['Open'], df['High'], df['Low'], df['Adj Close']) dragon_doji = CDLDRAGONFLYDOJI(df['Open'], df['High'], df['Low'], df['Adj Close']) engulf = CDLENGULFING(df['Open'], df['High'], df['Low'], df['Adj Close']) evening_star = CDLEVENINGSTAR(df['Open'], df['High'], df['Low'], df['Adj Close']) gapside = CDLGAPSIDESIDEWHITE(df['Open'], df['High'], df['Low'], df['Adj Close']) gravestone = CDLGRAVESTONEDOJI(df['Open'], df['High'], df['Low'], df['Adj Close']) hammer = CDLHAMMER(df['Open'], df['High'], df['Low'], df['Adj Close']) hang_man = CDLHANGINGMAN(df['Open'], df['High'], df['Low'], df['Adj Close']) harami = CDLHARAMI(df['Open'], df['High'], df['Low'], df['Adj Close']) harami_cross = CDLHARAMICROSS(df['Open'], df['High'], df['Low'], df['Adj Close']) #more TA high_wave = CDLHIGHWAVE(df['Open'], df['High'], df['Low'], df['Adj Close']) hikkake = CDLHIKKAKE(df['Open'], df['High'], df['Low'], df['Adj Close']) hikkake_mod = CDLHIKKAKEMOD(df['Open'], df['High'], df['Low'], df['Adj Close']) pidgeon = CDLHOMINGPIGEON(df['Open'], df['High'], df['Low'], 
df['Adj Close']) id_three_crows = CDLIDENTICAL3CROWS(df['Open'], df['High'], df['Low'], df['Adj Close']) in_neck = CDLINNECK(df['Open'], df['High'], df['Low'], df['Adj Close']) inv_hammer = CDLINVERTEDHAMMER(df['Open'], df['High'], df['Low'], df['Adj Close']) kicking = CDLKICKING(df['Open'], df['High'], df['Low'], df['Adj Close']) kicking_len = CDLKICKINGBYLENGTH(df['Open'], df['High'], df['Low'], df['Adj Close']) ladder_bot = CDLLADDERBOTTOM(df['Open'], df['High'], df['Low'], df['Adj Close']) doji_long = CDLLONGLEGGEDDOJI(df['Open'], df['High'], df['Low'], df['Adj Close']) long_line = CDLLONGLINE(df['Open'], df['High'], df['Low'], df['Adj Close']) marabozu = CDLMARUBOZU(df['Open'], df['High'], df['Low'], df['Adj Close']) #more TA match_glow = CDLMATCHINGLOW(df['Open'], df['High'], df['Low'], df['Adj Close']) mat_hold = CDLMATHOLD(df['Open'], df['High'], df['Low'], df['Adj Close'], penetration=0) morning_doji = CDLMORNINGDOJISTAR(df['Open'], df['High'], df['Low'], df['Adj Close'], penetration=0) morning_star = CDLMORNINGSTAR(df['Open'], df['High'], df['Low'], df['Adj Close'], penetration=0) on_neck = CDLONNECK(df['Open'], df['High'], df['Low'], df['Adj Close']) pierce = CDLPIERCING(df['Open'], df['High'], df['Low'], df['Adj Close']) rickshaw = CDLRICKSHAWMAN(df['Open'], df['High'], df['Low'], df['Adj Close']) rise_fall = CDLRISEFALL3METHODS(df['Open'], df['High'], df['Low'], df['Adj Close']) sep_line = CDLSEPARATINGLINES(df['Open'], df['High'], df['Low'], df['Adj Close']) shooting_star = CDLSHOOTINGSTAR(df['Open'], df['High'], df['Low'], df['Adj Close']) sl_candle = CDLSHORTLINE(df['Open'], df['High'], df['Low'], df['Adj Close']) spin_top = CDLSPINNINGTOP(df['Open'], df['High'], df['Low'], df['Adj Close']) stalled = CDLSTALLEDPATTERN(df['Open'], df['High'], df['Low'], df['Adj Close']) #more TA stick_sand = CDLSTICKSANDWICH(df['Open'], df['High'], df['Low'], df['Adj Close']) takuri = CDLTAKURI(df['Open'], df['High'], df['Low'], df['Adj Close']) tasuki_gap = CDLTASUKIGAP(df['Open'], df['High'], df['Low'], df['Adj Close']) thrust = CDLTHRUSTING(df['Open'], df['High'], df['Low'], df['Adj Close']) tristar = CDLTRISTAR(df['Open'], df['High'], df['Low'], df['Adj Close']) three_river = CDLUNIQUE3RIVER(df['Open'], df['High'], df['Low'], df['Adj Close']) ud_two_gap = CDLUPSIDEGAP2CROWS(df['Open'], df['High'], df['Low'], df['Adj Close']) down_three_gap = CDLXSIDEGAP3METHODS(df['Open'], df['High'], df['Low'], df['Adj Close']) #76 vars #are_all_zero = (test_TA == 0).all() #true if all values are 0 #false if contain a non 0''' df.drop(['Close'], axis=1, inplace=True) df['upper_band'] = upper_band df['lower_band'] = lower_band df['mid_band'] = mid_band df['d_ema'] = d_ema df['ht_trend'] = ht_trend df['kama'] = kama df['ma'] = ma #df['mama'] = mama df['mid'] = mid df['mid_price'] = mid_price df['sar'] = sar df['sarext'] = sarext df['sma'] = sma df['tema'] = tema df['trima'] = trima df['wma'] = wma #df['fama'] = fama df['EMA'] = E_M_A df['SlowK'] = slowk df['SlowD'] = slowd df['R_S_I'] = R_S_I df['FastK'] = fastk df['FastD'] = fastd df['WilliamsR'] = real df['atr'] = atr df['natr'] = natr df['t_range'] = t_range #df['na_tr'] = natr df['OBV'] = O_B_V df['ADOSC'] = A_D_O_S_C df['ogchaikin'] = O_G_chaikin df['HTDCperiod'] = HT_DCper df['HTDCphase'] = HT_DCphase df['inphase'] = inphase df['quad'] = quad df['rsin'] = r_sin df['leadsin'] = leadsin df['mfi'] = mfi df['min_di'] = min_di df['min_dm'] = min_dm df['momo'] = momo df['plus_di'] = plus_di df['plus_dm'] = plus_dm df['ppo'] = ppo df['roc'] = roc 
df.drop(['Close'], axis=1, inplace=True)

# overlap / trend indicators
df['upper_band'] = upper_band
df['lower_band'] = lower_band
df['mid_band'] = mid_band
df['d_ema'] = d_ema
df['ht_trend'] = ht_trend
df['kama'] = kama
df['ma'] = ma
#df['mama'] = mama
df['mid'] = mid
df['mid_price'] = mid_price
df['sar'] = sar
df['sarext'] = sarext
df['sma'] = sma
df['tema'] = tema
df['trima'] = trima
df['wma'] = wma
#df['fama'] = fama
df['EMA'] = E_M_A

# oscillators, volume, and cycle indicators
df['SlowK'] = slowk
df['SlowD'] = slowd
df['R_S_I'] = R_S_I
df['FastK'] = fastk
df['FastD'] = fastd
df['WilliamsR'] = real
df['atr'] = atr
df['natr'] = natr
df['t_range'] = t_range
df['OBV'] = O_B_V
df['ADOSC'] = A_D_O_S_C
df['ogchaikin'] = O_G_chaikin
df['HTDCperiod'] = HT_DCper
df['HTDCphase'] = HT_DCphase
df['inphase'] = inphase
df['quad'] = quad
df['rsin'] = r_sin
df['leadsin'] = leadsin
df['mfi'] = mfi
df['min_di'] = min_di
df['min_dm'] = min_dm
df['momo'] = momo
df['plus_di'] = plus_di
df['plus_dm'] = plus_dm
df['ppo'] = ppo
df['roc'] = roc
df['rocp'] = rocp
df['rocr'] = rocr
df['rocr_hund'] = rocr_hund
df['rsi_fastk'] = rsi_fastk
df['rsi_fastd'] = rsi_fastd
df['trix'] = trix
df['ult_osc'] = ult_osc
df['adx'] = adx
df['adxr'] = adxr
df['apo'] = apo
df['aroon_d'] = aroon_d
df['aroon_u'] = aroon_u
df['aroon_osc'] = aroon_osc
df['bop'] = bop
df['cci'] = cci
df['cmo'] = cmo
df['dx'] = dx
df['macd'] = macd
df['macdsig'] = macdsig
df['macdhist'] = macdhist
df['macdex'] = macdex
df['macdexsig'] = macdexsig
df['macdexhist'] = macdexhist
df['macdfixd'] = macdfixd
df['macdfixdsig'] = macdfixdsig
df['macdfixdhist'] = macdfixdhist

# candlestick patterns
df['two_crows'] = two_crows
df['three_crows'] = three_crows
df['three_inside'] = three_inside
df['three_line'] = three_line
df['three_out'] = three_out
df['three_stars'] = three_stars
df['three_soldier'] = three_soldier
df['baby'] = baby
df['adv'] = adv
df['belt_hold'] = belt_hold
df['breakaway'] = breakaway
df['closingmara'] = closingmara
df['baby_swallow'] = baby_swallow  # was mistakenly assigned belt_hold
df['counter'] = counter
df['dark_cloud'] = dark_cloud
df['doji'] = doji
df['doji_star'] = doji_star
df['dragon_doji'] = dragon_doji
df['engulf'] = engulf
df['evening_star'] = evening_star
df['gapside'] = gapside
df['gravestone'] = gravestone
df['hammer'] = hammer
df['hang_man'] = hang_man
df['harami'] = harami
df['harami_cross'] = harami_cross
df['high_wave'] = high_wave
df['hikkake'] = hikkake
df['hikkake_mod'] = hikkake_mod
df['pidgeon'] = pidgeon
df['id_three_crows'] = id_three_crows
df['in_neck'] = in_neck
df['inv_hammer'] = inv_hammer
df['kicking'] = kicking
df['kicking_len'] = kicking_len
df['ladder_bot'] = ladder_bot
df['doji_long'] = doji_long
df['long_line'] = long_line
df['marabozu'] = marabozu
df['match_glow'] = match_glow
df['mat_hold'] = mat_hold
df['morning_doji'] = morning_doji
df['morning_star'] = morning_star
df['on_neck'] = on_neck
df['pierce'] = pierce
df['rickshaw'] = rickshaw
df['rise_fall'] = rise_fall
df['sep_line'] = sep_line
df['shooting_star'] = shooting_star
df['sl_candle'] = sl_candle
df['spin_top'] = spin_top
df['stalled'] = stalled
df['stick_sand'] = stick_sand
df['takuri'] = takuri
df['tasuki_gap'] = tasuki_gap
df['thrust'] = thrust
df['tristar'] = tristar
df['three_river'] = three_river
df['ud_two_gap'] = ud_two_gap
df['down_three_gap'] = down_three_gap

add_indicators()

# Move the Date index back into a regular column
df.reset_index(level=0, inplace=True)

# Change all column headings to lower case and remove spacing
df.columns = [str(x).lower().replace(' ', '_') for x in df.columns]

# Difference between high and low of each day
df['range_hl'] = df['high'] - df['low']
df.drop(['high', 'low'], axis=1, inplace=True)

# Difference between open and close of each day
df['range_oc'] = df['open'] - df['adj_close']
df.drop(['open'], axis=1, inplace=True)

# 'order_day' indexes the rows in date order; it is the key the lagged
# copies are merged back on
df['order_day'] = list(range(len(df)))
merging_keys = ['order_day']

# define shift range: with 2-minute intervals, 30 rows = 1 hour
N = 15
lag_cols = [
    'ema', 'slowk', 'slowd', 'r_s_i', 'fastk', 'fastd', 'williamsr',
    'volume', 'range_hl', 'range_oc', 'adj_close', 'obv', 'adosc',
    'ogchaikin', 'htdcperiod', 'htdcphase', 'inphase', 'quad', 'rsin',
    'leadsin', 'two_crows', 'three_crows', 'three_inside', 'three_line',
    'three_out', 'three_stars', 'three_soldier', 'baby', 'adv', 'belt_hold',
    'breakaway', 'closingmara', 'baby_swallow', 'counter', 'dark_cloud',
    'doji', 'doji_star', 'dragon_doji', 'engulf', 'evening_star', 'gapside',
    'gravestone', 'hammer', 'hang_man', 'harami', 'harami_cross',
    'high_wave', 'hikkake', 'hikkake_mod', 'pidgeon', 'id_three_crows',
    'in_neck', 'inv_hammer', 'kicking', 'kicking_len', 'ladder_bot',
    'doji_long', 'long_line', 'marabozu', 'match_glow', 'mat_hold',
    'morning_doji', 'morning_star', 'on_neck', 'pierce', 'rickshaw',
    'rise_fall', 'sep_line', 'shooting_star', 'sl_candle', 'spin_top',
    'stalled', 'stick_sand', 'takuri', 'tasuki_gap', 'thrust', 'tristar',
    'three_river', 'ud_two_gap', 'down_three_gap', 'upper_band',
    'lower_band', 'mid_band', 'd_ema', 'ht_trend', 'kama', 'ma', 'mid',
    'mid_price', 'sar', 'sarext', 'sma', 'tema', 'trima', 'wma', 'adx',
    'adxr', 'apo', 'aroon_d', 'aroon_u', 'aroon_osc', 'bop', 'cci', 'cmo',
    'dx', 'macd', 'macdsig', 'macdhist', 'macdex', 'macdexsig',
    'macdexhist', 'macdfixd', 'macdfixdsig', 'macdfixdhist', 'mfi',
    'min_di', 'min_dm', 'momo', 'plus_di', 'plus_dm', 'ppo', 'roc', 'rocp',
    'rocr', 'rocr_hund', 'rsi_fastk', 'rsi_fastd', 'trix', 'ult_osc',
    'atr', 'natr', 't_range'
]

shift_range = [x + 1 for x in range(N)]
for shift in shift_range:
    train_shift = df[merging_keys + lag_cols].copy()
    # E.g. for shift = 1, an order_day of 0 becomes 1, so when this copy is
    # merged with order_day 1 in df it supplies that row with the previous
    # row's values, i.e. a lag of 1 (see the toy illustration just below).
    train_shift['order_day'] = train_shift['order_day'] + shift
    foo = lambda x: '{}_lag_{}'.format(x, shift) if x in lag_cols else x
    train_shift = train_shift.rename(columns=foo)
    df = pd.merge(df, train_shift, on=merging_keys, how='left')
    del train_shift

# Early rows have no lagged history; zero-fill them
# (other ways to handle the NaN values exist)
df.fillna(0, inplace=True)
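# To make the shift-merge above concrete, here is a toy run on a 4-row frame
# (illustrative only; the demo/shifted names are not part of the pipeline):
demo = pd.DataFrame({'order_day': [0, 1, 2, 3],
                     'adj_close': [10.0, 11.0, 12.0, 13.0]})
shifted = demo.copy()
shifted['order_day'] = shifted['order_day'] + 1   # row for day 0 now matches day 1
shifted = shifted.rename(columns={'adj_close': 'adj_close_lag_1'})
demo = pd.merge(demo, shifted, on='order_day', how='left')
# demo['adj_close_lag_1'] is now [NaN, 10.0, 11.0, 12.0]:
# each row sees the previous row's close, i.e. a lag of 1.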
# Hold out the last 5% of rows (in time order) as the test set
num_test = int(.05 * len(df))
num_train = len(df) - num_test
train = df[:num_train]
test = df[num_train:]

# Scale every base feature plus all of its lagged copies. The base list is
# identical to lag_cols, so build it from there instead of repeating it.
cols_to_scale = lag_cols.copy()
for i in range(1, N + 1):
    cols_to_scale.extend('{}_lag_{}'.format(col, i) for col in lag_cols)

# Do scaling for the train set only, and not the entire dataset,
# to prevent an information leak
scaler = StandardScaler()
scaler.fit(train[cols_to_scale])
train_scaled = scaler.transform(train[cols_to_scale])

# Convert the numpy array back into a pandas dataframe
train_scaled = pd.DataFrame(train_scaled, columns=cols_to_scale)

#df.to_csv(file_name)  # this may be a good place to save to a .csv file and
# export the data to matlab / do diagnostic visualizations
#train_scaled['datetime'] = train.reset_index()['datetime']
#train_scaled = train_scaled[100:]  # this trim is needed for the PCA

scaler_2 = StandardScaler()
scaler_2.fit(test[cols_to_scale])
test_scaled = scaler_2.transform(test[cols_to_scale])

# Convert the numpy array back into a pandas dataframe
test_scaled = pd.DataFrame(test_scaled, columns=cols_to_scale)
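# Caveat: scaler_2 above is fitted on the test window itself, so the test rows
# are standardized with statistics that would not be known in real time. A
# leak-free alternative is to reuse the train-fitted scaler; shown here under
# a hypothetical name so the original pipeline below is left untouched:
test_scaled_noleak = pd.DataFrame(scaler.transform(test[cols_to_scale]),
                                  columns=cols_to_scale)
# The script keeps scaler_2 because the final inverse_transform depends on it.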
# Features and target
target = 'adj_close'
# Build features as a new list so that removing the target does not mutate
# cols_to_scale, which the inverse transform below still relies on
features = [c for c in cols_to_scale if c != target]

# Split into X and y
X_train_scaled = train_scaled[features]
y_train_scaled = train_scaled[target]
X_test_scaled = test_scaled[features]
y_test_scaled = test_scaled[target]

## PCA testing could be done here to see what should / should not be included:
#pca = PCA(n_components=80).fit(X_train_scaled)
#print(pca.explained_variance_ratio_)
#print(pca.singular_values_)

## these values can be adjusted to customize the model
#rand = np.random.randint(low=1, high=999)
model = XGBRegressor(seed=100,
                     n_estimators=200,
                     max_depth=20,
                     learning_rate=0.1,
                     min_child_weight=1,
                     subsample=1,
                     colsample_bytree=1,
                     colsample_bylevel=1,
                     gamma=0.1)

# Train the regressor
model.fit(X_train_scaled, y_train_scaled)

# Save the feature-importance and tree plots
path_out = 'C:\\Users\\Michael\\Desktop\\Python\\Stonks\\YF & modeling\\TestSP500Out\\'
feat_save_name = path_out + stock + 'features'
tree_save_name = path_out + stock + 'tree'
xgb.plot_importance(model).figure.savefig(feat_save_name, dpi=600)
xgb.plot_tree(model).figure.savefig(tree_save_name, dpi=600)

# Rank features by importance score
feature_important = model.get_booster().get_score(importance_type='weight')
keys = list(feature_important.keys())
values = list(feature_important.values())
data = pd.DataFrame(data=values, index=keys,
                    columns=['score']).sort_values(by='score', ascending=False)
print(data.head(5))
#data.plot(kind='barh')
#plt.show()

# Predict on the test set and write the predictions back into the scaled frame
test_pred = model.predict(X_test_scaled)
test_scaled['adj_close'] = test_pred

'''there is some consideration to be made about whether we can grab the top
20-30 most influential features from xgboost and use them to train a different
model type (a commented sketch follows at the end of this script)'''
'''there is also consideration to be made about exporting these models'''
# this methodology works for saving a trained model
pickle.dump(model, open('test.model', 'wb'))

# Unscale the predictions and plot them against the true series
pred_unscaled = scaler_2.inverse_transform(test_scaled)
target_idx = test_scaled.columns.get_loc(target)  # avoids a magic column index

plt.figure()
plt.plot(test_scaled.index, test[target])
plt.plot(test_scaled.index, pred_unscaled[:, target_idx])
plt.legend(('True', 'est'), loc='upper left')
plt.title(str(stock))
plt.xlabel('Intervals')
plt.ylabel('$')
stonk_path_out = path_out + stock
plt.savefig(stonk_path_out)

# Direction call: does the final prediction sit above the last true value?
test_true_num = test[target].iloc[-1]
test_pred_num = pred_unscaled[-1, target_idx]
is_going_up = test_pred_num > test_true_num
print(stock)
print(is_going_up)

#change intervals back to date-time
'''toc = time.perf_counter()
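# (Following up the note above about reusing the most influential features: a
# minimal commented sketch, not part of the original script. top_feats and
# model_small are hypothetical names; 'data' is the sorted importance table
# printed earlier, whose index holds the feature column names.)
#
# top_feats = data.index[:30].tolist()
# model_small = XGBRegressor(seed=100, n_estimators=200, learning_rate=0.1)
# model_small.fit(X_train_scaled[top_feats], y_train_scaled)
#
# A model saved with pickle.dump can later be restored with:
# loaded = pickle.load(open('test.model', 'rb'))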