def data_prepare(self):
    self.__digits = load_digits(n_class=2)
    self.__X = self.__digits.data
    self.__y = self.__digits.target
    self.__train, self.__test, self.__train_label, self.__test_label = train_test_split(
        self.__X, self.__y, test_size=0.2, random_state=9)

    # standard scaler
    scaler = StandardScaler().fit(self.__train)
    self.__train = scaler.transform(self.__train)
    self.__test = scaler.transform(self.__test)

    # gp feature
    function_set = ("add", "sub", "mul", "div", "sqrt", "log", "abs", "neg",
                    "inv", "max", "min")
    gp = SymbolicTransformer(generations=5,
                             population_size=2000,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=3)
    # Deriving the generic features via stacking (out-of-fold fits) would
    # arguably be sounder than fitting on the full training set.
    gp.fit(self.__train, self.__train_label)
    self.__train_gfeature = np.hstack((self.__train, gp.transform(self.__train)))
    self.__test_gfeature = np.hstack((self.__test, gp.transform(self.__test)))
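# A minimal sketch (an addition, not part of the class above) of why the
# stacked gp features can help: fit the same classifier on the raw and the
# augmented matrices and compare held-out accuracy. LogisticRegression is an
# arbitrary choice here.
from sklearn.linear_model import LogisticRegression

def compare_gp_features(train, test, train_gfeature, test_gfeature,
                        train_label, test_label):
    raw = LogisticRegression().fit(train, train_label)
    aug = LogisticRegression().fit(train_gfeature, train_label)
    print("raw features:", raw.score(test, test_label))
    print("with gp features:", aug.score(test_gfeature, test_label))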
def test_symbolic_transformer():
    """Check that SymbolicTransformer example works"""
    rng = check_random_state(0)
    boston = load_boston()
    perm = rng.permutation(boston.target.size)
    boston.data = boston.data[perm]
    boston.target = boston.target[perm]

    est = Ridge()
    est.fit(boston.data[:300, :], boston.target[:300])
    assert_almost_equal(est.score(boston.data[300:, :], boston.target[300:]),
                        0.759319453049884)

    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg',
                    'inv', 'max', 'min']
    gp = SymbolicTransformer(generations=20, population_size=2000,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, random_state=0)
    gp.fit(boston.data[:300, :], boston.target[:300])

    gp_features = gp.transform(boston.data)
    new_boston = np.hstack((boston.data, gp_features))

    est = Ridge()
    est.fit(new_boston[:300, :], boston.target[:300])
    assert_almost_equal(est.score(new_boston[300:, :], boston.target[300:]),
                        0.8418372105182055)
def symbolic_transformer(X, y, encoder=None):
    """Transform features using multiple operations.

    This will add new features to the data frame.

    Args:
        X (DataFrame): Independent features
        y (Series): Dependent feature or target
        encoder (obj, optional): Object of the type 'SymbolicTransformer'.
            Defaults to None.

    Returns:
        DataFrame: Additional columns calculated by the algorithm
    """
    if encoder is None:
        function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs',
                        'neg', 'inv', 'max', 'min']
        encoder = SymbolicTransformer(generations=10,
                                      population_size=1000,
                                      hall_of_fame=100,
                                      n_components=12,
                                      function_set=function_set,
                                      parsimony_coefficient=0.0005,
                                      max_samples=0.9,
                                      verbose=1,
                                      random_state=123,
                                      n_jobs=-1)
        encoder.fit(X, y)
    gp_features = encoder.transform(X)
    return gp_features, encoder
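# Minimal usage sketch for symbolic_transformer above (assumes gplearn and
# scikit-learn are installed; the diabetes data is only a stand-in input).
import pandas as pd
from sklearn.datasets import load_diabetes

data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name="target")

# The first call fits a fresh SymbolicTransformer and returns it; the fitted
# encoder can then be reused so new data is transformed by the same programs.
gp_features, encoder = symbolic_transformer(X, y)
gp_features_new, _ = symbolic_transformer(X, y, encoder=encoder)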
def test_transformer_iterable():
    """Check that the transformer is iterable"""
    random_state = check_random_state(415)
    X = np.reshape(random_state.uniform(size=50), (5, 10))
    y = random_state.uniform(size=5)
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg',
                    'inv', 'max', 'min']
    est = SymbolicTransformer(population_size=500, generations=2,
                              function_set=function_set, random_state=0)

    # Check unfitted
    unfitted_len = len(est)
    unfitted_iter = [gp.length_ for gp in est]
    expected_iter = []

    assert_true(unfitted_len == 0)
    assert_true(unfitted_iter == expected_iter)

    # Check fitted
    est.fit(X, y)
    fitted_len = len(est)
    fitted_iter = [gp.length_ for gp in est]
    expected_iter = [8, 12, 2, 29, 9, 33, 9, 8, 4, 22]

    assert_true(fitted_len == 10)
    assert_true(fitted_iter == expected_iter)

    # Check IndexError
    assert_raises(IndexError, est.__getitem__, 10)
def test_transformer_iterable():
    """Check that the transformer is iterable"""
    random_state = check_random_state(415)
    X = np.reshape(random_state.uniform(size=50), (5, 10))
    y = random_state.uniform(size=5)
    est = SymbolicTransformer(generations=2, random_state=0)

    # Check unfitted
    unfitted_len = len(est)
    unfitted_iter = [gp.length_ for gp in est]
    expected_iter = []

    assert_true(unfitted_len == 0)
    assert_true(unfitted_iter == expected_iter)

    # Check fitted
    est.fit(X, y)
    fitted_len = len(est)
    fitted_iter = [gp.length_ for gp in est]
    expected_iter = [15, 19, 19, 12, 9, 10, 7, 14, 6, 21]

    assert_true(fitted_len == 10)
    assert_true(fitted_iter == expected_iter)

    # Check IndexError
    assert_raises(IndexError, est.__getitem__, 10)
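# The iterable interface exercised by the two tests above makes it easy to
# inspect the evolved programs. A sketch on random data (assumes gplearn is
# installed; X, y and the hyperparameters are arbitrary):
import numpy as np
from gplearn.genetic import SymbolicTransformer

X = np.random.RandomState(0).uniform(size=(50, 5))
y = X[:, 0] * X[:, 1] + X[:, 2]

gp = SymbolicTransformer(population_size=500, generations=3, random_state=0)
gp.fit(X, y)
for i, program in enumerate(gp):  # gp[i] also works, as the tests show
    print(i, program.length_, str(program))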
def test_output_shape():
    """Check output shape is as expected"""
    random_state = check_random_state(415)
    X = np.reshape(random_state.uniform(size=50), (5, 10))
    y = random_state.uniform(size=5)

    # Check the transformer
    est = SymbolicTransformer(n_components=5, generations=2, random_state=0)
    est.fit(X, y)
    assert_true(est.transform(X).shape == (5, 5))
class GplearnDemo(object):
    def __init__(self):
        # data prepare
        self.__boston = None
        self.__boston_feature = None
        self.__boston_label = None
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_label, self.__test_label = [None for _ in range(2)]
        self.__transformer = None
        self.__gp_train_feature = None
        self.__gp_test_feature = None

        # model fit
        self.__regressor = None

    def data_prepare(self):
        self.__boston = load_boston()
        self.__boston_feature = pd.DataFrame(
            self.__boston.data, columns=self.__boston.feature_names)
        self.__boston_label = pd.Series(
            self.__boston.target).to_frame("TARGET").squeeze()

        self.__train_feature, self.__test_feature, self.__train_label, self.__test_label = (
            train_test_split(self.__boston_feature,
                             self.__boston_label,
                             test_size=0.5,
                             shuffle=True))

        # The inputs must not contain missing values.
        self.__transformer = SymbolicTransformer(n_jobs=4)
        self.__transformer.fit(self.__train_feature, self.__train_label)
        self.__gp_train_feature = self.__transformer.transform(self.__train_feature)
        self.__gp_test_feature = self.__transformer.transform(self.__test_feature)

    def model_fit_predict(self):
        self.__regressor = Ridge()
        self.__regressor.fit(self.__train_feature, self.__train_label)
        print(mean_squared_error(self.__test_label,
                                 self.__regressor.predict(self.__test_feature)))

        self.__regressor = Ridge()
        self.__regressor.fit(
            np.hstack((self.__train_feature.values, self.__gp_train_feature)),
            self.__train_label)
        print(mean_squared_error(
            self.__test_label,
            self.__regressor.predict(
                np.hstack((self.__test_feature.values, self.__gp_test_feature)))))
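# gplearn rejects missing values (hence the comment in data_prepare above),
# so impute before fitting. A sketch using scikit-learn's SimpleImputer; the
# median strategy and the toy frame are arbitrary, not part of the demo class.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

frame = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [4.0, 5.0, np.nan]})
imputer = SimpleImputer(strategy="median")
frame_filled = pd.DataFrame(imputer.fit_transform(frame), columns=frame.columns)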
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """
    Find symbolic formulae for feature engineering
    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    coly = pars['coly']
    colX = [t for t in col if t not in [coly]]
    train_X = df[colX]
    train_y = df[coly]

    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg',
                    'inv', 'tan']
    pars_genetic = pars.get('pars_genetic', {
        'generations': 20,
        'n_components': 10,
        'population_size': 200
    })

    gp = SymbolicTransformer(hall_of_fame=100,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6,
                             **pars_genetic)

    gp.fit(train_X, train_y)
    df_genetic = gp.transform(train_X)
    df_genetic = pd.DataFrame(
        df_genetic,
        columns=["gen_" + str(a) for a in range(df_genetic.shape[1])])
    df_genetic.index = train_X.index
    col_genetic = list(df_genetic.columns)

    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_genetic, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'model': gp, 'pars': pars_genetic}
    col_pars['cols_new'] = {
        'col_genetic': col_genetic  ### list
    }
    return df_genetic, col_pars
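# Hedged usage sketch for pd_col_genetic_transform above: the frame, columns,
# and target name are made up. Only 'coly' is required by the function;
# 'pars_genetic' overrides the defaults shown above.
import numpy as np
import pandas as pd

df_demo = pd.DataFrame(np.random.RandomState(0).rand(100, 4),
                       columns=["x1", "x2", "x3", "y"])
pars_demo = {
    'coly': 'y',
    'pars_genetic': {'generations': 5, 'n_components': 4,
                     'population_size': 200},
}
df_gen, col_pars = pd_col_genetic_transform(df=df_demo,
                                            col=list(df_demo.columns),
                                            pars=pars_demo)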
def test_custom_transformer_metrics():
    """Check whether greater_is_better works for SymbolicTransformer."""
    est_gp = SymbolicTransformer(generations=2, population_size=100,
                                 hall_of_fame=10, n_components=1,
                                 metric='pearson', random_state=415)
    est_gp.fit(boston.data, boston.target)
    for program in est_gp:
        formula = program.__str__()
    expected_formula = ('sub(div(mul(X4, X12), div(X9, X9)), '
                        'sub(div(X11, X12), add(X12, X0)))')
    assert_equal(expected_formula, formula, True)

    def _neg_weighted_pearson(y, y_pred, w):
        """Calculate the weighted Pearson correlation coefficient."""
        with np.errstate(divide='ignore', invalid='ignore'):
            y_pred_demean = y_pred - np.average(y_pred, weights=w)
            y_demean = y - np.average(y, weights=w)
            corr = ((np.sum(w * y_pred_demean * y_demean) / np.sum(w)) /
                    np.sqrt((np.sum(w * y_pred_demean ** 2) *
                             np.sum(w * y_demean ** 2)) /
                            (np.sum(w) ** 2)))
        if np.isfinite(corr):
            return -1 * np.abs(corr)
        return 0.

    neg_weighted_pearson = make_fitness(function=_neg_weighted_pearson,
                                        greater_is_better=False)

    c_est_gp = SymbolicTransformer(generations=2, population_size=100,
                                   hall_of_fame=10, n_components=1,
                                   stopping_criteria=-1,
                                   metric=neg_weighted_pearson,
                                   random_state=415)
    c_est_gp.fit(boston.data, boston.target)
    for program in c_est_gp:
        c_formula = program.__str__()
    assert_equal(expected_formula, c_formula, True)
def test_function_in_program():
    """Check that using a custom function in a program works"""
    def logic(x1, x2, x3, x4):
        return np.where(x1 > x2, x3, x4)

    logical = make_function(function=logic, name='logical', arity=4)
    function_set = ['add', 'sub', 'mul', 'div', logical]
    est = SymbolicTransformer(generations=2, population_size=2000,
                              hall_of_fame=100, n_components=10,
                              function_set=function_set,
                              parsimony_coefficient=0.0005,
                              max_samples=0.9, random_state=0)
    est.fit(boston.data[:300, :], boston.target[:300])

    formula = est._programs[0][906].__str__()
    expected_formula = 'sub(logical(X6, add(X11, 0.898), X10, X2), X5)'
    assert_equal(expected_formula, formula, True)
def getSymbolTrans(train, valid, y, random_state=888):
    X_train = train.copy()
    X_valid = valid.copy()
    y_train = y.copy()

    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg',
                    'inv', 'max', 'min']

    gp = SymbolicTransformer(generations=20,
                             population_size=2000,
                             hall_of_fame=100,
                             n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=0,
                             random_state=random_state,  # was hardcoded to 0, ignoring the argument
                             n_jobs=3)

    gp.fit(X_train, y_train)

    gp_features_train = gp.transform(X_train)
    dt_gp_features_train = pd.DataFrame(gp_features_train)
    dt_gp_features_train.columns = [
        "ST_" + str(i) for i in range(1, dt_gp_features_train.shape[1] + 1)
    ]
    X_train = X_train.join(dt_gp_features_train)
    X_train = X_train.fillna(0)

    gp_features_valid = gp.transform(X_valid)
    dt_gp_features_valid = pd.DataFrame(gp_features_valid)
    dt_gp_features_valid.columns = [
        "ST_" + str(i) for i in range(1, dt_gp_features_valid.shape[1] + 1)
    ]
    X_valid = X_valid.join(dt_gp_features_valid)
    X_valid = X_valid.fillna(0)

    return (X_train, X_valid)
def test_custom_functions():
    """Test the custom programs example works"""
    rng = check_random_state(0)
    boston = load_boston()
    perm = rng.permutation(boston.target.size)
    boston.data = boston.data[perm]
    boston.target = boston.target[perm]

    def logic(x1, x2, x3, x4):
        return np.where(x1 > x2, x3, x4)

    logical = make_function(function=logic, name='logical', arity=4)
    function_set = ['add', 'sub', 'mul', 'div', logical]
    gp = SymbolicTransformer(generations=2, population_size=2000,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, random_state=0)
    gp.fit(boston.data[:300, :], boston.target[:300])

    assert_equal(gp._programs[0][906].__str__(),
                 'sub(logical(X6, add(X11, 0.898), X10, X2), X5)')

    dot_data = gp._programs[0][906].export_graphviz()
    expected = ('digraph program {\nnode [style=filled]\n0 [label="sub", '
                'fillcolor="#136ed4"] ;\n1 [label="logical", '
                'fillcolor="#136ed4"] ;\n2 [label="X6", fillcolor="#60a6f6"] '
                ';\n3 [label="add", fillcolor="#136ed4"] ;\n4 [label="X11", '
                'fillcolor="#60a6f6"] ;\n5 [label="0.898", '
                'fillcolor="#60a6f6"] ;\n3 -> 5 ;\n3 -> 4 ;\n6 [label="X10", '
                'fillcolor="#60a6f6"] ;\n7 [label="X2", fillcolor="#60a6f6"] '
                ';\n1 -> 7 ;\n1 -> 6 ;\n1 -> 3 ;\n1 -> 2 ;\n8 [label="X5", '
                'fillcolor="#60a6f6"] ;\n0 -> 8 ;\n0 -> 1 ;\n}')
    assert_equal(dot_data, expected)
def get_feature_symbolic_learning(df, gp_config):
    """
    Parameters
    ----------
    df: pd.DataFrame, the input DataFrame.
    gp_config: GPConfig object, the config object for gplearn.SymbolicTransformer.

    Returns
    -------
    df_t: pd.DataFrame, df concatenated with the features produced by the
        fitted SymbolicTransformer. The new features are named
        'symbolic_component_{0..n-1}' (n is n_components).
    """
    gp = SymbolicTransformer(
        generations=gp_config.generation,
        population_size=gp_config.population_size,
        hall_of_fame=gp_config.hall_of_fame,
        n_components=gp_config.n_components,
        function_set=gp_config.function_set,
        parsimony_coefficient=gp_config.parsimony_coefficient,
        max_samples=gp_config.max_samples,
        verbose=1,
        random_state=0,
        n_jobs=3)

    X = df[gp_config.feature_cols]
    y = df[gp_config.target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        shuffle=True)
    gp.fit(X_train, y_train)
    names = [
        "symbolic_component_" + str(i) for i in range(gp_config.n_components)
    ]
    res = pd.DataFrame(gp.transform(X), columns=names)
    df_t = pd.concat([df, res], axis=1)
    return df_t
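# GPConfig isn't defined in this snippet; a hypothetical stand-in carrying the
# attributes get_feature_symbolic_learning reads (names and defaults are
# assumptions, not the original class).
from dataclasses import dataclass

@dataclass
class GPConfig:
    feature_cols: list
    target_col: str
    generation: int = 10
    population_size: int = 1000
    hall_of_fame: int = 100
    n_components: int = 10
    function_set: tuple = ('add', 'sub', 'mul', 'div')
    parsimony_coefficient: float = 0.0005
    max_samples: float = 0.9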
def gp_features(df,
                target,
                random_state,
                generations=5,
                function_set=['add', 'sub', 'mul', 'div']):
    X = df.loc[:, (df.columns != target)]
    y = df.loc[:, target]
    gp = SymbolicTransformer(generations=generations,
                             population_size=1000,
                             hall_of_fame=100,
                             n_components=12,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=0,
                             random_state=random_state,
                             n_jobs=-1)
    gp.fit(pd.get_dummies(X), y)
    df = gp_transform(df, gp.transform, X)
    return df, gp.transform
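# gp_transform isn't shown in this snippet; judging from the call above, it
# applies the fitted transform and appends the result to df. A hypothetical
# reconstruction, purely an assumption for illustration:
def gp_transform(df, transform, X):
    new_cols = pd.DataFrame(transform(pd.get_dummies(X)), index=df.index)
    new_cols.columns = ["gp_" + str(i) for i in range(new_cols.shape[1])]
    return pd.concat([df, new_cols], axis=1)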
def fit(self, X, y=None, state={}):
    exponential = make_function(function=exponent, name='exp', arity=1)
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg',
                    'inv', 'max', 'min', 'tan', 'sin', 'cos', exponential]
    gp = SymbolicTransformer(generations=self.generations,
                             population_size=self.population,
                             hall_of_fame=self.hall_of_fame,
                             n_components=self.components,
                             function_set=function_set,
                             parsimony_coefficient='auto',
                             max_samples=0.6,
                             verbose=1,
                             metric=self.metric,
                             random_state=0,
                             n_jobs=7)
    self.state['genetic'] = {}
    self.state['genetic']['fit'] = gp.fit(X, y)
    return self
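# The exponent helper referenced above is defined elsewhere; a common
# protected version (an assumption, not the author's code) clips its input so
# np.exp stays finite for any float array, as gplearn custom functions require.
import numpy as np

def exponent(x):
    # clipping keeps the output finite under gplearn's closure requirement
    return np.exp(np.clip(x, -50.0, 50.0))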
boston.target = boston.target[perm]

est = Ridge()
est.fit(boston.data[:300, :], boston.target[:300])
print(est.score(boston.data[300:, :], boston.target[300:]))
del est

function_set = [
    'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max',
    'min'
]
gp = SymbolicTransformer(generations=20,
                         population_size=2000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=0,
                         n_jobs=3)
gp.fit(boston.data[:300, :], boston.target[:300])

gp_features = gp.transform(boston.data)
new_boston = np.hstack((boston.data, gp_features))

est = Ridge()
est.fit(new_boston[:300, :], boston.target[:300])
print(est.score(new_boston[300:, :], boston.target[300:]))
generations = 3        # number of generations to evolve
population_size = 1000 # number of formulas in each generation
tournament_size = 200  # formulas randomly drawn each tournament to compete on fitness
const_range = (0.0, 10.0)
function_set = init_function + user_function  # function operators
metric = rankIC_metric  # objective used as the fitness measure
random_state = 200812   # random seed

factor_gp = SymbolicTransformer(feature_names=fields,
                                function_set=function_set,
                                generations=generations,
                                population_size=population_size,
                                tournament_size=tournament_size,
                                const_range=const_range,
                                random_state=random_state)  # , metric=metric)
factor_gp.fit(stock_price, target)

with open(r'D:\work\back_test_system\FactorBackTest\gp_model.pkl', 'wb') as f:
    pickle.dump(factor_gp, f)

best_programs = factor_gp._best_programs
best_programs_dict = {}
for p in best_programs:
    factor_name = 'alpha_' + str(best_programs.index(p) + 1)
    best_programs_dict[factor_name] = {
        'fitness': p.fitness_,
        'expression': str(p),
        'depth': p.depth_,
        'length': p.length_
    }
best_programs_dict = pd.DataFrame(best_programs_dict).T
# Combine features with gplearn's genetic module
data = datasets.load_boston()  # load the dataset
x, y = data.data, data.target  # split into x and y
print(x.shape)  # inspect the shape of x
print(x[0])  # inspect the first row of x

model_symbolic = SymbolicTransformer(n_components=5,
                                     generations=18,
                                     function_set=('add', 'sub', 'mul', 'div',
                                                   'sqrt', 'log', 'abs', 'neg',
                                                   'inv', 'max', 'min'),
                                     max_samples=0.9,
                                     metric='pearson',
                                     random_state=0,
                                     n_jobs=2)
model_symbolic.fit(x, y)  # fit on the data
symbolic_features = model_symbolic.transform(x)  # transform the data
print(symbolic_features.shape)  # print the shape
print(symbolic_features[0])  # print the first row
print(model_symbolic)  # print the evolved formulas

# Readers can uncomment and run the code block below
#%%
'''
# This example block will output duplicated features
reg_data = np.loadtxt('data5.txt')
x, y = reg_data[:, :-1], reg_data[:, -1]
model_symbolic = SymbolicTransformer(n_components=5,
                                     generations=18,
                                     function_set=(
                                         'add', 'sub', 'mul', 'div', 'sqrt',
                                         'log', 'abs', 'neg', 'inv', 'max',
                                         'min'),
numeric_feats = tt.dtypes[tt.dtypes == np.float64].index
numeric_feats = numeric_feats.drop('target')

function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv']
gp = SymbolicTransformer(generations=20,
                         population_size=2000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=0,
                         n_jobs=6)
gp.fit(train[numeric_feats], train['target'])
gp_feats = gp.transform(tt[numeric_feats])
tt = pd.concat([tt, pd.DataFrame(gp_feats)], axis=1)

### box cox transform
'''
#numeric_feats = tt.dtypes[tt.dtypes != 'object'].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.2]
skewed_feats = skewed_feats.index
for feat in skewed_feats:
    tt[feat] = tt[feat] + 10
    (tt[feat], lam) = boxcox(tt[feat])
                      min_data_in_leaf=6,
                      min_sum_hessian_in_leaf=11)

cv = KFold(n_splits=6, shuffle=True, random_state=42)
results = []
feature_import = pd.DataFrame()
sub_array = []
# feature_import['name'] = train.columns
y_train = y_train.values
y_mean = np.mean(y_train)

for model in [model_lgb]:
    for traincv, testcv in cv.split(train, y_train):
        gp.fit(train[traincv], y_train[traincv])
        gp_features = gp.transform(train)
        print(gp_features)
        train = np.hstack((train, gp_features))
        m = model.fit(train[traincv],
                      y_train[traincv],
                      eval_set=[(train[testcv], y_train[testcv])],
                      early_stopping_rounds=150)
        y_tmp = m.predict(train[testcv], num_iteration=m.best_iteration)
        res = mean_squared_error(y_train[testcv], (y_tmp)) / 2
        results.append(res)
        t_gp_features = gp.transform(test)
generations = 3        # number of generations to evolve
population_size = 1000 # number of formulas in each generation
tournament_size = 20   # formulas randomly drawn each tournament to compete on fitness
const_range = (0.0, 10.0)
function_set = init_function + user_function  # function operators
metric = my_metric     # objective used as the fitness measure
random_state = 316     # random seed

est_gp = SymbolicTransformer(feature_names=fields,
                             function_set=function_set,
                             generations=generations,
                             metric=metric,
                             population_size=population_size,
                             tournament_size=tournament_size,
                             const_range=const_range,
                             random_state=random_state)
est_gp.fit(X_train, y_train)

with open(r'D:\work\back_test_system\FactorBackTest\gp_model.pkl', 'wb') as f:
    pickle.dump(est_gp, f)

best_programs = est_gp._best_programs
best_programs_dict = {}
for p in best_programs:
    factor_name = 'alpha_' + str(best_programs.index(p) + 1)
    best_programs_dict[factor_name] = {
        'fitness': p.fitness_,
        'expression': str(p),
        'depth': p.depth_,
        'length': p.length_
    }
    'min'
]
gp = SymbolicTransformer(generations=10,
                         population_size=50000,
                         hall_of_fame=100,
                         n_components=10,
                         function_set=function_set,
                         parsimony_coefficient=0.0005,
                         max_samples=0.9,
                         verbose=1,
                         random_state=42,
                         n_jobs=4)

# Fit & save to dataframe
gp.fit(total_df.iloc[train_idx], y)
gp_features = gp.transform(total_df)
genetic_df = pd.DataFrame(
    gp_features,
    columns=[f'Genetic_{i}' for i in range(gp_features.shape[1])])


def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Taken from: https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
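# The body of series_to_supervised is truncated above; a hedged sketch of
# what its docstring describes (lag columns t-n_in..t-1, forward columns
# t..t+n_out-1 via pandas shift), not the article's verbatim code:
import pandas as pd

def series_to_supervised_sketch(data, n_in=1, n_out=1, dropnan=True):
    df = pd.DataFrame(data)
    cols = []
    # input sequence (t-n_in, ..., t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
    # forecast sequence (t, t+1, ..., t+n_out-1)
    for i in range(n_out):
        cols.append(df.shift(-i))
    agg = pd.concat(cols, axis=1)
    if dropnan:
        agg = agg.dropna()
    return agg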
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """
    Find symbolic formulae for feature engineering
    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    from gplearn.functions import make_function
    import random

    colX = col  # [col_ for col_ in col if col_ not in coly]
    train_X = df[colX].fillna(method='ffill')
    feature_name_ = colX

    def square(x):
        return x * x

    square_ = make_function(function=square, name='square_', arity=1)

    function_set = pars.get('function_set', [
        'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'tan',
        square_
    ])
    pars_genetic = pars.get(
        'pars_genetic',
        {
            'generations': 5,
            'population_size': 10,  ### Higher than nb_features
            'metric': 'spearman',
            'tournament_size': 20,
            'stopping_criteria': 1.0,
            'const_range': (-1., 1.),
            'p_crossover': 0.9,
            'p_subtree_mutation': 0.01,
            'p_hoist_mutation': 0.01,
            'p_point_mutation': 0.01,
            'p_point_replace': 0.05,
            'parsimony_coefficient': 0.005,  #### 0.00005 controls complexity
            'max_samples': 0.9,
            'verbose': 1,
            # 'n_components'  ### controls the number of output features
            'random_state': 0,
            'n_jobs': 4,
        })

    if 'path_pipeline' in pars:  #### Inference time
        gp = load(pars['path_pipeline'] + f"/{prefix}_model.pkl")
        pars = load(pars['path_pipeline'] + f"/{prefix}_pars.pkl")
    else:  ### Training time
        coly = pars['coly']
        train_y = pars['dfy']
        gp = SymbolicTransformer(
            hall_of_fame=train_X.shape[1] + 1,  ### Buggy
            n_components=pars_genetic.get('n_components', train_X.shape[1]),
            feature_names=feature_name_,
            function_set=function_set,
            **pars_genetic)
        gp.fit(train_X, train_y)

    ##### Transform Data #########################################
    df_genetic = gp.transform(train_X)
    tag = random.randint(0, 10)  #### UNIQUE TAG
    col_genetic = [f"gen_{tag}_{i}" for i in range(df_genetic.shape[1])]
    df_genetic = pd.DataFrame(df_genetic,
                              columns=col_genetic,
                              index=train_X.index)
    df_genetic.index = train_X.index
    pars_gen_all = {'pars_genetic': pars_genetic, 'function_set': function_set}

    ##### Formulae Extraction #####################################
    formula = str(gp).replace("[", "").replace("]", "")
    flist = formula.split(",\n")
    form_dict = {x: flist[i] for i, x in enumerate(col_genetic)}
    pars_gen_all['formulae_dict'] = form_dict
    log("########## Formulae ", form_dict)
    # col_pars['map_dict'] = dict(zip(train_X.columns.to_list(), feature_name_))

    col_new = col_genetic
    ###################################################################################
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_gen_all, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")
        # save(form_dict, pars['path_pipeline_export'] + f"/{prefix}_formula.pkl")
        save_json(form_dict, pars['path_pipeline_export'] + f"/{prefix}_formula.json")  ### Human readable

    col_pars = {
        'prefix': prefix,
        'path': pars.get('path_pipeline_export', pars.get('path_pipeline', None))
    }
    col_pars['cols_new'] = {
        prefix: col_new  ### list
    }
    return df_genetic, col_pars
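# The "Formulae Extraction" block above relies on the fact that a fitted
# SymbolicTransformer prints (and iterates over) its retained programs as
# human-readable expressions. A small sketch of the same idea on toy data
# (assumes gplearn is installed; all values below are arbitrary):
import numpy as np
from gplearn.genetic import SymbolicTransformer

X = np.random.RandomState(0).rand(200, 3)
y = X[:, 0] * X[:, 1] - X[:, 2]

gp = SymbolicTransformer(generations=5, population_size=500,
                         hall_of_fame=50, n_components=5, random_state=0)
gp.fit(X, y)
for i, program in enumerate(gp):
    print(f"gen_feature_{i}:", str(program))  # human-readable formula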
generations = 50
function_set = init_function + user_function
metric = MSLE
population_size = 100
random_state = 0

est_gp = SymbolicTransformer(
    feature_names=fields,
    function_set=function_set,
    generations=generations,
    metric=metric,
    population_size=population_size,
    tournament_size=20,
    random_state=random_state,
)
est_gp.fit(train_X, train_y)

best_programs = est_gp._best_programs
best_programs_dict = {}
for p in best_programs:
    factor_name = str(best_programs.index(p) + 1)
    best_programs_dict[factor_name] = {
        'fitness': p.fitness_,
        'expression': str(p),
        'depth': p.depth_,
        'length': p.length_
    }
best_programs_dict = pd.DataFrame(best_programs_dict).T
best_programs_dict = best_programs_dict.sort_values(by='fitness')
def run_ga_industry(industry_sym,
                    industry_code,
                    data_directory,
                    start_date_int,
                    end_date_int,
                    metric,
                    signal_ref_data,
                    q_lower,
                    q_upper,
                    strat_flag,
                    population_size,
                    tournament_size,
                    generations,
                    hall_of_fame,
                    n_components,
                    factor_filter,
                    n_jobs=1,
                    verbose=0):
    industry_data = pd.read_parquet(f'{data_directory}/{industry_code}.parq')
    industry_data = industry_data.loc[start_date_int:end_date_int, :].copy()

    data0 = industry_data.copy()
    data0['pct1'] = np.log(data0['close_' + industry_sym]).diff().shift(-1)
    dataset = data0.dropna()

    data = dataset.drop('pct1', axis=1).values
    ga_train_fields = dataset.drop('pct1', axis=1).columns
    target = dataset['pct1'].values

    test_size = 0.1
    test_num = int(len(data) * test_size)

    X_train = data[:-test_num].copy()
    X_train_df = dataset[ga_train_fields].iloc[:-test_num].copy()
    # X_train = ut.min_max_scaling(X_train)
    y_train = np.nan_to_num(target[:-test_num].copy())

    test_backward_i0 = test_num + signal_ref_data - 1
    # X_test = data[-test_backward_i0:].copy()
    X_test_df = dataset[ga_train_fields].iloc[-test_backward_i0:].copy()
    # X_test = ut.min_max_scaling(X_test)
    # y_test = np.nan_to_num(target[-test_backward_i0:].copy())

    # ================================================================================
    # Fitting
    # --------------------------------------------------------------------------------
    # SymbolicTransformer
    est_gp = SymbolicTransformer(
        population_size=population_size,  # 1000
        tournament_size=tournament_size,  # 20
        generations=generations,          # 20
        hall_of_fame=hall_of_fame,        # 100
        n_components=n_components,        # 10
        stopping_criteria=np.inf,         # 1.0
        const_range=None,                 # (-1., 1.)
        # init_depth=(2, 6),              # (2, 6)
        # init_method='half and half',    # 'half and half'
        function_set=function_set,        # ('add', 'sub', 'mul', 'div')
        metric=metric,                    # 'pearson'
        # metric=gp_sharpe,
        parsimony_coefficient=0.0001,     # 0.001
        # p_crossover=0.9,                # 0.9
        # p_subtree_mutation=0.01,        # 0.01
        # p_hoist_mutation=0.01,          # 0.01
        # p_point_mutation=0.01,          # 0.01
        # p_point_replace=0.05,           # 0.05
        max_samples=1.0,                  # 1.0 | The fraction of samples to draw from X to evaluate each program on.
        feature_names=ga_train_fields,    # None
        # warm_start=False,               # False
        # low_memory=False,               # False
        n_jobs=n_jobs,                    # 1
        verbose=verbose,                  # 0
        random_state=10,                  # None
    )
    est_gp.fit(X_train, y_train, sample_weight=None)

    # ================================================================================
    # Process programs
    # --------------------------------------------------------------------------------
    program_df = clean_gplearn_programs(est_gp._programs, verbose=0)
    function_expressions = program_df['expression'].values

    # ================================================================================
    # Backtest Overview
    # --------------------------------------------------------------------------------
    # train set
    logret = dataset.iloc[:-test_num]['pct1'].values
    alpha_train_overview_list = []
    for expr in function_expressions:
        factor_values = eval(expr, function_set_dict,
                             X_train_df.to_dict(orient="series"))
        signal = _generate_signal(factor_values,
                                  n=signal_ref_data,
                                  q_lower=q_lower,
                                  q_upper=q_upper,
                                  flag=strat_flag)
        factor_return = np.sum(signal * logret)
        alpha_train_overview_list.append([expr, factor_return])
    train_ov = pd.DataFrame(alpha_train_overview_list,
                            columns=['expression', 'totret_is']).set_index('expression')
    best_train_factor = train_ov.sort_values(
        'totret_is').iloc[-factor_filter[0]:].index.tolist()

    # test set
    logret = dataset.iloc[-test_backward_i0:]['pct1'].values
    alpha_test_overview_list = []
    for expr in best_train_factor:
        factor_values = eval(expr, function_set_dict,
                             X_test_df.to_dict(orient="series"))
        signal = _generate_signal(factor_values,
                                  n=signal_ref_data,
                                  q_lower=q_lower,
                                  q_upper=q_upper,
                                  flag=strat_flag)
        factor_return = np.sum(signal * logret)
        alpha_test_overview_list.append([expr, factor_return])
    test_ov = pd.DataFrame(alpha_test_overview_list,
                           columns=['expression', 'totret_oos']).set_index('expression')
    best_factors = test_ov.sort_values(
        "totret_oos").iloc[-factor_filter[1]:].index.tolist()

    _ref_data = industry_data.iloc[-signal_ref_data * 2:].to_dict(orient="series")
    best_opinions = [
        _generate_signal(eval(expr, function_set_dict, _ref_data),
                         n=signal_ref_data,
                         q_lower=q_lower,
                         q_upper=q_upper,
                         flag=strat_flag)[-1] for expr in best_factors
    ]
    ew_opinion = np.sum(best_opinions)

    output = [industry_sym, ew_opinion, best_opinions, best_factors]
    return output
class GplearnGenerateFeature(object):
    def __init__(self, input_path, output_path):
        self.__input_path, self.__output_path = input_path, output_path
        # data prepare
        self.__feature_importance = None
        self.__feature_top_column = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_label = None
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__categorical_columns = None
        self.__encoder = None
        self.__numeric_columns = None
        self.__filler = None
        # feature generate
        self.__genetic_transformer = None
        self.__genetic_train_feature = None
        self.__genetic_test_feature = None

    def data_prepare(self):
        self.__feature_importance = pd.read_csv(
            os.path.join(self.__input_path,
                         "feature_importance_feature_data_V5.csv"))
        self.__feature_importance = (self.__feature_importance.groupby(
            ["feature"])["importance"].mean().to_frame("importance").reset_index(
                drop=False)).sort_values(
                    "importance", ascending=False).reset_index(drop=True)
        self.__feature_top_column = list(self.__feature_importance.iloc[0:200, 0])

        self.__train = pd.read_csv(
            os.path.join(self.__input_path, "train_select_feature_df.csv"),
            usecols=self.__feature_top_column + ["TARGET"])
        self.__test = pd.read_csv(
            os.path.join(self.__input_path, "test_select_feature_df.csv"),
            usecols=self.__feature_top_column)
        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop("TARGET", axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns.tolist()]

        # encoder
        self.__categorical_columns = self.__train_feature.select_dtypes(
            include="object").columns.tolist()
        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature[self.__categorical_columns],
                           self.__train_label)
        self.__train_feature[self.__categorical_columns] = self.__encoder.transform(
            self.__train_feature[self.__categorical_columns])
        self.__test_feature[self.__categorical_columns] = self.__encoder.transform(
            self.__test_feature[self.__categorical_columns])

        # filler
        self.__numeric_columns = self.__train_feature.select_dtypes(
            exclude="object").columns.tolist()
        self.__filler = Imputer(strategy="median")
        self.__filler.fit(self.__train_feature[self.__numeric_columns])
        self.__train_feature[self.__numeric_columns] = self.__filler.transform(
            self.__train_feature[self.__numeric_columns])
        self.__test_feature[self.__numeric_columns] = self.__filler.transform(
            self.__test_feature[self.__numeric_columns])

    def feature_generate(self):
        self.__genetic_transformer = SymbolicTransformer(population_size=10000,
                                                         generations=200,
                                                         tournament_size=200,
                                                         metric="spearman",
                                                         n_jobs=-1,
                                                         verbose=1)
        self.__genetic_transformer.fit(self.__train_feature, self.__train_label)
        self.__genetic_train_feature = self.__genetic_transformer.transform(
            self.__train_feature)
        self.__genetic_test_feature = self.__genetic_transformer.transform(
            self.__test_feature)

    def data_output(self):
        self.__genetic_train_feature = pd.DataFrame(
            self.__genetic_train_feature,
            columns=[
                "Genetic_" + str(i)
                for i in range(self.__genetic_train_feature.shape[1])
            ])
        self.__genetic_test_feature = pd.DataFrame(
            self.__genetic_test_feature,
            columns=[
                "Genetic_" + str(i)
                for i in range(self.__genetic_test_feature.shape[1])
            ])
        self.__genetic_train_feature.to_csv(
            os.path.join(self.__output_path, "genetic_train_feature.csv"),
            index=False)
        self.__genetic_test_feature.to_csv(
            os.path.join(self.__output_path, "genetic_test_feature.csv"),
            index=False)