def genetic_feat(df, num_gen=20, num_comp=10):
    """Append genetically engineered features to *df*.

    Uses gplearn's SymbolicTransformer to evolve `num_comp` new features
    from every column except "close", with "close" as the target.

    Args:
        df (pd.DataFrame): input frame; must contain a "close" column.
        num_gen (int): number of GP generations to run.
        num_comp (int): number of components (new features) to keep.

    Returns:
        pd.DataFrame: `df` with columns gen_0..gen_{num_comp-1} appended.
    """
    from gplearn.genetic import SymbolicTransformer

    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                    'abs', 'neg', 'inv', 'tan']
    gp = SymbolicTransformer(generations=num_gen, population_size=200,
                             hall_of_fame=100, n_components=num_comp,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=0, n_jobs=6)
    gen_feats = gp.fit_transform(df.drop("close", axis=1), df["close"])
    # NOTE: removed a stray no-op expression (`df.iloc[:, :8]`) that had no
    # effect on the result.
    gen_feats = pd.DataFrame(
        gen_feats,
        columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = df.index
    return pd.concat((df, gen_feats), axis=1)
def test_symbolic_transformer():
    """Check that SymbolicTransformer example works"""
    rng = check_random_state(0)
    boston = load_boston()
    # Shuffle the Boston data deterministically so the 300/rest split is
    # random but reproducible.
    perm = rng.permutation(boston.target.size)
    boston.data = boston.data[perm]
    boston.target = boston.target[perm]

    # Baseline: Ridge on the raw features; held-out score is pinned.
    est = Ridge()
    est.fit(boston.data[:300, :], boston.target[:300])
    assert_almost_equal(est.score(boston.data[300:, :], boston.target[300:]),
                        0.759319453049884)

    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                    'abs', 'neg', 'inv', 'max', 'min']
    gp = SymbolicTransformer(generations=20, population_size=2000,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, random_state=0)
    gp.fit(boston.data[:300, :], boston.target[:300])
    gp_features = gp.transform(boston.data)

    # Augment raw features with the 10 evolved ones; Ridge's held-out score
    # must improve to the documented value.
    new_boston = np.hstack((boston.data, gp_features))
    est = Ridge()
    est.fit(new_boston[:300, :], boston.target[:300])
    assert_almost_equal(est.score(new_boston[300:, :], boston.target[300:]),
                        0.8418372105182055)
def test_transformer_iterable():
    """Check that the transformer is iterable"""
    random_state = check_random_state(415)
    X = np.reshape(random_state.uniform(size=50), (5, 10))
    y = random_state.uniform(size=5)
    est = SymbolicTransformer(generations=2, random_state=0)

    # Check unfitted: iterating before fit yields no programs.
    unfitted_len = len(est)
    unfitted_iter = [gp.length_ for gp in est]
    expected_iter = []
    assert_true(unfitted_len == 0)
    assert_true(unfitted_iter == expected_iter)

    # Check fitted: iteration yields the selected programs; their lengths
    # are pinned for this seed.
    est.fit(X, y)
    fitted_len = len(est)
    fitted_iter = [gp.length_ for gp in est]
    expected_iter = [15, 19, 19, 12, 9, 10, 7, 14, 6, 21]
    assert_true(fitted_len == 10)
    assert_true(fitted_iter == expected_iter)

    # Check IndexError: indexing past the last program raises.
    assert_raises(IndexError, est.__getitem__, 10)
def pd_colcat_symbolic(df, col, pars):
    """Genetic-programming feature generation over the columns in `col`.

    https://github.com/arita37/deltapy   pip install deltapy

    Args:
        df (pd.DataFrame): input data containing `col` and the target.
        col (list): feature column names fed to the transformer.
        pars (dict): must contain 'coly' (target column name); optional
            'path_pipeline_export' / 'path_features_store' for persistence.

    Returns:
        (pd.DataFrame, dict): generated feature frame and column metadata.
    """
    pars_encoder = pars
    pars_encoder['cols'] = col
    if 'path_pipeline_export' in pars:
        # Best-effort reload of previously exported artifacts; any failure
        # (missing files, unpickling errors) falls through to refitting.
        try:
            pars_encoder = load(pars['path_pipeline_export'] + '/col_genetic_pars.pkl')
            model_encoder = load(pars['path_pipeline_export'] + '/col_genetic_model.pkl')
            col_encoder = load(pars['path_pipeline_export'] + '/col_genetic.pkl')
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            pass

    ###################################################################################
    coly = pars['coly']
    from gplearn.genetic import SymbolicTransformer
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                    'abs', 'neg', 'inv', 'tan']
    gp = SymbolicTransformer(generations=20, population_size=200,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=0, n_jobs=6)

    gen_feats = gp.fit_transform(df[col], df[coly])
    gen_feats = pd.DataFrame(
        gen_feats,
        columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = df.index
    dfnew = gen_feats
    # (removed a no-op list-comprehension rebuild of dfnew.columns)

    ###################################################################################
    colnew = list(dfnew.columns)
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(dfnew, 'dfgen', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + "/col_genetic_model.pkl")
        save(pars_encoder, pars['path_pipeline_export'] + "/col_genetic_pars.pkl")
        save(colnew, pars['path_pipeline_export'] + "/col_genetic.pkl")

    col_pars = {'model': gp}
    col_pars['cols_new'] = {
        'col_genetic': colnew  ### list
    }
    return dfnew, col_pars
def pd_col_genetic_transform(df=None, col=None, pars=None):
    # Generate GP features for train/test/validation splits.
    # NOTE(review): the df/col/pars parameters are unused; train_X, train_y,
    # test_X and val_X are read from enclosing/global scope and must exist
    # at call time — otherwise this raises NameError. Confirm intent.
    num_gen = 20
    num_comp = 10
    function_set = ['add', 'sub', 'mul', 'div',
                    'sqrt', 'log', 'abs', 'neg', 'inv', 'tan']
    gp = SymbolicTransformer(generations=num_gen, population_size=200,
                             hall_of_fame=100, n_components=num_comp,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=0, n_jobs=6)

    # Fit on the training split and append the evolved columns.
    gen_feats = gp.fit_transform(train_X, train_y)
    gen_feats = pd.DataFrame(gen_feats, columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = train_X.index
    train_X_all = pd.concat((train_X, gen_feats), axis=1)

    # Re-use the fitted transformer on the test split.
    gen_feats = gp.transform(test_X)
    gen_feats = pd.DataFrame(gen_feats, columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = test_X.index
    test_X_all = pd.concat((test_X, gen_feats), axis=1)

    # And on the validation split.
    gen_feats = gp.transform(val_X)
    gen_feats = pd.DataFrame(gen_feats, columns=["gen_" + str(a) for a in range(gen_feats.shape[1])])
    gen_feats.index = val_X.index
    val_X_all = pd.concat((val_X, gen_feats), axis=1)
    return train_X_all, test_X_all, val_X_all
def symbolic_transformer(X, y, encoder=None):
    """Derive new columns from X via symbolic (genetic) transformation.

    Args:
        X (DataFrame): Independent features
        y (Series): Dependent feature or target
        encoder (obj, optional): Pre-built 'SymbolicTransformer'; a default
            one is created when None. Defaults to None.

    Returns:
        tuple: (generated feature matrix, fitted encoder)
    """
    if encoder is None:
        ops = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
               'abs', 'neg', 'inv', 'max', 'min']
        encoder = SymbolicTransformer(generations=10,
                                      population_size=1000,
                                      hall_of_fame=100,
                                      n_components=12,
                                      function_set=ops,
                                      parsimony_coefficient=0.0005,
                                      max_samples=0.9,
                                      verbose=1,
                                      random_state=123,
                                      n_jobs=-1)
    encoder.fit(X, y)
    derived = encoder.transform(X)
    return derived, encoder
def test_sample_weight(): """Check sample_weight param works""" # Check constant sample_weight has no effect sample_weight = np.ones(boston.target.shape[0]) est1 = SymbolicRegressor(generations=2, random_state=0) est1.fit(boston.data, boston.target) est2 = SymbolicRegressor(generations=2, random_state=0) est2.fit(boston.data, boston.target, sample_weight=sample_weight) # And again with a scaled sample_weight est3 = SymbolicRegressor(generations=2, random_state=0) est3.fit(boston.data, boston.target, sample_weight=sample_weight * 1.1) assert_almost_equal(est1._program.fitness_, est2._program.fitness_) assert_almost_equal(est1._program.fitness_, est3._program.fitness_) # And again for the transformer sample_weight = np.ones(boston.target.shape[0]) est1 = SymbolicTransformer(generations=2, random_state=0) est1 = est1.fit_transform(boston.data, boston.target) est2 = SymbolicTransformer(generations=2, random_state=0) est2 = est2.fit_transform(boston.data, boston.target, sample_weight=sample_weight) assert_array_almost_equal(est1, est2)
def Genetic_P(dataset, target):
    """Augment *dataset* with genetic-programming features.

    Evolves 15 new features from all columns except `target` and the
    'mean_per_hour' column, then concatenates them onto the original frame.

    Args:
        dataset (pd.DataFrame): input data; must contain `target` and a
            'mean_per_hour' column.
        target (str): name of the target column.

    Returns:
        pd.DataFrame: dataset plus generated features, NaN rows dropped.
    """
    append = 'mean_per_hour'
    y = dataset[target]
    X = dataset.copy()
    X = X.drop(target, axis=1)
    X = X.drop(append, axis=1)
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs',
                    'neg', 'inv', 'max', 'min', 'sin', 'cos', 'tan']
    # NOTE: `random_seed` must be defined in the enclosing/global scope.
    gp = SymbolicTransformer(generations=20, population_size=2000,
                             hall_of_fame=100, n_components=15,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=random_seed, n_jobs=3)
    gp_features = gp.fit_transform(X, y)
    # Fixed the log-message typo ("programing") and dropped an unused
    # `a = dataset[append]` local.
    print('Number of features created out of genetic programming: {}'.format(
        gp_features.shape))
    # New features get a fresh frame re-indexed to match the input rows.
    n = pd.DataFrame(gp_features)
    n = n.set_index(dataset.index.values)
    new_X = pd.concat([dataset, n], axis=1)
    new_X = new_X.dropna()
    return new_X
def test_transformer_iterable():
    """Check that the transformer is iterable"""
    random_state = check_random_state(415)
    X = np.reshape(random_state.uniform(size=50), (5, 10))
    y = random_state.uniform(size=5)
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                    'abs', 'neg', 'inv', 'max', 'min']
    est = SymbolicTransformer(population_size=500, generations=2,
                              function_set=function_set, random_state=0)

    # Check unfitted: iterating before fit yields no programs.
    unfitted_len = len(est)
    unfitted_iter = [gp.length_ for gp in est]
    expected_iter = []
    assert_true(unfitted_len == 0)
    assert_true(unfitted_iter == expected_iter)

    # Check fitted: program lengths are pinned for this seed.
    est.fit(X, y)
    fitted_len = len(est)
    fitted_iter = [gp.length_ for gp in est]
    expected_iter = [8, 12, 2, 29, 9, 33, 9, 8, 4, 22]
    assert_true(fitted_len == 10)
    assert_true(fitted_iter == expected_iter)

    # Check IndexError past the final program.
    assert_raises(IndexError, est.__getitem__, 10)
def test_output_shape():
    """Check output shape is as expected"""
    rng = check_random_state(415)
    X = rng.uniform(size=50).reshape(5, 10)
    y = rng.uniform(size=5)
    # A fitted transformer must emit exactly n_components columns.
    est = SymbolicTransformer(n_components=5, generations=2, random_state=0)
    est.fit(X, y)
    assert_true(est.transform(X).shape == (5, 5))
class GplearnDemo(object):
    """Demo: engineer features with gplearn and compare Ridge models.

    Fits a SymbolicTransformer on half of the Boston housing data, then
    contrasts Ridge regression MSE with and without the evolved features.
    """

    def __init__(self):
        # data prepare
        self.__boston = None
        self.__boston_feature = None
        self.__boston_label = None
        self.__train_feature, self.__test_feature = [None for _ in range(2)]
        self.__train_label, self.__test_label = [None for _ in range(2)]
        self.__transformer = None
        self.__gp_train_feature = None
        self.__gp_test_feature = None
        # model fit
        self.__regressor = None

    def data_prepare(self):
        """Load Boston data, split it 50/50 and fit the transformer."""
        self.__boston = load_boston()
        self.__boston_feature = pd.DataFrame(
            self.__boston.data, columns=self.__boston.feature_names)
        self.__boston_label = pd.Series(
            self.__boston.target).to_frame("TARGET").squeeze()
        self.__train_feature, self.__test_feature, self.__train_label, self.__test_label = (
            train_test_split(self.__boston_feature,
                             self.__boston_label,
                             test_size=0.5,
                             shuffle=True))
        # Input must contain no missing values.
        self.__transformer = SymbolicTransformer(n_jobs=4)
        self.__transformer.fit(self.__train_feature, self.__train_label)
        self.__gp_train_feature = self.__transformer.transform(
            self.__train_feature)
        self.__gp_test_feature = self.__transformer.transform(
            self.__test_feature)

    def model_fit_predict(self):
        """Print test MSE for Ridge on raw vs raw+GP features."""
        # Baseline: raw features only.
        self.__regressor = Ridge()
        self.__regressor.fit(self.__train_feature, self.__train_label)
        print(
            mean_squared_error(self.__test_label,
                               self.__regressor.predict(self.__test_feature)))
        # Augmented: raw features stacked with the evolved ones.
        self.__regressor = Ridge()
        self.__regressor.fit(
            np.hstack((self.__train_feature.values, self.__gp_train_feature)),
            self.__train_label)
        print(
            mean_squared_error(
                self.__test_label,
                self.__regressor.predict(
                    np.hstack((self.__test_feature.values,
                               self.__gp_test_feature)))))
def pd_col_genetic_transform(df=None, col=None, pars=None):
    """Find symbolic formulae for feature engineering.

    Args:
        df (pd.DataFrame): input data containing feature columns and target.
        col (list): candidate columns; the target ('coly') is excluded here.
        pars (dict): must contain 'coly'; optional 'pars_genetic' overrides,
            plus 'path_features_store' / 'path_pipeline_export' for export.

    Returns:
        (pd.DataFrame, dict): generated features and {'model', 'pars',
        'cols_new'} metadata.
    """
    prefix = 'col_genetic'
    ######################################################################################
    from gplearn.genetic import SymbolicTransformer
    coly = pars['coly']
    colX = [t for t in col if t not in [coly]]
    train_X = df[colX]
    train_y = df[coly]
    function_set = ['add', 'sub', 'mul', 'div',
                    'sqrt', 'log', 'abs', 'neg', 'inv', 'tan']
    # Caller-supplied GP hyper-parameters override these defaults.
    pars_genetic = pars.get('pars_genetic', {
        'generations': 20,
        'n_components': 10,
        'population_size': 200
    })
    gp = SymbolicTransformer(hall_of_fame=100,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9,
                             verbose=1,
                             random_state=0,
                             n_jobs=6,
                             **pars_genetic)
    gp.fit(train_X, train_y)
    df_genetic = gp.transform(train_X)
    df_genetic = pd.DataFrame(
        df_genetic,
        columns=["gen_" + str(a) for a in range(df_genetic.shape[1])])
    df_genetic.index = train_X.index
    col_genetic = list(df_genetic.columns)

    ###################################################################################
    # Persist the features, fitted model and parameters when both export
    # paths are configured.
    if 'path_features_store' in pars and 'path_pipeline_export' in pars:
        save_features(df_genetic, 'df_genetic', pars['path_features_store'])
        save(gp, pars['path_pipeline_export'] + f"/{prefix}_model.pkl")
        save(col_genetic, pars['path_pipeline_export'] + f"/{prefix}.pkl")
        save(pars_genetic, pars['path_pipeline_export'] + f"/{prefix}_pars.pkl")

    col_pars = {'model': gp, 'pars': pars_genetic}
    col_pars['cols_new'] = {
        'col_genetic': col_genetic  ### list
    }
    return df_genetic, col_pars
def feature_generate(self):
    """Fit a SymbolicTransformer on the training split and transform
    both the training and test feature sets.
    """
    # Spearman metric evolves features by rank correlation with the label.
    self.__genetic_transformer = SymbolicTransformer(population_size=10000,
                                                     generations=200,
                                                     tournament_size=200,
                                                     metric="spearman",
                                                     n_jobs=-1,
                                                     verbose=1)
    self.__genetic_transformer.fit(self.__train_feature, self.__train_label)
    self.__genetic_train_feature = self.__genetic_transformer.transform(
        self.__train_feature)
    self.__genetic_test_feature = self.__genetic_transformer.transform(
        self.__test_feature)
def symbolic_features(p_x, p_y):
    """
    Create non-linear regressors via symbolic transformation.

    Parameters
    ----------
    p_x: pd.DataFrame with regressors or predictor variables
    p_y: pd.DataFrame with variable to predict

    Returns
    -------
    results: dict with keys 'fit' (original + generated features as a
        numpy array), 'params' (transformer parameters) and 'model'
        (the fitted SymbolicTransformer)
    """
    # NOTE(review): random_state=None makes each run non-deterministic;
    # pass a fixed seed to reproduce results.
    model = SymbolicTransformer(function_set=["sub", "add", 'inv', 'mul',
                                              'div', 'abs', 'log', "max",
                                              "min", "sin", "cos"],
                                population_size=5000, hall_of_fame=100,
                                n_components=20, generations=20,
                                tournament_size=20, stopping_criteria=.05,
                                const_range=None, init_depth=(4, 12),
                                metric='pearson', parsimony_coefficient=0.001,
                                p_crossover=0.4, p_subtree_mutation=0.2,
                                p_hoist_mutation=0.1, p_point_mutation=0.3,
                                p_point_replace=.05, verbose=1,
                                random_state=None, n_jobs=-1,
                                feature_names=p_x.columns, warm_start=True)
    # Fit on data up to '01-01-2019' only; the full series is transformed
    # below with the fitted model.
    init = model.fit_transform(p_x[:'01-01-2019'], p_y[:'01-01-2019'])
    model_params = model.get_params()
    gp_features = model.transform(p_x)
    model_fit = np.hstack((p_x, gp_features))
    results = {'fit': model_fit, 'params': model_params, 'model': model}
    return results
def main():
    """Load train/test samples, median-impute NaNs and run the GP test."""
    with timer('读取文件时间'):
        train = pd.read_csv('train_541.csv', nrows=10000)
        test = pd.read_csv('test_541.csv', nrows=10000)
        print('Training set full shape: ', train.shape)
        print('Testing set full shape: ', test.shape)
    function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt',
                    'abs', 'neg', 'max', 'min']
    gp1 = SymbolicTransformer(generations=1, population_size=1000,
                              hall_of_fame=600, n_components=100,
                              function_set=function_set,
                              parsimony_coefficient=0.0005,
                              max_samples=0.9, verbose=1,
                              random_state=0, n_jobs=3)
    # BUG FIX: the original filled NaNs with the literal strings 'median' /
    # 'mdeian' (typo) rather than the column medians, corrupting numeric
    # columns; impute with the actual per-column median instead.
    train.fillna(train.median(), inplace=True)
    test.fillna(test.median(), inplace=True)
    print('填充完毕')
    with timer('test pg1'):
        test_gp(gp1, 100, train, test, foldername='pg1')
def test_pipeline(): """Check that SymbolicRegressor/Transformer can work in a pipeline""" # Check the regressor est = make_pipeline(StandardScaler(), SymbolicRegressor(population_size=50, generations=5, tournament_size=5, random_state=0)) est.fit(boston.data, boston.target) assert_almost_equal(est.score(boston.data, boston.target), -4.00270923) # Check the classifier est = make_pipeline(StandardScaler(), SymbolicClassifier(population_size=50, generations=5, tournament_size=5, random_state=0)) est.fit(cancer.data, cancer.target) assert_almost_equal(est.score(cancer.data, cancer.target), 0.934973637961) # Check the transformer est = make_pipeline(SymbolicTransformer(population_size=50, hall_of_fame=20, generations=5, tournament_size=5, random_state=0), DecisionTreeRegressor()) est.fit(boston.data, boston.target) assert_almost_equal(est.score(boston.data, boston.target), 1.0)
def test_pickle(): """Check pickability""" # Check the regressor est = SymbolicRegressor(generations=2, random_state=0) est.fit(boston.data[:100, :], boston.target[:100]) score = est.score(boston.data[500:, :], boston.target[500:]) pickle_object = pickle.dumps(est) est2 = pickle.loads(pickle_object) assert_equal(type(est2), est.__class__) score2 = est2.score(boston.data[500:, :], boston.target[500:]) assert_equal(score, score2) # Check the transformer est = SymbolicTransformer(generations=2, random_state=0) est.fit(boston.data[:100, :], boston.target[:100]) X_new = est.transform(boston.data[500:, :]) pickle_object = pickle.dumps(est) est2 = pickle.loads(pickle_object) assert_equal(type(est2), est.__class__) X_new2 = est2.transform(boston.data[500:, :]) assert_array_almost_equal(X_new, X_new2) # Check the classifier est = SymbolicClassifier(generations=2, random_state=0) est.fit(cancer.data[:100, :], cancer.target[:100]) score = est.score(cancer.data[500:, :], cancer.target[500:]) pickle_object = pickle.dumps(est) est2 = pickle.loads(pickle_object) assert_equal(type(est2), est.__class__) score2 = est2.score(cancer.data[500:, :], cancer.target[500:]) assert_equal(score, score2)
def test_parallel_train(): """Check predictions are the same for different n_jobs""" # Check the regressor ests = [ SymbolicRegressor(population_size=100, generations=4, n_jobs=n_jobs, random_state=0).fit(boston.data[:100, :], boston.target[:100]) for n_jobs in [1, 2, 3, 8, 16] ] preds = [e.predict(boston.data[500:, :]) for e in ests] for pred1, pred2 in zip(preds, preds[1:]): assert_array_almost_equal(pred1, pred2) lengths = np.array([[gp.length_ for gp in e._programs[-1]] for e in ests]) for len1, len2 in zip(lengths, lengths[1:]): assert_array_almost_equal(len1, len2) # Check the transformer ests = [ SymbolicTransformer(population_size=100, hall_of_fame=50, generations=4, n_jobs=n_jobs, random_state=0).fit(boston.data[:100, :], boston.target[:100]) for n_jobs in [1, 2, 3, 8, 16] ] preds = [e.transform(boston.data[500:, :]) for e in ests] for pred1, pred2 in zip(preds, preds[1:]): assert_array_almost_equal(pred1, pred2) lengths = np.array([[gp.length_ for gp in e._programs[-1]] for e in ests]) for len1, len2 in zip(lengths, lengths[1:]): assert_array_almost_equal(len1, len2)
def fit(self, X, y=None, state={}):
    """Fit a SymbolicTransformer on (X, y) and stash the fitted model
    under self.state['genetic']['fit'].

    NOTE(review): the mutable default `state={}` is shared across calls
    and appears unused here (results go to self.state) — confirm with
    callers before changing the signature.
    """
    # Custom exponential primitive built from the module-level `exponent`.
    exponential = make_function(function=exponent, name='exp', arity=1)
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs',
                    'neg', 'inv', 'max', 'min', 'tan', 'sin', 'cos',
                    exponential]
    gp = SymbolicTransformer(generations=self.generations,
                             population_size=self.population,
                             hall_of_fame=self.hall_of_fame,
                             n_components=self.components,
                             function_set=function_set,
                             parsimony_coefficient='auto',
                             max_samples=0.6, verbose=1,
                             metric=self.metric, random_state=0, n_jobs=7)
    self.state['genetic'] = {}
    self.state['genetic']['fit'] = gp.fit(X, y)
    return self
def test_early_stopping():
    """Check that early stopping works"""
    X, y = boston.data[:400, :], boston.target[:400]
    # A generous stopping criterion halts evolution after one generation
    # for both estimator types.
    for est in (SymbolicRegressor(stopping_criteria=10, random_state=0),
                SymbolicTransformer(stopping_criteria=0.5, random_state=0)):
        est.fit(X, y)
        assert_true(len(est._programs) == 1)
def symbolicLearning(df_list):
    '''
    Generate GP features from df_list with a gplearn SymbolicTransformer.

    :param df_list: array-like convertible to a pd.DataFrame
    :return: pd.DataFrame of the generated features

    NOTE(review): `gp.transform` is called without a prior `fit`, which
    raises NotFittedError in gplearn — a target series is required to fit
    first. Also, the new column names are derived from len(function_set)
    (10), which only matches n_components=10 by coincidence; confirm the
    intended naming before relying on it.
    '''
    df_list = pd.DataFrame(df_list)
    function_set = ['add', 'sub', 'mul', 'div', 'log', 'sqrt',
                    'abs', 'neg', 'max', 'min']
    gp = SymbolicTransformer(generations=10, population_size=1000,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=0, n_jobs=3)
    gp_feature = gp.transform(df_list)
    new_feature_name = [str(i) + 'V' for i in range(1, len(function_set) + 1)]
    new_feature = pd.DataFrame(gp_feature, columns=new_feature_name)
    return new_feature
def data_prepare(self):
    """Load Boston data, split it 50/50 and fit a SymbolicTransformer.

    Populates the train/test feature and label attributes plus the
    transformed (GP) feature matrices for both splits.
    """
    self.__boston = load_boston()
    self.__boston_feature = pd.DataFrame(
        self.__boston.data, columns=self.__boston.feature_names)
    self.__boston_label = pd.Series(
        self.__boston.target).to_frame("TARGET").squeeze()
    self.__train_feature, self.__test_feature, self.__train_label, self.__test_label = (
        train_test_split(self.__boston_feature,
                         self.__boston_label,
                         test_size=0.5,
                         shuffle=True))
    # Input must contain no missing values.
    self.__transformer = SymbolicTransformer(n_jobs=4)
    self.__transformer.fit(self.__train_feature, self.__train_label)
    self.__gp_train_feature = self.__transformer.transform(
        self.__train_feature)
    self.__gp_test_feature = self.__transformer.transform(
        self.__test_feature)
def test_function_in_program():
    """Check that using a custom function in a program works"""

    def logic(x1, x2, x3, x4):
        # Element-wise select: x3 where x1 > x2, else x4.
        return np.where(x1 > x2, x3, x4)

    logical = make_function(function=logic, name='logical', arity=4)
    function_set = ['add', 'sub', 'mul', 'div', logical]
    est = SymbolicTransformer(generations=2, population_size=2000,
                              hall_of_fame=100, n_components=10,
                              function_set=function_set,
                              parsimony_coefficient=0.0005,
                              max_samples=0.9, random_state=0)
    est.fit(boston.data[:300, :], boston.target[:300])

    # Program 906 of generation 0 is pinned for this seed and must embed
    # the custom primitive in its formula.
    formula = est._programs[0][906].__str__()
    expected_formula = 'sub(logical(X6, add(X11, 0.898), X10, X2), X5)'
    assert_equal(expected_formula, formula, True)
def data_prepare(self):
    """Load two-class digits, scale them and build GP-augmented features."""
    self.__digists = load_digits(n_class=2)
    self.__X = self.__digists.data
    self.__y = self.__digists.target
    self.__train, self.__test, self.__train_label, self.__test_label = train_test_split(
        self.__X, self.__y, test_size=0.2, random_state=9)
    # standard scaler (fitted on train only to avoid leakage)
    scaler = StandardScaler().fit(self.__train)
    self.__train = scaler.transform(self.__train)
    self.__test = scaler.transform(self.__test)
    # gp feature
    function_set = ("add", "sub", "mul", "div", "sqrt", "log",
                    "abs", "neg", "inv", "max", "min")
    gp = SymbolicTransformer(generations=5, population_size=2000,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=1,
                             random_state=0, n_jobs=3)
    # Obtaining the genetic features via stacking would arguably be sounder.
    gp.fit(self.__train, self.__train_label)
    self.__train_gfeature = np.hstack(
        (self.__train, gp.transform(self.__train)))
    self.__test_gfeature = np.hstack(
        (self.__test, gp.transform(self.__test)))
def test_custom_functions():
    """Test the custom programs example works"""
    rng = check_random_state(0)
    boston = load_boston()
    # Shuffle the data deterministically.
    perm = rng.permutation(boston.target.size)
    boston.data = boston.data[perm]
    boston.target = boston.target[perm]

    def logic(x1, x2, x3, x4):
        # Element-wise select: x3 where x1 > x2, else x4.
        return np.where(x1 > x2, x3, x4)

    logical = make_function(function=logic, name='logical', arity=4)
    function_set = ['add', 'sub', 'mul', 'div', logical]
    gp = SymbolicTransformer(generations=2, population_size=2000,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, random_state=0)
    gp.fit(boston.data[:300, :], boston.target[:300])

    # Program 906 of generation 0 is pinned for this seed.
    assert_equal(gp._programs[0][906].__str__(),
                 'sub(logical(X6, add(X11, 0.898), X10, X2), X5)')

    # Its graphviz export must match the documented dot output exactly.
    dot_data = gp._programs[0][906].export_graphviz()
    expected = ('digraph program {\nnode [style=filled]\n0 [label="sub", '
                'fillcolor="#136ed4"] ;\n1 [label="logical", '
                'fillcolor="#136ed4"] ;\n2 [label="X6", fillcolor="#60a6f6"] '
                ';\n3 [label="add", fillcolor="#136ed4"] ;\n4 [label="X11", '
                'fillcolor="#60a6f6"] ;\n5 [label="0.898", '
                'fillcolor="#60a6f6"] ;\n3 -> 5 ;\n3 -> 4 ;\n6 [label="X10", '
                'fillcolor="#60a6f6"] ;\n7 [label="X2", fillcolor="#60a6f6"] '
                ';\n1 -> 7 ;\n1 -> 6 ;\n1 -> 3 ;\n1 -> 2 ;\n8 [label="X5", '
                'fillcolor="#60a6f6"] ;\n0 -> 8 ;\n0 -> 1 ;\n}')
    assert_equal(dot_data, expected)
def gp_features(df, target, random_state, generations=5,
                function_set=['add', 'sub', 'mul', 'div']):
    """Fit a SymbolicTransformer on df (minus `target`) and apply
    `gp_transform` to the frame.

    Returns the transformed frame together with the fitted transformer's
    bound `transform` callable.
    """
    predictors = df.loc[:, (df.columns != target)]
    labels = df.loc[:, target]
    transformer = SymbolicTransformer(generations=generations,
                                      population_size=1000,
                                      hall_of_fame=100,
                                      n_components=12,
                                      function_set=function_set,
                                      parsimony_coefficient=0.0005,
                                      max_samples=0.9,
                                      verbose=0,
                                      random_state=random_state,
                                      n_jobs=-1)
    # Categorical predictors are one-hot encoded for the fit step only.
    transformer.fit(pd.get_dummies(predictors), labels)
    result = gp_transform(df, transformer.transform, predictors)
    return result, transformer.transform
def get_feature_symbolic_learning(df, gp_config):
    """
    Parameters
    ----------
    df: pd.DataFrame, the input dataFrame.
    gp_config: GPConfig object, the config object of
        gplearn.SymbolicTransformer.

    Returns
    -------
    df_t: pd.DataFrame, df with the features of SymbolicTransformer trans.
        The new features named like 'symbolic_component_{0 to n}'
        (n is the n_components)
    """
    gp = SymbolicTransformer(
        generations=gp_config.generation,
        population_size=gp_config.population_size,
        hall_of_fame=gp_config.hall_of_fame,
        n_components=gp_config.n_components,
        function_set=gp_config.function_set,
        parsimony_coefficient=gp_config.parsimony_coefficient,
        max_samples=gp_config.max_samples,
        verbose=1,
        random_state=0,
        n_jobs=3)
    X = df[gp_config.feature_cols]
    y = df[gp_config.target_col]
    # NOTE(review): no random_state is passed to train_test_split, so the
    # fit subset (and thus the evolved features) differs between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        shuffle=True)
    gp.fit(X_train, y_train)
    names = [
        "symbolic_component_" + str(i) for i in range(gp_config.n_components)
    ]
    # NOTE(review): `res` gets a fresh RangeIndex; the concat below aligns
    # row-for-row only if df also has a RangeIndex — verify callers.
    res = pd.DataFrame(gp.transform(X), columns=names)
    df_t = pd.concat([df, res], axis=1)
    return df_t
def test_early_stopping():
    """Check that early stopping works"""
    cases = [
        (SymbolicRegressor(population_size=100, generations=2,
                           stopping_criteria=10, random_state=0),
         boston.data[:400, :], boston.target[:400]),
        (SymbolicTransformer(population_size=100, generations=2,
                             stopping_criteria=0.5, random_state=0),
         boston.data[:400, :], boston.target[:400]),
        (SymbolicClassifier(population_size=100, generations=2,
                            stopping_criteria=.9, random_state=0),
         cancer.data[:400, :], cancer.target[:400]),
    ]
    # Each criterion is met immediately, so evolution stops after a single
    # generation for every estimator type.
    for est, X, y in cases:
        est.fit(X, y)
        assert (len(est._programs) == 1)
def test_input_shape():
    """Check changed dimensions cause failure"""
    rng = check_random_state(415)
    X = rng.uniform(size=50).reshape(5, 10)
    y = rng.uniform(size=5)
    X2 = rng.uniform(size=45).reshape(5, 9)

    # Both estimator types must reject data whose width differs from fit.
    reg = SymbolicRegressor(generations=2, random_state=0)
    reg.fit(X, y)
    assert_raises(ValueError, reg.predict, X2)

    trans = SymbolicTransformer(generations=2, random_state=0)
    trans.fit(X, y)
    assert_raises(ValueError, trans.transform, X2)
def test_custom_transformer_metrics():
    """Check whether greater_is_better works for SymbolicTransformer."""
    est_gp = SymbolicTransformer(generations=2, population_size=100,
                                 hall_of_fame=10, n_components=1,
                                 metric='pearson', random_state=415)
    est_gp.fit(boston.data, boston.target)
    # n_components=1, so this loop runs once; the sole program's formula is
    # pinned for this seed.
    for program in est_gp:
        formula = program.__str__()
        expected_formula = ('sub(div(mul(X4, X12), div(X9, X9)), '
                            'sub(div(X11, X12), add(X12, X0)))')
        assert_equal(expected_formula, formula, True)

    def _neg_weighted_pearson(y, y_pred, w):
        """Calculate the weighted Pearson correlation coefficient."""
        with np.errstate(divide='ignore', invalid='ignore'):
            y_pred_demean = y_pred - np.average(y_pred, weights=w)
            y_demean = y - np.average(y, weights=w)
            corr = ((np.sum(w * y_pred_demean * y_demean) / np.sum(w)) /
                    np.sqrt((np.sum(w * y_pred_demean**2) *
                             np.sum(w * y_demean**2)) / (np.sum(w)**2)))
        # Negate so that "smaller is better"; non-finite correlations map
        # to the worst value, 0.
        if np.isfinite(corr):
            return -1 * np.abs(corr)
        return 0.

    neg_weighted_pearson = make_fitness(function=_neg_weighted_pearson,
                                        greater_is_better=False)

    # The negated metric with greater_is_better=False must evolve the same
    # program as the built-in 'pearson' metric.
    c_est_gp = SymbolicTransformer(generations=2, population_size=100,
                                   hall_of_fame=10, n_components=1,
                                   stopping_criteria=-1,
                                   metric=neg_weighted_pearson,
                                   random_state=415)
    c_est_gp.fit(boston.data, boston.target)
    for program in c_est_gp:
        c_formula = program.__str__()
        assert_equal(expected_formula, c_formula, True)
def getSymbolTrans(train, valid, y, random_state=888):
    """Fit a SymbolicTransformer on (train, y) and append its features to
    both the training and validation frames.

    Args:
        train (pd.DataFrame): training features.
        valid (pd.DataFrame): validation features.
        y (pd.Series): training target.
        random_state (int): seed for the transformer. Defaults to 888.

    Returns:
        tuple: (X_train, X_valid) with ST_1..ST_n columns joined on and
        NaNs filled with 0.
    """
    X_train = train.copy()
    X_valid = valid.copy()
    y_train = y.copy()
    function_set = ['add', 'sub', 'mul', 'div', 'sqrt', 'log',
                    'abs', 'neg', 'inv', 'max', 'min']
    # BUG FIX: `random_state` was previously accepted but ignored (the
    # transformer hard-coded random_state=0); pass it through so callers
    # actually control reproducibility.
    gp = SymbolicTransformer(generations=20, population_size=2000,
                             hall_of_fame=100, n_components=10,
                             function_set=function_set,
                             parsimony_coefficient=0.0005,
                             max_samples=0.9, verbose=0,
                             random_state=random_state, n_jobs=3)
    gp.fit(X_train, y_train)

    def _append_features(frame):
        # Transform `frame`, name the new columns ST_i, join them on.
        feats = pd.DataFrame(gp.transform(frame))
        feats.columns = [
            "ST_" + str(i) for i in range(1, feats.shape[1] + 1)
        ]
        # NOTE: join aligns on index; `feats` has a fresh RangeIndex, so
        # rows only line up when `frame` is RangeIndex-ed too — the
        # fillna(0) below papers over any misalignment (behaviour kept
        # from the original).
        return frame.join(feats).fillna(0)

    return (_append_features(X_train), _append_features(X_valid))