def test_symbolic_classifier():
    """Check that SymbolicClassifier example works"""
    random_state = check_random_state(0)
    cancer = load_breast_cancer()
    # Shuffle samples and labels with the same permutation.
    shuffle_idx = random_state.permutation(cancer.target.size)
    cancer.data = cancer.data[shuffle_idx]
    cancer.target = cancer.target[shuffle_idx]

    clf = SymbolicClassifier(parsimony_coefficient=.01,
                             feature_names=cancer.feature_names,
                             random_state=1)
    clf.fit(cancer.data[:400], cancer.target[:400])

    # Hold-out AUC must match the documented example value exactly.
    labels = cancer.target[400:]
    probas = clf.predict_proba(cancer.data[400:])[:, 1]
    assert_almost_equal(roc_auc_score(labels, probas), 0.96937869822485212)

    # The exported program graph must match the documented dot output.
    dot_data = clf._program.export_graphviz()
    expected = ('digraph program {\nnode [style=filled]\n0 [label="sub", '
                'fillcolor="#136ed4"] ;\n1 [label="div", fillcolor="#136ed4"] '
                ';\n2 [label="worst fractal dimension", fillcolor="#60a6f6"] '
                ';\n3 [label="mean concave points", fillcolor="#60a6f6"] '
                ';\n1 -> 3 ;\n1 -> 2 ;\n4 [label="mul", fillcolor="#136ed4"] '
                ';\n5 [label="mean concave points", fillcolor="#60a6f6"] ;\n6 '
                '[label="area error", fillcolor="#60a6f6"] ;\n4 -> 6 ;\n4 -> '
                '5 ;\n0 -> 4 ;\n0 -> 1 ;\n}')
    assert_equal(dot_data, expected)
def test_symbolic_classifier_comparison():
    """Test the classifier comparison example works"""
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)

    # Three toy problems: moons, circles, and the noised linearly-separable set.
    datasets = [make_moons(noise=0.3, random_state=0),
                make_circles(noise=0.2, factor=0.5, random_state=1),
                (X, y)]

    scores = []
    for features, labels in datasets:
        features = StandardScaler().fit_transform(features)
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=.4, random_state=42)
        clf = SymbolicClassifier(random_state=0)
        clf.fit(X_train, y_train)
        # Format like the published comparison table: '.95', not '0.95'.
        scores.append(('%.2f' % clf.score(X_test, y_test)).lstrip('0'))

    assert_equal(scores, ['.95', '.93', '.95'])
def main(train_file_name, valid_file_name, test_file_name):
    """Train a SymbolicClassifier on the given files and print metrics.

    Loads train/validation/test splits, fits a genetic-programming
    classifier, tunes the decision threshold on the validation split and
    reports accuracy/precision/recall/f-beta for train and validation.

    Parameters
    ----------
    train_file_name, valid_file_name, test_file_name : str
        File names forwarded to ``load_process_data``.
    """
    # X_test is loaded for parity with load_process_data's interface but is
    # not scored here.
    X_train, y_train, X_validation, y_validation, X_test = \
        load_process_data(train_file_name, valid_file_name, test_file_name)

    gp_classifier = SymbolicClassifier(
        population_size=20,
        generations=65,
        tournament_size=3,
        const_range=None,
        init_depth=(4, 12),
        # Effectively disables the bloat penalty while keeping it non-zero
        # (was written out as 32 literal zeros; 1e-32 is the same float).
        parsimony_coefficient=1e-32,
        function_set=('add', 'sub', 'mul', 'div'),
        transformer='sigmoid',
        p_crossover=0.85,
        p_subtree_mutation=0.04,
        p_hoist_mutation=0.01,
        p_point_mutation=0.04,
        p_point_replace=0.005,
        max_samples=1.0,
        feature_names=None,
        warm_start=False,
        low_memory=True,
        n_jobs=8,
        verbose=1,
        random_state=None)
    gp_classifier.fit(X_train, y_train)

    y_val_proba = gp_classifier.predict_proba(X_validation)
    y_train_proba = gp_classifier.predict_proba(X_train)
    # The decision threshold is tuned on the validation split, then applied
    # to both splits for reporting.
    best_threshold = get_best_threshold(y_val_proba, y_validation)
    y_train_pred = np.where(y_train_proba[:, 1] > best_threshold, 1, 0)
    y_val_pred = np.where(y_val_proba[:, 1] > best_threshold, 1, 0)

    str_header = "$" * 78
    print(str_header)
    print(str_header)
    print('Train accuracy', accuracy_score(y_train, y_train_pred))
    print('Validation accuracy', accuracy_score(y_validation, y_val_pred))
    print('Train precision', precision_score(y_train, y_train_pred))
    print('Validation precision', precision_score(y_validation, y_val_pred))
    print('Train recall', recall_score(y_train, y_train_pred))
    print('Validation recall', recall_score(y_validation, y_val_pred))
    print('Train f-beta score', fbeta_score(y_train, y_train_pred, beta=0.25))
    validation_beta_score = fbeta_score(y_validation, y_val_pred, beta=0.25)
    print(f'Validation f-beta score {validation_beta_score}')
    print(str_header)
    print(str_header)
def train():
    """Fit a SymbolicClassifier on the (module-level) train split and print
    the evolved program plus train/test accuracy."""
    # NOTE(review): X_train/y_train/X_test/y_test/feature_names are expected
    # at module scope — confirm against the surrounding file.
    settings = dict(population_size=250,
                    generations=20,
                    tournament_size=20,
                    stopping_criteria=0.01,
                    parsimony_coefficient=0.001,
                    p_crossover=0.9,
                    p_subtree_mutation=0.05,
                    p_hoist_mutation=0.0025,
                    p_point_mutation=0.01,
                    p_point_replace=0.0025,
                    verbose=1,
                    max_samples=0.9,
                    feature_names=feature_names)
    model = SymbolicClassifier(**settings)
    model.fit(X_train, y_train)
    print(model._program)
    print(model.score(X_train, y_train))
    print(model.score(X_test, y_test))
def Symbolic_reg_expr(X, y):
    """Fit a SymbolicRegressor and return its simplified sympy expression.

    Parameters
    ----------
    X : array-like of shape (n_samples, 10)
        Training features; the evolved program refers to them as X0..X9.
    y : array-like
        Training target.

    Returns
    -------
    sym_reg : sympy expression
        Simplified form of the best evolved program.
    gradients_ : list of sympy expressions
        Partial derivative of ``sym_reg`` with respect to each of X0..X9.
    """
    # BUG FIX: the original built a SymbolicClassifier and immediately
    # overwrote it with the regressor below — the dead assignment is removed.
    est_gp = SymbolicRegressor(population_size=5000,
                               generations=20, stopping_criteria=0.01,
                               p_crossover=0.7, p_subtree_mutation=0.1,
                               p_hoist_mutation=0.05, p_point_mutation=0.1,
                               max_samples=0.9, verbose=0,
                               parsimony_coefficient=0.01, random_state=0)
    est_gp.fit(X, y)
    sym_expr = str(est_gp._program)

    # Map gplearn's function names onto sympy operations before parsing.
    converter = {
        'sub': lambda x, y: x - y,
        'div': lambda x, y: x / y,
        'mul': lambda x, y: x * y,
        'add': lambda x, y: x + y,
        'neg': lambda x: -x,
        'pow': lambda x, y: x**y
    }
    sym_reg = simplify(sympify(sym_expr, locals=converter))

    # NOTE: the original also called sym_reg.subs(...) replacing every symbol
    # with itself — a no-op, removed.
    vars_ = list(symbols('X0 X1 X2 X3 X4 X5 X6 X7 X8 X9'))
    gradients_ = [diff(sym_reg, var) for var in vars_]
    return sym_reg, gradients_
def main():
    """Load the marketing dataset, engineer features, fit a
    SymbolicClassifier and report AUROC and the profit metric on the
    held-out split."""
    seed = 0
    np.random.seed(seed)

    df = Dataset('ml_project1_data.xlsx').rm_df
    y = df['Response']
    X = df.drop(columns='Response')
    training, testing, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    # Re-attach the target so processing/feature-engineering see full frames.
    training['Response'] = y_train
    testing['Response'] = y_test

    pr = Processor(training, testing, seed=0)
    fe = FeatureEngineer(pr.training, pr.unseen, seed=0)
    training = fe.training
    testing = fe.unseen

    est = SymbolicClassifier(generations=200, random_state=0)
    est.fit(training.drop('Response', axis=1), training['Response'])
    assess_generalization_auroc(est, testing, True)

    y_pred = est.predict_proba(testing.drop('Response', axis=1))[:, 1]
    y_true = testing['Response']
    print(profit(y_true, y_pred))

    # NOTE: a large commented-out MLP grid-search experiment was removed here;
    # recover it from version control if that comparison is ever revived.
    plt.show()
def test_parallel_custom_transformer():
    """Regression test for running parallel training with custom transformer"""
    def _sigmoid(x1):
        with np.errstate(over='ignore', under='ignore'):
            return 1 / (1 + np.exp(-x1))

    # A wrapped transformer stays picklable even after parallel training.
    wrapped = make_function(function=_sigmoid, name='sig', arity=1)
    clf = SymbolicClassifier(generations=2, transformer=wrapped,
                             random_state=0, n_jobs=2)
    clf.fit(cancer.data, cancer.target)
    _ = pickle.dumps(clf)

    # Unwrapped functions should fail
    unwrapped = make_function(function=_sigmoid, name='sig', arity=1,
                              wrap=False)
    clf = SymbolicClassifier(generations=2, transformer=unwrapped,
                             random_state=0, n_jobs=2)
    clf.fit(cancer.data, cancer.target)
    assert_raises(AttributeError, pickle.dumps, clf)

    # Single threaded will also fail in non-interactive sessions
    clf = SymbolicClassifier(generations=2, transformer=unwrapped,
                             random_state=0)
    clf.fit(cancer.data, cancer.target)
    assert_raises(AttributeError, pickle.dumps, clf)
def test_custom_classifier_metrics():
    """Check whether greater_is_better works for SymbolicClassifier."""
    features = check_random_state(0).uniform(-1, 1, 100).reshape(50, 2)
    # Binary target: inside/outside the mean squared radius.
    radius_sq = features[:, 0] ** 2 + features[:, 1] ** 2
    labels = (radius_sq < radius_sq.mean()).astype(int)

    # Baseline run with the built-in 'log loss' metric.
    clf = SymbolicClassifier(metric='log loss', stopping_criteria=0.000001,
                             random_state=415, parsimony_coefficient=0.01,
                             init_method='full', init_depth=(2, 4))
    clf.fit(features, labels)
    formula = clf.__str__()
    expected_formula = 'sub(0.364, mul(add(X0, X0), add(X0, X0)))'
    assert_equal(expected_formula, formula, True)

    def negative_log_loss(y, y_pred, w):
        """Calculate the log loss."""
        eps = 1e-15
        y_pred = np.clip(y_pred, eps, 1 - eps)
        score = y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)
        return np.average(score, weights=w)

    # An equivalent custom fitness with greater_is_better=True must evolve
    # the exact same program.
    customized_fitness = make_fitness(negative_log_loss,
                                      greater_is_better=True)
    c_clf = SymbolicClassifier(metric=customized_fitness,
                               stopping_criteria=0.000001, random_state=415,
                               parsimony_coefficient=0.01,
                               init_method='full', init_depth=(2, 4))
    c_clf.fit(features, labels)
    assert_equal(expected_formula, c_clf.__str__(), True)
# NOTE(review): this if/else presumably executes inside the loop that parses
# each data row (``data`` and ``benign`` are defined elsewhere) — confirm
# against the surrounding file.
if int(data[10]) == 2:
    # Not cancerous
    benign.append("benign")
else:
    # Is cancerous
    benign.append("malignant")

classifier = SymbolicClassifier(
    # Prevents 'bloat' used for large programs when evolution is increasing
    # the size of the program with an insignificant increase in fitness
    parsimony_coefficient=.01,
    # The list of attribute names, used in producing the final equation
    feature_names=attributes,
    # Displays each evolutionary state and fitness after each tournament is run
    # Note: If commented the user will need to be patient before final results
    # are displayed
    verbose=1,
    # Stops the program early if the criteria is met. This is to prevent long
    # computation time for minimal gain
    stopping_criteria=0.15,
    # When the population is 500 = ~85% 1000 = ~90% 2000 = ~95%
    population_size=2000,
    # basic functions are all that is required; the inclusion of log functions
    # provides roughly 5% increase in fitness
    function_set={"mul", "div", "add", "sub", "log"})

# The first 400 values in the file are trained and tested against the first
# 400 known values to be benign
classifier.fit(values[:400], benign[:400])
# Returns the accuracy as a percentage from the fitness function
print("Accuracy: " + (classifier.score(values[:400], benign[:400]) * 100).__str__() + "%")
# Returns the function that achieves the above fitness to be entered into a
# tree in a breadth first fashion
print("Function: " + str(classifier._program))
def test_program_input_validation_classifier():
    """Check that guarded input validation raises errors"""

    def assert_fit_raises(**params):
        # Fitting with these parameters must raise ValueError.
        clf = SymbolicClassifier(**params)
        assert_raises(ValueError, clf.fit, cancer.data, cancer.target)

    # Check too much proba
    assert_fit_raises(p_point_mutation=.5)
    # Check invalid init_method
    assert_fit_raises(init_method='ni')
    # Check invalid const_ranges
    for bad_range in (2, [2, 2], (2, 2, 2), 'ni'):
        assert_fit_raises(const_range=bad_range)
    # And check acceptable, but strange, representations of const_range
    for odd_range in ((2, 2), None, (4, 2)):
        clf = SymbolicClassifier(generations=2, const_range=odd_range)
        clf.fit(cancer.data, cancer.target)
    # Check invalid init_depth (the scalar case is deliberately exercised
    # twice, matching the original test)
    for bad_depth in (2, 2, [2, 2], (2, 2, 2), 'ni', (4, 2)):
        assert_fit_raises(init_depth=bad_depth)
    # And check acceptable, but strange, representations of init_depth
    clf = SymbolicClassifier(generations=2, init_depth=(2, 2))
    clf.fit(cancer.data, cancer.target)
    # Check classifier metrics
    for m in ['log loss']:
        clf = SymbolicClassifier(generations=2, metric=m)
        clf.fit(cancer.data, cancer.target)
    # And check a fake one
    assert_fit_raises(generations=2, metric='the larch')
    # Check classifier transformers
    for t in ['sigmoid']:
        clf = SymbolicClassifier(generations=2, transformer=t)
        clf.fit(cancer.data, cancer.target)
    # And check an incompatible one with wrong arity
    assert_fit_raises(generations=2, transformer=sub2)
    # And check a fake one
    assert_fit_raises(generations=2, transformer='the larch')
# NOTE(review): this parsing block presumably runs inside a loop over the
# input file's rows (``data``, ``values``, ``alive`` and ``attributes`` are
# defined elsewhere) — confirm against the surrounding file.
if len(data) > 9:
    temp = []
    # Columns 1-9 hold the attribute values; '?' marks a missing entry,
    # encoded here as -1.
    for i in range(1, 10):
        x = int(data[i]) if data[i] != "?" else -1
        temp.append(x)
    values.append(temp)
    # Column 10 is the class label: 2 means benign, anything else malignant.
    if int(data[10]) == 2:
        alive.append("benign")
    else:
        alive.append("malignant")

# Evolve a symbolic classifier over the parsed rows.
est = SymbolicClassifier(parsimony_coefficient=.01,
                         feature_names=attributes,
                         random_state=10000,
                         verbose=1,
                         stopping_criteria=0.15,
                         population_size=2000,
                         function_set={"mul", "div", "add", "sub", "log"})
# Train and score on the same first 400 rows (training accuracy only).
est.fit(values[:400], alive[:400])
print("Accuracy: " + est.score(values[:400], alive[:400]).__str__())
# noinspection PyProtectedMember
# print("Function: " + str(est._program))
# noinspection PyProtectedMember
# graph = pydotplus.graphviz.graph_from_dot_data(est._program.export_graphviz())
# Image(graph.create_png())
# graph.write_png("dtree.png")
# Wrap the project-defined _accuracy callable as a gplearn fitness measure.
accuracy = make_fitness(_accuracy, greater_is_better=True)

est_gp = SymbolicClassifier(population_size=1000,
                            generations=200,
                            stopping_criteria=0.01,
                            p_crossover=0.7,
                            p_subtree_mutation=0.1,
                            p_hoist_mutation=0.05,
                            p_point_mutation=0.1,
                            max_samples=0.9,
                            verbose=1,
                            # Feature columns are simply named V1..V36.
                            feature_names=tuple('V%d' % n
                                                for n in range(1, 37)),
                            function_set=('add', 'sub', 'mul', 'div'))
est_gp.fit(X_train, y_train)

print('The best individual is : ')
print(est_gp)
print('Training set accuracy is %0.2f%%'
      % (100 * est_gp.score(X_train, y_train)))

# Count index-wise matches between predictions and the test labels.
Predict_value = est_gp.predict(X_test)
count = sum(1 for i in range(len(Predict_value))
            if Predict_value[i] == y_test[i])
print('Test set accuracy is %0.2f%%' % (100 * count / len(Predict_value)))
import pandas as pd
from gplearn.genetic import SymbolicClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

if __name__ == '__main__':
    # creating data structures
    train_set = pd.read_csv("training.txt", sep=" ")
    test_set = pd.read_csv("test.txt", sep=" ")
    x_train = train_set.drop("Target", axis=1)
    x_test = test_set.drop("Target", axis=1)
    y_train, y_test = train_set["Target"], test_set["Target"]

    # Evolve the classifier, then score its hold-out probabilities via AUC.
    est = SymbolicClassifier(parsimony_coefficient=.01,
                             stopping_criteria=0.01,
                             feature_names=list(x_train.columns.values),
                             random_state=3)
    est.fit(x_train, y_train)

    y_true = y_test
    y_score = est.predict_proba(x_test)[:, 1]
    print("Accuracy:", roc_auc_score(y_true, y_score),
          "Program:", est._program)