def svm_ga(X, y, rfe=True, paramgrid=None):
    """Tune an rbf SVC with a genetic-algorithm search over a ReliefF-filtered pipeline.

    Parameters
    ----------
    X, y : training features and labels.
    rfe : when True, wrap ReliefF in recursive feature elimination.
    paramgrid : optional search space; a default rbf-SVC grid is used when None.
    """
    # Feature-selection stage: RFE-wrapped ReliefF or plain ReliefF.
    if rfe:
        selector = RFE(ReliefF(), n_features_to_select=5, step=0.5)
    else:
        selector = ReliefF(n_features_to_select=5, n_neighbors=3)

    if paramgrid is None:
        search_space = {
            "svc__kernel": ["rbf"],
            'svc__C': [10e-2, 10e-1, 10, 10e1, 10e2, 10e3, 10e4],
            'svc__gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 1.1]
        }
    else:
        search_space = paramgrid

    # Scale -> select -> classify.
    pipe = make_pipeline(preprocessing.StandardScaler(), selector, SVC())

    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    search = EvolutionaryAlgorithmSearchCV(
        estimator=pipe,
        params=search_space,
        scoring="accuracy",
        cv=10,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.1,
        gene_crossover_prob=0.8,
        tournament_size=10,
        generations_number=25)
    search.fit(X, y)
    print(search.best_params_)
    print(search.best_score_)
def readme():
    """Run the sklearn-deap README example: GA search over an rbf SVC on digits.

    Returns the fitted EvolutionaryAlgorithmSearchCV instance.
    """
    digits = sklearn.datasets.load_digits()
    features = digits["data"]
    labels = digits["target"]
    search_space = {
        "kernel": ["rbf"],
        "C": np.logspace(-9, 9, num=25, base=10),
        "gamma": np.logspace(-9, 9, num=25, base=10)
    }
    random.seed(1)  # deterministic evolution
    search = EvolutionaryAlgorithmSearchCV(
        estimator=SVC(),
        params=search_space,
        scoring="accuracy",
        cv=StratifiedKFold(n_splits=4),
        verbose=1,
        population_size=10,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=5)
    search.fit(features, labels)
    return search
def get_GeneticGridSearchCV(model, params, X, y):
    """Run an evolutionary hyper-parameter search over ``model`` and print a report."""
    from evolutionary_search import EvolutionaryAlgorithmSearchCV

    print("performing genetic grid search ...")
    searcher = EvolutionaryAlgorithmSearchCV(
        estimator=model,
        params=params,
        scoring="r2",
        # cv=StratifiedKFold(n_splits=2),
        verbose=True,
        population_size=50,
        gene_mutation_prob=0.10,
        tournament_size=3,
        generations_number=10,
        # pmap = pool.map,
    )
    searcher.fit(X, y.ravel())  # flatten targets to 1-D before fitting

    # our classical metric for performance
    print("Best Accuracy: {}".format(searcher.best_score_))
    # the best parameters that caused the best accuracy
    print("Best Parameters: {}".format(searcher.best_params_))
    # the average time it took a model to fit to the data (in seconds)
    print("Average Time to Fit (s): {}".format(round(searcher.cv_results_['mean_fit_time'].mean(), 3)))
    # the average time it took a model to predict out of sample data (in seconds)
    # this metric gives us insight into how this model will perform in real-time analysis
    print("Average Time to Score (s): {}".format(round(searcher.cv_results_['mean_score_time'].mean(), 3)))
    print(pd.DataFrame(searcher.cv_results_).sort_values("mean_test_score", ascending=False).head())
def evo_search(xtrain, xtest, ytrain, ytest):
    """Evolutionary hyper-parameter search for an MLP classifier, then print stats."""
    # Candidate two-layer architectures: [10,10], [110,110], ..., [410,410].
    layers = [[width, width] for width in range(10, 500, 100)]
    print(layers)
    parameters = {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        # 'solver': ['lbfgs', 'sgd', 'adam'],
        # 'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'batch_size': [5, 10, 20, 50, 100],
        'learning_rate_init': [0.0001, 0.001, 0.01, 0.1],
        # 'hidden_layer_sizes': generate_networks(),
        'hidden_layer_sizes': layers
    }
    print(parameters)
    print('Starting evolutionary search')
    searcher = EvolutionaryAlgorithmSearchCV(
        estimator=MLPClassifier(random_state=42, max_iter=20000),
        params=parameters,
        scoring=make_scorer(f1_score),
        # cv=StratifiedKFold(n_splits=4),
        verbose=10,
        population_size=20,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=10,
        n_jobs=1)
    searcher.fit(xtrain, ytrain.values.ravel())
    print_classifier_stats(searcher.best_estimator_, xtrain, xtest, ytrain, ytest)
    print('Evo search done...')
def main():
    """Tune an SVC on the training CSV with an evolutionary parameter search."""
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')
    # Column 0 is the label; feature columns start at index 2.
    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    #X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    # log-uniform: understand as search over p = exp(x) by varying x
    opt = EvolutionaryAlgorithmSearchCV(
        estimator=SVC(),
        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.py
        params={
            "kernel": ["rbf"],
            # NOTE(review): np.logspace takes *exponents*; start=1e-6, stop=1e+6
            # means 10**1e6 at the top (overflow to inf). Likely -6..6 was
            # intended — confirm before changing the searched range.
            "C": np.logspace(1e-6, 1e+6, num=20, base=10),
            "gamma": np.logspace(3.0517578125e-05, 8, num=20, base=10),
            'decision_function_shape': ['ovo', 'ovr'],
            'degree': list(range(2, 5)),
            # Bug fix: 'coef0' appeared twice in the original literal; the first
            # grid (np.logspace(-1, 1, ...)) was silently discarded by the
            # duplicate key, so only the entry below was ever searched.
            'coef0': np.logspace(1e-5, 1e-1, num=20, base=10),
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        scoring="accuracy",
        verbose=True,
        population_size=50,
        gene_mutation_prob=0.10,
        tournament_size=3,
        generations_number=10,
    )
    opt.fit(X_train, y_train)
def LR2(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """GA-tune an L2 logistic regression, persist the best parameters, and
    (unless ``tune_only``) refit on the PCA features and emit a report."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate

    C_range = np.linspace(1, 50, 50)
    tol_range = np.linspace(0.001, 0.01, 50)
    param_dist = dict(tol=tol_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator=LogisticRegression(penalty='l2'),
        params=param_dist,
        scoring="f1",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    rnds.fit(X_train_little, y_train_little)

    # Persist the tuning outcome. Fix: context manager guarantees the file is
    # closed even if a write raises (original used bare open()/close()).
    with open("LR2_best_parameters.txt", "w") as file:
        file.write("{}\n".format(rnds.best_score_))
        file.write('C: {}\n'.format(rnds.best_estimator_.C))
        file.write('tol: {}\n'.format(rnds.best_estimator_.tol))

    if not tune_only:
        # apply best parameters
        l2r = LogisticRegression(C=rnds.best_estimator_.C,
                                 tol=rnds.best_estimator_.tol,
                                 random_state=SEED)
        l2r.fit(X_train_pca, y_train)
        sc_tr = cross_validate(l2r, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(l2r, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = l2r.predict(X_test_pca)
        pred_train = l2r.predict(X_train_pca)
        output_report("LR2", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
def SVM(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """GA-tune an SVC (C, gamma), persist the best parameters, and (unless
    ``tune_only``) refit on the PCA features and emit a report."""
    from sklearn.svm import SVC
    from sklearn.model_selection import cross_validate

    C_range = np.linspace(1, 10, 101)
    gamma_range = np.linspace(3000, 4000, 100)
    param_dist = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator=SVC(max_iter=200),
        params=param_dist,
        scoring="f1",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    rnds.fit(X_train_little, y_train_little)

    # Persist the tuning outcome. Fix: context manager guarantees the file is
    # closed even if a write raises (original used bare open()/close()).
    with open("SVM_best_parameters.txt", "w") as file:
        file.write("{}\n".format(rnds.best_score_))
        file.write('C: {}\n'.format(rnds.best_estimator_.C))
        file.write('gamma: {}\n'.format(rnds.best_estimator_.gamma))

    if not tune_only:
        # apply best parameters
        svc = SVC(max_iter=200,
                  C=rnds.best_estimator_.C,
                  gamma=rnds.best_estimator_.gamma,
                  random_state=SEED)
        svc.fit(X_train_pca, y_train)
        sc_tr = cross_validate(svc, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(svc, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = svc.predict(X_test_pca)
        pred_train = svc.predict(X_train_pca)
        output_report("SVM", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
def sk_params_search_best(
    clf,
    X,
    y,
    param_grid=None,
    method="gridsearch",
    param_search=None,
):
    """Search ``clf`` hyper-parameters by grid search or a genetic algorithm.

    Genetic: population_size=5, gene_mutation_prob=0.10, gene_crossover_prob=0.5,
    tournament_size=3, generations_number=3

    :param clf: estimator to tune
    :param X: features
    :param y: targets
    :param param_grid: parameter grid; defaults to {"alpha": np.linspace(0, 1, 5)}
    :param method: "gridsearch" or "genetic"
    :param param_search: settings dict with keys "scorename", "cv",
        "population_size", "generations_number"
    :return: (best_score, best_params); None for an unknown method
    """
    # Bug fix: the original used mutable dict literals as parameter defaults,
    # which are shared across calls and can be mutated by callers. Build the
    # defaults per call instead (behavior when arguments are omitted is identical).
    if param_grid is None:
        param_grid = {"alpha": np.linspace(0, 1, 5)}
    if param_search is None:
        param_search = {
            "scorename": "r2",
            "cv": 5,
            "population_size": 5,
            "generations_number": 3,
        }

    p = param_search
    myscore = sk_score_get(p["scorename"])

    if method == "gridsearch":
        from sklearn.model_selection import GridSearchCV
        grid = GridSearchCV(clf, param_grid, cv=p["cv"], scoring=myscore)
        grid.fit(X, y)
        return grid.best_score_, grid.best_params_

    if method == "genetic":
        from evolutionary_search import EvolutionaryAlgorithmSearchCV
        from sklearn.model_selection import StratifiedKFold

        # NOTE(review): StratifiedKFold(y) passes labels where n_splits is
        # expected in modern scikit-learn — confirm the intended version.
        # paramgrid = {"alpha": np.linspace(0,1, 20) , "l1_ratio": np.linspace(0,1, 20) }
        cv = EvolutionaryAlgorithmSearchCV(
            estimator=clf,
            params=param_grid,
            scoring=myscore,
            cv=StratifiedKFold(y),
            verbose=True,
            population_size=p["population_size"],
            gene_mutation_prob=0.10,
            gene_crossover_prob=0.5,
            tournament_size=3,
            generations_number=p["generations_number"],
        )
        cv.fit(X, y)
        return cv.best_score_, cv.best_params_
def ev_tree(self):
    """GA-tune a random forest on the in-sample data and store the best estimator.

    Sets self.frst_called and self.opt_frst as side effects.
    """
    # Bug fix: copy the shared defaults instead of aliasing them — the original
    # mutated self.default_evparams in place, so a later ev_svm() call would
    # start from this method's leftover estimator/params entries.
    ev_params = dict(self.default_evparams)
    ev_params['estimator'] = RandomForestClassifier()
    ev_params['params'] = self.frst_space
    cv = EvolutionaryAlgorithmSearchCV(**ev_params)
    cv.fit(self.X_insample, self.y_insample)
    clf = cv.best_estimator_
    self.frst_called = True
    self.opt_frst = clf
def ev_svm(self):
    """GA-tune a probabilistic SVC on the in-sample data and store the best estimator.

    Sets self.svm_called and self.opt_svm as side effects.
    """
    # Bug fix: copy the shared defaults instead of aliasing them — the original
    # mutated self.default_evparams in place, so sibling tuning methods would
    # inherit this method's estimator/params entries.
    ev_params = dict(self.default_evparams)
    ev_params['estimator'] = SVC(probability=True)
    ev_params['params'] = self.svm_space
    cv = EvolutionaryAlgorithmSearchCV(**ev_params)
    cv.fit(self.X_insample, self.y_insample)
    clf = cv.best_estimator_
    self.svm_called = True
    self.opt_svm = clf
def NB(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """GA-tune a Bernoulli naive Bayes (alpha), persist the best parameter, and
    (unless ``tune_only``) refit on the PCA features and emit a report."""
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.model_selection import cross_validate

    # NOTE(review): the grid includes alpha=0, which BernoulliNB warns about —
    # confirm whether the lower bound should be > 0.
    alpha_range = np.linspace(0, 500, 500)
    param_dist = dict(alpha=alpha_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator=BernoulliNB(),
        params=param_dist,
        scoring="f1",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    rnds.fit(X_train_little, y_train_little)

    # Persist the tuning outcome. Fix: context manager guarantees the file is
    # closed even if a write raises (original used bare open()/close()).
    with open("NB_best_parameters.txt", "w") as file:
        file.write("{}\n".format(rnds.best_score_))
        file.write('alpha: {}\n'.format(rnds.best_estimator_.alpha))

    if not tune_only:
        # apply best parameters
        gnb = BernoulliNB(alpha=rnds.best_estimator_.alpha)
        gnb.fit(X_train_pca, y_train)
        sc_tr = cross_validate(gnb, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(gnb, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = gnb.predict(X_test_pca)
        pred_train = gnb.predict(X_train_pca)
        output_report("NB", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
def geneticGridTest(self):
    """Run an evolutionary grid search over self.classifier with self.gridParams."""
    print("performing Genetic grid search...")
    gridSearch = EvolutionaryAlgorithmSearchCV(
        estimator=self.classifier,
        params=self.gridParams,
        cv=self.kfold,
        scoring='accuracy',
        verbose=True,
        # Bug fix: iid was passed the *string* 'False', which is truthy and so
        # behaved as iid=True; pass the boolean that was clearly intended.
        iid=False,
        n_jobs=4,
        population_size=20,
        gene_mutation_prob=0.30,
        tournament_size=2,
        generations_number=5)
    gridSearch.fit(self.X, self.y)
def GA_tune_lgbm(cls, x, y):
    """Evolutionary search over ``cls.lgbm_paramgrid`` for an LGBM classifier.

    Uses a time-series split so validation folds respect temporal order.
    Returns the best parameter dict found.
    """
    searcher = EvolutionaryAlgorithmSearchCV(
        estimator=LGBMClassifier(),
        params=cls.lgbm_paramgrid,
        scoring="accuracy",
        cv=TimeSeriesSplit(n_splits=4),
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.2,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=20,
    )
    searcher.fit(x, y)
    return searcher.best_params_
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, evo=None, population_size=5): if score_func: if evo: gs = EvolutionaryAlgorithmSearchCV(pipeline, grid=parameters, scoring=score_func, n_jobs=n_jobs, population_size=population_size) else: gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func) else: if evo: gs = EvolutionaryAlgorithmSearchCV(pipeline, grid=parameters, scoring=None, verbose=True, n_jobs=4, population_size=population_size) else: gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds) gs.fit(X, y) print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_ best = gs.best_estimator_ return best
def RF_DT(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """GA-tune a random forest (leaf size, depth), persist the best parameters,
    and (unless ``tune_only``) refit on the PCA features and emit a report.

    Returns (pred, pred_train); both are None when tune_only=True.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import cross_validate

    min_samples_leaf_range = np.round(np.linspace(1, 10, 10)).astype(int)
    max_depth_range = np.round(np.linspace(1, 30, 30)).astype(int)
    param_dist = dict(min_samples_leaf=min_samples_leaf_range,
                      max_depth=max_depth_range)
    num_features = len(X_train_little[0])
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator=RandomForestClassifier(n_estimators=int((1 + num_features / 2))),
        params=param_dist,
        scoring="f1",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    rnds.fit(X_train_little, y_train_little)

    # Persist the tuning outcome. Fix: context manager guarantees the file is
    # closed even if a write raises (original used bare open()/close()).
    with open("RF_DT_best_parameters.txt", "w") as file:
        file.write("{}\n".format(rnds.best_score_))
        file.write('min_samples_leaf: {}\n'.format(rnds.best_estimator_.min_samples_leaf))
        file.write('max_depth: {}\n'.format(rnds.best_estimator_.max_depth))

    # Bug fix: pred/pred_train were only bound inside the `if not tune_only`
    # branch, so the unconditional return raised NameError when tune_only=True.
    pred = pred_train = None
    if not tune_only:
        # apply best parameters RF
        rfc = RandomForestClassifier(n_estimators=int((1 + num_features / 2)),
                                     min_samples_leaf=rnds.best_estimator_.min_samples_leaf,
                                     max_depth=rnds.best_estimator_.max_depth,
                                     random_state=SEED)
        rfc.fit(X_train_pca, y_train)
        sc_tr = cross_validate(rfc, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(rfc, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = rfc.predict(X_test_pca)
        pred_train = rfc.predict(X_train_pca)
        output_report("RF", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
    return pred, pred_train
def main():
    """Demo: evolutionary rbf-SVC hyper-parameter search over module-level X, y."""
    search_space = {
        "kernel": ["rbf"],
        "C": np.logspace(-9, 9, num=25, base=10),
        "gamma": np.logspace(-9, 9, num=25, base=10)
    }
    random.seed(1)  # reproducible evolution
    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    search = EvolutionaryAlgorithmSearchCV(
        estimator=SVC(),
        params=search_space,
        scoring="accuracy",
        cv=StratifiedKFold(n_splits=4),
        verbose=1,
        population_size=5,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=5,
        n_jobs=4)
    # X and y are expected at module scope.
    search.fit(X, y)
def tune(model, X, y, cv):
    """GA-tune the integer C parameter of ``model`` using f1_weighted scoring.

    Returns the fitted search object.
    """
    c_values = np.round(np.linspace(1, 10, 10)).astype(int)
    search_space = dict(C=c_values, )
    #num_features = len(X[0])
    searcher = EvolutionaryAlgorithmSearchCV(
        estimator=model,
        params=search_space,
        scoring="f1_weighted",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    searcher.fit(X, y)
    return searcher
def tune(model, X, y, cv):
    """GA-tune tree hyper-parameters (min_samples_leaf, max_depth) of ``model``
    using f1_weighted scoring; returns the fitted search object."""
    leaf_values = np.round(np.linspace(1, 10, 10)).astype(int)
    depth_values = np.round(np.linspace(1, 30, 30)).astype(int)
    search_space = dict(min_samples_leaf=leaf_values, max_depth=depth_values)
    searcher = EvolutionaryAlgorithmSearchCV(
        estimator=model,
        params=search_space,
        scoring="f1_weighted",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    searcher.fit(X, y)
    return searcher
# NOTE(review): this chunk begins inside a feature-extraction function whose
# `def` line (presumably extract_features(data, limit)) lies above this view;
# the loop below is that function's body.
    for index, row in data.iterrows():#iterate over csv file
        if index==limit:
            break
        # Load the image named by this row's id.
        img=cv2.imread(root_path + '/images/' + str(row['image_id']) + '.jpg')
        histogram=np.zeros((3, 256))
        for i in range(3):#calc hist for each channel
            histogram[i] = cv2.calcHist([img],[i],None,[256],[0,255]).ravel()
        X[index]=histogram.ravel()#to 1d array
    return X


# Build features/labels for up to 1500 training rows.
X = extract_features(train_data, 1500)
y = train_data['image_label'].values[:X.shape[0]].ravel()
grid = {
    'knn__n_neighbors': [1, 10, 20, 30, 40, 60, 75, 100, 120, 160, 200],
    'knn__metric': ['euclidean', 'manhattan', 'chebyshev'],
    'knn__weights': ['uniform', 'distance'],
    'preprocess__norm': ['l1', 'l2', 'max']
}
pipeline = Pipeline(steps=[
    ('preprocess', preprocessing.Normalizer()),
    ('knn', neighbors.KNeighborsClassifier())
])
# GA search over the normalizer+KNN pipeline, scored by ROC AUC.
model = EvolutionaryAlgorithmSearchCV(pipeline, grid, scoring='roc_auc',
                                      verbose=True, n_jobs=4, population_size=10)
model.fit(X, y)
# NOTE(review): extract_features is called here with one argument but appears
# to take (data, limit) — confirm `limit` has a default upstream.
preds = model.predict_proba(extract_features(test_data))[:, 1]
test_data = test_data.drop('image_url', 1)
test_data['image_label'] = preds
test_data.to_csv(root_path + '/res.csv', index=False)
# use a full grid over all parameters # run grid search grid_search = GridSearchCV(clf, param_grid=param_grid) start = time() grid_search.fit(X, y) print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % (time() - start, len(grid_search.cv_results_['params']))) #report(grid_search.cv_results_) print(grid_search.best_score_) # run evolutionary_ evolution_search = EvolutionaryAlgorithmSearchCV( estimator=clf, params=param_grid, #scoring="accuracy", verbose=1, population_size=50, gene_mutation_prob=0.10, gene_crossover_prob=0.5, tournament_size=3, generations_number=4, ) start = time() evolution_search.fit(X, y) print("evolution_searchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) print(evolution_search.cv_results_)
# NOTE(review): this chunk opens inside a triple-quoted (disabled) GA-RF
# section whose opening ''' lies above this view; everything down to the
# closing ''' below is inert string content, preserved verbatim.
gene_crossover_prob=0.25,
                                   tournament_size=2,
                                   generations_number=3,
                                   n_jobs=2)
#print(model.wv.most_similar('sensitive'))
cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist())
#cv.fit(MeanEmbeddingVectorizer(w2v).transform(X), y['section'].tolist())
'''
#--------------------------------- GA-SVC  ---------------------------------
# Active path: GA search over C for a LinearSVC on tf-idf features.
paramgrid = {"C": np.logspace(-9, 9, num=25, base=10)}
cv = EvolutionaryAlgorithmSearchCV(estimator=LinearSVC(),
                                   params=paramgrid,
                                   scoring="accuracy",
                                   cv=StratifiedKFold(n_splits=4),
                                   verbose=1,
                                   population_size=50,
                                   gene_mutation_prob=0.10,
                                   gene_crossover_prob=0.5,
                                   tournament_size=3,
                                   generations_number=5,
                                   n_jobs=4)
cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist())
X = hyper_data.values[:, 15:]
# NOTE(review): `[:2]` selects the first two ROWS of values as y — almost
# certainly a label column slice such as `[:, 2]` was intended; confirm.
y = hyper_data.values[:2]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.25)
# NOTE(review): kernel/C/gamma are SVC parameters, but the estimator below is
# a RandomForestClassifier, which accepts none of them — verify whether SVC()
# was intended as the estimator.
paramgrid = {
    'kernel': ['rbf'],
    'C': np.logspace(-9, 9, num=25, base=10),
    'gamma': np.logspace(-9, 9, num=25, base=10)
}
random.seed(1)  # deterministic evolution
cv = EvolutionaryAlgorithmSearchCV(estimator=RandomForestClassifier(),
                                   params=paramgrid,
                                   scoring='accuracy',
                                   cv=StratifiedKFold(n_splits=4),
                                   verbose=1,
                                   population_size=50,
                                   gene_mutation_prob=0.10,
                                   gene_crossover_prob=0.5,
                                   tournament_size=3,
                                   generations_number=5,
                                   n_jobs=4)
cv.fit(X_train, y_train)
# NOTE(review): Python 2 chunk opening mid-dict — the paramgrid literal begins
# above this view.
    "degree":[3]
}
random.seed(1)  # deterministic evolution
cv = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                   params=paramgrid,
                                   scoring="accuracy",
                                   # old sklearn API: labels + n_folds
                                   cv=StratifiedKFold(trainlabel, n_folds=10),
                                   verbose=True,
                                   population_size=50,
                                   gene_mutation_prob=0.10,
                                   tournament_size=10,
                                   generations_number=5,
                                   n_jobs = 5)
cv.fit(train, trainlabel)
print cv.best_score_, cv.best_params_
##############################################
#ff = mysvc.training_manCV()
#ff.train_gene(train, trainlabel, 'poly', Cmin=-10, Cmax=10, numC=21, rmin=-10, rmax=10, numr=21, degree = 3)
#------------------------------------------------------------------------------
#-------------------------------------------------------------
# NOTE(review): df and df_this are not defined in this chunk — presumably
# result frames built above; confirm they are in scope.
print df, df_this
#------------------------------------------------------------------------------
#df.to_csv('/home/peng/git/Machine_learning_for_reliability_analysis/Test_1/Results/poly_cm_10CV_d4_n10_p10_21.csv', header = True)
#df_this.to_csv('/home/peng/git/Machine_learning_for_reliability_analysis/Test_1/Results/Try_this_score.csv', header = True)
# Python 2 chunk: c_range is built above this view.
print c_range
param_dist = dict(C=c_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=2018)
rnds = EvolutionaryAlgorithmSearchCV(
    estimator=LogisticRegression(random_state=0),
    params=param_dist,
    scoring="f1",
    cv=cv,
    verbose=1,
    population_size=50,
    gene_mutation_prob=0.10,
    gene_crossover_prob=0.5,
    tournament_size=3,
    generations_number=6,
    n_jobs=4)
rnds.fit(X, y)
best_C = rnds.best_estimator_.C

# apply best parameters
# NOTE(review): RandomizedLogisticRegression performs stability selection; its
# scores_ attribute ranks features — confirm the sklearn version still ships it.
lr = RandomizedLogisticRegression(C=best_C, random_state=0, sample_fraction=0.75,
                                  n_resampling=200, selection_threshold=0.25)
lr.fit(X, y)
importances = lr.scores_
indices = np.argsort(importances)[::-1]  # descending feature importance

# Print the feature ranking
print("Feature ranking:")
def NN(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """GA-tune an MLP classifier, persist the best parameters, and (unless
    ``tune_only``) refit on the PCA features and emit a report."""
    from sklearn.neural_network import MLPClassifier
    from sklearn.model_selection import cross_validate

    num_features = len(X_train_little[0])

    # prepare parameter grid
    alpha_range = np.linspace(0.005, 0.015, 50)
    learning_rate_range = np.linspace(0.01, 0.07, 50)
    epsilon_range = np.logspace(-9, -6, 50)
    beta_1_range = np.linspace(0.3, 0.7, 50)
    beta_2_range = np.linspace(0.3, 0.7, 50)
    a = int((num_features + 1) / 2)
    b = int((num_features + 1) / 2 + 10)
    med_layer_range = np.arange(a, b)
    # NOTE(review): hidden_layer_sizes is a single 3-tuple here, so the search
    # treats its elements as candidate values — confirm that is intended.
    param_dist = dict(alpha=alpha_range,
                      hidden_layer_sizes=(num_features, med_layer_range, 1),
                      learning_rate_init=learning_rate_range,
                      epsilon=epsilon_range,
                      beta_1=beta_1_range,
                      beta_2=beta_2_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator=MLPClassifier(early_stopping=True),
        params=param_dist,
        scoring="f1",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    rnds.fit(X_train_little, y_train_little)

    # Persist the tuning outcome. Fix: context manager guarantees the file is
    # closed even if a write raises (original used bare open()/close()).
    with open("MLP_best_parameters.txt", "w") as file:
        file.write("{}\n".format(rnds.best_score_))
        file.write('alpha: {}\n'.format(rnds.best_estimator_.alpha))
        file.write('hidden_layer_sizes: {}\n'.format(
            rnds.best_estimator_.hidden_layer_sizes))
        file.write('learning_rate_init: {}\n'.format(
            rnds.best_estimator_.learning_rate_init))
        file.write('epsilon: {}\n'.format(rnds.best_estimator_.epsilon))
        file.write('beta_1: {}\n'.format(rnds.best_estimator_.beta_1))
        file.write('beta_2: {}\n'.format(rnds.best_estimator_.beta_2))

    if not tune_only:
        # apply best parameters
        mlp = MLPClassifier(
            hidden_layer_sizes=rnds.best_estimator_.hidden_layer_sizes,
            early_stopping=True,
            alpha=rnds.best_estimator_.alpha,
            learning_rate_init=rnds.best_estimator_.learning_rate_init,
            epsilon=rnds.best_estimator_.epsilon,
            beta_1=rnds.best_estimator_.beta_1,
            beta_2=rnds.best_estimator_.beta_2,
            random_state=SEED)
        mlp.fit(X_train_pca, y_train)
        sc_tr = cross_validate(mlp, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(mlp, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = mlp.predict(X_test_pca)
        pred_train = mlp.predict(X_train_pca)
        output_report("MLP", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
C_range = np.linspace(1, 10, 100)
gamma_range = np.linspace(3000, 4000, 100)
param_dist = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
rnds = EvolutionaryAlgorithmSearchCV(estimator=SVC(max_iter=200),
                                     params=param_dist,
                                     scoring="f1",
                                     cv=cv,
                                     verbose=1,
                                     population_size=50,
                                     gene_mutation_prob=0.10,
                                     gene_crossover_prob=0.5,
                                     tournament_size=3,
                                     generations_number=6,
                                     n_jobs=4)
rnds.fit(X_train_little, y_train_little)

# summarize the results of the random parameter search
print(rnds.best_score_)
print('\nC: ')
print(rnds.best_estimator_.C)
print('\ngamma: ')
print(rnds.best_estimator_.gamma)

# apply best parameters
# NOTE(review): the refit uses max_iter=400 while tuning used 200 — confirm
# the doubled iteration budget is intentional.
svc = SVC(max_iter=400, C=rnds.best_estimator_.C, gamma=rnds.best_estimator_.gamma)
svc.fit(X_train_pca, y_train)
pred = svc.predict(X_test_pca)
pred_train = svc.predict(X_train_pca)

# NOTE(review): chunk ends mid-statement; the `if learner:` body continues
# beyond this view.
if learner:  # NNs
# Python 2 chunk: GA-tune an SVC on every (selection method, subset size) pair.
sizes=['10','50','100','150','200','250']
methods=['MRMR','JMI','JMIM']
targets=np.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
for method in methods:
    for size in sizes:
        random.seed(1)  # same GA seed for every method/size combination
        X=X_original
        # First mask: indices chosen by the selection method overall.
        indices= joblib.load(method+' PICKLES/selected_indices_'+method+'.joblib.pkl')
        X=np.array(X)[:,indices]
        # Second mask: the top-`size` subset for this method.
        indices= joblib.load(method+' PICKLES/'+size+'-'+method+'.joblib.pkl')
        X=np.array(X)[:,indices]
        f=open('genetic/'+method+'-'+size+'.txt','w')
        print size
        print method
        print "svm.SVC"
        f.write("svm.SVC\n")
        # Old sklearn API: StratifiedKFold(labels, n_folds=...).
        # NOTE(review): paramgrid is not defined in this chunk — presumably set
        # above; confirm it is in scope.
        cv = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                           params=paramgrid,
                                           scoring="accuracy",
                                           cv=StratifiedKFold(targets, n_folds=10),
                                           verbose=1,
                                           population_size=50,
                                           gene_mutation_prob=0.10,
                                           gene_crossover_prob=0.5,
                                           tournament_size=3,
                                           generations_number=5,
                                           n_jobs=-1)
        cv.fit(X, targets)
        f.write('\n=======================\n')
class Wrap:
    """use GridSearchCV, RandomizedSearchCV and Evolutionary Search with this class.

    Methods
    -------
    grid_method(self)
        uses the GridSearchCV object and .fit()
        :returns grid_history = grid.fit()
    random_method(self)
        uses the RandomizedSearchCV object and .fit()
        :returns rand_history = rand.fit()

    Attributes
    ----------
    data_obj : (DataManager) passed in object from DataManager class
    network_obj: (NeuralArch) architecture of the neural network, so we can use it in KerasRegressor
    keras_regressor: (KerasRegressor) a KerasRegressor object with build_fn=network_obj.build_nn"""

    def __init__(self, data_obj, patience=10):
        # todo explain why do we have these different object in the class
        self.data_obj = data_obj
        network_obj = Net(self.data_obj)
        self.keras_regressor = KerasRegressor(build_fn=network_obj.build_nn)
        self.callback = MyCallbacks(patience=patience)

    def grid_method(self):
        """grid_method(self) uses the GridSearchCV object and .fit()
        :returns grid_history = grid.fit()"""
        params = dict(epochs=[200], batch_size=[4, 8])
        # Single pseudo-fold covering all data: disables real cross-validation.
        cv = [(slice(None), slice(None))
              ]  # why have i written this over and over??
        es = self.callback.es
        mc = self.callback.mc
        tb = self.callback.tb
        csv_log = self.callback.csv_log
        my_callbacks = [es, mc, csv_log]
        self.grid = GridSearchCV(estimator=self.keras_regressor,
                                 param_grid=params,
                                 cv=cv)
        grid_history = self.grid.fit(
            X=self.data_obj.x_train,
            y=self.data_obj.y_train,
            validation_data=(self.data_obj.x_validation,
                             self.data_obj.y_validation),
            verbose=0,
            callbacks=my_callbacks)
        return grid_history

    def random_method(self):
        """grid_method(self) uses the GridSearchCV object and .fit()
        returns grid_history = grid.fit()"""
        params = dict(epochs=[100], batch_size=[2, 4, 8, 12, 16, 20, 24, 32, 36])
        cv = [(slice(None), slice(None))]
        es = self.callback.es
        mc = self.callback.mc
        tb = self.callback.tb
        # NOTE(review): tb is fetched but not included below — confirm
        # TensorBoard was meant to be dropped from this search.
        my_callbacks = [es, mc]
        self.rand = RandomizedSearchCV(estimator=self.keras_regressor,
                                       param_distributions=params,
                                       n_iter=8)
        rand_history = self.rand.fit(
            X=self.data_obj.x_train,
            y=self.data_obj.y_train,
            validation_data=(self.data_obj.x_validation,
                             self.data_obj.y_validation),
            verbose=0,
            callbacks=my_callbacks)
        return rand_history

    def evolution_method(self):
        # this does not work, but we need to continue
        params = dict(epochs=[200], batch_size=[4, 8])
        # cv = [(slice(None), slice(None))]
        es = self.callback.es
        mc = self.callback.mc
        tb = self.callback.tb
        my_callbacks = [es, mc]
        # fit_params are forwarded to every underlying Keras fit call.
        fit_params = {
            "epochs": 300,
            "validation_data": (self.data_obj.x_validation,
                                self.data_obj.y_validation),
            "callbacks": my_callbacks
        }
        self.evo = EvolutionaryAlgorithmSearchCV(
            estimator=self.keras_regressor,
            params=params,
            verbose=0,
            population_size=10,
            fit_params=fit_params)
        evo_hist = self.evo.fit(X=self.data_obj.x_train, y=self.data_obj.y_train)
        return evo_hist
# Evolutionary search over a pipeline (pipe_xg/parameters defined above this view).
clf2 = EvolutionaryAlgorithmSearchCV(
    estimator=pipe_xg,          # How will objective be evaluated
    params=parameters,          # Parameters range
    scoring="accuracy",         # Criteria
    cv=2,                       # No of folds
    verbose=True,
    population_size=50,
    gene_mutation_prob=0.10,
    tournament_size=3,
    generations_number=10
)
start = time.time()
clf2.fit(X_train, y_train)  # 1hr 2 minute
end = time.time()
(end-start)/60  # elapsed minutes — value discarded outside an interactive session
clf2.best_params_  # likewise a no-op outside a REPL/notebook
# Our cvresults table (note, includes all individuals
# with their mean, max, min, and std test score).
out = pd.DataFrame(
    clf2.cv_results_
)
# NOTE(review): chunk ends mid-call — the closing paren of sort_values(...)
# lies beyond this view.
out = out.sort_values(
    "mean_test_score",
    ascending=False
def main():
    """For each pair of WIPO section labels, run a GA-tuned LinearSVC over
    tf-idf features of that pair's training CSV."""
    rand_st = 42
    classes = ["A", "B", "C", "D", "E", "F", "G", "H"]
    from itertools import combinations
    subsets = []
    # Build every 2-section combination and ensure its output directory exists.
    for subset in combinations(classes, 2):
        subsets.append(subset)
        try:
            os.makedirs(PREPROCESS_PATH + str(subset[0] + subset[1]))
        except:
            pass
    for sub in subsets:
        PATH = "/home/bruno/base-wipo/preprocess-artigo/" + str(sub[0]) + str(
            sub[1]) + "/"
        print(" --------------------------" + str(sub[0]) + str(sub[1]) +
              "--------------------------------- ")
        treinamento = "treinamento.csv"
        # Labels: the "section" column of this pair's training CSV.
        y = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                     PATH + treinamento),
                        header=0, delimiter=";", usecols=["section"], quoting=3)
        '''
        X = pd.read_csv(os.path.join(os.path.dirname(__file__),PATH+treinamento), header=0,delimiter=";",usecols=["data"], quoting=3)
        X = X["data"].tolist()
        '''
        # Streaming iterator over the document texts.
        X = TideneIterCSVGA(PATH + treinamento)
        tfidf_transformer = TfidfVectorizer()
        n = len(y)
        random.seed(1)
        from evolutionary_search import EvolutionaryAlgorithmSearchCV
        # Disabled GA-RF variant kept verbatim inside the string literal below.
        '''
        #--------------------------------- GA-RF  ---------------------------------
        from evolutionary_search import EvolutionaryAlgorithmSearchCV
        clf_RF_gs = RandomForestClassifier(random_state=rand_st, n_jobs=-1)
        clf_RF_pg = [{
            'max_depth': np.logspace(0.3,4,num = 10 ,base=10,dtype='int'), #[1, 5, 13, 34, 87, 226, 584, 1505, 3880, 10000]
            'n_estimators' : np.logspace(0.1,3,num = 10 ,base=10,dtype='int'), #[1, 2, 5, 11, 24, 51, 107, 226, 476, 1000]
            'min_samples_split' : np.logspace(0.4, 1, num=5, base=10, dtype='int'), #[2, 3, 5, 7, 10]
            'min_samples_leaf' : np.logspace(0.1,1,num = 4 ,base=9,dtype='int'), #[1, 2, 4, 9]
            'max_features' : ['auto', None]
        }]
        model_name = "100features_40minwords_10context"
        model = gensim.models.Word2Vec.load(model_name)
        w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}
        cv = EvolutionaryAlgorithmSearchCV(estimator=clf_RF_gs,
                                           params=clf_RF_pg,
                                           scoring="accuracy",
                                           cv=StratifiedKFold(n_splits=4),
                                           verbose=1,
                                           population_size=10,
                                           gene_mutation_prob=0.05,
                                           gene_crossover_prob=0.25,
                                           tournament_size=2,
                                           generations_number=3,
                                           n_jobs=2)
        #print(model.wv.most_similar('sensitive'))
        cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist())
        #cv.fit(MeanEmbeddingVectorizer(w2v).transform(X), y['section'].tolist())
        '''
        #--------------------------------- GA-SVC  ---------------------------------
        # Active path: GA search over C for a LinearSVC on tf-idf features.
        paramgrid = {"C": np.logspace(-9, 9, num=25, base=10)}
        cv = EvolutionaryAlgorithmSearchCV(estimator=LinearSVC(),
                                           params=paramgrid,
                                           scoring="accuracy",
                                           cv=StratifiedKFold(n_splits=4),
                                           verbose=1,
                                           population_size=50,
                                           gene_mutation_prob=0.10,
                                           gene_crossover_prob=0.5,
                                           tournament_size=3,
                                           generations_number=5,
                                           n_jobs=4)
        out = cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist())
# NOTE(review): `params` and `odIfEstimator` are defined above this chunk.
cv = EvolutionaryAlgorithmSearchCV (
    estimator = odIfEstimator(),
    params = params,
    gene_type = [2, 2, 2],
    verbose = 1,
    population_size = 80,
    gene_mutation_prob = .1,
    gene_crossover_prob = .5,
    tournament_size = 3,
    generations_number = 8,
    # this is already validation set, no need for cross validation
    cv = ShuffleSplit(test_size=0.99, n_splits=1),
    n_jobs = 40)
cv.fit(data, labels)

# Search space for the SDO detector below.
params = {
    'k': list(range(100,1000)),
    'x': list(range(3,30)),
    'qv': [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
}
print ('\nSDO\n----------------')


class odSDOEstimator:
    # Minimal sklearn-style parameter container wrapping the SDO detector.
    # NOTE(review): the class body may continue beyond this chunk (no fit/score
    # methods are visible here).
    def __init__(self, **kwargs):
        self.set_params (**kwargs)

    def get_params(self, deep = True):
        return self.params

    def set_params(self, **kwargs):
        self.params = kwargs
# Hold out 20% of each city's data (SJ / IQ) with a fixed seed.
train_SJ_train, train_SJ_test = train_test_split(train_SJ, test_size=0.2, random_state=24)
total_CASESJ_train, total_CASESJ_test = train_test_split(total_CASESJ, test_size=0.2, random_state=24)
train_IQ_train, train_IQ_test = train_test_split(train_IQ, test_size=0.2, random_state=24)
total_CASEIQ_train, total_CASEIQ_test = train_test_split(total_CASEIQ, test_size=0.2, random_state=24)
# NOTE(review): the models are fitted on the FULL sets but evaluated on
# held-out slices of that same data, so the MAE below is optimistic — confirm
# whether fitting on the *_train splits was intended.
rtreeForSJ.fit(train_SJ, total_CASESJ)
rtreeForIQ.fit(train_IQ, total_CASEIQ)
predictionsSJ = rtreeForSJ.predict(train_SJ_test)
predictionsIQ = rtreeForIQ.predict(train_IQ_test)
sjscore = mean_absolute_error(total_CASESJ_test, predictionsSJ)
iqscore = mean_absolute_error(total_CASEIQ_test, predictionsIQ)
print(sjscore)
print(iqscore)
# print(len(predictionsSJ)+len(predictionsIQ))
# print(len(predictionsIQ))
# finalArr = []
# for k in predictionsSJ:
#     finalArr.append(k)
# for t in predictionsIQ:
#     finalArr.append(t)
# NOTE(review): chunk opens mid-call — these are the trailing keyword arguments
# of a classifier constructed above (alpha/hidden_layer_sizes suggest an MLP).
alpha=1e-5, hidden_layer_sizes=(2), random_state=1)

from evolutionary_search import EvolutionaryAlgorithmSearchCV
cv = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                   params=paramgrid,
                                   scoring="accuracy",
                                   cv=StratifiedKFold(n_splits=4),
                                   verbose=1,
                                   population_size=50,
                                   gene_mutation_prob=0.10,
                                   gene_crossover_prob=0.5,
                                   tournament_size=3,
                                   generations_number=5,
                                   n_jobs=1)
cv.fit(X, y)

from evolutionary_search import maximize


def func(x, y, m=1., z=False):
    # Gaussian bump centred at the origin, scaled by m, plus 1 when z is True.
    return m * (np.exp(-(x**2 + y**2)) + float(z))


param_grid = {'x': [-1., 0., 1.], 'y': [-1., 0., 1.], 'z': [True, False]}
args = {'m': 1.}
# maximize returns the best point, its score, all evaluated scores, and the
# DEAP history/logbook.
best_params, best_score, score_results, hist, logbook = maximize(func, param_grid, args, verbose=False)
print(best_params)
print(best_score)
print(score_results)
#TODO: test this program with Neural Network Model