def test3():
    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Load some categories from the training set
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    # Uncomment the following to do the analysis on all the categories
    # categories = None

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    # Define a pipeline combining a text feature extractor with a simple
    # classifier
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])

    # Uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    parameters = {
        'vect__max_df': ['float', [0.5, 1.]],
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        'tfidf__norm': ['cat', ('l1', 'l2')],
        'clf__alpha': ['float', [0.000001, 0.0001]],
        'clf__penalty': ['cat', ['l2', 'elasticnet']],
        # 'clf__n_iter': (10, 50, 80),
    }

    search = SmartSearch(parameters, estimator=pipeline,
                         X=data.data, y=data.target, n_iter=30)
    search._fit()
def test2():
    parameters = {
        'kernel': ['cat', ['rbf', 'poly']],
        'd': ['int', [1, 3]],
        'C': ['float', [1, 10]]
    }

    # Dummy scoring function: always returns the same score, which is
    # enough to exercise the GP search loop
    def scoring_function(x):
        return [0.5]

    search = SmartSearch(parameters, model='GP', estimator=scoring_function,
                         n_iter=15, n_init=10, n_final_iter=3)
    search._fit()
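# The constant scoring function above only exercises the search loop:
# `estimator` can be any callable that maps a parameter dict to a list of
# scores. A sketch of a real objective for these kernel/d/C parameters,
# assuming they are meant for an SVM (the SVC import and the digits
# dataset are illustrative assumptions, not part of the original test):
from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

def svc_scoring_function(p_dict):
    # Hypothetical objective: 3-fold CV accuracy of an SVC built from the
    # sampled parameters; SmartSearch consumes the list of fold scores
    digits = load_digits()
    clf = SVC(kernel=p_dict['kernel'], degree=p_dict['d'], C=p_dict['C'])
    return list(cross_val_score(clf, digits.data, digits.target, cv=3))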
def test1():
    digits = load_digits()
    X, y = digits.data, digits.target
    clf = RandomForestClassifier(n_estimators=20)

    # Specify parameters to search over
    parameters = {
        "max_depth": ['int', [3, 3]],
        "max_features": ['int', [1, 11]],
        "min_samples_split": ['int', [1, 11]],
        "min_samples_leaf": ['int', [1, 11]],
        "bootstrap": ['cat', [True, False]],
        "criterion": ['cat', ["gini", "entropy"]]
    }

    search = SmartSearch(parameters, estimator=clf, X=X, y=y, n_iter=20)
    search._fit()
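# A small driver to run the three tests as a script; the __main__ guard is
# an assumption about how these tests are meant to be invoked:
if __name__ == '__main__':
    test2()   # cheap sanity check with a dummy scoring function
    test1()   # random forest on digits
    test3()   # heavier 20 newsgroups text pipeline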
def runExperiment(first_exp, n_exp, parameters,
                  model='GCP', n_random_init=10, n_total_iter=30,
                  n_candidates=500, corr_kernel='squared_exponential',
                  acquisition_function='UCB', n_clusters=1,
                  cluster_evol='constant', GCP_mapWithNoise=False,
                  GCP_useAllNoisyY=False, model_noise=None):
    last_exp = first_exp + n_exp
    print('Run experiment', first_exp, 'to', last_exp)

    # Load the precomputed performance evaluations
    output = []
    f = open("scoring_function/output.csv", 'r')
    for l in f:
        # Each row looks like '[v1,v2,...]\n'; strip the brackets and newline
        l = l[1:-3]
        string_l = l.split(',')
        output.append([float(i) for i in string_l])
    f.close()
    print('Loaded output file,', len(output), 'rows')

    params = np.genfromtxt("scoring_function/params.csv", delimiter=',')
    print('Loaded parameters file, shape:', params.shape)

    KNN = NearestNeighbors()
    KNN.fit(params)

    # Retrieve a performance evaluation from the stored results: find the
    # stored parameter vector closest to the query and return five of its
    # recorded raw outputs, chosen at random
    def get_cv_res(p_dict):
        p = np.zeros(len(parameters))
        for k in p_dict.keys():
            p[int(k)] = p_dict[k]
        idx = KNN.kneighbors(p.reshape(1, -1), 1,
                             return_distance=False)[0][0]
        all_o = output[idx]
        r = np.random.randint(len(all_o) // 5)
        return all_o[(5 * r):(5 * r + 5)]

    ### Run experiments ###
    for exp_id in range(first_exp, last_exp):
        print(' **** Run exp', exp_id, ' ****')

        # Set up the result directory
        if not os.path.exists("exp_results/exp" + str(exp_id)):
            os.mkdir("exp_results/exp" + str(exp_id))
        else:
            print('Warning: directory already exists')

        search = SmartSearch(parameters,
                             estimator=get_cv_res,
                             corr_kernel=corr_kernel,
                             GCP_mapWithNoise=GCP_mapWithNoise,
                             GCP_useAllNoisyY=GCP_useAllNoisyY,
                             model_noise=model_noise,
                             model=model,
                             n_candidates=n_candidates,
                             n_iter=n_total_iter,
                             n_init=n_random_init,
                             n_clusters=n_clusters,
                             cluster_evol=cluster_evol,
                             verbose=2,
                             acquisition_function=acquisition_function,
                             detailed_res=2)
        all_parameters, all_search_path, all_raw_outputs, all_mean_outputs = \
            search._fit()

        # Save the experiment's data
        for i in range(len(all_raw_outputs)):
            f = open("exp_results/exp" + str(exp_id) + "/output_" + str(i) + ".csv", 'w')
            for line in all_raw_outputs[i]:
                print(line, file=f)
            f.close()
            np.savetxt("exp_results/exp" + str(exp_id) + "/param_" + str(i) + ".csv",
                       all_parameters[i], delimiter=",")
            np.savetxt("exp_results/exp" + str(exp_id) + "/param_path_" + str(i) + ".csv",
                       all_search_path[i], delimiter=",")

        print(' **** End experiment', exp_id, ' ****\n')
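# A possible invocation (hypothetical values; the parameter dict must match
# the column layout of scoring_function/params.csv, whose keys get_cv_res
# reads as stringified column indices):
#
# exp_parameters = {
#     '0': ['float', [0., 1.]],
#     '1': ['float', [0., 1.]],
#     '2': ['int', [1, 10]],
# }
# runExperiment(first_exp=0, n_exp=5, parameters=exp_parameters,
#               model='GCP', acquisition_function='UCB')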
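# The script below calls a branin helper that is not defined in this
# snippet. A sketch using the standard Branin-Hoo constants (the original
# helper may differ, e.g. in sign, if SmartSearch maximizes the score):
def branin(x, y):
    # Branin-Hoo benchmark, usually evaluated on x in [-5, 10], y in [0, 15];
    # its global minimum is about 0.397887
    b = 5.1 / (4. * np.pi ** 2)
    c = 5. / np.pi
    t = 1. / (8. * np.pi)
    return (y - b * x ** 2 + c * x - 6.) ** 2 \
        + 10. * (1. - t) * np.cos(x) + 10.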
n_random_init = 15
n_iter = 100
nb_iter_final = 0
acquisition_function = 'UCB'

def scoring_function(p_dict):
    # Shift the sampled x so the Branin function is evaluated on its
    # usual domain
    x, y = p_dict['x'], p_dict['y']
    x = x - 5.
    return branin(x, y)

# parameters, corr_kernel, mapWithNoise, model_noise, sampling_model,
# n_candidates, n_clusters and cluster_evol are assumed to be set earlier
# in the script
search = SmartSearch(parameters,
                     estimator=scoring_function,
                     corr_kernel=corr_kernel,
                     acquisition_function=acquisition_function,
                     GCP_mapWithNoise=mapWithNoise,
                     model_noise=model_noise,
                     model=sampling_model,
                     n_candidates=n_candidates,
                     n_iter=n_iter,
                     n_init=n_random_init,
                     n_final_iter=nb_iter_final,
                     n_clusters=n_clusters,
                     cluster_evol=cluster_evol,
                     verbose=2,
                     detailed_res=0)
search._fit()
def gp_vs_random_search(test_name, n_tests, search_length, save_data=False):
    """
    Compare GP-based search with a simple random search.
    Choose test_name in {'iris', 'text'}.
    """
    n_iter_search = search_length

    if test_name == 'iris':
        digits = load_digits()
        X, y = digits.data, digits.target
        pipeline = RandomForestClassifier()

        # Specify parameters and distributions to sample from
        parameters = {
            "max_depth": ['int', [3, 3]],
            "max_features": ['int', [1, 11]],
            "min_samples_split": ['int', [1, 11]],
            "min_samples_leaf": ['int', [1, 11]],
            "bootstrap": ['cat', [True, False]],
            "criterion": ['cat', ["gini", "entropy"]]
        }

    elif test_name == 'text':
        # Display progress logs on stdout
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')

        # Load some categories from the training set
        categories = [
            'alt.atheism',
            'talk.religion.misc',
        ]
        # Uncomment the following to do the analysis on all the categories
        # categories = None

        print("Loading 20 newsgroups dataset for categories:")
        print(categories)

        data = fetch_20newsgroups(subset='train', categories=categories)
        print("%d documents" % len(data.filenames))
        print("%d categories" % len(data.target_names))
        X = data.data
        y = data.target

        # Define a pipeline combining a text feature extractor with a simple
        # classifier
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier()),
        ])

        # Uncommenting more parameters will give better exploring power but
        # will increase processing time in a combinatorial way
        parameters = {
            'vect__max_df': ['float', [0.5, 1.]],
            # 'vect__max_features': (None, 5000, 10000, 50000),
            'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
            # 'tfidf__use_idf': (True, False),
            # 'tfidf__norm': ('l1', 'l2'),
            'clf__alpha': ['float', [0.000001, 0.00001]],
            'clf__penalty': ['cat', ['l2', 'elasticnet']]
            # 'clf__n_iter': (10, 50, 80),
        }

    else:
        print('Dataset not available for test')
        return

    # GP UCB search
    all_gp_ucb_results = []
    print('GP_ucb search')
    for i in range(n_tests):
        ucb_search = SmartSearch(parameters, estimator=pipeline, X=X, y=y,
                                 acquisition_function='UCB',
                                 n_iter=n_iter_search, n_init=20,
                                 verbose=False)
        _, scores = ucb_search._fit()
        max_scores = [scores[0]]
        print('Test', i, '-', len(scores), 'parameters tested')
        for j in range(1, len(scores)):
            max_scores.append(max(max_scores[j - 1], scores[j]))
        all_gp_ucb_results.append(extend_result(n_iter_search, max_scores))
    all_gp_ucb_results = np.asarray(all_gp_ucb_results)
    print(all_gp_ucb_results.shape)

    if save_data:
        np.savetxt('gp_ucb_scores.csv', all_gp_ucb_results, delimiter=',')

    # # GP EI search
    # all_gp_ei_results = []
    # print('GP_ei search')
    # for i in range(n_tests):
    #     ei_search = SmartSearch(parameters, estimator=pipeline, X=X, y=y,
    #                             acquisition_function='EI',
    #                             n_iter=n_iter_search, n_init=20,
    #                             verbose=False)
    #     _, scores = ei_search._fit()
    #     max_scores = [scores[0]]
    #     print('Test', i, '-', len(scores), 'parameters tested')
    #     for j in range(1, len(scores)):
    #         max_scores.append(max(max_scores[j - 1], scores[j]))
    #     all_gp_ei_results.append(extend_result(n_iter_search, max_scores))
    # all_gp_ei_results = np.asarray(all_gp_ei_results)
    # print(all_gp_ei_results.shape)
    # if save_data:
    #     np.savetxt('gp_ei_scores.csv', all_gp_ei_results, delimiter=',')

    # Randomized search
    print('Random search')
    all_random_results = []
    for i in range(n_tests):
        random_search = SmartSearch(parameters, estimator=pipeline, X=X, y=y,
                                    n_iter=n_iter_search,
                                    n_init=n_iter_search,
                                    verbose=False)
        _, scores = random_search._fit()
        max_scores = [scores[0]]
        print('Test', i, '-', len(scores), 'parameters tested')
        for j in range(1, len(scores)):
            max_scores.append(max(max_scores[j - 1], scores[j]))
        all_random_results.append(extend_result(n_iter_search, max_scores))
    all_random_results = np.asarray(all_random_results)

    if save_data:
        np.savetxt('rand_scores.csv', all_random_results, delimiter=',')

    plt.figure()
    # plt.plot(range(n_iter_search), np.mean(all_gp_ei_results, axis=0),
    #          'r', label='GP-EI')
    plt.plot(range(n_iter_search), np.mean(all_gp_ucb_results, axis=0),
             'b', label='GP-UCB')
    plt.plot(range(n_iter_search), np.mean(all_random_results, axis=0),
             'g', label='Random')
    plt.legend(loc=4)
    plt.title('Test GP vs Random on ' + test_name + ' dataset - Average on '
              + str(n_tests) + ' trials')
    plt.xlabel('Iterations')
    plt.ylabel('Max CV performance')
    plt.show()
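# A possible invocation (illustrative values, not from the original source):
#
# gp_vs_random_search('iris', n_tests=5, search_length=50, save_data=True)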