def test3():
    """Run a 30-iteration SmartSearch over a text-classification pipeline.

    The pipeline (bag-of-words counts, TF-IDF weighting, SGD classifier) is
    tuned on the training split of two 20-newsgroups categories.
    """
    # INFO-level logging with a timestamped format.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Only two categories are loaded; use ``categories = None`` to run the
    # analysis on every category instead.
    categories = ['alt.atheism', 'talk.religion.misc']

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    n_documents = len(data.filenames)
    n_categories = len(data.target_names)
    print("%d documents" % n_documents)
    print("%d categories" % n_categories)
    print()

    # Text feature extractor chained with a simple classifier.
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ]
    pipeline = Pipeline(steps)

    # Each entry maps a pipeline parameter to [type tag, values]
    # (presumably bounds for 'float' and the allowed choices for 'cat' --
    # confirm against SmartSearch). Adding more entries widens the search
    # but increases processing time combinatorially.
    parameters = {
        'vect__max_df': ['float', [0.5, 1.]],
        'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
        'tfidf__norm': ['cat', ('l1', 'l2')],
        'clf__alpha': ['float', [0.000001, 0.0001]],
        'clf__penalty': ['cat', ['l2', 'elasticnet']],
    }

    search = SmartSearch(
        parameters,
        estimator=pipeline,
        X=data.data,
        y=data.target,
        n_iter=30,
    )
    search._fit()
def test__create_next_day_valid_input(
    self,
    input_target_day_of_week: int,
    input_target_hour: int,
    input_target_minute: int,
    input_timezone: str,
) -> None:
    """Checks the datetime generated for valid day/hour/minute targets.

    The generated value must not be dated before today (in the target
    timezone) and must match the requested weekday, hour and minute.

    Args:
        input_target_day_of_week (int): Target ISO day of week number
        input_target_hour (int): Target hour of the day in 24hr time
        input_target_minute (int): Target minute of the hour
        input_timezone (str): Target timezone
    """
    next_day = SmartSearch._create_next_day(
        input_target_day_of_week,
        input_target_hour,
        input_target_minute,
        input_timezone,
    )

    # "Today" is evaluated in the same timezone the value was generated for.
    today = datetime.now(timezone(input_timezone)).date()
    assert next_day.date() >= today

    assert next_day.isoweekday() == input_target_day_of_week
    assert next_day.hour == input_target_hour
    assert next_day.minute == input_target_minute
def test2():
    """Run a GP-model SmartSearch against a constant scoring function."""

    def scoring_function(x):
        # Constant score: every parameter set evaluates to the same result.
        return [0.5]

    # One categorical, one integer and one float parameter.
    parameters = {
        'kernel': ['cat', ['rbf', 'poly']],
        'd': ['int', [1, 3]],
        'C': ['float', [1, 10]],
    }

    search_options = dict(
        model='GP',
        estimator=scoring_function,
        n_iter=15,
        n_init=10,
        n_final_iter=3,
    )
    search = SmartSearch(parameters, **search_options)
    search._fit()
def test1():
    """Run a 20-iteration SmartSearch tuning a random forest on the digits data."""
    # The data comes from ``load_digits``.
    digits = load_digits()
    features = digits.data
    labels = digits.target

    forest = RandomForestClassifier(n_estimators=20)

    # Search space: parameter name -> [type tag, values].
    parameters = {
        "max_depth": ['int', [3, 3]],
        "max_features": ['int', [1, 11]],
        "min_samples_split": ['int', [1, 11]],
        "min_samples_leaf": ['int', [1, 11]],
        "bootstrap": ['cat', [True, False]],
        "criterion": ['cat', ["gini", "entropy"]],
    }

    search = SmartSearch(
        parameters,
        estimator=forest,
        X=features,
        y=labels,
        n_iter=20,
    )
    search._fit()
def test__has_desired_nbn(sample_listing: dict, desired_nbn: list,
                          expected: bool) -> None:
    """Checks detection of the desired NBN technology in a listing.

    Args:
        sample_listing (dict): Listing with NBN information to test against
        desired_nbn (list): Desired NBN technology
        expected (bool): Expected outcome
    """
    outcome = SmartSearch._has_desired_nbn(sample_listing, desired_nbn)
    assert outcome == expected
def test__has_sufficient_walkscore(sample_listing: dict,
                                   walkscore_test_value: int,
                                   expected: bool) -> None:
    """Checks whether a listing meets the minimum walkscore threshold.

    Args:
        sample_listing (dict): Listing with walkscore information to test
            against
        walkscore_test_value (int): Minimum acceptable walkscore
        expected (bool): Expected outcome
    """
    outcome = SmartSearch._has_sufficient_walkscore(sample_listing,
                                                    walkscore_test_value)
    assert outcome == expected
def test__extract_distance_duration(input_distance_result: dict) -> None:
    """Checks the shape of the data extracted from a gmaps distance result.

    The extraction must yield a dict holding exactly the "distance" and
    "duration" entries.

    Args:
        input_distance_result (dict): gmaps distance element
    """
    extracted = SmartSearch._extract_distance_duration(input_distance_result)

    assert isinstance(extracted, dict)
    assert len(extracted.keys()) == 2
    for required_key in ("distance", "duration"):
        assert required_key in extracted
def test__travel_time_less_than_threshold(input_travel_time: int,
                                          input_max_travel_time: int,
                                          expected: bool) -> None:
    """Checks the staticmethod that decides if a travel time is acceptable.

    Args:
        input_travel_time (int): Test actual travel time
        input_max_travel_time (int): Test max travel time
        expected (bool): Expected outcome
    """
    outcome = SmartSearch._travel_time_less_than_threshold(
        input_travel_time,
        input_max_travel_time,
    )
    assert outcome == expected
def test__create_next_day_invalid_input(
    self,
    input_target_day_of_week: int,
    input_target_hour: int,
    input_target_minute: int,
    input_timezone: str,
) -> None:
    """Checks that invalid day/hour/minute targets raise ``ValueError``.

    Args:
        input_target_day_of_week (int): Target ISO day of week number
        input_target_hour (int): Target hour of the day in 24hr time
        input_target_minute (int): Target minute of the hour
        input_timezone (str): Target timezone
    """
    invalid_arguments = (
        input_target_day_of_week,
        input_target_hour,
        input_target_minute,
        input_timezone,
    )
    with pytest.raises(ValueError):
        SmartSearch._create_next_day(*invalid_arguments)
def setup_smart_search() -> "SmartSearch":
    """Create an instance to test against.

    Credentials are read from the ``CLIENT_ID``, ``CLIENT_SECRET``,
    ``GOOGLE_MAPS_KEY`` and ``WSAPIKEY`` environment variables; any variable
    that is unset is passed through as ``None``.

    Returns:
        SmartSearch: Instantiated smart search class
    """
    scopes = ["api_listings_read", "api_agencies_read"]
    return SmartSearch(
        domain_client_id=os.getenv("CLIENT_ID"),
        domain_client_secret=os.getenv("CLIENT_SECRET"),
        domain_scopes=scopes,
        google_maps_key=os.getenv("GOOGLE_MAPS_KEY"),
        walkscore_api_key=os.getenv("WSAPIKEY"),
    )
from response_parser import clean_response from smart_search import SmartSearch from utils import json_serial # Instantiate Logger as per config LOGGER = configure_logger() # Instantiate searcher scopes = ["api_listings_read", "api_agencies_read"] searcher = SmartSearch( domain_client_id=os.getenv("CLIENT_ID"), domain_client_secret=os.getenv("CLIENT_SECRET"), domain_scopes=scopes, google_maps_key=os.getenv("GOOGLE_MAPS_KEY"), walkscore_api_key=os.getenv("WSAPIKEY"), ) def search(event: dict, context: object) -> dict: """Entry point function for the API to manage & handle requests. Args: event (dict): API request including gateway information context (object): Methods and properties that provide information about the invocation, function, and execution environment Returns: dict: Filtered properties based on search criteria
def runExperiment(first_exp,
                  n_exp,
                  parameters,
                  model='GCP',
                  n_random_init=10,
                  n_total_iter=30,
                  n_candidates=500,
                  corr_kernel='squared_exponential',
                  acquisition_function='UCB',
                  n_clusters=1,
                  cluster_evol='constant',
                  GCP_mapWithNoise=False,
                  GCP_useAllNoisyY=False,
                  model_noise=None):
    """Run a series of SmartSearch experiments against stored results.

    Scores are not computed live: ``scoring_function/params.csv`` and
    ``scoring_function/output.csv`` are loaded, and each requested parameter
    set is answered with results stored for its nearest neighbour in
    ``params.csv``.

    For every experiment the raw outputs, tested parameters and search paths
    returned by the search are written to ``exp_results/exp<id>/``
    (``exp_results`` itself must already exist).

    Args:
        first_exp: Index of the first experiment to run.
        n_exp: Number of experiments; indices ``first_exp`` to
            ``first_exp + n_exp - 1`` are run.
        parameters: Search-space description forwarded to ``SmartSearch``;
            its length sets the size of the query vector.
        model, n_candidates, corr_kernel, acquisition_function, n_clusters,
        cluster_evol, GCP_mapWithNoise, GCP_useAllNoisyY, model_noise:
            Forwarded unchanged to ``SmartSearch``.
        n_random_init: Forwarded to ``SmartSearch`` as ``n_init``.
        n_total_iter: Forwarded to ``SmartSearch`` as ``n_iter``.
    """
    last_exp = first_exp + n_exp
    print('Run experiment %s to %s' % (first_exp, last_exp))

    # Each row is stripped of its first character and last three characters
    # (presumably a bracket and trailing delimiter/newline -- confirm against
    # the file format) before being split on commas.
    with open("scoring_function/output.csv", 'r') as f:
        output = [[float(v) for v in row[1:-3].split(',')] for row in f]
    print('Loaded output file, %s rows' % len(output))

    params = np.genfromtxt("scoring_function/params.csv", delimiter=',')
    print('Loaded parameters file, shape : %s' % (params.shape,))

    KNN = NearestNeighbors()
    KNN.fit(params)

    def get_cv_res(p_dict):
        """Return five stored results for the stored point nearest ``p_dict``."""
        p = np.zeros(len(parameters))
        for k in p_dict:
            p[int(k)] = p_dict[k]
        # Query with a single-row 2-D array and take the scalar index so it
        # can be used directly on the ``output`` list.
        idx = int(KNN.kneighbors(p.reshape(1, -1), 1,
                                 return_distance=False)[0, 0])
        all_o = output[idx]
        # Stored results are consumed in consecutive groups of five; pick one
        # group at random. Floor division keeps the bound an integer.
        r = np.random.randint(len(all_o) // 5)
        return all_o[(5 * r):(5 * r + 5)]

    # A distinct loop name avoids shadowing the ``n_exp`` argument.
    for exp_id in range(first_exp, last_exp):
        print(' **** Run exp %s  ****' % exp_id)

        exp_dir = "exp_results/exp" + str(exp_id)
        if not os.path.exists(exp_dir):
            os.mkdir(exp_dir)
        else:
            print('Warning : directory already exists')

        search = SmartSearch(parameters,
                             estimator=get_cv_res,
                             corr_kernel=corr_kernel,
                             GCP_mapWithNoise=GCP_mapWithNoise,
                             GCP_useAllNoisyY=GCP_useAllNoisyY,
                             model_noise=model_noise,
                             model=model,
                             n_candidates=n_candidates,
                             n_iter=n_total_iter,
                             n_init=n_random_init,
                             n_clusters=n_clusters,
                             cluster_evol=cluster_evol,
                             verbose=2,
                             acquisition_function=acquisition_function,
                             detailed_res=2)

        (all_parameters, all_search_path,
         all_raw_outputs, all_mean_outputs) = search._fit()

        # Save the experiment's data, one file set per returned result.
        for i, raw_output in enumerate(all_raw_outputs):
            with open(exp_dir + "/output_" + str(i) + ".csv", 'w') as f:
                for line in raw_output:
                    f.write('%s\n' % (line,))
            np.savetxt(exp_dir + "/param_" + str(i) + ".csv",
                       all_parameters[i], delimiter=",")
            np.savetxt(exp_dir + "/param_path_" + str(i) + ".csv",
                       all_search_path[i], delimiter=",")

        print(' **** End experiment %s  ****\n' % exp_id)
print ' **** Run exp', n_exp, ' ****' ### set directory if not os.path.exists("exp_results/exp" + str(n_exp)): os.mkdir("exp_results/exp" + str(n_exp)) else: print('Warning : directory already exists') all_parameters,all_raw_outputs,all_mean_outputs, all_std_outputs, all_param_path = \ search = SmartSearch(parameters, estimator=scoring_function, corr_kernel = corr_kernel, acquisition_function = acquisition_function, GCP_mapWithNoise=mapWithNoise, model_noise = model_noise, model = sampling_model, n_candidates=n_candidates, n_iter = n_iter, n_init = n_random_init, n_final_iter=nb_iter_final, n_clusters=n_clusters, cluster_evol = cluster_evol, verbose=2, detailed_res = 2) all_parameters, all_search_path, all_raw_outputs, all_mean_outputs = search._fit( ) ## save experiment's data for i in range(len(all_raw_outputs)): f = open( ("exp_results/exp" + str(n_exp) + "/output_" + str(i) + ".csv"), 'w')
# Search budget and acquisition settings for this run.
n_iter = 100
nb_iter_final = 0
acquisition_function = 'UCB'


def scoring_function(p_dict):
    """Evaluate ``branin`` at the point in ``p_dict`` with ``x`` shifted by -5."""
    x_raw, y_val = p_dict['x'], p_dict['y']
    return branin(x_raw - 5., y_val)


search = SmartSearch(
    parameters,
    estimator=scoring_function,
    corr_kernel=corr_kernel,
    acquisition_function=acquisition_function,
    GCP_mapWithNoise=mapWithNoise,
    model_noise=model_noise,
    model=sampling_model,
    n_candidates=n_candidates,
    n_iter=n_iter,
    n_init=n_random_init,
    n_final_iter=nb_iter_final,
    n_clusters=n_clusters,
    cluster_evol=cluster_evol,
    verbose=2,
    detailed_res=0,
)
search._fit()
def _running_best(scores):
    """Return the running maximum of ``scores`` (same length as the input)."""
    best = [scores[0]]
    for score in scores[1:]:
        best.append(max(best[-1], score))
    return best


def _best_score_curves(n_tests, n_iter_search, parameters, estimator, X, y,
                       **search_kwargs):
    """Run ``n_tests`` searches and stack their best-score-so-far curves."""
    curves = []
    for i in range(n_tests):
        search = SmartSearch(parameters, estimator=estimator, X=X, y=y,
                             n_iter=n_iter_search, verbose=False,
                             **search_kwargs)
        _, scores = search._fit()
        print('Test %s - %s parameters tested' % (i, len(scores)))
        curves.append(extend_result(n_iter_search, _running_best(scores)))
    return np.asarray(curves)


def gp_vs_random_search(test_name, n_tests, search_lenght, save_data=False):
    """Compare GP-based (UCB) search against a purely random one.

    Both strategies are run ``n_tests`` times; the best score seen so far at
    each iteration is averaged over the trials and plotted.

    Args:
        test_name: ``'iris'`` (random forest; the data is loaded with
            ``load_digits``) or ``'text'`` (text pipeline on two
            20-newsgroups categories). Any other value prints a message and
            returns without running anything.
        n_tests: Number of independent trials per strategy.
        search_lenght: Number of search iterations per trial.
        save_data: When true, the per-trial curves are also written to
            ``gp_ucb_scores.csv`` and ``rand_scores.csv``.

    Returns:
        None
    """
    n_iter_search = search_lenght

    if test_name == 'iris':
        digits = load_digits()
        X, y = digits.data, digits.target
        pipeline = RandomForestClassifier()

        # Search space: parameter name -> [type tag, values].
        parameters = {
            "max_depth": ['int', [3, 3]],
            "max_features": ['int', [1, 11]],
            "min_samples_split": ['int', [1, 11]],
            "min_samples_leaf": ['int', [1, 11]],
            "bootstrap": ['cat', [True, False]],
            "criterion": ['cat', ["gini", "entropy"]],
        }

    elif test_name == 'text':
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')

        # Only two categories are loaded; use ``categories = None`` to run
        # the analysis on every category instead.
        categories = ['alt.atheism', 'talk.religion.misc']

        print("Loading 20 newsgroups dataset for categories:")
        print(categories)

        data = fetch_20newsgroups(subset='train', categories=categories)
        print("%d documents" % len(data.filenames))
        print("%d categories" % len(data.target_names))

        X = data.data
        y = data.target

        # Text feature extractor chained with a simple classifier.
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier()),
        ])

        # Adding more entries widens the search but increases processing
        # time combinatorially.
        parameters = {
            'vect__max_df': ['float', [0.5, 1.]],
            'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # uni/bigrams
            'clf__alpha': ['float', [0.000001, 0.00001]],
            'clf__penalty': ['cat', ['l2', 'elasticnet']],
        }

    else:
        # Nothing is configured for this name, so stop here instead of
        # failing later on undefined locals.
        print('Dataset not available for test')
        return

    # GP search driven by the UCB acquisition function, 20 initial points.
    print('GP_ucb search')
    all_gp_ucb_results = _best_score_curves(
        n_tests, n_iter_search, parameters, pipeline, X, y,
        acquisition_function='UCB', n_init=20)
    print(all_gp_ucb_results.shape)
    if save_data:
        np.savetxt('gp_ucb_scores.csv', all_gp_ucb_results, delimiter=',')

    # Random baseline: the number of initial points equals the whole budget.
    print('Random search')
    all_random_results = _best_score_curves(
        n_tests, n_iter_search, parameters, pipeline, X, y,
        n_init=n_iter_search)
    if save_data:
        np.savetxt('rand_scores.csv', all_random_results, delimiter=',')

    iterations = range(n_iter_search)
    plt.figure()
    plt.plot(iterations, np.mean(all_gp_ucb_results, axis=0), 'b',
             label='GP-UCB')
    plt.plot(iterations, np.mean(all_random_results, axis=0), 'g',
             label='Random')
    plt.legend(loc=4)
    plt.title('Test GP vs Random on ' + test_name +
              ' dataset - Average on ' + str(n_tests) + ' trials')
    plt.xlabel('Iterations')
    plt.ylabel('Max CV performance')
    plt.show()
def test__has_feature(input_search_features: list,
                      input_feature_search_words: list,
                      expected: bool) -> None:
    """Checks identification of a feature from a feature list and description.

    Args:
        input_search_features (list): List of test desired domain features
        input_feature_search_words (list): List of test feature search words
        expected (bool): Expected output
    """
    # Fixed property data that every parametrised case is matched against.
    description = "This place has airconditioning, aint it great"
    property_features = [
        "AirConditioning",
        "BuiltInWardrobes",
        "CableOrSatellite",
        "Ensuite",
        "Floorboards",
        "Gas",
        "InternalLaundry",
        "PetsAllowed",
        "SecureParking",
        "SwimmingPool",
        "Furnished",
        "GroundFloor",
        "WaterViews",
        "NorthFacing",
        "CityViews",
        "IndoorSpa",
        "Gym",
        "AlarmSystem",
        "Intercom",
        "BroadbandInternetAccess",
        "Bath",
        "Fireplace",
        "SeparateDiningRoom",
        "Heating",
        "Dishwasher",
        "Study",
        "TennisCourt",
        "Shed",
        "FullyFenced",
        "BalconyDeck",
        "GardenCourtyard",
        "OutdoorSpa",
        "DoubleGlazedWindows",
        "EnergyEfficientAppliances",
        "WaterEfficientAppliances",
        "WallCeilingInsulation",
        "RainwaterStorageTank",
        "GreywaterSystem",
        "WaterEfficientFixtures",
        "SolarHotWater",
        "SolarPanels",
    ]

    outcome = SmartSearch._has_feature(
        search_features=input_search_features,
        feature_search_words=input_feature_search_words,
        property_details_features=property_features,
        property_description=description,
    )
    assert outcome == expected