Exemplo n.º 1
0
def test3():
    """Run SmartSearch over a text-classification pipeline.

    Fetches two categories of the 20 newsgroups training set, builds a
    CountVectorizer -> TfidfTransformer -> SGDClassifier pipeline and launches
    a 30-iteration search over the vectorizer, tf-idf and classifier
    hyper-parameters.
    """
    # Progress logs go to stdout.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Restrict the corpus to two categories; set this to None to analyse
    # every category instead.
    categories = ['alt.atheism', 'talk.religion.misc']

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    newsgroups = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(newsgroups.filenames))
    print("%d categories" % len(newsgroups.target_names))
    print()

    # Text feature extraction followed by a simple linear classifier.
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ]
    text_clf = Pipeline(steps)

    # Each entry is [kind, values] with kind in 'float' / 'cat'. Adding more
    # entries widens the exploration but increases processing time.
    search_space = {
        'vect__max_df': ['float', [0.5, 1.]],
        'vect__ngram_range': ['cat', [(1, 1), (1, 2)]],  # unigrams or bigrams
        'tfidf__norm': ['cat', ('l1', 'l2')],
        'clf__alpha': ['float', [0.000001, 0.0001]],
        'clf__penalty': ['cat', ['l2', 'elasticnet']],
    }

    smart_search = SmartSearch(search_space,
                               estimator=text_clf,
                               X=newsgroups.data,
                               y=newsgroups.target,
                               n_iter=30)
    smart_search._fit()
    def test__create_next_day_valid_input(
        self,
        input_target_day_of_week: int,
        input_target_hour: int,
        input_target_minute: int,
        input_timezone: str,
    ) -> None:
        """Check the datetime produced by ``_create_next_day`` for valid input.

        The generated value must not be dated before today in the target
        timezone, and must carry the requested ISO weekday, hour and minute.

        Args:
            input_target_day_of_week (int): Target ISO day of week number
            input_target_hour (int): Target hour of the day in 24hr time
            input_target_minute (int): Target minute of the hour
            input_timezone (str): Target timezone
        """
        next_day = SmartSearch._create_next_day(
            input_target_day_of_week,
            input_target_hour,
            input_target_minute,
            input_timezone,
        )
        # "Today" is evaluated in the same timezone the value was built for.
        today = datetime.now(timezone(input_timezone)).date()

        assert next_day.date() >= today
        assert next_day.isoweekday() == input_target_day_of_week
        assert next_day.hour == input_target_hour
        assert next_day.minute == input_target_minute
Exemplo n.º 3
0
def test2():
    """Run a GP-model SmartSearch against a constant scoring callable.

    The estimator is a plain function that ignores its argument and always
    returns ``[0.5]``, so the run exercises the search loop itself.
    """
    def constant_score(_candidate):
        return [0.5]

    search_space = {
        'kernel': ['cat', ['rbf', 'poly']],
        'd': ['int', [1, 3]],
        'C': ['float', [1, 10]],
    }

    smart_search = SmartSearch(search_space,
                               model='GP',
                               estimator=constant_score,
                               n_iter=15,
                               n_init=10,
                               n_final_iter=3)
    smart_search._fit()
Exemplo n.º 4
0
def test1():
    """Run SmartSearch over a random forest fitted on the digits dataset."""
    digits = load_digits()
    forest = RandomForestClassifier(n_estimators=20)

    # Search space: each entry is [kind, values] with kind in 'int' / 'cat'.
    search_space = {
        "max_depth": ['int', [3, 3]],
        "max_features": ['int', [1, 11]],
        "min_samples_split": ['int', [1, 11]],
        "min_samples_leaf": ['int', [1, 11]],
        "bootstrap": ['cat', [True, False]],
        "criterion": ['cat', ["gini", "entropy"]],
    }

    smart_search = SmartSearch(search_space,
                               estimator=forest,
                               X=digits.data,
                               y=digits.target,
                               n_iter=20)
    smart_search._fit()
def test__has_desired_nbn(sample_listing: dict, desired_nbn: list,
                          expected: bool) -> None:
    """Check ``SmartSearch._has_desired_nbn`` against an expected outcome.

    Args:
        sample_listing (dict): Listing with NBN information to test against
        desired_nbn (list): Desired NBN technology
        expected (bool): Expected outcome
    """
    outcome = SmartSearch._has_desired_nbn(sample_listing, desired_nbn)
    assert outcome == expected
def test__has_sufficient_walkscore(sample_listing: dict,
                                   walkscore_test_value: int,
                                   expected: bool) -> None:
    """Check ``SmartSearch._has_sufficient_walkscore`` against an expected outcome.

    Args:
        sample_listing (dict): Listing with walkscore information to test against
        walkscore_test_value (int): Minimum acceptable walkscore
        expected (bool): Expected outcome
    """
    outcome = SmartSearch._has_sufficient_walkscore(sample_listing,
                                                    walkscore_test_value)
    assert outcome == expected
def test__extract_distance_duration(input_distance_result: dict) -> None:
    """Check the shape of the data extracted from a gmaps distance element.

    The extraction must yield a dict holding exactly two entries, keyed
    "distance" and "duration".

    Args:
        input_distance_result (dict): gmaps distance element
    """
    extracted = SmartSearch._extract_distance_duration(input_distance_result)

    assert isinstance(extracted, dict)
    assert len(extracted) == 2
    for expected_key in ("distance", "duration"):
        assert expected_key in extracted
def test__travel_time_less_than_threshold(input_travel_time: int,
                                          input_max_travel_time: int,
                                          expected: bool) -> None:
    """Check the staticmethod that decides whether a travel time is acceptable.

    Args:
        input_travel_time (int): Test actual travel time
        input_max_travel_time (int): Test max travel time
        expected (bool): Expected outcome
    """
    outcome = SmartSearch._travel_time_less_than_threshold(
        input_travel_time, input_max_travel_time)
    assert outcome == expected
    def test__create_next_day_invalid_input(
        self,
        input_target_day_of_week: int,
        input_target_hour: int,
        input_target_minute: int,
        input_timezone: str,
    ) -> None:
        """Check that ``_create_next_day`` rejects invalid input.

        The call is expected to raise ``ValueError`` for the supplied
        combination of arguments.

        Args:
            input_target_day_of_week (int): Target ISO day of week number
            input_target_hour (int): Target hour of the day in 24hr time
            input_target_minute (int): Target minute of the hour
            input_timezone (str): Target timezone
        """
        invalid_args = (
            input_target_day_of_week,
            input_target_hour,
            input_target_minute,
            input_timezone,
        )
        with pytest.raises(ValueError):
            SmartSearch._create_next_day(*invalid_args)
def setup_smart_search() -> "SmartSearch":
    """Create an instance to test against.

    Credentials and API keys are read from the environment variables
    CLIENT_ID, CLIENT_SECRET, GOOGLE_MAPS_KEY and WSAPIKEY; any variable that
    is unset is passed through as ``None``.

    Returns:
        SmartSearch: Instantiated smart search class
    """
    # The return annotation is a string so it is not evaluated at definition
    # time; it previously claimed ``None`` although the instance is returned.
    scopes = ["api_listings_read", "api_agencies_read"]
    searcher = SmartSearch(
        domain_client_id=os.getenv("CLIENT_ID"),
        domain_client_secret=os.getenv("CLIENT_SECRET"),
        domain_scopes=scopes,
        google_maps_key=os.getenv("GOOGLE_MAPS_KEY"),
        walkscore_api_key=os.getenv("WSAPIKEY"),
    )
    return searcher
from response_parser import clean_response

from smart_search import SmartSearch

from utils import json_serial

# Module-level logger, built by the project's logging configuration helper.
LOGGER = configure_logger()

# Shared SmartSearch client, created once at import time. Credentials come
# from the environment; an unset variable is passed through as None.
scopes = ["api_listings_read", "api_agencies_read"]
searcher = SmartSearch(
    domain_client_id=os.environ.get("CLIENT_ID"),
    domain_client_secret=os.environ.get("CLIENT_SECRET"),
    domain_scopes=scopes,
    google_maps_key=os.environ.get("GOOGLE_MAPS_KEY"),
    walkscore_api_key=os.environ.get("WSAPIKEY"),
)


def search(event: dict, context: object) -> dict:
    """Entry point function for the API to manage & handle requests.

    Args:
        event (dict): API request including gateway information
        context (object): Methods and properties that provide information about the invocation,
                          function, and execution environment

    Returns:
        dict: Filtered properties based on search criteria
Exemplo n.º 12
0
def runExperiment(first_exp,
                  n_exp,
                  parameters,
                  model='GCP',
                  n_random_init=10,
                  n_total_iter=30,
                  n_candidates=500,
                  corr_kernel='squared_exponential',
                  acquisition_function='UCB',
                  n_clusters=1,
                  cluster_evol='constant',
                  GCP_mapWithNoise=False,
                  GCP_useAllNoisyY=False,
                  model_noise=None):
    """Run a series of SmartSearch experiments against stored scores.

    Instead of evaluating a real estimator, every candidate is scored by
    looking up its nearest neighbour in ``scoring_function/params.csv`` and
    returning five consecutive values from the matching row of
    ``scoring_function/output.csv``. For each experiment the raw outputs, the
    tested parameters and the search path are written to
    ``exp_results/exp<N>/``.

    Args:
        first_exp: Index of the first experiment to run.
        n_exp: Number of experiments; indices ``first_exp`` up to
            ``first_exp + n_exp - 1`` are run.
        parameters: Search-space description handed to SmartSearch. Its
            length sets the size of the lookup vector.
        model, n_candidates, corr_kernel, acquisition_function, n_clusters,
        cluster_evol, GCP_mapWithNoise, GCP_useAllNoisyY, model_noise:
            Forwarded unchanged to SmartSearch under the same keyword.
        n_random_init: Forwarded to SmartSearch as ``n_init``.
        n_total_iter: Forwarded to SmartSearch as ``n_iter``.

    Raises:
        IOError / OSError: If the input files are missing, or if the
            ``exp_results`` parent directory does not exist.
    """
    last_exp = first_exp + n_exp
    print('Run experiment {} to {}'.format(first_exp, last_exp))

    # Load the stored outputs: one list of floats per row.
    output = []
    with open("scoring_function/output.csv", 'r') as f:
        for row in f:
            # The first character and the last three are dropped before
            # splitting -- presumably enclosing brackets plus the line
            # ending; TODO confirm against the file format.
            fields = row[1:-3].split(',')
            output.append([float(value) for value in fields])
    print('Loaded output file, {} rows'.format(len(output)))

    params = np.genfromtxt("scoring_function/params.csv", delimiter=',')
    print('Loaded parameters file, shape : {}'.format(params.shape))

    KNN = NearestNeighbors()
    KNN.fit(params)

    def get_cv_res(p_dict):
        """Return five stored scores for the stored point nearest to p_dict."""
        p = np.zeros(len(parameters))
        for key, value in p_dict.items():
            p[int(key)] = value
        # kneighbors expects a 2-D (n_queries, n_features) array and returns
        # an (n_queries, n_neighbors) index array; take the single index as a
        # plain int so it can index the Python list.
        neighbours = KNN.kneighbors(p.reshape(1, -1),
                                    n_neighbors=1,
                                    return_distance=False)
        idx = int(neighbours[0][0])
        all_o = output[idx]
        # Rows are consumed in groups of five; pick one group at random.
        # Floor division keeps the bound an integer on Python 3 as well.
        r = np.random.randint(len(all_o) // 5)
        return all_o[(5 * r):(5 * r + 5)]

    ###  Run experiment  ###

    # A dedicated loop name avoids shadowing the ``n_exp`` argument.
    for exp_id in range(first_exp, last_exp):
        print(' ****   Run exp {}   ****'.format(exp_id))
        ### set directory
        exp_dir = "exp_results/exp" + str(exp_id)
        if not os.path.exists(exp_dir):
            os.mkdir(exp_dir)
        else:
            print('Warning : directory already exists')

        search = SmartSearch(parameters,
                             estimator=get_cv_res,
                             corr_kernel=corr_kernel,
                             GCP_mapWithNoise=GCP_mapWithNoise,
                             GCP_useAllNoisyY=GCP_useAllNoisyY,
                             model_noise=model_noise,
                             model=model,
                             n_candidates=n_candidates,
                             n_iter=n_total_iter,
                             n_init=n_random_init,
                             n_clusters=n_clusters,
                             cluster_evol=cluster_evol,
                             verbose=2,
                             acquisition_function=acquisition_function,
                             detailed_res=2)

        # The fourth returned value (mean outputs) is not saved.
        all_parameters, all_search_path, all_raw_outputs, _ = search._fit()

        ## save experiment's data
        for i, raw_output in enumerate(all_raw_outputs):
            with open(exp_dir + "/output_" + str(i) + ".csv", 'w') as f:
                for line in raw_output:
                    f.write(str(line) + '\n')
            np.savetxt(exp_dir + "/param_" + str(i) + ".csv",
                       all_parameters[i], delimiter=",")
            np.savetxt(exp_dir + "/param_path_" + str(i) + ".csv",
                       all_search_path[i], delimiter=",")

        print(' ****   End experiment {}   ****\n'.format(exp_id))
Exemplo n.º 13
0
    # Announce the experiment being started.
    print ' ****   Run exp', n_exp, '  ****'
    ### set directory
    # Each experiment writes into its own exp_results/exp<N> directory.
    if not os.path.exists("exp_results/exp" + str(n_exp)):
        os.mkdir("exp_results/exp" + str(n_exp))
    else:
        print('Warning : directory already exists')

    # NOTE(review): the line continuation below turns this into a chained
    # assignment -- the new SmartSearch instance is bound to `search` AND
    # unpacked into the five names on the left. Unless SmartSearch is iterable
    # with exactly five items this raises at runtime. It looks like a leftover
    # from an older `._fit()` unpacking; confirm and remove.
    all_parameters,all_raw_outputs,all_mean_outputs, all_std_outputs, all_param_path = \
    search = SmartSearch(parameters,
          estimator=scoring_function,
          corr_kernel = corr_kernel,
          acquisition_function = acquisition_function,
          GCP_mapWithNoise=mapWithNoise,
          model_noise = model_noise,
          model = sampling_model,
          n_candidates=n_candidates,
          n_iter = n_iter,
          n_init = n_random_init,
          n_final_iter=nb_iter_final,
          n_clusters=n_clusters,
          cluster_evol = cluster_evol,
          verbose=2,
          detailed_res = 2)
    # Run the search; the result is unpacked into four sequences here.
    all_parameters, all_search_path, all_raw_outputs, all_mean_outputs = search._fit(
    )

    ## save experiment's data
    # One output_<i>.csv file is opened per entry of all_raw_outputs.
    for i in range(len(all_raw_outputs)):
        f = open(
            ("exp_results/exp" + str(n_exp) + "/output_" + str(i) + ".csv"),
            'w')
Exemplo n.º 14
0
# Passed to SmartSearch below as its `n_iter` argument.
n_iter = 100
# Passed to SmartSearch below as its `n_final_iter` argument.
nb_iter_final = 0
# Name of the acquisition function handed to SmartSearch below.
acquisition_function = 'UCB'


def scoring_function(p_dict):
    """Score a candidate point with ``branin``.

    Reads the 'x' and 'y' entries of ``p_dict``, shifts x by -5 and returns
    ``branin(x - 5, y)``.
    """
    raw_x, raw_y = p_dict['x'], p_dict['y']
    return branin(raw_x - 5., raw_y)


# Keyword settings for the optimiser, collected from the module-level
# configuration values.
_search_settings = dict(
    estimator=scoring_function,
    corr_kernel=corr_kernel,
    acquisition_function=acquisition_function,
    GCP_mapWithNoise=mapWithNoise,
    model_noise=model_noise,
    model=sampling_model,
    n_candidates=n_candidates,
    n_iter=n_iter,
    n_init=n_random_init,
    n_final_iter=nb_iter_final,
    n_clusters=n_clusters,
    cluster_evol=cluster_evol,
    verbose=2,
    detailed_res=0,
)

# Build the search over `parameters` and run it.
search = SmartSearch(parameters, **_search_settings)

search._fit()

Exemplo n.º 15
0
def _best_so_far(scores):
    """Return the running maximum of ``scores``, one entry per input score."""
    best = [scores[0]]
    for score in scores[1:]:
        best.append(max(best[-1], score))
    return best


def _collect_search_curves(parameters, estimator, X, y, n_tests,
                           n_iter_search, n_init, **search_kwargs):
    """Run ``n_tests`` SmartSearch trials and stack their best-so-far curves."""
    curves = []
    for i in range(n_tests):
        trial = SmartSearch(parameters,
                            estimator=estimator,
                            X=X,
                            y=y,
                            n_iter=n_iter_search,
                            n_init=n_init,
                            verbose=False,
                            **search_kwargs)
        _, scores = trial._fit()
        print('Test {} - {} parameters tested'.format(i, len(scores)))
        curves.append(extend_result(n_iter_search, _best_so_far(scores)))
    return np.asarray(curves)


def gp_vs_random_search(test_name, n_tests, search_lenght, save_data=False):
    """Compare GP-based search vs a simple random one.

    Runs ``n_tests`` GP-UCB searches and ``n_tests`` baseline searches on the
    chosen problem, then plots the mean best-so-far score of each against the
    iteration number.

    Args:
        test_name: 'iris' or 'text'. 'iris' tunes a RandomForestClassifier
            (the data actually loaded is the digits dataset); 'text' tunes a
            tf-idf + SGD pipeline on two 20 newsgroups categories.
        n_tests: Number of independent trials per strategy.
        search_lenght: Number of iterations per search. The spelling is kept
            so that existing keyword callers keep working.
        save_data: When true, also write the score matrices to
            'gp_ucb_scores.csv' and 'rand_scores.csv'.

    Returns:
        None. For an unknown ``test_name`` a message is printed and the
        function returns without running any search.
    """
    n_iter_search = search_lenght

    if test_name == 'iris':
        digits = load_digits()
        X, y = digits.data, digits.target
        pipeline = RandomForestClassifier()

        # specify parameters and distributions to sample from
        parameters = {
            "max_depth": ['int', [3, 3]],
            "max_features": ['int', [1, 11]],
            "min_samples_split": ['int', [1, 11]],
            "min_samples_leaf": ['int', [1, 11]],
            "bootstrap": ['cat', [True, False]],
            "criterion": ['cat', ["gini", "entropy"]]
        }

    elif test_name == 'text':
        # Display progress logs on stdout
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')

        # Load some categories from the training set; set to None to do the
        # analysis on all the categories.
        categories = [
            'alt.atheism',
            'talk.religion.misc',
        ]
        print("Loading 20 newsgroups dataset for categories:")
        print(categories)

        data = fetch_20newsgroups(subset='train', categories=categories)
        print("%d documents" % len(data.filenames))
        print("%d categories" % len(data.target_names))

        X = data.data
        y = data.target

        # define a pipeline combining a text feature extractor with a simple
        # classifier
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', SGDClassifier()),
        ])

        # adding more parameters will give better exploring power but will
        # increase processing time in a combinatorial way
        parameters = {
            'vect__max_df': ['float', [0.5, 1.]],
            'vect__ngram_range': ['cat', [(1, 1),
                                          (1, 2)]],  # unigrams or bigrams
            'clf__alpha': ['float', [0.000001, 0.00001]],
            'clf__penalty': ['cat', ['l2', 'elasticnet']]
        }

    else:
        # Nothing below can run without a dataset and a search space, so stop
        # here instead of failing later on undefined names.
        print('Dataset not available for test')
        return

    # GP UCB search
    print('GP_ucb search')
    all_gp_ucb_results = _collect_search_curves(
        parameters, pipeline, X, y, n_tests, n_iter_search,
        n_init=20, acquisition_function='UCB')
    print(all_gp_ucb_results.shape)
    if save_data:
        np.savetxt('gp_ucb_scores.csv', all_gp_ucb_results, delimiter=',')

    # A GP-EI comparison was disabled in the original code; it can be added
    # with another _collect_search_curves call using
    # acquisition_function='EI' and a matching plt.plot line below.

    # Randomized search: n_init equals the total number of iterations
    # (presumably every point is then drawn at random -- confirm against
    # SmartSearch).
    print('Random search')
    all_random_results = _collect_search_curves(
        parameters, pipeline, X, y, n_tests, n_iter_search,
        n_init=n_iter_search)
    if save_data:
        np.savetxt('rand_scores.csv', all_random_results, delimiter=',')

    plt.figure()
    plt.plot(range(n_iter_search),
             np.mean(all_gp_ucb_results, axis=0),
             'b',
             label='GP-UCB')
    plt.plot(range(n_iter_search),
             np.mean(all_random_results, axis=0),
             'g',
             label='Random')
    plt.legend(loc=4)
    plt.title('Test GP vs Random on ' + test_name + ' dataset - Average on ' +
              str(n_tests) + ' trials')
    plt.xlabel('Iterations')
    plt.ylabel('Max CV performance')
    plt.show()
def test__has_feature(input_search_features: list,
                      input_feature_search_words: list,
                      expected: bool) -> None:
    """Check ``SmartSearch._has_feature`` against an expected outcome.

    The call is made with a fixed property description and a fixed list of
    property feature names; only the searched features and search words vary
    between cases.

    Args:
        input_search_features (list): List of test desired domain features
        input_feature_search_words (list): List of test feature search words
        expected (bool): Expected output
    """
    description = "This place has airconditioning, aint it great"
    listed_features = [
        "AirConditioning", "BuiltInWardrobes", "CableOrSatellite",
        "Ensuite", "Floorboards", "Gas",
        "InternalLaundry", "PetsAllowed", "SecureParking",
        "SwimmingPool", "Furnished", "GroundFloor",
        "WaterViews", "NorthFacing", "CityViews",
        "IndoorSpa", "Gym", "AlarmSystem",
        "Intercom", "BroadbandInternetAccess", "Bath",
        "Fireplace", "SeparateDiningRoom", "Heating",
        "Dishwasher", "Study", "TennisCourt",
        "Shed", "FullyFenced", "BalconyDeck",
        "GardenCourtyard", "OutdoorSpa", "DoubleGlazedWindows",
        "EnergyEfficientAppliances", "WaterEfficientAppliances",
        "WallCeilingInsulation", "RainwaterStorageTank", "GreywaterSystem",
        "WaterEfficientFixtures", "SolarHotWater", "SolarPanels",
    ]

    outcome = SmartSearch._has_feature(
        search_features=input_search_features,
        feature_search_words=input_feature_search_words,
        property_details_features=listed_features,
        property_description=description,
    )
    assert outcome == expected