def createHistoricalDataForMisoKGDiff(dim_obj_func_min, listPrevData, directory, bias_filename, mult=-1.0):
    """
    This data is only used to train the mKG hyperparameters; listPrevData[0] is assumed
    to be the unbiased IS.
    :param dim_obj_func_min: dim of the objective function
    :param listPrevData: list of tuples (points, vals, noise)
    :param directory: dir of the pickle file that contains the bias data
    :param bias_filename: name of the pickle file that contains the bias data
    :param mult: multiplier applied to all observed values (default -1.0)
    :return: list of HistoricalData objects, one per IS
    """
    with open("{0}/{1}.pickle".format(directory, bias_filename), "rb") as input_file:
        bias_data = pickle.load(input_file)
    data_IS0 = HistoricalData(dim_obj_func_min)
    data_IS0.append_historical_data(listPrevData[0][0],
                                    mult * numpy.array(listPrevData[0][1]),
                                    numpy.array(listPrevData[0][2]))
    data_list = [data_IS0]
    for i in range(len(listPrevData) - 1):
        data = HistoricalData(dim_obj_func_min)
        # use at most 200 bias observations; the noise of a diff observation is the sum
        # of the mean noise variances of the unbiased IS and of IS i+1
        data.append_historical_data(
            bias_data['points'][i][:200, :],
            mult * numpy.array(bias_data['vals'][i][:200]),
            numpy.ones(len(bias_data['vals'][i][:200])) *
            (numpy.mean(listPrevData[0][2]) + numpy.mean(listPrevData[i + 1][2])))
        data_list.append(data)
    return data_list
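# Hypothetical usage sketch (argument values are illustrative, not from this repo):
# listPrevData = [(pts0, vals0, noise0), (pts1, vals1, noise1)]  # entry 0 must be the unbiased IS
# diff_data = createHistoricalDataForMisoKGDiff(2, listPrevData, "picklesDiff", "ato_bias_data")
# diff_data is a list of HistoricalData objects: the unbiased IS first, then one per biased IS,
# each built from at most 200 bias observations with inflated noise.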
def sample_intial_x_general(problem, num_initial_pts_per_s, points_x, exp_path, result_path):
    """Sample initial points for each value of s, negating the observed values, and also
    write the (point, value, noise) triples to result_path for later reloading."""
    list_init_pts_value_noise = []
    new_historical_data = HistoricalData(dim=problem.obj_func_min.getDim())
    repQL = problem.obj_func_min.repQL
    s_min = problem.obj_func_min.getSearchDomain()[0, 0]
    s_max = problem.obj_func_min.getSearchDomain()[0, 1]
    for s in np.linspace(s_min, s_max, num=problem.obj_func_min.getNums()):
        random_seeds = np.random.randint(900, size=num_initial_pts_per_s)
        points = np.hstack((s * np.ones(num_initial_pts_per_s).reshape((-1, 1)), points_x))
        vals_array = np.zeros(num_initial_pts_per_s)
        noise_array = np.zeros(num_initial_pts_per_s)
        for i, (pt, random_seed) in enumerate(zip(points, random_seeds)):
            value, noise_array[i] = problem.obj_func_min.evaluate(repQL, pt, random_seed, exp_path)
            vals_array[i] = -1.0 * value
        new_historical_data.append_historical_data(points, vals_array, noise_array)
        pts_value_noise = np.hstack((points,
                                     vals_array.reshape((-1, 1)),
                                     noise_array.reshape((-1, 1))))
        list_init_pts_value_noise.append(pts_value_noise)
    with open(result_path + '_initial_samples.txt', "w") as file:
        file.write(str(list_init_pts_value_noise))
    with open(result_path + '_initial_samples.pickle', "wb") as file:
        # was a bare dump(); assumes the module imports pickle, as the companion
        # loader load_sample_data() below uses pickle.load
        pickle.dump(np.array(list_init_pts_value_noise), file)
    # print(list_init_pts_value_noise)
    return new_historical_data
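# Hypothetical usage sketch (problem, points_x and the paths are placeholders):
# hist = sample_intial_x_general(problem, num_initial_pts_per_s=5, points_x=lhs_points,
#                                exp_path='/tmp/exp', result_path='/tmp/run0')
# The sampled (point, value, noise) triples are also written to
# '/tmp/run0_initial_samples.pickle' so that load_sample_data() below can restore them.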
def createHistoricalDataGeneral(dim_obj_func_min, listPrevData, mult, indexFirstIS=0):
    '''
    Args:
        dim_obj_func_min: dim of the objective function, as given in obj_func_min._dim
        listPrevData: list of tuples (points, vals, noise)
        mult: multiplier applied to all observed values, e.g. -1.0 to turn the
            minimization version into the maximization version used by misoKG
        indexFirstIS: the number of the first IS given in listPrevData; the others are
            numbered consecutively

    Returns: HistoricalData object for KG, with an additional first column that gives
        the IS each data point corresponds to
    '''
    data = HistoricalData(dim_obj_func_min + 1)
    indexIS = indexFirstIS  # the number that corresponds to the IS dimension in the GP
    for dataset in listPrevData:
        # prepend a column that gives the IS the data corresponds to
        IS_pts = numpy.hstack((indexIS * numpy.ones(len(dataset[0])).reshape((-1, 1)), dataset[0]))
        # multiply all values by mult (typically -1.0) since we assume that the training
        # data stems from the minimization version, but misoKG uses the maximization version
        vals = mult * numpy.array(dataset[1])
        data.append_historical_data(IS_pts, vals, dataset[2])
        indexIS += 1
    return data
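# Hypothetical usage sketch: turn two datasets (IS 0 and IS 1) stemming from the
# minimization version into a single HistoricalData for the maximization version:
# hist = createHistoricalDataGeneral(dim, [(pts0, vals0, noise0), (pts1, vals1, noise1)],
#                                    mult=-1.0, indexFirstIS=0)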
def sample_initial_data(problem, num_initial_pts_per_IS):
    points = problem.obj_func_min.get_moe_domain().generate_uniform_random_points_in_domain(num_initial_pts_per_IS)
    points_dict = {}
    vals_dict = {}
    noise_dict = {}
    # dim is increased by one for the index of the IS
    new_historical_data = HistoricalData(dim=problem.obj_func_min.getDim() + 1)
    for IS in problem.obj_func_min.getList_IS_to_query():
        points_dict[IS] = np.hstack((IS * np.ones(num_initial_pts_per_IS).reshape((-1, 1)), points))
        vals_dict[IS] = np.array([-1.0 * problem.obj_func_min.evaluate(IS, pt) for pt in points])
        # note: misoKG will learn the noise from the sampled data
        noise_dict[IS] = np.ones(len(points)) * problem.obj_func_min.noise_and_cost_func(IS, None)[0]
        new_historical_data.append_historical_data(points_dict[IS], vals_dict[IS], noise_dict[IS])
    return new_historical_data
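# Hypothetical usage sketch (problem is a placeholder with the interface used above):
# hist = sample_initial_data(problem, num_initial_pts_per_IS=10)
# Every IS is evaluated at the same 10 random points; the IS index becomes the
# first coordinate of each point in the returned HistoricalData.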
def load_sample_data(problem, num_per_var, exp_path, result_path):
    var_dim = int(problem.obj_func_min.getDim()) - 1
    num_initial_pts_per_s = int(num_per_var * var_dim)
    with open(result_path + '_initial_samples.pickle', 'rb') as file:
        list_init_pts_value_noise = pickle.load(file)
    new_historical_data = HistoricalData(dim=problem.obj_func_min.getDim())
    repQL = problem.obj_func_min.repQL
    s_min = problem.obj_func_min.getSearchDomain()[0, 0]
    s_max = problem.obj_func_min.getSearchDomain()[0, 1]
    for count, s in enumerate(np.linspace(s_min, s_max, num=problem.obj_func_min.getNums())):
        pts_value_noise = list_init_pts_value_noise[count]
        points = pts_value_noise[:, 0:-2]
        vals_array = pts_value_noise[:, -2]
        noise_array = pts_value_noise[:, -1]
        new_historical_data.append_historical_data(points, vals_array, noise_array)
    return new_historical_data
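# Hypothetical usage sketch, assuming sample_intial_x_general() above was run with
# result_path='/tmp/run0':
# hist = load_sample_data(problem, num_per_var=5, exp_path='/tmp/exp', result_path='/tmp/run0')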
def get_random_gp_data(space_dim, num_is, num_data_each_is, kernel_name):
    """
    Generate random GP data.
    :param space_dim: dimension of the search space
    :param num_is: number of information sources (in addition to IS 0)
    :param num_data_each_is: number of data points to sample per IS
    :param kernel_name: currently either 'mix_exp' or 'prod_ker'
    :return: (hyper_params, data)
    """
    sample_var = 0.01
    if kernel_name == "mix_exp":
        hyper_params = numpy.random.uniform(size=(num_is + 1) * (space_dim + 1))
        cov = MixedSquareExponential(hyper_params, space_dim + 1, num_is)
    elif kernel_name == "prod_ker":
        # integer division: the product is always even
        hyper_params = numpy.random.uniform(size=(num_is + 1) * (num_is + 2) // 2 + space_dim + 1)
        cov = ProductKernel(hyper_params, space_dim + 1, num_is + 1)
    else:
        raise NotImplementedError("invalid kernel")
    python_search_domain = pythonTensorProductDomain([
        ClosedInterval(bound[0], bound[1])
        for bound in numpy.repeat([[-10., 10.]], space_dim + 1, axis=0)
    ])
    data = HistoricalData(space_dim + 1)
    init_pts = python_search_domain.generate_uniform_random_points_in_domain(2)
    init_pts[:, 0] = numpy.zeros(2)
    data.append_historical_data(init_pts, numpy.zeros(2), numpy.ones(2) * sample_var)
    gp = GaussianProcess(cov, data)
    points = python_search_domain.generate_uniform_random_points_in_domain(num_data_each_is)
    for pt in points:
        for i in range(num_is):
            pt[0] = i
            val = gp.sample_point_from_gp(pt, sample_var)
            data.append_sample_points([[pt, val, sample_var], ])
            gp = GaussianProcess(cov, data)
    return hyper_params, data
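# Hypothetical usage sketch:
# hyper_params, data = get_random_gp_data(space_dim=2, num_is=2, num_data_each_is=20,
#                                         kernel_name='mix_exp')
# data then contains one GP sample per IS at each of the 20 points, with the IS index
# stored in the first coordinate.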
def createHistoricalDataForMisoEI(dim_obj_func_min, listPrevData, directory, bias_filename):
    """
    Note: since misoEI uses the notion of a fidelity variance, it is set here to
    noise_var + bias^2, where the bias is estimated from the bias data.
    :param dim_obj_func_min: dim of the objective function
    :param listPrevData: list of tuples (points, vals, noise)
    :param directory: dir of the pickle file that contains the bias data
    :param bias_filename: name of the pickle file that contains the bias data
    :return: (list of HistoricalData objects, list of squared biases)
    """
    with open("{0}/{1}.pickle".format(directory, bias_filename), "rb") as input_file:
        bias_data = pickle.load(input_file)
    # IS 0 is unbiased; the bias of each other IS is estimated as the mean of its bias data
    bias_sq_list = numpy.power(
        numpy.concatenate(([0.], [
            numpy.mean(bias_data['vals'][i]) for i in range(len(listPrevData) - 1)
        ])), 2.0)
    data_list = []
    for i, dataset in enumerate(listPrevData):
        data = HistoricalData(dim_obj_func_min)
        data.append_historical_data(dataset[0], dataset[1],
                                    numpy.array(dataset[2]) + bias_sq_list[i])
        data_list.append(data)
    return data_list, bias_sq_list
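# Hypothetical usage sketch (filenames are placeholders):
# data_list, bias_sq_list = createHistoricalDataForMisoEI(dim, listPrevData,
#                                                         "picklesDiff", "ato_bias_data")
# data_list[i] is the HistoricalData for IS i with fidelity variance noise + bias^2.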
def generate_data(self, num_data):
    python_search_domain = pythonTensorProductDomain([
        ClosedInterval(bound[0], bound[1])
        for bound in self._info_dict['search_domain']
    ])
    data = HistoricalData(self._info_dict['dim'])
    init_pts = python_search_domain.generate_uniform_random_points_in_domain(2)
    init_pts[:, 0] = numpy.zeros(2)
    data.append_historical_data(init_pts, numpy.zeros(2), numpy.ones(2) * self._sample_var_1)
    gp = GaussianProcess(self._cov, data)
    points = python_search_domain.generate_uniform_random_points_in_domain(num_data)
    for pt in points:
        # randomly assign each point to IS 1 or IS 2 and use that IS's sample variance
        pt[0] = numpy.ceil(numpy.random.uniform(high=2.0, size=1))
        sample_var = self._sample_var_1 if pt[0] == 1 else self._sample_var_2
        val = gp.sample_point_from_gp(pt, sample_var)
        data.append_sample_points([[pt, val, sample_var], ])
        gp = GaussianProcess(self._cov, data)
    return data
def test(self):
    rb = RosenbrockVanilla()
    func_name = rb.getFuncName()
    pathToPickles = 'picklesTest'

    ### Test load_data_from_a_min_problem()
    name_testfile = 'load_and_store_Test'
    samples = numpy.array([[[1, 1], [1, 2]]])
    values = [[1.0, 2.0]]
    data = {"points": samples, "vals": values}
    with open("{0}/{1}.pickle".format(pathToPickles, name_testfile), "wb") as output_file:
        pickle.dump(data, output_file)
    loaded_pts, loaded_vals = load_data_from_a_min_problem(pathToPickles, name_testfile)
    for index in range(len(samples)):
        # note: .all() with parentheses, so the assertion actually checks the comparison
        self.assertTrue((samples[index] == loaded_pts[index]).all())
    for index in range(len(values)):
        self.assertTrue(values[index] == loaded_vals[index])

    # test overwriting
    samples = numpy.array([[[1, 4], [1, 2]]])
    data = {"points": samples, "vals": values}  # rebuild the dict so the new samples are dumped
    with open("{0}/{1}.pickle".format(pathToPickles, name_testfile), "wb") as output_file:
        pickle.dump(data, output_file)
    loaded_pts, loaded_vals = load_data_from_a_min_problem(pathToPickles, name_testfile)
    for index in range(len(samples)):
        self.assertTrue((samples[index] == loaded_pts[index]).all())
    for index in range(len(values)):
        self.assertTrue(values[index] == loaded_vals[index])

    ### Test obtainHistoricalDataForEGO()
    # TODO come up with tests for these functions
    list_IS_to_query = [0]
    num_init_pts_each_IS = 10
    name_testfile = rb.getFuncName() + '_' + 'IS_' \
        + '_'.join(str(element) for element in list_IS_to_query) \
        + '_' + str(num_init_pts_each_IS) + "_points_each"
    with open("{0}/{1}.pickle".format(pathToPickles, name_testfile), "wb") as output_file:
        pickle.dump(data, output_file)
    # testHistoricalData = obtainHistoricalDataForEGO(True, rb, pathToPickles, list_IS_to_query, num_init_pts_each_IS)
    # print testHistoricalData
    #
    # testHistoricalDataRandom = obtainHistoricalDataForEGO(False, rb, pathToPickles, list_IS_to_query, num_init_pts_each_IS)
    # print testHistoricalDataRandom

    ### Test createHistoricalDataForKG()
    listPrevData = []
    samples = [[1, 1], [1, 2]]
    values = [1.0, 2.0]
    list_noise_variance_at_sample = [0.1, 0.3]
    listPrevData.append((samples, values, list_noise_variance_at_sample))
    hist_kg = createHistoricalDataForKG(rb._dim, listPrevData)
    IS_samples = [[0, 1, 1], [0, 1, 2]]
    for index in range(len(hist_kg.points_sampled)):
        self.assertTrue((IS_samples[index] == hist_kg.points_sampled[index]).all())
    for index in range(len(hist_kg.points_sampled_value)):
        # the values are stored negated (maximization version), consistent with the
        # assertions on hist_kg further below
        self.assertAlmostEqual(values[index], -1.0 * hist_kg.points_sampled_value[index],
                               delta=0.0001)

    samples = [[0, 0], [4, 3]]
    for index in range(len(hist_kg.points_sampled)):
        self.assertTrue((IS_samples[index] == hist_kg.points_sampled[index]).all())

    listPrevData = [(samples, values, list_noise_variance_at_sample)]
    bestpt, bestval, best_truth = findBestSampledValue(rb, listPrevData, 0)
    self.assertAlmostEqual(bestval, 1.0, delta=0.0001)
    self.assertAlmostEqual(best_truth, numpy.float64(-9.0), delta=1.0)
    self.assertTrue(bestpt == [0.0, 0.0])

    list_sampled_IS = [0, 0]
    gathered_data_from_all_replications = []
    gathered_data_from_all_replications.append({
        "points": samples,
        "vals": values,
        "noise_variance": list_noise_variance_at_sample,
        "sampledIS": list_sampled_IS
    })
    for indexList in range(len(gathered_data_from_all_replications)):
        for indexElem in range(len(gathered_data_from_all_replications[indexList]['vals'])):
            self.assertAlmostEqual(values[indexElem],
                                   gathered_data_from_all_replications[indexList]['vals'][indexElem],
                                   delta=0.0001)
        for indexElem in range(len(gathered_data_from_all_replications[indexList]['points'])):
            self.assertTrue(samples[indexElem] ==
                            gathered_data_from_all_replications[indexList]['points'][indexElem])
        for indexElem in range(len(gathered_data_from_all_replications[indexList]['sampledIS'])):
            self.assertTrue(list_sampled_IS[indexElem] ==
                            gathered_data_from_all_replications[indexList]['sampledIS'][indexElem])

    # the same checks must still hold after appending a second, identical replication
    gathered_data_from_all_replications.append({
        "points": samples,
        "vals": values,
        "noise_variance": list_noise_variance_at_sample,
        "sampledIS": list_sampled_IS
    })
    for indexList in range(len(gathered_data_from_all_replications)):
        for indexElem in range(len(gathered_data_from_all_replications[indexList]['vals'])):
            self.assertAlmostEqual(values[indexElem],
                                   gathered_data_from_all_replications[indexList]['vals'][indexElem],
                                   delta=0.0001)
        for indexElem in range(len(gathered_data_from_all_replications[indexList]['points'])):
            self.assertTrue(samples[indexElem] ==
                            gathered_data_from_all_replications[indexList]['points'][indexElem])
        for indexElem in range(len(gathered_data_from_all_replications[indexList]['sampledIS'])):
            self.assertTrue(list_sampled_IS[indexElem] ==
                            gathered_data_from_all_replications[indexList]['sampledIS'][indexElem])

    samples = [[-1., 0], [0.1, -2.0]]
    values = [0.2, 1.5]
    list_sampled_IS = [3, 3]
    gathered_data_from_all_replications.append({
        "points": samples,
        "vals": values,
        "noise_variance": list_noise_variance_at_sample,
        "sampledIS": list_sampled_IS
    })
    for indexElem in range(len(gathered_data_from_all_replications[2]['vals'])):
        self.assertAlmostEqual(values[indexElem],
                               gathered_data_from_all_replications[2]['vals'][indexElem],
                               delta=0.0001)
    for indexElem in range(len(gathered_data_from_all_replications[2]['points'])):
        self.assertTrue(samples[indexElem] ==
                        gathered_data_from_all_replications[2]['points'][indexElem])
    for indexElem in range(len(gathered_data_from_all_replications[2]['sampledIS'])):
        self.assertTrue(list_sampled_IS[indexElem] ==
                        gathered_data_from_all_replications[2]['sampledIS'][indexElem])

    listPrevData.append((gathered_data_from_all_replications[2]['points'],
                         gathered_data_from_all_replications[2]['vals'],
                         gathered_data_from_all_replications[2]['noise_variance']))
    hist_kg = createHistoricalDataForKG(rb._dim, listPrevData)
    self.assertTrue((hist_kg.points_sampled[0] == [0, 0, 0]).all())
    self.assertTrue((hist_kg.points_sampled[1] == [0, 4, 3]).all())
    self.assertTrue((hist_kg.points_sampled[2] == [1, -1.0, 0]).all())
    self.assertTrue((hist_kg.points_sampled[3] == [1, .1, -2]).all())
    self.assertAlmostEqual(values[0], -1.0 * hist_kg.points_sampled_value[2], delta=0.0001)
    self.assertAlmostEqual(values[1], -1.0 * hist_kg.points_sampled_value[3], delta=0.0001)
    self.assertAlmostEqual(list_noise_variance_at_sample[0],
                           hist_kg.points_sampled_noise_variance[2], delta=0.0001)
    self.assertAlmostEqual(list_noise_variance_at_sample[1],
                           hist_kg.points_sampled_noise_variance[3], delta=0.0001)

    ### Test findBestSampledValueFromHistoricalData()
    atoext = AssembleToOrderExtended(mult=-1.0)
    hd = HistoricalData(atoext.getDim())
    pts = numpy.array([[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                       [1.0, 0.2, 0.3, 0.4, 0.5, 0.2, 0.3, 0.4]])
    self.assertTrue(len(pts) == 2)
    self.assertTrue(len(pts[0]) == atoext.getDim())
    self.assertTrue(len(pts[1]) == atoext.getDim())
    vals = [-1.0, 0.2]
    noises = [0.1, 0.2]
    hd.append_historical_data(pts, vals, noises)
    bestpt, best_val, best_truth = findBestSampledValueFromHistoricalData(atoext, hd)
    self.assertTrue((pts[0] == bestpt).all())
    self.assertTrue(best_val == -1.0)
    self.assertAlmostEqual(best_truth, atoext.evaluate(2, bestpt), delta=10.0)

    pts = numpy.array([[1.3, 1.4, 10.0, 11.0, 19.0, 1.0, 1.0, 1.0],
                       [13.0, 10.2, 10.3, 10.4, 10.5, 0.2, 10.3, 0.4]])
    vals = [-11.0, 10.2]
    noises = [10.1, 1000.2]
    hd.append_historical_data(pts, vals, noises)
    bestpt, best_val, best_truth = findBestSampledValueFromHistoricalData(atoext, hd)
    self.assertTrue((pts[0] == bestpt).all())
    self.assertTrue(best_val == -11.0)

    pts2 = numpy.array([[10.3, 10.4, 10.0, 11.0, 19.0, 1.0, 1.0, 1.0],
                        [13.0, 10.2, 10.3, 10.4, 10.5, 0.2, 10.3, 0.4]])
    vals = [11.0, 10.2]
    # append the new pts2 points; the earlier minimum of -11.0 should remain the best
    hd.append_historical_data(pts2, vals, noises)
    bestpt, best_val, best_truth = findBestSampledValueFromHistoricalData(atoext, hd)
    self.assertTrue((pts[0] == bestpt).all())
    self.assertTrue(best_val == -11.0)
    'multifidelity_kg_hyperparam_' + func_name,
    sql_util.sql_engine).mean(axis=0).values
kg_data = HistoricalData(obj_func_max._dim + 1)
best_sampled_val = numpy.inf
for i in range(obj_func_max._num_IS):
    IS_pts = numpy.hstack(((i + 1) * numpy.ones(len(init_pts[i])).reshape((-1, 1)), init_pts[i]))
    # multiply all values by -1 since we assume that the training data stems from the
    # minimization version, but misoKG uses the maximization version
    vals = -1.0 * numpy.array(init_vals[i])
    # obtain what used to be sample_vars
    noise_vars = numpy.array([noise_and_cost_func(i + 1, pt)[0] for pt in init_pts[i]])
    kg_data.append_historical_data(IS_pts, vals, noise_vars)
    # keep track of the best initial value
    if numpy.amin(init_vals[i]) < best_sampled_val:
        best_sampled_val = numpy.amin(init_vals[i])
        best_sampled_point = init_pts[i][numpy.argmin(init_vals[i]), :]
truth_at_best_sampled = obj_func_min.evaluate(truth_IS, best_sampled_point)
kg_cov = MixedSquareExponential(hyperparameters=kg_hyper_param,
                                total_dim=obj_func_max._dim + 1,
                                num_is=obj_func_max._num_IS)
kg_cov_cpp = cppMixedSquareExponential(hyperparameters=kg_hyper_param)
kg_gp_cpp = GaussianProcessNew(kg_cov_cpp, kg_data, obj_func_max._num_IS)
for kg_n in range(num_iterations):
    print "itr {0}, {1}".format(kg_n, benchmark_result_table_name)

    ### idea: first discretize points and then keep only the good ones
### Format: IS 0: signal variance and length scales, IS 1: signal variance and length scales, etc.
### Then observational noise for IS 0, IS 1, etc.
hyperparameters_noise = numpy.power(best_hyper[-num_IS:], 2.0)
hypers_GP = best_hyper[:-num_IS]

# update the noise in the historical data
updated_points_sampled_noise_variance = create_array_points_sampled_noise_variance(
    current_hist_data.points_sampled, hyperparameters_noise)

# create a new HistoricalData object with the updated values
# (dim is increased by one for the index of the IS)
new_historical_data = HistoricalData(dim=problem.obj_func_min.getDim() + 1)
new_historical_data.append_historical_data(current_hist_data.points_sampled,
                                           current_hist_data.points_sampled_value,
                                           updated_points_sampled_noise_variance)

# use the new hyperparameters -- this requires instantiating a new GP object
kg_cov_cpp = cppMixedSquareExponential(hyperparameters=hypers_GP)
kg_gp_cpp = GaussianProcessNew(kg_cov_cpp, new_historical_data, num_IS_in=problem.num_is_in)
# kg_cov_cpp is not used afterwards

### Find the IS and point that maximize KG/cost
discretization_points = problem.obj_func_min.get_moe_domain().generate_uniform_random_points_in_domain(
    num_discretization_before_ranking)
discretization_points = np.hstack((np.zeros((num_discretization_before_ranking, 1)),
                                   discretization_points))
    sql_util.sql_engine).mean(axis=0).values
search_domain = pythonTensorProductDomain([
    ClosedInterval(bound[0], bound[1])
    for bound in obj_func_max._search_domain
])

### Generate initial points
data = HistoricalData(obj_func_max._dim + 1)
for i in range(obj_func_max._num_IS):
    pts = search_domain.generate_uniform_random_points_in_domain(num_init_pts_all_IS[i])
    vals = [obj_func_max.evaluate(i + 1, pt) for pt in pts]
    IS_pts = numpy.hstack(((i + 1) * numpy.ones(num_init_pts_all_IS[i]).reshape((-1, 1)), pts))
    sample_vars = [obj_func_max.noise_and_cost_func(i + 1, pt)[0] for pt in pts]
    data.append_historical_data(IS_pts, vals, sample_vars)

cov_func = MixedSquareExponential(hyperparameters=hyper_param,
                                  total_dim=obj_func_max._dim + 1,
                                  num_is=obj_func_max._num_IS)
gp = GaussianProcess(cov_func, data)
# print "start max mu"
# num_randomization = 100000
# random_pts = search_domain.generate_uniform_random_points_in_domain(num_randomization)
# zero_random_pts = numpy.hstack((numpy.zeros((num_randomization, 1)), random_pts))
# print "random pts generated"
# mu_list = gp.compute_mean_of_points(zero_random_pts)
# print "compute mean completed"
# best_mu = numpy.amax(mu_list)
# best_pt = random_pts[numpy.argmax(mu_list), :]
    tolerance=1.0e-3)
cpp_sgd_params_ps = cppGradientDescentParameters(num_multistarts=1,
                                                 max_num_steps=12,
                                                 max_num_restarts=1,
                                                 num_steps_averaged=3,
                                                 gamma=0.7,
                                                 pre_mult=0.01,
                                                 max_relative_change=0.01,
                                                 tolerance=1.0e-5)

if obj_func_name == "GP":
    gp_grad_info_dict = pickle.load(open('random_gp_grad_1d', 'rb'))
    hist_data_grad = HistoricalData(gp_grad_info_dict['dim'], 1)
    hist_data_grad.append_historical_data(gp_grad_info_dict['points'],
                                          gp_grad_info_dict['values'],
                                          gp_grad_info_dict['vars'])
    objective_func = synthetic_functions.RandomGP(gp_grad_info_dict['dim'],
                                                  gp_grad_info_dict['hyper_params'],
                                                  hist_data_grad)
    hyper_params = gp_grad_info_dict['hyper_params']
    init_pts = [[-1.5], [-1.0], [1.0], [1.5]]
    ymax = 2
elif obj_func_name == "GP_wavy":
    gp_grad_info_dict = pickle.load(open('random_gp_1d_wavy', 'rb'))
    hist_data_grad = HistoricalData(gp_grad_info_dict['dim'], 0)
    hist_data_grad.append_historical_data(gp_grad_info_dict['points'],
                                          gp_grad_info_dict['values'],
                                          gp_grad_info_dict['vars'])
    print gp_grad_info_dict['values']
    objective_func = synthetic_functions.RandomGP(
def construct_hist_data_from_pickle(dim, directory, IS_filename_dict, combine_IS, sign,
                                    take_diff=False, primary_key=None):
    """
    :param dim: space dimension of the problem
    :type dim: int
    :param directory: dir of the pickle files
    :type directory: str
    :param IS_filename_dict: {IS: filename} hashtable which provides the name of the pickle file for the corresponding IS
    :type IS_filename_dict: dict
    :param combine_IS: whether to construct a single HistoricalData on the space IS x space, or a dict of HistoricalData objects, one per IS
    :type combine_IS: bool
    :param sign: sign = 1.0 means a minimization problem, otherwise maximization
    :type sign: float
    :param take_diff: whether to take the diff between IS_i and the primary IS; this is enabled for one approach to estimating mKG hyperparameters
    :type take_diff: bool
    :param primary_key: if take_diff = True, this specifies the primary IS
    :type primary_key: int
    :return: if combine_IS = True, a HistoricalData object, otherwise a dict {IS: HistoricalData}
    :rtype: HistoricalData or dict
    """
    points_dict = {}
    vals_dict = {}
    noise_dict = {}
    if take_diff:
        with open("{0}/{1}.pickle".format(directory, IS_filename_dict[primary_key]), "rb") as f:
            data = pickle.load(f)
            points_dict[primary_key] = np.array(data['points'])
            vals_dict[primary_key] = sign * np.array(data['vals'])
            noise_dict[primary_key] = np.array(data['noise'])
    for key in IS_filename_dict:
        if take_diff and key != primary_key:
            with open("{0}/{1}.pickle".format(directory, IS_filename_dict[key]), "rb") as f:
                data = pickle.load(f)
                assert np.array_equal(data['points'], points_dict[primary_key]), \
                    "inconsistent points, cannot take diff!"
                points_dict[key] = np.array(data['points'])
                vals_dict[key] = sign * np.array(data['vals']) - vals_dict[primary_key]
                noise_dict[key] = np.array(data['noise']) + noise_dict[primary_key]
        elif not take_diff:
            with open("{0}/{1}.pickle".format(directory, IS_filename_dict[key]), "rb") as f:
                data = pickle.load(f)
                points_dict[key] = np.array(data['points'])
                vals_dict[key] = sign * np.array(data['vals'])
                noise_dict[key] = np.array(data['noise'])
    if combine_IS:
        to_return = HistoricalData(dim=dim + 1)
        for key in points_dict:
            num_data = len(vals_dict[key])
            # prepend the IS index as the first coordinate
            to_return.append_historical_data(
                np.hstack((key * np.ones(num_data).reshape((-1, 1)), points_dict[key])),
                vals_dict[key], noise_dict[key])
    else:
        to_return = {}
        for key in points_dict:
            to_return[key] = HistoricalData(dim=dim)
            to_return[key].append_historical_data(points_dict[key], vals_dict[key], noise_dict[key])
    return to_return
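# Hypothetical usage sketch (directory and filenames are placeholders):
# hist = construct_hist_data_from_pickle(dim=4, directory="picklesATO",
#                                        IS_filename_dict={0: 'IS0_data', 1: 'IS1_data'},
#                                        combine_IS=True, sign=-1.0)
# With combine_IS=False the same call returns {0: HistoricalData, 1: HistoricalData} instead.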
def construct_hist_data_from_s3(bucket, dim, IS_key_dict, combine_IS, sign,
                                take_diff=False, primary_IS=None):
    """
    :param bucket: amazon s3 bucket object
    :param dim: space dimension of the problem
    :type dim: int
    :param IS_key_dict: {IS: key} hashtable which provides the key of the data for the corresponding IS
    :type IS_key_dict: dict
    :param combine_IS: whether to construct a single HistoricalData on the space IS x space, or a dict of HistoricalData objects, one per IS
    :type combine_IS: bool
    :param sign: sign = 1.0 means a minimization problem, otherwise maximization
    :type sign: float
    :param take_diff: whether to take the diff between IS_i and the primary IS; this is enabled for one approach to estimating mKG hyperparameters
    :type take_diff: bool
    :param primary_IS: if take_diff = True, this specifies the primary IS
    :type primary_IS: int
    :return: if combine_IS = True, a HistoricalData object, otherwise a dict {IS: HistoricalData}
    :rtype: HistoricalData or dict
    """
    points_dict = {}
    vals_dict = {}
    noise_dict = {}
    if take_diff:
        data = get_data_from_s3(bucket, IS_key_dict[primary_IS])
        points_dict[primary_IS] = np.array(data['points'])
        vals_dict[primary_IS] = sign * np.array(data['vals'])
        noise_dict[primary_IS] = np.array(data['noise'])
    for IS in IS_key_dict:
        if take_diff and IS != primary_IS:
            data = get_data_from_s3(bucket, IS_key_dict[IS])
            assert np.array_equal(data['points'], points_dict[primary_IS]), \
                "inconsistent points, cannot take diff!"
            points_dict[IS] = np.array(data['points'])
            vals_dict[IS] = sign * np.array(data['vals']) - vals_dict[primary_IS]
            noise_dict[IS] = np.array(data['noise']) + noise_dict[primary_IS]
        elif not take_diff:
            data = get_data_from_s3(bucket, IS_key_dict[IS])
            points_dict[IS] = np.array(data['points'])
            vals_dict[IS] = sign * np.array(data['vals'])
            noise_dict[IS] = np.array(data['noise'])
    if combine_IS:
        to_return = HistoricalData(dim=dim + 1)
        for IS in points_dict:
            num_data = len(vals_dict[IS])
            # prepend the IS index as the first coordinate
            to_return.append_historical_data(
                np.hstack((IS * np.ones(num_data).reshape((-1, 1)), points_dict[IS])),
                vals_dict[IS], noise_dict[IS])
    else:
        to_return = {}
        for IS in points_dict:
            to_return[IS] = HistoricalData(dim=dim)
            to_return[IS].append_historical_data(points_dict[IS], vals_dict[IS], noise_dict[IS])
    return to_return
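# Hypothetical usage sketch (the bucket name and keys are placeholders; assumes a
# boto 2-style bucket object compatible with get_data_from_s3()):
# bucket = boto.connect_s3().get_bucket('miso-data')
# hist_dict = construct_hist_data_from_s3(bucket, dim=4,
#                                         IS_key_dict={0: 'IS0_key', 1: 'IS1_key'},
#                                         combine_IS=False, sign=1.0)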
def obtainHistoricalDataForEGO(load_historical_data_from_pickle, obj_func_min, directoryToPickles,
                               list_IS_to_query, num_init_pts_each_IS, init_data_pickle_filename=''):
    '''
    Create a HistoricalData object for EGO that contains the initial data. If the truth IS
    is among the queried IS, then only the data from that IS is loaded.

    Args:
        load_historical_data_from_pickle: if True, load from pickle; otherwise do a random Latin hypercube design
        obj_func_min: the problem
        directoryToPickles: path to the directory that contains the pickle files
        list_IS_to_query: list of the IS that should be queried, e.g. [0, 1, 2]
        num_init_pts_each_IS: number of points for each IS; used either to find the right pickle
            or to determine the number of points to sample
        init_data_pickle_filename: optional filename of the pickle to load

    Returns: HistoricalData object
    '''
    historical_data = HistoricalData(obj_func_min._dim)
    if load_historical_data_from_pickle:
        if init_data_pickle_filename == '':
            init_data_pickle_filename = obj_func_min.getFuncName() + '_' + 'IS_' \
                + '_'.join(str(element) for element in list_IS_to_query) + '_' \
                + str(num_init_pts_each_IS) + "_points_each"
        init_pts_array, init_vals_array = load_data_from_a_min_problem(
            directoryToPickles, init_data_pickle_filename)
        # if the truth IS is among the sampled IS, then load only that one:
        if obj_func_min.getTruthIS() in list_IS_to_query:
            indexArray = list_IS_to_query.index(obj_func_min.getTruthIS())
            sample_vars = [
                obj_func_min.noise_and_cost_func(obj_func_min.getTruthIS(), pt)[0]
                for pt in init_pts_array[indexArray]
            ]
            historical_data.append_historical_data(init_pts_array[indexArray],
                                                   init_vals_array[indexArray],
                                                   sample_vars)
        else:
            # load data for all IS
            indexArray = 0
            for index_IS in list_IS_to_query:
                sample_vars = [
                    obj_func_min.noise_and_cost_func(index_IS, pt)[0]
                    for pt in init_pts_array[indexArray]
                ]
                historical_data.append_historical_data(init_pts_array[indexArray],
                                                       init_vals_array[indexArray],
                                                       sample_vars)
                indexArray += 1
    else:
        # generate initial data by querying random points for each IS
        for index_IS in list_IS_to_query:
            if (obj_func_min.getTruthIS() in list_IS_to_query) and (index_IS != obj_func_min.getTruthIS()):
                continue  # the truth IS is observed but this is another IS: skip it
            search_domain = pythonTensorProductDomain([
                ClosedInterval(bound[0], bound[1])
                for bound in obj_func_min._search_domain
            ])
            pts = search_domain.generate_uniform_random_points_in_domain(num_init_pts_each_IS)
            vals = [obj_func_min.evaluate(index_IS, pt) for pt in pts]
            sample_vars = [obj_func_min.noise_and_cost_func(index_IS, pt)[0] for pt in pts]
            historical_data.append_historical_data(pts, vals, sample_vars)
    return historical_data
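# Hypothetical usage sketch (problem_min is a placeholder): if obj_func_min.getTruthIS()
# is contained in list_IS_to_query, only the truth IS's data enters the HistoricalData:
# hist = obtainHistoricalDataForEGO(False, problem_min, 'pickles', [0, 2], 10)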