def write_gp_model(cls, gp_model, method=SBO_METHOD, n_samples_parameters=0,
                   name_model='gp_fitting_gaussian'):
    """
    Write the gp_model after new points are added.

    The model is serialized with ``gp_model.serialize()`` and written to
    ``GP_DIR/<problem_name>/<file name>``. The directory is created when it
    doesn't exist, together with any missing parent directory.

    :param gp_model: gp model instance
    :param method: (str)
    :param n_samples_parameters: int
    :param name_model: (str) key of ``cls._model_map``
    """
    model_type = cls._model_map[name_model]

    f_name = cls._get_filename_modified(model_type, gp_model.problem_name,
                                        gp_model.type_kernel, gp_model.training_name,
                                        method, n_samples_parameters)

    gp_dir = path.join(GP_DIR, gp_model.problem_name)

    # os.mkdir raises if GP_DIR itself is missing; makedirs builds the whole chain.
    if not os.path.exists(gp_dir):
        os.makedirs(gp_dir)

    gp_path = path.join(gp_dir, f_name)
    JSONFile.write(gp_model.serialize(), gp_path)
def add_point(self, point, model_objective_value):
    """
    Register a new point, evaluate the objective on it and save the state.

    :param point: np.array(k)
    :param model_objective_value: float
    :return: float, first entry of the evaluation of the objective at the point
    """
    self.evaluated_points.append(list(point))
    self.model_objective_values.append(model_objective_value)

    evaluation = self.evaluate_objective(
        self.module,
        list(point),
        n_samples=self.n_samples,
        objective_function=self.objective_function)

    self.objective_values.append(evaluation[0])

    # The second entry of the evaluation is only stored for noisy problems.
    if self.noise:
        self.standard_deviation_evaluations.append(evaluation[1])

    JSONFile.write(self.serialize(), self.file_path)

    return evaluation[0]
def train_nn(model, n_epochs=20, name_model='a.json', random_seed=1):
    """
    Train ``model`` with SGD and record its accuracy on ``train_test``.

    In every epoch the entries of ``train_dict`` are visited in a random order.
    Before each update the model is evaluated on all the batches of ``train_test``
    and the percentage of correct predictions is appended to the list of that
    epoch. The results, {epoch: [float]}, are written to
    'data/multi_start/neural_networks/training_results/' + name_model.

    :param model: network to train
    :param n_epochs: (int) number of epochs
    :param name_model: (str) name of the file where the results are written
    :param random_seed: (int) seed of numpy, which decides the order of the samples
    """
    # The seed used to be hard-coded to 1, so random_seed was silently ignored.
    np.random.seed(random_seed)

    values = {}
    for epoch in range(1, n_epochs + 1):
        logger.info('epoch is %d' % epoch)
        values[epoch] = []

        # The learning rate decays as 0.1 / sqrt(epoch).
        optimizer = optim.SGD(model.parameters(), lr=(0.1 / np.sqrt(epoch)),
                              momentum=args_opt['momentum'])

        shuffled_order = np.arange(len(train_dict))
        np.random.shuffle(shuffled_order)

        for i in shuffled_order:
            total = 0
            correct = 0
            for data in train_test:
                images, labels = data
                outputs = model(Variable(images))
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum()

            values[epoch].append(100. * correct / float(total))
            # NOTE(review): the logged quantity is the percentage of correct
            # predictions, although the message calls it an error.
            logger.info('Error in epoch %d is:' % epoch)
            logger.info(100. * correct / float(total))

            data, target = train_dict[i]
            data, target = Variable(data), Variable(target)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

    f_name = 'data/multi_start/neural_networks/training_results/'
    f_name += name_model
    JSONFile.write(values, f_name)
def assign_categories(cls, list_papers, year, month):
    """
    Find the category of every paper and write the result to the categories file
    of the given year and month.

    An id of the form 'category/number' carries its own category. For any other
    id the category is requested from cls.get_cats, after dropping whatever
    follows the last 'v' of the id.

    :param list_papers: [str]
    :param year: passed to cls._name_file_categories
    :param month: passed to cls._name_file_categories
    :return: {paper_name (str): category (str)}
    """
    papers = {}

    for paper in list_papers:
        has_category_prefix = '/' in paper

        if has_category_prefix:
            cat, _, arxiv_id = paper.partition('/')
        else:
            arxiv_id = paper

        # Keeps the whole id when there is no 'v' in it.
        arxiv_id = arxiv_id.rsplit('v', 1)[0]

        if not has_category_prefix:
            cat = cls.get_cats(arxiv_id, arxiv_id[0:2], arxiv_id[2:4])

        papers[paper] = cat

    JSONFile.write(papers, cls._name_file_categories(year=year, month=month))

    return papers
def save_data(self, sufix=None):
    """
    Save the state of the policy and of every model in self.dict_stat_models.

    The state is written to
    'data/multi_start/<problem_name>/hutter_greedy_policy/<sufix>[_random_seed_<seed>]
    [_n_restarts_<n>].json'. Each model is then saved with save_model(str(key)).

    :param sufix: (str) base name of the file. Defaults to self.name_model.
    """
    data = {
        'chosen_points': self.chosen_points,
        'evaluations': self.evaluations_obj,
        'parameters': self.parameters,
        'chosen_index': self.chosen_index,
    }

    if sufix is None:
        sufix = self.name_model

    # The path used to end up with a doubled '/' before the file name, and os.mkdir
    # failed when 'data/multi_start' was missing. makedirs creates the whole chain.
    directory = 'data/multi_start/' + self.problem_name + '/hutter_greedy_policy/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    file_name = directory + sufix

    if self.random_seed is not None:
        file_name += '_random_seed_' + str(self.random_seed)

    if self.n_restarts is not None:
        file_name += '_n_restarts_' + str(self.n_restarts)

    JSONFile.write(data, file_name + '.json')

    for i in self.dict_stat_models:
        model = self.dict_stat_models[i]
        model.save_model(str(i))
def save_data(self, sufix=None):
    """
    Save the state of the policy.

    The state is written to
    'data/multi_start/<problem_name>/random_policy/<sufix>[_random_seed_<seed>]
    [_n_restarts_<n>].json'.

    :param sufix: (str) base name of the file. Defaults to self.name_model.
    """
    data = {
        'chosen_points': self.chosen_points,
        'evaluations': self.evaluations_obj,
        'chosen_index': self.chosen_index,
    }

    if sufix is None:
        sufix = self.name_model

    # os.mkdir failed when 'data/multi_start' was missing; makedirs creates the
    # whole chain of directories.
    directory = 'data/multi_start/' + self.problem_name + '/random_policy/'
    if not os.path.exists(directory):
        os.makedirs(directory)

    file_name = directory + sufix

    if self.random_seed is not None:
        file_name += '_random_seed_' + str(self.random_seed)

    if self.n_restarts is not None:
        file_name += '_n_restarts_' + str(self.n_restarts)

    JSONFile.write(data, file_name + '.json')
def top_users_papers_selecting_categories(cls, year, month, top_categories=10,
                                          different_papers=20):
    """
    Keep only the papers and users that belong to the top categories.

    A category is kept if it is among the last top_categories entries of both
    rankings: the sorted per-category counts returned by
    cls.assign_categories_to_users, and the sorted number of papers per category.
    A user is kept only if more than different_papers of its papers are in those
    categories. The result is also written to the file given by
    cls._name_file_final_categ.

    :param year:
    :param month:
    :param top_categories: (int)
    :param different_papers: (int)
    :return: [
        {'paper': (int) number of times seen},
        {'user': {'stats': ((int) # entries, (int) # different papers in the top_n papers),
                  'diff_papers': [str]}}
    ]
    """
    categories = JSONFile.read(cls._name_file_categories(year=year, month=month))
    papers_cat = pd.DataFrame.from_records([categories]).transpose()

    users_cg, user_cat = cls.assign_categories_to_users(year, month)
    user_cat = pd.DataFrame.from_records([user_cat]).transpose()

    top_by_users = user_cat[0].sort_values().index.values[-top_categories:]
    top_by_papers = \
        papers_cat[0].value_counts().sort_values().index.values[-top_categories:]
    top_cat = set(top_by_users).intersection(set(top_by_papers))

    full_data = JSONFile.read(cls._name_file_final(year=year, month=month))
    all_papers = full_data[0]
    all_users = full_data[1]

    papers_new = {}
    for paper in all_papers:
        if categories[paper] in top_cat:
            papers_new[paper] = all_papers[paper]

    users_new = {}
    for user in all_users:
        user_data = all_users[user]
        kept_papers = [
            paper for paper in user_data['diff_papers'] if categories[paper] in top_cat
        ]
        if len(kept_papers) > different_papers:
            # The entry of full_data is reused, so its list of papers is replaced.
            user_data['diff_papers'] = kept_papers
            users_new[user] = user_data

    result = [papers_new, users_new]
    JSONFile.write(result, cls._name_file_final_categ(year=year, month=month))

    logger.info('Number of papers is %d' % len(papers_new))
    logger.info('Number of users is %d' % len(users_new))

    return [papers_new, users_new]
def accuracy(self, gp_model, start=3, iterations=21, sufix=None, model=None):
    """
    Compute the posterior parameters of gp_model along the iterations and write
    them, with the observed values, to
    'data/multi_start/accuracy_results/<problem_name>/<sufix>.json'.

    :param gp_model: model passed to self.compute_posterior_params_marginalize and
        self.add_observations. Its raw_results attribute is read.
    :param start: (int) first iteration
    :param iterations: (int) the loop runs over range(start, iterations)
    :param sufix: (str) name of the file. Defaults to self.specifications.
    :param model: not used by this function
    :return: ({int: mean}, {int: ci}, {int: observed value}), indexed by iteration
    """
    #TODO: UPDATE THIS FUNCTION. NOW IT'S WRONG!!
    means = {}
    cis = {}
    values_observed = {}

    mean, std, ci = self.compute_posterior_params_marginalize(gp_model)
    means[start] = mean
    cis[start] = ci
    values_observed[start] = gp_model.raw_results['values'][-1]

    for i in range(start, iterations):
        print(i)
        # NOTE(review): raw_results is indexed with the key 'values' above, so
        # len() here presumably counts its keys and not the observations - confirm.
        if len(gp_model.raw_results) < i + 1:
            data_new = self.get_value_next_iteration(i + 1, **self.kwargs)
            self.add_observations(gp_model, i + 1, data_new['value'], data_new['point'],
                                  data_new['gradient'])
        mean, std, ci = self.compute_posterior_params_marginalize(gp_model)
        means[i + 1] = mean
        cis[i + 1] = ci
        # NOTE(review): data_new is only assigned inside the branch above. If that
        # branch is skipped, this line fails or reuses the previous value.
        values_observed[i + 1] = data_new['value']
        print mean, ci
        value_tmp = self.get_value_next_iteration(i + 1, **self.kwargs)
        print value_tmp

    accuracy_results = {}
    accuracy_results['means'] = means
    accuracy_results['ci'] = cis
    accuracy_results['values_observed'] = values_observed

    # This first value of file_name is overwritten below before it is used.
    file_name = 'data/multi_start/accuracy_results/stat_model'

    # The directories are created one level at a time; 'data' must already exist.
    if not os.path.exists('data/multi_start'):
        os.mkdir('data/multi_start')

    if not os.path.exists('data/multi_start/accuracy_results'):
        os.mkdir('data/multi_start/accuracy_results/')

    if not os.path.exists('data/multi_start/accuracy_results/' + self.problem_name):
        os.mkdir('data/multi_start/accuracy_results/' + self.problem_name)

    if sufix is None:
        sufix = self.specifications

    file_name = 'data/multi_start/accuracy_results/' + self.problem_name + '/' + sufix

    JSONFile.write(accuracy_results, file_name + '.json')

    return means, cis, values_observed
def get_training_data(cls, year, month, random_seed=1):
    """
    Creates a file with the training data: [[user_id, paper_id, rating]], where
    rating is 1 if the paper wasn't seen by the user, or 2 otherwise.

    Users and papers are numbered from 1, following the order of the keys of the
    data read from the file cls._name_file_final_categ. For each user, a random
    number of unseen papers is added with rating 1. That number is drawn between
    0.5 and 1.8 times the number of different papers seen by the user, and it is
    never larger than the number of unseen papers.

    :param year: str
    :param month: str (e.g. '1', '12')
    :param random_seed: int, seeds both `random` and `np.random`
    """
    # The papers are shuffled with `random`, but their number is drawn with
    # `np.random`. Only the first generator used to be seeded, so the output was
    # not reproducible.
    random.seed(random_seed)
    np.random.seed(random_seed)

    file_name = cls._name_file_final_categ(year=year, month=month)
    data = JSONFile.read(file_name)

    papers = list(data[0].keys())
    users_data = data[1]
    users = users_data.keys()

    key_paper = {}
    for i, paper in enumerate(papers):
        key_paper[paper] = i + 1

    all_papers = set(papers)
    training_data = []

    for i, user in enumerate(users):
        seen = users_data[user]['diff_papers']
        for paper in seen:
            training_data.append([i + 1, key_paper[paper], 2])

        # Shuffling the list directly also works where range() is not a list.
        other_papers = list(all_papers - set(seen))
        random.shuffle(other_papers)

        seen_papers = len(set(seen))
        lower = int(0.5 * seen_papers)
        upper = min(int(1.8 * seen_papers), len(other_papers))

        # np.random.randint raises ValueError when lower >= upper, e.g. when the
        # user has already seen almost every paper. Use every available paper then.
        if upper > lower:
            n_dislike = np.random.randint(lower, upper, 1)[0]
        else:
            n_dislike = upper

        for paper in other_papers[0:n_dislike]:
            training_data.append([i + 1, key_paper[paper], 1])

    file_name = cls._name_training_data(year=year, month=month)

    logger.info('There are %d training points' % len(training_data))

    JSONFile.write(training_data, file_name)
def accuracy(self, gp_model, start=3, iterations=21, sufix=None, model=None): means = {} cis = {} mean, std, ci = self.compute_posterior_params_marginalize(gp_model) means[start] = mean cis[start] = ci for i in range(start, iterations): print(i) if len(gp_model.raw_results) < i + 1: data_new = self.get_value_next_iteration(i + 1, **self.kwargs) self.add_observations(gp_model, i + 1, data_new['value'], data_new['point'], data_new['gradient']) mean, std, ci = self.compute_posterior_params_marginalize(gp_model) means[i + 1] = mean cis[i + 1] = ci print mean, ci value_tmp = self.get_value_next_iteration(i + 1, **self.kwargs) print value_tmp accuracy_results = {} accuracy_results['means'] = means accuracy_results['ci'] = cis file_name = 'data/multi_start/accuracy_results/stat_model' if not os.path.exists('data/multi_start'): os.mkdir('data/multi_start') if not os.path.exists('data/multi_start/accuracy_results'): os.mkdir('data/multi_start/accuracy_results') if self.problem_name is not None: file_name += '_' + self.problem_name if sufix is not None: file_name += '_' + sufix JSONFile.write(accuracy_results, file_name + '.json') return means, cis
def get_points_domain(cls, n_training, bounds_domain, random_seed, training_name,
                      problem_name, type_bounds=None, simplex_domain=None):
    """
    Get random points in the domain.

    The points are read from the file PROBLEM_DIR/<problem_name>/data/<file name>
    when it can be read. Otherwise they are generated with
    DomainService.get_points_domain, printed and written to that file.

    :param n_training: (int) Number of points
    :param bounds_domain: [([float, float] or [float])], the first case is when the bounds
        are lower or upper bound of the respective entry; in the second case, it's list of
        finite points representing the domain of that entry.
    :param random_seed: (int)
    :param training_name: (str), prefix used to save the training data.
    :param problem_name: str
    :param type_bounds: [0 or 1], 0 if the bounds are lower or upper bound of the
        respective entry, 1 if the bounds are all the finite options for that entry.
    :param simplex_domain: passed to DomainService.get_points_domain
    :return: [[float]]
    """
    training_path = path.join(
        PROBLEM_DIR,
        problem_name,
        'data',
        cls._filename_domain(
            problem_name=problem_name,
            training_name=training_name,
            n_points=n_training,
            random_seed=random_seed,
        ),
    )

    stored_points = JSONFile.read(training_path)
    if stored_points is not None:
        return stored_points

    new_points = DomainService.get_points_domain(
        n_training, bounds_domain, type_bounds=type_bounds, random_seed=random_seed,
        simplex_domain=simplex_domain)
    print(new_points)

    JSONFile.write(new_points, training_path)

    return new_points
def save_model(self, sufix=None):
    """
    Write the state of the statistical model to
    'data/multi_start/stat_model[_<problem_name>][_<sufix>].json'.

    :param sufix: (str) appended to the name of the file
    """
    gp_model = self.gp_model

    stat_model_dict = {
        'current_point': self.current_point,
        'starting_point': self.starting_point,
        'current_batch_index': self.current_batch_index,
        'best_result': gp_model.best_result,
        'current_iteration': gp_model.current_iteration,
        'raw_results': gp_model.raw_results,
    }

    name_parts = ['data/multi_start/stat_model']
    if self.problem_name is not None:
        name_parts.append(self.problem_name)
    if sufix is not None:
        name_parts.append(sufix)

    JSONFile.write(stat_model_dict, '_'.join(name_parts) + '.json')
def load_discretization(cls, problem_name, bounds_domain_x, number_points_each_dimension_x):
    """
    Try to load discretization for problem_name from file. If the file doesn't exist, will
    generate the discretization and store it.

    The file lives in PROBLEM_DIR/<problem_name>/DOMAIN_DIR, which is created when
    it doesn't exist, together with any missing parent directory.

    :param problem_name: (str)
    :param bounds_domain_x: ([BoundsEntity])
    :param number_points_each_dimension_x: ([int])

    :return: [[float]]
    """
    bounds_str = BoundsEntity.get_bounds_as_lists(bounds_domain_x)

    filename = cls._disc_x_filename(
        name=problem_name,
        bounds=bounds_str,
        number_points_each_dimension=number_points_each_dimension_x)

    # os.mkdir failed when PROBLEM_DIR itself was missing; makedirs creates
    # every missing level.
    domain_dir = path.join(PROBLEM_DIR, problem_name, DOMAIN_DIR)
    if not os.path.exists(domain_dir):
        os.makedirs(domain_dir)

    domain_path = path.join(domain_dir, filename)

    discretization_data = JSONFile.read(domain_path)
    if discretization_data is not None:
        return discretization_data

    logger.info('Generating discretization of domain_x')
    discretization_data = DomainEntity.discretize_domain(
        bounds_domain_x, number_points_each_dimension_x)
    logger.info('Generated discretization of domain_x')

    JSONFile.write(discretization_data, domain_path)

    return discretization_data
def write_debug_data(self, problem_name, model_type, training_name, n_training,
                     random_seed, n_samples_parameters, **kwargs):
    """
    Write the results of the optimization (self.optimization_results) to a file in
    DEBUGGING_DIR/<problem_name>.

    :param problem_name: (str)
    :param model_type: (str)
    :param training_name: (str)
    :param n_training: (int)
    :param random_seed: (int)
    :param n_samples_parameters: int
    """
    debug_dir = path.join(DEBUGGING_DIR, problem_name)

    # The parent has to be created before the directory of the problem.
    for directory in (DEBUGGING_DIR, debug_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    # The names of the kernels are joined by underscores.
    kernel_name = '_'.join(self.gp.type_kernel)

    debug_path = path.join(
        debug_dir,
        self._filename(
            model_type=model_type,
            problem_name=problem_name,
            type_kernel=kernel_name,
            training_name=training_name,
            n_training=n_training,
            random_seed=random_seed,
            n_samples_parameters=n_samples_parameters))

    JSONFile.write(self.optimization_results, debug_path)
def accuracy(self, gp_model, start=3, iterations=21, sufix=None):
    """
    Compute the posterior parameters of gp_model along the iterations and write
    them to 'data/multi_start/accuracy_results/parametric_model[_<problem_name>]
    [_<sufix>].json'.

    :param gp_model: model passed to self.compute_posterior_params_marginalize and
        self.add_observations
    :param start: (int) first iteration
    :param iterations: (int) the loop runs over range(start, iterations)
    :param sufix: (str) appended to the name of the file
    :return: ([mean], [ci]); the first entry is computed before the loop, and one
        more entry is appended for every iteration of the loop
    """
    means = []
    cis = []

    mean, std, ci = self.compute_posterior_params_marginalize(gp_model)
    means.append(mean)
    cis.append(ci)

    for i in range(start, iterations):
        print(i)
        # A new observation is only added when the model doesn't have it yet.
        if len(gp_model.raw_results) < i + 1:
            self.add_observations(gp_model, i + 1, self.get_value_next_iteration(i + 1))
        mean, std, ci = self.compute_posterior_params_marginalize(gp_model)
        means.append(mean)
        cis.append(ci)

    accuracy_results = {'means': means, 'ci': cis}

    # os.mkdir('data/multi_start') failed when 'data' was missing; makedirs
    # creates every missing level.
    results_dir = 'data/multi_start/accuracy_results'
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    file_name = results_dir + '/parametric_model'

    if self.problem_name is not None:
        file_name += '_' + self.problem_name

    if sufix is not None:
        file_name += '_' + sufix

    JSONFile.write(accuracy_results, file_name + '.json')

    return means, cis
def assign_categories_to_users(cls, year, month):
    """
    Assign to every user the categories of its papers, and count for each category
    the users for which it accounts for at least 10% of their papers.

    Both results are written to files: the first one to
    cls._name_file_categories_users and the second one to
    cls._name_file_categories_users_hist.

    :param year:
    :param month:
    :return: ({user: [category (str)]}, {category (str): (int) number of users})
    """
    users = JSONFile.read(cls._name_file_final(year=year, month=month))[1]
    paper_cat = JSONFile.read(cls._name_file_categories(year=year, month=month))

    users_cg = {}
    for user in users:
        users_cg[user] = [paper_cat[paper] for paper in users[user]['diff_papers']]

    JSONFile.write(users_cg, cls._name_file_categories_users(year=year, month=month))

    user_cat = {}
    for user in users_cg:
        categories_user = users_cg[user]

        # Number of papers of the user in each category.
        counts = {}
        for cat in categories_user:
            counts[cat] = counts.get(cat, 0) + 1

        min_papers = 0.10 * len(categories_user)
        for cat in counts:
            if counts[cat] >= min_papers:
                user_cat[cat] = user_cat.get(cat, 0) + 1

    JSONFile.write(user_cat, cls._name_file_categories_users_hist(year=year, month=month))

    return users_cg, user_cat
from __future__ import absolute_import from problems.cnn_cifar10.cnn import train_nn import argparse from stratified_bayesian_optimization.util.json_file import JSONFile if __name__ == '__main__': # Example usage: # python -m problems.cnn_cifar10.scripts.run_cnn 1 1 parser = argparse.ArgumentParser() parser.add_argument('random_seed', help='e.g. 2') parser.add_argument('n_epochs', help='e.g. 2') args = parser.parse_args() rs = int(args.random_seed) n_epochs = int(args.n_epochs) errors = train_nn(rs, n_epochs) directory = 'problems/cnn_cifar10/runs_random_seeds/' + 'rs_%d' % rs + '.json' JSONFile.write(errors, directory)
def test_write(self):
    """
    JSONFile.write runs on an empty list while the builtin open is replaced by a mock.
    """
    fake_open = mock_open()
    with patch('__builtin__.open', fake_open):
        JSONFile.write([], self.filename)
def get_gp(cls, name_model, problem_name, type_kernel, dimensions, bounds_domain,
           type_bounds=None, n_training=0, noise=False, training_data=None, points=None,
           training_name=None, mle=True, thinning=0, n_burning=0, max_steps_out=1,
           n_samples=None, random_seed=DEFAULT_RANDOM_SEED, kernel_values=None,
           mean_value=None, var_noise_value=None, cache=True, same_correlation=False,
           use_only_training_points=True, optimization_method=None, n_samples_parameters=0,
           parallel_training=True, simplex_domain=None, objective_function=None,
           define_samplers=True):
    """
    Fetch a GP model from file if it exists, otherwise train a new model and save it
    locally.

    :param name_model: str
    :param problem_name: str
    :param type_kernel: [(str)] Must be in possible_kernels. If it's a product of kernels it
        should be a list as: [PRODUCT_KERNELS_SEPARABLE, NAME_1_KERNEL, NAME_2_KERNEL]
    :param dimensions: [int]. It has only the n_tasks for the task_kernels, and for the
        PRODUCT_KERNELS_SEPARABLE contains the dimensions of every kernel in the product
    :param bounds_domain: [([float, float] or [float])], the first case is when the bounds
        are lower or upper bound of the respective entry; in the second case, it's list of
        finite points representing the domain of that entry.
    :param type_bounds: [0 or 1], 0 if the bounds are lower or upper bound of the
        respective entry, 1 if the bounds are all the finite options for that entry.
    :param n_training: int
    :param noise: (boolean) If true, we get noisy evaluations.
    :param training_data: {'points': [[float]], 'evaluations': [float],
        'var_noise': [float] or None}
    :param points: [[float]]. If training_data is None, we can evaluate the objective
        function in these points.
    :param training_name: (str), prefix used to save the training data.
    :param mle: (boolean) If true, fits the GP by MLE.
    :param thinning: (int)
    :param n_burning: (int) Number of burnings samples for the MCMC.
    :param max_steps_out: (int) Maximum number of steps out for the stepping out or
        doubling procedure in slice sampling.
    :param n_samples: (int) If the objective is noisy, we take n_samples of the function to
        estimate its value.
    :param random_seed: (int)
    :param kernel_values: [float], contains the default values of the parameters of the
        kernel
    :param mean_value: [float], It contains the value of the mean parameter.
    :param var_noise_value: [float], It contains the variance of the noise of the model
    :param cache: (boolean) Try to get model from cache
    :param same_correlation: (boolean) If true, it uses the same correlations for the task
        kernel.
    :param use_only_training_points (boolean) If the model is read, and the param is true,
        it uses only the training points in data. Otherwise, it also includes new points
        previously computed.
    :param optimization_method: (str)
    :param n_samples_parameters: (int)
    :param parallel_training: (boolean)
    :param simplex_domain: passed to the training data service and to the training of the
        model
    :param objective_function: passed to the training data service
    :param define_samplers: (boolean) If False, samplers for the hyperparameters are not
        defined.

    :return: (GPFittingGaussian) - An instance of GPFittingGaussian
    """
    model_type = cls._model_map[name_model]

    if training_name is None:
        training_name = 'default_training_data_%d_points_rs_%d' % (n_training, random_seed)

    if use_only_training_points:
        f_name = cls._get_filename(model_type, problem_name, type_kernel, training_name)
        f_name_cache = cls._get_filename_modified(model_type, problem_name, type_kernel,
                                                  training_name, optimization_method,
                                                  n_samples_parameters)
    else:
        f_name = cls._get_filename_modified(model_type, problem_name, type_kernel,
                                            training_name, optimization_method,
                                            n_samples_parameters)
        # f_name_cache used to be left unassigned in this branch, so building
        # gp_path_cache below raised an error. The modified file is already the
        # one used here, so it is also the cache file.
        f_name_cache = f_name

    gp_dir = path.join(GP_DIR, problem_name)

    # Every level is created in order, starting from 'data'.
    for directory in ('data', GP_DIR, gp_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    gp_path = path.join(gp_dir, f_name)
    gp_path_cache = path.join(gp_dir, f_name_cache)

    # NOTE(review): the data read from the file is discarded right away, so a stored
    # model is never returned and the model is always trained again. This is kept as
    # it was because it looks deliberate - confirm before enabling the cache.
    if cache:
        data = JSONFile.read(gp_path)
        data = None
    else:
        data = None

    if data is not None:
        return model_type.deserialize(
            data, use_only_training_points=use_only_training_points)

    if training_data is None or training_data == {}:
        training_data = TrainingDataService.get_training_data(
            problem_name, training_name, bounds_domain, n_training=n_training, points=points,
            noise=noise, n_samples=n_samples, random_seed=random_seed,
            type_bounds=type_bounds, cache=cache, parallel=parallel_training,
            gp_path_cache=gp_path_cache, simplex_domain=simplex_domain,
            objective_function=objective_function)

    logger.info("Training %s" % model_type.__name__)

    gp_model = model_type.train(
        type_kernel, dimensions, mle, training_data, bounds_domain, thinning=thinning,
        n_burning=n_burning, max_steps_out=max_steps_out, random_seed=random_seed,
        type_bounds=type_bounds, training_name=training_name, problem_name=problem_name,
        kernel_values=kernel_values, mean_value=mean_value,
        var_noise_value=var_noise_value, same_correlation=same_correlation,
        simplex_domain=simplex_domain, define_samplers=define_samplers)

    JSONFile.write(gp_model.serialize(), gp_path)

    return gp_model
from stratified_bayesian_optimization.kernels.matern52 import Matern52
from stratified_bayesian_optimization.lib.sample_functions import SampleFunctions

# Samples one function from a GP with a Matern52 kernel on a grid of n_points points
# in [0, 100]. Two tasks are built by shifting the sample by add[task]. The result
# is written to problems/test_simulated_gp/simulated_function_with_<n_points>_<seed>.

decimals = 10
random_seed = 5
np.random.seed(random_seed)

n_points = 1000

points = np.linspace(0, 100, n_points)
points = np.round(points, decimals=decimals)
points = points.reshape([n_points, 1])

tasks = np.array([[0, 1]])

# Shift applied to the sampled function for each task.
add = [10, -10]

kernel = Matern52.define_kernel_from_array(1, np.array([100.0, 1.0]))

function = SampleFunctions.sample_from_gp(points, kernel)
function = function[0, :]

# Every iteration used to concatenate the point with the task into an array that was
# never used. Only the shifted values are needed.
final_function = {}
for task in range(2):
    final_function[task] = [function[i] + add[task] for i in xrange(n_points)]

filename = path.join('problems', 'test_simulated_gp',
                     'simulated_function_with_%d_%d' % (n_points, random_seed))

JSONFile.write({'function': final_function, 'points': points}, filename)
def generate_evaluations(self, problem_name, model_type, training_name, n_training,
                         random_seed, iteration, n_points_by_dimension=None, n_tasks=0):
    """
    Generates evaluations of SBO, and write them in the debug directory.

    The acquisition function is evaluated in parallel on a grid of points. The grid
    is read from a file in DEBUGGING_DIR/<problem_name> when it can be read;
    otherwise it is built from the bounds of self.gp and written to that file. The
    points and their evaluations are written to a second file in the same directory.

    :param problem_name: (str)
    :param model_type: (str)
    :param training_name: (str)
    :param n_training: (int)
    :param random_seed: (int)
    :param iteration: (int)
    :param n_points_by_dimension: [int] Number of points by dimension
    :param n_tasks: (int) n_tasks > 0 if the last element of the domain is a task
    :return: np.array with one evaluation for every row of the grid
    """
    if not os.path.exists(DEBUGGING_DIR):
        os.mkdir(DEBUGGING_DIR)

    debug_dir = path.join(DEBUGGING_DIR, problem_name)

    if not os.path.exists(debug_dir):
        os.mkdir(debug_dir)

    # The names of the kernels are joined by '_'; the slice drops the last '_'.
    kernel_name = ''
    for kernel in self.gp.type_kernel:
        kernel_name += kernel + '_'
    kernel_name = kernel_name[0:-1]

    f_name = self._filename_points_ei_evaluations(
        model_type=model_type, problem_name=problem_name, type_kernel=kernel_name,
        training_name=training_name, n_training=n_training, random_seed=random_seed)

    debug_path = path.join(debug_dir, f_name)

    vectors = JSONFile.read(debug_path)

    # The grid is only built (and stored) when it couldn't be read from the file.
    if vectors is None:
        bounds = self.gp.bounds
        n_points = n_points_by_dimension
        if n_points is None:
            # NOTE(review): this default looks like a single number, while the code
            # below iterates over n_points - confirm that the default works.
            n_points = (bounds[0][1] - bounds[0][0]) * 10

        if n_tasks > 0:
            # The last bound is left out of the grid when there are tasks.
            bounds_x = [bounds[i] for i in xrange(len(bounds) - 1)]
            n_points_x = [n_points[i] for i in xrange(len(n_points))]
        else:
            n_points_x = n_points
            bounds_x = bounds

        points = []
        for bound, number_points in zip(bounds_x, n_points_x):
            points.append(np.linspace(bound[0], bound[1], number_points))

        # Cartesian product of the points of every dimension.
        vectors = []
        for point in itertools.product(*points):
            vectors.append(point)

        JSONFile.write(vectors, debug_path)

    n = len(vectors)
    points_ = deepcopy(vectors)

    vectors = np.array(vectors)

    if n_tasks > 0:
        # The grid is repeated once per task, with the index of the task appended as
        # the last column.
        vectors_ = None
        for i in xrange(n_tasks):
            task_vector = np.zeros(n) + i
            task_vector = task_vector.reshape((n, 1))
            # NOTE(review): points_ is rebound here, so what is written at the end
            # is the grid of the last task only - confirm that this is intended.
            points_ = np.concatenate((vectors, task_vector), axis=1)
            if vectors_ is not None:
                vectors_ = np.concatenate((vectors_, points_), axis=0)
            else:
                vectors_ = points_
        vectors = vectors_

    # TODO: extend to the case where w can be continuous

    n = vectors.shape[0]

    # One entry per row of the grid, indexed by the number of the row.
    points = {}
    for i in xrange(n):
        points[i] = vectors[i, :]

    # Extra arguments passed after the points to the parallel runner.
    args = (False, None, False, 0, self, )
    val = Parallel.run_function_different_arguments_parallel(
        wrapper_objective_acquisition_function, points, *args)

    values = np.zeros(n)
    for i in xrange(n):
        values[i] = val.get(i)

    f_name = self._filename_ei_evaluations(
        iteration=iteration, model_type=model_type, problem_name=problem_name,
        type_kernel=kernel_name, training_name=training_name, n_training=n_training,
        random_seed=random_seed)

    debug_path = path.join(debug_dir, f_name)

    JSONFile.write({'points': points_, 'evaluations': values}, debug_path)

    return values
rs_2 = len(spec_2['random_seeds']) for key in keys: values_1 = None values_2 = None if key in spec_1: values_1 = spec_1[key] if key in spec_2: values_2 = spec_2[key] if values_1 is None: values_1 = rs_1 * [None] if values_2 is None: values_2 = rs_2 * [None] new_spec[key] = [] for i in xrange(max(len(values_1), len(values_2))): if i < len(values_1): value_1 = values_1[i] new_spec[key] += [value_1] if i < len(values_2): value_2 = values_2[i] new_spec[key] += [value_2] # for key in spec_1: # value_1 = spec_1[key] # value_2 = spec_2[key] # new_spec[key] = value_1 + value_2 JSONFile.write(new_spec, path.join(MULTIPLESPECS_DIR, output))
def get_click_data(cls, filenames, store_filename):
    """
    Get click data from filenames.

    Only the entries that have both 'arxiv_id' and 'cookie_hash' are used. The ids
    of the papers are used as they come in the files. Writes a JSON file to
    store_filename with the format:
    [
        {cookie_hash: {arxiv_id: (int) number of entries of that user and paper}},
        {arxiv_id: {'views': (int) number of entries of that paper}}
    ]

    The file is written after every processed file and once more at the end. The
    list of processed files is written to problems/arxiv/data/store_files.json.

    :param filenames: [str] gzip files with a JSON document that has the key 'entries'
    :param store_filename: str
    """
    paper = {}
    process_data = {}
    process_files = []
    store_files = "problems/arxiv/data/store_files.json"

    for filename in filenames:
        logger.info("Processing filename: %s" % filename)

        # The file used to be left open.
        f = gzip.open(filename, 'rb')
        try:
            data = json.load(f)
        finally:
            f.close()

        for entry in data['entries']:
            if 'arxiv_id' not in entry or 'cookie_hash' not in entry:
                continue

            arxiv_id = entry['arxiv_id']
            # This assignment was commented out, so the code below failed with a
            # NameError in the first valid entry.
            user = entry['cookie_hash']

            if arxiv_id not in paper:
                paper[arxiv_id] = {'views': 0}
            paper[arxiv_id]['views'] += 1

            user_papers = process_data.setdefault(user, {})
            user_papers[arxiv_id] = user_papers.get(arxiv_id, 0) + 1

        # NOTE(review): presumably these characters identify the file within its
        # path; the slice is kept as it was - confirm against the callers.
        process_files.append(filename[22:28])
        JSONFile.write(process_files, store_files)
        JSONFile.write([process_data, paper], store_filename)

    JSONFile.write([process_data, paper], store_filename)
def SGD(start, gradient, n, function, exact_gradient=None, args=(), kwargs={}, bounds=None,
        learning_rate=0.1, momentum=0.0, maxepoch=250, adam=True, betas=None, eps=1e-8,
        simplex_domain=None, name_model='1', method='real_gradient', n_epochs=1,
        n_samples=100, gradient_samples=None, problem=None, exact_objective=None):
    """
    SGD to minimize sum(i=0 -> n) (1/n) * f(x). Batch sizes are of size 1.
    ADAM: https://arxiv.org/pdf/1412.6980.pdf

    The points, values and gradients of every iteration are written to
    'data/multi_start/<problem>/training_results/<name_model>'.

    :param start: np.array(n)
    :param gradient: function called as gradient(point, *args, **kwargs)
    :param n: (int) number of calls to gradient that are averaged in every iteration
    :param function: function evaluated at every point of the path
    :param exact_gradient: function evaluated at every point if method is 'real_gradient'
    :param args: () arguments for the gradient
    :param kwargs: keyword arguments for the gradient
    :param bounds: [(min, max)] for each point
    :param learning_rate: (float) it's divided by the number of the iteration
    :param momentum: (float) only used if adam is False
    :param maxepoch: (int) number of iterations
    :param adam: (boolean) use the ADAM update instead of the momentum update
    :param betas: (float, float) of ADAM. Default: (0.9, 0.999)
    :param eps: (float) added to the denominator of the ADAM update
    :param simplex_domain: (float) upper bound of the sum of the entries of the point
    :param name_model: (str) name of the file with the results
    :param method: (str) 'real_gradient' or 'grad_epoch'
    :param n_epochs: (int) if method is 'grad_epoch', gradient_samples is called every
        n_epochs iterations
    :param n_samples: (int) passed to gradient_samples
    :param gradient_samples: function called as gradient_samples(point, n_samples)
    :param problem: (str) name of the directory of the results
    :param exact_objective: function evaluated at every point of the path
    :return: {'points', 'values', 'gradients', 'n_epochs', 'stochastic_gradients',
        'exact_values'}
    """
    # NOTE(review): kwargs={} is a mutable default shared by all the calls; this
    # function only reads it.

    values = []
    points = []
    gradients = []
    exact_values = []
    stochastic_gradients = []

    # With 'grad_epoch' the gradients are stored by iteration instead of in a list.
    if method == 'grad_epoch':
        gradients = {}
        gradient_batch = []

    # points.append(np.array(start))
    # values.append(function(start))

    logger.info('start_value')
    logger.info(function(start))

    # if exact_gradient is not None and method == 'real_gradient':
    #     gradients.append(exact_gradient(start))

    # The point is projected to the domain only if there is a domain.
    project = False
    if bounds is not None or simplex_domain is not None:
        project = True

    if betas is None:
        betas = (0.9, 0.999)

    m0 = np.zeros(len(start))
    v0 = np.zeros(len(start))

    point = start
    v = np.zeros(len(start))
    times_out_boundary = 0
    t_ = 0
    lr = learning_rate

    for iteration in xrange(maxepoch):
        learning_rate = lr / float(iteration + 1)
        previous = point.copy()
        t_ += 1

        grad = []
        for j in xrange(n):
            gradient_ = gradient(point, *args, **kwargs)

            # The point is perturbed until the gradient is not np.nan.
            # NOTE(review): this is an identity test, so it only detects the object
            # np.nan itself and not an array with nan entries - confirm.
            while gradient_ is np.nan:
                norm_point = np.sqrt(np.sum(point**2))
                perturbation = norm_point * 1e-6

                if project:
                    # The perturbation is limited by the distance to the bounds.
                    parameters_uniform = []
                    for i in range(len(bounds)):
                        bound = bounds[i]
                        dist = point[i] - bound[0]
                        lb = min(perturbation, dist)
                        dist = bound[1] - point[i]
                        ub = min(perturbation, dist)
                        parameters_uniform.append([-lb, ub])
                else:
                    parameters_uniform = len(point) * [[
                        -perturbation, perturbation
                    ]]

                perturbation = []
                for i in range(len(point)):
                    lb = parameters_uniform[i][0]
                    ub = parameters_uniform[i][1]
                    perturbation.append(np.random.uniform(lb, ub))
                perturbation = np.array(perturbation)
                point = point + perturbation
                gradient_ = gradient(point, *args, **kwargs)
            grad.append(gradient_)

        # Average of the n gradients of this iteration.
        gradient_ = np.mean(np.array(grad), axis=0)
        stochastic_gradients.append(gradient_)

        if exact_gradient is not None and method == 'real_gradient':
            gradients.append(exact_gradient(point))

        points.append(np.array(point))
        values.append(function(point))

        if exact_objective is not None:
            exact_values.append(exact_objective(point))

        if not adam:
            v = momentum * v + gradient_
            old_p = point.copy()
            # In-place update: it also modifies the array given as start if point
            # still refers to it.
            point -= learning_rate * v
        else:
            # ADAM update with bias-corrected estimates of the moments.
            m0 = betas[0] * m0 + (1 - betas[0]) * gradient_
            v0 = betas[1] * v0 + (1 - betas[1]) * (gradient_**2)
            m_1 = m0 / (1 - (betas[0])**(t_))
            v_1 = v0 / (1 - (betas[1])**(t_))
            point = point - learning_rate * m_1 / (np.sqrt(v_1) + eps)

        in_domain = True
        if project:
            # NOTE(review): bounds is iterated whenever project is True, which
            # includes the case where only simplex_domain was given - confirm.
            for dim, bound in enumerate(bounds):
                if bound[0] is not None and point[dim] < bound[0]:
                    in_domain = False
                    break
                if bound[1] is not None and point[dim] > bound[1]:
                    in_domain = False
                    break
                if simplex_domain is not None:
                    if np.sum(point) > simplex_domain:
                        in_domain = False
                        break
                    #TODO:Only for citibike, generalize later
                    if simplex_domain - np.sum(point) > 3717.0:
                        in_domain = False
                        break

        if project and not in_domain:
            # Every entry is clipped to its bounds.
            for dim, bound in enumerate(bounds):
                if bound[0] is not None:
                    point[dim] = max(bound[0], point[dim])
                if bound[1] is not None:
                    point[dim] = min(bound[1], point[dim])

            # The point is rescaled so that its sum satisfies the simplex limits.
            if simplex_domain is not None:
                if np.sum(point) > simplex_domain:
                    point = simplex_domain * (point / np.sum(point))
                if simplex_domain - np.sum(point) > 3717.0:
                    point = (simplex_domain - 3717.0) * (point / np.sum(point))

            # The velocity is recomputed from the step that was actually taken.
            if not adam:
                for dim, bound in enumerate(bounds):
                    v[dim] = (point[dim] - old_p[dim]) / learning_rate

        # points.append(np.array(point))
        # values.append(function(point))
        # gradients.append(np.array(gradient_))

    # The last point of the path is also recorded.
    if exact_gradient is not None and method == 'real_gradient':
        gradients.append(exact_gradient(point))

    points.append(np.array(point))
    values.append(function(point))

    if exact_objective is not None:
        exact_values.append(exact_objective(point))

    gradient_ = np.array(gradient(point, *args, **kwargs))
    stochastic_gradients.append(gradient_)

    if method == 'grad_epoch':
        for iteration in range(maxepoch):
            if iteration % n_epochs == (n_epochs - 1):
                gradients[iteration] = gradient_samples(
                    points[iteration], n_samples)

    results = {
        'points': points,
        'values': values,
        'gradients': gradients,
        'n_epochs': n_epochs,
        'stochastic_gradients': stochastic_gradients,
        'exact_values': exact_values
    }

    # This first value of f_name is overwritten below before it is used.
    f_name = 'data/multi_start/analytic_example/training_results/'

    # The directories are created one level at a time; 'data' must already exist.
    if not os.path.exists('data/multi_start'):
        os.mkdir('data/multi_start')

    # problem is concatenated to a string, so it can't be None here.
    if not os.path.exists('data/multi_start/' + problem):
        os.mkdir('data/multi_start/' + problem)

    f_name = 'data/multi_start/' + problem + '/'

    if not os.path.exists(f_name + 'training_results'):
        os.mkdir(f_name + 'training_results')

    f_name += 'training_results' + '/' + name_model

    print "optimal_value!!!"
    # exact_values is empty if exact_objective is None, so this line needs it.
    print exact_values[-1]

    JSONFile.write(results, f_name)

    return results
def get_training_data(cls, problem_name, training_name, bounds_domain, n_training=5,
                      points=None, noise=False, n_samples=None,
                      random_seed=DEFAULT_RANDOM_SEED, parallel=True, type_bounds=None,
                      cache=True, gp_path_cache=None, simplex_domain=None,
                      objective_function=None):
    """
    Return the training data of a problem, reading it from disk when a cached copy is
    available and evaluating the objective at the training points otherwise.

    :param problem_name: str
    :param training_name: (str), prefix used to save the training data.
    :param bounds_domain: [([float, float] or [float])], the first case is when the
        bounds are lower or upper bound of the respective entry; in the second case,
        it's list of finite points representing the domain of that entry.
    :param n_training: (int), number of training points if points is None
    :param points: [[float]]; when non-empty, n_training becomes len(points) and the
        seed used in the file name is 0.
    :param noise: boolean, true if the evaluations are noisy
    :param n_samples: int. If noise is true, we take n_samples of the function to
        estimate its value.
    :param random_seed: int
    :param parallel: (boolean) Train in parallel if it's True.
    :param type_bounds: [0 or 1], 0 if the bounds are lower or upper bound of the
        respective entry, 1 if the bounds are all the finite options for that entry.
    :param cache: (boolean) Try to get model from cache
    :param gp_path_cache: path read with JSONFile.read when cache is true; if it yields
        data, its 'data' entry is returned immediately.
    :param simplex_domain: forwarded unchanged to cls.get_points_domain.
    :param objective_function: callable invoked as objective_function(point) or, when
        noise is true, objective_function(point, n_samples). If None, the module named
        by cls.get_name_module(problem_name) is imported and evaluated through
        cls.evaluate_function.

    :return: {'points': [[float]], 'evaluations': [float],
        'var_noise': [float] or []}
    """
    # First choice: the data stored inside an already written model file.
    if cache and gp_path_cache is not None:
        cached_model = JSONFile.read(gp_path_cache)
        if cached_model is not None:
            return cached_model['data']

    logger.info("Getting training data")

    # Explicit points determine the size, and are filed under seed 0.
    has_points = points is not None and len(points) > 0
    if has_points:
        n_training = len(points)
    seed_in_name = 0 if has_points else random_seed

    file_name = cls._filename(
        problem_name=problem_name,
        training_name=training_name,
        n_points=n_training,
        random_seed=seed_in_name,
    )

    # Create PROBLEM_DIR/<problem_name>/data one level at a time.
    training_dir = path.join(PROBLEM_DIR, problem_name, 'data')
    for directory in (PROBLEM_DIR, path.join(PROBLEM_DIR, problem_name), training_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)
    training_path = path.join(training_dir, file_name)

    # Second choice: a previously written training file.
    training_data = JSONFile.read(training_path) if cache else None
    if training_data is not None:
        return training_data

    if n_training == 0:
        return {'points': [], 'evaluations': [], 'var_noise': []}

    np.random.seed(random_seed)

    if not has_points:
        points = cls.get_points_domain(
            n_training, bounds_domain, random_seed, training_name, problem_name,
            type_bounds, simplex_domain=simplex_domain)

    # Without an explicit objective, the problem's own module supplies it.
    name_module = None
    module = None
    if objective_function is None:
        name_module = cls.get_name_module(problem_name)
        module = __import__(name_module, globals(), locals(), -1)

    training_data = {'points': points, 'evaluations': [], 'var_noise': []}

    if not parallel:
        for point in points:
            call_args = (point, n_samples) if noise else (point,)
            if module is not None:
                evaluation = cls.evaluate_function(module, *call_args)
            else:
                evaluation = objective_function(*call_args)
            if noise:
                training_data['var_noise'].append(evaluation[1])
            training_data['evaluations'].append(evaluation[0])
            # Checkpoint after every evaluated point.
            JSONFile.write(training_data, training_path)
        JSONFile.write(training_data, training_path)
        return training_data

    arguments = convert_list_to_dictionary(points)

    kwargs = {'name_module': name_module, 'cls_': cls, 'n_samples': n_samples}
    if name_module is None:
        kwargs['objective_function'] = objective_function

    evaluations = Parallel.run_function_different_arguments_parallel(
        wrapper_evaluate_objective_function, arguments, **kwargs)
    evaluations = convert_dictionary_to_list(evaluations)

    training_data['evaluations'] = [value[0] for value in evaluations]
    if noise:
        training_data['var_noise'] = [value[1] for value in evaluations]

    if cache:
        JSONFile.write(training_data, training_path)

    return training_data
def cv_data_sets(cls, year, month, n_folds=5, random_seed=1):
    """
    Split the training data of a month into n_folds folds and write, for every fold,
    the pair (training_data, validation_data) as JSON and as MATLAB files. The fold
    indexes themselves are written too.

    :param year: str
    :param month: str (e.g. '1', '12')
    :param n_folds: int
    :param random_seed: int
    """
    random.seed(random_seed)

    data = JSONFile.read(cls._name_training_data(year=year, month=month))

    shuffled = list(range(len(data)))
    random.shuffle(shuffled)

    # n_folds consecutive slices of equal size ...
    fold_size = len(shuffled) // n_folds
    folds = []
    for fold in range(n_folds):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])

    # ... and the leftover indexes go one by one to the first folds.
    for fold, leftover in enumerate(shuffled[n_folds * fold_size:]):
        folds[fold].append(leftover)

    JSONFile.write(folds, cls._name_fold_indexes(year=year, month=month))

    for fold, validation_indexes in enumerate(folds):
        validation = [data[index] for index in validation_indexes]

        # Training set: every other fold, in fold order.
        training = [
            data[index]
            for other, indexes in enumerate(folds) if other != fold
            for index in indexes
        ]

        JSONFile.write(
            training,
            cls._name_fold_data_training(year=year, month=month, fold=fold))
        sio.savemat(
            cls._name_fold_data_training_matlab(year=year, month=month, fold=fold),
            {'training': training})

        JSONFile.write(
            validation,
            cls._name_fold_data_validation(year=year, month=month, fold=fold))
        sio.savemat(
            cls._name_fold_data_validation_matlab(year=year, month=month, fold=fold),
            {'validation': validation})
def collect_multi_spec_results(cls, multiple_spec, total_iterations=None, sign=True,
                               sqr=False, same_random_seeds=False, rs_lw=0, rs_up=None):
    """
    Writes the files with the aggregated results.

    For every run described in multiple_spec whose partial-results file exists, the
    'objective_values' are read, grouped by (problem_name, training_name, n_training,
    method) and, per iteration, summarized by mean, standard deviation, number of
    samples and a 1.96-sigma confidence interval of the mean. One file per group is
    written under PROBLEM_DIR/<problem>/AGGREGATED_RESULTS.

    :param multiple_spec: object whose .get(name) returns parallel lists for
        'problem_names', 'training_names', 'n_trainings', 'random_seeds',
        'method_optimizations', 'n_samples_parameterss' and 'n_iterationss'.
    :param total_iterations: (int) Collect results until this iteration
        (10000 if None).
    :param sign: (boolean) If true, we multiply the results by -1
    :param sqr: (boolean) If true, we take the square root of the results
    :param same_random_seeds: (boolean) If true, we use the same random seeds for
        both problems, i.e. only seeds for which every method has a results file.
    :param rs_lw: (int) lower end (inclusive) of the allowed seeds; only used when
        rs_up is given.
    :param rs_up: (int) upper end (exclusive) of the allowed seeds. If not None,
        same_random_seeds is forced to True and seeds outside range(rs_lw, rs_up)
        are dropped.
    :return: None
    """
    if total_iterations is None:
        total_iterations = 10000

    n_specs = len(multiple_spec.get('random_seeds'))

    factor = -1.0 if sign else 1.0
    if sqr:
        transform = lambda x: x ** 0.5
    else:
        transform = lambda x: x

    def existing_runs():
        # Yields (key, random_seed, n_iterations, file_path) for every spec entry
        # whose partial-results file is present on disk.
        for i in range(n_specs):
            problem_name = multiple_spec.get('problem_names')[i]
            results_dir = path.join(PROBLEM_DIR, problem_name, PARTIAL_RESULTS)
            if not os.path.exists(results_dir):
                continue

            training_name = multiple_spec.get('training_names')[i]
            n_training = multiple_spec.get('n_trainings')[i]
            random_seed = multiple_spec.get('random_seeds')[i]
            method = multiple_spec.get('method_optimizations')[i]
            n_samples_parameters = multiple_spec.get('n_samples_parameterss')[i]
            n_iterations = multiple_spec.get('n_iterationss')[i]

            file_name = cls._filename_results(
                problem_name=problem_name,
                training_name=training_name,
                n_points=n_training,
                random_seed=random_seed,
                method=method,
                n_samples_parameters=n_samples_parameters,
            )
            file_path = path.join(results_dir, file_name)
            if not os.path.exists(file_path):
                continue

            key = (problem_name, training_name, n_training, method)
            yield key, random_seed, n_iterations, file_path

    if rs_up is not None:
        same_random_seeds = True

    # None means "no seed filtering".
    allowed_seeds = None
    if same_random_seeds:
        seed_methods = list(set(multiple_spec.get('method_optimizations')))
        seeds_by_method = {}
        for method in seed_methods:
            seeds_by_method[method] = []
        for key, random_seed, _, _ in existing_runs():
            seeds_by_method[key[3]].append(random_seed)

        # Seeds shared by all the methods; an empty specification yields no seeds
        # instead of failing on seed_methods[0].
        allowed_seeds = set(seeds_by_method[seed_methods[0]]) if seed_methods else set()
        for method in seed_methods[1:]:
            allowed_seeds = allowed_seeds.intersection(seeds_by_method[method])

        if rs_up is not None:
            allowed_seeds = allowed_seeds.intersection(range(rs_lw, rs_up))

    # key -> list indexed by iteration, each entry holding one value per run.
    results_dict = {}
    for key, random_seed, n_iterations, file_path in existing_runs():
        if allowed_seeds is not None and random_seed not in allowed_seeds:
            continue

        values = JSONFile.read(file_path)['objective_values']
        n_used = min(total_iterations, n_iterations + 1, len(values))

        buckets = results_dict.setdefault(key, [])
        # Runs of the same key may have different lengths: grow on demand rather
        # than sizing the list from whichever run happens to come first.
        while len(buckets) < n_used:
            buckets.append([])

        for iteration in range(n_used):
            buckets[iteration].append(transform(factor * values[iteration]))

    problem_names = list(set(multiple_spec.get('problem_names')))
    training_names = set(multiple_spec.get('training_names'))
    n_trainings = set(multiple_spec.get('n_trainings'))
    methods = set(multiple_spec.get('method_optimizations'))

    for problem in problem_names:
        for training in training_names:
            for n_training in n_trainings:
                for method in methods:
                    key = (problem, training, n_training, method)
                    if key not in results_dict:
                        continue

                    buckets = results_dict[key]
                    summary = {}
                    for iteration in range(min(len(buckets), total_iterations)):
                        samples = buckets[iteration]
                        if len(samples) == 0:
                            break

                        mean = np.mean(samples)
                        std = np.std(samples)
                        n_samples = len(samples)
                        half_width = 1.96 * std / np.sqrt(n_samples)

                        summary[iteration] = {
                            'mean': mean,
                            'std': std,
                            'n_samples': n_samples,
                            'ci_low': mean - half_width,
                            'ci_up': mean + half_width,
                        }

                    if len(summary) == 0:
                        continue

                    aggregated_dir = path.join(PROBLEM_DIR, problem, AGGREGATED_RESULTS)
                    if not os.path.exists(aggregated_dir):
                        os.mkdir(aggregated_dir)

                    file_name = cls._aggregated_results(
                        problem_name=problem,
                        training_name=training,
                        n_points=n_training,
                        method=method,
                    )
                    JSONFile.write(summary, path.join(aggregated_dir, file_name))
def test_write_debug_data(self, mock_mkdir, mock_exists):
    """
    write_debug_data must create the debugging directory when it is reported missing.

    mock_mkdir and mock_exists are presumably injected by @patch decorators that are
    not part of this block.
    """
    mock_exists.return_value = False

    # File access is replaced by a mock, so nothing is written to disk.
    fake_open = mock_open()
    with patch('__builtin__.open', fake_open):
        self.gp.write_debug_data("a", "b", "c", "d", "e")
        JSONFile.write([], "a")

    mock_mkdir.assert_called_with('data/debugging/a')
def top_users_papers(cls, year, month, n_entries=100, different_papers=20, top_n=5000,
                     n_users=None, only_assign_categories=True):
    """
    Returns the users that accessed to at least n_entries papers, and at least
    different_papers were different and were in the top_n papers in the month of the
    year. Returns the top_n papers based on how many times they were seen.

    The top_n papers are always passed to cls.assign_categories. When
    only_assign_categories is true (the default) the method stops there and returns
    None; otherwise the result is also written to the file named by
    cls._name_file_final.

    :param year: (str)
    :param month: (str) e.g. '1', '12'
    :param n_entries: (int)
    :param different_papers: int
    :param top_n: int
    :param n_users: (int) Maximum number of users allowed. The users with the largest
        number of different top papers are kept.
    :param only_assign_categories: (boolean) If true, only the categories of the top
        papers are assigned and nothing is returned.

    :return: None if only_assign_categories, otherwise
        [
            {'paper': (int) number of times seen},
            {'user': {'stats': ((int) # entries,
                                (int) # different papers in the top_n papers),
                      'diff_papers': [str]}}
        ]
    """
    def keep_last(items, count):
        # items[-count:] would return the whole list for count == 0.
        return items[max(len(items) - count, 0):]

    file_name = cls._name_file_(year=year, month=month)
    data = JSONFile.read(file_name)
    users = data[0]
    papers = data[1]

    # Ascending by views; the sort is stable, so ties keep the order of `papers`.
    papers_by_views = sorted(
        ((paper, papers[paper]['views']) for paper in papers),
        key=lambda pair: pair[1])
    rank_papers = dict(keep_last(papers_by_views, top_n))

    paper_ls = list(rank_papers.keys())
    cls.assign_categories(paper_ls)

    if only_assign_categories:
        return

    # Built once instead of once per user.
    top_papers = set(paper_ls)

    # Users in ascending order of total entries, dropping those below n_entries.
    users_by_entries = sorted(
        ((user, sum(users[user].values())) for user in users),
        key=lambda pair: pair[1])
    active_users = [pair for pair in users_by_entries if pair[1] >= n_entries]

    candidates = []
    for user, n_seen in active_users:
        diff_papers = top_papers.intersection(users[user])
        n_diff = len(diff_papers)
        if n_diff < different_papers:
            continue
        entry = {
            'stats': (n_seen, n_diff),
            # A list, as documented: a set cannot be encoded by the standard json
            # encoder (assuming JSONFile.write relies on it).
            'diff_papers': sorted(diff_papers),
        }
        candidates.append((user, n_diff, entry))

    # Ascending by number of different top papers (stable), best users last.
    candidates.sort(key=lambda item: item[1])
    if n_users is not None:
        candidates = keep_last(candidates, n_users)

    rank_user = dict((user, entry) for user, _, entry in candidates)

    file_name = cls._name_file_final(year=year, month=month)
    JSONFile.write([rank_papers, rank_user], file_name)

    logger.info('Number of papers is %d' % len(rank_papers))
    logger.info('Number of users is %d' % len(rank_user))

    return [rank_papers, rank_user]