def optimize_parameter_lbfgs(model, param_name, f, g, bounds=(1e-4, None), disp=0, max_evals=100):
    from scipy.optimize import fmin_l_bfgs_b
    p = ModelParameterAcessor(model, param_name)

    # Scipy expects function parameters to be 1d, so we have to ravel/unravel the parameter values
    # for each evaluation
    def eval_f(param_as_list):
        old_value = p.get()  # Save old value
        p.set_flattened(param_as_list)  # Set new value
        f_val = f()
        p.set(old_value)  # Restore old value
        return -f_val

    def eval_g(param_as_list):
        old_value = p.get()  # Save old value
        p.set_flattened(param_as_list)  # Set new value
        g_val = ravel(g())
        p.set(old_value)  # Restore old value
        return -g_val

    x0 = ravel(p.get())
    bounds = [bounds] * len(x0)  # e.g. (1e-4, None) keeps the parameter positive

    old_f_val = f()
    x, new_f_val, d = fmin_l_bfgs_b(eval_f, x0, fprime=eval_g, bounds=bounds, maxfun=max_evals, disp=disp)
    p.set_flattened(x)
    new_f_val = f()
    log.info('Optimized %s; improvement: %g' % (param_name, new_f_val - old_f_val))
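# Usage sketch (illustrative, not from the original source): tuning a single model
# parameter, assuming the model exposes an objective/gradient pair such as the
# l_xi/grad_l_xi methods exercised in check_grads below:
#
#     optimize_parameter_lbfgs(model, 'xi', model.l_xi, model.grad_l_xi,
#                              bounds=(1e-4, None), max_evals=50)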
def __init__(self, argv=None, kw=None):
    self.output_files = []
    if kw and argv:
        raise ValueError('Must provide at most one of the argv or kw kwargs')

    # If the class doesn't specify 'binary', use the Python file in which the class was defined.
    if self.binary is None:
        self.binary = os.path.abspath(inspect.getfile(self.__class__))
        log.info('Guessing binary %s' % self.binary)
    if not os.path.isfile(self.binary):
        raise Exception('Unable to locate binary %s for condorizable job' % self.binary)

    if kw is not None:
        argv = [self.binary] + kwargs_to_argv(kw)
    elif argv is not None:
        argv = list(argv)

    # Install the SIGTERM handler
    signal.signal(signal.SIGTERM, self.sigterm_handler)

    self.argv = argv
    if self.argv is not None:
        self.parse_argv_and_run(self.argv)
def parse_argv_and_run(self, argv=None):
    if argv is not None:
        self.argv = argv

    condorize = self.CONDOR_FLAG in self.argv
    if condorize:
        self.argv.remove(self.CONDOR_FLAG)

    log_output = self.CONDOR_LOG_FLAG in self.argv
    if log_output:
        self.argv.remove(self.CONDOR_LOG_FLAG)
        if not condorize:
            raise Exception('Flag %s only applies to condor jobs' % self.CONDOR_LOG_FLAG)

    # Check the arguments, even if this is a condor job being started.  This allows condor jobs
    # to fail fast, rather than dying on a remote node.
    options = self.check_args(self.argv)
    if options is None:
        raise Exception('check_args function must return options structure; got None instead')

    for filename in self.output_files:
        self.check_output_file_is_unlocked(filename)

    if condorize:
        log.info('Condorizing %s' % ' '.join(self.argv))
        self.run_on_condor(self.argv, log_output=log_output)
        return

    try:
        self.lock_output_files_or_die()
        self.run(options)
    finally:
        self.on_exit()
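# Minimal subclass sketch (hypothetical; the class and argument names are illustrative,
# only check_args/run/output_files come from the code above): check_args must validate
# argv and return an options structure, run does the actual work, and any path appended
# to output_files is lock-checked before the job starts.
#
#     class ExampleJob(Condorizable):
#         def check_args(self, argv):
#             parser = ArgumentParser()
#             parser.add_argument('dest', type=str)
#             options = parser.parse_args(argv[1:])
#             self.output_files.append(options.dest)
#             return options
#
#         def run(self, options):
#             pass  # Do the real work here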
def run(argv):
    parser = ArgumentParser()
    parser.add_argument('vem_model', type=str, help='SAM VEM model to use features from')
    parser.add_argument('-c', type=float, default=1.0, help='SVM C parameter')
    options = parser.parse_args(argv[1:])

    log.info('Loading SAM model %s' % options.vem_model)
    sam_model = VEMModel.load(options.vem_model)

    log.info('Making dataset')
    dataset = make_dataset(sam_model)

    metric = ClassificationError()
    scores = []
    for i in range(20):
        train_data, test_data = dataset.split(p=0.90, seed=i)
        topic_svm = TopicSVM(sam_model, C=options.c, normalize=True)
        topic_svm.train(train_data)
        predictions = topic_svm.predict(test_data)
        score = metric(test_data.targets, predictions)
        log.info(score)
        scores.append(score)

    log.info('Mean classification error: %g' % np.mean(scores))
def run(self, options):
    labeler = labelers.registry[options.labeler]

    # Compute the labels up front so the ArffWriter can be given the full class list
    filenames = open(options.file_list).readlines()
    labels = [labeler(each.strip()) for each in filenames]
    class_list = sorted(set(labels))
    writer = ArffWriter(options.dest, class_list=class_list)

    log.info('Writing GIST data to %s' % options.dest)
    for i, (filename, label) in enumerate(izip(filenames, labels)):
        filename = filename.strip()
        log.info('Processing image %d/%d' % (i + 1, len(filenames)))
        descriptor = color_gist(filename) if options.color else grayscale_gist(filename)
        if options.normalize:
            descriptor = l2_normalize(descriptor)
        writer.write_example(descriptor, label)
    writer.close()
def run(self, options):
    labeler = None if options.labeler is None else labelers.registry[options.labeler]

    # Wait to instantiate the corpus writer until we know the dimensionality of the descriptors
    # we'll be writing
    writer = None
    log.info('Writing SAM corpus to %s' % options.dest_corpus)
    filenames = open(options.file_list).readlines()
    for i, filename in enumerate(filenames):
        filename = filename.strip()
        log.info('Processing image %d/%d' % (i + 1, len(filenames)))
        descriptor = color_gist(filename) if options.color else grayscale_gist(filename)
        if writer is None:
            dim = descriptor.size
            writer = CorpusWriter(options.dest_corpus, data_series='sam', dim=dim)
        normalized_descriptor = l2_normalize(descriptor)
        doc_label = labeler(filename) if labeler else None
        writer.write_doc(ascolvector(normalized_descriptor), name=filename, label=doc_label)
    writer.close()
def optimize_parameter(model, param_name, f, g, bounds=(1e-4, None), disp=0, max_evals=100):
    from scipy.optimize import fmin_tnc
    p = ModelParameterAcessor(model, param_name)

    # Scipy expects function parameters to be 1d, so we have to ravel/unravel the parameter values
    # for each evaluation
    def negative_f_and_f_prime(param_as_list):
        old_value = p.get()  # Save old value
        p.set_flattened(param_as_list)  # Set new value
        f_val = -f()
        f_prime_val = ravel(-g())
        p.set(old_value)  # Restore old value
        return f_val, f_prime_val

    x0 = ravel(p.get())
    bounds = [bounds] * len(x0)

    old_f_val = f()
    x, nfeval, rc = fmin_tnc(negative_f_and_f_prime, x0=x0, bounds=bounds, disp=disp, maxfun=max_evals)
    p.set_flattened(x)
    new_f_val = f()
    log.info('Optimized %s; improvement: %g' % (param_name, new_f_val - old_f_val))
def optimize_parameter_lbfgs_coor(model, f, g, bounds=(1e-4, None), disp=0, max_evals=10):
    from scipy.optimize import fmin_l_bfgs_b
    doc_x = ModelParameterAcessor(model, 'x')
    topic_delta = ModelParameterAcessor(model, 'delta')
    split = model.num_docs * model.dim  # Boundary between the flattened x and delta blocks

    # Scipy expects function parameters to be 1d, so we have to ravel/unravel the parameter values
    # for each evaluation
    def eval_f(param_as_list):
        old_value_doc_x = doc_x.get()  # Save old values
        old_value_topic_delta = topic_delta.get()
        doc_x.set_flattened(param_as_list[:split])  # Set new values
        topic_delta.set_flattened(param_as_list[split:])
        f_val = -f()
        doc_x.set(old_value_doc_x)  # Restore old values
        topic_delta.set(old_value_topic_delta)
        return f_val

    def eval_g(param_as_list):
        old_value_doc_x = doc_x.get()  # Save old values
        old_value_topic_delta = topic_delta.get()
        doc_x.set_flattened(param_as_list[:split])  # Set new values
        topic_delta.set_flattened(param_as_list[split:])
        f_prime_val = -g()
        doc_x.set(old_value_doc_x)  # Restore old values
        topic_delta.set(old_value_topic_delta)
        return f_prime_val

    x0 = np.concatenate([ravel(doc_x.get()), ravel(topic_delta.get())])
    # NOTE: the bounds argument is currently unused; the optimization runs unconstrained
    # bounds = [bounds] * len(x0)

    old_f_val = f()
    x, new_f_val, d = fmin_l_bfgs_b(eval_f, x0, fprime=eval_g, maxfun=max_evals, disp=disp)
    doc_x.set_flattened(x[:split])
    topic_delta.set_flattened(x[split:])
    new_f_val = f()
    log.info('Optimized %s; improvement: %g' % ('x,delta', new_f_val - old_f_val))
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = ArgumentParser()
    parser.add_argument('input_file', type=str, help='Input file in evidence format')
    parser.add_argument('output_file', type=str, help='Path to destination corpus file')
    parser.add_argument('--labeler', type=str, help='Labeler to apply')
    options = parser.parse_args(argv[1:])

    labeler = None
    if options.labeler is None:
        log.warning('No labeler provided')
    elif options.labeler not in labelers.registry:
        labeler_names = ', '.join(sorted(labelers.registry.keys()))
        parser.error('Invalid labeler "%s"; available options are %s' % (options.labeler, labeler_names))
    else:
        labeler = labelers.registry[options.labeler]

    instance_dict = load_evidence_file(options.input_file)
    num_docs = len(instance_dict)
    feature_ids = sorted(set(chain(*[each.iterkeys() for each in instance_dict.values()])))
    vocab_size = len(feature_ids)
    log.info('Read %d docs (vocabulary size %d) from %s' % (num_docs, vocab_size, options.input_file))

    log.info('Writing L2-normalized corpus to %s' % options.output_file)
    writer = CorpusWriter(options.output_file, data_series='sam', dim=vocab_size)

    # Create a map of feature_id => dense feature index
    feature_index = {k: i for i, k in enumerate(feature_ids)}

    # For each document, convert sparse features to a dense L2-normalized feature vector and write
    # it to the corpus
    for name, sparse_features in instance_dict.iteritems():
        doc_data = np.zeros((vocab_size, 1))
        for feature_id, count in sparse_features.iteritems():
            doc_data[feature_index[feature_id]] = count
        doc_data = l2_normalize(doc_data)
        doc_label = labeler(name) if labeler else None
        writer.write_doc(doc_data, name=name, label=doc_label)
    writer.close()

    wordlist_path = options.output_file + '.wordlist'
    log.info('Writing wordlist to %s' % wordlist_path)
    with open(wordlist_path, 'w') as f:
        f.writelines([s + '\n' for s in feature_ids])
def check_grads(model):
    assert np.isfinite(model.l_alpha())
    assert np.isfinite(model.l_valpha())
    x = model.grad_l_vmu()
    assert np.isfinite(x).all()

    import pdb
    try:
        # Main update rules
        log.info('xi update: %s' % check_grad(model, 'xi', model.l_xi, model.grad_l_xi))
        log.info('valpha update: %s' % check_grad(model, 'valpha', model.l_valpha, model.grad_l_valpha))
        log.info('alpha update: %s' % check_grad(model, 'alpha', model.l_alpha, model.grad_l_alpha))
        log.info('vmu update: %s' % check_grad(model, 'vmu', model.l_vmu, model.tangent_grad_l_vmu))

        f = lambda: avk(model.V, model.xi)
        g = lambda: deriv_avk(model.V, model.xi)
        log.info('avk_xi: %s' % check_grad(model, 'xi', f, g))

        f = lambda: np.sum(model.e_squared_norm_batch())
        g = lambda: np.sum(model.grad_e_squared_norm_xi())
        log.info('grad_esn_xi: %s' % check_grad(model, 'xi', f, g))

        f = lambda: np.sum(model.rho_batch())
        g = lambda: np.sum(model.deriv_rho_xi())
        log.info('deriv_rho_xi: %s' % check_grad(model, 'xi', f, g))
    except Exception, e:
        log.error(e)
        pdb.post_mortem()
def run(self, options):
    if os.path.exists(options.model):
        log.info('Loading model snapshot from %s' % options.model)
        model = VEMModel.load(options.model)
    else:
        # Initialize a model from scratch
        log.info('Initializing new model on %s [T=%d]' % (options.corpus, options.T))
        reader = CorpusReader(options.corpus, data_series='sam')
        model = VEMModel(reader=reader, T=options.T)

    while model.iteration < options.iterations:
        log.info('** Iteration %d / %d **' % (model.iteration + 1, options.iterations))
        model.run_one_iteration()

        if model.iteration % SAVE_MODEL_INTERVAL == 0:
            log.info('Saving model snapshot...')
            model.save(options.model)

        if model.iteration % SAVE_TOPICS_INTERVAL == 0:
            if options.write_topics:
                log.info('Saving topics to %s' % options.write_topics)
                with open(options.write_topics, 'w') as f:
                    model.write_topics(f)
            if options.write_topic_weights:
                log.info('Saving topic weights to %s' % options.write_topic_weights)
                with open(options.write_topic_weights, 'w') as f:
                    model.write_topic_weights_arff(f)

    # Write the final topics, topic weights, and model snapshot
    if options.write_topics:
        log.info('Saving topics to %s' % options.write_topics)
        with open(options.write_topics, 'w') as f:
            model.write_topics(f)
    if options.write_topic_weights:
        log.info('Saving topic weights to %s' % options.write_topic_weights)
        with open(options.write_topic_weights, 'w') as f:
            model.write_topic_weights_arff(f)
    model.save(options.model)
def on_exit(self):
    for output_file in self.output_files:
        lock_file = self.get_lock_file_for(output_file)
        if os.path.isfile(lock_file):
            log.info('Removing lock file %s' % lock_file)
            os.remove(lock_file)