def run_tuner_loop(ns):
  """Run tuning loop for this worker."""
  is_chief = FLAGS.task_id == 0
  tuning_space = ns.define_tuner_hparam_space(
      hparam_space_type=FLAGS.hparam_space)
  fixed_hparams = parse_hparams_string(FLAGS.fixed_hparams)
  for name, value in fixed_hparams.items():
    tuning_space[name] = [value]
  tuning_space_size = np.prod(
      [len(values) for values in tuning_space.values()])
  num_local_trials, remainder = divmod(tuning_space_size, FLAGS.num_tuners)
  if FLAGS.tuner_id < remainder:
    num_local_trials += 1
  starting_trial_id = (
      num_local_trials * FLAGS.tuner_id + min(remainder, FLAGS.tuner_id))

  logging.info('tuning_space_size: %d', tuning_space_size)
  logging.info('num_local_trials: %d', num_local_trials)
  logging.info('starting_trial_id: %d', starting_trial_id)

  for local_trial_index in xrange(num_local_trials):
    trial_config = defaults.default_config_with_updates(FLAGS.config)
    global_trial_index = local_trial_index + starting_trial_id
    trial_name = 'trial_' + str(global_trial_index)
    trial_dir = os.path.join(FLAGS.logdir, trial_name)
    hparams = hparams_for_index(global_trial_index, tuning_space)
    ns.write_hparams_to_config(
        trial_config, hparams, hparam_space_type=FLAGS.hparam_space)

    results_list = ns.run_training(
        config=trial_config, tuner=None, logdir=trial_dir, is_chief=is_chief,
        trial_name=trial_name)

    if not is_chief:
      # Only chief worker needs to write tuning results to disk.
      continue

    objective, metrics = compute_tuning_objective(
        results_list, hparams, trial_name, num_trials=tuning_space_size)
    logging.info('metrics:\n%s', metrics)
    logging.info('objective: %s', objective)
    logging.info('programs_seen_fraction: %s',
                 metrics['programs_seen_fraction'])
    logging.info('success_rate: %s', metrics['success_rate'])
    logging.info('success_rate_objective_weight: %s',
                 FLAGS.success_rate_objective_weight)

    tuning_results_file = os.path.join(trial_dir, 'tuning_results.txt')
    with tf.gfile.FastGFile(tuning_results_file, 'a') as writer:
      writer.write(str(metrics) + '\n')

    logging.info('Trial %s complete.', trial_name)
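# run_tuner_loop assumes a helper `hparams_for_index` (defined elsewhere in
# this module) that maps a flat trial index to one point in the Cartesian
# product described by `tuning_space` (a dict of hparam name -> list of
# candidate values). A hypothetical minimal mixed-radix decoding consistent
# with that usage is sketched below; the module's own helper may differ.
def hparams_for_index_sketch(index, tuning_space):
  """Decode a flat trial index into one hparam assignment (hypothetical)."""
  hparams = {}
  for name in sorted(tuning_space):  # Fixed key order so all tuners agree.
    values = tuning_space[name]
    index, value_index = divmod(index, len(values))
    hparams[name] = values[value_index]
  return hparams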
def testMakeTask(self):
  maxlen = 100
  padchr = '['
  config = defaults.default_config_with_updates(
      'env=c(config_for_iclr=False,fixed_string=[8,5,12,12,15])')
  task = code_tasks.make_task(config.env, 'print')
  reward_fns = task.rl_batch(1)
  r = reward_fns[0]
  self.assertClose(
      r('++++++++.---.+++++++...').episode_rewards[-1],
      0.2444)
  self.assertClose(
      r('++++++++.---.+++++++..+++.').episode_rewards[-1],
      0.935)
  self.assertClose(
      r(pad('++++++++.---.+++++++..+++.',
            maxlen, padchr)).episode_rewards[-1],
      0.75)
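# testMakeTask relies on a `pad` helper that right-pads a BF program with a
# filler character up to a fixed length before it is scored. A hypothetical
# minimal version consistent with the call `pad(code, maxlen, padchr)` above
# (the test module's own definition may differ):
def pad_sketch(code_string, pad_length, pad_char):
  """Right-pad `code_string` with `pad_char` up to `pad_length` characters."""
  return code_string + pad_char * (pad_length - len(code_string))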
def RunTrainingSteps(config_string, num_steps=10):
  """Run a few training steps with the given config.

  Just check that nothing crashes.

  Args:
    config_string: Config encoded in a string. See
        $REPO_PATH/common/config_lib.py
    num_steps: Number of training steps to run. Defaults to 10.
  """
  config = defaults.default_config_with_updates(config_string)
  FLAGS.max_npe = num_steps * config.batch_size
  FLAGS.logdir = tf.test.get_temp_dir()
  FLAGS.config = config_string
  run.main(None)
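# RunTrainingSteps is a smoke-test helper: test cases call it with a config
# string and only verify, implicitly, that training does not crash. A
# hypothetical usage sketch (the config string here is illustrative, following
# the config format used by the other tests in this section):
def testTrainingDoesNotCrash_sketch(self):
  RunTrainingSteps(
      'env=c(task="reverse"),'
      'agent=c(algorithm="pg"),'
      'batch_size=8')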
def testVarUpdates(self):
  """Tests that variables get updated as expected.

  For the RL update, check that gradients are non-zero and that the global
  model gets updated.
  """
  config = defaults.default_config_with_updates(
      'env=c(task="reverse"),'
      'agent=c(algorithm="pg",eos_token=True,optimizer="sgd",lr=1.0)')
  lr = config.agent.lr

  tf.reset_default_graph()
  trainer = pg_train.AsyncTrainer(
      config, task_id=0, ps_tasks=0, num_workers=1)
  global_init_op = tf.variables_initializer(
      tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
  with tf.Session() as sess:
    sess.run(global_init_op)  # Initialize global copy.
    trainer.initialize(sess)
    model = trainer.model
    global_vars = sess.run(trainer.global_model.trainable_variables)
    local_vars = sess.run(model.trainable_variables)

    # Make sure names match.
    g_prefix = 'global/'
    l_prefix = 'local/'
    for g, l in zip(trainer.global_model.trainable_variables,
                    model.trainable_variables):
      self.assertEqual(g.name[len(g_prefix):], l.name[len(l_prefix):])

    # Assert that shapes and values are the same between global and local
    # models.
    for g, l in zip(global_vars, local_vars):
      self.assertEqual(g.shape, l.shape)
      self.assertTrue(np.array_equal(g, l))

    # Make all gradients dense tensors.
    for param, grad in model.gradients_dict.items():
      if isinstance(grad, tf.IndexedSlices):
        # Converts to dense tensor.
        model.gradients_dict[param] = tf.multiply(grad, 1.0)

    # Perform update.
    results = model.update_step(
        sess, trainer.data_manager.sample_rl_batch(), trainer.train_op,
        trainer.global_step, return_gradients=True)
    grads_dict = results.gradients_dict
    for grad in grads_dict.values():
      self.assertIsNotNone(grad)
      self.assertTrue(np.count_nonzero(grad) > 0)
    global_update = sess.run(trainer.global_model.trainable_variables)
    for tf_var, var_before, var_after in zip(
        model.trainable_variables, local_vars, global_update):
      # Check that the params were updated.
      self.assertTrue(np.allclose(
          var_after,
          var_before - grads_dict[tf_var] * lr))

    # Test that global to local sync works.
    sess.run(trainer.sync_op)
    global_vars = sess.run(trainer.global_model.trainable_variables)
    local_vars = sess.run(model.trainable_variables)
    for l, g in zip(local_vars, global_vars):
      self.assertTrue(np.allclose(l, g))
def testNumericalGradChecking(self):
  # Similar to
  # http://ufldl.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization.
  epsilon = 1e-4
  eos = misc.BF_EOS_INT
  self.assertEqual(0, eos)
  config = defaults.default_config_with_updates(
      'env=c(task="print"),'
      'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
      'entropy_beta=0.0,topk_loss_hparam=0.0,policy_lstm_sizes=[10],'
      'eos_token=True),'
      'batch_size=64')
  dtype = tf.float64

  tf.reset_default_graph()
  tf.set_random_seed(12345678987654321)
  np.random.seed(1294024302)
  trainer = pg_train.AsyncTrainer(
      config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
  model = trainer.model
  actions_ph = model.actions
  lengths_ph = model.adjusted_lengths
  multipliers_ph = model.policy_multipliers
  loss = model.pi_loss
  global_init_op = tf.variables_initializer(
      tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))

  assign_add_placeholders = [None] * len(model.trainable_variables)
  assign_add_ops = [None] * len(model.trainable_variables)
  param_shapes = [None] * len(model.trainable_variables)
  for i, param in enumerate(model.trainable_variables):
    param_shapes[i] = param.get_shape().as_list()
    assign_add_placeholders[i] = tf.placeholder(
        dtype, np.prod(param_shapes[i]))
    assign_add_ops[i] = param.assign_add(
        tf.reshape(assign_add_placeholders[i], param_shapes[i]))

  with tf.Session() as sess:
    sess.run(global_init_op)  # Initialize global copy.
    trainer.initialize(sess)

    actions_raw = [random_sequence(10, 9) for _ in xrange(16)]
    actions_batch = utils.stack_pad(actions_raw, 0)
    lengths_batch = [len(l) for l in actions_raw]
    feed = {actions_ph: actions_batch,
            multipliers_ph: np.ones_like(actions_batch),
            lengths_ph: lengths_batch}

    estimated_grads = [None] * len(model.trainable_variables)
    for i, param in enumerate(model.trainable_variables):
      param_size = np.prod(param_shapes[i])
      estimated_grads[i] = np.zeros(param_size, dtype=np.float64)
      for index in xrange(param_size):
        e = onehot(index, param_size) * epsilon
        sess.run(assign_add_ops[i], {assign_add_placeholders[i]: e})
        j_plus = sess.run(loss, feed)
        sess.run(assign_add_ops[i], {assign_add_placeholders[i]: -2 * e})
        j_minus = sess.run(loss, feed)
        sess.run(assign_add_ops[i], {assign_add_placeholders[i]: e})
        estimated_grads[i][index] = (j_plus - j_minus) / (2 * epsilon)
      estimated_grads[i] = estimated_grads[i].reshape(param_shapes[i])

    analytic_grads = sess.run(model.dense_unclipped_grads, feed)

    for g1, g2 in zip(estimated_grads[1:], analytic_grads[1:]):
      logging.info('norm (g1-g2): %s', np.abs(g1 - g2).mean())
      self.assertTrue(np.allclose(g1, g2))
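# testNumericalGradChecking uses two small helpers, `onehot` and
# `random_sequence`, which are defined elsewhere in the test module.
# Hypothetical minimal versions consistent with how they are called above
# (assumes `numpy` is imported as `np`, as in the rest of this file; the
# module's own definitions may differ):
def onehot_sketch(index, length):
  """Length-`length` float vector of zeros with a 1.0 at `index`."""
  v = np.zeros(length, dtype=np.float64)
  v[index] = 1.0
  return v


def random_sequence_sketch(max_length, max_token):
  """Random sequence of non-EOS tokens, with length in [1, max_length]."""
  length = np.random.randint(1, max_length + 1)
  return list(np.random.randint(1, max_token + 1, size=length))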
def testMonteCarloGradients(self):
  """Test Monte Carlo estimate of REINFORCE gradient.

  Test that the Monte Carlo estimate of the REINFORCE gradient is
  approximately equal to the true gradient. We compute the true gradient for
  a toy environment with a very small action space.

  Similar to section 5 of https://arxiv.org/pdf/1505.00521.pdf.
  """
  # Test may have different outcome on different machines due to different
  # rounding behavior of float arithmetic.
  tf.reset_default_graph()
  tf.set_random_seed(12345678987654321)
  np.random.seed(1294024302)

  max_length = 2
  num_tokens = misc.bf_num_tokens()
  eos = misc.BF_EOS_INT
  assert eos == 0

  def sequence_iterator(max_length):
    """Iterates through all sequences up to the given length."""
    yield [eos]
    for a in xrange(1, num_tokens):
      if max_length > 1:
        for sub_seq in sequence_iterator(max_length - 1):
          yield [a] + sub_seq
      else:
        yield [a]

  actions = list(sequence_iterator(max_length))

  # This batch contains all possible episodes up to max_length.
  actions_batch = utils.stack_pad(actions, 0)
  lengths_batch = [len(s) for s in actions]

  reward_map = {tuple(a): np.random.randint(-1, 7) for a in actions_batch}
  # reward_map = {tuple(a): np.random.normal(3, 1)
  #               for a in actions_batch}  # normal distribution
  # reward_map = {tuple(a): 1.0
  #               for a in actions_batch}  # expected reward is 1

  n = 100000  # MC sample size.

  config = defaults.default_config_with_updates(
      'env=c(task="print"),'
      'agent=c(algorithm="pg",optimizer="sgd",lr=1.0,ema_baseline_decay=0.99,'
      'entropy_beta=0.0,topk_loss_hparam=0.0,regularizer=0.0,'
      'policy_lstm_sizes=[10],eos_token=True),'
      'batch_size=' + str(n) + ',timestep_limit=' + str(max_length))
  dtype = tf.float64
  trainer = pg_train.AsyncTrainer(
      config, task_id=0, ps_tasks=0, num_workers=1, dtype=dtype)
  model = trainer.model
  actions_ph = model.actions
  lengths_ph = model.adjusted_lengths
  multipliers_ph = model.policy_multipliers

  global_init_op = tf.variables_initializer(
      tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'global'))
  with tf.Session() as sess, sess.graph.as_default():
    sess.run(global_init_op)  # Initialize global copy.
    trainer.initialize(sess)

    # Compute exact gradients.
    # exact_grads = sum(P(a) * grad(log P(a)) * R(a) for a in actions_batch)
    true_loss_unnormalized = 0.0
    exact_grads = [np.zeros(v.shape) for v in model.trainable_variables]
    episode_probs_map = {}
    grads_map = {}
    for a_idx in xrange(len(actions_batch)):
      a = actions_batch[a_idx]
      grads_result, probs_result, loss = sess.run(
          [model.dense_unclipped_grads, model.chosen_probs, model.loss],
          {actions_ph: [a],
           lengths_ph: [lengths_batch[a_idx]],
           multipliers_ph: [
               repeat_and_pad(reward_map[tuple(a)],
                              lengths_batch[a_idx],
                              max_length)]})
      # Take product over time axis.
      episode_probs_result = np.prod(probs_result[0, :lengths_batch[a_idx]])
      for i in range(0, len(exact_grads)):
        exact_grads[i] += grads_result[i] * episode_probs_result
      episode_probs_map[tuple(a)] = episode_probs_result
      grads_map[tuple(a)] = grads_result
      true_loss_unnormalized += loss
    # Normalize loss. Since each episode is fed into the model one at a time,
    # normalization needs to be done manually.
    true_loss = true_loss_unnormalized / float(len(actions_batch))

    # Compute Monte Carlo gradients.
    # E_a~P[grad(log P(a)) R(a)] is approximately equal to
    # sum(grad(log P(a)) R(a) for a in actions_sampled_from_P) / n
    # where len(actions_sampled_from_P) == n.
    #
    # In other words, sample from the policy and compute the gradients of the
    # log probs weighted by the returns. This will exercise the code in
    # agent.py.
    sampled_actions, sampled_lengths = sess.run(
        [model.sampled_tokens, model.episode_lengths])
    pi_multipliers = [
        repeat_and_pad(reward_map[tuple(a)], l, max_length)
        for a, l in zip(sampled_actions, sampled_lengths)]
    mc_grads_unnormalized, sampled_probs, mc_loss_unnormalized = sess.run(
        [model.dense_unclipped_grads, model.chosen_probs, model.loss],
        {actions_ph: sampled_actions,
         multipliers_ph: pi_multipliers,
         lengths_ph: sampled_lengths})
    # Loss is already normalized across the minibatch, so no normalization
    # is needed.
    mc_grads = mc_grads_unnormalized
    mc_loss = mc_loss_unnormalized

    # Make sure true loss and MC loss are similar.
    loss_error = smape(true_loss, mc_loss)
    self.assertTrue(loss_error < 0.15, msg='actual: %s' % loss_error)

    # Check that probs computed for episodes sampled from the model are the
    # same as the recorded true probs.
    for i in range(100):
      acs = tuple(sampled_actions[i].tolist())
      sampled_prob = np.prod(sampled_probs[i, :sampled_lengths[i]])
      self.assertTrue(np.isclose(episode_probs_map[acs], sampled_prob))

    # Make sure MC estimates of true probs are close.
    counter = Counter(tuple(e) for e in sampled_actions)
    for acs in counter:
      count = counter.get(acs)
      mc_prob = count / float(len(sampled_actions))
      true_prob = episode_probs_map[acs]
      error = smape(mc_prob, true_prob)
      self.assertTrue(
          error < 0.15,
          msg='actual: %s; count: %s; mc_prob: %s; true_prob: %s'
          % (error, count, mc_prob, true_prob))

    # Manually recompute MC gradients and make sure they match MC gradients
    # computed in TF.
    mc_grads_recompute = [
        np.zeros(v.shape) for v in model.trainable_variables]
    for i in range(n):
      acs = tuple(sampled_actions[i].tolist())
      for j in range(0, len(mc_grads_recompute)):
        mc_grads_recompute[j] += grads_map[acs][j]
    for i in range(0, len(mc_grads_recompute)):
      self.assertTrue(np.allclose(mc_grads[i], mc_grads_recompute[i] / n))

    # Check angle between gradients as fraction of pi.
    for index in range(len(mc_grads)):
      v1 = mc_grads[index].reshape(-1)
      v2 = exact_grads[index].reshape(-1)
      # angle = arccos(v1 . v2 / (|v1|*|v2|))
      angle_rad = np.arccos(
          np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
      logging.info('angle / pi: %s', angle_rad / np.pi)
      angle_frac = angle_rad / np.pi
      self.assertTrue(angle_frac < 0.02, msg='actual: %s' % angle_frac)

    # Check norms.
    for index in range(len(mc_grads)):
      v1_norm = np.linalg.norm(mc_grads[index].reshape(-1))
      v2_norm = np.linalg.norm(exact_grads[index].reshape(-1))
      error = smape(v1_norm, v2_norm)
      self.assertTrue(error < 0.02, msg='actual: %s' % error)

    # Check expected rewards.
    # E_a~P[R(a)] approx eq sum(P(a) * R(a) for a in actions)
    mc_expected_reward = np.mean(
        [reward_map[tuple(a)] for a in sampled_actions])
    exact_expected_reward = np.sum(
        [episode_probs_map[k] * reward_map[k] for k in reward_map])
    error = smape(mc_expected_reward, exact_expected_reward)
    self.assertTrue(error < 0.005, msg='actual: %s' % error)
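# testMonteCarloGradients relies on two helpers, `smape` and `repeat_and_pad`,
# defined elsewhere in the test module. Hypothetical minimal versions
# consistent with how they are used above (the thresholds in the asserts
# suggest `smape` returns a fraction rather than a percentage; the module's
# own definitions may differ):
def smape_sketch(a, b):
  """Symmetric absolute percentage error, as a fraction."""
  return abs(a - b) / ((abs(a) + abs(b)) / 2.0)


def repeat_and_pad_sketch(value, length, max_length):
  """Repeat `value` for `length` timesteps, then zero-pad up to `max_length`."""
  return [value] * length + [0.0] * (max_length - length)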
def run_training(config=None, tuner=None, logdir=None, trial_name=None,  # pylint: disable=unused-argument
                 is_chief=True):
  """Do all training runs.

  This is the top level training function for genetic algorithm and random
  search models. Run this from the main function.

  Args:
    config: config_lib.Config instance containing global config (agent and
        environment hparams). If None, config will be parsed from FLAGS.config.
    tuner: (unused) A tuner instance. Leave as None if not tuning.
    logdir: Parent directory where all data from all runs will be written. If
        None, FLAGS.logdir will be used.
    trial_name: (unused) If tuning, set this to a unique string that
        identifies this trial. If `tuner` is not None, this also must be set.
    is_chief: True if this worker is the chief.

  Returns:
    List of results dicts which were written to disk. Each training run gets a
    results dict. Results dict contains metrics, i.e. (name, value) pairs which
    give information about the training run.

  Raises:
    ValueError: If FLAGS.num_workers does not divide FLAGS.num_repetitions.
    ValueError: If results dicts read from disk contain invalid data.
  """
  if not config:
    # If custom config is not given, get it from flags.
    config = defaults.default_config_with_updates(FLAGS.config)
  if not logdir:
    logdir = FLAGS.logdir

  if FLAGS.num_repetitions % FLAGS.num_workers != 0:
    raise ValueError('Number of workers must divide number of repetitions')
  num_local_reps = FLAGS.num_repetitions // FLAGS.num_workers
  logging.info('Running %d reps globally.', FLAGS.num_repetitions)
  logging.info('This worker will run %d local reps.', num_local_reps)
  if FLAGS.max_npe:
    max_generations = FLAGS.max_npe // config.batch_size
    logging.info('Max samples per rep: %d', FLAGS.max_npe)
    logging.info('Max generations per rep: %d', max_generations)
  else:
    max_generations = sys.maxsize
    logging.info('Running unlimited generations.')

  assert FLAGS.num_workers > 0
  logging.info('Starting experiment. Directory: "%s"', logdir)
  results = results_lib.Results(logdir, FLAGS.task_id)
  local_results_list = results.read_this_shard()
  if local_results_list:
    if local_results_list[0]['max_npe'] != FLAGS.max_npe:
      raise ValueError(
          'Cannot resume training. Max-NPE changed. Was %s, now %s'
          % (local_results_list[0]['max_npe'], FLAGS.max_npe))
    if (local_results_list[0]['max_global_repetitions']
        != FLAGS.num_repetitions):
      raise ValueError(
          'Cannot resume training. Number of repetitions changed. Was %s, '
          'now %s'
          % (local_results_list[0]['max_global_repetitions'],
             FLAGS.num_repetitions))
  start_rep = len(local_results_list)

  for rep in xrange(start_rep, num_local_reps):
    global_rep = num_local_reps * FLAGS.task_id + rep
    logging.info('Starting repetition: Rep = %d. (global rep = %d)',
                 rep, global_rep)

    # Save data for each rep, like checkpoints, goes into separate folders.
    run_dir = os.path.join(logdir, 'run_%d' % global_rep)

    if not tf.gfile.IsDirectory(run_dir):
      tf.gfile.MakeDirs(run_dir)
    checkpoint_writer = CheckpointWriter(run_dir,
                                         population_size=config.batch_size)

    data_manager = data.DataManager(config, run_number=global_rep)
    task_eval_fn = ga_lib.make_task_eval_fn(data_manager.rl_task)

    if config.agent.algorithm == 'rand':
      logging.info('Running random search.')
      assert FLAGS.max_npe
      result = run_random_search(
          FLAGS.max_npe, run_dir, task_eval_fn, config.timestep_limit)
    else:
      assert config.agent.algorithm == 'ga'
      logging.info('Running genetic algorithm.')
      pop = ga_lib.make_population(
          ga_lib.random_individual(config.timestep_limit),
          n=config.batch_size)
      hof = utils.MaxUniquePriorityQueue(2)  # Hall of fame.
      result = ga_lib.ga_loop(
          pop,
          cxpb=config.agent.crossover_rate,
          mutpb=config.agent.mutation_rate,
          task_eval_fn=task_eval_fn,
          ngen=max_generations,
          halloffame=hof,
          checkpoint_writer=checkpoint_writer)

    logging.info('Finished rep. Num gens: %d', result.generations)

    results_dict = {
        'max_npe': FLAGS.max_npe,
        'batch_size': config.batch_size,
        'max_batches': FLAGS.max_npe // config.batch_size,
        'npe': result.num_programs,
        'max_global_repetitions': FLAGS.num_repetitions,
        'max_local_repetitions': num_local_reps,
        'code_solution': result.best_code if result.solution_found else '',
        'best_reward': result.reward,
        'num_batches': result.generations,
        'found_solution': result.solution_found,
        'task': data_manager.task_name,
        'global_rep': global_rep}
    logging.info('results_dict: %s', results_dict)
    results.append(results_dict)

  if is_chief:
    logging.info(
        'Worker is chief. Waiting for all workers to finish so that results '
        'can be reported to the tuner.')
    global_results_list, shard_stats = results.read_all(
        num_shards=FLAGS.num_workers)
    while not all(s.finished for s in shard_stats):
      logging.info(
          'Still waiting on these workers: %s',
          ', '.join(
              ['%d (%d reps left)'
               % (i, s.max_local_reps - s.num_local_reps_completed)
               for i, s in enumerate(shard_stats)
               if not s.finished]))
      sleep(60)
      global_results_list, shard_stats = results.read_all(
          num_shards=FLAGS.num_workers)

    logging.info('%d results obtained. Chief worker is exiting the experiment.',
                 len(global_results_list))

    return global_results_list
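# `run_random_search`, used above for the 'rand' baseline, is defined
# elsewhere in this file. The idea: sample random programs until the NPE
# budget is spent or a solution is found, tracking the best reward seen.
# The sketch below is hypothetical and self-contained: `sample_program_fn`,
# the (reward, correct) return contract of `eval_fn`, and the result fields
# (which simply mirror the attributes read when building `results_dict`
# above) are all assumptions, and the real helper also writes per-run
# artifacts into `run_dir`.
import collections

RandomSearchResultSketch = collections.namedtuple(
    'RandomSearchResultSketch',
    ['reward', 'best_code', 'solution_found', 'num_programs', 'generations'])


def run_random_search_sketch(max_npe, eval_fn, sample_program_fn):
  """Hypothetical random-search baseline over program strings."""
  best_reward = -float('inf')
  best_code = ''
  solution_found = False
  num_programs = 0
  while num_programs < max_npe and not solution_found:
    code = sample_program_fn()
    reward, correct = eval_fn(code)
    num_programs += 1
    if reward > best_reward:
      best_reward = reward
      best_code = code
    solution_found = correct
  return RandomSearchResultSketch(
      reward=best_reward, best_code=best_code, solution_found=solution_found,
      num_programs=num_programs, generations=num_programs)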