def run_training(
        config=None,
        tuner=None,
        logdir=None,
        trial_name=None,  # pylint: disable=unused-argument
        is_chief=True):
    """Do all training runs.

  This is the top level training function for policy gradient based models.
  Run this from the main function.

  Args:
    config: config_lib.Config instance containing global config (agent and
        environment hparams). If None, config will be parsed from FLAGS.config.
    tuner: (unused) A tuner instance. Leave as None if not tuning.
    logdir: Parent directory where all data from all runs will be written. If
        None, FLAGS.logdir will be used.
    trial_name: (unused) If tuning, set this to a unique string that identifies
        this trial. If `tuner` is not None, this also must be set.
    is_chief: True if this worker is the chief.

  Returns:
    List of results dicts which were written to disk. Each training run gets a
    results dict. Results dict contains metrics, i.e. (name, value) pairs which
    give information about the training run.

  Raises:
    ValueError: If FLAGS.num_workers does not divide FLAGS.num_repetitions.
    ValueError: If results dicts read from disk contain invalid data.
  """
    if not config:
        # If custom config is not given, get it from flags.
        config = defaults.default_config_with_updates(FLAGS.config)
    if not logdir:
        logdir = FLAGS.logdir

    if FLAGS.num_repetitions % FLAGS.num_workers != 0:
        raise ValueError('Number of workers must divide number of repetitions')
    num_local_reps = FLAGS.num_repetitions // FLAGS.num_workers
    logging.info('Running %d reps globally.', FLAGS.num_repetitions)
    logging.info('This worker will run %d local reps.', num_local_reps)
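    # max_npe caps the number of programs executed per repetition; since each
    # generation evaluates one batch of programs, it translates into a cap on
    # the number of generations.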
    if FLAGS.max_npe:
        max_generations = FLAGS.max_npe // config.batch_size
        logging.info('Max samples per rep: %d', FLAGS.max_npe)
        logging.info('Max generations per rep: %d', max_generations)
    else:
        max_generations = sys.maxint
        logging.info('Running unlimited generations.')

    assert FLAGS.num_workers > 0
    logging.info('Starting experiment. Directory: "%s"', logdir)
    results = results_lib.Results(logdir, FLAGS.task_id)
    local_results_list = results.read_this_shard()
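    # Results already written for this shard mean a previous run was
    # interrupted. Validate that the settings match, then resume from the
    # first unfinished repetition.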
    if local_results_list:
        if local_results_list[0]['max_npe'] != FLAGS.max_npe:
            raise ValueError(
                'Cannot resume training. Max-NPE changed. Was %s, now %s'
                % (local_results_list[0]['max_npe'], FLAGS.max_npe))
        if (local_results_list[0]['max_global_repetitions']
                != FLAGS.num_repetitions):
            raise ValueError(
                'Cannot resume training. Number of repetitions changed. '
                'Was %s, now %s'
                % (local_results_list[0]['max_global_repetitions'],
                   FLAGS.num_repetitions))
    start_rep = len(local_results_list)

    for rep in xrange(start_rep, num_local_reps):
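        # `rep` indexes this worker's local repetitions; `global_rep` is the
        # corresponding index across all workers (worker i owns the contiguous
        # block starting at i * num_local_reps).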
        global_rep = num_local_reps * FLAGS.task_id + rep
        logging.info('Starting repetition: Rep = %d. (global rep = %d)', rep,
                     global_rep)

        # Saved data for each rep (e.g. checkpoints) goes into a separate folder.
        run_dir = os.path.join(logdir, 'run_%d' % global_rep)

        if not tf.gfile.IsDirectory(run_dir):
            tf.gfile.MakeDirs(run_dir)
        checkpoint_writer = CheckpointWriter(run_dir,
                                             population_size=config.batch_size)

        data_manager = data.DataManager(config, run_number=global_rep)
        task_eval_fn = ga_lib.make_task_eval_fn(data_manager.rl_task)

        if config.agent.algorithm == 'rand':
            logging.info('Running random search.')
            assert FLAGS.max_npe
            result = run_random_search(FLAGS.max_npe, run_dir, task_eval_fn,
                                       config.timestep_limit)
        else:
            assert config.agent.algorithm == 'ga'
            logging.info('Running genetic algorithm.')
            pop = ga_lib.make_population(
                ga_lib.random_individual(config.timestep_limit),
                n=config.batch_size)
            hof = utils.MaxUniquePriorityQueue(2)  # Hall of fame.
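            # The hall of fame retains the top 2 unique individuals seen over
            # all generations.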
            result = ga_lib.ga_loop(pop,
                                    cxpb=config.agent.crossover_rate,
                                    mutpb=config.agent.mutation_rate,
                                    task_eval_fn=task_eval_fn,
                                    ngen=max_generations,
                                    halloffame=hof,
                                    checkpoint_writer=checkpoint_writer)

        logging.info('Finished rep. Num gens: %d', result.generations)

        results_dict = {
            'max_npe': FLAGS.max_npe,
            'batch_size': config.batch_size,
            'max_batches': FLAGS.max_npe // config.batch_size,
            'npe': result.num_programs,
            'max_global_repetitions': FLAGS.num_repetitions,
            'max_local_repetitions': num_local_reps,
            'code_solution': result.best_code if result.solution_found else '',
            'best_reward': result.reward,
            'num_batches': result.generations,
            'found_solution': result.solution_found,
            'task': data_manager.task_name,
            'global_rep': global_rep
        }
        logging.info('results_dict: %s', results_dict)
        results.append(results_dict)

    if is_chief:
        logging.info(
            'Worker is chief. Waiting for all workers to finish so that results '
            'can be reported to the tuner.')

        global_results_list, shard_stats = results.read_all(
            num_shards=FLAGS.num_workers)
        while not all(s.finished for s in shard_stats):
            logging.info(
                'Still waiting on these workers: %s', ', '.join([
                    '%d (%d reps left)' %
                    (i, s.max_local_reps - s.num_local_reps_completed)
                    for i, s in enumerate(shard_stats) if not s.finished
                ]))
            sleep(60)
            global_results_list, shard_stats = results.read_all(
                num_shards=FLAGS.num_workers)

        logging.info(
            '%d results obtained. Chief worker is exiting the experiment.',
            len(global_results_list))

        return global_results_list
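
# A minimal usage sketch (hypothetical `main`; assumes the surrounding module
# defines FLAGS and uses absl-style startup, as the logging/FLAGS calls above
# suggest):
#
#   def main(argv):
#       del argv  # Unused.
#       results = run_training()  # config and logdir fall back to FLAGS.
#       # Only the chief worker returns the aggregated results list.
#       if results is not None:
#           logging.info('Collected %d results dicts.', len(results))
#
#   if __name__ == '__main__':
#       app.run(main)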
Example #2
    def __init__(self,
                 config,
                 task_id,
                 ps_tasks,
                 num_workers,
                 is_chief=True,
                 summary_writer=None,
                 dtype=tf.float32,
                 summary_interval=1,
                 run_number=0,
                 logging_dir='/tmp',
                 model_v=0):
        self.config = config
        self.data_manager = data.DataManager(
            config,
            run_number=run_number,
            do_code_simplification=not FLAGS.stop_on_success)
        self.task_id = task_id
        self.ps_tasks = ps_tasks
        self.is_chief = is_chief
        if ps_tasks == 0:
            assert task_id == 0, 'No parameter servers specified. Expecting 1 task.'
            assert num_workers == 1, (
                'No parameter servers specified. Expecting 1 worker.')
            worker_device = '/job:localhost/replica:%d/task:0/cpu:0' % task_id
            # worker_device = '/cpu:0'
            # ps_device = '/cpu:0'
        else:
            assert num_workers > 0, 'There must be at least 1 training worker.'
            worker_device = '/job:worker/replica:%d/task:0/cpu:0' % task_id
            # ps_device = '/job:ps/replica:0/task:0/cpu:0'
        tf.logging.info('worker_device: %s', worker_device)

        logging_file = os.path.join(logging_dir, 'solutions_%d.txt' % task_id)
        experience_replay_file = os.path.join(
            logging_dir, 'replay_buffer_%d.pickle' % task_id)
        self.topk_file = os.path.join(logging_dir,
                                      'topk_buffer_%d.pickle' % task_id)

        tf.get_variable_scope().set_use_resource(True)

        # global model
        with tf.device(
                tf.train.replica_device_setter(ps_tasks,
                                               ps_device='/job:ps/replica:0',
                                               worker_device=worker_device)):
            with tf.variable_scope('global'):
                global_model = agent_lib.LMAgent(config,
                                                 dtype=dtype,
                                                 is_local=False)
                global_params_dict = {
                    p.name: p
                    for p in global_model.sync_variables
                }
                self.global_model = global_model
                self.global_step = make_initialized_variable(0,
                                                             'global_step',
                                                             dtype=tf.int64)

                self.global_best_reward = make_initialized_variable(
                    -10.0, 'global_best_reward', dtype=tf.float64)
                self.is_best_model = make_initialized_variable(False,
                                                               'is_best_model',
                                                               dtype=tf.bool)
                self.reset_is_best_model = self.is_best_model.assign(False)
                self.global_best_reward_placeholder = tf.placeholder(
                    tf.float64, [], name='global_best_reward_placeholder')
                self.assign_global_best_reward_op = tf.group(
                    self.global_best_reward.assign(
                        self.global_best_reward_placeholder),
                    self.is_best_model.assign(True))

                def assign_global_best_reward_fn(session, reward):
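                    # Compare rewards rounded to 10 decimal places so tiny
                    # floating point differences are not counted as a new best.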
                    reward = round(reward, 10)
                    best_reward = round(session.run(self.global_best_reward),
                                        10)
                    is_best = reward > best_reward
                    if is_best:
                        session.run(
                            self.assign_global_best_reward_op,
                            {self.global_best_reward_placeholder: reward})
                    return is_best

                self.assign_global_best_reward_fn = assign_global_best_reward_fn

                # Any worker will set this to True when it finds a solution.
                self.found_solution_flag = make_initialized_variable(
                    False, 'found_solution_flag', dtype=tf.bool)
                self.found_solution_op = self.found_solution_flag.assign(True)

                self.run_number = make_initialized_variable(run_number,
                                                            'run_number',
                                                            dtype=tf.int32)

                # Store a solution when found.
                self.code_solution_variable = tf.get_variable(
                    'code_solution', [],
                    tf.string,
                    initializer=tf.constant_initializer(''))
                self.code_solution_ph = tf.placeholder(tf.string, [],
                                                       name='code_solution_ph')
                self.code_solution_assign_op = self.code_solution_variable.assign(
                    self.code_solution_ph)

                def assign_code_solution_fn(session, code_solution_string):
                    session.run(self.code_solution_assign_op,
                                {self.code_solution_ph: code_solution_string})

                self.assign_code_solution_fn = assign_code_solution_fn

                # Count all programs sampled from policy. This does not include
                # programs sampled from replay buffer.
                # This equals NPE (number of programs executed). Only programs sampled
                # from the policy need to be executed.
                self.program_count = make_initialized_variable(0,
                                                               'program_count',
                                                               dtype=tf.int64)

        # local model
        with tf.device(worker_device):
            with tf.variable_scope('local'):
                self.model = model = agent_lib.LMAgent(
                    config,
                    task_id=task_id,
                    logging_file=logging_file,
                    experience_replay_file=experience_replay_file,
                    dtype=dtype,
                    global_best_reward_fn=self.assign_global_best_reward_fn,
                    found_solution_op=self.found_solution_op,
                    assign_code_solution_fn=self.assign_code_solution_fn,
                    program_count=self.program_count,
                    stop_on_success=FLAGS.stop_on_success,
                    verbose_level=model_v)
                local_params = model.trainable_variables
                local_params_dict = {p.name: p for p in local_params}

        # Pull global params to local model.
        def _global_to_local_scope(name):
            assert name.startswith('global/')
            return 'local' + name[6:]

        sync_dict = {
            local_params_dict[_global_to_local_scope(p_name)]: p
            for p_name, p in global_params_dict.items()
        }
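        # Each global variable is matched to its local counterpart by name
        # ('global/...' -> 'local/...'); sync_op copies the current global
        # values into the local variables.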
        self.sync_op = tf.group(*[
            v_local.assign(v_global)
            for v_local, v_global in sync_dict.items()
        ])

        # Pair local gradients with global params.
        grad_var_dict = {
            gradient: sync_dict[local_var]
            for local_var, gradient in model.gradients_dict.items()
        }

        # local model
        model.make_summary_ops()  # Don't put summaries under 'local' scope.
        with tf.variable_scope('local'):
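            # Apply gradients computed on the local model directly to the
            # corresponding global (shared) variables, incrementing the shared
            # global step.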
            self.train_op = model.optimizer.apply_gradients(
                grad_var_dict.items(), global_step=self.global_step)
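            # Initialize only the variables created under the current
            # ('local') variable scope.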
            self.local_init_op = tf.variables_initializer(
                tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                  tf.get_variable_scope().name))

        self.local_step = 0
        self.last_summary_time = time.time()
        self.summary_interval = summary_interval
        self.summary_writer = summary_writer
        self.cached_global_step = -1
        self.cached_global_npe = -1

        tf.logging.info('summary_interval: %d', self.summary_interval)

        # Load top-k buffer.
        if self.model.top_episodes is not None and tf.gfile.Exists(
                self.topk_file):
            try:
                with tf.gfile.FastGFile(self.topk_file, 'r') as f:
                    self.model.top_episodes = cPickle.loads(f.read())
                tf.logging.info(
                    'Loaded top-k buffer from disk with %d items. Location: "%s"',
                    len(self.model.top_episodes), self.topk_file)
            except (cPickle.UnpicklingError, EOFError) as e:
                tf.logging.warn(
                    'Failed to load existing top-k buffer from disk. Removing bad file.'
                    '\nLocation: "%s"\nException: %s', self.topk_file, str(e))
                tf.gfile.Remove(self.topk_file)