def _set_up(self, eval_mode):
  """Sets up the runner by creating and initializing the agent."""
  # Reset the tf default graph to avoid name collisions from previous runs
  # before doing anything else.
  tf.reset_default_graph()
  self._summary_writer = tf.summary.FileWriter(self._output_dir)
  if self._episode_log_file:
    self._episode_writer = tf.io.TFRecordWriter(
        os.path.join(self._output_dir, self._episode_log_file))
  # Set up a session and initialize variables.
  self._sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  self._agent = self._create_agent_fn(
      self._sess,
      self._env,
      summary_writer=self._summary_writer,
      eval_mode=eval_mode)
  # Type check: environment and agent must both be multi-user or both be
  # single-user.
  if self._agent.multi_user and not isinstance(
      self._env.environment, environment.MultiUserEnvironment):
    raise ValueError('Multi-user agent requires multi-user environment.')
  if not self._agent.multi_user and isinstance(
      self._env.environment, environment.MultiUserEnvironment):
    raise ValueError('Single-user agent requires single-user environment.')
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())
  self._sess.run(tf.local_variables_initializer())
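# A minimal sketch (not from the source) of an agent factory matching the
# signature `_set_up` calls above: it receives the session, the environment,
# a summary writer, and the eval flag, and the returned agent must expose a
# `multi_user` attribute for the environment/agent compatibility check.
# `MySlateAgent` is a hypothetical agent class.
def create_agent_fn(sess, env, summary_writer=None, eval_mode=False):
  agent = MySlateAgent(sess, env.observation_space, env.action_space,
                       summary_writer=summary_writer, eval_mode=eval_mode)
  agent.multi_user = False  # pairs with a single-user environment
  return agent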
def main(args):
  run_name = FLAGS.run_name or time.strftime('%Y%m%d-%H%M%S', time.localtime())
  output_dir = path.join(FLAGS.run_dir, run_name)

  gin.bind_parameter('SC2EnvironmentConfig.map_name', FLAGS.map)

  gin_files = []
  if path.exists(output_dir):
    print('Resuming', output_dir)
    gin_files.append(path.join(output_dir, 'operative_config-0.gin'))
  if FLAGS.gin_file:
    gin_files += FLAGS.gin_file
  gin.parse_config_files_and_bindings(gin_files, FLAGS.gin_param,
                                      finalize_config=True)

  env = VecEnv(SC2Environment, SC2EnvironmentConfig())
  try:
    agent = A2CAgent(
        env.spec,
        callbacks=RewardSummaryHook(
            summary_output_dir=output_dir, write_summaries_secs=30))
    runner = Runner(env, agent)

    print_parameter_summary()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = FLAGS.gpu_memory_allow_growth
    if FLAGS.gpu_memory_fraction:
      config.gpu_options.per_process_gpu_memory_fraction = (
          FLAGS.gpu_memory_fraction)

    hooks = [gin.tf.GinConfigSaverHook(output_dir)]
    if FLAGS.step_limit:
      hooks.append(tf.train.StopAtStepHook(last_step=FLAGS.step_limit))
      hooks.append(LogProgressHook(FLAGS.step_limit))
    if FLAGS.profile:
      hooks.append(tf.train.ProfilerHook(save_secs=60, output_dir=output_dir))
    if FLAGS.debug:
      hooks.append(tf_debug.LocalCLIDebugHook())
    else:
      hooks.append(tf.train.NanTensorHook(agent.loss))

    with tf.train.MonitoredTrainingSession(
        config=config,
        hooks=hooks,
        checkpoint_dir=output_dir,
        save_summaries_secs=30,
        save_checkpoint_secs=FLAGS.save_checkpoint_secs,
        save_checkpoint_steps=FLAGS.save_checkpoint_steps) as sess:
      while not sess.should_stop():

        def step_fn(step_context):
          runner.train(step_context, 512)

        sess.run_step_fn(step_fn)
  finally:
    env.close()
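# A minimal sketch (not from this file) of the `step_context` contract that
# `Runner.train` presumably follows inside `run_step_fn`: fetches run through
# `run_with_hooks` so the hooks installed on the MonitoredTrainingSession
# (checkpointing, summaries, NaN check) still fire, and `request_stop` ends
# the outer `should_stop` loop. `loss` and `train_op` are assumed tensors;
# `np` is numpy.
def example_step_fn(step_context):
  loss_value, _ = step_context.run_with_hooks([loss, train_op])
  if not np.isfinite(loss_value):
    step_context.request_stop()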
def _set_up(self, eval_mode):
  """Sets up the runner by creating and initializing the agent."""
  # Reset the tf default graph to avoid name collisions from previous runs
  # before doing anything else.
  tf.reset_default_graph()
  self._summary_writer = tf.summary.FileWriter(self._output_dir)
  if self._episode_log_file:
    self._episode_writer = tf.python_io.TFRecordWriter(
        os.path.join(self._output_dir, self._episode_log_file))
  # Set up a session and initialize variables.
  self._sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  self._agent = self._create_agent_fn(
      self._sess,
      self._env,
      summary_writer=self._summary_writer,
      eval_mode=eval_mode)
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())
  self._sess.run(tf.local_variables_initializer())
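# A hedged sketch of how `self._episode_writer` might be fed once an episode
# completes; the method name and the 'rewards' feature key are illustrative,
# not from the source.
def _log_episode(self, rewards):
  if self._episode_writer is None:
    return
  example = tf.train.Example(features=tf.train.Features(feature={
      'rewards': tf.train.Feature(
          float_list=tf.train.FloatList(value=list(rewards))),
  }))
  self._episode_writer.write(example.SerializeToString())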
def train_q(dataset,
            policy,
            optimizer=None,
            pack_transition_fn=None,
            q_graph_fn=None,
            log_dir=None,
            master='',
            task=0,
            training_steps=None,
            max_training_steps=100000,
            reuse=False,
            init_checkpoint=None,
            update_target_every_n_steps=50,
            log_every_n_steps=None,
            save_checkpoint_steps=500,
            save_summaries_steps=500):
  """Self-contained learning loop for offline Q-learning.

  Code inspired by OpenAI Baselines' deepq.build_train. This function is
  compatible with discrete Q-learning graphs, continuous Q-learning graphs,
  and SARSA.

  Args:
    dataset: tf.data.Dataset providing transitions.
    policy: Instance of TFDQNPolicy class that provides functor for building
      the critic function.
    optimizer: Optional instance of an optimizer. If not specified, creates an
      AdamOptimizer using the default constructor.
    pack_transition_fn: Optional function that performs additional processing
      of the transition. This is a convenience method for ad-hoc manipulation
      of transition data passed to the learning function after parsing.
    q_graph_fn: Function used to construct training objectives w.r.t. critic
      outputs.
    log_dir: Where to save model checkpoints and tensorboard summaries.
    master: Optional address of master worker. Specify this when doing
      distributed training.
    task: Optional worker task for distributed training. Defaults to solo
      master task on a single machine.
    training_steps: Optional number of steps to run training before
      terminating early. max_training_steps remains unchanged - training will
      terminate after max_training_steps whether or not training_steps is
      specified.
    max_training_steps: Maximum number of training iterations.
    reuse: If True, reuse existing variables for all variables declared by
      this function.
    init_checkpoint: Optional checkpoint to restore prior to training. If not
      provided, variables are initialized using
      global_variables_initializer().
    update_target_every_n_steps: How many global steps (training) between
      copying the Q network weights (scope='q_func') to the target network
      (scope='target_q_func').
    log_every_n_steps: How many global steps between logging loss tensors.
    save_checkpoint_steps: How many global steps between saving TF variables
      to a checkpoint file.
    save_summaries_steps: How many global steps between saving TF summaries.

  Returns:
    (np_step, done): The current `global_step` reached after training (at
      most `max_training_steps`), and a bool indicating whether `global_step`
      has reached `max_training_steps`.

  Raises:
    ValueError: If a batch of transitions is empty or the zeroth element is
      empty, when it's supposed to be of length batch_size.
  """
  data_iterator = dataset.make_one_shot_iterator()
  transition = data_iterator.get_next()
  if pack_transition_fn:
    transition = pack_transition_fn(transition)
  if optimizer is None:
    optimizer = tf.train.AdamOptimizer()
  q_func = policy.get_q_func(is_training=True, reuse=reuse)
  loss, all_summaries = q_graph_fn(q_func, transition)
  q_func_vars = contrib_framework.get_trainable_variables(scope='q_func')
  target_q_func_vars = contrib_framework.get_trainable_variables(
      scope='target_q_func')
  global_step = tf.train.get_or_create_global_step()
  # Only optimize q_func and update its batchnorm params.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope='q_func')
  with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(
        loss, global_step=global_step, var_list=q_func_vars)
  chief_hooks = []
  hooks = []
  # Save summaries periodically.
  if save_summaries_steps is not None:
    chief_hooks.append(
        tf.train.SummarySaverHook(
            save_steps=save_summaries_steps,
            output_dir=log_dir,
            summary_op=all_summaries))
  # Stop after max_training_steps.
  if max_training_steps:
    hooks.append(tf.train.StopAtStepHook(last_step=max_training_steps))
  # Report if loss tensor is NaN.
  hooks.append(tf.train.NanTensorHook(loss))
  if log_every_n_steps is not None:
    tensor_dict = {'global_step': global_step, 'train loss': loss}
    chief_hooks.append(
        tf.train.LoggingTensorHook(tensor_dict,
                                   every_n_iter=log_every_n_steps))
    # Measure how fast we are training per sec and save to summary.
    chief_hooks.append(
        tf.train.StepCounterHook(
            every_n_steps=log_every_n_steps, output_dir=log_dir))
  # If a target network exists, periodically update it with the current
  # Q-network weights (frozen target network). We hack this by abusing a
  # LoggingTensorHook.
  if target_q_func_vars and update_target_every_n_steps is not None:
    update_target_expr = []
    for var, var_t in zip(
        sorted(q_func_vars, key=lambda v: v.name),
        sorted(target_q_func_vars, key=lambda v: v.name)):
      update_target_expr.append(var_t.assign(var))
    update_target_expr = tf.group(*update_target_expr)
    with tf.control_dependencies([update_target_expr]):
      update_target = tf.constant(0)
    chief_hooks.append(
        tf.train.LoggingTensorHook({'update_target': update_target},
                                   every_n_iter=update_target_every_n_steps))
  # Save checkpoints periodically; keep all of them.
  saver = tf.train.Saver(max_to_keep=None)
  chief_hooks.append(
      tf.train.CheckpointSaverHook(
          log_dir,
          save_steps=save_checkpoint_steps,
          saver=saver,
          checkpoint_basename='model.ckpt'))
  # Save our experiment params to the checkpoint dir.
  chief_hooks.append(gin.tf.GinConfigSaverHook(log_dir, summarize_config=True))
  session_config = tf.ConfigProto(log_device_placement=False)
  init_fn = None
  if init_checkpoint:
    assign_fn = contrib_framework.assign_from_checkpoint_fn(
        init_checkpoint, contrib_framework.get_model_variables())
    init_fn = lambda _, sess: assign_fn(sess)
  scaffold = tf.train.Scaffold(saver=saver, init_fn=init_fn)
  with tf.train.MonitoredTrainingSession(
      master=master,
      is_chief=(task == 0),
      config=session_config,
      checkpoint_dir=log_dir,
      scaffold=scaffold,
      hooks=hooks,
      chief_only_hooks=chief_hooks) as sess:
    np_step = 0
    while not sess.should_stop():
      np_step, _ = sess.run([global_step, train_op])
      if training_steps and np_step % training_steps == 0:
        break
    done = np_step >= max_training_steps
  return np_step, done
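# A hedged usage sketch for `train_q`. The input shard, parsing function,
# policy constructor arguments, and q_graph_fn below are illustrative
# stand-ins for what the surrounding project provides; only `train_q`'s own
# signature is taken from above.
dataset = (tf.data.TFRecordDataset(['transitions.tfrecord'])  # hypothetical shard
           .map(parse_transition_example)  # hypothetical parser
           .repeat()
           .batch(32))
policy = TFDQNPolicy(num_actions=4)  # hypothetical constructor args
np_step, done = train_q(
    dataset,
    policy,
    q_graph_fn=discrete_q_graph,  # hypothetical TD-loss builder
    log_dir='/tmp/offline_q',
    log_every_n_steps=100)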
def __init__(self,
             base_dir,
             data_load_fn=load_data,
             checkpoint_file_prefix='ckpt',
             logging_file_prefix='log',
             log_every_n=1,
             num_iterations=200,
             training_steps=250,
             batch_size=100,
             evaluation_inputs=None,
             evaluation_size=None):
  """Initialize the Runner object in charge of running a full experiment.

  Args:
    base_dir: str, the base directory to host all required sub-directories.
    data_load_fn: function that returns data as a tuple
      (inputs, data_to_generate).
    checkpoint_file_prefix: str, the prefix to use for checkpoint files.
    logging_file_prefix: str, prefix to use for the log files.
    log_every_n: int, the frequency for writing logs.
    num_iterations: int, the iteration number threshold (must be greater than
      start_iteration).
    training_steps: int, the number of training steps to perform.
    batch_size: int, batch size used for the training.
    evaluation_inputs: tuple of inputs to the generator that can be used
      during qualitative evaluation. If None, the inputs passed above will be
      used.
    evaluation_size: int, the number of images that should be generated by
      randomly sampling from the data specified in evaluation_inputs. If
      None, all evaluation_inputs are generated.

  This constructor will take the following actions:
  - Initialize a `tf.Session`.
  - Initialize a logger.
  - Initialize a generator.
  - Reload from the latest checkpoint, if available, and initialize the
    Checkpointer object.
  """
  assert base_dir is not None
  inputs, data_to_generate = data_load_fn()
  assert inputs is None or inputs.shape[0] == data_to_generate.shape[0]
  assert evaluation_inputs is None or (
      evaluation_inputs.shape[1:] == inputs.shape[1:])
  assert evaluation_inputs is not None or evaluation_size is not None, (
      'Either evaluation_inputs or evaluation_size has to be initialised.')
  self._logging_file_prefix = logging_file_prefix
  self._log_every_n = log_every_n
  self._data_to_generate = data_to_generate
  self._inputs = inputs
  self._num_iterations = num_iterations
  self._training_steps = training_steps
  self._batch_size = batch_size
  self._evaluation_inputs = evaluation_inputs
  if self._evaluation_inputs is None:
    self._evaluation_inputs = inputs
  self._evaluation_size = evaluation_size
  self._base_dir = base_dir
  self._create_directories()
  self._summary_writer = tf.summary.FileWriter(self._base_dir)

  config = tf.ConfigProto(allow_soft_placement=True)
  # Allocate only the GPU memory actually needed, which allows running
  # multiple workers on the same GPU.
  config.gpu_options.allow_growth = True
  # Set up a session and initialize variables.
  self._sess = tf.Session('', config=config)
  self._generator = create_generator(self._sess, data_to_generate, inputs,
                                     summary_writer=self._summary_writer)
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())

  self._initialize_checkpointer_and_maybe_resume(checkpoint_file_prefix)
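# A hedged usage sketch; `load_image_arrays` is a hypothetical loader
# returning (inputs, data_to_generate) as aligned NumPy arrays.
runner = Runner(
    base_dir='/tmp/generation_experiment',
    data_load_fn=load_image_arrays,
    training_steps=500,
    batch_size=64,
    evaluation_size=16)  # sample 16 images during qualitative evaluation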
def __init__(self,
             num_actions=None,
             observation_size=None,
             num_players=None,
             gamma=0.99,
             update_horizon=1,
             min_replay_history=500,
             update_period=4,
             stack_size=1,
             target_update_period=500,
             epsilon_fn=linearly_decaying_epsilon,
             epsilon_train=0.02,
             epsilon_eval=0.001,
             epsilon_decay_period=1000,
             graph_template=dqn_template,
             tf_device='/cpu:*',
             use_staging=True,
             optimizer=tf.train.RMSPropOptimizer(
                 learning_rate=.0025,
                 decay=0.95,
                 momentum=0.0,
                 epsilon=1e-6,
                 centered=True)):
  """Initializes the agent and constructs its graph.

  Args:
    num_actions: int, number of actions the agent can take at any state.
    observation_size: int, size of observation vector.
    num_players: int, number of players playing this game.
    gamma: float, discount factor as commonly used in the RL literature.
    update_horizon: int, horizon at which updates are performed, the 'n' in
      n-step update.
    min_replay_history: int, number of stored transitions before training.
    update_period: int, period between DQN updates.
    stack_size: int, number of observations to use as state.
    target_update_period: int, update period for the target network.
    epsilon_fn: function expecting 4 parameters: (decay_period, step,
      warmup_steps, epsilon), and which returns the epsilon value used for
      exploration during training.
    epsilon_train: float, final epsilon for training.
    epsilon_eval: float, epsilon during evaluation.
    epsilon_decay_period: int, number of steps for epsilon to decay.
    graph_template: function for building the neural network graph.
    tf_device: str, Tensorflow device on which to run computations.
    use_staging: bool, when True use a staging area to prefetch the next
      sampling batch.
    optimizer: Optimizer instance used for learning.
  """
  self.partial_reload = False

  tf.logging.info('Creating %s agent with the following parameters:',
                  self.__class__.__name__)
  tf.logging.info('\t gamma: %f', gamma)
  tf.logging.info('\t update_horizon: %d', update_horizon)
  tf.logging.info('\t min_replay_history: %d', min_replay_history)
  tf.logging.info('\t update_period: %d', update_period)
  tf.logging.info('\t target_update_period: %d', target_update_period)
  tf.logging.info('\t epsilon_train: %f', epsilon_train)
  tf.logging.info('\t epsilon_eval: %f', epsilon_eval)
  tf.logging.info('\t epsilon_decay_period: %d', epsilon_decay_period)
  tf.logging.info('\t tf_device: %s', tf_device)
  tf.logging.info('\t use_staging: %s', use_staging)
  tf.logging.info('\t optimizer: %s', optimizer)

  # Global variables.
  self.num_actions = num_actions
  self.observation_size = observation_size
  self.num_players = num_players
  self.gamma = gamma
  self.update_horizon = update_horizon
  self.cumulative_gamma = math.pow(gamma, update_horizon)
  self.min_replay_history = min_replay_history
  self.target_update_period = target_update_period
  self.epsilon_fn = epsilon_fn
  self.epsilon_train = epsilon_train
  self.epsilon_eval = epsilon_eval
  self.epsilon_decay_period = epsilon_decay_period
  self.update_period = update_period
  self.eval_mode = False
  self.training_steps = 0
  self.batch_staged = False
  self.optimizer = optimizer

  with tf.device(tf_device):
    # Calling online_convnet will generate a new graph as defined in
    # graph_template using whatever input is passed, but will always share
    # the same weights.
    online_convnet = tf.make_template('Online', graph_template)
    target_convnet = tf.make_template('Target', graph_template)
    # The state of the agent. The last axis is the number of past
    # observations that make up the state.
    states_shape = (1, observation_size, stack_size)
    self.state = np.zeros(states_shape)
    self.state_ph = tf.placeholder(tf.uint8, states_shape, name='state_ph')
    self.legal_actions_ph = tf.placeholder(tf.float32, [self.num_actions],
                                           name='legal_actions_ph')
    self._q = online_convnet(state=self.state_ph,
                             num_actions=self.num_actions)
    self._replay = self._build_replay_memory(use_staging)
    self._replay_qs = online_convnet(self._replay.states, self.num_actions)
    self._replay_next_qt = target_convnet(self._replay.next_states,
                                          self.num_actions)
    self._train_op = self._build_train_op()
    self._sync_qt_ops = self._build_sync_op()

    self._q_argmax = tf.argmax(self._q + self.legal_actions_ph, axis=1)[0]

  # Set up a session and initialize variables.
  self._sess = tf.Session('',
                          config=tf.ConfigProto(allow_soft_placement=True))
  self._init_op = tf.global_variables_initializer()
  self._sess.run(self._init_op)

  self._saver = tf.train.Saver(max_to_keep=3)

  # This keeps track of the observed transitions during play, for each
  # player.
  self.transitions = [[] for _ in range(num_players)]
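# For reference, a sketch of an `epsilon_fn` matching the
# (decay_period, step, warmup_steps, epsilon) signature documented above:
# linear decay from 1.0 down to `epsilon` over `decay_period` steps after
# `warmup_steps`. Illustrative; the implementation imported here may differ.
def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
  steps_left = decay_period + warmup_steps - step
  bonus = (1.0 - epsilon) * steps_left / decay_period
  # Clip so that epsilon + bonus stays within [epsilon, 1.0].
  bonus = np.clip(bonus, 0., 1. - epsilon)
  return epsilon + bonus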
def __init__(self,
             base_dir,
             create_agent_fn,
             create_environment_fn=atari_lib.create_atari_environment,
             checkpoint_file_prefix='ckpt',
             logging_file_prefix='log',
             log_every_n=1,
             num_iterations=200,
             training_steps=250000,
             evaluation_steps=125000,
             max_steps_per_episode=27000,
             reward_clipping=(-1, 1)):
  """Initialize the Runner object in charge of running a full experiment.

  Args:
    base_dir: str, the base directory to host all required sub-directories.
    create_agent_fn: A function that takes as args a Tensorflow session and an
      environment, and returns an agent.
    create_environment_fn: A function which receives a problem name and
      creates a Gym environment for that problem (e.g. an Atari 2600 game).
    checkpoint_file_prefix: str, the prefix to use for checkpoint files.
    logging_file_prefix: str, prefix to use for the log files.
    log_every_n: int, the frequency for writing logs.
    num_iterations: int, the iteration number threshold (must be greater than
      start_iteration).
    training_steps: int, the number of training steps to perform.
    evaluation_steps: int, the number of evaluation steps to perform.
    max_steps_per_episode: int, maximum number of steps after which an episode
      terminates.
    reward_clipping: Tuple(int, int), with the minimum and maximum bounds for
      the reward at each step. If `None`, no clipping is applied.

  This constructor will take the following actions:
  - Initialize an environment.
  - Initialize a `tf.Session`.
  - Initialize a logger.
  - Initialize an agent.
  - Reload from the latest checkpoint, if available, and initialize the
    Checkpointer object.
  """
  assert base_dir is not None
  self._logging_file_prefix = logging_file_prefix
  self._log_every_n = log_every_n
  self._num_iterations = num_iterations
  self._training_steps = training_steps
  self._evaluation_steps = evaluation_steps
  self._max_steps_per_episode = max_steps_per_episode
  self._base_dir = base_dir
  self._create_directories()
  self._summary_writer = tf.summary.FileWriter(self._base_dir)

  self._environment = create_environment_fn()
  # Set up a session and initialize variables.
  config = tf.ConfigProto(allow_soft_placement=True)
  config.gpu_options.allow_growth = True
  self._sess = tf.Session('', config=config)
  self._agent = create_agent_fn(self._sess, self._environment,
                                summary_writer=self._summary_writer)
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())

  self._initialize_checkpointer_and_maybe_resume(checkpoint_file_prefix)
  self._reward_clipping = reward_clipping
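# A hedged sketch of how `self._reward_clipping` is presumably consumed in
# the step loop; the method name `_clip_reward` is illustrative, not from
# the source.
def _clip_reward(self, reward):
  if self._reward_clipping is None:
    return reward
  low, high = self._reward_clipping
  return np.clip(reward, low, high)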
def initialize_session(self):
  """Initializes a tf Session."""
  if ENABLE_TF_OPTIMIZATIONS:
    self.sess = tf.Session()
  else:
    rewriter_config = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        remapping=rewriter_config_pb2.RewriterConfig.OFF,
        shape_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        function_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
        loop_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.NO_MEM_OPT)
    graph_options = tf.GraphOptions(rewrite_options=rewriter_config)
    session_config = tf.ConfigProto(graph_options=graph_options)
    self.sess = tf.Session(config=session_config)

  # Restore or initialize the variables.
  self.sess.run(tf.global_variables_initializer())
  self.sess.run(tf.local_variables_initializer())
  if self.learner_config.checkpoint_for_eval:
    # Requested a specific checkpoint.
    self.saver.restore(self.sess, self.learner_config.checkpoint_for_eval)
    tf.logging.info('Restored checkpoint: %s' %
                    self.learner_config.checkpoint_for_eval)
  else:
    # Continue from the latest checkpoint if one exists.
    # This handles fault-tolerance.
    latest_checkpoint = None
    if self.checkpoint_dir is not None:
      latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
    if latest_checkpoint:
      self.saver.restore(self.sess, latest_checkpoint)
      tf.logging.info('Restored checkpoint: %s' % latest_checkpoint)
    else:
      tf.logging.info('No previous checkpoint.')
      self.sess.run(tf.global_variables_initializer())
      self.sess.run(tf.local_variables_initializer())

  # For episodic models, potentially use pretrained weights at the start of
  # training. If this happens it will overwrite the embedding weights, but
  # taking care to not restore the Adam parameters.
  if self.learner_config.pretrained_checkpoint and not self.sess.run(
      tf.train.get_global_step()):
    self.saver.restore(self.sess, self.learner_config.pretrained_checkpoint)
    tf.logging.info('Restored checkpoint: %s' %
                    self.learner_config.pretrained_checkpoint)
    # We only want the embedding weights of the checkpoint we just restored.
    # So we re-initialize everything that's not an embedding weight. Also,
    # since this episodic finetuning procedure is a different optimization
    # problem than the original training of the baseline whose embedding
    # weights are re-used, we do not reload Adam's variables and instead
    # learn them from scratch.
    vars_to_reinit, embedding_var_names, vars_to_reinit_names = [], [], []
    for var in tf.global_variables():
      if (any(keyword in var.name for keyword in EMBEDDING_KEYWORDS) and
          'adam' not in var.name.lower()):
        embedding_var_names.append(var.name)
        continue
      vars_to_reinit.append(var)
      vars_to_reinit_names.append(var.name)
    tf.logging.info('Initializing all variables except for %s.' %
                    embedding_var_names)
    self.sess.run(tf.variables_initializer(vars_to_reinit))
    tf.logging.info('Re-initialized vars %s.' % vars_to_reinit_names)
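# Assumed module-level support for the names used above. The proto import is
# the standard TF 1.x location for RewriterConfig; the flag value and keyword
# tuple are hypothetical examples, not taken from the source.
from tensorflow.core.protobuf import rewriter_config_pb2

ENABLE_TF_OPTIMIZATIONS = False  # set True to keep Grappler rewrites enabled
EMBEDDING_KEYWORDS = ('conv', 'embedding')  # hypothetical filter keywords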
def __init__(self,
             base_dir,
             agent_creator,
             create_environment_fn=create_atari_environment,
             game_name=None,
             checkpoint_file_prefix='ckpt',
             logging_file_prefix='log',
             log_every_n=1,
             num_iterations=200,
             training_steps=250000,
             evaluation_steps=125000,
             max_steps_per_episode=27000):
  """Initialize the Runner object in charge of running a full experiment.

  Args:
    base_dir: str, the base directory to host all required sub-directories.
    agent_creator: A function that takes as args a Tensorflow session and an
      Atari 2600 Gym environment, and returns an agent.
    create_environment_fn: A function which receives a game name and creates
      an Atari 2600 Gym environment.
    game_name: str, name of the Atari 2600 domain to run.
    checkpoint_file_prefix: str, the prefix to use for checkpoint files.
    logging_file_prefix: str, prefix to use for the log files.
    log_every_n: int, the frequency for writing logs.
    num_iterations: int, the iteration number threshold (must be greater than
      start_iteration).
    training_steps: int, the number of training steps to perform.
    evaluation_steps: int, the number of evaluation steps to perform.
    max_steps_per_episode: int, maximum number of steps after which an episode
      terminates.

  This constructor will take the following actions:
  - Initialize an environment.
  - Initialize a `tf.Session`.
  - Initialize a logger.
  - Initialize an agent.
  - Reload from the latest checkpoint, if available, and initialize the
    Checkpointer object.
  """
  assert base_dir is not None
  self._logging_file_prefix = logging_file_prefix
  self._log_every_n = log_every_n
  self._num_iterations = num_iterations
  self._training_steps = training_steps
  self._evaluation_steps = evaluation_steps
  self._max_steps_per_episode = max_steps_per_episode
  self._base_dir = base_dir
  self._create_directories()
  self._summary_writer = tf.summary.FileWriter(self._base_dir)

  self._environment = create_environment_fn()
  # Set up a session and initialize variables.
  self._sess = tf.Session('',
                          config=tf.ConfigProto(allow_soft_placement=True))
  self._agent = agent_creator(self._sess, self._environment,
                              summary_writer=self._summary_writer)
  self._summary_writer.add_graph(graph=tf.get_default_graph())
  self._sess.run(tf.global_variables_initializer())
  self._summary_helper = SummaryHelper(self._summary_writer)

  self._initialize_checkpointer_and_maybe_resume(checkpoint_file_prefix)

  self._steps_done = 0
  self._total_timer = None
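# A hedged usage sketch; `my_agent_creator` and `DQNAgent` stand in for
# whatever agent factory and agent class the surrounding project supplies.
def my_agent_creator(sess, environment, summary_writer=None):
  return DQNAgent(sess, num_actions=environment.action_space.n,
                  summary_writer=summary_writer)

runner = Runner('/tmp/atari_experiment', my_agent_creator, game_name='Pong')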