def evaluate(loop_hparams, planner_hparams, policy_dir, model_dir,
             eval_metrics_dir, agent_type, eval_mode, eval_with_learner,
             log_every_steps, debug_video_path, num_debug_videos=1,
             random_starts_step_limit=None, report_fn=None,
             report_metric=None):
  """Evaluate."""
  if eval_with_learner:
    assert agent_type == "policy"

  if report_fn:
    assert report_metric is not None

  eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
  video_writers = ()
  kwargs = {}
  if eval_mode in ["agent_real", "agent_simulated"]:
    if not eval_with_learner:
      if debug_video_path:
        tf.gfile.MakeDirs(debug_video_path)
        video_writers = [
            common_video.WholeVideoWriter(  # pylint: disable=g-complex-comprehension
                fps=10,
                output_path=os.path.join(debug_video_path, "{}.avi".format(i)),
                file_format="avi",
            )
            for i in range(num_debug_videos)
        ]
      kwargs["eval_fn"] = make_eval_fn_with_agent(
          agent_type, eval_mode, planner_hparams, model_dir,
          log_every_steps=log_every_steps,
          video_writers=video_writers,
          random_starts_step_limit=random_starts_step_limit
      )
    eval_metrics = rl_utils.evaluate_all_configs(
        loop_hparams, policy_dir, **kwargs
    )
  else:
    eval_metrics = evaluate_world_model(
        agent_type, loop_hparams, planner_hparams, model_dir, policy_dir,
        random_starts_step_limit, debug_video_path, log_every_steps
    )
  rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)

  for video_writer in video_writers:
    video_writer.finish_to_disk()

  # Report metrics
  if report_fn:
    if report_metric == "mean_reward":
      metric_name = rl_utils.get_metric_name(
          sampling_temp=loop_hparams.eval_sampling_temps[0],
          max_num_noops=loop_hparams.eval_max_num_noops,
          clipped=False
      )
      report_fn(eval_metrics[metric_name], 0)
    else:
      report_fn(eval_metrics[report_metric], 0)
  return eval_metrics
def debug_video_writer_factory(output_dir):
  """Creates a VideoWriter for debug videos."""
  if FLAGS.disable_ffmpeg:
    return common_video.IndividualFrameWriter(output_dir)
  else:
    output_path = os.path.join(output_dir, "video.avi")
    return common_video.WholeVideoWriter(
        fps=10, output_path=output_path, file_format="avi")
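# Hedged usage sketch, not part of the original module: both writer types
# returned by debug_video_writer_factory() expose the same write() /
# finish_to_disk() interface, so the caller does not depend on whether ffmpeg
# is disabled. The output directory and the zero frames below are illustrative
# values only; the directory is assumed to exist.
def _example_debug_video_writer_usage(output_dir="/tmp/debug_videos"):
  frames = [np.zeros((84, 84, 3), dtype=np.uint8) for _ in range(30)]
  writer = debug_video_writer_factory(output_dir)
  for frame in frames:
    writer.write(frame)      # one HxWxC uint8 frame per call
  writer.finish_to_disk()    # flush the assembled video (or frame files) to disk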
def _video_dump_frame(self, obs, rews):
  if self._video_writer is None:
    self._video_counter += 1
    self._video_writer = common_video.WholeVideoWriter(
        fps=10,
        output_path=os.path.join(
            self._video_dir, "{}.avi".format(self._video_counter)),
        file_format="avi")
  img = PIL_Image().new("RGB", (obs.shape[-2], 11),)
  draw = PIL_ImageDraw().Draw(img)
  draw.text((0, 0), "r:{:3}".format(int(rews[0])), fill=(255, 0, 0))
  self._video_writer.write(np.concatenate([np.asarray(img), obs[0]], axis=0))
def evaluate(loop_hparams, planner_hparams, policy_dir, model_dir,
             eval_metrics_dir, agent_type, eval_with_learner, log_every_steps,
             debug_video_path, report_fn=None, report_metric=None):
  """Evaluate."""
  if eval_with_learner:
    assert agent_type == "policy"
  if report_fn:
    assert report_metric is not None

  eval_metrics_writer = tf.summary.FileWriter(eval_metrics_dir)
  # Initialize up front so the writer check below is safe when evaluating with
  # the learner (no eval_fn override is installed in that case).
  video_writer = None
  kwargs = {}
  if not eval_with_learner:
    if debug_video_path:
      video_writer = common_video.WholeVideoWriter(
          fps=10, output_path=debug_video_path, file_format="avi")
    kwargs["eval_fn"] = make_eval_fn_with_agent(
        agent_type, planner_hparams, model_dir,
        log_every_steps=log_every_steps,
        video_writer=video_writer)
  eval_metrics = rl_utils.evaluate_all_configs(
      loop_hparams, policy_dir, **kwargs)
  rl_utils.summarize_metrics(eval_metrics_writer, eval_metrics, 0)
  if video_writer is not None:
    video_writer.finish_to_disk()

  # Report metrics
  if report_fn:
    if report_metric == "mean_reward":
      metric_name = rl_utils.get_metric_name(
          sampling_temp=loop_hparams.eval_sampling_temps[0],
          max_num_noops=loop_hparams.eval_max_num_noops,
          clipped=False)
      report_fn(eval_metrics[metric_name], 0)
    else:
      report_fn(eval_metrics[report_metric], 0)
  return eval_metrics
def evaluate_world_model(real_env, hparams, world_model_dir, debug_video_path):
  """Evaluate the world model (reward accuracy)."""
  environment_spec = make_simulated_env_spec(real_env, hparams)
  environment_spec.wrappers = []
  num_input_frames = environment_spec.video_num_input_frames

  rollout_subsequences = []
  def initial_frame_chooser(batch_size):
    assert batch_size == len(rollout_subsequences)
    return np.stack([
        [frame.observation.decode()
         for frame in subsequence[:num_input_frames]]
        for subsequence in rollout_subsequences
    ])
  environment_spec.add_hparam("initial_frame_chooser", initial_frame_chooser)

  sim_env = SimulatedBatchGymEnv(environment_spec, hparams.wm_eval_batch_size,
                                 world_model_dir)
  subsequence_length = int(
      max(hparams.wm_eval_rollout_ratios) * hparams.ppo_epoch_length)
  rollouts = real_env.current_epoch_rollouts(
      split=tf.contrib.learn.ModeKeys.EVAL,
      minimal_rollout_frames=(subsequence_length + num_input_frames))

  video_writer = common_video.WholeVideoWriter(
      fps=10, output_path=debug_video_path, file_format="avi")

  reward_accuracies_by_length = {
      int(ratio * hparams.ppo_epoch_length): []
      for ratio in hparams.wm_eval_rollout_ratios
  }
  for _ in range(hparams.wm_eval_epochs_num):
    rollout_subsequences[:] = random_rollout_subsequences(
        rollouts, hparams.wm_eval_batch_size,
        subsequence_length + num_input_frames)

    eval_subsequences = [
        subsequence[(num_input_frames - 1):]
        for subsequence in rollout_subsequences
    ]

    # Check that the initial observation is the same in the real and simulated
    # rollout.
    sim_init_obs = sim_env.reset()
    def decode_real_obs(index):
      return np.stack([
          subsequence[index].observation.decode()
          for subsequence in eval_subsequences  # pylint: disable=cell-var-from-loop
      ])
    real_init_obs = decode_real_obs(0)
    assert np.all(sim_init_obs == real_init_obs)

    debug_frame_batches = []
    def append_debug_frame_batch(sim_obs, real_obs):
      errs = np.maximum(
          np.abs(sim_obs.astype(np.int) - real_obs, dtype=np.int) - 10,
          0).astype(np.uint8)
      debug_frame_batches.append(  # pylint: disable=cell-var-from-loop
          np.concatenate([sim_obs, real_obs, errs], axis=2))
    append_debug_frame_batch(sim_init_obs, real_init_obs)

    (sim_cum_rewards, real_cum_rewards) = (
        np.zeros(hparams.wm_eval_batch_size) for _ in range(2))
    for i in range(subsequence_length):
      actions = [subsequence[i].action for subsequence in eval_subsequences]
      (sim_obs, sim_rewards, _) = sim_env.step(actions)
      sim_cum_rewards += sim_rewards

      real_cum_rewards += [
          subsequence[i + 1].reward for subsequence in eval_subsequences
      ]
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length):
        if i + 1 == length:
          reward_accuracies.append(
              np.sum(sim_cum_rewards == real_cum_rewards) /
              len(real_cum_rewards))

      real_obs = decode_real_obs(i + 1)
      append_debug_frame_batch(sim_obs, real_obs)

    for debug_frames in np.stack(debug_frame_batches, axis=1):
      for debug_frame in debug_frames:
        video_writer.write(debug_frame)

  video_writer.finish_to_disk()

  return {
      "reward_accuracy/at_{}".format(length): np.mean(reward_accuracies)
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length)
  }
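# Illustrative sketch, not part of the evaluator: the error panel appended by
# append_debug_frame_batch() above keeps only absolute pixel differences larger
# than 10, so small reconstruction noise stays black in the debug video. The
# arrays below are made-up values.
def _example_debug_error_panel():
  sim = np.array([200, 200])
  real = np.array([205, 230], dtype=np.uint8)
  # |200 - 205| = 5  -> max(5 - 10, 0)  = 0   (suppressed)
  # |200 - 230| = 30 -> max(30 - 10, 0) = 20  (visible)
  return np.maximum(np.abs(sim - real) - 10, 0).astype(np.uint8)  # -> [0, 20]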
def evaluate_world_model(real_env, hparams, world_model_dir, debug_video_path):
  """Evaluate the world model (reward accuracy)."""
  frame_stack_size = hparams.frame_stack_size
  rollout_subsequences = []
  def initial_frame_chooser(batch_size):
    assert batch_size == len(rollout_subsequences)
    return np.stack([
        [frame.observation.decode()
         for frame in subsequence[:frame_stack_size]]
        for subsequence in rollout_subsequences
    ])

  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams, batch_size=hparams.wm_eval_batch_size,
      initial_frame_chooser=initial_frame_chooser, model_dir=world_model_dir
  )
  sim_env = env_fn(in_graph=False)
  subsequence_length = int(
      max(hparams.wm_eval_rollout_ratios) * hparams.simulated_rollout_length
  )
  rollouts = real_env.current_epoch_rollouts(
      split=tf.estimator.ModeKeys.EVAL,
      minimal_rollout_frames=(subsequence_length + frame_stack_size)
  )

  video_writer = common_video.WholeVideoWriter(
      fps=10, output_path=debug_video_path, file_format="avi"
  )

  reward_accuracies_by_length = {
      int(ratio * hparams.simulated_rollout_length): []
      for ratio in hparams.wm_eval_rollout_ratios
  }
  for _ in range(hparams.wm_eval_num_batches):
    rollout_subsequences[:] = random_rollout_subsequences(
        rollouts, hparams.wm_eval_batch_size,
        subsequence_length + frame_stack_size
    )

    eval_subsequences = [
        subsequence[(frame_stack_size - 1):]
        for subsequence in rollout_subsequences
    ]

    # Check that the initial observation is the same in the real and simulated
    # rollout.
    sim_init_obs = sim_env.reset()
    def decode_real_obs(index):
      return np.stack([
          subsequence[index].observation.decode()
          for subsequence in eval_subsequences  # pylint: disable=cell-var-from-loop
      ])
    real_init_obs = decode_real_obs(0)
    assert np.all(sim_init_obs == real_init_obs)

    debug_frame_batches = []
    def append_debug_frame_batch(sim_obs, real_obs, sim_cum_rews,
                                 real_cum_rews, sim_rews, real_rews):
      """Add a debug frame."""
      rews = [[sim_cum_rews, sim_rews], [real_cum_rews, real_rews]]
      headers = []
      for j in range(len(sim_obs)):
        local_nps = []
        for i in range(2):
          img = PIL_Image().new("RGB", (sim_obs.shape[-2], 11),)
          draw = PIL_ImageDraw().Draw(img)
          draw.text(
              (0, 0),
              "c:{:3}, r:{:3}".format(int(rews[i][0][j]), int(rews[i][1][j])),
              fill=(255, 0, 0))
          local_nps.append(np.asarray(img))
        local_nps.append(np.zeros_like(local_nps[0]))
        headers.append(np.concatenate(local_nps, axis=1))
      errs = absolute_hinge_difference(sim_obs, real_obs)
      headers = np.stack(headers)
      debug_frame_batches.append(  # pylint: disable=cell-var-from-loop
          np.concatenate(
              [headers, np.concatenate([sim_obs, real_obs, errs], axis=2)],
              axis=1)
      )
    append_debug_frame_batch(sim_init_obs, real_init_obs,
                             np.zeros(hparams.wm_eval_batch_size),
                             np.zeros(hparams.wm_eval_batch_size),
                             np.zeros(hparams.wm_eval_batch_size),
                             np.zeros(hparams.wm_eval_batch_size))

    (sim_cum_rewards, real_cum_rewards) = (
        np.zeros(hparams.wm_eval_batch_size) for _ in range(2)
    )
    for i in range(subsequence_length):
      actions = [subsequence[i].action for subsequence in eval_subsequences]
      (sim_obs, sim_rewards, _) = sim_env.step(actions)
      sim_cum_rewards += sim_rewards

      real_rewards = np.array([
          subsequence[i + 1].reward for subsequence in eval_subsequences
      ])
      real_cum_rewards += real_rewards
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length
      ):
        if i + 1 == length:
          reward_accuracies.append(
              np.sum(sim_cum_rewards == real_cum_rewards) /
              len(real_cum_rewards)
          )

      real_obs = decode_real_obs(i + 1)
      append_debug_frame_batch(sim_obs, real_obs, sim_cum_rewards,
                               real_cum_rewards, sim_rewards, real_rewards)

    for debug_frames in np.stack(debug_frame_batches, axis=1):
      for debug_frame in debug_frames:
        video_writer.write(debug_frame)

  video_writer.finish_to_disk()

  return {
      "reward_accuracy/at_{}".format(length): np.mean(reward_accuracies)
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length
      )
  }
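# Hedged worked example, not part of the evaluator, of how the metric keys
# above are derived. The ratio and rollout-length values below are
# illustrative, not defaults: with wm_eval_rollout_ratios = [0.25, 0.5, 1.0]
# and simulated_rollout_length = 50, reward accuracy is recorded after 12, 25
# and 50 simulated steps and reported under "reward_accuracy/at_12",
# "reward_accuracy/at_25" and "reward_accuracy/at_50".
def _example_reward_accuracy_keys(ratios=(0.25, 0.5, 1.0),
                                  simulated_rollout_length=50):
  lengths = [int(ratio * simulated_rollout_length) for ratio in ratios]
  return ["reward_accuracy/at_{}".format(length) for length in lengths]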
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)
  trainer_lib.set_random_seed(FLAGS.random_seed)
  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

  # Create hparams
  hparams = create_hparams()
  hparams.force_full_predict = True
  hparams.scheduled_sampling_k = -1

  # Params
  num_agents = 1  # TODO(mbz): fix the code for more agents
  num_steps = FLAGS.num_steps
  num_actions = hparams.problem.num_actions
  frame_shape = hparams.problem.frame_shape
  resized_frame = hparams.preprocess_resize_frames is not None
  if resized_frame:
    frame_shape = hparams.preprocess_resize_frames
    frame_shape += [hparams.problem.num_channels]

  dataset = registry.problem(FLAGS.problem).dataset(
      tf.estimator.ModeKeys.TRAIN, shuffle_files=True, hparams=hparams)
  dataset = dataset.apply(
      tf.contrib.data.batch_and_drop_remainder(num_agents))
  data = dataset.make_one_shot_iterator().get_next()

  # Setup input placeholders
  input_size = [num_agents, hparams.video_num_input_frames]
  placeholders = {
      "inputs": tf.placeholder(tf.float32, input_size + frame_shape),
      "input_action": tf.placeholder(tf.int64, input_size + [1]),
      "input_reward": tf.placeholder(tf.int64, input_size + [1]),
  }

  # Create model
  model_cls = registry.model(FLAGS.model)
  model = model_cls(hparams, tf.estimator.ModeKeys.PREDICT)
  prediction_ops = model.infer(placeholders)

  states_q = Queue(maxsize=hparams.video_num_input_frames)
  actions_q = Queue(maxsize=hparams.video_num_input_frames)
  rewards_q = Queue(maxsize=hparams.video_num_input_frames)
  all_qs = (states_q, actions_q, rewards_q)

  writer = common_video.WholeVideoWriter(fps=10, output_path=FLAGS.output_gif)

  saver = tf.train.Saver()
  with tf.train.SingularMonitoredSession() as sess:
    # Load latest checkpoint
    ckpt = tf.train.get_checkpoint_state(
        FLAGS.output_dir).model_checkpoint_path
    saver.restore(sess.raw_session(), ckpt)

    # get init frames from the dataset
    data_np = sess.run(data)

    frames = np.split(data_np["inputs"], hparams.video_num_input_frames, 1)
    for frame in frames:
      frame = np.squeeze(frame, 1)
      states_q.put(frame)
      writer.write(frame[0].astype(np.uint8))

    actions = np.split(data_np["input_action"],
                       hparams.video_num_input_frames, 1)
    for action in actions:
      actions_q.put(np.squeeze(action, 1))

    rewards = np.split(data_np["input_reward"],
                       hparams.video_num_input_frames, 1)
    for reward in rewards:
      rewards_q.put(np.squeeze(reward, 1))

    for step in range(num_steps):
      print(">>>>>>> ", step)

      random_actions = np.random.randint(num_actions - 1)
      random_actions = np.expand_dims(random_actions, 0)
      random_actions = np.tile(random_actions, (num_agents, 1))

      # Shape inputs and targets
      inputs, input_action, input_reward = (
          np.stack(list(q.queue), axis=1) for q in all_qs)

      # Predict next frames
      feed = {
          placeholders["inputs"]: inputs,
          placeholders["input_action"]: input_action,
          placeholders["input_reward"]: input_reward,
      }
      predictions = sess.run(prediction_ops, feed_dict=feed)
      predicted_states = predictions["targets"][:, 0]
      predicted_reward = predictions["target_reward"][:, 0]

      # Update queues
      new_data = (predicted_states, random_actions, predicted_reward)
      for q, d in zip(all_qs, new_data):
        q.get()
        q.put(d.copy())

      writer.write(np.round(predicted_states[0]).astype(np.uint8))

    video = writer.finish()
    writer.save_to_disk(video)
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)
  trainer_lib.set_random_seed(FLAGS.random_seed)
  usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

  # Create hparams
  hparams = trainer_lib.create_hparams(
      FLAGS.hparams_set,
      FLAGS.hparams,
      data_dir=os.path.expanduser(FLAGS.data_dir),
      problem_name=FLAGS.problem)
  hparams.force_full_predict = True
  hparams.scheduled_sampling_k = -1

  # Params
  num_agents = 1  # TODO(mbz): fix the code for more agents
  num_steps = FLAGS.num_steps
  if hasattr(hparams.problem, "num_actions"):
    num_actions = hparams.problem.num_actions
  else:
    num_actions = None
  frame_shape = hparams.problem.frame_shape
  resized_frame = hparams.preprocess_resize_frames is not None
  if resized_frame:
    frame_shape = hparams.preprocess_resize_frames
    frame_shape += [hparams.problem.num_channels]

  dataset = registry.problem(FLAGS.problem).dataset(
      tf.estimator.ModeKeys.TRAIN,
      shuffle_files=True,
      data_dir=os.path.expanduser(FLAGS.data_dir),
      hparams=hparams)
  dataset = dataset.batch(num_agents, drop_remainder=True)
  data = dataset.make_one_shot_iterator().get_next()

  # Setup input placeholders
  input_size = [num_agents, hparams.video_num_input_frames]
  if num_actions is None:
    placeholders = {
        "inputs": tf.placeholder(tf.float32, input_size + frame_shape)
    }
  else:
    placeholders = {
        "inputs": tf.placeholder(tf.float32, input_size + frame_shape),
        "input_action": tf.placeholder(tf.int64, input_size + [1]),
        "input_reward": tf.placeholder(tf.int64, input_size + [1]),
        "reset_internal_states": tf.placeholder(tf.float32, []),
    }

  # Create model.
  model_cls = registry.model(FLAGS.model)
  model = model_cls(hparams, tf.estimator.ModeKeys.PREDICT)
  prediction_ops = model.infer(placeholders)

  states_q = Queue(maxsize=hparams.video_num_input_frames)
  actions_q = Queue(maxsize=hparams.video_num_input_frames)
  rewards_q = Queue(maxsize=hparams.video_num_input_frames)
  if num_actions is not None:
    all_qs = [states_q, actions_q, rewards_q]
  else:
    all_qs = [states_q]

  writer = common_video.WholeVideoWriter(fps=FLAGS.fps,
                                         output_path=FLAGS.output_gif)

  saver = tf.train.Saver(tf.trainable_variables())
  with tf.train.SingularMonitoredSession() as sess:
    # Load latest checkpoint
    ckpt = tf.train.get_checkpoint_state(
        FLAGS.output_dir).model_checkpoint_path
    saver.restore(sess.raw_session(), ckpt)

    # get init frames from the dataset
    data_np = sess.run(data)

    frames = np.split(data_np["inputs"], hparams.video_num_input_frames, 1)
    for frame in frames:
      frame = np.squeeze(frame, 1)
      states_q.put(frame)
      writer.write(frame[0].astype(np.uint8))

    if num_actions is not None:
      actions = np.split(data_np["input_action"],
                         hparams.video_num_input_frames, 1)
      for action in actions:
        actions_q.put(np.squeeze(action, 1))

      rewards = np.split(data_np["input_reward"],
                         hparams.video_num_input_frames, 1)
      for reward in rewards:
        rewards_q.put(np.squeeze(reward, 1))

    for step in range(num_steps):
      print(">>>>>>> ", step)

      if num_actions is not None:
        random_actions = np.random.randint(num_actions - 1)
        random_actions = np.expand_dims(random_actions, 0)
        random_actions = np.tile(random_actions, (num_agents, 1))

        # Shape inputs and targets
        inputs, input_action, input_reward = (
            np.stack(list(q.queue), axis=1) for q in all_qs)
      else:
        assert len(all_qs) == 1
        q = all_qs[0]
        elems = list(q.queue)
        # Need to adjust shapes sometimes.
        for i, e in enumerate(elems):
          if len(e.shape) < 4:
            elems[i] = np.expand_dims(e, axis=0)
        inputs = np.stack(elems, axis=1)

      # Predict next frames
      if num_actions is None:
        feed = {placeholders["inputs"]: inputs}
      else:
        feed = {
            placeholders["inputs"]: inputs,
            placeholders["input_action"]: input_action,
            placeholders["input_reward"]: input_reward,
            placeholders["reset_internal_states"]: float(step == 0),
        }
      predictions = sess.run(prediction_ops, feed_dict=feed)
      if num_actions is None:
        predicted_states = predictions[:, 0]
      else:
        predicted_states = predictions["targets"][:, 0]
        predicted_reward = predictions["target_reward"][:, 0]

      # Update queues
      if num_actions is None:
        new_data = (predicted_states)
      else:
        new_data = (predicted_states, random_actions, predicted_reward)
      for q, d in zip(all_qs, new_data):
        q.get()
        q.put(d.copy())

      writer.write(np.round(predicted_states[0]).astype(np.uint8))

    writer.finish_to_disk()
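# Hedged standalone sketch, not part of the original script, of the
# sliding-window queue pattern used in main() above: each Queue holds the last
# hparams.video_num_input_frames entries, and every prediction step drops the
# oldest entry and appends the newest one before the next model call. The
# integers below stand in for frames and are purely illustrative.
def _example_sliding_window_queue(window_size=4):
  window = Queue(maxsize=window_size)
  for t in range(window_size):
    window.put(t)               # warm up with the initial conditioning frames
  for new_item in [4, 5, 6]:
    window.get()                # drop the oldest entry
    window.put(new_item)        # append the newly predicted one
  return list(window.queue)     # -> [3, 4, 5, 6]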