def train(config, trial_dir=None, visualize=False):
    pid = os.getpid()
    logger, log_dir = prepare_for_logging("pid_{}".format(pid))

    # create environment
    env = NIPS(visualize)
    logger.info("pid={}, env={}".format(pid, id(env)))

    if trial_dir is not None and os.path.exists(trial_dir) \
            and config['agent'] == 'DDPG':
        logger.info("Loading config from {} ...".format(trial_dir))
        with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
            config = pickle.load(f)
        # config["scale_action"] = scale_action
    config["title_prefix"] = "RunEnv"

    # observation processor
    if "ob_processor" not in config or config["ob_processor"] == "dummy":
        ob_processor = ObservationProcessor()
    elif config["ob_processor"] == "2ndorder":
        ob_processor = SecondOrderAugmentor()
    else:
        ob_processor = BodySpeedAugmentor()
    config["ob_aug_dim"] = ob_processor.get_aug_dim()

    # snapshot info
    if "save_snapshot_every" not in config:
        config["save_snapshot_every"] = 500
    save_snapshot_every = config["save_snapshot_every"]

    # save config
    with open(os.path.join(log_dir, "config.pk"), "wb") as f:
        pickle.dump(config, f)
    util.print_settings(logger, config, env)

    # DDPG
    if config['agent'] == 'DDPG':
        # create random process
        oup = create_rand_process(env, config)

        # create replay buffer
        memory = create_memory(env, config)

        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])

        # print networks
        agent.actor.summary()
        agent.target_actor.summary()
        agent.critic.summary()

        # add callbacks
        def p_info(episode_info):
            util.print_episode_info(logger, episode_info, pid)

        def save_nets(episode_info):
            paths = {}
            paths["actor"] = os.path.join(log_dir, "actor.h5")
            paths["critic"] = os.path.join(log_dir, "critic.h5")
            paths["target"] = os.path.join(log_dir, "target.h5")
            agent = episode_info["agent"]
            agent.save_models(paths)

        def save_snapshots(episode_info):
            agent = episode_info["agent"]
            episode = episode_info["episode"]
            if episode % save_snapshot_every == 0:
                paths = {}
                paths["actor"] = os.path.join(
                    log_dir, "actor_{}.h5".format(episode))
                paths["critic"] = os.path.join(
                    log_dir, "critic_{}.h5".format(episode))
                paths["target"] = os.path.join(
                    log_dir, "target_{}.h5".format(episode))
                agent.save_models(paths)
                memory_path = os.path.join(log_dir, "replaybuffer.npz")
                agent.save_memory(memory_path)
                logger.info("Snapshots saved. (pid={})".format(pid))

        agent.on_episode_end.append(p_info)
        agent.on_episode_end.append(save_nets)
        agent.on_episode_end.append(save_snapshots)

        # load existing model
        if trial_dir is not None and os.path.exists(trial_dir):
            logger.info("Loading networks from {} ...".format(trial_dir))
            paths = {}
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
            paths = {
                k: os.path.join(trial_dir, v)
                for k, v in paths.iteritems()
            }
            logger.info("Paths to models: {}".format(paths))
            agent.load_models(paths)
            memory_path = os.path.join(trial_dir, "replaybuffer.npz")
            if os.path.exists(memory_path):
                agent.load_memory(memory_path)
                logger.info("Replay buffer loaded.")

        # learn
        util.print_sec_header(logger, "Training")
        reward_hist, steps_hist = agent.learn(
            total_episodes=config["total_episodes"],
            max_steps=config["max_steps"])
        env.close()

        # send result
        img_file = os.path.join(log_dir, "train_stats.png")
        util.plot_stats(reward_hist, steps_hist, img_file)
        log_file = os.path.join(log_dir, "train.log")
        title = log_dir + "_" + config["title_prefix"]
        util.send_email(title, [img_file], [log_file], SMTP_SERVER)

    # TRPO
    elif config['agent'] == 'TRPO':

        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        def env_maker(visualize=False):
            env = NIPS(visualize=visualize)
            monitor_dir = os.path.join(log_dir, "gym_monitor")
            env = gym.wrappers.Monitor(env,
                                       directory=monitor_dir,
                                       video_callable=False,
                                       force=False,
                                       resume=True,
                                       write_upon_reset=True)
            return env

        del env
        env = env_maker()
        agent = TRPO(
            env,
            env_maker,
            logger,
            log_dir,
            ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )
        if trial_dir is not None and os.path.exists(trial_dir):
            agent.load_models(trial_dir)
        agent.learn()

    logger.info("Finished (pid={}).".format(pid))


def test(agent, trial_dir, test_episode, visual_flag, submit_flag):
    pid = os.getpid()
    logger, _ = prepare_for_logging("pid_{}".format(pid), False)
    logger.info("trial_dir={}".format(trial_dir))
    if not os.path.exists(trial_dir):
        logger.info("trial_dir does not exist")
        return

    # create environment
    env = NIPS(visualize=visual_flag)

    # load config
    with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
        config = pickle.load(f)

    if agent == 'DDPG':
        config["scale_action"] = scale_action

        # observation processor
        if "ob_processor" not in config or config["ob_processor"] == "dummy":
            ob_processor = ObservationProcessor()
        elif config["ob_processor"] == "2ndorder":
            ob_processor = SecondOrderAugmentor()
        else:
            ob_processor = BodySpeedAugmentor()
        config["ob_aug_dim"] = ob_processor.get_aug_dim()
        util.print_settings(logger, config, env)

        # create random process
        oup = create_rand_process(env, config)

        # create replay buffer
        memory = create_memory(env, config)

        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])

        # load weights
        paths = {}
        if test_episode > 0:
            paths["actor"] = "actor_{}.h5".format(test_episode)
            paths["critic"] = "critic_{}.h5".format(test_episode)
            paths["target"] = "target_{}.h5".format(test_episode)
        else:
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
        paths = {k: os.path.join(trial_dir, v) for k, v in paths.iteritems()}
        logger.info("Paths to models: {}".format(paths))
        agent.load_models(paths)

    elif agent == 'TRPO':

        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        config = {
            "agent": 'TRPO',
            "batch_size": 5000,
            "n_envs": 16,
            "n_iters": 5000,
            "ob_processor": "bodyspeed",
            # "hidden_nonlinearity": "relu",
            # "action_nonlinearity": "tanh",
            # "policy_hiddens": [128, 128, 64, 64],
            # "baseline_hiddens": [128, 128, 64, 64],
            "policy_hiddens": [256, 128, 64],
            "baseline_hiddens": [256, 128, 64],
            "hidden_nonlinearity": "tanh",
            "action_nonlinearity": None,
        }
        agent = TRPO(
            env,
            env_maker=None,
            logger=logger,
            log_dir=None,
            ob_processor_maker=ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            hidden_nonlinearity=config['hidden_nonlinearity'],
            action_nonlinearity=config['action_nonlinearity'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )
        agent.load_models(trial_dir)
    else:
        raise ValueError('invalid agent type')

    if submit_flag:
        submit(agent, logger)
    else:
        rewards = []
        for i in xrange(10):
            steps, reward = agent.test(max_steps=1000)
            logger.info("episode={}, steps={}, reward={}".format(
                i, steps, reward))
            rewards.append(reward)
        logger.info("avg_reward={}".format(np.mean(rewards)))
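

# Hypothetical command-line entry point, added only as a usage sketch for the
# train()/test() functions above; the flag names are assumptions and may differ
# from the project's actual launcher script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Train or test an agent on the NIPS RunEnv.")
    parser.add_argument("--mode", choices=["train", "test"], default="train")
    parser.add_argument("--agent", choices=["DDPG", "TRPO"], default="DDPG")
    parser.add_argument("--trial_dir", default=None,
                        help="directory holding config.pk and saved models")
    parser.add_argument("--test_episode", type=int, default=0,
                        help="snapshot episode to load; 0 loads the latest models")
    parser.add_argument("--visualize", action="store_true")
    parser.add_argument("--submit", action="store_true")
    args = parser.parse_args()

    if args.mode == "train":
        # For DDPG an existing trial_dir overrides this config via config.pk
        # (see train()); a TRPO run would need the TRPO-specific keys instead.
        train(example_ddpg_config(),
              trial_dir=args.trial_dir,
              visualize=args.visualize)
    else:
        test(args.agent, args.trial_dir, args.test_episode,
             args.visualize, args.submit)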