def evaluate(args):
    """Run a trained agent in a single rendered environment for 100 episodes."""
    # Build the environment once just to read its parameters, then close it;
    # the actual stepping happens inside the worker process.
    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    agent.load_model(load_model_remark=args.load_model_remark)

    # One worker process connected through a Pipe: the parent end sends actions
    # and receives (obs, reward, done, info) tuples back.
    parent_conn, child_conn = Pipe()
    worker = AtariEnvironment(args.env, 1, child_conn, is_render=True,
                              max_episode_step=args.max_episode_step)
    worker.start()

    for i_episode in range(100):
        obs = worker.reset()
        while True:
            obs = np.expand_dims(obs, axis=0)             # add a batch dimension
            # choose_action returns (actions, log_probs); send the single action.
            actions, _ = agent.choose_action(obs / 255)
            parent_conn.send(actions[0])
            obs_, r, done, info = parent_conn.recv()
            obs = obs_
            if done:
                break
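# Both evaluate() and the training loop below assume a worker process in the
# style of AtariEnvironment: it owns one Gym environment, receives an action
# over its end of the Pipe, steps the environment, and sends back
# (obs, reward, done, info). The class below is only a minimal sketch of that
# assumed protocol, not the repository's actual AtariEnvironment (frame
# preprocessing, frame stacking to the 4x84x84 observation, rendering, and
# episode bookkeeping are omitted).
import multiprocessing as mp

import gym


class MinimalEnvWorker(mp.Process):
    """Sketch of a pipe-driven environment worker (assumed interface)."""

    def __init__(self, env_id, child_conn, max_episode_step=4500):
        super().__init__(daemon=True)
        self.env_id = env_id
        self.child_conn = child_conn
        self.max_episode_step = max_episode_step

    def run(self):
        env = gym.make(self.env_id)
        obs = env.reset()
        step = 0
        while True:
            action = self.child_conn.recv()      # block until the trainer sends an action
            obs, reward, done, info = env.step(action)
            step += 1
            if done or step >= self.max_episode_step:
                obs = env.reset()                # auto-reset so the trainer never stalls
                step = 0
            self.child_conn.send((obs, reward, done, info))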
def paralle_train(args):
    """Train the PPO agent with RND intrinsic rewards using parallel environment workers."""
    logger = SummaryWriter(log_dir='results/{}_{}_{}'.format(
        args.env, args.seed, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Build the environment once to read its parameters, then close it; the
    # parallel workers own the environments that are actually stepped.
    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    workers, parent_conns, children_conns = workers_initialize(args)

    obs = np.zeros(shape=[args.num_worker, 4, 84, 84], dtype=np.float32)

    # Initialize the observation normalizer by stepping all workers with random
    # actions and feeding the resulting observations to the running statistics.
    print('Start initializing obs normalizer...')
    next_obs_batch = []
    for step in range(args.initialize_episode * args.max_episode_step):
        actions = np.random.randint(0, env_params['a_dim'], size=args.num_worker)
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        for parent_conn in parent_conns:
            obs_, r, done, info = parent_conn.recv()
            next_obs_batch.append(obs_)
        # Flush to the normalizer every 10 steps (10 observations per worker).
        if len(next_obs_batch) % (10 * args.num_worker) == 0:
            agent.normalizer_obs.update(np.stack(next_obs_batch))
            next_obs_batch = []
    print('Finished initializing obs normalizer...')

    # Per-episode logging accumulators for one reference worker (args.log_env_idx).
    log_reward_ex = 0
    log_reward_in = 0
    log_step = 0
    log_episode = 0

    for i_epoch in range(args.max_epoch):
        epoch_obs, epoch_action, epoch_ri, epoch_re, epoch_mask, epoch_next_obs, epoch_logprob = \
            [], [], [], [], [], [], []

        # Collect a rollout of args.rollout_len steps from every worker.
        for i_step in range(args.rollout_len):
            actions, log_probs = agent.choose_action(obs)
            for action, parent_conn in zip(actions, parent_conns):
                parent_conn.send(action)

            batch_re, batch_mask, batch_next_obs = [], [], []
            for parent_conn in parent_conns:
                obs_, r_e, done, info = parent_conn.recv()
                batch_next_obs.append(obs_)
                batch_re.append(r_e)
                batch_mask.append(0 if done else 1)
            batch_next_obs = np.stack(batch_next_obs)
            batch_re = np.stack(batch_re)
            batch_mask = np.stack(batch_mask)

            # RND intrinsic reward is computed from the next observations only.
            batch_ri = agent.compute_intrinsic_reward(batch_next_obs.copy())

            # Logging: accumulate extrinsic/intrinsic returns for the reference
            # worker and flush them whenever that worker finishes an episode.
            log_reward_ex += batch_re[args.log_env_idx]
            log_reward_in += batch_ri[args.log_env_idx]
            log_step += 1
            if batch_mask[args.log_env_idx] == 0:
                log_episode += 1
                logger.add_scalar('Indicator/Reward_ex', log_reward_ex, log_episode)
                logger.add_scalar('Indicator/Reward_in', log_reward_in, log_episode)
                log_reward_ex = 0
                log_reward_in = 0

            epoch_obs.append(obs)
            epoch_action.append(actions)
            epoch_next_obs.append(batch_next_obs)
            epoch_ri.append(batch_ri)
            epoch_re.append(batch_re)
            epoch_mask.append(batch_mask)
            epoch_logprob.append(log_probs)

            obs = batch_next_obs

        # Stack to [rollout_len, num_worker, ...] and transpose to
        # [num_worker, rollout_len, ...] before the PPO/RND update.
        epoch_obs = np.transpose(np.stack(epoch_obs), axes=[1, 0, 2, 3, 4])
        epoch_action = np.transpose(np.stack(epoch_action), axes=[1, 0])
        epoch_ri = np.transpose(np.stack(epoch_ri), axes=[1, 0])
        epoch_re = np.transpose(np.stack(epoch_re), axes=[1, 0])
        epoch_mask = np.transpose(np.stack(epoch_mask), axes=[1, 0])
        epoch_next_obs = np.transpose(np.stack(epoch_next_obs), axes=[1, 0, 2, 3, 4])
        epoch_logprob = np.transpose(np.stack(epoch_logprob), axes=[1, 0])

        loss_rnd, loss_a, loss_c = agent.update(epoch_obs, epoch_action, epoch_ri, epoch_re,
                                                epoch_mask, epoch_next_obs, epoch_logprob)

        used_sample_num = args.rollout_len * args.num_worker * i_epoch
        logger.add_scalar('Loss/loss_RND', loss_rnd, used_sample_num)
        logger.add_scalar('Loss/loss_a', loss_a, used_sample_num)
        logger.add_scalar('Loss/loss_c', loss_c, used_sample_num)

        if i_epoch % args.save_model_interval == 0:
            agent.save_model(remark='{}'.format(i_epoch))
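# agent.normalizer_obs.update(...) in the warm-up loop above assumes a running
# mean/std tracker of the kind RND uses to whiten observations before they are
# fed to the predictor and target networks. The class below is a minimal sketch
# of such a tracker (batched parallel-variance update, Chan et al.); the actual
# normalizer inside PPOAgent may differ in clipping and epsilon handling.
import numpy as np


class RunningNormalizerSketch:
    """Tracks a running mean and variance over batches of observations."""

    def __init__(self, shape, epsilon=1e-4, clip=5.0):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon
        self.clip = clip

    def update(self, batch):
        batch = np.asarray(batch, dtype=np.float64)
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        batch_count = batch.shape[0]

        delta = batch_mean - self.mean
        total = self.count + batch_count

        # Merge old and new statistics (parallel-variance formula).
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total

        self.mean, self.var, self.count = new_mean, m2 / total, total

    def normalize(self, x):
        return np.clip((x - self.mean) / np.sqrt(self.var + 1e-8), -self.clip, self.clip)


# Example usage mirroring the warm-up loop above (names are illustrative only):
#   normalizer = RunningNormalizerSketch(shape=(4, 84, 84))
#   normalizer.update(next_obs_batch)      # next_obs_batch: (N, 4, 84, 84)
#   whitened = normalizer.normalize(obs)   # obs: (num_worker, 4, 84, 84)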