def evaluate(self,j,dist_entropy,value_loss,action_loss,model_file=None):
		"""Print reward/loss statistics for update ``j`` and refresh the visdom plot.

		Args:
			j: Index of the update being reported; the number of environment
				steps is derived from ``j + 1``.
			dist_entropy: Entropy term; its scalar is read as ``.data[0]``.
			value_loss: Value loss; its scalar is read as ``.data[0]``.
			action_loss: Policy loss; its scalar is read as ``.data[0]``.
			model_file: Accepted for interface compatibility; not used by
				this method.
		"""
		total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps
		print("Updates {}, num timesteps {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
			format(j, total_num_steps,
				   self.final_rewards.mean(),
				   self.final_rewards.median(),
				   self.final_rewards.min(),
				   self.final_rewards.max(), dist_entropy.data[0],
				   value_loss.data[0], action_loss.data[0]))

		try:
			# Sometimes monitor doesn't properly flush the outputs, so plotting
			# is best-effort: an IOError here is ignored on purpose.
			self.win = visdom_plot(self.viz, self.win, self.args.log_dir,
				self.args.env_name, self.args.algo)
		except IOError:
			pass
	def train(self, num_updates):
		"""Run ``num_updates`` training updates with periodic saving, logging and plotting.

		Each iteration calls ``self.run()`` for one update and then, at the
		intervals configured on ``self.args``:

		* saves ``[model, ob_rms]`` to ``<save_dir>/<algo>/<env_name>.pt``
		  (skipped when ``save_dir`` is the empty string),
		* prints throughput, reward and loss statistics,
		* refreshes the visdom plot when ``self.args.vis`` is set.

		Args:
			num_updates: Number of updates to perform.
		"""
		start = time.time()
		for j in range(num_updates):
			dist_entropy, value_loss, action_loss = self.run()

			if j % self.args.save_interval == 0 and self.args.save_dir != "":
				save_path = os.path.join(self.args.save_dir, self.args.algo)
				# An already existing directory is fine; any other failure
				# (e.g. permissions) is reported here instead of being hidden.
				os.makedirs(save_path, exist_ok=True)

				# A really ugly way to save a model to CPU: copy first so the
				# model used for training stays where it is.
				save_model = self.actor_critic
				if self.args.cuda:
					save_model = copy.deepcopy(self.actor_critic).cpu()

				# getattr keeps whatever ob_rms is (even a falsy value) and
				# falls back to None only when the attribute is missing.
				checkpoint = [save_model, getattr(self.envs, 'ob_rms', None)]

				torch.save(checkpoint, os.path.join(save_path, self.args.env_name + ".pt"))

			if j % self.args.log_interval == 0:
				# Guard against a zero interval on coarse clocks.
				elapsed = max(time.time() - start, 1e-8)
				total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps
				print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
					format(j, total_num_steps,
						   int(total_num_steps / elapsed),
						   self.final_rewards.mean(),
						   self.final_rewards.median(),
						   self.final_rewards.min(),
						   self.final_rewards.max(), dist_entropy.data[0],
						   value_loss.data[0], action_loss.data[0]))
			if self.args.vis and j % self.args.vis_interval == 0:
				try:
					# Sometimes monitor doesn't properly flush the outputs, so
					# plotting is best-effort and an IOError is ignored.
					self.win = visdom_plot(self.viz, self.win, self.args.log_dir,
						self.args.env_name, self.args.algo)
				except IOError:
					pass
                total_num_steps,
                int(total_num_steps / (end - start)),  # FPS
                final_rewards.mean(),
                final_rewards.median(),
                final_rewards.min(),
                final_rewards.max(),
                dist_entropy,
                value_loss,
                action_loss))

    # Optional live plotting: when args.vis is set, refresh the visdom window
    # whenever j is a multiple of args.vis_interval. (j is presumably the update
    # index of the enclosing loop, which lies outside this excerpt -- TODO confirm.)
    # print('j: ', j, '  args.vis: ', args.vis, '  args.vis_interval:  ', args.vis_interval)
    if args.vis and j % args.vis_interval == 0:
        try:
            # Sometimes monitor doesn't properly flush the outputs, so plotting
            # is best-effort: an IOError raised here is deliberately ignored.
            win = visdom_plot(viz,
                              win,
                              args.log_dir,
                              args.env_name,
                              args.algo,
                              args.num_frames,
                              bullet=True)
        except IOError:
            pass

# Write the collected rewards under <save_dir>/<algo>, one value per line.
save_path = os.path.join(args.save_dir, args.algo)
with open(save_path + '/Ave_Reward_per_epi(' + args.algo + ').txt', 'w') as f:
    f.writelines(str(s) + '\n' for s in episodic_reward_graph)

print('Finish!!')
# Example #4
# 0
def main():
    """Train an actor-critic agent (A2C, PPO or ACKTR) on vectorised environments.

    Configuration comes from the module-level ``args``; ``num_updates`` and
    ``eval_log_dir`` are likewise read from module scope. Every update collects
    ``args.num_steps`` transitions per process, computes returns and calls
    ``agent.update``. At the configured intervals the model is checkpointed,
    statistics are printed, a deterministic evaluation is run and the visdom
    plot is refreshed.

    Raises:
        ValueError: If ``args.algo`` is not ``'a2c'``, ``'ppo'`` or ``'acktr'``.
    """
    # Fail before any environment is created instead of hitting an unbound
    # ``agent`` later on.
    if args.algo not in ('a2c', 'ppo', 'acktr'):
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Rolling window over the most recently finished training episodes.
    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # An existing directory is fine; other failures are surfaced here.
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU: copy first so the
            # training model stays on its device.
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            # NOTE(review): the format string has eight placeholders, so the
            # trailing dist_entropy / value_loss / action_loss arguments are
            # passed but never printed. Left as-is to keep the log format.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # Separate environments with a shifted seed are used for evaluation.
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                # Reuse the observation statistics gathered during training.
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                # Keep the masks on the same device as the hidden states they
                # are combined with in the next ``act`` call; rebuilding them
                # on the CPU would mix devices when running on CUDA.
                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done]).to(device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs, so
                # plotting is best-effort and an IOError is ignored.
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
# Example #5
# 0
def main():
    """Train an actor-critic agent augmented with history/future modules.

    Configuration comes from the module-level ``args``; ``num_updates`` is read
    from module scope as well. For every step a feature vector is computed for
    the current observation, a summary of up to ``args.lenhs`` preceding
    features (restarting after an episode boundary) is produced by the history
    module, and the policy acts on the concatenation of both. The mean episode
    reward is appended to ``./rec/<env>_<method>.txt`` at every logging interval
    and plotted to ``./imgs/<env>_<method>.png`` when training ends.

    Raises:
        ValueError: If ``args.algo`` is not ``'a2c'``, ``'ppo'`` or ``'acktr'``.
    """
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    # Fail before any environment is created instead of hitting an unbound
    # ``agent`` later on.
    if args.algo not in ('a2c', 'ppo', 'acktr'):
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    os.environ['OMP_NUM_THREADS'] = '1'
    #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    #os.environ['CUDA_VISIBLE_DEVICES'] = "9"
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # Scratch tensors follow args.cuda instead of being moved to the GPU
    # unconditionally, so a CPU-only run does not fail or mix devices.
    device = torch.device("cuda" if args.cuda else "cpu")

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
                for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Stacked frames are concatenated along the first observation dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,args.hid_size, args.feat_size,args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    # NOTE(review): this function reads both ``actor_critic.hidden_size`` and
    # ``actor_critic.hid_size``; presumably the policy exposes both -- confirm.
    if args.use_cell:
        hs = HistoryCell(obs_shape[0], actor_critic.feat_size, 2*actor_critic.hidden_size, 1)
        ft = FutureCell(obs_shape[0], actor_critic.feat_size, 2 * actor_critic.hidden_size, 1)
    else:
        hs = History(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1)
        ft = Future(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1)

    if args.cuda:
        actor_critic=actor_critic.cuda()
        hs = hs.cuda()
        ft = ft.cuda()
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, hs,ft,args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, args.hf_loss_coef,ac_lr=args.lr,hs_lr=args.lr,ft_lr=args.lr,
                                eps=args.eps,
                                max_grad_norm=args.max_grad_norm,
                                num_processes=args.num_processes,
                                num_steps=args.num_steps,
                                use_cell=args.use_cell,
                                lenhs=args.lenhs,lenft=args.lenft,
                                plan=args.plan,
                                ac_intv=args.ac_interval,
                                hs_intv=args.hs_interval,
                                ft_intv=args.ft_interval
                                )
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size,
                              feat_size=512)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack by one frame and append ``obs`` as the newest."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)


    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    rec_x = []
    rec_y = []

    hs_info = torch.zeros(args.num_processes, 2 * actor_critic.hidden_size, device=device)
    # Per process: first step index of the current episode inside this rollout.
    hs_ind = torch.IntTensor(args.num_processes, 1).zero_()

    epinfobuf = deque(maxlen=100)
    start_time = time.time()
    # The context manager closes the record file even if training raises.
    with open('./rec/' + args.env_name + '_' + args.method_name + '.txt', 'w') as rec_file:
        for j in range(num_updates):
            print('begin sample, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                                                    time.gmtime(time.time() - start_time))))
            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    rollouts.feat[step]=actor_critic.get_feat(rollouts.observations[step])

                    if args.use_cell:
                        for i in range(args.num_processes):
                            h = torch.zeros(1, 2 * actor_critic.hid_size, device=device)
                            c = torch.zeros(1, 2 * actor_critic.hid_size, device=device)
                            # Window: at most lenhs steps, never before the
                            # start of the current episode.
                            start_ind = max(hs_ind[i],step+1-args.lenhs)
                            for ind in range(start_ind,step+1):
                                h,c=hs(rollouts.feat[ind,i].unsqueeze(0),h,c)
                            hs_info[i,:]=h.view(1,2*actor_critic.hid_size)
                            del h,c
                            gc.collect()
                    else:
                        for i in range(args.num_processes):
                            start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                            hs_info[i,:]=hs(rollouts.feat[start_ind:step+1,i])

                    hidden_feat=actor_critic.cat(rollouts.feat[step],hs_info)
                    value, action, action_log_prob, states = actor_critic.act(
                            hidden_feat,
                            rollouts.states[step])
                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Observe reward and next obs
                obs, reward, done, infos = envs.step(cpu_actions)
                for info in infos:
                    maybeepinfo = info.get('episode')
                    if maybeepinfo:
                        epinfobuf.extend([maybeepinfo['r']])
                reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
                masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                # Finished episodes restart their history window at step + 1;
                # running episodes keep their previous start index.
                hs_ind = ((1-masks)*(step+1)+masks*hs_ind.float()).int()

                if args.cuda:
                    masks = masks.cuda()

                # Zero the frame stack of every process whose episode ended.
                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs)
                rollouts.insert(current_obs, hs_ind,states.data, action.data, action_log_prob.data, value.data, reward, masks)
            with torch.no_grad():
                rollouts.feat[-1] = actor_critic.get_feat(rollouts.observations[-1])
                # ``step`` still holds the last index of the sampling loop here,
                # exactly as in the original code.
                if args.use_cell:
                    for i in range(args.num_processes):
                        h = torch.zeros(1, 2 * actor_critic.hid_size, device=device)
                        c = torch.zeros(1, 2 * actor_critic.hid_size, device=device)
                        start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                        for ind in range(start_ind, step + 1):
                            h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c)
                        hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size)
                        del h,c
                else:
                    for i in range(args.num_processes):
                        start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                        hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i])
                hidden_feat = actor_critic.cat(rollouts.feat[-1],hs_info)
                next_value = actor_critic.get_value(hidden_feat).detach()
            rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
            rollouts.compute_ft_ind()

            print('begin update, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                         time.gmtime(time.time() - start_time))))
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
            print('end update, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                                                time.gmtime(time.time() - start_time))))
            rollouts.after_update()

            if j % args.save_interval == 0 and args.save_dir != "":
                save_path = os.path.join(args.save_dir, args.algo)
                # An existing directory is fine; other failures are surfaced.
                os.makedirs(save_path, exist_ok=True)

                # A really ugly way to save a model to CPU: copy first so the
                # training model stays on its device.
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()

                save_model = [save_model, getattr(envs, 'ob_rms', None)]

                torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

            if j % args.log_interval == 0:
                end = time.time()
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                v_mean,v_median,v_min,v_max = safe(epinfobuf)
                print("Updates {}, num timesteps {},time {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                    format(j, total_num_steps,
                           time.strftime("%Hh %Mm %Ss",
                                         time.gmtime(time.time() - start_time)),
                           int(total_num_steps / (end - start_time)),
                           v_mean, v_median, v_min, v_max,
                           dist_entropy,
                           value_loss, action_loss))

                # NaN never compares equal to anything (itself included), so
                # ``v_mean == np.nan`` was always False and NaN means were
                # recorded; np.isnan is the correct test.
                if not np.isnan(v_mean):
                    rec_x.append(total_num_steps)
                    rec_y.append(v_mean)
                    rec_file.write(str(total_num_steps) + ' ' + str(v_mean) + '\n')

            if args.vis and j % args.vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs, so
                    # plotting is best-effort and an IOError is ignored.
                    win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                      args.algo, args.num_frames)
                except IOError:
                    pass
    plot_line(rec_x, rec_y, './imgs/' + args.env_name + '_' + args.method_name + '.png', args.method_name,
              args.env_name, args.num_frames)
# Example #6
# 0
def main():
    """Train a VizDoom agent with A2C, ACKTR, A2T or a ResNet policy.

    Configuration comes from the module-level ``args``; ``num_updates`` is read
    from module scope as well, and the created environments are published as
    the module-level ``envs``. Every update collects ``args.num_steps``
    transitions per process, computes returns, and applies one optimiser step
    on the combined value / policy / entropy loss. At the configured intervals
    the model is checkpointed, statistics are printed and the visdom plot is
    refreshed.

    Raises:
        ValueError: If ``args.algo`` is not ``'a2c'``, ``'acktr'``, ``'a2t'``
            or ``'resnet'``.
    """
    print("###############################################################")
    print("#################### VISDOOM LEARNER START ####################")
    print("###############################################################")

    # Fail before any environment is created instead of hitting an unbound
    # ``actor_critic`` / ``optimizer`` later on.
    if args.algo not in ('a2c', 'acktr', 'a2t', 'resnet'):
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    global envs
    envs = VecEnv(
        [make_env(i, args.config_path) for i in range(args.num_processes)],
        logging=True,
        log_dir=args.log_dir)

    # Stacked frames are concatenated along the first observation dimension.
    obs_shape = envs.observation_space_shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.algo == 'a2c' or args.algo == 'acktr':
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space_shape)
    elif args.algo == 'a2t':
        source_models = []
        model_paths = glob.glob(os.path.join(args.source_models_path, '*.pt'))
        for model_path in model_paths:
            print(model_path, 'loading model...')
            # NOTE(security): torch.load unpickles arbitrary objects; only
            # point args.source_models_path at trusted files.
            source_models.append(torch.load(model_path))
        actor_critic = A2TPolicy(obs_shape[0], envs.action_space_shape,
                                 source_models)
    elif args.algo == 'resnet':
        # args.num_stack = 3
        actor_critic = ResnetPolicy(obs_shape[0], envs.action_space_shape)

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c' or args.algo == 'resnet':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'a2t':
        # Only parameters that require gradients are optimised.
        a2t_params = [p for p in actor_critic.parameters() if p.requires_grad]
        optimizer = optim.RMSprop(a2t_params,
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space_shape)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack by one frame and append ``obs`` as the newest."""
        shape_dim0 = envs.observation_space_shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            # print ('Actions:', cpu_actions, 'Rewards:', reward)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # On episode end, move the accumulated return into final_rewards
            # and reset the running sum; otherwise both are left unchanged.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero the frame stack of every process whose episode ended.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data, value.data, reward,
                            masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr', 'a2t', 'resnet']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            # Re-wrapping ``advantages.data`` cuts the graph, so the policy
            # loss does not back-propagate into the value estimate.
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            # No gradient clipping is applied for 'acktr'.
            if args.algo == 'a2c' or args.algo == 'resnet':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            elif args.algo == 'a2t':
                nn.utils.clip_grad_norm(a2t_params, args.max_grad_norm)

            optimizer.step()

        # The last observation of this rollout starts the next one.
        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # An existing directory is fine; other failures are surfaced here.
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU: copy first so the
            # training model stays on its device.
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            envs.log()
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            # NOTE(review): the entropy is printed negated; presumably
            # evaluate_actions returns the negative entropy -- confirm.
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs, so
                # plotting is best-effort and an IOError is ignored.
                win = visdom_plot(viz, win, args.log_dir, 'VizDoom', args.algo)
            except IOError:
                pass
    envs.close()
    time.sleep(5)
def main():
    """Train two A2C policies in lock-step on two HalfCheetah variants.

    One ``MLPPolicy`` is trained on ``HalfCheetahSmallLeg-v0`` (branch 1) and
    a second one on ``HalfCheetahSmallFoot-v0`` (branch 2).  After each
    optimizer step the ``a_fc1`` and ``v_fc1`` layers of the branch that was
    just updated are copied into the other branch, so those layers are
    effectively shared between the two networks.

    The function relies on names defined outside of it: the ``args``
    namespace (note that ``args.env_name`` is overwritten here) and
    ``num_updates``.  Side effects: sets ``OMP_NUM_THREADS``, prints training
    statistics every ``args.log_interval`` updates, saves both models every
    ``args.save_interval`` updates, and optionally refreshes visdom plots.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        viz_1 = Visdom()
        win = None
        win1 = None

    env_name_1 = 'HalfCheetahSmallFoot-v0'
    args.env_name = 'HalfCheetahSmallLeg-v0'

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    envs_1 = [
        make_env(env_name_1, args.seed, i, args.log_dir_1)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
        envs_1 = SubprocVecEnv(envs_1)
    else:
        envs = DummyVecEnv(envs)
        envs_1 = DummyVecEnv(envs_1)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)
        envs_1 = VecNormalize(envs_1)

    # Same for both tasks.
    # NOTE(review): this assumes both environments expose the same
    # observation shape; only ``envs`` is inspected.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    actor_critic_1 = MLPPolicy(obs_shape[0], envs_1.action_space)

    # Same for both tasks (taken from the first environment only).
    action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()
        actor_critic_1.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    optimizer_1 = optim.RMSprop(actor_critic_1.parameters(),
                                args.lr,
                                eps=args.eps,
                                alpha=args.alpha)

    # Different for both tasks: each branch has its own rollout storage and
    # its own stacked-observation buffer.
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    rollouts_1 = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                                envs_1.action_space, actor_critic_1.state_size)
    current_obs_1 = torch.zeros(args.num_processes, *obs_shape)

    # Different update functions: each one writes into its own buffer.
    def update_current_obs(obs):
        """Shift the frame stack of branch 1 and append ``obs`` at the end."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    def update_current_obs_1(obs):
        """Shift the frame stack of branch 2 and append ``obs`` at the end."""
        shape_dim0 = envs_1.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs_1[:, :-shape_dim0] = current_obs_1[:, shape_dim0:]
        current_obs_1[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    obs_1 = envs_1.reset()
    update_current_obs_1(obs_1)

    rollouts.observations[0].copy_(current_obs)
    rollouts_1.observations[0].copy_(current_obs_1)

    # ``episode_rewards*`` accumulate the reward of the running episode of
    # every process; ``final_rewards*`` hold the total of the last finished
    # episode and are what gets logged.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    episode_rewards_1 = torch.zeros([args.num_processes, 1])
    final_rewards_1 = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()
        current_obs_1 = current_obs_1.cuda()
        rollouts_1.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions from branch 1
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))

            cpu_actions = action.data.squeeze(1).cpu().numpy()

            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # A mask of 0 marks a process whose episode just ended.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Clear the stacked observations of finished episodes.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

            # Sample actions from branch 2
            value_1, action_1, action_log_prob_1, states_1 = actor_critic_1.act(
                Variable(rollouts_1.observations[step], volatile=True),
                Variable(rollouts_1.states[step], volatile=True),
                Variable(rollouts_1.masks[step], volatile=True))

            cpu_actions_1 = action_1.data.squeeze(1).cpu().numpy()
            obs_1, reward_1, done_1, info_1 = envs_1.step(cpu_actions_1)
            reward_1 = torch.from_numpy(np.expand_dims(np.stack(reward_1),
                                                       1)).float()
            episode_rewards_1 += reward_1

            masks_1 = torch.FloatTensor([[0.0] if done_ else [1.0]
                                         for done_ in done_1])
            final_rewards_1 *= masks_1
            final_rewards_1 += (1 - masks_1) * episode_rewards_1
            episode_rewards_1 *= masks_1

            if args.cuda:
                masks_1 = masks_1.cuda()

            if current_obs_1.dim() == 4:
                current_obs_1 *= masks_1.unsqueeze(2).unsqueeze(2)
            else:
                current_obs_1 *= masks_1

            update_current_obs_1(obs_1)
            rollouts_1.insert(step, current_obs_1, states_1.data,
                              action_1.data, action_log_prob_1.data,
                              value_1.data, reward_1, masks_1)

        # Update for branch 1
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        # Wrapping ``advantages.data`` in a fresh Variable keeps the policy
        # loss from back-propagating through the value estimates.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()
        rollouts.after_update()

        # Share params branch 1 -> branch 2
        actor_critic_1.a_fc1.weight.data = copy.deepcopy(
            actor_critic.a_fc1.weight.data)
        actor_critic_1.a_fc1.bias.data = copy.deepcopy(
            actor_critic.a_fc1.bias.data)
        actor_critic_1.v_fc1.weight.data = copy.deepcopy(
            actor_critic.v_fc1.weight.data)
        actor_critic_1.v_fc1.bias.data = copy.deepcopy(
            actor_critic.v_fc1.bias.data)

        # Update for branch 2
        next_value_1 = actor_critic_1(
            Variable(rollouts_1.observations[-1], volatile=True),
            Variable(rollouts_1.states[-1], volatile=True),
            Variable(rollouts_1.masks[-1], volatile=True))[0].data

        rollouts_1.compute_returns(next_value_1, args.use_gae, args.gamma,
                                   args.tau)

        values_1, action_log_probs_1, dist_entropy_1, states_1 = actor_critic_1.evaluate_actions(
            Variable(rollouts_1.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts_1.states[0].view(-1, actor_critic_1.state_size)),
            Variable(rollouts_1.masks[:-1].view(-1, 1)),
            Variable(rollouts_1.actions.view(-1, action_shape)))

        values_1 = values_1.view(args.num_steps, args.num_processes, 1)
        action_log_probs_1 = action_log_probs_1.view(args.num_steps,
                                                     args.num_processes, 1)

        advantages_1 = Variable(rollouts_1.returns[:-1]) - values_1
        value_loss_1 = advantages_1.pow(2).mean()

        action_loss_1 = -(Variable(advantages_1.data) *
                          action_log_probs_1).mean()

        optimizer_1.zero_grad()
        (value_loss_1 * args.value_loss_coef + action_loss_1 -
         dist_entropy_1 * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic_1.parameters(),
                                args.max_grad_norm)
        optimizer_1.step()
        rollouts_1.after_update()

        # Share params branch 2 -> branch 1
        actor_critic.a_fc1.weight.data = copy.deepcopy(
            actor_critic_1.a_fc1.weight.data)
        actor_critic.a_fc1.bias.data = copy.deepcopy(
            actor_critic_1.a_fc1.bias.data)
        actor_critic.v_fc1.weight.data = copy.deepcopy(
            actor_critic_1.v_fc1.weight.data)
        actor_critic.v_fc1.bias.data = copy.deepcopy(
            actor_critic_1.v_fc1.bias.data)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo,
                                     args.env_name + '_' + env_name_1)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU.
            # Both names must be bound before the CUDA branch: previously
            # ``save_model_1`` only existed when ``args.cuda`` was set and
            # ``save_model`` pointed at the branch-2 network on CPU runs.
            save_model = actor_critic
            save_model_1 = actor_critic_1
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_model_1 = copy.deepcopy(actor_critic_1).cpu()

            # Each checkpoint is ``[model, ob_rms]``; ``ob_rms`` is None when
            # the environment wrapper does not provide it.
            save_model = [save_model, getattr(envs, 'ob_rms', None)]
            save_model_1 = [save_model_1, getattr(envs_1, 'ob_rms', None)]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            torch.save(save_model_1, os.path.join(save_path,
                                                  env_name_1 + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
            print(
                "Updates_1 {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards_1.mean(), final_rewards_1.median(),
                        final_rewards_1.min(), final_rewards_1.max(),
                        dist_entropy_1.data[0], value_loss_1.data[0],
                        action_loss_1.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
                win1 = visdom_plot(viz_1, win1, args.log_dir_1, env_name_1,
                                   args.algo)
            except IOError:
                pass
Пример #8
0
def train_a_gym_model(env, config):
    """Train a gym-type RL problem with PPO for the given environment.

    Args:
        env: Forwarded unchanged as the first argument of ``make_env`` for
            every worker process.
        config: Mapping of settings.  ``config['env_name']`` is required and
            is used for checkpoint file names and the visdom plot title.  All
            other keys are optional; their defaults are the second arguments
            of the ``config.get`` calls below.  When ``save_step`` is given,
            additional numbered snapshots (``<env_name>_at_<steps>.pt``) are
            written once the step count exceeds the next multiple tracked in
            ``next_save_step``; this check only runs on updates that also hit
            ``save_interval``.

    Side effects: creates ``log_dir`` (or removes its ``*.monitor.csv`` files
    if it cannot be created, e.g. because it already exists), seeds torch,
    prints statistics, writes checkpoints into ``save_dir`` and optionally
    updates a visdom plot.  The vectorized environments are closed before
    the function returns, also when training raises.
    """
    torch.set_num_threads(1)

    seed = config.get('seed', 1)
    log_dir = config.get('log_dir', '/tmp/gym')
    log_interval = config.get('log_interval', 10)
    save_interval = config.get('save_interval', 100)
    save_dir = config.get('save_dir', 'trained_models/ppo')
    add_timestep = config.get('add_timestep', False)
    num_processes = config.get('num_processes', 4)
    gamma = config.get('gamma', 0.99)
    num_stack = config.get('num_stack', 1)
    recurrent_policy = config.get('recurrent_policy', False)
    cuda = config.get('cuda', True)
    vis = config.get('vis', True)
    vis_interval = config.get('vis_interval', 100)
    env_name = config['env_name']
    save_step = config.get('save_step', None)
    # Step threshold for the next numbered snapshot; only consulted when
    # ``save_step`` is not None.
    next_save_step = save_step

    # clean the log folder, if necessary
    try:
        os.makedirs(log_dir)
    except OSError:
        files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    if vis:
        from visdom import Visdom
        port = config.get('port', 8097)
        viz = Visdom(port=port)
        win = None

    envs = [make_env(env, seed, i, log_dir, add_timestep)
            for i in range(num_processes)]

    if num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # From here on the vectorized environments exist and must be closed again,
    # otherwise their workers outlive this call.
    try:
        if len(envs.observation_space.shape) == 1:
            envs = VecNormalize(envs, gamma=gamma)

        obs_shape = envs.observation_space.shape
        obs_shape = (obs_shape[0] * num_stack, *obs_shape[1:])

        actor_critic = Policy(obs_shape, envs.action_space, recurrent_policy)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]

        if cuda:
            actor_critic.cuda()

        clip_param = config.get('clip_param', 0.2)
        ppo_epoch = config.get('ppo_epoch', 4)
        num_mini_batch = config.get('num_mini_batch', 32)
        value_loss_coef = config.get('value_loss_coef', 0.5)
        entropy_coef = config.get('entropy_coef', 0.01)
        lr = config.get('lr', 1e-3)
        eps = config.get('eps', 1e-5)
        max_grad_norm = config.get('max_grad_norm', 0.5)
        use_gae = config.get('use_gae', False)
        tau = config.get('tau', 0.95)
        num_steps = config.get('num_steps', 100)
        num_frames = config.get('num_frames', 1e6)

        num_updates = int(num_frames) // num_steps // num_processes

        agent = algo.PPO(actor_critic, clip_param, ppo_epoch, num_mini_batch,
                         value_loss_coef, entropy_coef, lr=lr,
                         eps=eps,
                         max_grad_norm=max_grad_norm)

        rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                                  envs.action_space, actor_critic.state_size)
        current_obs = torch.zeros(num_processes, *obs_shape)

        obs = envs.reset()
        update_current_obs(obs, current_obs, obs_shape, num_stack)

        rollouts.observations[0].copy_(current_obs)

        # These variables are used to compute average rewards for all
        # processes.
        episode_rewards = torch.zeros([num_processes, 1])
        final_rewards = torch.zeros([num_processes, 1])

        if cuda:
            current_obs = current_obs.cuda()
            rollouts.cuda()

        def save_the_model(num=None):
            """Save ``[policy, ob_rms]`` into ``save_dir``.

            ``num`` is additional information: when given, it is appended to
            the file name as ``<env_name>_at_<num>.pt``.
            """
            save_path = save_dir
            # Deliberately best-effort: the directory may already exist, and
            # an empty ``save_dir`` falls through to saving in the current
            # working directory.
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            save_model = [save_model, getattr(envs, 'ob_rms', None)]
            if num is None:
                save_name = '%s.pt' % env_name
            else:
                save_name = '%s_at_%d.pt' % (env_name, int(num))
            torch.save(save_model, os.path.join(save_path, save_name))

        start = time.time()
        for j in range(1, 1 + num_updates):
            for step in range(num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, states = actor_critic.act(
                        rollouts.observations[step],
                        rollouts.states[step],
                        rollouts.masks[step])
                cpu_actions = action.squeeze(1).cpu().numpy()

                # Observe reward and next obs
                obs, reward, done, info = envs.step(cpu_actions)
                reward = torch.from_numpy(
                    np.expand_dims(np.stack(reward), 1)).float()
                episode_rewards += reward

                # If done then clean the history of observations.
                masks = torch.FloatTensor(
                    [[0.0] if done_ else [1.0] for done_ in done])
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                if cuda:
                    masks = masks.cuda()

                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs, current_obs, obs_shape, num_stack)
                rollouts.insert(current_obs, states, action, action_log_prob,
                                value, reward, masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.observations[-1],
                    rollouts.states[-1],
                    rollouts.masks[-1]).detach()

            rollouts.compute_returns(next_value, use_gae, gamma, tau)

            value_loss, action_loss, dist_entropy = agent.update(rollouts)

            rollouts.after_update()

            # ``j`` starts at 1, so this is the number of environment steps
            # consumed so far.
            total_num_steps = j * num_processes * num_steps

            if j % save_interval == 0 and save_dir != "":
                save_the_model()
                if save_step is not None and total_num_steps > next_save_step:
                    save_the_model(total_num_steps)
                    next_save_step += save_step

            if j % log_interval == 0:
                end = time.time()
                print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                      format(j, total_num_steps,
                             int(total_num_steps / (end - start)),
                             final_rewards.mean(),
                             final_rewards.median(),
                             final_rewards.min(),
                             final_rewards.max(), dist_entropy,
                             value_loss, action_loss))
            if vis and j % vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs
                    win = visdom_plot(viz, win, log_dir, env_name,
                                      'ppo', num_frames)
                except IOError:
                    pass
        # finally save model again
        save_the_model()
    finally:
        envs.close()
Пример #9
0
def main():
    """Train an actor-critic agent with the optimizer variant in ``args.algo``.

    Builds the vectorized environments and the policy, constructs the agent
    (``algo.PPO`` for ``'ppo'``, otherwise an ``algo.A2C_ACKTR`` configured
    for the selected variant), and runs ``num_updates`` rollout/update
    cycles.  Training stops early when ``agent.update`` returns an update
    signal of ``-1``.

    The function relies on names defined outside of it: ``args``,
    ``num_updates`` and ``eval_log_dir``.

    Side effects: prints progress, periodically saves ``[policy, ob_rms]``
    under ``args.save_dir``, optionally evaluates the policy and updates a
    visdom plot, pickles the recorded episode rewards to
    ``./result/<params>/result.pkl`` and copies the files in
    ``args.log_dir`` to ``./result_monitor/<params>/`` (that directory is
    wiped first).  Exits the process for an unknown ``args.algo``.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    print('args.lr')
    print(args.lr)

    # Variant-specific keyword arguments for ``algo.A2C_ACKTR``.  The values
    # are callables so that only the ``args`` attributes needed by the
    # selected variant are read.
    a2c_acktr_variants = {
        'a2c': lambda: dict(alpha=args.alpha,
                            max_grad_norm=args.max_grad_norm),
        'acktr': lambda: dict(acktr=True,
                              stat_decay=args.stat_decay),
        'acktr-h**o': lambda: dict(acktr=True,
                                   if_homo=True,
                                   stat_decay=args.stat_decay),
        'acktr-h**o-noEigen': lambda: dict(acktr=True,
                                           if_homo=True,
                                           stat_decay=args.stat_decay,
                                           if_eigen=False),
        'kbfgs': lambda: dict(kbfgs=True,
                              stat_decay=args.stat_decay),
        'kbfgs-h**o': lambda: dict(kbfgs=True,
                                   if_homo=True,
                                   stat_decay=args.stat_decay),
        'kbfgs-h**o-invertA': lambda: dict(kbfgs=True,
                                           if_homo=True,
                                           stat_decay=args.stat_decay,
                                           if_invert_A=True),
        'kbfgs-h**o-invertA-decoupledDecay': lambda: dict(
            kbfgs=True,
            if_homo=True,
            stat_decay_A=args.stat_decay_A,
            stat_decay_G=args.stat_decay_G,
            if_invert_A=True,
            if_decoupled_decay=True),
        'kbfgs-h**o-momentumGrad': lambda: dict(kbfgs=True,
                                                if_homo=True,
                                                if_momentumGrad=True,
                                                stat_decay=args.stat_decay),
        'kbfgs-h**o-noClip': lambda: dict(kbfgs=True,
                                          if_homo=True,
                                          if_clip=False,
                                          stat_decay=args.stat_decay),
    }

    if args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo in a2c_acktr_variants:
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               **a2c_acktr_variants[args.algo]())
    else:
        print('unknown args.algo for ' + args.algo)
        sys.exit()

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Rolling window of the most recent episode returns, used for logging.
    episode_rewards = deque(maxlen=10)

    # Full history of episode returns and the step count at which each one
    # was recorded; both are pickled after training.
    record_rewards = []
    record_num_steps = []

    print('num_updates')
    print(num_updates)

    total_num_steps = 0

    start = time.time()
    for j in range(num_updates):

        print('j')
        print(j)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

                    print('info[episode][r]')
                    print(info['episode']['r'])

                    record_rewards.append(info['episode']['r'])

                    # ``total_num_steps`` still holds the count from the end
                    # of the previous update, so add the steps taken in the
                    # current rollout.
                    record_num_steps.append(total_num_steps +
                                            (step + 1) * args.num_processes)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy, update_signal = agent.update(
            rollouts)

        # The agent signals with -1 that training should stop.
        if update_signal == -1:
            break

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            # The entropy and loss values used to be passed to ``format``
            # without matching placeholders and were silently dropped.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            # Evaluate with the observation statistics of the training envs.
            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                # Keep the masks on the same device as the hidden states;
                # a bare FloatTensor would live on the CPU.
                eval_masks = torch.FloatTensor(
                    [[0.0] if done_ else [1.0]
                     for done_ in done]).to(device)
                for info in infos:
                    if 'episode' in info:
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass

    print('record_rewards')
    print(record_rewards)

    # Results are grouped by environment, algorithm and hyper-parameters.
    dir_with_params = args.env_name + '/' +\
    args.algo + '/' +\
    'eps_' + str(args.eps) + '/' +\
    'lr_' + str(args.lr) + '/' +\
    'stat_decay_' + str(args.stat_decay) + '/'

    saving_dir = './result/' + dir_with_params

    os.makedirs(saving_dir, exist_ok=True)

    import pickle
    import shutil

    with open(saving_dir + 'result.pkl', 'wb') as handle:
        pickle.dump(
            {
                'record_rewards': record_rewards,
                'record_num_steps': record_num_steps
            }, handle)

    print('args.log_dir')
    print(args.log_dir)

    print('os.listdir(args.log_dir)')
    print(os.listdir(args.log_dir))

    saving_dir_monitor = './result_monitor/' + dir_with_params

    # Start from an empty directory so that the file count check at the end
    # only sees the files copied by this run.
    if os.path.isdir(saving_dir_monitor):
        shutil.rmtree(saving_dir_monitor)

    os.makedirs(saving_dir_monitor, exist_ok=True)

    print('saving_dir_monitor')
    print(saving_dir_monitor)

    for file_name in os.listdir(args.log_dir):

        full_file_name = os.path.join(args.log_dir, file_name)

        print('full_file_name')
        print(full_file_name)

        print('os.path.isfile(full_file_name)')
        print(os.path.isfile(full_file_name))

        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, saving_dir_monitor)

    # Sanity check: one copied file is expected per worker process.
    assert len(os.listdir(saving_dir_monitor)) == args.num_processes
Пример #10
0
def main():
    """Train an actor-critic agent and periodically log, evaluate and save it.

    All configuration is read from the module-level ``args``; the number of
    updates comes from the module-level ``num_updates``. The agent is chosen
    from ``args.algo`` (prefix ``a2c`` or ``ppo``, or exactly ``acktr``) and is
    wrapped in ``algo.SIL`` together with a ``ReplayStorage`` when
    ``args.algo`` ends with ``sil``.

    Every update collects ``args.num_steps`` steps from the vectorised
    environments, computes returns and calls ``agent.update``. Depending on
    the configured intervals the loop then prints training statistics, runs a
    deterministic evaluation on freshly created environments, writes a
    checkpoint into ``save_dir`` and refreshes the Visdom plot.

    Raises:
        ValueError: if ``args.algo`` does not select any known algorithm.
    """
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    # Timestamped experiment name handed to setup_dirs.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")
    experiment_name = args.env_name + '-' + args.algo + '-' + timestamp
    log_dir, eval_log_dir, save_dir = setup_dirs(experiment_name, args.log_dir,
                                                 args.save_dir)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         log_dir,
                         args.add_timestep,
                         device,
                         False,
                         frame_skip=args.frame_skip)

    if args.load_path:
        # Resume from a checkpoint saved as [model, ob_rms] (see the save
        # step at the bottom of the training loop).
        actor_critic, _ob_rms = torch.load(args.load_path)
        vec_norm = get_vec_normalize(envs)
        if vec_norm is not None:
            vec_norm.train()
            vec_norm.ob_rms = _ob_rms
        actor_critic.train()
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              beta=args.beta_dist,
                              base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               lr_schedule=args.lr_schedule,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         lr_schedule=args.lr_schedule,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail early with a clear message instead of a NameError on ``agent``
        # further down.
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         beta=args.sil_beta,
                         value_loss_coef=args.sil_value_loss_coef,
                         entropy_coef=args.sil_entropy_coef)
        replay = ReplayStorage(10000,
                               num_processes=args.num_processes,
                               gamma=args.gamma,
                               prio_alpha=args.sil_alpha,
                               obs_shape=envs.observation_space.shape,
                               action_space=envs.action_space,
                               recurrent_hidden_state_size=actor_critic.
                               recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

    # Action bounds as tensors on the training device, used for clipping.
    action_high = torch.from_numpy(envs.action_space.high).to(device)
    action_low = torch.from_numpy(envs.action_space.low).to(device)
    action_mid = 0.5 * (action_high + action_low)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Sliding windows over the most recent finished episodes.
    episode_rewards = deque(maxlen=10)
    benchmark_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            if args.clip_action and isinstance(envs.action_space,
                                               gym.spaces.Box):
                clipped_action = action.clone()
                if args.shift_action:
                    # FIXME experimenting with this, so far resulting in
                    # faster learning when clipping gaussian continuous
                    # output (vs leaving centred at 0 and unscaled)
                    clipped_action = 0.5 * clipped_action + action_mid
                clipped_action = torch.max(
                    torch.min(clipped_action, action_high), action_low)
            else:
                clipped_action = action

            # act in environment and observe
            obs, reward, done, infos = envs.step(clipped_action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    if 'rb' in info['episode']:
                        benchmark_rewards.append(info['episode']['rb'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # The unclipped action is what gets stored for the update.
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step], action,
                              reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        # np.mean of an empty deque emits a RuntimeWarning on every update
        # until the first episode finishes; both users of train_eprew below
        # are already guarded on a non-empty window.
        if episode_rewards:
            train_eprew = np.mean(episode_rewards)
        else:
            train_eprew = float('nan')

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            # NOTE(review): the format string has no placeholders for
            # dist_entropy / value_loss / action_loss, so str.format silently
            # ignores those three trailing arguments.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} episodes: mean/med {:.1f}/{:.1f}, min/max reward {:.2f}/{:.2f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), train_eprew,
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end='')
            if len(benchmark_rewards):
                print(", benchmark {:.1f}/{:.1f}, {:.1f}/{:.1f}".format(
                    np.mean(benchmark_rewards), np.median(benchmark_rewards),
                    np.min(benchmark_rewards), np.max(benchmark_rewards)),
                      end='')
            print()

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # NOTE(review): unlike the training envs, frame_skip is not
            # forwarded here - confirm that is intended.
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                # Reuse the training normalisation statistics for evaluation.
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                clipped_action = action
                if args.clip_action and isinstance(envs.action_space,
                                                   gym.spaces.Box):
                    if args.shift_action:
                        clipped_action = 0.5 * clipped_action + action_mid
                    clipped_action = torch.max(
                        torch.min(clipped_action, action_high), action_low)

                obs, reward, done, infos = eval_envs.step(clipped_action)

                # Build the masks on the same device as the policy inputs;
                # torch.FloatTensor always allocates on the CPU, which does
                # not match the device-resident hidden states when training
                # on CUDA.
                eval_masks = torch.tensor(
                    [[0.0] if done_ else [1.0] for done_ in done],
                    dtype=torch.float32,
                    device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            eval_eprew = np.mean(eval_episode_rewards)
            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), eval_eprew))

        if len(episode_rewards
               ) and j % args.save_interval == 0 and save_dir != "":
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Checkpoint layout: [model, observation normalisation stats].
            save_model = [
                save_model,
                getattr(get_vec_normalize(envs), 'ob_rms', None)
            ]

            # Encode the (truncated) mean reward in the file name; a leading
            # minus sign is written as "n".
            ep_rewstr = ("%d" % train_eprew).replace("-", "n")
            save_filename = os.path.join(
                save_dir, './checkpoint-%d-%s.pt' % (j, ep_rewstr))

            torch.save(save_model, save_filename)

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, args.env_name, args.algo,
                                  args.num_frames)
            except IOError:
                pass
Пример #11
0
def main():
    """Train PPO agent(s) built from ``configs`` / ``args.config``.

    All settings come from the module-level ``args``; the number of updates
    comes from the module-level ``num_updates``. One model is created per
    entry in ``args.saved_models`` (or ``args.nagents`` fresh ones), each
    wrapped in ``ppo_agent.PPOAgent``. Only the ``simple`` and ``homogenous``
    training modes run; ``heterogenous`` prints a message and returns.

    Each update collects ``args.num_steps`` steps from the vectorised
    environments, computes advantages, runs ``args.ppo_epoch`` epochs of the
    clipped-surrogate PPO update per agent, and then, depending on the
    configured intervals, saves checkpoints, prints statistics and refreshes
    the Visdom plot.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # Instantiate the environment
    config = getattr(configs, args.config)()

    # We make this in order to get the shapes.
    dummy_env = make_env(args, config, -1,
                         [config['agent'](game_type=config['game_type'])])()
    # Drop the leading dimension of the observation space shape.
    # NOTE(review): presumably that leading axis indexes agents - confirm.
    envs_shape = dummy_env.observation_space.shape[1:]
    obs_shape = (envs_shape[0], *envs_shape[1:])
    action_space = dummy_env.action_space
    # Model factories; the saved_model argument is currently ignored (see the
    # TODO in the loop below).
    # NOTE(review): with a 3-D observation and args.model being neither
    # 'convnet' nor 'resnet', actor_critic is never bound.
    if len(envs_shape) == 3:
        if args.model == 'convnet':
            actor_critic = lambda saved_model: PommeCNNPolicySmall(
                obs_shape[0], action_space, args)
        elif args.model == 'resnet':
            actor_critic = lambda saved_model: PommeResnetPolicy(
                obs_shape[0], action_space, args)
    else:
        actor_critic = lambda saved_model: MLPPolicy(obs_shape[0], action_space
                                                     )

    # We need to get the agent = config.agent(agent_id, config.game_type) and then
    # pass that agent into the agent.PPOAgent
    training_agents = []
    saved_models = args.saved_models
    # Comma-separated list of model paths, or one None per agent when unset.
    saved_models = saved_models.split(
        ',') if saved_models else [None] * args.nagents
    assert (len(saved_models)) == args.nagents
    for saved_model in saved_models:
        # TODO: implement the model loading.
        model = actor_critic(saved_model)
        agent = config['agent'](game_type=config['game_type'])
        agent = ppo_agent.PPOAgent(agent, model)
        training_agents.append(agent)

    # NOTE(review): any other value of args.how_train leaves
    # num_training_per_episode unbound.
    if args.how_train == 'simple':
        # Simple trains a single agent against three SimpleAgents.
        assert (
            args.nagents == 1), "Simple training should have a single agent."
        num_training_per_episode = 1
    elif args.how_train == 'homogenous':
        # Homogenous trains a single agent against itself (self-play).
        assert (args.nagents == 1
                ), "Homogenous toraining should have a single agent."
        num_training_per_episode = 4
    elif args.how_train == 'heterogenous':
        assert (args.nagents >
                1), "Heterogenous training should have more than one agent."
        print("Heterogenous training is not implemented yet.")
        return

    # NOTE: Does this work correctly? Will the threads operate independently?
    envs = [
        make_env(args, config, i, training_agents)
        for i in range(args.num_processes)
    ]
    envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs)
    # TODO: Figure out how to render this for testing purposes. The following link may help:
    # https://github.com/MG2033/A2C/blob/master/envs/subproc_vec_env.py

    for agent in training_agents:
        agent.initialize(args, obs_shape, action_space,
                         num_training_per_episode)

    current_obs = torch.zeros(num_training_per_episode, args.num_processes,
                              *obs_shape)

    def update_current_obs(obs):
        # NOTE(review): this assignment binds a name local to this helper
        # (there is no ``nonlocal``), so the enclosing ``current_obs`` is
        # never changed by calling it. Confirm the intended tensor layout
        # before fixing.
        current_obs = torch.from_numpy(obs).float()

    obs = envs.reset()
    update_current_obs(obs)
    # Both supported modes seed the rollouts of the single training agent.
    if args.how_train == 'simple':
        training_agents[0].update_rollouts(obs=current_obs, timestep=0)
    elif args.how_train == 'homogenous':
        training_agents[0].update_rollouts(obs=current_obs, timestep=0)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_training_per_episode, args.num_processes, 1])
    final_rewards = torch.zeros(
        [num_training_per_episode, args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        for agent in training_agents:
            agent.cuda()

    stats = utils.init_stats(args)
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Per-step outputs, one entry per acting agent.
            value_agents = []
            action_agents = []
            action_log_prob_agents = []
            states_agents = []
            episode_reward = []
            cpu_actions_agents = []

            if args.how_train == 'simple':
                value, action, action_log_prob, states = training_agents[
                    0].act_pytorch(step, 0)
                value_agents.append(value)
                action_agents.append(action)
                action_log_prob_agents.append(action_log_prob)
                states_agents.append(states)
                cpu_actions = action.data.squeeze(1).cpu().numpy()
                cpu_actions_agents = cpu_actions
            elif args.how_train == 'homogenous':
                # One action list per process, filled with the actions of the
                # four agent slots driven by the same policy.
                cpu_actions_agents = [[] for _ in range(args.num_processes)]
                for i in range(4):
                    value, action, action_log_prob, states = training_agents[
                        0].act_pytorch(step, i)
                    value_agents.append(value)
                    action_agents.append(action)
                    action_log_prob_agents.append(action_log_prob)
                    states_agents.append(states)
                    cpu_actions = action.data.squeeze(1).cpu().numpy()
                    for num_process in range(args.num_processes):
                        cpu_actions_agents[num_process].append(
                            cpu_actions[num_process])

            obs, reward, done, info = envs.step(cpu_actions_agents)
            # Swap the first two axes of the stacked rewards before
            # accumulating them into episode_rewards.
            reward = torch.from_numpy(np.stack(reward)).float().transpose(0, 1)
            episode_rewards += reward

            # import pdb; pdb.set_trace()
            # Masks are 0.0 where the episode finished and 1.0 otherwise,
            # repeated num_training_per_episode times per process.
            if args.how_train == 'simple':
                masks = torch.FloatTensor(
                    [[0.0] * num_training_per_episode if done_ else [1.0] *
                     num_training_per_episode for done_ in done])
            elif args.how_train == 'homogenous':
                masks = torch.FloatTensor(
                    [[0.0] * num_training_per_episode if done_ else [1.0] *
                     num_training_per_episode
                     for done_ in done]).transpose(0, 1)

            # On episode end, move the accumulated reward into final_rewards
            # and reset the running accumulator.
            final_rewards *= masks  # nagents x nprocesses x 1
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()

            reward_all = reward.unsqueeze(2)
            if args.how_train == 'simple':
                masks_all = masks.transpose(0, 1).unsqueeze(2)
            elif args.how_train == 'homogenous':
                masks_all = masks.unsqueeze(2)

            # Zero the observations of finished episodes (masks broadcast
            # over the trailing observation dimensions).
            current_obs *= masks_all.unsqueeze(2).unsqueeze(2)
            update_current_obs(obs)

            # Stack the per-agent outputs collected above into single tensors.
            states_all = torch.from_numpy(
                np.stack([x.data for x in states_agents])).float()
            action_all = torch.from_numpy(
                np.stack([x.data for x in action_agents])).float()
            action_log_prob_all = torch.from_numpy(
                np.stack([x.data for x in action_log_prob_agents])).float()
            value_all = torch.from_numpy(
                np.stack([x.data for x in value_agents])).float()

            if args.how_train in ['simple', 'homogenous']:
                training_agents[0].insert_rollouts(step, current_obs,
                                                   states_all, action_all,
                                                   action_log_prob_all,
                                                   value_all, reward_all,
                                                   masks_all)

        # Bootstrap values for the last stored step, then advantages.
        next_value_agents = []
        if args.how_train == 'simple':
            agent = training_agents[0]
            next_value_agents.append(agent.run_actor_critic(-1, 0))
            advantages = [
                agent.compute_advantages(next_value_agents, args.use_gae,
                                         args.gamma, args.tau)
            ]
        elif args.how_train == 'homogenous':
            agent = training_agents[0]
            next_value_agents = [
                agent.run_actor_critic(-1, num_agent) for num_agent in range(4)
            ]
            advantages = [
                agent.compute_advantages(next_value_agents, args.use_gae,
                                         args.gamma, args.tau)
            ]

        # Losses of the last minibatch of each agent, kept for logging.
        final_action_losses = []
        final_value_losses = []
        final_dist_entropies = []

        for num_agent, agent in enumerate(training_agents):
            for _ in range(args.ppo_epoch):
                data_generator = agent.feed_forward_generator(
                    advantages[num_agent], args)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = agent.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    # PPO clipped surrogate objective: take the minimum of
                    # the unclipped and clipped probability-ratio terms.
                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()
                    # Mean squared error between returns and predicted values.
                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()
                    agent.optimize(value_loss, action_loss, dist_entropy,
                                   args.entropy_coef, args.max_grad_norm)

            # Only the values from the final minibatch of the final epoch
            # are recorded here.
            final_action_losses.append(action_loss)
            final_value_losses.append(value_loss)
            final_dist_entropies.append(dist_entropy)

            agent.after_update()

        #####
        # Save model.
        #####
        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                # Raised when the directory already exists; other OSErrors
                # are swallowed here as well.
                pass

            # XXX: new way for saving model
            # XXX: we should also add the optimizer along with the state_dict
            # One checkpoint file per agent; the name carries no update
            # index, so each save overwrites the previous one.
            for num_agent, agent in enumerate(training_agents):
                save_model = agent.get_model()
                save_optimizer = agent.get_optimizer()
                torch.save(
                    {
                        'epoch': j,
                        'arch': args.model,
                        'state_dict': save_model.state_dict(),
                        'optimizer': save_optimizer.state_dict(),
                    },
                    os.path.join(
                        save_path,
                        "train={}-config={}-model={}-agent={}.pt".format(
                            args.how_train, args.config, args.model,
                            num_agent)))

        #####
        # Log to console.
        #####
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            # Entropy and losses are averaged over the training agents.
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, avg entropy {:.5f}, avg value loss {:.5f}, avg policy loss {:.5f}"
                .format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(),
                    np.mean([
                        dist_entropy.data[0]
                        for dist_entropy in final_dist_entropies
                    ]),
                    np.mean([
                        value_loss.data[0] for value_loss in final_value_losses
                    ]),
                    np.mean([
                        action_loss.data[0]
                        for action_loss in final_action_losses
                    ])))

            # save stats to h5 file
            # TODO: need to fix this error
            # stats = utils.log_stats(args, stats, j, int(total_num_steps / (end - start)), \
            #     final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), \
            #     np.mean([action_loss.data[0] for action_loss in final_action_losses]), \
            #     np.mean([value_loss.data[0] for value_loss in final_value_losses]), \
            #     np.mean([dist_entropy.data[0] for dist_entropy in final_dist_entropies]))
            #
            # log_path = os.path.join(args.log_dir)
            # filename_stats = '%s/stats.h5' % log_path
            # utils.save_dict(filename_stats, stats)

        #####
        # Log to Visdom.
        #####
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, 'ppo')
            except IOError:
                pass
def main():
    """Train an A2C / PPO / ACKTR agent, keeping the best-median checkpoint.

    All settings are read from the module-level ``args``; the number of
    updates comes from the module-level ``num_updates``. Environments are
    built with ``SubprocVecSonicEnv`` when ``args.retro_contest`` is set and
    more than one process is requested, otherwise from ``make_env`` thunks
    wrapped in ``SubprocVecEnv`` (several processes) or ``DummyVecEnv``.

    Each update collects ``args.num_steps`` steps, computes returns and calls
    ``agent.update``. A checkpoint ``[model, ob_rms, median_reward]`` is
    written only when the median of the last finished-episode rewards
    exceeds the median stored with the previous checkpoint.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    if args.num_processes > 1 and args.retro_contest == True:
        import json
        # Close the config file deterministically instead of leaking the
        # handle returned by a bare open().
        with open(args.sonic_config_file, 'r') as conf_file:
            sonic_env_confs = list(json.load(conf_file)['Train'].values())
        envs = SubprocVecSonicEnv(sonic_env_confs, args.num_processes)
    else:
        env_fns = [
            make_env(args.env_name, args.seed, i, args.log_dir,
                     args.add_timestep) for i in range(args.num_processes)
        ]
        if args.num_processes > 1:
            envs = SubprocVecEnv(env_fns)
        else:
            envs = DummyVecEnv(env_fns)

    # Only 1-D observation spaces are wrapped in VecNormalize.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Frame stacking multiplies the first observation dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    prev_saved_rew_median = float('-inf')
    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)
    if args.load_model:
        model_path = os.path.join(args.save_dir, args.algo,
                                  args.env_name) + ".pt"
        # NOTE(review): the loaded ob_rms is not applied to ``envs`` -
        # confirm whether normalisation statistics should be restored.
        actor_critic, ob_rms, prev_saved_rew_median = torch.load(model_path)
        print("Loaded actor_critic model from:", model_path,
              "which got a median score of:", prev_saved_rew_median)

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack left and write ``obs`` into the last slot."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # On episode end, move the accumulated reward into final_rewards
            # and reset the running accumulator.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero the stacked frames of finished episodes; 4-D observations
            # need the mask broadcast over two extra trailing dimensions.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and final_rewards.median(
        ) > prev_saved_rew_median and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # final_rewards does not change during the save, so compute the
            # median once and reuse it.
            median_reward = final_rewards.median()

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Checkpoint layout matches what the load branch above unpacks.
            save_model = [
                save_model,
                getattr(envs, 'ob_rms', None),
                median_reward
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            prev_saved_rew_median = median_reward
            # Save a separate copy just in case the main saved model ends up
            # being worse. Helps to have a few saved models to choose from
            # at test/runtime.
            torch.save(
                save_model,
                os.path.join(save_path,
                             args.env_name + str(median_reward) + '.pt'))
            print("Saved the state which got a median reward of",
                  prev_saved_rew_median)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Пример #13
0
def main():
    """Train an OptionCritic policy on the fixed test environment with PPO.

    Relies on the module-level names ``args``, ``num_updates``,
    ``num_options`` and ``writer``; none of them is defined here.
    Each update:

    1. collects ``args.num_steps`` transitions per worker, sampling a new
       option for a worker whenever its current option is unset (``-1``)
       and unsetting the option when the termination head emits 1;
    2. bootstraps returns from the last stored observation;
    3. runs ``args.ppo_epoch`` passes of per-step clipped-surrogate updates
       that also include value, option-selection and termination terms;
    4. periodically checkpoints, logs and plots.

    Raises:
        NotImplementedError: for non-image observation spaces and for any
            ``args.algo`` other than ``'ppo'``.
    """
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # Every worker runs the same fixed test environment "000".
    envs = [lambda: get_test_env("000") for _ in range(args.num_processes)]
    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = OptionCritic(num_options, obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        # Only image-shaped (3-D) observations have a controller here.
        raise NotImplementedError()

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    else:
        # 'a2c' and 'acktr' were already rejected; any other value used to
        # fail much later with a NameError, so reject it up front as well.
        raise NotImplementedError(
            "only the 'ppo' algorithm is implemented, got {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size, num_options)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack in ``current_obs`` and append ``obs``."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    def volatile_inputs(index):
        """Return (observation, state, mask) at ``index`` as no-grad Variables."""
        return (Variable(rollouts.observations[index], volatile=True),
                Variable(rollouts.states[index], volatile=True),
                Variable(rollouts.masks[index], volatile=True))

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    # Same tensor type as before when running on CUDA; on CPU this avoids
    # forcing tensors onto a GPU that is not in use.
    long_type = torch.cuda.LongTensor if args.cuda else torch.LongTensor

    # Step counter for the summary writer.  It has to be initialised here:
    # incrementing it below makes it a local name of this function.
    plot_index = 0

    start = time.time()
    for j in range(num_updates):
        # -1 marks "no active option"; every worker re-selects at the start
        # of each update.
        options = [-1] * args.num_processes
        for step in range(args.num_steps):
            # Choose option (only applied to workers without an active one)
            _, new_option, _, states = actor_critic.get_option(*volatile_inputs(step))
            for i in range(args.num_processes):
                if options[i] == -1:
                    options[i] = new_option[i].data[0]

            # Sample actions
            value, action, action_log_prob, states = actor_critic.get_output(
                options, *volatile_inputs(step))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Termination
            _, termination, _, _ = actor_critic.get_termination(
                options, *volatile_inputs(step))
            termination = torch.LongTensor([termination[i].data[0] for i in range(termination.shape[0])])

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks, options, termination)

            # A terminated option is unset so that it is re-selected next.
            for i in range(termination.shape[0]):
                if termination[i] == 1:
                    options[i] = -1

        # Re-select only the options that terminated on the last step.  The
        # assignment used to sit outside the `if`, which overwrote the
        # still-active options of every worker as well.
        if any(option == -1 for option in options):
            _, new_option, _, _ = actor_critic.get_option(*volatile_inputs(step))
            for i in range(args.num_processes):
                if options[i] == -1:
                    options[i] = new_option[i].data[0]
        rollouts.options[step+1].copy_(torch.LongTensor(options))
        next_value = actor_critic.get_output(options, *volatile_inputs(-1))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        for _ in range(args.ppo_epoch):
            for i in range(args.num_steps):
                # Get the ith step during exploration
                step_options = rollouts.options[i]
                adv_targ = Variable(advantages[i])
                old_action_log_probs = rollouts.action_log_probs[i]
                termination = rollouts.optionSelection[i]

                # Use critic value of option nn to update option parameters
                values, action_log_probs, dist_entropy, states = actor_critic.evaluate_option(
                    Variable(rollouts.observations[i]),
                    Variable(rollouts.states[i]),
                    Variable(rollouts.masks[i]),
                    Variable(rollouts.actions[i]), step_options)
                ratio = torch.exp(action_log_probs - Variable(old_action_log_probs))
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)
                value_loss = (Variable(rollouts.returns[i]) - values).pow(2).mean()

                selection_log_prob = actor_critic.evaluate_selection(
                    Variable(rollouts.observations[i]),
                    Variable(rollouts.states[i]),
                    Variable(rollouts.masks[i]),
                    Variable(termination),
                    Variable(rollouts.options[i].type(long_type)))
                V_Omega = selection_log_prob * values

                # Update termination parameters
                termination_log_prob = actor_critic.evaluate_termination(
                    Variable(rollouts.observations[i]),
                    Variable(rollouts.states[i]),
                    Variable(rollouts.masks[i]),
                    Variable(termination.type(long_type)),
                    rollouts.options[i+1])

                # Pair the two value estimates per worker depending on the
                # recorded termination flag.  A separate index name keeps
                # the step index `i` intact.
                left_values = []
                right_values = []
                for proc in range(args.num_processes):
                    flag = int(termination[proc])
                    if flag == 1:
                        left_values.append(V_Omega[proc])
                        right_values.append(values[proc])
                    elif flag == 0:
                        left_values.append(values[proc])
                        right_values.append(V_Omega[proc])
                left_values = torch.cat(left_values)
                right_values = torch.cat(right_values)
                termination_loss = (- torch.exp(termination_log_prob) * left_values - (1 - torch.exp(termination_log_prob)) * right_values).mean()

                optimizer.zero_grad()
                (action_loss + value_loss + termination_loss - V_Omega.mean()).backward()
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                # Apply the clipped gradients; without this call the
                # parameters were never updated.
                optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
            # NOTE(review): `writer` is presumably a module-level summary
            # writer (tensorboardX-style) - confirm.  The method name is
            # `add_scalar`; `add_scaler` was a typo.
            writer.add_scalar("final_reward_max", final_rewards.max(), plot_index)
            plot_index += 1
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                print("hit")
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
	def train_maml(self, num_updates):
		"""Run ``num_updates`` meta-updates over randomly generated friction tasks.

		A pool of 1000 tasks is drawn once up front.  Every update samples 10
		of them (with replacement) and makes two passes over that sample:

		* first pass: reset the model to ``theta``, call ``self.run()`` on the
		  task and remember the resulting weights per sample slot;
		* second pass: call ``self.meta_run`` with the slot's remembered
		  weights and a deep copy of ``theta``, then re-read ``theta`` from
		  the model.

		Checkpoints, console logs and visdom plots are emitted at the
		intervals configured on ``self.args``.

		Args:
			num_updates: number of outer-loop iterations to perform.
		"""
		start = time.time()
		adapted_weights = []

		num_tasks = 1000
		sample_size = 10
		friction_fmt = '{0:.1f} {1:.1f} {2:.1f}'

		def make_task():
			"""Draw one task: a default friction and an 'fthigh' capsule friction."""
			# The integer draw comes first, then the uniform draw, so the
			# random stream is consumed in a fixed order per task.
			friction = np.random.randint(low=1, high=10, size=3).astype('float32') / 10.
			friction_1 = np.random.uniform(low=0.1, high=0.8, size=3).astype('float32')
			return {
				'default/geom': ['', 'friction', friction_fmt.format(*friction)],
				'worldbody/body/body/geom': [
					[['name', 'fthigh'], ['type', 'capsule']],
					'friction',
					friction_fmt.format(*friction_1),
				],
			}

		def configure_env(task):
			"""Pass one task's tag/identifier/attribute/value lists to ``my_init``."""
			env = self.envs.venv.envs[0]
			tag_names = []
			tag_identifiers = []
			attributes = []
			values = []
			for tag, (identifier, attribute, value) in task.items():
				tag_names.append(tag)
				tag_identifiers.append(identifier)
				attributes.append(attribute)
				values.append(value)
			env.env.env.my_init(tag_names, tag_identifiers, attributes, values, None)

		# Create the variations needed
		task_pool = [make_task() for _ in range(num_tasks)]

		for j in range(num_updates):
			chosen = np.random.randint(0, num_tasks, size=sample_size)

			# The starting point is read from the model only once; afterwards
			# it is whatever the previous meta pass left behind.
			if j == 0:
				theta = self.get_weights()

			# Inner loop, first gradient: adapt from theta on every sampled task.
			for slot, task_index in enumerate(chosen):
				configure_env(task_pool[task_index])

				# Set the model weights to theta before training
				self.set_weights(theta)
				dist_entropy, value_loss, action_loss = self.run()

				if j != 0:
					print(slot)
					adapted_weights[slot] = self.get_weights()
				else:
					adapted_weights.append(self.get_weights())

			# Second gradient: meta pass against a frozen copy of theta.
			theta_snapshot = deepcopy(theta)
			for slot, task_index in enumerate(chosen):
				configure_env(task_pool[task_index])

				# TODO: There should be no while loop
				dist_entropy, value_loss, action_loss = self.meta_run(
					adapted_weights[slot], theta_snapshot)
				theta = self.get_weights()

			if j % self.args.save_interval == 0 and self.args.save_dir != "":
				save_path = os.path.join(self.args.save_dir, self.args.algo)
				try:
					os.makedirs(save_path)
				except OSError:
					pass

				checkpoint = {
					'num_updates': j,
					'state_dict': self.actor_critic.state_dict(),
					'optimizer': self.meta_optimizer.state_dict(),
				}
				ob_rms = getattr(self.envs, 'ob_rms', None) or None
				checkpoint_name = self.args.env_name + 'update_' + str(j) + ".pt"
				torch.save([checkpoint, ob_rms], os.path.join(save_path, checkpoint_name))

			if j % self.args.log_interval == 0:
				end = time.time()
				total_num_steps = (j + 1) * self.args.num_processes * self.args.num_steps
				fps = int(total_num_steps / (end - start))
				message = "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format(
					j, total_num_steps, fps,
					self.final_rewards.mean(),
					self.final_rewards.median(),
					self.final_rewards.min(),
					self.final_rewards.max(),
					dist_entropy.data[0],
					value_loss.data[0],
					action_loss.data[0])
				print(message)

			if self.args.vis and j % self.args.vis_interval == 0:
				try:
					# Sometimes monitor doesn't properly flush the outputs
					self.win = visdom_plot(
						self.viz, self.win, self.args.log_dir,
						self.args.env_name, self.args.algo)
				except IOError:
					pass
Пример #15
0
def main():
    """Train an actor with a separate Q-critic, target networks and a replay buffer.

    Relies on the module-level names ``args`` and ``num_updates`` and on the
    project helpers ``to_tensor``, ``criterion`` and ``soft_update``; none of
    them is defined here.  Each update:

    1. rolls the policy out for ``args.num_steps`` steps, storing every
       transition in both the rollout storage and the replay buffer;
    2. samples a batch of 5 transitions from the replay buffer, regresses
       the critic onto a bootstrapped target computed with the target
       networks, and updates the actor to maximise the critic's value;
    3. soft-updates both target networks;
    4. periodically checkpoints, logs and plots.

    NOTE(review): the critic, the target networks and both optimizers are
    only created for image observations with ``args.algo == 'a2c'``; any
    other configuration still fails with ``NameError`` further down.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if args.cuda:
        actor_critic.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
        critic_optim = optim.Adam(critic.parameters(), lr=1e-4)
        # Interpolation factor handed to soft_update for the target networks.
        tau = 0.001

    mem_buffer = ReplayBuffer()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack in ``current_obs`` and append ``obs``."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            # Snapshot the pre-step observation: on CPU, `.numpy()` returns
            # a view of the rollout storage, which later inserts overwrite.
            pre_state = rollouts.observations[step].cpu().numpy().copy()
            update_current_obs(obs)
            # `current_obs` is mutated in place on every step, so the buffer
            # gets its own copy instead of a reference to the live tensor.
            mem_buffer.add((pre_state, current_obs.clone(),
                            action_log_prob.data.cpu().numpy(), reward, done))
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        action, action_log_prob, states = actor_critic.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))

        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # ---- Off-policy update from a replay-buffer sample ----
        state, next_state, action, reward, done = mem_buffer.sample(5)
        next_state = next_state.reshape([-1, *obs_shape])
        state = state.reshape([-1, *obs_shape])
        # NOTE(review): 6 presumably equals envs.action_space.n for the
        # environment this was written for - confirm before using another.
        action = action.reshape([-1, 6])
        next_q_values = critic_target(
            to_tensor(next_state, volatile=True),
            target_actor(to_tensor(next_state, volatile=True),
                         to_tensor(next_state, volatile=True),
                         to_tensor(next_state, volatile=True))[0])
        next_q_values.volatile = False
        # `np.float` was removed from NumPy; the builtin `float` is the same
        # dtype.
        # NOTE(review): the bootstrap term is scaled by `done` itself.  A TD
        # target normally uses (1 - done); confirm whether
        # ReplayBuffer.sample already returns the inverted flag.
        target_q_batch = to_tensor(reward) + args.gamma * to_tensor(
            done.astype(float)) * next_q_values

        # Critic step
        critic.zero_grad()
        q_batch = critic(to_tensor(state), to_tensor(action))
        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        critic_optim.step()

        # Actor step: maximise the critic's value of the actor's output.
        actor_critic.zero_grad()
        policy_loss = -critic(
            to_tensor(state),
            actor_critic(to_tensor(state), to_tensor(state),
                         to_tensor(state))[0])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if args.algo == 'a2c':
            nn.utils.clip_grad_norm(actor_critic.parameters(),
                                    args.max_grad_norm)
        optimizer.step()

        soft_update(target_actor, actor_critic, tau)
        soft_update(critic_target, critic, tau)

        # A disabled on-policy A2C/ACKTR update used to sit here as a bare
        # string literal; it was removed as dead code.
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Пример #16
0
            next_state, reward, done, _ = env.step(use_action)
            episode_reward += reward

            next_state = torch.Tensor([next_state])

            state = next_state
            if done:
                break

        #writer.add_scalar('reward/test', episode_reward, i_episode)
        '''
        end = time.time()
        total_num_steps = step * args.num_processes
        rewards.append(episode_reward)
        #print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))
        print(
            "Num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}, entropy {:.5f}"
            .format(total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(), value_loss,
                    policy_loss, entropy.item()))

    if args.vis and step % args.log_interval == 0 and len(
            memory) > args.warmup:
        try:
            win = visdom_plot(viz, win, args.log_dir, args.env_name,
                              'disc_ddpg', args.num_frames)
        except IOError:
            pass
env.close()
Пример #17
0
def main():
    """Train an actor-critic agent and adaptively rescale the policy.

    Builds ``args.num_processes`` environments, creates (or loads) the policy,
    and then runs ``num_updates`` iterations of: collect ``args.num_steps``
    transitions per process, compute returns, update the agent.

    On top of the plain training loop, every ``args.adaptive_interval``
    updates an exponential moving average of the mean final reward is
    tracked.  When its best value has not improved for more than
    ``args.tolerance`` checks, the policy is rescaled by ``args.cinc`` or
    ``args.cdec`` and the agent is reinitialized, or training stops (see the
    branch comments below).  For 100 intervals after each rescale the actor
    is frozen (``update_actor=False``).

    Reads the module-level names ``args`` and ``num_updates``; also appends
    to ``R_ts``, which is not defined inside this function.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    # viz / win only exist when plotting is enabled; every later use is
    # guarded by the same args.vis check.
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Only 1-D observation spaces get the VecNormalize wrapper.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    # Stacked observations: the first dimension is multiplied by num_stack.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.load_model is not None:
        # The checkpoint is indexable; element 0 is used as the policy.
        actor_critic = torch.load(args.load_model)[0]
    else:
        actor_critic = Policy(obs_shape, envs.action_space,
                              args.recurrent_policy, args.hidden_size, args)

    # NOTE(review): action_shape is computed but never read in this function.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    # NOTE(review): there is no else branch, so any other args.algo value
    # leaves `agent` unbound and fails later with NameError.
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               pop_art=args.pop_art)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    obs = envs.reset()
    update_current_obs(obs, current_obs, obs_shape, args.num_stack)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    # episode_rewards accumulates the running episode; final_rewards holds the
    # total of the most recently finished episode of each process.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    # Running product of every rescale factor applied so far (only logged).
    scale = 1.
    # NOTE(review): current_pdrr is only referenced by commented-out code and
    # last_update is never read.
    current_pdrr = [0., 0.]
    last_update = 0

    ### Parameters for adaptive reward scaling ###
    t_stop = 0  # checks since the moving average last improved
    beta = .99  # decay of the moving average m_t
    R_prev = -1e9  # best moving average of the previous scaling phase
    m_max = -1e9  # best moving average of the current scaling phase
    m_t = 0  # moving average of the mean final reward
    reverse = False  # set once a decrease (cdec) step has been forced

    # Interval index of the last rescale; -1e9 means "never rescaled".
    last_scale_t = -1e9
    ###

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # reward *= args.reward_scaling

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            # masks is 0.0 for processes whose episode just ended, else 1.0.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # For finished episodes: move the accumulated return into
            # final_rewards and reset the accumulator; others are unchanged.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero the stacked observation of finished episodes; 4-D
            # observations need two extra trailing axes on the mask to
            # broadcast.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs, obs_shape, args.num_stack)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        # Value of the last stored observation, passed to compute_returns.
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # Index of the current adaptive-scaling interval.
        t = j // args.adaptive_interval
        if args.pop_art:
            value_loss, action_loss, dist_entropy = agent.pop_art_update(
                rollouts)
        else:
            # The actor is only updated once more than 100 intervals have
            # passed since the last rescale; before that the update is called
            # with update_actor=False.
            if t - last_scale_t > 100:
                value_loss, action_loss, dist_entropy = agent.update(
                    rollouts, update_actor=True)
            else:
                value_loss, action_loss, dist_entropy = agent.update(
                    rollouts, update_actor=False)

        # During the post-rescale window, raise the gradient-norm limit by
        # 1e-5 per update until it reaches 0.5.
        if agent.max_grad_norm < .5 and t - last_scale_t < 100:
            agent.max_grad_norm += 0.00001

        # Adaptive-scaling check: once per interval, never at j == 0 and never
        # inside the post-rescale window.
        if j % args.adaptive_interval == 0 and j and t - last_scale_t > 100:
            t = j // args.adaptive_interval

            R_t = float('{}'.format(final_rewards.mean()))
            # NOTE(review): R_ts is not assigned anywhere in this function; it
            # presumably is a module-level list -- confirm, otherwise this
            # raises NameError.
            R_ts.append(R_t)
            assert type(R_t) == float
            t_stop += 1
            # Exponential moving average of the mean reward, divided by
            # (1 - beta**t) as a bias correction.
            # NOTE(review): t is the global interval index, while m_t is reset
            # to 0 after every rescale, so the correction does not restart
            # with the average -- confirm this is intended.
            m_t = beta * m_t + (1 - beta) * R_t
            m_hat = m_t / (1 - beta**t)
            print('m_hat :{}, t_stop: {}'.format(m_hat, t_stop))
            print('agent.max_grad_norm, ', agent.max_grad_norm)
            if m_hat > m_max:
                m_max = m_hat
                t_stop = 0
            # No improvement for more than args.tolerance checks: compare the
            # best average of this phase (m_max) with the previous phase's
            # best (R_prev) and decide how to rescale.
            if t_stop > args.tolerance:
                if reverse and m_max <= R_prev:
                    # Already decreasing and still not better: stop training.
                    break
                elif reverse and m_max > R_prev:
                    # Decreasing helped: decrease again.
                    agent.max_grad_norm = args.max_grad_norm_after
                    actor_critic.rescale(args.cdec)
                    scale *= args.cdec
                    agent.reinitialize()
                    last_scale_t = t
                elif not reverse and m_max <= R_prev:
                    # Increasing stopped helping: switch to decreasing.
                    agent.max_grad_norm = args.max_grad_norm_after
                    actor_critic.rescale(args.cdec)
                    scale *= args.cdec
                    agent.reinitialize()
                    reverse = True
                    last_scale_t = t
                else:
                    # Increasing is still helping: increase again.
                    agent.max_grad_norm = args.max_grad_norm_after
                    actor_critic.rescale(args.cinc)
                    scale *= args.cinc
                    agent.reinitialize()
                    last_scale_t = t

                R_prev = m_max
                # NOTE(review): assigning j here does not restart the for
                # loop (range supplies the next value regardless); it only
                # makes the rest of this iteration see j == 0, so the log and
                # plot branches below fire and report "Updates 0".
                j = t_stop = m_t = 0
                m_max = -1e9

        # if j % args.log_interval == 0:
        # this is used for testing saturation
        # relus = actor_critic.base_forward(
        # rollouts.observations[:-1].view(-1, *rollouts.observations.size()[2:]))

        rollouts.after_update()

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps

            # relus = log_saturation(fname=args.saturation_log,
            # first=(j==0),
            # relus=[relu.cpu().detach().numpy() for relu in relus])

            # print("saturation", relus)
            # if j > 0:
            # current_pdrr = incremental_update(current_pdrr, relus)

            # NOTE(review): the three values after max reward are passed as
            # dist_entropy, value_loss, action_loss, matching the
            # "entropy / value loss / policy loss" labels in the format
            # string.
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}, scale {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss, scale))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.plot_title,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Пример #18
0
def main():
    """Train one actor-critic agent jointly on every training game.

    One environment is created per name returned by
    ``getListOfGames("train")``, so ``args.num_processes`` is overwritten with
    the number of games.  Each of the ``num_updates`` iterations collects
    ``args.num_steps`` transitions per environment, computes returns, updates
    the agent, and periodically checkpoints, logs and plots.

    Relies on the module-level names ``args`` and ``num_updates``.

    Raises:
        ValueError: if ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
    """
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # viz / win only exist when plotting is enabled; the plotting branch at
    # the bottom of the loop is guarded by the same flag.
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    names = getListOfGames("train")
    envs = [make_env_train(name, args.seed, i, args.log_dir)
            for i, name in enumerate(names)]

    # One worker per training game.  This deliberately overrides the
    # configured num_processes, because the rollout storage and the reward
    # trackers below are sized from it.
    args.num_processes = len(envs)

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # No frame stacking in this variant: the raw observation shape is used.
    obs_shape = tuple(envs.observation_space.shape)

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # The policy, agent, storage and observation buffer used to be passed
    # through ``DataParallel(x).module``.  That returns ``x`` itself, so it
    # never parallelised anything, and constructing DataParallel around
    # non-Module objects can fail or move data depending on the number of
    # visible GPUs.  The objects are now used directly.

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch,
                         args.num_mini_batch, args.value_loss_coef,
                         args.entropy_coef, lr=args.lr, eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)
    else:
        # Fail here with a clear message instead of a NameError on `agent`
        # after the first rollout has already been collected.
        raise ValueError(
            "Unknown algo {!r}; expected 'a2c', 'ppo' or 'acktr'".format(
                args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    # Sized with num_processes (== number of environments, set above) so it
    # always matches rollouts.observations[0], which it is copied into below.
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Write the newest observation into the trailing slots of current_obs."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # episode_rewards accumulates the running episode of each process;
    # final_rewards holds the total of its most recently finished episode.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions.  NOTE(review): Variable(volatile=True) and the
            # .data[0] reads below are the pre-0.4 PyTorch idiom; they are
            # kept because the agent/policy API they feed is not visible
            # here -- confirm before porting to torch.no_grad()/.item().
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # masks is 0.0 for processes whose episode just ended, else 1.0.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            # For finished episodes: move the accumulated return into
            # final_rewards and reset the accumulator.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Clear the observation history of finished episodes; 4-D
            # observations need two extra trailing axes on the mask.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        # Value of the last stored observation, passed to compute_returns.
        next_value = actor_critic.get_value(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True)).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok tolerates an existing directory but still surfaces
            # real failures (e.g. permissions) here, not later in torch.save.
            os.makedirs(save_path, exist_ok=True)

            # Save a CPU copy so the checkpoint loads on machines without CUDA.
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # The checkpoint is [policy, observation statistics or None].
            save_model = [save_model, getattr(envs, 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Пример #19
0
def main():
    """Train an actor-critic policy with a hand-written A2C update.

    Builds ``args.num_processes`` environments and a CNN or MLP policy, then
    runs ``num_updates`` iterations of: collect ``args.num_steps`` transitions
    per process, compute returns, and take one RMSprop step on

        value_loss * value_loss_coef + action_loss - entropy * entropy_coef

    with the gradient norm clipped to ``args.max_grad_norm``.  The policy is
    checkpointed, logged and plotted at the configured intervals.

    Relies on the module-level names ``args`` and ``num_updates``.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # viz / win only exist when plotting is enabled; the plotting branch at
    # the bottom of the loop is guarded by the same flag.
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Stacked observations: the first dimension is multiplied by num_stack.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
        target_actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                        args.recurrent_policy)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
        target_actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # The target network starts as an exact copy of the policy.  Nothing
    # below reads it: it is the hook for a KL-regularised actor loss and a
    # soft target update that are not enabled in this version.  It is also
    # not moved to the GPU.
    for param, target_param in zip(actor_critic.parameters(),
                                   target_actor_critic.parameters()):
        target_param.data.copy_(param.data)

    # Width of one stored action, used to flatten rollouts.actions below.
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the frame stack left by one frame and append ``obs``."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # episode_rewards accumulates the running episode of each process;
    # final_rewards holds the total of its most recently finished episode.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions.  NOTE(review): Variable(volatile=True),
            # .data[0] and nn.utils.clip_grad_norm are the pre-0.4 PyTorch
            # idiom; they are kept because the policy/storage API they feed
            # is not visible here -- confirm before modernising.
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # masks is 0.0 for processes whose episode just ended, else 1.0.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # For finished episodes: move the accumulated return into
            # final_rewards and reset the accumulator.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Clear the frame stack of finished episodes; 4-D observations
            # need two extra trailing axes on the mask.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        # The first element of the policy's forward output is taken as the
        # value of the last stored observation.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # Re-evaluate the whole rollout with gradients enabled.
        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))

        # Disabled experiment (KL constraint for continuous stochastic
        # policies): evaluate target_actor_critic on the same batch and add
        #   log(std/target_std)
        #   + (std^2 + (mean - target_mean)^2) / (2 * target_std^2) - 0.5
        # weighted by args.actor_lambda to action_loss.

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        # advantages.data detaches the advantage so the policy gradient does
        # not flow into the value head.
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        total_loss = (value_loss * args.value_loss_coef + action_loss -
                      dist_entropy * args.entropy_coef)
        total_loss.backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()

        # Disabled experiment: every args.target_update_interval updates,
        #   target = target_tau * param + (1 - target_tau) * target

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok tolerates an existing directory but still surfaces
            # real failures (e.g. permissions) here, not later in torch.save.
            os.makedirs(save_path, exist_ok=True)

            # Save a CPU copy so the checkpoint loads on machines without CUDA.
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # The checkpoint is [policy, observation statistics or None].
            save_model = [save_model, getattr(envs, 'ob_rms', None)]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Пример #20
0
def distil(teacher, student, optimizer, envs_teacher, envs_student_train,
           envs_student_test):
    """Distil the per-head teachers into one student by matching soft targets.

    Every update draws a random head and a rollout source: the student's own
    trajectory with probability ``args.frac_student_rollouts``, otherwise the
    trajectory of that head's teacher.  One optimizer step is taken on
    ``get_loss`` for that rollout, and a student test rollout is collected
    for reporting.  Only the teacher's actions are matched (per the original
    author's note), not the critic's values.

    Args:
        teacher: teacher models, indexed by head.
        student: the model being trained.
        optimizer: optimizer stepped on the distillation loss.
        envs_teacher: environments for the teachers, indexed by head.
        envs_student_train: student training environments, indexed by head.
        envs_student_test: student evaluation environments, indexed by head.

    Relies on the module-level names ``args``, ``num_updates`` and
    ``log_dir_student_test``.
    """
    loss_history = []
    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win1 = [None] * args.num_heads  # one student reward plot per head
        win2 = None  # loss plot

    first_env = envs_teacher[0]
    raw_shape = first_env.observation_space.shape
    obs_shape = (raw_shape[0] * args.num_stack, *raw_shape[1:])
    if first_env.action_space.__class__.__name__ != "Discrete":
        action_shape = first_env.action_space.shape[0]
    else:
        action_shape = 1

    def _make_storage(env):
        # Rollout storage bundle for a single environment.
        return get_storage(env, args.num_steps, args.num_processes, obs_shape,
                           env.action_space)

    teacher_storage = []
    student_storage_train = []
    student_storage_test = []
    for idx in range(args.num_heads):
        teacher_storage.append(_make_storage(envs_teacher[idx]))
        student_storage_train.append(_make_storage(envs_student_train[idx]))
        student_storage_test.append(_make_storage(envs_student_test[idx]))

    if args.cuda:
        for idx in range(args.num_heads):
            per_head = (teacher_storage[idx], student_storage_train[idx],
                        student_storage_test[idx])
            for bundle in per_head:
                bundle['current_obs'] = bundle['current_obs'].cuda()
            for bundle in per_head:
                bundle['rollouts'].cuda()

    start = time.time()
    # Index 0 selects the teacher rollout, index 1 the student rollout.
    teacher_student_prob = [
        1 - args.frac_student_rollouts, args.frac_student_rollouts
    ]
    for j in range(num_updates):
        head = np.random.randint(args.num_heads)
        roll = np.random.choice(2, p=teacher_student_prob)

        if roll != 1:
            # Train on the teacher's trajectory.
            source = teacher_storage[head]
            sample_rollouts(teacher[head], envs_teacher[head], source)
        else:
            # Train on the student's own trajectory.
            source = student_storage_train[head]
            sample_rollouts(student, envs_student_train[head], source, head)
        rollouts = source['rollouts']

        # The first output of the teacher on the last observation is its
        # value estimate.
        teacher_out = teacher[head](
            Variable(rollouts.observations[-1], volatile=True))
        next_value = teacher_out[0].data

        # Feed the rollout observations to the teacher's observation filter,
        # if it has one.
        if hasattr(teacher[head], 'obs_filter'):
            flat_obs = rollouts.observations[:-1].view(-1, *obs_shape)
            teacher[head].obs_filter.update(flat_obs)
        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        # One gradient step on the student's parameters.
        loss = get_loss(student, teacher[head], rollouts, obs_shape, head)
        loss_history.append(loss.data.cpu().numpy())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (j + 1) % args.save_interval == 0 and args.save_dir != "":
            save_checkpoint(student, optimizer, j)
            save_data(loss_history)

        # Collect a test trajectory for the sampled head.
        test_bundle = student_storage_test[head]
        sample_rollouts(student, envs_student_test[head], test_bundle, head)
        test_rollouts = test_bundle['rollouts']
        student_out = student(
            Variable(test_rollouts.observations[-1], volatile=True))
        student_next_value_test = student_out[0].data
        if hasattr(student, 'obs_filter'):
            flat_test_obs = test_rollouts.observations[:-1].view(
                -1, *obs_shape)
            student.obs_filter.update(flat_test_obs)
        test_rollouts.compute_returns(student_next_value_test, args.use_gae,
                                      args.gamma, args.tau)

        # Report the student's test rewards for every head.  The loss shown
        # is the one from this update's sampled head.
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            elapsed = end - start
            for log_head in range(args.num_heads):
                head_rewards = student_storage_test[log_head]['final_rewards']
                print(
                    "Head {} : Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, loss {:.5f}"
                    .format(log_head, j, total_num_steps,
                            int(total_num_steps / elapsed),
                            head_rewards.mean(), head_rewards.median(),
                            head_rewards.min(), head_rewards.max(),
                            loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                print('visualizing')
                for plot_head in range(args.num_heads):
                    win1[plot_head] = visdom_plot(
                        viz, win1[plot_head],
                        log_dir_student_test[plot_head],
                        args.env_name[plot_head],
                        'Distilation Reward for Student')
                win2 = visdom_data_plot(viz, win2, args.env_name,
                                        'Distilation Loss Plot', loss_history,
                                        'loss')
            except IOError:
                pass
Пример #21
0
def main():
    """Train an actor-critic policy with A2C, ACKTR or PPO.

    All configuration is read from the module-level ``args`` namespace and
    the number of outer iterations from ``num_updates``; neither is defined
    inside this function. Every update collects ``args.num_steps``
    transitions from ``args.num_processes`` parallel environments, computes
    returns, runs the algorithm-specific optimisation step and then, on the
    configured intervals, saves the model, prints reward statistics and
    refreshes the visdom plot.

    Raises:
        ValueError: If ``args.algo`` is not ``'a2c'``, ``'ppo'`` or
            ``'acktr'``.
    """
    # Fail before any worker process is spawned; otherwise an unknown
    # algorithm would leave `optimizer` unbound and only surface later as a
    # NameError.
    if args.algo not in ('a2c', 'ppo', 'acktr'):
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    print("#######")
    print(
        "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    # Stacking `num_stack` frames multiplies the leading observation dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # Rank-3 observations get the convolutional policy, anything else the MLP.
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    else:  # 'acktr' (validated above)
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        """Shift the frame stack and write the newest observation in place."""
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            # Drop the oldest frame by sliding the remaining ones forward.
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        # Frozen copy used to evaluate the pre-update action probabilities.
        old_model = copy.deepcopy(actor_critic)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # For finished episodes, latch the episode total into
            # `final_rewards` and restart the running sum.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data,
                            reward, masks)

        # Bootstrap value for the state following the last collected step.
        next_value = actor_critic(Variable(rollouts.states[-1],
                                           volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.states[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            # Re-wrapping `advantages.data` keeps the policy gradient from
            # flowing into the value head.
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            old_model.load_state_dict(actor_critic.state_dict())
            if hasattr(actor_critic, 'obs_filter'):
                old_model.obs_filter = actor_critic.obs_filter

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(
                    range(args.num_processes * args.num_steps)),
                                       args.batch_size * args.num_processes,
                                       drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    states_batch = rollouts.states[:-1].view(
                        -1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(
                        -1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                        Variable(states_batch), Variable(actions_batch))

                    _, old_action_log_probs, _ = old_model.evaluate_actions(
                        Variable(states_batch, volatile=True),
                        Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    optimizer.step()

        # The last state of this rollout is the first state of the next one.
        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            # NOTE(review): the entropy is printed negated although the loss
            # above subtracts `dist_entropy`; confirm the intended sign
            # against `evaluate_actions`.
            print(
                "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, (j + 1) * args.num_processes * args.num_steps,
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        # `viz` and `win` only exist when visualisation is enabled, so the
        # plot must be gated on `args.vis` as well as the interval.
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Пример #22
0
def eval_pomme(
        saved_models='train=simple-config=ffa_v0-model=convnet-agent=0.pt'):
    """Roll out a saved PPO policy in the configured environment and log rewards.

    Configuration comes from the module-level ``args`` namespace. The
    checkpoint is loaded with ``torch.load`` and is read as a mapping with
    ``'epoch'`` and ``'state_dict'`` entries.

    Args:
        saved_models: File name of the checkpoint to evaluate, resolved
            relative to ``args.save_dir``. Only a single checkpoint is
            supported.

    Raises:
        ValueError: If ``args.how_train`` or (for image observations)
            ``args.model`` holds an unsupported value.
    """
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(server=args.server, port=8097)
        win = None

    # Instantiate the environment
    config = getattr(configs, args.config)()

    # We make this in order to get the shapes.
    dummy_env = make_env(args, config, -1,
                         [config['agent'](game_type=config['game_type'])])()
    envs_shape = dummy_env.observation_space.shape[1:]
    obs_shape = (envs_shape[0], *envs_shape[1:])
    action_space = dummy_env.action_space

    def build_policy():
        """Create a freshly initialised policy network for ``args.model``."""
        if len(envs_shape) == 3:
            if args.model == 'convnet':
                return PommeCNNPolicySmall(obs_shape[0], action_space, args)
            if args.model == 'resnet':
                return PommeResnetPolicy(obs_shape[0], action_space, args)
            raise ValueError("Unsupported model: {!r}".format(args.model))
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        return MLPPolicy(obs_shape[0], action_space)

    # TODO: this only works for simple - need a list of checkpoints for self-play
    # We need to get the agent = config.agent(agent_id, config.game_type) and then
    # pass that agent into the agent.PPOAgent
    training_agents = []

    # TODO: this is a bit hacky and doesn't work for more than 1 model
    save_path = os.path.join(args.save_dir)
    model_paths = [os.path.join(save_path, saved_models)]

    assert len(model_paths) == args.nagents

    print("****")
    for saved_model in model_paths:
        loaded_model = torch.load(saved_model)
        print("epoch of model {} is: {}".format(saved_model,
                                                loaded_model['epoch']))
        # The network given to the agent must be the one that received the
        # checkpoint weights. `load_state_dict` works in place and does not
        # return the module, so its result must not be used as the model.
        model = build_policy()
        model.load_state_dict(loaded_model['state_dict'])
        model.eval()
        agent = config['agent'](game_type=config['game_type'])
        agent = ppo_agent.PPOAgent(agent, model)
        training_agents.append(agent)
    print("****")

    if args.how_train == 'simple':
        # Simple trains a single agent against three SimpleAgents.
        assert (
            args.nagents == 1), "Simple training should have a single agent."
        num_training_per_episode = 1
    elif args.how_train == 'homogenous':
        # Homogenous trains a single agent against itself (self-play).
        assert (args.nagents == 1
                ), "Homogenous toraining should have a single agent."
        num_training_per_episode = 4
    elif args.how_train == 'heterogenous':
        assert (args.nagents >
                1), "Heterogenous training should have more than one agent."
        print("Heterogenous training is not implemented yet.")
        return
    else:
        raise ValueError(
            "Unsupported how_train: {!r}".format(args.how_train))

    # NOTE: Does this work correctly? Will the threads operate independently?
    envs = [
        make_env(args, config, i, training_agents)
        for i in range(args.num_processes)
    ]
    envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs)

    for agent in training_agents:
        agent.initialize(args, obs_shape, action_space,
                         num_training_per_episode)

    current_obs = torch.zeros(num_training_per_episode, args.num_processes,
                              *obs_shape)

    def update_current_obs(obs):
        # NOTE(review): this only rebinds a local name, so the outer
        # `current_obs` is never updated and stays all zeros. The layout the
        # agent expects (axis order, device) is not visible here, so the
        # behaviour is left as found; confirm and fix against the agent's
        # rollout storage.
        current_obs = torch.from_numpy(obs).float()

    obs = envs.reset()
    update_current_obs(obs)
    # Both supported modes use the single shared agent.
    training_agents[0].update_rollouts(obs=current_obs, timestep=0)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_training_per_episode, args.num_processes, 1])
    final_rewards = torch.zeros(
        [num_training_per_episode, args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        for agent in training_agents:
            agent.cuda()

    start = time.time()
    for j in range(args.num_steps_eval):
        for step in range(args.num_steps):
            value_agents = []
            action_agents = []
            action_log_prob_agents = []
            states_agents = []
            cpu_actions_agents = []

            if args.how_train == 'simple':
                value, action, action_log_prob, states = training_agents[
                    0].act_pytorch(step, 0)
                value_agents.append(value)
                action_agents.append(action)
                action_log_prob_agents.append(action_log_prob)
                states_agents.append(states)
                cpu_actions = action.data.squeeze(1).cpu().numpy()
                cpu_actions_agents = cpu_actions
            elif args.how_train == 'homogenous':
                # One action list per process, holding every agent's action.
                cpu_actions_agents = [[] for _ in range(args.num_processes)]
                for i in range(num_training_per_episode):
                    value, action, action_log_prob, states = training_agents[
                        0].act_pytorch(step, i)
                    value_agents.append(value)
                    action_agents.append(action)
                    action_log_prob_agents.append(action_log_prob)
                    states_agents.append(states)
                    cpu_actions = action.data.squeeze(1).cpu().numpy()
                    for num_process in range(args.num_processes):
                        cpu_actions_agents[num_process].append(
                            cpu_actions[num_process])

            obs, reward, done, info = envs.step(cpu_actions_agents)
            reward = torch.from_numpy(np.stack(reward)).float().transpose(0, 1)
            episode_rewards += reward

            # 0 where the episode just ended, 1 otherwise; one column per
            # trained agent, then transposed.
            masks = torch.FloatTensor(
                [[0.0] * num_training_per_episode if done_ else [1.0] *
                 num_training_per_episode for done_ in done]).transpose(0, 1)

            # For finished episodes, latch the episode total into
            # `final_rewards` and restart the running sum.
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()

            reward_all = reward.unsqueeze(2)

            if args.how_train == 'simple':
                masks_all = masks.transpose(0, 1).unsqueeze(2)
            else:  # 'homogenous'
                masks_all = masks.unsqueeze(2)

            current_obs *= masks_all.unsqueeze(2).unsqueeze(2)
            update_current_obs(obs)

            states_all = torch.from_numpy(
                np.stack([x.data for x in states_agents])).float()
            action_all = torch.from_numpy(
                np.stack([x.data for x in action_agents])).float()
            action_log_prob_all = torch.from_numpy(
                np.stack([x.data for x in action_log_prob_agents])).float()
            value_all = torch.from_numpy(
                np.stack([x.data for x in value_agents])).float()

            training_agents[0].insert_rollouts(step, current_obs,
                                               states_all, action_all,
                                               action_log_prob_all,
                                               value_all, reward_all,
                                               masks_all)

        # Progress is keyed on the outer counter `j`; the inner `step` is
        # always `args.num_steps - 1` once the rollout loop has finished.
        if j % args.log_interval == 0:
            print("step ", j)
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            final_rewards_tr = torch.zeros(
                [args.num_processes, args.nagents, 1])
            final_rewards_tr.copy_(final_rewards)
            final_rewards_tr = final_rewards_tr.view(args.num_processes,
                                                     args.nagents).transpose(
                                                         0, 1)
            for i in range(args.nagents):
                print("agent # ", i)
                print(
                    "Updates {}, Agent {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(j, i, total_num_steps,
                            int(total_num_steps / (end - start)),
                            final_rewards_tr[i].mean(),
                            final_rewards_tr[i].median(),
                            final_rewards_tr[i].min(),
                            final_rewards_tr[i].max()), "\n")
            print("\n")

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name)
            except IOError:
                pass
Пример #23
0
def main():
    """Train a policy with A2C, ACKTR, PPO or PPO-meta and log to TensorBoard.

    Configuration is read from the module-level ``args`` namespace, the
    number of outer iterations from ``num_updates`` and the evaluation log
    directory from ``eval_log_dir``; none of them is defined inside this
    function. Every update collects a rollout, updates the agent and, on the
    configured intervals, saves a checkpoint, reports training statistics,
    runs a deterministic evaluation and refreshes the visdom plot.

    Raises:
        ValueError: If ``args.algo`` is not one of ``'a2c'``, ``'ppo'``,
            ``'acktr'`` or ``'ppometa'``.
    """
    # Fail fast; otherwise `agent` would be unbound and only surface as a
    # NameError after the first rollout.
    if args.algo not in ('a2c', 'ppo', 'acktr', 'ppometa'):
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    # Tensorboard Setup
    import tensorflow as tf
    import datetime
    # limit tf memory
    physical_devices = tf.config.list_physical_devices('GPU')
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except (IndexError, ValueError, RuntimeError):
        # No GPU present, invalid device, or virtual devices cannot be
        # modified once initialized.
        pass
    # setup tensorboard
    if args.tb_dir == 'tb':
        tb_log_dir = os.path.join(
            args.tb_dir, args.algo,
            datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    else:
        tb_log_dir = os.path.join('tb', args.tb_dir)
    tb_summary_writer = tf.summary.create_file_writer(tb_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, args.add_timestep, device,
                         False)

    meta = args.algo == 'ppometa'
    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          meta,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:  # 'ppometa' (validated above)
        agent = algo.PPOMeta(actor_critic,
                             args.clip_param,
                             args.ppo_epoch,
                             args.num_mini_batch,
                             args.value_loss_coef,
                             args.entropy_coef,
                             lr=args.lr,
                             eps=args.eps,
                             max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Sliding window over the most recent finished episodes.
    episode_rewards = deque(maxlen=100)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step], rollouts.actions[step],
                    rollouts.prev_rewards[step], rollouts.prev_actions[step],
                    rollouts.infos[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            # FIXME: works only for environments with sparse rewards: the
            # reward of the terminating step is recorded as the episode
            # reward. The variant that read info['episode']['r'] from the
            # infos is disabled.
            for idx, eps_done in enumerate(done):
                if eps_done:
                    episode_rewards.append(reward[idx])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # When the env reports 'box' and 'agent' entries, store their
            # concatenated positions alongside the transition.
            rollouts_infos = None
            if len(infos) > 0 and 'box' in infos[0] and 'agent' in infos[0]:
                rollouts_infos = torch.tensor([
                    np.concatenate([info['box'].pos, info['agent'].pos])
                    for info in infos
                ])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks,
                            rollouts_infos)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1], rollouts.actions[-1],
                rollouts.prev_rewards[-1], rollouts.prev_actions[-1],
                rollouts.infos[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j > 1 and j % args.save_interval == 0 and args.save_dir != "":
            print('Saving model')
            print()

            save_path = os.path.join(args.save_dir, args.algo)
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Store the observation-normalisation statistics (if the env
            # wrapper has any) next to the model.
            save_model = [save_model, getattr(envs.venv, 'ob_rms', None)]

            torch.save(
                save_model,
                os.path.join(save_path, args.env_name + "_" + str(j) + ".pt"))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if j > 1 and j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            mean_reward = np.mean(episode_rewards)
            median_reward = np.median(episode_rewards)
            # Fraction of recorded episodes with a strictly positive reward.
            success_rate = (np.count_nonzero(np.greater(episode_rewards, 0)) /
                            len(episode_rewards))
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, success rate {:.2f}\n"
                .format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    len(episode_rewards), mean_reward, median_reward,
                    np.min(episode_rewards), np.max(episode_rewards),
                    success_rate))

            with tb_summary_writer.as_default():
                tf.summary.scalar('mean reward',
                                  mean_reward,
                                  step=total_num_steps)
                tf.summary.scalar('median reward',
                                  median_reward,
                                  step=total_num_steps)
                tf.summary.scalar('success rate',
                                  success_rate,
                                  step=total_num_steps)

        if args.eval_interval is not None and len(
                episode_rewards) > 1 and j % args.eval_interval == 0:
            eval_envs = make_vec_envs(args.env_name,
                                      args.seed + args.num_processes,
                                      args.num_processes, args.gamma,
                                      eval_log_dir, args.add_timestep, device,
                                      True)

            if eval_envs.venv.__class__.__name__ == "VecNormalize":
                eval_envs.venv.ob_rms = envs.venv.ob_rms

                # An ugly hack to remove updates
                def _obfilt(self, obs):
                    if self.ob_rms:
                        obs = np.clip((obs - self.ob_rms.mean) /
                                      np.sqrt(self.ob_rms.var + self.epsilon),
                                      -self.clipob, self.clipob)
                        return obs
                    else:
                        return obs

                # NOTE(review): the filter is bound to the training env
                # wrapper, not the eval one; confirm this is intended.
                eval_envs.venv._obfilt = types.MethodType(_obfilt, envs.venv)

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                args.num_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                # Keep the masks on the same device as the other eval
                # tensors fed to the policy.
                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          dtype=torch.float32,
                                          device=device)
                for info in infos:
                    if 'episode' in info:
                        eval_episode_rewards.append(info['episode']['r'])

            eval_envs.close()

            eval_mean_reward = np.mean(eval_episode_rewards)
            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), eval_mean_reward))

            with tb_summary_writer.as_default():
                tf.summary.scalar('eval mean reward',
                                  eval_mean_reward,
                                  step=total_num_steps)
                tf.summary.scalar('eval median reward',
                                  np.median(eval_episode_rewards),
                                  step=total_num_steps)
                tf.summary.scalar(
                    'eval success rate',
                    np.count_nonzero(np.greater(eval_episode_rewards, 0)) /
                    len(eval_episode_rewards),
                    step=total_num_steps)

        if j > 1 and args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, total_num_steps)
            except IOError:
                pass

    envs.close()
Пример #24
0
def main():
    """Train one actor-critic policy on several environments at once.

    For every entry of the module-level ``mt_env_id_dic_selected`` this
    creates ``args.num_processes`` environments, merges all of them into a
    single ``SubprocVecEnvMt`` and then runs ``num_updates`` rounds of rollout
    collection followed by an A2C/ACKTR or a PPO update (chosen by
    ``args.algo``).

    Optional behaviour, switched by values imported from ``arguments``:
    restoring a previously saved model (``is_restore``), an EWC penalty that
    is added to the loss in the a2c/acktr branch (``ewc``), and periodic
    parameter noise (``parameter_noise``).  Checkpoints, console logs and
    visdom plots are produced at the intervals configured in ``args``.

    The function depends on module-level names defined elsewhere in the file
    (``args``, ``num_updates``, ``mt_env_id_dic_selected``, ``reward_dict``,
    ``gtn_M``, ``exp``, ``title_html`` and several helper functions/classes).
    """
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        # One reward-plot window per environment id, plus windows for the
        # auxiliary curves drawn further below.
        win = [None] * len(mt_env_id_dic_selected)
        win_dic = {}
        win_afs_per_m = None
        win_afs_loss = None
        win_basic_loss = None

    plot_dic = {}
    envs = []
    # The original program ran one game per model; here every selected
    # environment id contributes args.num_processes workers, and all of them
    # are handed to SubprocVecEnvMt together.
    for i in range(len(mt_env_id_dic_selected)):
        log_dir = args.log_dir+mt_env_id_dic_selected[i]+'/'
        for j in range(args.num_processes):
            envs += [make_env(mt_env_id_dic_selected[i], args.seed, j, log_dir)]
    # From here on `envs` is the single vectorised wrapper around all workers.
    envs = SubprocVecEnvMt(envs)

    num_processes_total = args.num_processes * len(mt_env_id_dic_selected)
    obs_shape = envs.observation_space.shape
    # args.num_stack consecutive frames are stacked along the first axis.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    from arguments import is_restore
    if is_restore and args.save_dir:
        load_path = os.path.join(args.save_dir, args.algo)
        actor_critic = torch.load(os.path.join(load_path, args.env_name + ".pt"))
    else:
        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    # args.num_steps is the number of forward steps collected per update;
    # `rollouts` stores the states/actions/rewards/masks of those steps.
    rollouts = RolloutStorage(args.num_steps, num_processes_total, obs_shape, envs.action_space)
    current_state = torch.zeros(num_processes_total, *obs_shape)

    def update_current_state(state):
        """Shift the frame stack by one frame and append `state` at the end."""
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes_total, 1])
    final_rewards = torch.zeros([num_processes_total, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        old_model = copy.deepcopy(actor_critic)

    # Imported once here instead of on every loop iteration.
    import pickle
    from arguments import ewc, ewc_lambda, ewc_interval
    from arguments import parameter_noise, parameter_noise_interval

    afs_per_m = []
    afs_offset = [0.0]*gtn_M

    afs_loss_list = []
    basic_loss_list = []
    episode_reward_rec = 0.0
    # Gradient seeds for afs_loss.backward().  They are moved to the GPU only
    # when CUDA is requested; an unconditional .cuda() would crash CPU-only
    # runs even though every other device move here is gated on args.cuda.
    one = torch.FloatTensor([1])
    if args.cuda:
        one = one.cuda()
    mone = one * -1

    # Explicit "nothing yet" markers (instead of relying on swallowed
    # NameError/TypeError): states accumulated for the next Fisher estimate,
    # and the most recent EWC penalty (only ever set in the a2c/acktr branch).
    states_store = None
    ewc_loss = None

    for j in range(num_updates):
        for step in range(args.num_steps):
            if ewc == 1:
                step_states = rollouts.states[step].clone()
                if states_store is None:
                    states_store = step_states
                else:
                    states_store = torch.cat([states_store, step_states], 0)
            # Sample actions
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done = envs.step(cpu_actions)
            # Bookkeeping for the "last 100 episodes" reward statistics; the
            # helper is defined elsewhere in the file.
            episode_reward_rec += reward
            episode_reward_rec = rec_last_100_epi_reward(episode_reward_rec,done)

            # One reward per worker (num_processes_total entries), as a column.
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # masks is 0 for workers whose episode just ended, 1 otherwise.
            # final_rewards keeps the return of the last finished episode of
            # each worker; episode_rewards restarts from 0 on episode end.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # If done then clean the history of observations.
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            # reset gradient
            optimizer.zero_grad()

            # forward
            values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape)))
            # pre-process
            values = values.view(args.num_steps, num_processes_total, 1)
            action_log_probs = action_log_probs.view(args.num_steps, num_processes_total, 1)

            # compute afs loss
            afs_per_m_temp, afs_loss = actor_critic.get_afs_per_m(
                action_log_probs=action_log_probs,
                conv_list=conv_list,
            )
            if len(afs_per_m_temp)>0:
                afs_per_m += [afs_per_m_temp]

            # Back-propagated with a -1 seed, i.e. the optimizer step below
            # ascends on afs_loss; the graph is retained for final_loss.
            if (afs_loss is not None) and (afs_loss.data.cpu().numpy()[0]!=0.0):
                afs_loss.backward(mone, retain_graph=True)
                afs_loss_list += [afs_loss.data.cpu().numpy()[0]]

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()
            final_loss_basic = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef

            # The EWC penalty is skipped on the very first update, before any
            # Fisher information has been computed.
            ewc_loss = None
            if j != 0 and ewc == 1:
                ewc_loss = actor_critic.get_ewc_loss(lam=ewc_lambda)

            if ewc_loss is None:
                final_loss = final_loss_basic
            else:
                final_loss = final_loss_basic + ewc_loss
            basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]]
            final_loss.backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            old_model.load_state_dict(actor_critic.state_dict())
            if hasattr(actor_critic, 'obs_filter'):
                old_model.obs_filter = actor_critic.obs_filter

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(range(num_processes_total * args.num_steps)), args.batch_size * num_processes_total, drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))

                    _, old_action_log_probs, _, old_conv_list = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    final_loss_basic = (value_loss + action_loss - dist_entropy * args.entropy_coef)

                    basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]]
                    final_loss_basic.backward()
                    optimizer.step()

        # The last collected state becomes the first state of the next rollout.
        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # An existing directory is fine; any other failure is raised here
            # rather than later inside torch.save.
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            with open(os.path.join(save_path, args.env_name + "_last_100_reward"), "wb") as f:
                pickle.dump(reward_dict, f)

        if j % args.log_interval == 0:
            # NOTE(review): the frame count uses args.num_processes rather than
            # num_processes_total, and the entropy is printed negated; both
            # are kept exactly as in the original -- confirm they are intended.
            print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, (j + 1) * args.num_processes * args.num_steps,
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))

            if ewc_loss is not None:
                print("ewc loss {:.5f}".
                format(ewc_loss.data.cpu().numpy()[0]))

        if j > 5 and j % args.vis_interval == 0 and args.vis:
            # Per-environment reward curves, read from each monitor log folder.
            for ii in range(len(mt_env_id_dic_selected)):
                log_dir = args.log_dir+mt_env_id_dic_selected[ii]+'/'
                win[ii] = visdom_plot(viz, win[ii], log_dir, mt_env_id_dic_selected[ii], args.algo)

            # A curve is only drawn from the second time its name is seen: the
            # first sighting just registers an empty window slot.
            plot_dic = reward_dict
            for plot_name in plot_dic.keys():
                if plot_name in win_dic.keys():
                    if len(plot_dic[plot_name]) > 0:
                        win_dic[plot_name] = viz.line(
                            torch.from_numpy(np.asarray(plot_dic[plot_name])),
                            win=win_dic[plot_name],
                            opts=dict(title=break_line_html(exp+'>>'+plot_name))
                        )
                else:
                    win_dic[plot_name] = None

            if len(afs_per_m)>0:
                win_afs_per_m = viz.line(
                    torch.from_numpy(np.asarray(afs_per_m)),
                    win=win_afs_per_m,
                    opts=dict(title=title_html+'>>afs')
                )

            # basic_loss_list grows by one entry per update for a2c/acktr and
            # by one entry per mini-batch per PPO epoch for ppo.
            win_basic_loss = viz.line(
                torch.from_numpy(np.asarray(basic_loss_list)),
                win=win_basic_loss,
                opts=dict(title=title_html+'>>basic_loss')
            )

            if len(afs_loss_list) > 0:
                win_afs_loss = viz.line(
                    torch.from_numpy(np.asarray(afs_loss_list)),
                    win=win_afs_loss,
                    opts=dict(title=title_html+'>>afs_loss')
                )

        if parameter_noise == 1:
            if j % parameter_noise_interval == 0:
                actor_critic.parameter_noise()

        if ewc == 1:
            if j % ewc_interval == 0 or j==0:
                # Estimate Fisher information from the states gathered since
                # the last estimate, then start a fresh collection.
                actor_critic.compute_fisher(states_store)
                states_store = None
                actor_critic.star()
Пример #25
0
def main():
    """Train a policy, resuming from a saved checkpoint when one exists.

    If ``<save_dir>/<env_name>.pt`` exists and ``args.overwrite`` is not set,
    the policy and the agent (with its optimizer) are loaded from disk and the
    number of remaining updates is reduced by ``agent.optimizer.steps``.
    Otherwise a fresh ``Policy`` and agent are built and stale
    ``*.monitor.csv`` files in ``args.log_dir`` are removed.

    Each update collects ``args.num_steps`` steps from the vectorised
    environments (optionally adding an ICM intrinsic reward when
    ``args.curiosity`` is set), updates the agent, and then saves, logs,
    evaluates and plots at the intervals configured in ``args``.

    The function depends on module-level names defined elsewhere in the file
    (``args``, ``num_updates``, ``make_vec_envs``, ``Policy``, ``algo``,
    ``Evaluator``, ``visdom_plot`` ...).
    """
    saved_model = os.path.join(args.save_dir, args.env_name + '.pt')
    if os.path.exists(saved_model) and not args.overwrite:
        # NOTE(review): ob_rms is loaded here but never applied to the new
        # environments in this function -- confirm whether observation
        # normalisation statistics should be restored.
        actor_critic, ob_rms = \
                torch.load(saved_model)
        agent = \
            torch.load(os.path.join(args.save_dir, args.env_name + '_agent.pt'))
        # Diagnostic dump of the restored optimizer.
        for i in agent.optimizer.state_dict():
            print(dir(agent.optimizer))
            print(getattr(agent.optimizer, 'steps'))
            print(agent.optimizer.state_dict()[i])
        past_steps = agent.optimizer.steps
    else:
        # False acts as the "not loaded" marker tested further below.
        actor_critic = False
        agent = False
        past_steps = 0
        try:
            os.makedirs(args.log_dir)
        except OSError:
            # The directory could not be created (typically because it already
            # exists): clear monitor files left over from a previous run.
            files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
            for f in files:
                os.remove(f)
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None
        win_eval = None

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                        args.gamma, args.log_dir, args.add_timestep, device, False, None,
                        args=args)

    if not actor_critic:
        actor_critic = Policy(envs.observation_space.shape, envs.action_space,
            base_kwargs={'map_width': args.map_width, 'num_actions': 18, 'recurrent': args.recurrent_policy},
            curiosity=args.curiosity, algo=args.algo, model=args.model, args=args)
    actor_critic.to(device)

    evaluator = None

    if not agent:
        if args.algo == 'a2c':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef,
                                   args.entropy_coef, lr=args.lr,
                                   eps=args.eps, alpha=args.alpha,
                                   max_grad_norm=args.max_grad_norm,
                                   curiosity=args.curiosity, args=args)
        elif args.algo == 'ppo':
            agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                             args.value_loss_coef, args.entropy_coef, lr=args.lr,
                                   eps=args.eps,
                                   max_grad_norm=args.max_grad_norm)
        elif args.algo == 'acktr':
            agent = algo.A2C_ACKTR_NOREWARD(actor_critic, args.value_loss_coef,
                                   args.entropy_coef, lr=args.lr,
                                   eps=args.eps, alpha=args.alpha,
                                   max_grad_norm=args.max_grad_norm,
                                   acktr=True,
                                   curiosity=args.curiosity, args=args)

    if args.curiosity:
        rollouts = CuriosityRolloutStorage(args.num_steps, args.num_processes,
                            envs.observation_space.shape, envs.action_space,
                            actor_critic.recurrent_hidden_state_size, actor_critic.base.feature_state_size(), args=args)
    else:
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                            envs.observation_space.shape, envs.action_space,
                            actor_critic.recurrent_hidden_state_size, args=args)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # Returns of the most recent finished training episodes (at most 10).
    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates - past_steps):
        if args.drop_path:
            actor_critic.base.get_drop_path()
        player_act = None
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_probs, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step],
                        player_act=player_act,
                        icm_enabled=args.curiosity)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            # When rendering, a move reported by the first environment is
            # passed to the policy on the next step.
            player_act = None
            if args.render:
                if infos[0]:
                    if 'player_move' in infos[0].keys():
                        player_act = infos[0]['player_move']

            if args.curiosity:
                # NOTE(review): action_bin is not assigned anywhere in this
                # function; unless it is a module-level name this path raises
                # NameError -- confirm where it is meant to come from.
                # run icm
                with torch.no_grad():
                    feature_state, feature_state_pred, action_dist_pred = actor_critic.icm_act(
                            (rollouts.obs[step], obs, action_bin)
                            )

                # Intrinsic reward: eta/2 times the squared error between the
                # actual and the predicted feature state, summed over all
                # elements.
                intrinsic_reward = args.eta * ((feature_state - feature_state_pred).pow(2)).sum() / 2.
                if args.no_reward:
                    reward = 0
                reward += intrinsic_reward.cpu()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            if args.curiosity:
                rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks,
                                feature_state, feature_state_pred, action_bin, action_dist_pred)
            else:
                rollouts.insert(obs, recurrent_hidden_states, action, action_log_probs, value, reward, masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1],
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.curiosity:
            value_loss, action_loss, dist_entropy, fwd_loss, inv_loss = agent.update(rollouts)
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            # An existing directory is fine; any other failure is raised here
            # rather than later inside torch.save.
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          getattr(get_vec_normalize(envs), 'ob_rms', None)]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            # The whole agent object is saved too so that a later run can
            # resume from it.
            save_agent = copy.deepcopy(agent)

            torch.save(save_agent, os.path.join(save_path, args.env_name + '_agent.pt'))

        total_num_steps = (j + 1) * args.num_processes * args.num_steps

        if not dist_entropy:
            dist_entropy = 0
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n "
                  "dist entropy {:.1f}, val/act loss {:.1f}/{:.1f},".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       len(episode_rewards),
                       np.mean(episode_rewards),
                       np.median(episode_rewards),
                       np.min(episode_rewards),
                       np.max(episode_rewards), dist_entropy,
                       value_loss, action_loss))
            if args.curiosity:
                print("fwd/inv icm loss {:.1f}/{:.1f}\n".
                format(
                       fwd_loss, inv_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # The evaluator is built lazily on the first evaluation.
            if evaluator is None:
                evaluator = Evaluator(args, actor_critic, device)

            if args.model == 'fractal':
                # Evaluate column index -1 and then every column 0..n_cols-1.
                n_cols = evaluator.actor_critic.base.n_cols
                for i in range(-1, n_cols):
                    evaluator.evaluate(column=i)
                # viz / win_eval only exist when visdom was requested, so the
                # evaluation plot must be skipped otherwise.
                if args.vis:
                    win_eval = visdom_plot(viz, win_eval, evaluator.eval_log_dir, args.env_name,
                                  args.algo, args.num_frames, n_graphs=args.n_recs)
            else:
                evaluator.evaluate(column=None)

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
def run(number_of_workers, log_dir, vis_title):
    """Train a policy on ``args.env_name`` and save periodic/final checkpoints.

    Args:
        number_of_workers: Stored in the environment config under
            ``'number_of_workers'`` and embedded in checkpoint file names.
        log_dir: Directory handed to ``make_env`` and to ``visdom_plot``.  It
            is created if possible; otherwise any ``*.monitor.csv`` files in
            it are deleted before training starts.
        vis_title: Passed to ``visdom_plot`` in the position where the other
            training functions in this file pass the environment name.

    Returns:
        ``True`` once training has finished and the final policy is saved.

    The function depends on module-level names defined elsewhere in the file
    (``args``, ``num_updates``, ``ENV_CONFIG``, ``make_env``, ``Policy``,
    ``algo`` ...).
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    print("#######")
    print("num_updates: {}".format(num_updates))
    print("#######")

    try:
        os.makedirs(log_dir)
    except OSError:
        # The directory could not be created (typically because it already
        # exists): clear monitor files left over from a previous run.
        files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.set_num_threads(1)

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # Work on a copy so that the shared ENV_CONFIG is not mutated.
    env_config = ENV_CONFIG.copy()
    env_config['number_of_workers'] = number_of_workers
    env_config['enable_0action_boost'] = args.enable_0action_boost
    envs = [
        make_env(args.env_name, args.seed, i, log_dir, args.add_timestep,
                 env_config) for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Only 1-D observation spaces are wrapped for normalisation.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs,
                            ob=not args.disable_env_normalize_ob,
                            ret=not args.disable_env_normalize_rw,
                            gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    # args.num_stack consecutive observations are stacked along the first axis.
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    print("#######")
    # `n` only exists on Discrete spaces; a purely diagnostic print should not
    # abort the run for other space types, so fall back to None.
    print("action space.n : {}".format(getattr(envs.action_space, 'n', None)))
    print("#######")
    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the observation stack by one and append `obs` at the end."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    def save_policy(tag):
        """Write [policy, ob_rms-or-None] to a checkpoint whose name ends in `tag`."""
        save_path = os.path.join(args.save_dir, args.algo)
        # An existing directory is fine; any other failure is raised here
        # rather than later inside torch.save.
        os.makedirs(save_path, exist_ok=True)

        # A really ugly way to save a model to CPU
        save_model = actor_critic
        if args.cuda:
            save_model = copy.deepcopy(actor_critic).cpu()

        # ob_rms is only present when envs carries normalisation statistics.
        save_model = [save_model, getattr(envs, 'ob_rms', None)]
        model_name = "{}-{}-{}_w{}-{}.pt".format(args.env_name, args.algo,
                                                 args.save_model_postfix,
                                                 number_of_workers, tag)
        torch.save(save_model, os.path.join(save_path, model_name))

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()
            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # masks is 0 for processes whose episode just ended, 1 otherwise.
            # final_rewards keeps the return of the last finished episode of
            # each process; episode_rewards restarts from 0 on episode end.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # If done then clean the history of observations.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

            if args.enable_debug_info_print:
                print("#####")
                print("cpu_action: {}".format(cpu_actions))
                print("envs reward: {}".format(reward))
                print("info stats reward: {}".format(
                    info[0]["stats_relative_reward_regret"] +
                    info[0]["stats_relative_reward_penalty"]))
                print("final_rewards after masks: {}".format(final_rewards))
                print(
                    "episode_rewards after masks: {}".format(episode_rewards))

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_policy(j)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, log_dir, vis_title, args.algo,
                                  args.num_frames)
            except IOError:
                pass

    # save final policy (done regardless of args.save_dir being empty, as in
    # the original code)
    save_policy("final")
    return True
Пример #27
0
def main():
    """Train an actor-critic policy with A2C, PPO or ACKTR and log progress.

    All configuration is read from the module-level ``args`` namespace and
    the number of updates from the module-level ``num_updates``.  Each update
    collects ``args.num_steps`` transitions from ``args.num_processes``
    environments, performs the algorithm-specific parameter update, and then
    periodically saves the model, prints statistics and (when ``args.vis`` is
    set) refreshes the visdom plot.

    Raises:
        ValueError: If ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
        AssertionError: If a recurrent policy is requested together with the
            MLP policy.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        # Imported lazily so visdom is only required when plotting is enabled.
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Only flat (1-D) observation spaces are wrapped in VecNormalize.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Frame stacking multiplies the leading observation dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)

    # Large 3-D observations use the CNN policy; everything else uses the MLP.
    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)
    else:
        # Fail here instead of later with an unrelated UnboundLocalError.
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the stacked frames in ``current_obs`` and append ``obs``."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    # episode_rewards accumulates the running return of the current episode;
    # final_rewards keeps the return of the last finished episode.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            # masks is 0.0 for environments that just finished, 1.0 otherwise.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero the stacked frames of finished environments; 4-D
            # observations need the mask expanded by two trailing dimensions.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        # Value of the last stored observation, used to bootstrap the returns.
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            # Re-wrapping advantages.data keeps the policy loss from
            # back-propagating into the value estimates.
            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            # Normalize advantages; the epsilon guards against a zero std.
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok tolerates an existing directory while still surfacing
            # real failures such as missing permissions.
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Store the observation normalization statistics next to the
            # model when the environment wrapper exposes them.
            save_model = [save_model, getattr(envs, 'ob_rms', None)]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
Пример #28
0
def main():
    """Train a policy with A2C, PPO or ACKTR, then report a validation return.

    All configuration is read from the module-level ``args`` namespace and
    the number of updates from the module-level ``num_updates``.  Evaluation
    seeds are loaded from the JSON file ``args.eval_env_seeds_file``.  After
    training, the policy is evaluated with ``evaluate_with_seeds`` on a
    separate single evaluation environment and the mean of the returned
    values is passed to ``report_results`` as ``validation_return``.

    Raises:
        ValueError: If ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    with open(args.eval_env_seeds_file, 'r') as f:
        eval_env_seeds = json.load(f)

    if args.vis:
        # Imported lazily so visdom is only required when plotting is enabled.
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]

    # The evaluation environment logs into its own sub-directory.
    eval_dir = os.path.join(args.log_dir, "eval/")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(eval_dir, exist_ok=True)
    eval_env = [
        make_env(args.env_name,
                 args.seed,
                 0,
                 eval_dir,
                 args.add_timestep,
                 early_resets=True)
    ]
    eval_env = DummyVecEnv(eval_env)

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    if len(envs.observation_space.shape) == 1:
        # Don't touch rewards for evaluation
        eval_env = VecNormalize(eval_env, ret=False)
        # set running filter to be the same
        eval_env.ob_rms = envs.ob_rms

    # Frame stacking multiplies the leading observation dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail here instead of with an unbound ``agent`` after the first
        # rollout has already been collected.
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the stacked frames in ``current_obs`` and append ``obs``."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    # episode_rewards accumulates the running return of the current episode;
    # final_rewards keeps the return of the last finished episode.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            # masks is 0.0 for environments that just finished, 1.0 otherwise.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero the stacked frames of finished environments; 4-D
            # observations need the mask expanded by two trailing dimensions.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        # Value of the last stored observation, used to bootstrap the returns.
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok tolerates an existing directory while still surfacing
            # real failures such as missing permissions.
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Store the observation normalization statistics next to the
            # model when the environment wrapper exposes them.
            save_model = [save_model, getattr(envs, 'ob_rms', None)]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass

    validation_returns = evaluate_with_seeds(eval_env, actor_critic, args.cuda,
                                             eval_env_seeds)

    report_results([
        dict(name='validation_return',
             type='objective',
             value=np.mean(validation_returns))
    ])
Пример #29
0
def main():
    """Train a policy with A2C, PPO or ACKTR while exporting frames to LMDB.

    All configuration is read from the module-level ``args`` namespace and
    the number of updates from the module-level ``num_updates``.  In this
    variant ``envs.step`` returns two extra values (``wr_obs`` and
    ``wr_reward``); every entry whose reward is positive is handed to
    ``convert_to_lmdb`` together with a running index, targeting the
    directory ``<args.lmdb_path>/<args.env_name>``.

    Raises:
        ValueError: If ``args.algo`` is not 'a2c', 'ppo' or 'acktr'.
    """
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    torch.set_num_threads(1)

    if args.vis:
        # Imported lazily so visdom is only required when plotting is enabled.
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Only flat (1-D) observation spaces are wrapped in VecNormalize.
    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    # Frame stacking multiplies the leading observation dimension.
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy)

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)
    else:
        # Fail here instead of with an unbound ``agent`` after the first
        # rollout has already been collected.
        raise ValueError("Unsupported algorithm: {!r}".format(args.algo))

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        """Shift the stacked frames in ``current_obs`` and append ``obs``."""
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    # episode_rewards accumulates the running return of the current episode;
    # final_rewards keeps the return of the last finished episode.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()

    # Running index of the samples exported to LMDB.
    lmdb_idx = 0
    lmdb_dir = os.path.join(args.lmdb_path, args.env_name)
    if os.path.isdir(lmdb_dir):
        print('Directory already exists.')
    # Creating the 'test' sub-directory also creates lmdb_dir as its parent.
    # exist_ok makes sure 'test' is created even when lmdb_dir already exists,
    # and real errors (e.g. permissions) are no longer swallowed.
    os.makedirs(os.path.join(lmdb_dir, 'test'), exist_ok=True)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, states = actor_critic.act(
                    rollouts.observations[step], rollouts.states[step],
                    rollouts.masks[step])
            cpu_actions = action.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            # obs, reward, done, info = envs.step(cpu_actions)
            # Original note: wr_obs / wr_reward are the "unwrapped obs, reward".
            obs, reward, done, info, wr_obs, wr_reward = envs.step(cpu_actions)
            # sample images
            # img = np.squeeze(np.transpose(obs[3], (1, 2, 0)), 2)
            # Export only the entries that came with a positive reward.
            for img, rwd in zip(wr_obs, wr_reward):
                if rwd > 0:
                    lmdb_idx += 1
                    convert_to_lmdb(img, rwd, lmdb_dir, lmdb_idx)

            # Evaluate unwrapped rewards
            # model = Model()
            # model.load(args.digit_checkpoint)
            # model.cuda()
            # accuracy = digit_eval(image, length_labels, digits_labels, model)
            # img.show()

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            # masks is 0.0 for environments that just finished, 1.0 otherwise.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            # Zero the stacked frames of finished environments; 4-D
            # observations need the mask expanded by two trailing dimensions.
            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states, action, action_log_prob,
                            value, reward, masks)

        # Value of the last stored observation, used to bootstrap the returns.
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.observations[-1],
                                                rollouts.states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            # exist_ok tolerates an existing directory while still surfacing
            # real failures such as missing permissions.
            os.makedirs(save_path, exist_ok=True)

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            # Store the observation normalization statistics next to the
            # model when the environment wrapper exposes them.
            save_model = [save_model, getattr(envs, 'ob_rms', None)]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(), dist_entropy,
                        value_loss, action_loss))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
Пример #30
0
def main():
    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    train_envs = make_vec_envs(args.env_name,
                               args.seed,
                               args.num_processes,
                               args.gamma,
                               args.no_norm,
                               args.num_stack,
                               args.log_dir,
                               args.add_timestep,
                               device,
                               allow_early_resets=False)

    if args.eval_interval:
        eval_seed = args.seed if args.seed is None else args.seed + args.num_processes
        eval_envs = make_vec_envs(args.env_name,
                                  eval_seed,
                                  args.num_processes // 4,
                                  args.gamma,
                                  args.no_norm,
                                  args.num_stack,
                                  eval_log_dir,
                                  args.add_timestep,
                                  device=device,
                                  allow_early_resets=True,
                                  eval=True,
                                  rank_offsest=args.num_processes)

        if eval_envs.venv.__class__.__name__ == "VecNormalize":
            eval_envs.venv.ob_rms = train_envs.venv.ob_rms
    else:
        eval_envs = None

    print(train_envs.observation_space.shape)

    noisy_net = True

    actor_critic = create_policy(
        train_envs.observation_space,
        train_envs.action_space,
        name='basic',
        nn_kwargs={
            #'batch_norm': False if args.algo == 'acktr' else True,
            'recurrent': 'lstm' if args.recurrent_policy else '',
            'hidden_size': 512,
        },
        noisy_net=noisy_net,
        train=True)

    if args.resume and os.path.isfile(args.resume):
        print('Resuming from checkpoint (%s)' % args.resume)
        state_dict, ob_rms = torch.load(args.resume, map_location='cpu')
        actor_critic.load_state_dict(state_dict)

    actor_critic.to(device)

    if args.algo.startswith('a2c'):
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               lr_schedule=lr_update_schedule,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo.startswith('ppo'):
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         lr_schedule=lr_update_schedule,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.algo.endswith('sil'):
        agent = algo.SIL(agent,
                         update_ratio=args.sil_update_ratio,
                         epochs=args.sil_epochs,
                         batch_size=args.sil_batch_size,
                         value_loss_coef=args.sil_value_loss_coef
                         or args.value_loss_coef,
                         entropy_coef=args.sil_entropy_coef
                         or args.entropy_coef)
        replay = ReplayStorage(1e5,
                               args.num_processes,
                               args.gamma,
                               0.1,
                               train_envs.observation_space.shape,
                               train_envs.action_space,
                               actor_critic.recurrent_hidden_state_size,
                               device=device)
    else:
        replay = None

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              train_envs.observation_space.shape,
                              train_envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = train_envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    for j in range(num_updates):
        if noisy_net:
            actor_critic.reset_noise()

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = train_envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done],
                                 device=device)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks)
            if replay is not None:
                replay.insert(rollouts.obs[step],
                              rollouts.recurrent_hidden_states[step], action,
                              reward, done)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        value_loss, action_loss, dist_entropy, other_metrics = agent.update(
            rollouts, j, replay)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model.state_dict(),
                hasattr(train_envs.venv, 'ob_rms') and train_envs.venv.ob_rms
                or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        total_num_steps = (j + 1) * update_factor

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {}, last {} mean/median reward {:.1f}/{:.1f}, "
                "min / max reward {:.1f}/{:.1f}, value/action loss {:.5f}/{:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss),
                end=', ' if other_metrics else '\n')
            if 'sil_value_loss' in other_metrics:
                print("SIL value/action loss {:.1f}/{:.1f}.".format(
                    other_metrics['sil_value_loss'],
                    other_metrics['sil_action_loss']))

        if args.eval_interval and len(
                episode_rewards) > 1 and j > 0 and j % args.eval_interval == 0:
            actor_critic.eval()

            eval_episode_rewards = []
            num_eval_processes = args.num_processes // 4
            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(
                2,
                num_eval_processes,
                actor_critic.recurrent_hidden_state_size,
                device=device)
            eval_masks = torch.zeros(num_eval_processes, 1, device=device)

            while len(eval_episode_rewards) < 50:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs,
                        eval_recurrent_hidden_states,
                        eval_masks,
                        deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)
                eval_masks = torch.tensor([[0.0] if done_ else [1.0]
                                           for done_ in done],
                                          device=device)
                for info in infos:
                    if 'episode' in info.keys():
                        eval_episode_rewards.append(info['episode']['r'])

            print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
                len(eval_episode_rewards), np.mean(eval_episode_rewards)))

            actor_critic.train()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass