예제 #1
0
    def __init__(self, envs, hparams):

        # self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        # self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']



        if hparams['dropout'] == True:
            print ('CNNPolicy_dropout2')
            actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
            # actor_critic = CNNPolicy_dropout(self.obs_shape[0], envs.action_space)
        elif len(envs.observation_space.shape) == 3:
            print ('CNNPolicy2')
            actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
            # actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        self.action_shape = action_shape

        rollouts = RolloutStorage_list() #self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if self.cuda:
            actor_critic.cuda()
            # rollouts.cuda()

        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        else:
            print ('no opt specified')

        self.actor_critic = actor_critic
        self.rollouts = rollouts
	def select_network(self):
		if len(self.envs.observation_space.shape) == 3:
			actor_critic = CNNPolicy(self.obs_shape[0], self.envs.action_space, 
				self.args.recurrent_policy)
		else:
			assert not self.args.recurrent_policy, \
				"Recurrent policy is not implemented for the MLP controller"
			actor_critic = MLPPolicy(self.obs_shape[0], self.envs.action_space)
			#actor_critic = BPW_MLPPolicy(obs_shape[0], self.envs.action_space)     
		return actor_critic
예제 #3
0
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.ppo_epoch = hparams['ppo_epoch']
        self.batch_size = hparams['batch_size']
        self.clip_param = hparams['clip_param']



        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        self.action_shape = action_shape

        rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if self.cuda:
            actor_critic.cuda()
            rollouts.cuda()

        self.eps = hparams['eps']

        # self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])

        # if hparams['lr_schedule'] == 'linear':
        self.init_lr = hparams['lr']
        self.final_lr = hparams['final_lr']
        #     lr_func = lambda epoch: max( init_lr*(1.-(epoch/500.)), final_lr)
        #     self.optimizer2 = lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lr_func)
        # self.current_lr = hparams['lr']


        self.actor_critic = actor_critic
        self.rollouts = rollouts


        self.old_model = copy.deepcopy(self.actor_critic)
예제 #4
0
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']



        if hparams['dropout'] == True:
            print ('CNNPolicy_dropout2')
            actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
            # actor_critic = CNNPolicy_dropout(self.obs_shape[0], envs.action_space)
        elif len(envs.observation_space.shape) == 3:
            print ('CNNPolicy2')
            actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
            # actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        self.action_shape = action_shape

        rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if self.cuda:
            actor_critic.cuda()
            rollouts.cuda()

        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print ('no opt specified')

        self.actor_critic = actor_critic
        self.rollouts = rollouts


        self.rollouts_list = RolloutStorage_list()
예제 #5
0
    def load_policy(self):
        actor_critic = MLPPolicy(self.args.obs_shape[1], self.args.full_state_shape[1], self.env.robot.action_space,
                                 symm_policy=self.args.symm_policy)
        print(os.path.join(self.args.load_dir + self.args.algo, self.args.phase, self.args.env_name, self.args.env_name
                           + self.args.tr_itr + ".pt"))
        state_dict, ob_rms, st_rms, ret_rms = \
            torch.load(
                os.path.join(self.args.load_dir + self.args.algo, self.args.phase, self.args.env_name,
                             self.args.env_name
                             + self.args.tr_itr + ".pt"),
                map_location='cpu')

        actor_critic.load_state_dict(state_dict)
        actor_critic.train(False)
        actor_critic.eval()
        self.env.robot.ob_rms = ob_rms
        return actor_critic
예제 #6
0
    def __init__(self, envs, cuda, num_steps, num_processes, obs_shape, lr,
                 eps, alpha, use_gae, gamma, tau, value_loss_coef,
                 entropy_coef):

        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]

        if cuda:
            actor_critic.cuda()

        # if args.algo == 'a2c':
        self.optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps,
                                       alpha)

        rollouts = RolloutStorage(num_steps, num_processes, obs_shape,
                                  envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if cuda:
            rollouts.cuda()

        self.actor_critic = actor_critic
        self.rollouts = rollouts

        self.use_gae = use_gae
        self.gamma = gamma
        self.tau = tau

        self.obs_shape = obs_shape
        self.action_shape = action_shape
        self.num_steps = num_steps
        self.num_processes = num_processes
        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
예제 #7
0
파일: main.py 프로젝트: rfarouni/Other_Code
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'






    print (args.cuda)
    print (args.num_steps)
    print (args.num_processes)
    print (args.lr)
    print (args.eps)
    print (args.alpha)
    print (args.use_gae)
    print (args.gamma)
    print (args.tau)
    print (args.value_loss_coef)
    print (args.entropy_coef)
    fdsafasd

    # if args.vis:
    #     from visdom import Visdom
    #     viz = Visdom()
    #     win = None





    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    # print('here3')
    # fdasf





    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    # elif args.algo == 'ppo':
    #     optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    # elif args.algo == 'acktr':
    #     optimizer = KFACOptimizer(actor_critic)








    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
    #it has a self.state that is [steps, processes, obs]
    #steps is used to compute expected reward




    current_state = torch.zeros(args.num_processes, *obs_shape)
    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)








    rollouts.states[0].copy_(current_state)
    #set the first state to current state






    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    # if args.algo == 'ppo':
    #     old_model = copy.deepcopy(actor_critic)

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):



            # Sample actions
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            # make prediction using state that you put into rollouts


            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Obser reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            # print (state.shape) # [nProcesss, ndims, height, width]
            # fsdf
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            # these final rewards are only used for printing. but the mask is used in the storage, dont know why yet
            # oh its just clearing the env that finished, and resetting its episode_reward
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)




            rollouts.insert(step, current_state, action.data, value.data, reward, masks)
            # insert all that info into current step
            # not exactly why













        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data
        # use last state to make prediction of next value









        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))
        #not sure what this is








        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        # this computes R =  r + r+ ...+ V(t)  for each step









        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape)))
            # I think this aciton log prob could have been computed and stored earlier 
            # and didnt we already store the value prediction???






            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            # if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
            #     # Sampled fisher, see Martens 2014
            #     actor_critic.zero_grad()
            #     pg_fisher_loss = -action_log_probs.mean()

            #     value_noise = Variable(torch.randn(values.size()))
            #     if args.cuda:
            #         value_noise = value_noise.cuda()

            #     sample_values = values + value_noise
            #     vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

            #     fisher_loss = pg_fisher_loss + vf_fisher_loss
            #     optimizer.acc_stats = True
            #     fisher_loss.backward(retain_graph=True)
            #     optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            # if args.algo == 'a2c':
            #     nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        # elif args.algo == 'ppo':
        #     advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        #     advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        #     old_model.load_state_dict(actor_critic.state_dict())
        #     if hasattr(actor_critic, 'obs_filter'):
        #         old_model.obs_filter = actor_critic.obs_filter

        #     for _ in range(args.ppo_epoch):
        #         sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False)
        #         for indices in sampler:
        #             indices = torch.LongTensor(indices)
        #             if args.cuda:
        #                 indices = indices.cuda()
        #             states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
        #             actions_batch = rollouts.actions.view(-1, action_shape)[indices]
        #             return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

        #             # Reshape to do in a single forward pass for all steps
        #             values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))

        #             _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

        #             ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
        #             adv_targ = Variable(advantages.view(-1, 1)[indices])
        #             surr1 = ratio * adv_targ
        #             surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
        #             action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

        #             value_loss = (Variable(return_batch) - values).pow(2).mean()

        #             optimizer.zero_grad()
        #             (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
        #             optimizer.step()




        rollouts.states[0].copy_(rollouts.states[-1])
        # the first state is now the last state of the previous 









        # if j % args.save_interval == 0 and args.save_dir != "":
        #     save_path = os.path.join(args.save_dir, args.algo)
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass

        #     # A really ugly way to save a model to CPU
        #     save_model = actor_critic
        #     if args.cuda:
        #         save_model = copy.deepcopy(actor_critic).cpu()
        #     torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
예제 #8
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")
    print(args)

    try:
        os.makedirs(args.log_dir)
    except OSError:
        files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    for gamma in args.gamma:
        with open(args.log_dir + '/MSE_' + str(gamma) + '_monitor.csv',
                  "wt") as monitor_file:
            monitor = csv.writer(monitor_file)
            monitor.writerow([
                'update', 'error',
                str(int(args.num_frames) // args.num_steps)
            ])

    os.environ['OMP_NUM_THREADS'] = '1'

    print("Using env {}".format(args.env_name))

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    num_heads = len(
        args.gamma) if not args.reward_predictor else len(args.gamma) - 1

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0],
                                 envs.action_space,
                                 num_heads=num_heads,
                                 hidden_size=args.hidden_size)
    else:
        actor_critic = MLPPolicy(obs_shape[0],
                                 envs.action_space,
                                 num_heads=num_heads,
                                 reward_predictor=args.reward_predictor,
                                 use_s=args.use_s,
                                 use_s_a=args.use_s_a,
                                 use_s_a_sprime=args.use_s_a_sprime)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    lrs = [args.lr] * len(actor_critic.param_groups)

    if not args.reward_predictor:
        assert len(actor_critic.param_groups) == len(lrs)
        model_params = [{
            'params': model_p,
            'lr': args.lr
        } for model_p, lr in zip(actor_critic.param_groups, lrs)]
    else:
        model_params = [{
            'params': model_p,
            'lr': p_lr
        } for model_p, p_lr in zip(actor_critic.param_groups[:-1], lrs)]
        model_params.append({
            'params': actor_critic.param_groups[-1],
            'lr': args.lr_rp
        })

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(model_params,
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)

    elif args.algo == 'ppo':
        optimizer = optim.Adam(model_params, args.lr, eps=args.eps)

    rollouts = RolloutStorage(args.num_steps,
                              args.num_processes,
                              obs_shape,
                              envs.action_space,
                              actor_critic.state_size,
                              gamma=args.gamma,
                              use_rp=args.reward_predictor)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs, obs_tensor):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            obs_tensor[:, :-shape_dim0] = obs_tensor[:, shape_dim0:]
        obs_tensor[:, -shape_dim0:] = obs

    obs = envs.reset()

    update_current_obs(obs, current_obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    advantages_list = []

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            cpu_actions = add_gaussian_noise(cpu_actions, args.action_noise)

            # Obser reward and next obs
            obs, raw_reward, done, info = envs.step(cpu_actions)

            reward = np.copy(raw_reward)

            reward = add_gaussian_noise(reward, args.reward_noise)

            reward = epsilon_greedy(reward, args.reward_epsilon,
                                    args.reward_high, args.reward_low)

            raw_reward = torch.from_numpy(
                np.expand_dims(np.stack(raw_reward), 1)).float()

            episode_rewards += raw_reward

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs, current_obs)

            if args.reward_predictor:
                r_hat = actor_critic.predict_reward(
                    Variable(rollouts.observations[step], volatile=True),
                    action, Variable(current_obs, volatile=True))
                p_hat = min(args.rp_burn_in, j) / args.rp_burn_in
                estimate_reward = (1 -
                                   p_hat) * reward + p_hat * r_hat.data.cpu()
                reward = torch.cat([reward, estimate_reward], dim=-1)
                value = torch.cat([r_hat, value], dim=-1).data
            else:
                value = value.data

            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value, reward, masks,
                            raw_reward)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        if args.reward_predictor:
            if args.use_s or args.use_s_a:
                r_hat = actor_critic.predict_reward(
                    Variable(rollouts.observations[-1], volatile=True),
                    Variable(rollouts.actions[-1], volatile=True), None).data
                next_value = torch.cat([r_hat, next_value], dim=-1)
            else:
                next_value = torch.cat([
                    torch.zeros(list(next_value.size())[:-1] + [1]), next_value
                ],
                                       dim=-1)

        rollouts.compute_returns(next_value, args.use_gae, args.tau)

        if args.algo in ['a2c']:

            batch_states = Variable(rollouts.states[0].view(
                -1, actor_critic.state_size))
            batch_masks = Variable(rollouts.masks[:-1].view(-1, 1))
            batch_obs = Variable(rollouts.observations[:-1].view(
                -1, *obs_shape))
            batch_actions = Variable(rollouts.actions.view(-1, action_shape))

            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                batch_obs, batch_states, batch_masks, batch_actions)
            if args.reward_predictor:
                batch_obs_prime = Variable(rollouts.observations[1:].view(
                    -1, *obs_shape))
                values = torch.cat([
                    actor_critic.predict_reward(batch_obs, batch_actions,
                                                batch_obs_prime), values
                ],
                                   dim=-1)
            returns_as_variable = Variable(rollouts.returns[:-1])

            batched_v_loss = 0

            values = values.view(returns_as_variable.size())
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = returns_as_variable - values
            value_loss = advantages.pow(2).sum(-1).mean()
            action_loss = -(Variable(advantages[:, :, -1].unsqueeze(-1).data) *
                            action_log_probs).mean()
            if args.reward_predictor:
                rp_error = (values[:, :, 0].data -
                            rollouts.raw_rewards).pow(2).mean()
                advantages_list.append([
                    rp_error,
                    advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0]
                ])
            else:
                advantages_list.append(
                    advantages[:, :, -1].pow(2).mean().data.cpu().numpy()[0])

            optimizer.zero_grad()
            (batched_v_loss + value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]

            advantages = advantages[:, :, -1]

            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                data_generator = rollouts.feed_forward_generator(
                    advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ, observations_batch_prime, true_rewards_batch, \
                            noisy_observations_batch, true_observations_batch = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    if args.reward_predictor:
                        values = torch.cat([
                            actor_critic.predict_reward(
                                Variable(observations_batch),
                                Variable(actions_batch),
                                Variable(observations_batch_prime)), values
                        ],
                                           dim=-1)

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    td = (Variable(return_batch) - values).pow(2)

                    value_loss = td.sum(-1).mean()

                    if args.reward_predictor:
                        rp_error = (values[:, 0].data -
                                    true_rewards_batch).pow(2).mean()
                        advantages_list.append(
                            [rp_error, td[:, -1].mean(0).data.cpu().numpy()])
                    else:
                        advantages_list.append(
                            td[:, -1].mean(0).data.cpu().numpy())

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()

                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, "
                "entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(),
                    dist_entropy.data[0], value_loss.data[0],
                    action_loss.data[0]))

            if len(advantages_list) > 2:
                advantages_array = np.array(advantages_list).reshape(
                    -1, len(args.gamma)).T
                for g, gamma in enumerate(args.gamma):
                    with open(
                            args.log_dir + '/MSE_' + str(gamma) +
                            '_monitor.csv', "a") as monitor_file:
                        monitor = csv.writer(monitor_file)
                        monitor.writerow(
                            [total_num_steps,
                             np.mean(advantages_array[g])])

            advantages_list = []
예제 #9
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monit`or (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # logger = Logger(algorithm_name = args.algo, environment_name = args.env_name, folder = args.folder)
    # logger.save_args(args)

    # print ("---------------------------------------")
    # print ('Saving to', logger.save_folder)
    # print ("---------------------------------------")

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
        target_actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                        args.recurrent_policy)

    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
        target_actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    for param, target_param in zip(actor_critic.parameters(),
                                   target_actor_critic.parameters()):
        target_param.data.copy_(param.data)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    actor_regularizer_criterion = nn.KLDivLoss()
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))
        """
        Used for KL Constraint in case of Continuous Action Stochastic Policies
        """
        # target_values, target_action_log_probs, target_dist_entropy, target_states, target_action_mean, target_action_std = target_actor_critic.evaluate_actions_mean_and_std(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
        #                                                                                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
        #                                                                                Variable(rollouts.masks[:-1].view(-1, 1)),
        #                                                                                Variable(rollouts.actions.view(-1, action_shape)))

        # actor_regularizer_loss = (torch.log(action_std/target_action_std) + (action_std.pow(2) + (action_mean - target_action_mean).pow(2))/(2*target_action_std.pow(2)) - 0.5)

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        ### Loss with regularizer added
        ##action_loss = -(Variable(advantages.data) * action_log_probs).mean() + args.actor_lambda * actor_regularizer_loss.mean(0).sum()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        total_loss = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef
        total_loss.backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()

        ## Exponential average for target updates
        #if (j%args.target_update_interval == 0):
        # for param, target_param in zip(actor_critic.parameters(), target_actor_critic.parameters()):
        #     target_param.data.copy_(args.target_tau * param.data + (1 - args.target_tau) * target_param.data)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

            final_rewards_mean = [final_rewards.mean()]
            final_rewards_median = [final_rewards.median()]
            final_rewards_min = [final_rewards.min()]
            final_rewards_max = [final_rewards.max()]

            # logger.record_data(final_rewards_mean, final_rewards_median, final_rewards_min, final_rewards_max)
            # logger.save()

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
예제 #10
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    # T choose whetehr to visualize
    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])
    # T get shape of observation array of the environment
    obs_shape = envs.observation_space.shape
    # T adjusting the shape; not sure what the * is
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    #T initialize the actor critic; MLP and CNN classes imported from model.py
    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    #T - some kind of setup with the actor_critic
    if args.finetune:
        checkpoint_path = save_path = os.path.join(args.save_dir, args.algo,
                                                   args.checkpoint)
        state_dict = torch.load(checkpoint_path)
        print("Finetuning from checkpoint: %s, at step: %d" %
              (checkpoint_path, state_dict['update']))
        actor_critic.load_state_dict(state_dict['model_state_dict'])
        keep_layers = [
            'v_fc3.weight', 'v_fc3.bias', 'a_fc2.weight', 'a_fc2.bias',
            'dist.fc_mean.weight', 'dist.fc_mean.bias', 'dist.logstd._bias'
        ]
        for name, param in actor_critic.named_parameters():
            if name not in keep_layers:
                param.requires_grad = False
        for name, param in actor_critic.named_parameters():
            print('Param name: %s, requires_grad: %d' %
                  (name, param.requires_grad))
    # T set up dimensions of the action space
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    # T all arguments imported from arguments.py
    # T enable cuda pythorch tensor support
    if args.cuda:
        actor_critic.cuda()

    # T - pull arguments and choose algorithm and optimizer
    if args.algo == 'a2c':
        optimizer = optim.RMSprop(filter(lambda p: p.requires_grad,
                                         actor_critic.parameters()),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    #TO-DO figure out how to restore optimizer parameters when freezing some weights
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    # return all zeros, so nothing observed
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    # T-not sure what this function is doing??
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    # T - reset the environment; call function to update observation
    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    # T - initialize rewards to be zero
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        old_model = copy.deepcopy(actor_critic)

    start = time.time()
    # T - begin iterative loop
    for j in range(num_updates):
        # T - take steps through single instance
        # T - this is the loop where action/critic happens
        for step in range(args.num_steps):
            # Sample actions
            # T - buried by the action method ultimately comes from torch.nn.Module
            value, action = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # T done bool returned by steps; indicates if failure occurred (done)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks
            #T - now update the observation matrix
            update_current_obs(obs)
            #T - store what happened in this step
            rollouts.insert(step, current_obs, action.data, value.data, reward,
                            masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.observations[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            old_model.load_state_dict(actor_critic.state_dict())

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(
                    range(args.num_processes * args.num_steps)),
                                       args.batch_size * args.num_processes,
                                       drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    observations_batch = rollouts.observations[:-1].view(
                        -1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(
                        -1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(actions_batch))

                    _, old_action_log_probs, _ = old_model.evaluate_actions(
                        Variable(observations_batch, volatile=True),
                        Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    optimizer.step()

        rollouts.observations[0].copy_(rollouts.observations[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            file_name = FILE_PREFIX + '.pt'
            #torch.save(save_model, os.path.join(save_path, file_name))
            data = {
                'update': j,
                'model_state_dict': save_model.state_dict(),
                'optim_state_dict': optimizer.state_dict()
            }
            torch.save(data, os.path.join(save_path, file_name))
        # T - write out some log information (not important for us)
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
    viz = Visdom(port=args.port)
    win = None

env = blt.bullet_env('DIRECT', args.env_name, 50, args.log_dir)
obs_shape = (env.robot_dict[args.env_name].observation_space.shape[0], )
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

# load or new
if args.cont_learning == True:
    print('#### continue learning... ####')
    actor_critic, ob_rms = \
        torch.load(os.path.join(args.save_dir + args.algo + '/', args.env_name + ".pt"))
    env.robot_dict[args.env_name].ob_rms = ob_rms
else:
    print('$$$$ new learning... $$$$')
    actor_critic = MLPPolicy(obs_shape[0],
                             env.robot_dict[args.env_name].action_space)
    print('model desc: ', actor_critic)

if args.cuda:
    print('cuda is available..2')
    actor_critic.cuda()

# algorithm selection
if args.algo == 'a2c':
    agent = algo.A2C_ACKTR(actor_critic,
                           args.value_loss_coef,
                           args.entropy_coef,
                           lr=args.lr,
                           eps=args.eps,
                           alpha=args.alpha,
                           max_grad_norm=args.max_grad_norm)
예제 #12
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    # Instantiate the environment
    config = getattr(configs, args.config)()

    # We make this in order to get the shapes.
    dummy_env = make_env(args, config, -1,
                         [config['agent'](game_type=config['game_type'])])()
    envs_shape = dummy_env.observation_space.shape[1:]
    obs_shape = (envs_shape[0], *envs_shape[1:])
    action_space = dummy_env.action_space
    if len(envs_shape) == 3:
        if args.model == 'convnet':
            actor_critic = lambda saved_model: PommeCNNPolicySmall(
                obs_shape[0], action_space, args)
        elif args.model == 'resnet':
            actor_critic = lambda saved_model: PommeResnetPolicy(
                obs_shape[0], action_space, args)
    else:
        actor_critic = lambda saved_model: MLPPolicy(obs_shape[0], action_space
                                                     )

    # We need to get the agent = config.agent(agent_id, config.game_type) and then
    # pass that agent into the agent.PPOAgent
    training_agents = []
    saved_models = args.saved_models
    saved_models = saved_models.split(
        ',') if saved_models else [None] * args.nagents
    assert (len(saved_models)) == args.nagents
    for saved_model in saved_models:
        # TODO: implement the model loading.
        model = actor_critic(saved_model)
        agent = config['agent'](game_type=config['game_type'])
        agent = ppo_agent.PPOAgent(agent, model)
        training_agents.append(agent)

    if args.how_train == 'simple':
        # Simple trains a single agent against three SimpleAgents.
        assert (
            args.nagents == 1), "Simple training should have a single agent."
        num_training_per_episode = 1
    elif args.how_train == 'homogenous':
        # Homogenous trains a single agent against itself (self-play).
        assert (args.nagents == 1
                ), "Homogenous toraining should have a single agent."
        num_training_per_episode = 4
    elif args.how_train == 'heterogenous':
        assert (args.nagents >
                1), "Heterogenous training should have more than one agent."
        print("Heterogenous training is not implemented yet.")
        return

    # NOTE: Does this work correctly? Will the threads operate independently?
    envs = [
        make_env(args, config, i, training_agents)
        for i in range(args.num_processes)
    ]
    envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs)
    # TODO: Figure out how to render this for testing purposes. The following link may help:
    # https://github.com/MG2033/A2C/blob/master/envs/subproc_vec_env.py

    for agent in training_agents:
        agent.initialize(args, obs_shape, action_space,
                         num_training_per_episode)

    current_obs = torch.zeros(num_training_per_episode, args.num_processes,
                              *obs_shape)

    def update_current_obs(obs):
        current_obs = torch.from_numpy(obs).float()

    obs = envs.reset()
    update_current_obs(obs)
    if args.how_train == 'simple':
        training_agents[0].update_rollouts(obs=current_obs, timestep=0)
    elif args.how_train == 'homogenous':
        training_agents[0].update_rollouts(obs=current_obs, timestep=0)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_training_per_episode, args.num_processes, 1])
    final_rewards = torch.zeros(
        [num_training_per_episode, args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        for agent in training_agents:
            agent.cuda()

    stats = utils.init_stats(args)
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            value_agents = []
            action_agents = []
            action_log_prob_agents = []
            states_agents = []
            episode_reward = []
            cpu_actions_agents = []

            if args.how_train == 'simple':
                value, action, action_log_prob, states = training_agents[
                    0].act_pytorch(step, 0)
                value_agents.append(value)
                action_agents.append(action)
                action_log_prob_agents.append(action_log_prob)
                states_agents.append(states)
                cpu_actions = action.data.squeeze(1).cpu().numpy()
                cpu_actions_agents = cpu_actions
            elif args.how_train == 'homogenous':
                cpu_actions_agents = [[] for _ in range(args.num_processes)]
                for i in range(4):
                    value, action, action_log_prob, states = training_agents[
                        0].act_pytorch(step, i)
                    value_agents.append(value)
                    action_agents.append(action)
                    action_log_prob_agents.append(action_log_prob)
                    states_agents.append(states)
                    cpu_actions = action.data.squeeze(1).cpu().numpy()
                    for num_process in range(args.num_processes):
                        cpu_actions_agents[num_process].append(
                            cpu_actions[num_process])

            obs, reward, done, info = envs.step(cpu_actions_agents)
            reward = torch.from_numpy(np.stack(reward)).float().transpose(0, 1)
            episode_rewards += reward

            # import pdb; pdb.set_trace()
            if args.how_train == 'simple':
                masks = torch.FloatTensor(
                    [[0.0] * num_training_per_episode if done_ else [1.0] *
                     num_training_per_episode for done_ in done])
            elif args.how_train == 'homogenous':
                masks = torch.FloatTensor(
                    [[0.0] * num_training_per_episode if done_ else [1.0] *
                     num_training_per_episode
                     for done_ in done]).transpose(0, 1)

            final_rewards *= masks  # nagents x nprocesses x 1
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()

            reward_all = reward.unsqueeze(2)
            if args.how_train == 'simple':
                masks_all = masks.transpose(0, 1).unsqueeze(2)
            elif args.how_train == 'homogenous':
                masks_all = masks.unsqueeze(2)

            current_obs *= masks_all.unsqueeze(2).unsqueeze(2)
            update_current_obs(obs)

            states_all = torch.from_numpy(
                np.stack([x.data for x in states_agents])).float()
            action_all = torch.from_numpy(
                np.stack([x.data for x in action_agents])).float()
            action_log_prob_all = torch.from_numpy(
                np.stack([x.data for x in action_log_prob_agents])).float()
            value_all = torch.from_numpy(
                np.stack([x.data for x in value_agents])).float()

            if args.how_train in ['simple', 'homogenous']:
                training_agents[0].insert_rollouts(step, current_obs,
                                                   states_all, action_all,
                                                   action_log_prob_all,
                                                   value_all, reward_all,
                                                   masks_all)

        next_value_agents = []
        if args.how_train == 'simple':
            agent = training_agents[0]
            next_value_agents.append(agent.run_actor_critic(-1, 0))
            advantages = [
                agent.compute_advantages(next_value_agents, args.use_gae,
                                         args.gamma, args.tau)
            ]
        elif args.how_train == 'homogenous':
            agent = training_agents[0]
            next_value_agents = [
                agent.run_actor_critic(-1, num_agent) for num_agent in range(4)
            ]
            advantages = [
                agent.compute_advantages(next_value_agents, args.use_gae,
                                         args.gamma, args.tau)
            ]

        final_action_losses = []
        final_value_losses = []
        final_dist_entropies = []

        for num_agent, agent in enumerate(training_agents):
            for _ in range(args.ppo_epoch):
                data_generator = agent.feed_forward_generator(
                    advantages[num_agent], args)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = agent.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()
                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()
                    agent.optimize(value_loss, action_loss, dist_entropy,
                                   args.entropy_coef, args.max_grad_norm)

            final_action_losses.append(action_loss)
            final_value_losses.append(value_loss)
            final_dist_entropies.append(dist_entropy)

            agent.after_update()

        #####
        # Save model.
        #####
        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # XXX: new way for saving model
            # XXX: we should also add the optimizer along with the state_dict
            for num_agent, agent in enumerate(training_agents):
                save_model = agent.get_model()
                save_optimizer = agent.get_optimizer()
                torch.save(
                    {
                        'epoch': j,
                        'arch': args.model,
                        'state_dict': save_model.state_dict(),
                        'optimizer': save_optimizer.state_dict(),
                    },
                    os.path.join(
                        save_path,
                        "train={}-config={}-model={}-agent={}.pt".format(
                            args.how_train, args.config, args.model,
                            num_agent)))

        #####
        # Log to console.
        #####
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, avg entropy {:.5f}, avg value loss {:.5f}, avg policy loss {:.5f}"
                .format(
                    j, total_num_steps, int(total_num_steps / (end - start)),
                    final_rewards.mean(), final_rewards.median(),
                    final_rewards.min(), final_rewards.max(),
                    np.mean([
                        dist_entropy.data[0]
                        for dist_entropy in final_dist_entropies
                    ]),
                    np.mean([
                        value_loss.data[0] for value_loss in final_value_losses
                    ]),
                    np.mean([
                        action_loss.data[0]
                        for action_loss in final_action_losses
                    ])))

            # save stats to h5 file
            # TODO: need to fix this error
            # stats = utils.log_stats(args, stats, j, int(total_num_steps / (end - start)), \
            #     final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), \
            #     np.mean([action_loss.data[0] for action_loss in final_action_losses]), \
            #     np.mean([value_loss.data[0] for value_loss in final_value_losses]), \
            #     np.mean([dist_entropy.data[0] for dist_entropy in final_dist_entropies]))
            #
            # log_path = os.path.join(args.log_dir)
            # filename_stats = '%s/stats.h5' % log_path
            # utils.save_dict(filename_stats, stats)

        #####
        # Log to Visdom.
        #####
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, 'ppo')
            except IOError:
                pass
예제 #13
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        old_model = copy.deepcopy(actor_critic)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(
                Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data,
                            reward, masks)

        next_value = actor_critic(Variable(rollouts.states[-1],
                                           volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(
                -1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.states[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            old_model.load_state_dict(actor_critic.state_dict())
            if hasattr(actor_critic, 'obs_filter'):
                old_model.obs_filter = actor_critic.obs_filter

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(
                    range(args.num_processes * args.num_steps)),
                                       args.batch_size * args.num_processes,
                                       drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    states_batch = rollouts.states[:-1].view(
                        -1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(
                        -1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                        Variable(states_batch), Variable(actions_batch))

                    _, old_action_log_probs, _ = old_model.evaluate_actions(
                        Variable(states_batch, volatile=True),
                        Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    optimizer.step()

        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            print(
                "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, (j + 1) * args.num_processes * args.num_steps,
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        -dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

        if j % args.vis_interval == 0:
            win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
예제 #14
0
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = UsbCamEnv(ENV_IMG_W, ENV_IMG_H, env_done_reward)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    action_shape = envs.action_space.shape[0]

    print('+++++++++++++++++++++++++++++++++++++')
    print('obs_shape:', obs_shape)
    print('action_shape:', action_shape)
    print('+++++++++++++++++++++++++++++++++++++')

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    old_model = copy.deepcopy(actor_critic)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.cpu().numpy()

            # Obser reward and next state
            state, reward, done, info = envs.step(cpu_actions)

            print('%3d  [%3d  %3d  %3d  %3d]  %3d' % (step,
                                                      int(envs.convert_2_real_action(cpu_actions)[0, 0]),
                                                      int(envs.convert_2_real_action(cpu_actions)[0, 1]),
                                                      int(envs.convert_2_real_action(cpu_actions)[0, 2]),
                                                      int(envs.convert_2_real_action(cpu_actions)[0, 3]),
                                                      reward[0]))

            if reward[0] >= search_done_reward:
                sys.exit()

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        old_model.load_state_dict(actor_critic.state_dict())
        if hasattr(actor_critic, 'obs_filter'):
            old_model.obs_filter = actor_critic.obs_filter

        for _ in range(args.ppo_epoch):
            sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False)
            for indices in sampler:
                indices = torch.LongTensor(indices)
                if args.cuda:
                    indices = indices.cuda()
                states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))

                _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

                ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                adv_targ = Variable(advantages.view(-1, 1)[indices])
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                value_loss = (Variable(return_batch) - values).pow(2).mean()

                optimizer.zero_grad()
                (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                optimizer.step()

        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, j * args.num_processes * args.num_steps,
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
예제 #15
0
        SubprocVecEnv([
            make_env(args.env_name[j], args.seed, i, log_dir_teacher[j])
            for i in range(args.num_processes)
        ]) for j in range(args.num_heads)
    ]

    obs_shape = envs_teacher[0].observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs_teacher[0].observation_space.shape) == 3:
        teacher = CNNPolicy(obs_shape[0], envs_teacher.action_space)
        # TODO: change student
        student = CNNPolicy(obs_shape[0], envs_student_train.action_space)
    else:
        teacher = [
            MLPPolicy(obs_shape[0], envs_teacher[i].action_space)
            for i in range(args.num_heads)
        ]
        # TODO: change student
        student = MultiHead_MLPPolicy(obs_shape[0],
                                      envs_student_train[0].action_space,
                                      num_heads=args.num_heads)

    # load teacher model from checkpoint
    for i in range(args.num_heads):
        assert os.path.exists(args.checkpoint[i])
        state_dict = torch.load(args.checkpoint[i])
        print('Loading teacher network from : %s' % args.checkpoint[i])
        teacher[i].load_state_dict(state_dict['model_state_dict'])

    if args.cuda:
예제 #16
0
writer = SummaryWriter()

# Hyper-parameters
BATCH_SIZE = 512
MEMORY_SIZE = 5000
LR = 0.001
test_interval = 1000
test_episodes = 100
TIMESTEPS = 10000
EPSILON_ENDT = 3000

env = gym.make('CartPole-v0')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = DQNAgent(d_actions=env.action_space.n, device=device, batch_size=BATCH_SIZE, memory_size=MEMORY_SIZE, lr=LR,
                 epsilon_endt=EPSILON_ENDT)
agent.policy_net = MLPPolicy(d_state=env.observation_space.shape[0], d_hidden=20,
                             d_action=env.action_space.n).to(device)

init = time.time()
print("Init time {}".format(init-start))

num_episode = 0
episode_t = 0

state = env.reset()
state = torch.from_numpy(state).unsqueeze_(0).to(device=device, dtype=torch.float)
while agent.time_step < TIMESTEPS:
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action.item())
    episode_t += 1

    if not done:
예제 #17
0
 def select_network(self):
     actor_critic = MLPPolicy(self.obs_shape[0], self.env.action_space)
     return actor_critic
예제 #18
0
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    names = getListOfGames("train")

    envs = [make_env_train(names[i], args.seed, i, args.log_dir)
                for i in range(len(names))]
                
    # TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO TODO
    args.num_processes = len(envs)
    # REMEMBER YOU CHENGED IT

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    #print(obs_shape)
    obs_shape = (obs_shape[0], *obs_shape[1:])
    #print(obs_shape)

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    # Making it paralel
    actor_critic = torch.nn.parallel.DataParallel(actor_critic).module

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
       actor_critic.cuda()

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, lr=args.lr,
                               eps=args.eps,
                               max_grad_norm=args.max_grad_norm)
        # Make agent DataParallel
        agent = torch.nn.parallel.DataParallel(agent).module

    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    # Make rollouts DataParallel
    rollouts = torch.nn.parallel.DataParallel(RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)).module
    current_obs = torch.nn.parallel.DataParallel(torch.zeros(envs.nenvs, *obs_shape)).module

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        # if args.num_stack > 1:
        #     current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                    Variable(rollouts.observations[step], volatile=True),
                    Variable(rollouts.states[step], volatile=True),
                    Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic.get_value(Variable(rollouts.observations[-1], volatile=True),
                                            Variable(rollouts.states[-1], volatile=True),
                                            Variable(rollouts.masks[-1], volatile=True)).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
예제 #19
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        viz_1 = Visdom()
        win = None
        win1 = None

    env_name_1 = 'HalfCheetahSmallFoot-v0'
    args.env_name = 'HalfCheetahSmallLeg-v0'

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    envs_1 = [
        make_env(env_name_1, args.seed, i, args.log_dir_1)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
        envs_1 = SubprocVecEnv(envs_1)
    else:
        envs = DummyVecEnv(envs)
        envs_1 = DummyVecEnv(envs_1)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)
        envs_1 = VecNormalize(envs_1)

    #same for both tasks
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    actor_critic_1 = MLPPolicy(obs_shape[0], envs_1.action_space)

    #same for both tasks
    action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()
        actor_critic_1.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    optimizer_1 = optim.RMSprop(actor_critic_1.parameters(),
                                args.lr,
                                eps=args.eps,
                                alpha=args.alpha)

    #Different for both tasks
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    rollouts_1 = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                                envs_1.action_space, actor_critic_1.state_size)
    current_obs_1 = torch.zeros(args.num_processes, *obs_shape)

    #Different update functions
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    def update_current_obs_1(obs):
        shape_dim0 = envs_1.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs_1[:, :-shape_dim0] = current_obs_1[:, shape_dim0:]
        current_obs_1[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    obs_1 = envs_1.reset()
    update_current_obs_1(obs_1)

    rollouts.observations[0].copy_(current_obs)
    rollouts_1.observations[0].copy_(current_obs_1)

    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    episode_rewards_1 = torch.zeros([args.num_processes, 1])
    final_rewards_1 = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()
        current_obs_1 = current_obs_1.cuda()
        rollouts_1.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions from branch 1
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))

            cpu_actions = action.data.squeeze(1).cpu().numpy()

            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

            #Sample actions from branch 2
            value_1, action_1, action_log_prob_1, states_1 = actor_critic_1.act(
                Variable(rollouts_1.observations[step], volatile=True),
                Variable(rollouts_1.states[step], volatile=True),
                Variable(rollouts_1.masks[step], volatile=True))

            cpu_actions_1 = action_1.data.squeeze(1).cpu().numpy()
            obs_1, reward_1, done_1, info_1 = envs_1.step(cpu_actions_1)
            reward_1 = torch.from_numpy(np.expand_dims(np.stack(reward_1),
                                                       1)).float()
            episode_rewards_1 += reward_1

            masks_1 = torch.FloatTensor([[0.0] if done_ else [1.0]
                                         for done_ in done_1])
            final_rewards_1 *= masks_1
            final_rewards_1 += (1 - masks_1) * episode_rewards_1
            episode_rewards_1 *= masks_1

            if args.cuda:
                masks_1 = masks_1.cuda()

            if current_obs_1.dim() == 4:
                current_obs_1 *= masks_1.unsqueeze(2).unsqueeze(2)
            else:
                current_obs_1 *= masks_1

            update_current_obs_1(obs_1)
            rollouts_1.insert(step, current_obs_1, states_1.data,
                              action_1.data, action_log_prob_1.data,
                              value_1.data, reward_1, masks_1)

        #Update for branch 1
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()
        rollouts.after_update()

        #share params branch 1 -> branch 2
        actor_critic_1.a_fc1.weight.data = copy.deepcopy(
            actor_critic.a_fc1.weight.data)
        actor_critic_1.a_fc1.bias.data = copy.deepcopy(
            actor_critic.a_fc1.bias.data)
        actor_critic_1.v_fc1.weight.data = copy.deepcopy(
            actor_critic.v_fc1.weight.data)
        actor_critic_1.v_fc1.bias.data = copy.deepcopy(
            actor_critic.v_fc1.bias.data)

        #Update for branch 2
        next_value_1 = actor_critic_1(
            Variable(rollouts_1.observations[-1], volatile=True),
            Variable(rollouts_1.states[-1], volatile=True),
            Variable(rollouts_1.masks[-1], volatile=True))[0].data

        rollouts_1.compute_returns(next_value_1, args.use_gae, args.gamma,
                                   args.tau)

        values_1, action_log_probs_1, dist_entropy_1, states_1 = actor_critic_1.evaluate_actions(
            Variable(rollouts_1.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts_1.states[0].view(-1, actor_critic_1.state_size)),
            Variable(rollouts_1.masks[:-1].view(-1, 1)),
            Variable(rollouts_1.actions.view(-1, action_shape)))

        values_1 = values_1.view(args.num_steps, args.num_processes, 1)
        action_log_probs_1 = action_log_probs_1.view(args.num_steps,
                                                     args.num_processes, 1)

        advantages_1 = Variable(rollouts_1.returns[:-1]) - values_1
        value_loss_1 = advantages_1.pow(2).mean()

        action_loss_1 = -(Variable(advantages_1.data) *
                          action_log_probs_1).mean()

        optimizer_1.zero_grad()
        (value_loss_1 * args.value_loss_coef + action_loss_1 -
         dist_entropy_1 * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic_1.parameters(),
                                args.max_grad_norm)
        optimizer_1.step()
        rollouts_1.after_update()

        #share params branch 2 -> branch 1
        actor_critic.a_fc1.weight.data = copy.deepcopy(
            actor_critic_1.a_fc1.weight.data)
        actor_critic.a_fc1.bias.data = copy.deepcopy(
            actor_critic_1.a_fc1.bias.data)
        actor_critic.v_fc1.weight.data = copy.deepcopy(
            actor_critic_1.v_fc1.weight.data)
        actor_critic.v_fc1.bias.data = copy.deepcopy(
            actor_critic_1.v_fc1.bias.data)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo,
                                     args.env_name + '_' + env_name_1)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_model = actor_critic_1
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_model_1 = copy.deepcopy(actor_critic_1).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]
            save_model_1 = [
                save_model_1,
                hasattr(envs_1, 'ob_rms') and envs_1.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            torch.save(save_model_1, os.path.join(save_path,
                                                  env_name_1 + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
            print(
                "Updates_1 {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards_1.mean(), final_rewards_1.median(),
                        final_rewards_1.min(), final_rewards_1.max(),
                        dist_entropy_1.data[0], value_loss_1.data[0],
                        action_loss_1.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
                win1 = visdom_plot(viz_1, win1, args.log_dir_1, env_name_1,
                                   args.algo)
            except IOError:
                pass
예제 #20
0
obs_shape = env.robot.observation_space.shape
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
state_shape = env.robot.state_space.shape
state_shape = (state_shape[0] * args.num_stack, *state_shape[1:])
action_shape = env.robot.action_space.shape
full_state_shape = (state_shape[0] * args.num_stack,
                    obs_shape[1] + state_shape[1])

if args.symm_policy:
    full_state_shape = state_shape
"""
----[ Load Policy ]----
"""
actor_critic = MLPPolicy(obs_shape[1],
                         full_state_shape[1],
                         env.robot.action_space,
                         symm_policy=args.symm_policy)

print(
    os.path.join(args.load_dir + args.algo, args.phase, args.env_name,
                 args.env_name + args.tr_itr + ".pt"))
# state_dict, ob_rms = \
state_dict, ob_rms, st_rms, ret_rms = \
    torch.load(os.path.join(args.load_dir + args.algo, args.phase, args.env_name, args.env_name + args.tr_itr + ".pt"),
               map_location='cpu')

actor_critic.load_state_dict(state_dict)
actor_critic.train(False)
actor_critic.eval()  # TODO

print('ob_rms: ', ob_rms)
예제 #21
0
def main():
    print("######")
    print("HELLO! Returns start with infinity values")
    print("######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.random_task:
        env_params = {
            'wt': np.round(np.random.uniform(0.5, 1.0), 2),
            'x': np.round(np.random.uniform(-0.1, 0.1), 2),
            'y': np.round(np.random.uniform(-0.1, 0.1), 2),
            'z': np.round(np.random.uniform(0.15, 0.2), 2),
        }
    else:
        env_params = {
            'wt': args.euclidean_weight,
            'x': args.goal_x,
            'y': args.goal_y,
            'z': args.goal_z,
        }
    envs = [make_env(args.env_name, args.seed, i, args.log_dir, **env_params)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = VecNormalize(envs, ob=False)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    actor_critic.input_norm.update(rollouts.observations[0])

    last_return = -np.inf
    best_return = -np.inf
    best_models = None

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                      Variable(rollouts.states[step], volatile=True),
                                                                      Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)
            actor_critic.input_norm.update(rollouts.observations[step + 1])

        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                           Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                           Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                           Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages,
                                                            args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages,
                                                            args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(observations_batch),
                                                                                                   Variable(states_batch),
                                                                                                   Variable(masks_batch),
                                                                                                   Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if args.vis and j % args.vis_interval == 0:
            last_return = plot(logger, args.log_dir)

        if last_return > best_return:
            best_return = last_return
            try:
                os.makedirs(os.path.dirname(args.save_path))
            except OSError:
                pass

            info = {
                'return': best_return,
                'reward_norm': np.sqrt(envs.ret_rms.var + envs.epsilon)
            }

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            torch.save((save_model, env_params, info), args.save_path)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, average return {:.5f}, best_return {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         last_return, best_return,
                         value_loss.data[0], action_loss.data[0]))
예제 #22
0
파일: main.py 프로젝트: nudles/a2c
def main():
    print("#######")
    print("WARNING: All rewards are not clipped or normalized ")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    envs = rafiki.Envs(args.num_processes, args.num_models, args.policy,
                       args.beta, args.obs_size, args.max_latency, args.tau,
                       args.cycle_len)
    obs_shape = envs.observation_space.shape

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    info_set = Info(args)

    for j in range(num_updates):
        for step in range(args.num_steps):
            logger.info('------------%d----------------' % j)
            # Sample actions
            with torch.no_grad():
                action, probs, action_log_prob = actor_critic.act(
                    Variable(rollouts.observations[step]))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Obser reward and next obs
            logger.info(probs)
            obs, reward, info = envs.step(cpu_actions)
            info_set.insert(info)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data,
                            action_log_prob.data, reward)

        if args.algo in ['a2c', 'ppo']:
            action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            R = rollouts.rewards.detach()

            optimizer.zero_grad()
            policy_loss = -R.reshape(args.num_steps,
                                     args.num_processes).mul(action_log_probs)
            policy_loss = sum(policy_loss) / len(policy_loss)
            policy_loss.backward()

            # nn.utils.clip_grad_norm_(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

        with torch.no_grad():
            action, probs, action_log_prob = actor_critic.act(
                Variable(rollouts.observations[-1]))
        logger.info(probs)

        rollouts.after_update()

        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, reward {}, policy loss {}".
                  format(j, total_num_steps, R.data,
                         policy_loss.reshape(-1).data))

    logger.info(args)
    info_set.show()
예제 #23
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]
                 )  # I guess the obs_shape[0] is channel number

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # args.num_steps should be the length of interactions before each updating/training
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy(
            )  # returns are state value, sampled action, act_log_prob, hidden states

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(
                step, current_obs, states.data, action.data,
                action_log_prob.data, value.data, reward, masks
            )  # so the rollout stores one batch of interaction sequences, each sequence has length of args.num_steps

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))
            # values should be values of observations, states are the hidden states used in rnn module, by pwang8

            values = values.view(
                args.num_steps, args.num_processes,
                1)  # values are estimated current state values
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            # rollouts.returns are current "Action" value calculted following Bellmans' eqaution gamma * State_value(t+1) + reward(t)
            advantages = Variable(
                rollouts.returns[:-1]
            ) - values  # This is also the definition of advantage value (action_value - state_value).
            value_loss = advantages.pow(
                2).mean()  # values are estimated current state_value(t)

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            # If ACKTR is utilized, it is not only a different optimizer is used, they also added some new loss source
            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(
                    values - Variable(sample_values.data)
                ).pow(2).mean(
                )  # don't know what is the difference between this and just randomly sample some noise

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:
                                                                      -1]  # calculating the advantage value of an action
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            # The difference from this ppo optimization to the optimization above is that: it updates params for
            # multiple epochs in ppo optimization. Because of this, it samples from the rollouts storage a minibatch
            # every time to calculate gradient. Sampling is conducted for optimization purpose.
            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))
                    # For the 1st epoch of updating, I guess the action_log_probls is the same as old_action_log_probs_batch
                    # because params of the NN have not been updated at that time. But later, in other updating epochs,
                    # this ratio will generate some error. The old_action_log_probs_batch will not be updated during
                    # these param updating epochs.
                    # action_log_probs is the log prob of that action taken by the agent. So it's one value here, not
                    # log_prob for all actions with certain input observation/state. By pwang8, Dec 31, 2017
                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)
                    # compared to a2c, the major difference for ppo is that action_loss is calculated in controlled way
                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
예제 #24
0
파일: main.py 프로젝트: siqili1230/ppo-plus
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'
    #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    #os.environ['CUDA_VISIBLE_DEVICES'] = "9"
    if args.vis:
        from visdom import Visdom
        viz = Visdom(port=args.port)
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep)
                for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,args.hid_size, args.feat_size,args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    if args.use_cell:
        hs = HistoryCell(obs_shape[0], actor_critic.feat_size, 2*actor_critic.hidden_size, 1)
        ft = FutureCell(obs_shape[0], actor_critic.feat_size, 2 * actor_critic.hidden_size, 1)
    else:
        hs = History(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1)
        ft = Future(obs_shape[0], actor_critic.feat_size, actor_critic.hidden_size, 2, 1)

    if args.cuda:
        actor_critic=actor_critic.cuda()
        hs = hs.cuda()
        ft = ft.cuda()
    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, lr=args.lr,
                               eps=args.eps, alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic, hs,ft,args.clip_param, args.ppo_epoch, args.num_mini_batch,
                         args.value_loss_coef, args.entropy_coef, args.hf_loss_coef,ac_lr=args.lr,hs_lr=args.lr,ft_lr=args.lr,
                                eps=args.eps,
                                max_grad_norm=args.max_grad_norm,
                                num_processes=args.num_processes,
                                num_steps=args.num_steps,
                                use_cell=args.use_cell,
                                lenhs=args.lenhs,lenft=args.lenft,
                                plan=args.plan,
                                ac_intv=args.ac_interval,
                                hs_intv=args.hs_interval,
                                ft_intv=args.ft_interval
                                )
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                               args.entropy_coef, acktr=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size,
                              feat_size=512)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)


    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    rec_x = []
    rec_y = []
    file = open('./rec/' + args.env_name + '_' + args.method_name + '.txt', 'w')

    hs_info = torch.zeros(args.num_processes, 2 * actor_critic.hidden_size).cuda()
    hs_ind = torch.IntTensor(args.num_processes, 1).zero_()

    epinfobuf = deque(maxlen=100)
    start_time = time.time()
    for j in range(num_updates):
        print('begin sample, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                                                time.gmtime(time.time() - start_time))))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                rollouts.feat[step]=actor_critic.get_feat(rollouts.observations[step])

                if args.use_cell:
                    for i in range(args.num_processes):
                        h = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                        c = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                        start_ind = max(hs_ind[i],step+1-args.lenhs)
                        for ind in range(start_ind,step+1):
                            h,c=hs(rollouts.feat[ind,i].unsqueeze(0),h,c)
                        hs_info[i,:]=h.view(1,2*actor_critic.hid_size)
                        del h,c
                        gc.collect()
                else:
                    for i in range(args.num_processes):
                        start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                        hs_info[i,:]=hs(rollouts.feat[start_ind:step+1,i])

                hidden_feat=actor_critic.cat(rollouts.feat[step],hs_info)
                value, action, action_log_prob, states = actor_critic.act(
                        hidden_feat,
                        rollouts.states[step])
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(cpu_actions)
            for info in infos:
                maybeepinfo = info.get('episode')
                if maybeepinfo:
                    epinfobuf.extend([maybeepinfo['r']])
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            hs_ind = ((1-masks)*(step+1)+masks*hs_ind.float()).int()

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(current_obs, hs_ind,states.data, action.data, action_log_prob.data, value.data, reward, masks)
        with torch.no_grad():
            rollouts.feat[-1] = actor_critic.get_feat(rollouts.observations[-1])
            if args.use_cell:
                for i in range(args.num_processes):
                    h = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                    c = torch.zeros(1, 2 * actor_critic.hid_size).cuda()
                    start = max(hs_ind[i], step + 1 - args.lenhs)
                    for ind in range(start, step + 1):
                        h, c = hs(rollouts.feat[ind, i].unsqueeze(0), h, c)
                    hs_info[i, :] = h.view(1, 2 * actor_critic.hid_size)
                    del h,c
            else:
                for i in range(args.num_processes):
                    start_ind = max(hs_ind[i], step + 1 - args.lenhs)
                    hs_info[i, :] = hs(rollouts.feat[start_ind:step + 1, i])
            hidden_feat = actor_critic.cat(rollouts.feat[-1],hs_info)
            next_value = actor_critic.get_value(hidden_feat).detach()
        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        rollouts.compute_ft_ind()

        print('begin update, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                     time.gmtime(time.time() - start_time))))
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        print('end update, time  {}'.format(time.strftime("%Hh %Mm %Ss",
                                                            time.gmtime(time.time() - start_time))))
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                            hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            v_mean,v_median,v_min,v_max = safe(epinfobuf)
            print("Updates {}, num timesteps {},time {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       time.strftime("%Hh %Mm %Ss",
                                     time.gmtime(time.time() - start_time)),
                       int(total_num_steps / (end - start_time)),
                       v_mean, v_median, v_min, v_max,
                       dist_entropy,
                       value_loss, action_loss))

            if not (v_mean==np.nan):
                rec_x.append(total_num_steps)
                rec_y.append(v_mean)
                file.write(str(total_num_steps))
                file.write(' ')
                file.writelines(str(v_mean))
                file.write('\n')

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo, args.num_frames)
            except IOError:
                pass
    plot_line(rec_x, rec_y, './imgs/' + args.env_name + '_' + args.method_name + '.png', args.method_name,
              args.env_name, args.num_frames)
    file.close()
예제 #25
0
def eval_pomme(
        saved_models='train=simple-config=ffa_v0-model=convnet-agent=0.pt'):
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom(server=args.server, port=8097)
        # viz = Visdom(port=args.port)
        win = None

    # Instantiate the environment
    config = getattr(configs, args.config)()

    # We make this in order to get the shapes.
    dummy_env = make_env(args, config, -1,
                         [config['agent'](game_type=config['game_type'])])()
    envs_shape = dummy_env.observation_space.shape[1:]
    obs_shape = (envs_shape[0], *envs_shape[1:])
    action_space = dummy_env.action_space
    if len(envs_shape) == 3:
        if args.model == 'convnet':
            actor_critic = lambda saved_model: PommeCNNPolicySmall(
                obs_shape[0], action_space, args)
        elif args.model == 'resnet':
            actor_critic = lambda saved_model: PommeResnetPolicy(
                obs_shape[0], action_space, args)
    else:
        actor_critic = lambda saved_model: MLPPolicy(obs_shape[0], action_space
                                                     )

    # TODO: this only works for simple - need a list of checkpoints for self-play
    # We need to get the agent = config.agent(agent_id, config.game_type) and then
    # pass that agent into the agent.PPOAgent
    training_agents = []

    # TODO: this is a bit hacky and doesn't work for more than 1 model
    # saved_models = args.saved_models

    save_path = os.path.join(args.save_dir)
    saved_models = [os.path.join(save_path, saved_models)]
    # saved_models = saved_models.split(',') if saved_models else [None]*args.nagents

    assert (len(saved_models)) == args.nagents
    if len(envs_shape) == 3:
        if args.model == 'convnet':
            actor_critic_model = PommeCNNPolicySmall(obs_shape[0],
                                                     action_space, args)
        elif args.model == 'resnet':
            actor_critic_model = PommeResnetPolicy(obs_shape[0], action_space,
                                                   args)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic_model = MLPPolicy(obs_shape[0], action_space)

    print("****")
    for saved_model in saved_models:
        # TODO: implement the model loading.
        loaded_model = torch.load(saved_model)
        print("epoch of model {} is: {}".format(saved_model,
                                                loaded_model['epoch']))
        loaded_actor_critic_model = actor_critic_model.load_state_dict(
            loaded_model['state_dict'])
        model = actor_critic(loaded_actor_critic_model)
        model.eval()
        agent = config['agent'](game_type=config['game_type'])
        agent = ppo_agent.PPOAgent(agent, model)
        training_agents.append(agent)
    print("****")

    if args.how_train == 'simple':
        # Simple trains a single agent against three SimpleAgents.
        assert (
            args.nagents == 1), "Simple training should have a single agent."
        num_training_per_episode = 1
    elif args.how_train == 'homogenous':
        # Homogenous trains a single agent against itself (self-play).
        assert (args.nagents == 1
                ), "Homogenous toraining should have a single agent."
        num_training_per_episode = 4
    elif args.how_train == 'heterogenous':
        assert (args.nagents >
                1), "Heterogenous training should have more than one agent."
        print("Heterogenous training is not implemented yet.")
        return

    # NOTE: Does this work correctly? Will the threads operate independently?
    envs = [
        make_env(args, config, i, training_agents)
        for i in range(args.num_processes)
    ]
    envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs)

    for agent in training_agents:
        agent.initialize(args, obs_shape, action_space,
                         num_training_per_episode)

    current_obs = torch.zeros(num_training_per_episode, args.num_processes,
                              *obs_shape)

    def update_current_obs(obs):
        current_obs = torch.from_numpy(obs).float()

    obs = envs.reset()
    update_current_obs(obs)
    if args.how_train == 'simple':
        training_agents[0].update_rollouts(obs=current_obs, timestep=0)
    elif args.how_train == 'homogenous':
        training_agents[0].update_rollouts(obs=current_obs, timestep=0)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros(
        [num_training_per_episode, args.num_processes, 1])
    final_rewards = torch.zeros(
        [num_training_per_episode, args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        for agent in training_agents:
            agent.cuda()

    start = time.time()
    for j in range(args.num_steps_eval):
        for step in range(args.num_steps):
            value_agents = []
            action_agents = []
            action_log_prob_agents = []
            states_agents = []
            episode_reward = []
            cpu_actions_agents = []

            if args.how_train == 'simple':
                value, action, action_log_prob, states = training_agents[
                    0].act_pytorch(step, 0)
                value_agents.append(value)
                action_agents.append(action)
                action_log_prob_agents.append(action_log_prob)
                states_agents.append(states)
                cpu_actions = action.data.squeeze(1).cpu().numpy()
                cpu_actions_agents = cpu_actions
            elif args.how_train == 'homogenous':
                cpu_actions_agents = [[] for _ in range(args.num_processes)]
                for i in range(4):
                    value, action, action_log_prob, states = training_agents[
                        0].act_pytorch(step, i)
                    value_agents.append(value)
                    action_agents.append(action)
                    action_log_prob_agents.append(action_log_prob)
                    states_agents.append(states)
                    cpu_actions = action.data.squeeze(1).cpu().numpy()
                    for num_process in range(args.num_processes):
                        cpu_actions_agents[num_process].append(
                            cpu_actions[num_process])

            obs, reward, done, info = envs.step(cpu_actions_agents)
            reward = torch.from_numpy(np.stack(reward)).float().transpose(0, 1)
            episode_rewards += reward

            if args.how_train == 'simple':
                masks = torch.FloatTensor(
                    [[0.0] * num_training_per_episode if done_ else [1.0] *
                     num_training_per_episode for done_ in done])
            elif args.how_train == 'homogenous':
                masks = torch.FloatTensor(
                    [[0.0] * num_training_per_episode if done_ else [1.0] *
                     num_training_per_episode
                     for done_ in done]).transpose(0, 1)

            masks = torch.FloatTensor(
                [[0.0] * num_training_per_episode if done_ else [1.0] *
                 num_training_per_episode for done_ in done]).transpose(0, 1)

            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()

            reward_all = reward.unsqueeze(2)

            masks_all = masks.unsqueeze(2)

            if args.how_train == 'simple':
                masks_all = masks.transpose(0, 1).unsqueeze(2)
            elif args.how_train == 'homogenous':
                masks_all = masks.unsqueeze(2)

            current_obs *= masks_all.unsqueeze(2).unsqueeze(2)
            update_current_obs(obs)

            states_all = torch.from_numpy(
                np.stack([x.data for x in states_agents])).float()
            action_all = torch.from_numpy(
                np.stack([x.data for x in action_agents])).float()
            action_log_prob_all = torch.from_numpy(
                np.stack([x.data for x in action_log_prob_agents])).float()
            value_all = torch.from_numpy(
                np.stack([x.data for x in value_agents])).float()

            if args.how_train in ['simple', 'homogenous']:
                training_agents[0].insert_rollouts(step, current_obs,
                                                   states_all, action_all,
                                                   action_log_prob_all,
                                                   value_all, reward_all,
                                                   masks_all)

        if step % args.log_interval == 0:
            print("step ", step)
            end = time.time()
            total_num_steps = (step +
                               1) * args.num_processes * args.num_steps_eval
            final_rewards_tr = torch.zeros(
                [args.num_processes, args.nagents, 1])
            final_rewards_tr.copy_(final_rewards)
            final_rewards_tr = final_rewards_tr.view(args.num_processes,
                                                     args.nagents).transpose(
                                                         0, 1)
            for i in range(args.nagents):
                print("agent # ", i)
                print(
                    "Updates {}, Agent {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(step, i, total_num_steps,
                            int(total_num_steps / (end - start)),
                            final_rewards_tr[i].mean(),
                            final_rewards_tr[i].median(),
                            final_rewards_tr[i].min(),
                            final_rewards_tr[i].max()), "\n")
            print("\n")

        if args.vis and step % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name)
            except IOError:
                pass
예제 #26
0
    str(args.seed))
if os.path.exists(log_path):
    shutil.rmtree(log_path)  # clean directory each time
os.makedirs(log_path)

# Logging settings
log_str_set = '# Settings: [T={:d}, Num_iter_algo={:d}, Num_iter_policy={:d}, lr_dynamics={:.4f}, lr_policy={:.4f}, c_sigma={:.2f}]\n'
print(
    log_str_set.format(args.T, args.num_iter_algo, args.num_iter_policy,
                       args.lr_dynamics, args.lr_policy, args.c_sigma))

# Create policy and its optimizer
if args.policy_type == 'LinearPolicy':
    policy = LinearPolicy(env).cuda()
elif args.policy_type == 'MLPPolicy':
    policy = MLPPolicy(env, hidden_size=args.hidden_size).cuda()
else:
    raise TypeError('Policy type must be either LinearPolicy or MLPPolicy')

# Initialize policy parameters to ensure small values
for param in policy.parameters():
    nn.init.normal(param, mean=0, std=1e-5)

policy_optimizer = optim.Adam(policy.parameters(),
                              lr=args.lr_policy)  # 1e-2, RMSprop

# Create dynamics and its optimizer
dynamics = DynamicsModel(env, hidden_size=200, drop_prob=args.drop_p).cuda()
dynamics_optimizer = optim.Adam(dynamics.parameters(),
                                lr=args.lr_dynamics,
                                weight_decay=1e-4)
예제 #27
0
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = []
        win_dic ={}
        for i in range(len(mt_env_id_dic_selected)):
            win += [None]
        win_afs_per_m = None
        win_afs_loss = None
        win_basic_loss = None
    
    plot_dic = {}
    envs = []
    ''' Because the oral program has only one game per model, so Song add loop i
        So whatever you wanna run , just put in SubprocVecEnvMt!
    '''
    for i in range(len(mt_env_id_dic_selected)):
        log_dir = args.log_dir+mt_env_id_dic_selected[i]+'/'
        for j in range(args.num_processes):
            envs += [make_env(mt_env_id_dic_selected[i], args.seed, j, log_dir)]
    ''' This envs is an intergration of all the running env'''
    envs = SubprocVecEnvMt(envs)

    num_processes_total = args.num_processes * len(mt_env_id_dic_selected)
    '''(1,128,128)'''
    obs_shape = envs.observation_space.shape
    #num_stack :number of frames to stack
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    from arguments import is_restore
    if is_restore and args.save_dir:
        load_path = os.path.join(args.save_dir, args.algo)
        actor_critic =torch.load(os.path.join(load_path, args.env_name + ".pt"))
        # print ("restored previous model!")
        # print (actor_critic.Variable)
        # print (sss)
    else:
        if len(envs.observation_space.shape) == 3:
            actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    
    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)
    #'args.num_steps: number of forward steps in A2C
    #rollouts is an intergration of state\ reward\ next state\action and so on
    rollouts = RolloutStorage(args.num_steps, num_processes_total, obs_shape, envs.action_space)
    current_state = torch.zeros(num_processes_total, *obs_shape)
    ''' not sure about it'''
    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        # print (shape_dim0)
        # print (sss)
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([num_processes_total, 1])
    final_rewards = torch.zeros([num_processes_total, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    if args.algo == 'ppo':
        old_model = copy.deepcopy(actor_critic)

    from arguments import ewc, ewc_lambda, ewc_interval

    afs_per_m = []
    afs_offset = [0.0]*gtn_M

    afs_loss_list = []
    basic_loss_list = []
    episode_reward_rec = 0.0
    one = torch.FloatTensor([1]).cuda()
    mone = one * -1
    '''for one whole game '''
    for j in range(num_updates):
        for step in range(args.num_steps):
            if ewc == 1:
                try:
                    states_store = torch.cat([states_store, rollouts.states[step].clone()], 0)
                except Exception as e:
                    states_store = rollouts.states[step].clone()
            # Sample actions
            '''act fun refer to "observe it!"'''
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next state
            state, reward, done = envs.step(cpu_actions)
            '''record the last 100 episodes rewards'''
            episode_reward_rec += reward
            episode_reward_rec = rec_last_100_epi_reward(episode_reward_rec,done)
            
            
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            '''reward is shape of process_num_total, not batch-size'''
            # print ((reward).size())
            # print (done)
            # print (sss)
            episode_rewards += reward
            ################
            # rec_last_100_epi_reward(reward,done)
            
            # episode_reward_ppo += reward[0]
            # If done then clean the history of observations. final_rewards is used for compute after one whole num_step
            
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            # reset gradient
            optimizer.zero_grad()

            # forward
            values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape)))
            # pre-process
            values = values.view(args.num_steps, num_processes_total, 1)
            action_log_probs = action_log_probs.view(args.num_steps, num_processes_total, 1)

            # compute afs loss
            afs_per_m_temp, afs_loss = actor_critic.get_afs_per_m(
                action_log_probs=action_log_probs,
                conv_list=conv_list,
            )
            if len(afs_per_m_temp)>0:
                afs_per_m += [afs_per_m_temp]

            if (afs_loss is not None) and (afs_loss.data.cpu().numpy()[0]!=0.0):
                afs_loss.backward(mone, retain_graph=True)
                afs_loss_list += [afs_loss.data.cpu().numpy()[0]]

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()
            final_loss_basic = value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef

            ewc_loss = None
            if j != 0:
                if ewc == 1:
                    ewc_loss = actor_critic.get_ewc_loss(lam=ewc_lambda)
            
            if ewc_loss is None:
                final_loss = final_loss_basic
            else:
                final_loss = final_loss_basic + ewc_loss
            # print (final_loss_basic.data.cpu().numpy()[0])
            # final_loss_basic
            basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]]
            final_loss.backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            old_model.load_state_dict(actor_critic.state_dict())
            if hasattr(actor_critic, 'obs_filter'):
                old_model.obs_filter = actor_critic.obs_filter

            for _ in range(args.ppo_epoch):
                sampler = BatchSampler(SubsetRandomSampler(range(num_processes_total * args.num_steps)), args.batch_size * num_processes_total, drop_last=False)
                for indices in sampler:
                    indices = torch.LongTensor(indices)
                    if args.cuda:
                        indices = indices.cuda()
                    states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                    actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                    return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, conv_list = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))

                    _, old_action_log_probs, _, old_conv_list= old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                    adv_targ = Variable(advantages.view(-1, 1)[indices])
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    final_loss_basic = (value_loss + action_loss - dist_entropy * args.entropy_coef)
                    
                    basic_loss_list += [final_loss_basic.data.cpu().numpy()[0]]
                    final_loss_basic.backward()
                    optimizer.step()

        rollouts.states[0].copy_(rollouts.states[-1])

        # if j % int(num_updates/2-10) == 0 and args.save_dir != "":
        if j % args.save_interval == 0 and args.save_dir != "":
         
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))
            import pickle
            with open(os.path.join(save_path, args.env_name + "_last_100_reward"), "wb") as f:
                pickle.dump(reward_dict, f)



        if j % args.log_interval == 0:
            print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, (j + 1) * args.num_processes * args.num_steps,
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))

            try:
                print("ewc loss {:.5f}".
                format(ewc_loss.data.cpu().numpy()[0]))
            except Exception as e:
                pass
            

        if j > 5 and j % args.vis_interval == 0 and args.vis:
            ''' load from the folder'''
            for ii in range(len(mt_env_id_dic_selected)):
                log_dir = args.log_dir+mt_env_id_dic_selected[ii]+'/'
                win[ii] = visdom_plot(viz, win[ii], log_dir, mt_env_id_dic_selected[ii], args.algo)

            plot_dic = reward_dict
            for plot_name in plot_dic.keys():
                # if plot_name not in win_dic:
                # win_dic[plot_name] = None
                if plot_name in win_dic.keys():
                    if len(plot_dic[plot_name]) > 0:
                        win_dic[plot_name] = viz.line(
                            torch.from_numpy(np.asarray(plot_dic[plot_name])), 
                            win=win_dic[plot_name],
                            opts=dict(title=break_line_html(exp+'>>'+plot_name))
                        )
                    

                else:
                    win_dic[plot_name] = None
            
            if len(afs_per_m)>0:
                win_afs_per_m = viz.line(
                    torch.from_numpy(np.asarray(afs_per_m)), 
                    win=win_afs_per_m,
                    opts=dict(title=title_html+'>>afs')
                )

            # print (basic_loss_list)
            '''a2c:len(basic_loss_list) is vis_interval+1. because j start from 0
               ppo:len(basic_loss_list) is (vis_interval+1)*ppo_epoch_4*len(BatchSampler)
            '''
            
            # print (len(basic_loss_list))
            # print (ss)
            win_basic_loss = viz.line(
                torch.from_numpy(np.asarray(basic_loss_list)), 
                win=win_basic_loss,
                opts=dict(title=title_html+'>>basic_loss')
            )

            if len(afs_loss_list) > 0:
                win_afs_loss = viz.line(
                    torch.from_numpy(np.asarray(afs_loss_list)), 
                    win=win_afs_loss,
                    opts=dict(title=title_html+'>>afs_loss')
                )

        from arguments import parameter_noise, parameter_noise_interval
        if parameter_noise == 1:
            if j % parameter_noise_interval == 0:
                actor_critic.parameter_noise()

        if ewc == 1:
            if j % ewc_interval == 0 or j==0:
                actor_critic.compute_fisher(states_store)
                states_store = None
                actor_critic.star()
예제 #28
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
예제 #29
0
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if len(envs.observation_space.shape) == 3:
        actor_critic = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        target_actor = Actor(obs_shape[0], envs.action_space,
                             args.recurrent_policy, envs.action_space.n)
        critic = Critic(in_channels=4, num_actions=envs.action_space.n)
        critic_target = Critic(in_channels=4, num_actions=envs.action_space.n)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if args.cuda:
        actor_critic.cuda()
        critic.cuda()
        critic_target.cuda()
        target_actor.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
        critic_optim = optim.Adam(critic.parameters(), lr=1e-4)
        gamma = 0.99
        tau = 0.001

    #memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    mem_buffer = ReplayBuffer()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size,
                              envs.action_space.n)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            value = critic.forward(
                Variable(rollouts.observations[step], volatile=True),
                action_log_prob)
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            pre_state = rollouts.observations[step].cpu().numpy()
            update_current_obs(obs)
            mem_buffer.add((pre_state, current_obs,
                            action_log_prob.data.cpu().numpy(), reward, done))
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        action, action_log_prob, states = actor_critic.act(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))  #[0].data

        next_value = critic.forward(
            Variable(rollouts.observations[-1], volatile=True),
            action_log_prob).data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        if True:
            state, next_state, action, reward, done = mem_buffer.sample(5)
            next_state = next_state.reshape([-1, *obs_shape])
            state = state.reshape([-1, *obs_shape])
            action = action.reshape([-1, 6])
            next_q_values = critic_target(
                to_tensor(next_state, volatile=True),
                target_actor(to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True),
                             to_tensor(next_state, volatile=True))[0])
            next_q_values.volatile = False
            target_q_batch = to_tensor(reward) + args.gamma * to_tensor(
                done.astype(np.float)) * next_q_values
            critic.zero_grad()
            q_batch = critic(to_tensor(state), to_tensor(action))
            value_loss = criterion(q_batch, target_q_batch)
            value_loss.backward()
            critic_optim.step()
            actor_critic.zero_grad()
            policy_loss = -critic(
                to_tensor(state),
                actor_critic(to_tensor(state), to_tensor(state),
                             to_tensor(state))[0])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)
            optimizer.step()
            soft_update(target_actor, actor_critic, tau)
            soft_update(critic_target, critic, tau)
        '''
        if args.algo in ['a2c', 'acktr']:
            action_log_probs, probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                           Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                           Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                           Variable(rollouts.actions.view(-1, action_shape)))
            values = critic.forward(Variable(rollouts.observations[:-1].view(-1, *obs_shape)), probs).data

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            #advantages = Variable(rollouts.returns[:-1]) - values
            advantages = rollouts.returns[:-1] - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages) * action_log_probs).mean()
            #action_loss = -(Variable(advantages.data) * action_log_probs).mean()


            optimizer.zero_grad()
            critic_optim.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
            critic_optim.step()
        '''
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        value_loss.data.cpu().numpy()[0],
                        policy_loss.data.cpu().numpy()[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass
예제 #30
0
def main():
    print("#######")
    print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'





    print (args.cuda)
    print (args.num_steps)
    print (args.num_processes)
    print (args.lr)
    print (args.eps)
    print (args.alpha)
    print (args.use_gae)
    print (args.gamma)
    print (args.tau)
    print (args.value_loss_coef)
    print (args.entropy_coef)
    # fsdaf





    # Create environment
    envs = SubprocVecEnv([
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ])
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if len(envs.observation_space.shape) == 3:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]
    # action_shape = action_shape


    # shape_dim0 = envs.observation_space.shape[0]

    # if args.cuda:
    #     dtype = torch.cuda.FloatTensor
    # else:
    #     dtype = torch.FloatTensor

    hparams = {'cuda':args.cuda,
                'num_steps':args.num_steps,
                'num_processes':args.num_processes, 
                'obs_shape':obs_shape,
                'lr':args.lr,
                'eps':args.eps, 
                'alpha':args.alpha,
                'use_gae':args.use_gae, 
                'gamma':args.gamma, 
                'tau':args.tau,
                'value_loss_coef':args.value_loss_coef, 
                'entropy_coef':args.entropy_coef}




    # Create agent
    # agent = a2c(envs, hparams)


    # rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
    #it has a self.state that is [steps, processes, obs]
    #steps is used to compute expected reward

    if args.cuda:
        actor_critic.cuda()
        # rollouts.cuda()
    optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])







    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)



    # Init state

    current_state = torch.zeros(args.num_processes, *obs_shape)#.type(dtype)
    def update_current_state(state):#, shape_dim0):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state
        # return current_state


    state = envs.reset()

    update_current_state(state)#, shape_dim0) 
    # agent.insert_first_state(current_state)
    rollouts.states[0].copy_(current_state)
        #set the first state to current state


    # These are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()#type(dtype)
        # if args.cuda:
        rollouts.cuda()
    #Begin training
    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):

            # Act
            # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True))
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions) # state:[nProcesss, ndims, height, width]

            # Record rewards
            # reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward
            # If done then clean the history of observations.
            # these final rewards are only used for printing. but the mask is used in the storage, dont know why yet
            # oh its just clearing the env that finished, and resetting its episode_reward
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            if args.cuda:
                masks = masks.cuda()
            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks
            # return reward, masks, final_rewards, episode_rewards, current_state




            # Update state
            update_current_state(state)#, shape_dim0)

            # Agent record step
            # agent.insert_data(step, current_state, action.data, value.data, reward, masks)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)



        #Optimize agent
        # agent.update()
        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data
        # use last state to make prediction of next value



        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))
        #not sure what this is




        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        # this computes R =  r + r+ ...+ V(t)  for each step



        values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                                                    Variable(rollouts.states[:-1].view(-1, *obs_shape)), 
                                                    Variable(rollouts.actions.view(-1, action_shape)))
        # I think this aciton log prob could have been computed and stored earlier 
        # and didnt we already store the value prediction???

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

        optimizer.step()




        rollouts.states[0].copy_(rollouts.states[-1])
        # the first state is now the last state of the previous 









        # #Save model
        # if j % args.save_interval == 0 and args.save_dir != "":
        #     save_path = os.path.join(args.save_dir, args.algo)
        #     try:
        #         os.makedirs(save_path)
        #     except OSError:
        #         pass

        #     # A really ugly way to save a model to CPU
        #     save_model = actor_critic
        #     if args.cuda:
        #         save_model = copy.deepcopy(actor_critic).cpu()
        #     torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        #Print updates
        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            # print("Updates {}, n_timesteps {}, FPS {}, mean/median R {:.1f}/{:.1f}, min/max R {:.1f}/{:.1f}, T:{:.4f}".#, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            #     format(j, total_num_steps,
            #            int(total_num_steps / (end - start)),
            #            final_rewards.mean(),
            #            final_rewards.median(),
            #            final_rewards.min(),
            #            final_rewards.max(),
            #            end - start))#, -dist_entropy.data[0],
            #            # value_loss.data[0], action_loss.data[0]))

            # print("Upts {}, n_timesteps {}, min/med/mean/max {:.1f}/{:.1f}/{:.1f}/{:.1f}, FPS {}, T:{:.1f}".
            #     format(j, total_num_steps,
            #            final_rewards.min(),
            #            final_rewards.median(),
            #            final_rewards.mean(),
            #            final_rewards.max(),
            #            int(total_num_steps / (end - start)),
            #            end - start))

            if j % (args.log_interval*30) == 0:
                print("Upts, n_timesteps, min/med/mean/max, FPS, Time")

            print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}".
                    format(j, total_num_steps,
                           final_rewards.min(),
                           final_rewards.median(),
                           final_rewards.mean(),
                           final_rewards.max(),
                           int(total_num_steps / (end - start)),
                           end - start))
예제 #31
0
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir,
                 args.start_container) for i in range(args.num_processes)
    ]

    test_envs = [
        make_env(args.env_name, args.seed, i, args.log_dir,
                 args.start_container) for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
        test_envs = SubprocVecEnv(test_envs)
    else:
        envs = DummyVecEnv(envs)
        test_envs = DummyVecEnv(test_envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    if args.saved_encoder_model:
        obs_shape = (args.num_stack, args.latent_space_size)

    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space,
                                 args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.resume_experiment:
        print("\n############## Loading saved model ##############\n")
        actor_critic, ob_rms = torch.load(
            os.path.join(save_path, args.env_name + args.save_tag + ".pt"))
        tr.load(os.path.join(log_path, args.env_name + args.save_tag + ".p"))

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    print(obs_shape)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    rollouts_test = RolloutStorage(args.num_steps_test, args.num_processes,
                                   obs_shape, envs.action_space,
                                   actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)
    current_obs_test = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs, test=False):
        shape_dim0 = envs.observation_space.shape[0]
        if args.saved_encoder_model:
            shape_dim0 = 1
            obs, _ = vae.encode(Variable(torch.cuda.FloatTensor(obs)))
            obs = obs.data.cpu().numpy()
        obs = torch.from_numpy(obs).float()
        if not test:
            if args.num_stack > 1:
                current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
            current_obs[:, -shape_dim0:] = obs
        else:
            if args.num_stack > 1:
                current_obs_test[:, :
                                 -shape_dim0] = current_obs_test[:,
                                                                 shape_dim0:]
            current_obs_test[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    reward_avg = 0

    if args.cuda:
        current_obs = current_obs.cuda()
        current_obs_test = current_obs_test.cuda()
        rollouts.cuda()
        rollouts_test.cuda()

    start = time.time()

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observation, reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # Maxime: clip the reward within [0,1] for more reliable training
            # This code deals poorly with large reward values
            reward = np.clip(reward, a_min=0, a_max=None) / 400

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            tr.episodes_done += args.num_processes - masks.sum()

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)
        tr.iterations_done += 1

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps,
                                                     args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) *
                            action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values -
                                   Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(),
                                        args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() +
                                                             1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                       return_batch, masks_batch, old_action_log_probs_batch, \
                            adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch), Variable(states_batch),
                        Variable(masks_batch), Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs -
                                      Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(
                        surr1,
                        surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) -
                                  values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss -
                     dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(),
                                            args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]

            torch.save(
                save_model,
                os.path.join(save_path, args.env_name + args.save_tag + ".pt"))

            total_test_reward_list = []
            step_test_list = []

            for _ in range(args.num_tests):
                test_obs = test_envs.reset()
                update_current_obs(test_obs, test=True)
                rollouts_test.observations[0].copy_(current_obs_test)
                step_test = 0
                total_test_reward = 0

                while step_test < args.num_steps_test:
                    value_test, action_test, action_log_prob_test, states_test = actor_critic.act(
                        Variable(rollouts_test.observations[step_test],
                                 volatile=True),
                        Variable(rollouts_test.states[step_test],
                                 volatile=True),
                        Variable(rollouts_test.masks[step_test],
                                 volatile=True))
                    cpu_actions_test = action_test.data.squeeze(
                        1).cpu().numpy()

                    # Observation, reward and next obs
                    obs_test, reward_test, done_test, info_test = test_envs.step(
                        cpu_actions_test)

                    # masks here doesn't really matter, but still
                    masks_test = torch.FloatTensor(
                        [[0.0] if done_test_ else [1.0]
                         for done_test_ in done_test])

                    # Maxime: clip the reward within [0,1] for more reliable training
                    # This code deals poorly with large reward values
                    reward_test = np.clip(reward_test, a_min=0,
                                          a_max=None) / 400

                    total_test_reward += reward_test[0]
                    reward_test = torch.from_numpy(
                        np.expand_dims(np.stack(reward_test), 1)).float()

                    update_current_obs(obs_test)
                    rollouts_test.insert(step_test, current_obs_test, states_test.data, action_test.data, action_log_prob_test.data,\
                     value_test.data, reward_test, masks_test)

                    step_test += 1

                    if done_test:
                        break

                #rollouts_test.reset() # Need to reinitialise with .cuda(); don't forget
                total_test_reward_list.append(total_test_reward)
                step_test_list.append(step_test)

            append_to(tr.test_reward, tr,
                      sum(total_test_reward_list) / args.num_tests)
            append_to(tr.test_episode_len, tr,
                      sum(step_test_list) / args.num_tests)

            logger.log_scalar_rl(
                "test_reward", tr.test_reward[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "test_episode_len", tr.test_episode_len[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])

            # Saving all the MyContainer variables
            tr.save(
                os.path.join(log_path, args.env_name + args.save_tag + ".p"))

        if j % args.log_interval == 0:
            reward_avg = 0.99 * reward_avg + 0.01 * final_rewards.mean()
            end = time.time()
            tr.global_steps_done = (j +
                                    1) * args.num_processes * args.num_steps

            print(
                "Updates {}, num timesteps {}, FPS {}, running avg reward {:.3f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, tr.global_steps_done,
                        int(tr.global_steps_done / (end - start)), reward_avg,
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))

            append_to(tr.pg_loss, tr, action_loss.data[0])
            append_to(tr.val_loss, tr, value_loss.data[0])
            append_to(tr.entropy_loss, tr, dist_entropy.data[0])
            append_to(tr.train_reward_avg, tr, reward_avg)

            logger.log_scalar_rl(
                "train_pg_loss", tr.pg_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_val_loss", tr.val_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_entropy_loss", tr.entropy_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_reward_avg", tr.train_reward_avg[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            """
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(
                    j,
                    total_num_steps,
                    int(total_num_steps / (end - start)),
                    final_rewards.mean(),
                    final_rewards.median(),
                    final_rewards.min(),
                    final_rewards.max(), dist_entropy.data[0],
                    value_loss.data[0], action_loss.data[0])
                )
            """

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
            except IOError:
                pass