Example #1
    def __init__(self, envs, hparams):

        self.use_gae = hparams['use_gae']
        self.gamma = hparams['gamma']
        self.tau = hparams['tau']

        self.obs_shape = hparams['obs_shape']
        self.num_steps = hparams['num_steps']
        self.num_processes = hparams['num_processes']
        self.value_loss_coef = hparams['value_loss_coef']
        self.entropy_coef = hparams['entropy_coef']
        self.cuda = hparams['cuda']
        self.opt = hparams['opt']
        self.grad_clip = hparams['grad_clip']



        if hparams['dropout']:
            print('CNNPolicy_dropout2')
            actor_critic = CNNPolicy_dropout2(self.obs_shape[0], envs.action_space)
            # actor_critic = CNNPolicy_dropout(self.obs_shape[0], envs.action_space)
        elif len(envs.observation_space.shape) == 3:
            print('CNNPolicy2')
            actor_critic = CNNPolicy2(self.obs_shape[0], envs.action_space)
            # actor_critic = CNNPolicy(self.obs_shape[0], envs.action_space)
        else:
            actor_critic = MLPPolicy(self.obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]
        self.action_shape = action_shape

        rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space)
        #it has a self.state that is [steps, processes, obs]
        #steps is used to compute expected reward

        if self.cuda:
            actor_critic.cuda()
            rollouts.cuda()

        if self.opt == 'rms':
            self.optimizer = optim.RMSprop(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha'])
        elif self.opt == 'adam':
            self.optimizer = optim.Adam(params=actor_critic.parameters(), lr=hparams['lr'], eps=hparams['eps'])
        elif self.opt == 'sgd':
            self.optimizer = optim.SGD(params=actor_critic.parameters(), lr=hparams['lr'], momentum=hparams['mom'])
        else:
            print('no opt specified')

        self.actor_critic = actor_critic
        self.rollouts = rollouts


        self.rollouts_list = RolloutStorage_list()
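
The constructor reads every hyperparameter it needs from the hparams dict. Below is a minimal sketch of such a dict; the keys come from the constructor above, but the values are illustrative only and the agent class name is not shown in this listing (A2CAgent here is hypothetical).

# Hypothetical hparams dict; keys taken from the constructor above, values are illustrative.
hparams = {
    'use_gae': True, 'gamma': 0.99, 'tau': 0.95,
    'obs_shape': (4, 84, 84),          # stacked-frame observation shape
    'num_steps': 5, 'num_processes': 16,
    'value_loss_coef': 0.5, 'entropy_coef': 0.01,
    'cuda': False, 'opt': 'rms', 'grad_clip': 0.5,
    'dropout': False,
    'lr': 7e-4, 'eps': 1e-5, 'alpha': 0.99, 'mom': 0.9,
}
# agent = A2CAgent(envs, hparams)      # hypothetical class name, not part of the listing
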
Example #3
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = UsbCamEnv(ENV_IMG_W, ENV_IMG_H, env_done_reward)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    action_shape = envs.action_space.shape[0]

    print('+++++++++++++++++++++++++++++++++++++')
    print('obs_shape:', obs_shape)
    print('action_shape:', action_shape)
    print('+++++++++++++++++++++++++++++++++++++')

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    old_model = copy.deepcopy(actor_critic)

    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)

            real_action = envs.convert_2_real_action(cpu_actions)
            print('%3d  [%3d  %3d  %3d  %3d]  %3d' % (step,
                                                      int(real_action[0, 0]),
                                                      int(real_action[0, 1]),
                                                      int(real_action[0, 2]),
                                                      int(real_action[0, 3]),
                                                      reward[0]))

            if reward[0] >= search_done_reward:
                sys.exit()

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        old_model.load_state_dict(actor_critic.state_dict())
        if hasattr(actor_critic, 'obs_filter'):
            old_model.obs_filter = actor_critic.obs_filter

        for _ in range(args.ppo_epoch):
            sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False)
            for indices in sampler:
                indices = torch.LongTensor(indices)
                if args.cuda:
                    indices = indices.cuda()
                states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))

                _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

                ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                adv_targ = Variable(advantages.view(-1, 1)[indices])
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                value_loss = (Variable(return_batch) - values).pow(2).mean()

                optimizer.zero_grad()
                (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                optimizer.step()

        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, j * args.num_processes * args.num_steps,
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
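
The inner loop of this example is PPO's clipped surrogate objective (the surr1/surr2 lines). A self-contained sketch of just that computation on dummy tensors, assuming a current PyTorch build (plain tensors instead of Variable/volatile):

import torch

# Dummy per-sample log-probabilities and advantages, purely for illustration.
action_log_probs = torch.randn(32, 1, requires_grad=True)
old_action_log_probs = (action_log_probs + 0.05 * torch.randn(32, 1)).detach()
adv_targ = torch.randn(32, 1)
clip_param = 0.2                                               # args.clip_param in the example

ratio = torch.exp(action_log_probs - old_action_log_probs)     # pi_new(a|s) / pi_old(a|s)
surr1 = ratio * adv_targ                                       # unclipped surrogate
surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ
action_loss = -torch.min(surr1, surr2).mean()                  # PPO's pessimistic L^CLIP
action_loss.backward()
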
Example #4
File: main.py  Project: nudles/a2c
def main():
    print("#######")
    print("WARNING: All rewards are not clipped or normalized ")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    envs = rafiki.Envs(args.num_processes, args.num_models, args.policy,
                       args.beta, args.obs_size, args.max_latency, args.tau,
                       args.cycle_len)
    obs_shape = envs.observation_space.shape

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(),
                                  args.lr,
                                  eps=args.eps,
                                  alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(),
                               args.lr,
                               eps=args.eps)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)
    rollouts.observations[0].copy_(current_obs)

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    info_set = Info(args)

    for j in range(num_updates):
        for step in range(args.num_steps):
            logger.info('------------%d----------------' % j)
            # Sample actions
            with torch.no_grad():
                action, probs, action_log_prob = actor_critic.act(
                    Variable(rollouts.observations[step]))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            # Observe reward and next obs
            logger.info(probs)
            obs, reward, info = envs.step(cpu_actions)
            info_set.insert(info)

            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()

            update_current_obs(obs)
            rollouts.insert(step, current_obs, action.data,
                            action_log_prob.data, reward)

        if args.algo in ['a2c', 'ppo']:
            action_log_probs, dist_entropy = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.actions.view(-1, action_shape)))

            R = rollouts.rewards.detach()

            optimizer.zero_grad()
            policy_loss = -R.reshape(args.num_steps,
                                     args.num_processes).mul(action_log_probs)
            policy_loss = sum(policy_loss) / len(policy_loss)
            policy_loss.backward()

            # nn.utils.clip_grad_norm_(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

        with torch.no_grad():
            action, probs, action_log_prob = actor_critic.act(
                Variable(rollouts.observations[-1]))
        logger.info(probs)

        rollouts.after_update()

        if j % args.log_interval == 0:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, reward {}, policy loss {}".
                  format(j, total_num_steps, R.data,
                         policy_loss.reshape(-1).data))

    logger.info(args)
    info_set.show()
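
The update in this example is a plain score-function (REINFORCE-style) loss: detached rewards weight the action log-probabilities, with no baseline or bootstrapped return. A minimal sketch on dummy tensors; .mean() is used here to obtain a scalar, whereas the example averages with sum(...)/len(...):

import torch

num_steps, num_processes = 5, 4
rewards = torch.rand(num_steps, num_processes)                    # stands in for rollouts.rewards
action_log_probs = torch.randn(num_steps, num_processes, requires_grad=True)

R = rewards.detach()                                              # rewards treated as constants
policy_loss = (-R * action_log_probs).mean()                      # scalar policy-gradient loss
policy_loss.backward()                                            # gradients flow only into the policy
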
Example #5
def main():
    print("#######")
    print(
        "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards"
    )
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        viz_1 = Visdom()
        win = None
        win1 = None

    env_name_1 = 'HalfCheetahSmallFoot-v0'
    args.env_name = 'HalfCheetahSmallLeg-v0'

    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    envs_1 = [
        make_env(env_name_1, args.seed, i, args.log_dir_1)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
        envs_1 = SubprocVecEnv(envs_1)
    else:
        envs = DummyVecEnv(envs)
        envs_1 = DummyVecEnv(envs_1)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)
        envs_1 = VecNormalize(envs_1)

    #same for both tasks
    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    actor_critic_1 = MLPPolicy(obs_shape[0], envs_1.action_space)

    #same for both tasks
    action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()
        actor_critic_1.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(),
                              args.lr,
                              eps=args.eps,
                              alpha=args.alpha)
    optimizer_1 = optim.RMSprop(actor_critic_1.parameters(),
                                args.lr,
                                eps=args.eps,
                                alpha=args.alpha)

    #Different for both tasks
    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    rollouts_1 = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                                envs_1.action_space, actor_critic_1.state_size)
    current_obs_1 = torch.zeros(args.num_processes, *obs_shape)

    #Different update functions
    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    def update_current_obs_1(obs):
        shape_dim0 = envs_1.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs_1[:, :-shape_dim0] = current_obs_1[:, shape_dim0:]
        current_obs_1[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    obs_1 = envs_1.reset()
    update_current_obs_1(obs_1)

    rollouts.observations[0].copy_(current_obs)
    rollouts_1.observations[0].copy_(current_obs_1)

    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    episode_rewards_1 = torch.zeros([args.num_processes, 1])
    final_rewards_1 = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()
        current_obs_1 = current_obs_1.cuda()
        rollouts_1.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions from branch 1
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))

            cpu_actions = action.data.squeeze(1).cpu().numpy()

            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward

            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

            #Sample actions from branch 2
            value_1, action_1, action_log_prob_1, states_1 = actor_critic_1.act(
                Variable(rollouts_1.observations[step], volatile=True),
                Variable(rollouts_1.states[step], volatile=True),
                Variable(rollouts_1.masks[step], volatile=True))

            cpu_actions_1 = action_1.data.squeeze(1).cpu().numpy()
            obs_1, reward_1, done_1, info_1 = envs_1.step(cpu_actions_1)
            reward_1 = torch.from_numpy(np.expand_dims(np.stack(reward_1),
                                                       1)).float()
            episode_rewards_1 += reward_1

            masks_1 = torch.FloatTensor([[0.0] if done_ else [1.0]
                                         for done_ in done_1])
            final_rewards_1 *= masks_1
            final_rewards_1 += (1 - masks_1) * episode_rewards_1
            episode_rewards_1 *= masks_1

            if args.cuda:
                masks_1 = masks_1.cuda()

            if current_obs_1.dim() == 4:
                current_obs_1 *= masks_1.unsqueeze(2).unsqueeze(2)
            else:
                current_obs_1 *= masks_1

            update_current_obs_1(obs_1)
            rollouts_1.insert(step, current_obs_1, states_1.data,
                              action_1.data, action_log_prob_1.data,
                              value_1.data, reward_1, masks_1)

        #Update for branch 1
        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.tau)

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
            Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
            Variable(rollouts.masks[:-1].view(-1, 1)),
            Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps,
                                                 args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss -
         dist_entropy * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
        optimizer.step()
        rollouts.after_update()

        #share params branch 1 -> branch 2
        actor_critic_1.a_fc1.weight.data = copy.deepcopy(
            actor_critic.a_fc1.weight.data)
        actor_critic_1.a_fc1.bias.data = copy.deepcopy(
            actor_critic.a_fc1.bias.data)
        actor_critic_1.v_fc1.weight.data = copy.deepcopy(
            actor_critic.v_fc1.weight.data)
        actor_critic_1.v_fc1.bias.data = copy.deepcopy(
            actor_critic.v_fc1.bias.data)

        #Update for branch 2
        next_value_1 = actor_critic_1(
            Variable(rollouts_1.observations[-1], volatile=True),
            Variable(rollouts_1.states[-1], volatile=True),
            Variable(rollouts_1.masks[-1], volatile=True))[0].data

        rollouts_1.compute_returns(next_value_1, args.use_gae, args.gamma,
                                   args.tau)

        values_1, action_log_probs_1, dist_entropy_1, states_1 = actor_critic_1.evaluate_actions(
            Variable(rollouts_1.observations[:-1].view(-1, *obs_shape)),
            Variable(rollouts_1.states[0].view(-1, actor_critic_1.state_size)),
            Variable(rollouts_1.masks[:-1].view(-1, 1)),
            Variable(rollouts_1.actions.view(-1, action_shape)))

        values_1 = values_1.view(args.num_steps, args.num_processes, 1)
        action_log_probs_1 = action_log_probs_1.view(args.num_steps,
                                                     args.num_processes, 1)

        advantages_1 = Variable(rollouts_1.returns[:-1]) - values_1
        value_loss_1 = advantages_1.pow(2).mean()

        action_loss_1 = -(Variable(advantages_1.data) *
                          action_log_probs_1).mean()

        optimizer_1.zero_grad()
        (value_loss_1 * args.value_loss_coef + action_loss_1 -
         dist_entropy_1 * args.entropy_coef).backward()
        nn.utils.clip_grad_norm(actor_critic_1.parameters(),
                                args.max_grad_norm)
        optimizer_1.step()
        rollouts_1.after_update()

        #share params branch 2 -> branch 1
        actor_critic.a_fc1.weight.data = copy.deepcopy(
            actor_critic_1.a_fc1.weight.data)
        actor_critic.a_fc1.bias.data = copy.deepcopy(
            actor_critic_1.a_fc1.bias.data)
        actor_critic.v_fc1.weight.data = copy.deepcopy(
            actor_critic_1.v_fc1.weight.data)
        actor_critic.v_fc1.bias.data = copy.deepcopy(
            actor_critic_1.v_fc1.bias.data)

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo,
                                     args.env_name + '_' + env_name_1)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            save_model_1 = actor_critic_1
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                save_model_1 = copy.deepcopy(actor_critic_1).cpu()

            save_model = [
                save_model,
                hasattr(envs, 'ob_rms') and envs.ob_rms or None
            ]
            save_model_1 = [
                save_model_1,
                hasattr(envs_1, 'ob_rms') and envs_1.ob_rms or None
            ]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + ".pt"))
            torch.save(save_model_1, os.path.join(save_path,
                                                  env_name_1 + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards.mean(), final_rewards.median(),
                        final_rewards.min(), final_rewards.max(),
                        dist_entropy.data[0], value_loss.data[0],
                        action_loss.data[0]))
            print(
                "Updates_1 {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        final_rewards_1.mean(), final_rewards_1.median(),
                        final_rewards_1.min(), final_rewards_1.max(),
                        dist_entropy_1.data[0], value_loss_1.data[0],
                        action_loss_1.data[0]))

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name,
                                  args.algo)
                win1 = visdom_plot(viz_1, win1, args.log_dir_1, env_name_1,
                                   args.algo)
            except IOError:
                pass
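
This example trains two MLPPolicy branches on related tasks and keeps their first actor/critic layers (a_fc1, v_fc1) synchronized by copying weights back and forth after each update. A minimal sketch of that copy pattern, with generic nn.Linear layers standing in for the real modules:

import copy
import torch.nn as nn

# Two hypothetical single-layer "branches" standing in for actor_critic and actor_critic_1.
branch_a = nn.Linear(8, 64)
branch_b = nn.Linear(8, 64)

# Keep branch_b's layer identical to branch_a's, as done above for a_fc1 / v_fc1.
branch_b.weight.data = copy.deepcopy(branch_a.weight.data)
branch_b.bias.data = copy.deepcopy(branch_a.bias.data)

# A more conventional in-place alternative:
# with torch.no_grad():
#     branch_b.weight.copy_(branch_a.weight)
#     branch_b.bias.copy_(branch_a.bias)
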
Example #6
File: main.py  Project: umd-agrc/QuadRL
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    observation_space = np.zeros((3, 1))
    action_space = np.zeros((4, 1))

    obs_shape = np.shape(observation_space)
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    # if observation_space == 3:
    #     actor_critic = CNNPolicy(obs_shape[0], action_space, args.recurrent_policy)
    # else:
    #     assert not args.recurrent_policy, \
    #         "Recurrent policy is not implemented for the MLP controller"
    #     actor_critic = MLPPolicy(obs_shape[0], action_space)
    actor_critic = MLPPolicy(obs_shape[0], action_space)

    action_shape = np.shape(action_space)[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = np.shape(observation_space)[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs.reshape(current_obs[:, -shape_dim0:].shape[1:])

    obs = reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()
            print(action)

            # Observe reward and next obs
            obs, reward, done, info = envstep(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward,
                            masks)

        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()

            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()
        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(advantages,
                                                                  args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(advantages,
                                                                     args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                    return_batch, masks_batch, old_action_log_probs_batch, \
                    adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print(
                "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{"
                ":.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                    format(j, total_num_steps,
                           int(total_num_steps / (end - start)),
                           final_rewards.mean(),
                           final_rewards.median(),
                           final_rewards.min(),
                           final_rewards.max(), dist_entropy.data[0],
                           value_loss.data[0], action_loss.data[0]))
        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
            except IOError:
                pass
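
Most of these examples share the same frame-stacking helper: update_current_obs shifts the stored stack left by one raw observation and writes the newest observation into the last slot. A minimal sketch of that buffer update with made-up shapes:

import numpy as np
import torch

num_processes, num_stack, obs_dim = 4, 4, 3
shape_dim0 = obs_dim                                   # size of one raw observation
current_obs = torch.zeros(num_processes, num_stack * obs_dim)

def update_current_obs(obs):
    # obs: numpy array of shape (num_processes, obs_dim) returned by the environment
    obs = torch.from_numpy(obs).float()
    if num_stack > 1:
        # drop the oldest frame: shift everything left by one observation
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs                 # newest frame goes in the last slot

update_current_obs(np.random.rand(num_processes, obs_dim))
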