Example #1
# Imports assumed by this example; the project-specific classes (UsbCamEnv,
# MLPPolicy, RolloutStorage) and the module-level settings (args, num_updates,
# ENV_IMG_W, ENV_IMG_H, env_done_reward, search_done_reward) are defined
# elsewhere in the repository.
import copy
import os
import sys

import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler


def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = UsbCamEnv(ENV_IMG_W, ENV_IMG_H, env_done_reward)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    actor_critic = MLPPolicy(obs_shape[0], envs.action_space)
    action_shape = envs.action_space.shape[0]

    print('+++++++++++++++++++++++++++++++++++++')
    print('obs_shape:', obs_shape)
    print('action_shape:', action_shape)
    print('+++++++++++++++++++++++++++++++++++++')

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
    current_state = torch.zeros(args.num_processes, *obs_shape)

    def update_current_state(state):
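        # Frame stacking: shift the previously stored frames toward the front of the
        # channel dimension and write the newest frame into the last slot.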
        shape_dim0 = envs.observation_space.shape[0]
        state = torch.from_numpy(state).float()
        if args.num_stack > 1:
            current_state[:, :-shape_dim0] = current_state[:, shape_dim0:]
        current_state[:, -shape_dim0:] = state

    state = envs.reset()
    update_current_state(state)

    rollouts.states[0].copy_(current_state)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_state = current_state.cuda()
        rollouts.cuda()

    # Keep a separate copy of the policy; its frozen log-probabilities serve as
    # the "old policy" term in the PPO probability ratio.
    old_model = copy.deepcopy(actor_critic)

    for j in range(num_updates):
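        # Each update: roll out num_steps environment transitions with the current
        # policy, compute returns/advantages, then optimize the clipped PPO objective.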
        for step in range(args.num_steps):
            # Sample actions
            value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True))
            cpu_actions = action.data.cpu().numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)

            # Convert the raw policy output to the environment's real action range once,
            # then log the step index, the four action components and the reward.
            real_action = envs.convert_2_real_action(cpu_actions)
            print('%3d  [%3d  %3d  %3d  %3d]  %3d' % (step,
                                                      int(real_action[0, 0]),
                                                      int(real_action[0, 1]),
                                                      int(real_action[0, 2]),
                                                      int(real_action[0, 3]),
                                                      reward[0]))

            if reward[0] >= search_done_reward:
                sys.exit()

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_state.dim() == 4:
                current_state *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_state *= masks

            update_current_state(state)
            rollouts.insert(step, current_state, action.data, value.data, reward, masks)

        # Bootstrap the return computation with the value estimate of the last rollout state.
        next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data

        if hasattr(actor_critic, 'obs_filter'):
            actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape))

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

        old_model.load_state_dict(actor_critic.state_dict())
        if hasattr(actor_critic, 'obs_filter'):
            old_model.obs_filter = actor_critic.obs_filter

        for _ in range(args.ppo_epoch):
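            # Draw random minibatches over all collected (process, step) transitions.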
            sampler = BatchSampler(SubsetRandomSampler(range(args.num_processes * args.num_steps)), args.batch_size * args.num_processes, drop_last=False)
            for indices in sampler:
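                # Slice the flattened rollout tensors down to this minibatch.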
                indices = torch.LongTensor(indices)
                if args.cuda:
                    indices = indices.cuda()
                states_batch = rollouts.states[:-1].view(-1, *obs_shape)[indices]
                actions_batch = rollouts.actions.view(-1, action_shape)[indices]
                return_batch = rollouts.returns[:-1].view(-1, 1)[indices]

                # Reshape to do in a single forward pass for all steps
                values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(states_batch), Variable(actions_batch))

                _, old_action_log_probs, _ = old_model.evaluate_actions(Variable(states_batch, volatile=True), Variable(actions_batch, volatile=True))

                # Probability ratio pi_new(a|s) / pi_old(a|s); the old log-probs are detached.
                ratio = torch.exp(action_log_probs - Variable(old_action_log_probs.data))
                adv_targ = Variable(advantages.view(-1, 1)[indices])
                surr1 = ratio * adv_targ
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ
                action_loss = -torch.min(surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP)

                value_loss = (Variable(return_batch) - values).pow(2).mean()

                optimizer.zero_grad()
                (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                optimizer.step()

        # Carry the last observation over as the first state of the next rollout.
        rollouts.states[0].copy_(rollouts.states[-1])

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            print("Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, j * args.num_processes * args.num_steps,
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), -dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
Example #2
            pass
        except NameError:
            pass

        # save learned network only for root node
        if (j % args.save_interval == 0
                and args.save_dir != "") or j >= num_updates - 1:
            print('save model!!')
            save_path = os.path.join(args.save_dir, args.algo, 'phase2',
                                     args.env_name)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            save_state_dict = actor_critic.state_dict()
            # if args.cuda:
            #     save_model = copy.deepcopy(actor_critic).cpu()

            # Bundle the network weights with the observation/state/return
            # normalization statistics, when the robot tracks them.
            save_model = [
                save_state_dict,
                env.robot.ob_rms if hasattr(env.robot, 'ob_rms') else None,
                env.robot.st_rms if hasattr(env.robot, 'st_rms') else None,
                env.robot.ret_rms if hasattr(env.robot, 'ret_rms') else None
            ]

            # save_model = [save_model, None]
            str_itr_num = '({})'.format(args.itr_num + j if args.cont_learning else j)
            torch.save(