Exemplo n.º 1
0
def test(rank, args, T, shared_model):
    """Periodically evaluate the shared model with a greedy policy.

    Loops until the global step counter ``T`` exceeds ``args.T_max``. An
    evaluation is run immediately on start-up and then again whenever at
    least ``args.evaluation_interval`` global steps have elapsed since the
    previous one. Each evaluation plays ``args.evaluation_episodes`` episodes
    with a local copy of ``shared_model``, picking the highest-scoring action
    of every policy head at each step. Afterwards the averages are printed,
    the per-episode returns are passed to ``plot_line`` and the local weights
    are saved as ``results/<step>_model.pth``.

    Args:
        rank: Worker index; added to ``args.seed`` to seed torch and the env.
        args: Run configuration. Fields read here: ``seed``, ``width``,
            ``height``, ``frame_skip``, ``rewarding_distance``,
            ``control_magnitude``, ``reward_continuous``, ``render``,
            ``non_rgb_state_size``, ``hidden_size``, ``T_max``,
            ``evaluation_episodes``, ``evaluation_interval``,
            ``max_episode_length`` and ``evaluate``.
        T: Shared global step counter exposing ``value()``.
        shared_model: Model whose ``state_dict`` is copied into the local
            model at the start of every evaluation episode.

    Returns:
        None. Returns right after the first evaluation when ``args.evaluate``
        is set, otherwise once ``T`` has passed ``args.T_max``.
    """
    torch.manual_seed(args.seed + rank)

    env = JacoEnv(args.width,
                  args.height,
                  args.frame_skip,
                  args.rewarding_distance,
                  args.control_magnitude,
                  args.reward_continuous)
    env.seed(args.seed + rank)
    if args.render:
        # Open an interactive window, primed with the third element of the
        # reset observation so that imshow has an image to size itself from
        (_, _, obs_rgb_view2) = env.reset()
        plt.ion()
        _, ax = plt.subplots()
        im = ax.imshow(obs_rgb_view2)

    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.eval()
    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    n_digits = str(
        len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        cx = Variable(
                            torch.zeros(1, args.hidden_size), volatile=True)
                        # Reset environment, done flag and episode statistics
                        state = state_to_tensor(env.reset())
                        done, episode_length = False, 0
                        reward_sum = 0

                    # Calculate policy
                    policy, _, (hx, cx) = model(
                        Variable(
                            state[0], volatile=True),
                        Variable(
                            state[1], volatile=True),
                        (hx.detach(),
                         cx.detach()))  # Break graph for memory efficiency

                    # Choose action greedily (one index per policy head)
                    action = [p.max(1)[1].data[0, 0] for p in policy]

                    # Step
                    state, reward, done = env.step(action)
                    # Keep the raw frame before the state is converted
                    obs_rgb_view1 = state[1]
                    state = state_to_tensor(state)
                    reward_sum += reward
                    # Stop episodes at a max length.
                    # NOTE: the cap is tested before the increment below, so
                    # an episode can last max_episode_length + 1 steps.
                    done = done or episode_length >= args.max_episode_length
                    episode_length += 1  # Increase episode counter

                    # Optionally render validation states
                    if args.render:
                        # rendering the first camera view
                        im.set_data(obs_rgb_view1)
                        plt.draw()
                        plt.pause(0.05)

                        # Alternative: render the mujoco simulation with
                        # mujoco_py.MjViewer(env.sim).render()

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break

            print(('[{}] Step: {:<' + n_digits +
                   '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                       datetime.utcnow().strftime(
                           '%Y-%m-%d %H:%M:%S,%f')[:-3], t_start,
                       sum(avg_rewards) / args.evaluation_episodes,
                       sum(avg_episode_lengths) / args.evaluation_episodes))

            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards)  # Plot rewards
            # Make sure the checkpoint directory exists so that the save
            # below cannot fail on a missing folder
            os.makedirs('results', exist_ok=True)
            torch.save(model.state_dict(),
                       os.path.join('results', str(t_start) +
                                    '_model.pth'))  # Checkpoint model params
            can_test = False  # Finish testing
            if args.evaluate:
                return
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True

        time.sleep(0.001)  # Check if available to test every millisecond
Exemplo n.º 2
0
def train(rank, args, T, shared_model, optimiser):
    """Run one training worker against its own environment instance.

    Until the global step counter ``T`` exceeds ``args.T_max`` the worker
    repeatedly: copies the weights of ``shared_model`` into a local model,
    collects a rollout of fewer than or exactly ``args.t_max`` steps (shorter
    if the episode ends) by sampling actions from the local policy, computes
    the bootstrap value ``R`` for the state the rollout stopped in, and hands
    everything to ``_train``.

    Args:
        rank: Worker index; added to ``args.seed`` to seed torch and the env.
        args: Run configuration. Fields read here: ``seed``, ``width``,
            ``height``, ``frame_skip``, ``rewarding_distance``,
            ``control_magnitude``, ``reward_continuous``,
            ``non_rgb_state_size``, ``hidden_size``, ``T_max``, ``t_max`` and
            ``max_episode_length`` (``args`` is also forwarded to ``_train``).
        T: Shared global step counter exposing ``value()`` and
            ``increment()``; incremented once per environment step.
        shared_model: Model whose ``state_dict`` is loaded before every
            rollout; also forwarded to ``_train``.
        optimiser: Optimiser forwarded unchanged to ``_train``.

    Returns:
        None.
    """
    torch.manual_seed(args.seed + rank)

    env = JacoEnv(args.width, args.height, args.frame_skip,
                  args.rewarding_distance, args.control_magnitude,
                  args.reward_continuous)
    env.seed(args.seed + rank)

    # TODO: pass in the observation and action space
    model = ActorCritic(None, args.non_rgb_state_size, None, args.hidden_size)
    model.train()

    t = 1  # Thread step counter
    done = True  # Start new episode

    while T.value() <= args.T_max:
        # Sync with shared model at least every t_max steps
        model.load_state_dict(shared_model.state_dict())
        # Get starting timestep
        t_start = t

        # Reset or pass on hidden state
        if done:
            hx = Variable(torch.zeros(1, args.hidden_size))
            cx = Variable(torch.zeros(1, args.hidden_size))
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            done, episode_length = False, 0

        else:
            # Perform truncated backpropagation-through-time (allows freeing buffers after backwards call)
            hx = hx.detach()
            cx = cx.detach()

        # Lists of outputs for training
        policies, Vs, actions, rewards = [], [], [], []

        while not done and t - t_start < args.t_max:
            # Calculate policy and value
            policy, V, (hx, cx) = model(Variable(state[0]), Variable(state[1]),
                                        (hx, cx))

            # Sample action
            action = [
                p.multinomial().data[0, 0] for p in policy
            ]  # Graph broken as loss for stochastic action calculated manually

            # Step
            state, reward, done = env.step(action)
            state = state_to_tensor(state)
            # Stop episodes at a max length.
            # NOTE: the cap is tested before the increment below, so an
            # episode can last max_episode_length + 1 steps.
            done = done or episode_length >= args.max_episode_length
            episode_length += 1  # Increase episode counter

            # Save outputs for online training
            policies.append(policy)
            Vs.append(V)
            actions.append(Variable(torch.LongTensor(action)))
            rewards.append(reward)

            # Increment counters
            t += 1
            T.increment()

        # Break graph for last values calculated (used for targets, not directly as model outputs)
        if done:
            # R = 0 for terminal s
            R = Variable(torch.zeros(1, 1))

        else:
            # R = V(s_i; θ) for non-terminal s
            _, R, _ = model(Variable(state[0]), Variable(state[1]), (hx, cx))
            R = R.detach()
        # Vs ends up one entry longer than the other lists: the bootstrap
        # value is appended after the per-step values
        Vs.append(R)

        # Train the network
        _train(args, T, model, shared_model, optimiser, policies, Vs, actions,
               rewards, R)