Example #1
def collect_samples(pid, queue, env, policy, custom_reward, mean_action,
                    render, running_state, min_batch_size):
    torch.randn(pid)  # draws pid values from the global RNG so each worker ends up in a different random state
    log = dict()
    memory = Memory()  # the memory is re-initialized every time we collect a batch
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:  # collect episodes until we have at least a full batch of samples
        state = env.reset()  # (possibly more, since we only stop at episode boundaries)
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0
        episode = []
        for t in range(10000):  # gym envs already enforce their own upper bound on episode length
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                action_mean, action_log_std, action_std = policy(state_var)
                if mean_action:
                    action = action_mean.numpy()  # use mean value
                else:
                    action = policy.select_action(state_var)[0].numpy(
                    )  # sample from normal distribution
            action = int(action) if policy.is_disc_action else action.astype(
                np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:  # running normalization of states, tracking their running mean and std
                next_state = running_state(next_state)
            if custom_reward is not None:  # None by default, unless provided when the Agent is initialized
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            episode.append(
                Transition(state, action, next_state, reward, action_mean,
                           action_std.numpy(), None))

            if render:
                env.render()
            if done:
                memory.push(episode)
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
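
The first variant is written to run inside a worker process: when queue is not None it puts its results there instead of returning them. Below is a minimal sketch of how such workers might be launched in parallel; the driver function, worker count and batch split are assumptions for illustration, not the repository's actual Agent code.

import torch.multiprocessing as multiprocessing

def collect_parallel(env, policy, running_state, min_batch_size, num_workers=4):
    # split the requested batch between the worker processes and the main process
    per_worker = int(min_batch_size / num_workers)
    queue = multiprocessing.Queue()
    workers = []
    for pid in range(1, num_workers):
        # in practice each worker would typically get its own copy of env and a CPU policy
        args = (pid, queue, env, policy, None, False, False, running_state, per_worker)
        workers.append(multiprocessing.Process(target=collect_samples, args=args))
    for w in workers:
        w.start()
    # the main process collects its own share directly (queue=None means plain return)
    memory, log = collect_samples(0, None, env, policy, None, False, False,
                                  running_state, per_worker)
    results = [queue.get() for _ in workers]  # each item is [pid, memory, log]
    for w in workers:
        w.join()
    return memory, log, results
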
Example #2
def collect_samples(pid, env, policy, num_req_steps, num_req_episodes,
                    mean_action, render, running_state, context_points_list,
                    attention, fixed_sigma):

    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    action_sum = zeros(context_points_list[0][1].shape[-1])

    with torch.no_grad():
        all_x_context, all_y_context = merge_context(
            context_points_list)  # merge episodes in one context set
        #  compute step-independent values
        if policy.id == 'DKL':
            policy.set_train_data(inputs=all_x_context.squeeze(0),
                                  targets=all_y_context.view(-1),
                                  strict=False)
        elif policy.id in 'ANP':  # substring check: matches both 'NP' and 'ANP' policies
            #  compute context representation and latent variable
            if attention:
                encoder_input, keys = policy.xy_to_a.get_input_key(
                    all_x_context, all_y_context)
            else:
                r_context = policy.xy_to_r(all_x_context, all_y_context)
            _, z_dist = policy.sample_z(all_x_context, all_y_context)

        while num_steps < num_req_steps or num_episodes < num_req_episodes:

            episode = []
            reward_episode = 0
            if policy.id in 'ANP':
                z_sample = z_dist.sample()
                if not attention:
                    rep = torch.cat([z_sample, r_context], dim=-1)

            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()
            for t in range(10000):
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                if policy.id == 'DKL':
                    with gpytorch.settings.use_toeplitz(
                            True), gpytorch.settings.fast_pred_var():
                        pi = policy(state_var)
                    mean = pi.mean
                    stddev = pi.stddev

                elif policy.id == 'MI':
                    mean = policy(all_x_context, all_y_context, state_var)
                    stddev = fixed_sigma
                else:  #  NPs and ANPs
                    if attention:
                        a_repr = policy.xy_to_a.get_repr(
                            encoder_input, keys, state_var)
                        representation = torch.cat(
                            [z_sample, a_repr.squeeze(0)], dim=-1)
                        mean, stddev = policy.xz_to_y(state_var,
                                                      representation)
                    else:
                        mean, stddev = policy.xrep_to_y(state_var, rep)

                if fixed_sigma is not None:
                    sigma = fixed_sigma  # use sigma learnt by update step
                else:
                    sigma = stddev.view(-1)  # use predicted sigma (NPs)

                action_distribution = Normal(mean, sigma)

                if mean_action:
                    action = mean.view(-1)  # use mean value
                    mean_rep = torch.cat([z_dist.mean, r_context], dim=-1)
                    mean, stddev = policy.xrep_to_y(state_var, mean_rep)
                    mean_s, _ = policy.xrep_to_y(
                        state_var,
                        torch.cat([z_dist.mean + z_dist.stddev, r_context],
                                  dim=-1))
                    sigma = torch.abs(mean_s - mean)
                else:
                    action = action_distribution.sample().view(
                        -1)  # sample from normal distribution
                cov = torch.diag(sigma.view(-1)**2)

                next_state, reward, done, _ = env.step(action.cpu().numpy())
                reward_episode += reward
                if running_state is not None:  # running normalization of states, tracking their running mean and std
                    next_state = running_state(next_state)

                episode.append(
                    Transition(state,
                               action.cpu().numpy(), next_state, reward,
                               mean.cpu().numpy(),
                               sigma.cpu().numpy(), None, cov))
                action_sum += action
                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state
            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)
    print('tot episodes: ', num_episodes)
    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    try:
        log['avg_reward'] = total_reward.item() / num_episodes
    except AttributeError:
        log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['action_mean'] = action_sum / num_steps

    return memory, log
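
The loop above builds a diagonal covariance with cov = torch.diag(sigma**2) but still samples from the per-dimension Normal(mean, sigma); the last example in this file samples from MultivariateNormal(mean, cov) instead. The snippet below is a self-contained illustration (not part of the collector) of why the two are interchangeable when the covariance is diagonal.

import torch
from torch.distributions import Normal, MultivariateNormal

mean = torch.zeros(3)
sigma = torch.tensor([0.1, 0.2, 0.3])
cov = torch.diag(sigma ** 2)                   # diagonal covariance, as stored in the Transition

a1 = Normal(mean, sigma).sample()              # independent per-dimension Gaussians
a2 = MultivariateNormal(mean, cov).sample()    # joint Gaussian with diagonal covariance
# both draws have shape [3] and identical per-dimension marginals N(0, sigma_i^2)
print(a1.shape, a2.shape)
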
Example #3
def collect_samples_mlp(pid, env, policy, num_req_steps, num_req_episodes,
                        custom_reward, render, running_state, fixed_sigma):

    torch.randn(pid)
    log = dict()
    memory = Memory()  # the memory is re-initialized every time we collect a batch
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    with torch.no_grad():
        while num_steps < num_req_steps or num_episodes < num_req_episodes:

            episode = []
            reward_episode = 0

            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()
            for t in range(10000):
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                pi = policy(state_var)
                mean = pi
                #stddev = pi.stddev
                sigma = fixed_sigma
                cov = torch.diag(sigma**2)

                action_distribution = Normal(mean, sigma)

                action = action_distribution.sample(
                )  # sample from normal distribution
                next_state, reward, done, _ = env.step(action.cpu())
                reward_episode += reward
                if running_state is not None:
                    next_state = running_state(next_state)
                if custom_reward is not None:
                    reward = custom_reward(state, action)
                    total_c_reward += reward
                    min_c_reward = min(min_c_reward, reward)
                    max_c_reward = max(max_c_reward, reward)

                episode.append(
                    Transition(state,
                               action.cpu().numpy(), next_state, reward,
                               mean.cpu().numpy(),
                               sigma.cpu().numpy(), None, cov))

                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state
            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)
    print('tot episodes: ', num_episodes)
    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    try:
        log['avg_reward'] = total_reward.item() / num_episodes
    except AttributeError:
        log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    return memory, log
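
A hedged usage sketch for collect_samples_mlp. The environment name, the toy policy and the sigma value are illustrative assumptions; all the loop above requires is that policy maps a [1, 1, state_dim] tensor to an action mean and that fixed_sigma has one entry per action dimension. The old 4-tuple gym step API is assumed, as in the collector itself, and dtypes may need a tweak depending on the env.

import gym
import torch
import torch.nn as nn

env = gym.make('MountainCarContinuous-v0')   # any continuous-control env with the old gym API
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

# toy stand-in for the repository's MLP policy; double precision because the collector
# converts states with tensor(state), which typically yields float64 here
policy = nn.Sequential(nn.Linear(state_dim, 64), nn.Tanh(),
                       nn.Linear(64, action_dim)).double()

fixed_sigma = 0.3 * torch.ones(action_dim, dtype=torch.float64)  # constant exploration noise

memory, log = collect_samples_mlp(pid=0, env=env, policy=policy,
                                  num_req_steps=2048, num_req_episodes=10,
                                  custom_reward=None, render=False,
                                  running_state=None, fixed_sigma=fixed_sigma)
print(log['num_episodes'], log['avg_reward'])
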
Example #4
def collect_samples(pid, env, policy, custom_reward, mean_action, render,
                    running_state, context_points_list, attention,
                    fixed_sigma):
    torch.randn(pid)
    log = dict()
    memory = Memory()  # the memory is re-initialized every time we collect a batch
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    action_sum = zeros(context_points_list[0][1].shape[-1])

    with torch.no_grad():
        for ep in range(len(context_points_list)):
            all_x_context_list = [context_points_list[0][0][:, [0], :]]
            all_y_context_list = [context_points_list[0][1][:, [0], :]]
            episode = []
            reward_episode = 0
            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()
            for t in range(10000):
                all_x_context = torch.cat(all_x_context_list, dim=1)
                all_y_context = torch.cat(all_y_context_list, dim=1)
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                if policy.id == 'DKL':
                    with gpytorch.settings.use_toeplitz(
                            True), gpytorch.settings.fast_pred_var():
                        pi = policy(state_var)
                    mean = pi.mean
                    stddev = pi.stddev
                    if torch.isnan(stddev):
                        print(stddev)
                elif policy.id == 'MI':
                    mean = policy(all_x_context, all_y_context, state_var)
                    stddev = fixed_sigma
                else:
                    if attention:
                        pi = policy(all_x_context, all_y_context, state_var)
                        mean = pi.mean
                        stddev = pi.stddev
                    else:
                        pi = policy(all_x_context, all_y_context, state_var)
                        mean = pi.mean
                        stddev = pi.stddev
                if fixed_sigma is not None:
                    sigma = fixed_sigma
                else:
                    sigma = stddev

                action_distribution = Normal(mean, sigma)

                if mean_action:
                    action = mean  # use mean value
                    mean, stddev = policy.xz_to_y(state_var, z_dist.mean)
                else:
                    action = action_distribution.sample().view(
                        -1)  # sample from normal distribution
                    cov = torch.diag(sigma**2)
                next_state, reward, done, _ = env.step(action.cpu())
                reward_episode += reward
                if running_state is not None:  # running normalization of states, tracking their running mean and std
                    next_state = running_state(next_state)
                if custom_reward is not None:  # None by default, unless provided when the Agent is initialized
                    reward = custom_reward(state, action)
                    total_c_reward += reward
                    min_c_reward = min(min_c_reward, reward)
                    max_c_reward = max(max_c_reward, reward)
                if any(torch.isnan(state_var.view(-1))) or any(
                        torch.isnan(action.view(-1))) or any(
                            torch.isnan(mean.view(-1))):
                    print('wat')
                all_x_context_list.append(state_var)
                all_y_context_list.append(mean)
                episode.append(
                    Transition(state,
                               action.cpu().numpy(), next_state, reward,
                               mean.cpu().numpy(),
                               sigma.cpu().numpy(), None, cov))
                action_sum += action
                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state
            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    try:
        log['avg_reward'] = total_reward.item() / num_episodes
    except AttributeError:
        log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['action_mean'] = action_sum / num_steps
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    return memory, log
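
The example above grows its context set autoregressively: it starts from a single seed point and appends each visited (state, predicted mean) pair, so later predictions condition on the episode so far. The snippet below isolates that pattern with made-up shapes, independent of any policy.

import torch

x_ctx = [torch.zeros(1, 1, 4)]   # seed state, shape [batch=1, n_points=1, state_dim=4]
y_ctx = [torch.zeros(1, 1, 2)]   # seed prediction, shape [1, 1, action_dim=2]
for step in range(3):
    all_x = torch.cat(x_ctx, dim=1)      # [1, step+1, 4]
    all_y = torch.cat(y_ctx, dim=1)      # [1, step+1, 2]
    new_state = torch.randn(1, 1, 4)     # stands in for tensor(state).unsqueeze(0).unsqueeze(0)
    new_mean = torch.randn(1, 1, 2)      # stands in for the policy's predicted mean
    x_ctx.append(new_state)
    y_ctx.append(new_mean)
    print(step, all_x.shape, all_y.shape)
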
Example #5
def collect_samples(pid, env, policy, num_ep, custom_reward, render,
                    running_state, fixed_sigma):
    torch.randn(pid)
    log = dict()
    memory = Memory()  # the memory is re-initialized every time we collect a batch
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    with torch.no_grad():
        for ep in range(num_ep):

            episode = []
            reward_episode = 0

            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()
            for t in range(10000):
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                pi = policy(state_var)
                mean = pi.mean
                stddev = pi.stddev

                if fixed_sigma is not None:
                    sigma = fixed_sigma
                else:
                    sigma = stddev

                action_distribution = Normal(mean, sigma)

                action = action_distribution.sample().squeeze(0).squeeze(
                    0)  # sample from normal distribution
                next_state, reward, done, _ = env.step(action)
                reward_episode += reward
                if running_state is not None:  # running normalization of states, tracking their running mean and std
                    next_state = running_state(next_state)
                if custom_reward is not None:  # None by default, unless provided when the Agent is initialized
                    reward = custom_reward(state, action)
                    total_c_reward += reward
                    min_c_reward = min(min_c_reward, reward)
                    max_c_reward = max(max_c_reward, reward)

                episode.append(
                    Transition(state, action.numpy(), next_state, reward,
                               mean.numpy(), stddev.numpy(), None))

                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state
            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    return memory, log
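
This example expects policy(state_var) to return a distribution-like object exposing .mean and .stddev. A minimal sketch of such a policy is given below; the class name, layer sizes and state-independent log-std are assumptions, not the repository's implementation.

import torch
import torch.nn as nn
from torch.distributions import Normal

class GaussianPolicy(nn.Module):
    def __init__(self, state_dim, action_dim, hidden=64):
        super().__init__()
        self.mu = nn.Sequential(nn.Linear(state_dim, hidden), nn.Tanh(),
                                nn.Linear(hidden, action_dim))
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        # returning a Normal gives callers .mean, .stddev and .sample() for free
        return Normal(self.mu(x), self.log_std.exp())
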
Example #6
def collect_samples(pid, env, policy, custom_reward, mean_action, render,
                    running_state, context_points_list, attention,
                    fixed_sigma):
    torch.randn(pid)
    log = dict()
    memory = Memory()  # the memory is re-initialized every time we collect a batch
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    for episode_contexts in context_points_list:
        episode = []
        reward_episode = 0
        x_context, y_context, real_len = episode_contexts
        if attention:
            encoder_input, keys = policy.xy_to_a.get_input_key(
                x_context[:real_len], y_context[:real_len])
        _, z_dist = policy.sample_z(x_context[:real_len], y_context[:real_len])
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        z_sample = z_dist.sample()
        for t in range(10000):
            state_var = tensor(state).unsqueeze(0).unsqueeze(0)
            with torch.no_grad():
                if attention:
                    a_repr = policy.xy_to_a.get_repr(encoder_input, keys,
                                                     state_var)
                    representation = torch.cat(
                        [z_sample, a_repr.squeeze(0)], dim=-1)
                    mean, stddev = policy.xz_to_y(state_var, representation)
                else:
                    mean, stddev = policy.xz_to_y(state_var, z_sample)

                if fixed_sigma is not None:
                    sigma = fixed_sigma
                else:
                    sigma = stddev

                action_distribution = Normal(mean, sigma)

                if mean_action:
                    action = mean  # use mean value
                    mean, stddev = policy.xz_to_y(state_var, z_dist.mean)
                else:
                    action = action_distribution.sample().squeeze(0).squeeze(
                        0)  # sample from normal distribution

            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:  # running normalization of states, tracking their running mean and std
                next_state = running_state(next_state)
            if custom_reward is not None:  # None by default, unless provided when the Agent is initialized
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            episode.append(
                Transition(state, action.numpy(), next_state, reward,
                           mean.numpy(), stddev.numpy(), None))

            if render:
                env.render()
            if done:
                memory.push(episode)
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    return memory, log
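
The Transition tuple used throughout these examples is defined elsewhere in the repository; note that some examples pass seven values and others append a covariance as an eighth. A plausible reconstruction, stated purely as an assumption about its shape (the field names are hypothetical, only the call sites above constrain the order and arity), is:

from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward',
                         'mean', 'stddev', 'disc_rew', 'cov'),
                        defaults=(None,))   # lets the 7-argument calls omit cov
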
Example #7
def collect_samples(pid, env, policy, num_req_steps, num_req_episodes, num_context, render,
                    running_state, context_points_list, pick_dist, fixed_sigma):

    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    action_sum = zeros(context_points_list[0][1].shape[-1])

    # merge all episodes in the replay memory (RM) into a single context set
    all_x = torch.cat([ep[0][:ep[-1], :] for ep in context_points_list], dim=-2)
    all_y = torch.cat([ep[1][:ep[-1], :] for ep in context_points_list], dim=-2)
    num_tot_context = all_x.shape[-2]

    if num_tot_context < num_context:  # no need to select a subset
        pick = False
        all_x_context, all_y_context = [all_x.view(1, num_tot_context, -1), all_y.view(1, num_tot_context, -1)]
    else:
        pick = True

    with torch.no_grad():
        while num_steps < num_req_steps or num_episodes < num_req_episodes:
            # print('ep: ', ep)
            episode = []
            reward_episode = 0

            state = env.reset()
            if running_state is not None:
                state = running_state(state)
            t_ep = time.time()
            for t in range(10000):
                state_var = tensor(state).unsqueeze(0).unsqueeze(0)
                if pick:
                    all_x_context, all_y_context = get_close_context(t, state_var, context_points_list, pick_dist, num_tot_context=num_context)
                if policy.id == 'DKL':
                    policy.set_train_data(all_x_context.squeeze(0), all_y_context.squeeze(0).squeeze(-1), strict=False)
                    pi = policy(state_var)
                    mean = pi.mean
                    stddev = pi.stddev
                elif policy.id == 'MI':
                    mean = policy(all_x_context, all_y_context, state_var)
                    stddev = fixed_sigma
                else:
                    pi = policy(all_x_context, all_y_context, state_var)
                    mean = pi.mean
                    stddev = pi.stddev

                if fixed_sigma is not None:
                    sigma = fixed_sigma
                else:
                    sigma = stddev.view(-1)
                cov = torch.diag(sigma ** 2)

                action_distribution = MultivariateNormal(mean, cov)
                action = action_distribution.sample().view(-1)  # sample from normal distribution

                next_state, reward, done, _ = env.step(action.cpu().numpy())
                reward_episode += reward
                if running_state is not None:  # running normalization of states, tracking their running mean and std
                    next_state = running_state(next_state)

                episode.append(Transition(state, action.cpu().numpy(), next_state, reward, mean.cpu().numpy(),
                                          sigma.cpu().numpy(), None, cov))
                action_sum += action
                if render:
                    env.render()
                if done:
                    memory.push(episode)
                    break

                state = next_state
            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)
    print('tot episodes: ', num_episodes)
    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    try:
        log['avg_reward'] = total_reward.item() / num_episodes
    except AttributeError:
        log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['action_mean'] = action_sum / num_steps

    return memory, log
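
All of the collectors treat running_state as a callable that normalizes a state with running estimates of the mean and standard deviation. A minimal sketch of such a callable is below (a Welford-style filter; the class name and details are illustrative assumptions, not the repository's actual helper).

import numpy as np

class RunningNorm:
    def __init__(self, dim, eps=1e-8):
        self.n = 0
        self.mean = np.zeros(dim)
        self.m2 = np.zeros(dim)
        self.eps = eps

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        # Welford's online update of the running mean and (unnormalized) variance
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)
        var = self.m2 / max(self.n - 1, 1)
        return (x - self.mean) / np.sqrt(var + self.eps)

running_state = RunningNorm(dim=4)   # e.g. a 4-dimensional observation space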