def calc_loss(batch, net, tgt_net, gamma, device="cpu", double=True): states, actions, rewards, dones, next_states = common.unpack_batch(batch) states_v = torch.tensor(states).to(device) next_states_v = torch.tensor(next_states).to(device) actions_v = torch.tensor(actions).to(device) rewards_v = torch.tensor(rewards).to(device) done_mask = torch.tensor(dones, dtype=torch.bool).to(device) state_action_values = net(states_v).gather( 1, actions_v.unsqueeze(-1)).squeeze(-1) # If double # DQN is enabled, we calculate the best action to take in the next state using # our main trained network, but values corresponding to this action come from # the target network. if double: next_state_actions = net(next_states_v).max(1)[1] next_state_values = tgt_net(next_states_v).gather( 1, next_state_actions.unsqueeze(-1)).squeeze(-1) else: next_state_values = tgt_net(next_states_v).max(1)[0] next_state_values[done_mask] = 0.0 expected_state_action_values = next_state_values.detach( ) * gamma + rewards_v return nn.MSELoss()(state_action_values, expected_state_action_values)
def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None): """ :param batch: :param net: :param tgt_net: :param gamma: :param device: :type device Union[str, torch.device] :param save_prefix: :return: """ states, actions, rewards, dones, next_states = Common.unpack_batch(batch) batch_size = len(batch) state_v = torch.tensor(states).to(device) actions_v = torch.tensor(actions, dtype=torch.long).to(device) next_states_v = torch.tensor(next_states).to(device) # need both probability distributions and Q-values for the next # states, so we use the both() call to the network, obtain the best actions to # take in the next state, apply softmax to the distribution, and convert it to the # array. next_distribute_v, next_qvals_v = tgt_net.both(next_states_v) next_actions = next_qvals_v.max(1)[1].data.cpu().numpy() next_distribute = tgt_net.apply_softmax( next_distribute_v).data.cpu().numpy() # extract distributions of the best actions and perform their projection # using the Bellman operator. The result of the projection will be target # distribution about what we want our network output next_best_distribute = next_distribute[range(batch_size), next_actions] dones = dones.astype(torch.bool) proj_distribute = Common.distribute_projection(next_best_distribute, rewards, dones, v_min, v_max, N_ATOMS, gamma) # compute the output of the network and # calculate KL-divergence between projected distribution and the network's # output for the taken actions. KL-divergence shows how much two # distributions differ distribute_v = net(state_v) state_action_values = distribute_v[range(batch_size), actions_v.data] state_log_sm_v = F.log_softmax(state_action_values, dim=1) proj_distribute_v = torch.tensor(proj_distribute).to(device) loss_v = -state_log_sm_v * proj_distribute_v return loss_v.sum(dim=1).mean()
    writer = SummaryWriter(comment="-" + params['run_name'] + "-noisy-net")
    net = NoisyDQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = ptan.agent.TargetNet(net)
    agent = ptan.agent.DQNAgent(net, ptan.actions.ArgmaxActionSelector(),
                                device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx):
                    break

            if len(buffer) < params['replay_initial']:
                continue

            optimizer.zero_grad()
            batch = buffer.sample(params['batch_size'])
            loss_v = common.calc_loss_dqn(batch,
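The NoisyDQN network used above replaces the fully connected layers with noisy variants, which is why the agent can use a plain argmax selector instead of epsilon-greedy. Its definition is not part of this listing; as a rough sketch, an independent-Gaussian noisy layer could look like the following (the class name and the sigma_init value are assumptions for illustration, not the exact code behind NoisyDQN):

import torch
import torch.nn as nn
import torch.nn.functional as F

class NoisyLinear(nn.Linear):
    """Sketch of a noisy linear layer: trainable mu and sigma parameters,
    with Gaussian noise resampled on every forward pass."""
    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super().__init__(in_features, out_features, bias=bias)
        self.sigma_weight = nn.Parameter(
            torch.full((out_features, in_features), sigma_init))
        self.register_buffer("epsilon_weight",
                             torch.zeros(out_features, in_features))
        if bias:
            self.sigma_bias = nn.Parameter(
                torch.full((out_features,), sigma_init))
            self.register_buffer("epsilon_bias", torch.zeros(out_features))

    def forward(self, x):
        # Resample the noise, then apply the perturbed weights and bias.
        self.epsilon_weight.normal_()
        bias = self.bias
        if bias is not None:
            self.epsilon_bias.normal_()
            bias = bias + self.sigma_bias * self.epsilon_bias
        weight = self.weight + self.sigma_weight * self.epsilon_weight
        return F.linear(x, weight, bias)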
help="Enable double DQN") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = gym.make(params['env_name']) env = ptan.common.wrappers.wrap_dqn(env) writer = SummaryWriter(comment="-" + params['run_name'] + "-double=" + str(args.double)) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params['epsilon_start']) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast( env, agent, gamma=params['gamma'], steps_count=1) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params['replay_size']) optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) frame_idx = 0 eval_states = None with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: while True: frame_idx += 1 buffer.populate(1)
def calc_loss(batch, batch_weights, net, tgt_net, gamma, device="cpu"): """ Here we use a small trick to speed up our calculations a bit. As the double DQN method requires us to use our main network to select actions but use the target network to obtain values (in our case, value distributions) for those actions, we need to pass to our main network both the current states and the next states. Earlier, we calculated the network output in two calls, which is not very efficient on GPU. Now, we concatenate both current states and next states into one tensor and obtain the result in one network pass, splitting the result later. We need to calculate both Q-values and raw values' distributions, as our action selection policy is still greedy: we choose the action with the largest Q-value. :param batch: :param batch_weights: :param net: :param tgt_net: :param gamma: :param Union[str, torch.device] device: :return: """ states, actions, rewards, dones, next_states = common.unpack_batch(batch) batch_size = len(batch) states_v = torch.tensor(states).to(device) actions_v = torch.tensor(actions, dtype=torch.long).to(device) next_states_v = torch.tensor(next_states).to(device) batch_weights_v = torch.tensor(batch_weights).to(device) distr_v, qvals_v = net.both(torch.cat((states_v, next_states_v))) next_qvals_v = qvals_v[batch_size:] distr_v = distr_v[:batch_size] # we decide on actions to take in the next state and # obtain the distribution of those actions using our target network. So, the # above net/tgt_net shuffling implements the double DQN method. Then we # apply softmax to distribution for those best actions and copy the data into # CPU to perform the Bellman projection. next_actions_v = next_qvals_v.max(1)[1] next_distr_v = tgt_net(next_states_v) next_best_distr_v = next_distr_v[range(batch_size), next_actions_v.data] next_best_distr_v = tgt_net.apply_softmax(next_best_distr_v) next_best_distr = next_best_distr_v.data.cpu().numpy() dones = dones.astype(np.bool) proj_distr = common.distribute_projection(next_best_distr, rewards, dones, v_min=Vmin, v_max=Vmax, n_atoms=N_ATOMS, gamma=gamma) # Here we obtain the distributions for taken actions and apply log_softmax to # calculate the loss state_action_values = distr_v[range(batch_size), actions_v.data] state_log_sm_v = F.log_softmax(state_action_values, dim=1) # calculate the KL-divergence loss, multiply # it by weights and return two quantities: combined loss to be used in the # optimizer step and individual loss values for batch, which will be used as # priorities in the replay buffer proj_distr_v = torch.tensor(proj_distr) loss_v = -state_log_sm_v * proj_distr_v loss_v = batch_weights_v * loss_v.sum(dim=1) return loss_v.mean(), loss_v + 1e-5
    agent = ptan.agent.DQNAgent(lambda x: net.qvals(x),
                                ptan.actions.ArgmaxActionSelector(),
                                device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params['gamma'], steps_count=REWARD_STEPS)
    buffer = ptan.experience.PrioritizedReplayBuffer(
        exp_source, params['replay_size'], PRIO_REPLAY_ALPHA)
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])

    frame_idx = 0
    beta = BETA_START

    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            # The call to buffer.populate(1) starts the following chain of
            # actions:
            # ExperienceReplayBuffer asks the experience source for the next
            # transition.
            # The experience source feeds the current observation to the agent
            # to obtain the action.
            # The agent applies the NN to the observation to calculate
            # Q-values, then asks the action selector to choose the action to
            # take.
            # The action selector decides which action to take; here it is an
            # argmax selector, as exploration comes from the noisy layers
            # rather than from epsilon-greedy randomness.
            # The action is returned to the experience source, which feeds it
            # into the environment to obtain the reward and the next
            # observation. All
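The beta value initialized from BETA_START controls how strongly the importance-sampling weights compensate for the non-uniform sampling of the prioritized buffer; it is usually annealed towards 1.0 over the course of training. A sketch of a linear schedule (the starting value 0.4 and the BETA_FRAMES horizon are assumed hyperparameters):

BETA_START = 0.4
BETA_FRAMES = 100000  # assumed annealing horizon

def update_beta(frame_idx):
    # Linearly increase beta from BETA_START to 1.0 over BETA_FRAMES frames.
    return min(1.0, BETA_START + frame_idx * (1.0 - BETA_START) / BETA_FRAMES)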
help="Count of steps to unroll Bellman") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") env = gym.make(params['env_name']) env = ptan.common.wrappers.wrap_dqn(env) writer = SummaryWriter(comment="-" + params['run_name'] + "-%d-step" % args.n) net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) tgt_net = ptan.agent.TargetNet(net) selector = ptan.actions.EpsilonGreedyActionSelector( epsilon=params['epsilon_start']) epsilon_tracker = common.EpsilonTracker(selector, params) agent = ptan.agent.DQNAgent(net, selector, device=device) exp_source = ptan.experience.ExperienceSourceFirstLast( env, agent, gamma=params['gamma'], steps_count=args.n) buffer = ptan.experience.ExperienceReplayBuffer( exp_source, buffer_size=params['replay_size']) optimizer = optim.Adam(net.parameters(), lr=params['learning_rate']) frame_idx = 0 with common.RewardTracker(writer, params['stop_reward']) as reward_tracker: while True: frame_idx += 1 buffer.populate(1) epsilon_tracker.frame(frame_idx)