Example #1
def main(env, num_timesteps, experiment_config, experiment_name):

    q_func = DQNLRelu if experiment_config['adv_model'] else DQN

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=experiment_config['lr'],
                    alpha=experiment_config['alpha'],
                    eps=experiment_config['eps']),
    )

    exploration_schedule = LinearSchedule(1000000,
                                          experiment_config['min_eps'])

    dqn_learing(experiment_name=experiment_name,
                env=env,
                q_func=q_func,
                optimizer_spec=optimizer_spec,
                exploration=exploration_schedule,
                stopping_criterion=stopping_criterion,
                replay_buffer_size=experiment_config['replay_size'],
                batch_size=experiment_config['batch'],
                gamma=experiment_config['gamma'],
                learning_starts=experiment_config['learning_start'],
                learning_freq=experiment_config['learning_freq'],
                frame_history_len=experiment_config['frame_hist'],
                target_update_freq=experiment_config['target_update_freq'],
                output_path=experiment_config['output'])
Example #2
def main(env, num_timesteps, config):
    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        config=config,
        env=env,
        q_func=VIN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
Example #3
def q1_run(num_timesteps):
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed, expt_dir='tmp/gym-results2')

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learning(
        env=env,
        q_func=DQN,
        runname="normal_run",
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion2(num_timesteps),
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGET_UPDATE_FREQ
    )
Example #4
def main(env):
	global args
	args = parser.parse_args()

	optimizer_spec = OptimizerSpec(
		constructor=optim.RMSprop,
		kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
	)

	exploration_schedule = LinearSchedule(1000000, 0.1)

	dqn_learing(
		env=env,
		q_func=DQN,
		checkpoint_path=args.checkpoint,
		optimizer_spec=optimizer_spec,
		exploration=exploration_schedule,
		stopping_criterion=None,
		replay_buffer_size=REPLAY_BUFFER_SIZE,
		batch_size=BATCH_SIZE,
		gamma=GAMMA,
		learning_starts=LEARNING_STARTS,
		learning_freq=LEARNING_FREQ,
		frame_history_len=FRAME_HISTORY_LEN,
		target_update_freq=TARGET_UPDATE_FREQ,
	   )
Example #5
def q2_run(num_timesteps):
    schedulers = {"no_explore": ConstantSchedule(0.1),
                  "delayed_decay": PiecewiseSchedule([(0, 1.0), (0.25e6, 1.0), (1.25e6, 0.1)], outside_value=0.1),
                  "slower_decay": LinearSchedule(1500000, 0.1)}

    for name, exploration_schedule in schedulers.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(constructor=optim.RMSprop, kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS))

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
Example #6
def main(env, num_timesteps):
    def stopping_criterion(env):
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(2000000, 0.05)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )
Example #7
def main(env, num_timesteps, config):
    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_testing(
        config=config,
        env=env,
        q_func=VIN,
        exploration=exploration_schedule,
    )
Example #8
def bonus_run(num_timesteps):
    def make_range_black(arr: np.ndarray, start, end):
        arr[:, start:end, :] = 0

    frame_filters = {"no_left_side": lambda x: make_range_black(x, 0, x.shape[1] // 4),
                     "no_middle_side": lambda x: make_range_black(x, x.shape[1] // 4, x.shape[1] // 2), }

    for name, frame_filter in frame_filters.items():
        # Get Atari games.
        benchmark = gym.benchmark_spec('Atari40M')

        # Change the index to select a different game.
        task = benchmark.tasks[3]

        # Run training
        seed = 0  # Use a seed of zero (you may want to randomize the seed!)
        env = get_env(task, seed)
        env.reset()

        optimizer_spec = OptimizerSpec(constructor=optim.RMSprop, kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS))

        dqn_learning(
            env=env,
            q_func=DQN,
            runname=name,
            frame_filter=frame_filter,
            optimizer_spec=optimizer_spec,
            exploration=LinearSchedule(1000000, 0.1),
            stopping_criterion=stopping_criterion2(num_timesteps),
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=BATCH_SIZE,
            gamma=GAMMA,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ
        )
Example #9
def main(config, env):
    """
    Run DQN on Atari
    :param config:
    :param env:
    :return:
    """
    FLAGS = update_tf_wrapper_args(args, utils.gatedpixelcnn_bonus.FLAGS)

    def stopping_criterion(env, t):
        # t := number of steps of wrapped env
        # different from number of steps in underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= \
               config.max_timesteps

    # optimizer_spec = OptimizerSpec(
    #     constructor=torch.optim.Adam,
    #     kwargs=dict(lr=config.learning_rate, eps=config.epsilon),
    # )
    optimizer_spec = OptimizerSpec(constructor=torch.optim.RMSprop,
                                   kwargs=dict(lr=config.learning_rate,
                                               momentum=config.momentum,
                                               eps=config.epsilon))

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learn(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        density=PixelBonus,
        cnn_kwargs=FLAGS,
        config=config,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
    )
Example #10
def main(env):

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        num_actions1=num_actions1,
        num_actions2=num_actions2
    )
Example #11
num_timesteps = task.max_timesteps


def stopping_criterion(env):
    # notice that here t is the number of steps of the wrapped env,
    # which is different from the number of steps in the underlying env
    return get_wrapper_by_name(env,
                               "Monitor").get_total_steps() >= num_timesteps


optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
)

exploration_schedule = LinearSchedule(1000000, 0.1)

USE_CUDA = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor


class Variable(autograd.Variable):
    def __init__(self, data, *args, **kwargs):
        if USE_CUDA:
            data = data.cuda()
        super(Variable, self).__init__(data, *args, **kwargs)


OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])
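The OptimizerSpec namedtuple above is just a deferred constructor call: the optimizer is later built as constructor(params, **kwargs), and the exploration schedule is queried per step via .value(t) for the epsilon-greedy decision (see Example #13 and Example #17). A minimal sketch of that pattern follows; the placeholder linear model stands in for a real DQN, and the kwargs values mirror the LEARNING_RATE/ALPHA/EPS constants used in Example #15.

import random
from collections import namedtuple

import torch
import torch.nn as nn
import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

# Placeholder network standing in for a DQN.
q_network = nn.Linear(4, 2)

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=2.5e-4, alpha=0.95, eps=0.01),
)
# Deferred construction: the spec is only unpacked once training starts.
optimizer = optimizer_spec.constructor(q_network.parameters(),
                                       **optimizer_spec.kwargs)


def epsilon_greedy(model, obs, exploration, t, num_actions):
    # exploration is any schedule exposing .value(t), e.g. LinearSchedule.
    if random.random() > exploration.value(t):
        with torch.no_grad():
            return int(model(obs).argmax(dim=1).item())
    return random.randrange(num_actions)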
Example #12
def train_model(env,
                conv_layers,
                learning_rate=5e-4,
                total_timesteps=100000,
                buffer_size=50000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                train_freq=1,
                batch_size=32,
                print_freq=1,
                checkpoint_freq=100000,
                checkpoint_path=None,
                learning_starts=1000,
                gamma=1.0,
                target_network_update_freq=500,
                double_dqn=False,
                **network_kwargs) -> tf.keras.Model:
    """Train a DQN model.

    Parameters
    -------
    env: gym.Env
        openai gym
    conv_layers: list
        a list of triples that defines the conv network
    learning_rate: float
        learning rate for adam optimizer
    total_timesteps: int
        number of environment steps to run
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every train_freq steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to store a checkpoint during training
    checkpoint_path: str
        the fs path for storing the checkpoints
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    double_dqn: bool
        specifies if double q-learning is used during training
    Returns
    -------
    q_network: tf.keras.Model
        the trained Q-network
    """
    q_func = build_dueling_q_func(conv_layers, **network_kwargs)

    dqn = DeepQ(model_builder=q_func,
                observation_shape=env.observation_space.shape,
                num_actions=env.action_space.n,
                learning_rate=learning_rate,
                gamma=gamma,
                double_dqn=double_dqn)

    manager = None
    if checkpoint_path is not None:
        load_path = osp.expanduser(checkpoint_path)
        ckpt = tf.train.Checkpoint(model=dqn.q_network)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=5)
        ckpt.restore(manager.latest_checkpoint)
        print("Restoring from {}".format(manager.latest_checkpoint))

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Create the replay buffer
    replay_buffer = ReplayMemory(buffer_size)
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(total_timesteps=int(exploration_fraction *
                                                     total_timesteps),
                                 initial_prob=1.0,
                                 final_prob=exploration_final_eps)

    dqn.update_target()

    episode_rewards = [0.0]
    obs = env.reset()

    obs = np.expand_dims(np.array(obs), axis=0)

    for t in range(total_timesteps):
        update_eps = exploration.step_to(t)

        action, _, _, _ = dqn.step(tf.constant(obs), update_eps=update_eps)
        action = action[0].numpy()

        new_obs, reward, done, _ = env.step(action)
        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        replay_buffer.add(obs[0], action, reward, new_obs[0], float(done))
        obs = new_obs

        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                batch_size)
            weights, _ = tf.ones_like(rewards), None

            td_loss = dqn.train(obses_t, actions, rewards, obses_tp1, dones,
                                weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network every target_network_update_freq steps
            dqn.update_target()

        reward_100_mean = np.round(np.mean(episode_rewards[-101:-1]), 1)
        number_episodes = len(episode_rewards) - 1
        if done and print_freq is not None and number_episodes % print_freq == 0:
            format_str = "Steps: {}, Episodes: {}, 100 ep reward average: {}, Reward: {}, Epsilon-greedy %explore: {}"
            print(
                format_str.format(t, number_episodes, reward_100_mean,
                                  episode_rewards[-2],
                                  int(100 * exploration.value(t))))

            with train_summary_writer.as_default():
                tf.summary.scalar('loss',
                                  dqn.train_loss_metrics.result(),
                                  step=t)
                tf.summary.scalar('reward', episode_rewards[-2], step=t)

        if checkpoint_path is not None and t % checkpoint_freq == 0:
            manager.save()

        # Every training step, reset the loss metric
        dqn.train_loss_metrics.reset_states()

    return dqn.q_network
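A hedged usage sketch for train_model above. The environment id and the (filters, kernel_size, stride) reading of each conv_layers triple are illustrative assumptions; train_model is assumed to be in scope from the surrounding module, and the env is assumed to already emit image observations the convnet can consume.

import gym


def run_training_example():
    # Hypothetical driver; every concrete value here is an assumption.
    env = gym.make("PongNoFrameskip-v4")
    q_network = train_model(
        env,
        conv_layers=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],  # assumed (filters, kernel, stride)
        learning_rate=5e-4,
        total_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        gamma=0.99,
        target_network_update_freq=500,
        double_dqn=True,
    )
    q_network.save_weights("dqn_pong_weights")  # q_network is a tf.keras.Model
    env.close()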
Example #13
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration=LinearSchedule(1000000, 0.1),
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000,
                grad_norm_clipping=10):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    # optimizer_func = construct_optimizer_func(Q, optimizer_spec)
    optimizer = torch.optim.Adam(Q.parameters())

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can be used to store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # recent_observations: shape (img_h, img_w, frame_history_len), the input to the model
        recent_observations = replay_buffer.encode_recent_observation(
        ).transpose(2, 0, 1)

        # Choose a random action if learning has not started yet
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0,
                                                                             0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch.transpose(0, 3, 1, 2)).type(dtype) /
                255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch.transpose(
                    0, 3, 1, 2)).type(dtype) / 255.0)
            done_mask = torch.from_numpy(done_mask)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()
                done_mask = done_mask.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            # Compute next Q value, based on which action gives max Q values
            next_max_Q_values = Variable(torch.zeros(batch_size).type(dtype))
            # Detach from the current graph since we don't want gradients to propagate through the target
            next_max_Q_values[done_mask == 0] = target_Q(
                next_obs_batch).detach().max(1)[0]
            # Compute Bellman error, use huber loss to mitigate outlier impact
            target_Q_values = rew_batch + (gamma * next_max_Q_values)
            bellman_error = F.smooth_l1_loss(current_Q_values, target_Q_values)

            # Clear previous gradients (the optimizer was constructed above)
            optimizer.zero_grad()

            # run backward pass and clip the gradient
            bellman_error.backward()
            nn.utils.clip_grad_norm(Q.parameters(), grad_norm_clipping)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            print("learning_rate %f" % optimizer_spec.lr_schedule.value(t))
            sys.stdout.flush()
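The docstring above only requires that q_func accept input_channel and num_actions. For reference, a minimal sketch of a network satisfying that interface; the Nature-DQN layer sizes and the 84x84, frame-stacked input are illustrative assumptions rather than something fixed by dqn_learing itself.

import torch.nn as nn
import torch.nn.functional as F


class ExampleDQN(nn.Module):
    """Hypothetical q_func: accepts (input_channel, num_actions) as documented."""

    def __init__(self, input_channel, num_actions):
        super(ExampleDQN, self).__init__()
        self.conv1 = nn.Conv2d(input_channel, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc4 = nn.Linear(7 * 7 * 64, 512)  # assumes 84x84 input frames
        self.fc5 = nn.Linear(512, num_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc4(x.view(x.size(0), -1)))
        return self.fc5(x)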
Example #14
env.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

env = wrap_deepmind(env)
env = JoypadSpace(env, COMPLEX_MOVEMENT)
expt_dir = 'Game_play3'
env = wrappers.Monitor(env, expt_dir, force=True, video_callable=False)

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
)

exploration_schedule = LinearSchedule(2000000, 0.05, 0.05)
annelation_schedule = LinearSchedule(2000000, 1.0, 0.4)

# recollect_experience(env2,DQN)
dqfd_learn(
    env=env,
    q_func=DQN,
    optimizer_spec=optimizer_spec,
    exploration=exploration_schedule,
    replay_buffer_size=REPLAY_BUFFER_SIZE,
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    learning_starts=LEARNING_STARTS,
    learning_freq=LEARNING_FREQ,
    alpha=ALPHA_P,
    annelation=annelation_schedule,
Example #15
    plt.style.use('ggplot')

    NUM_EPISODES = 12000
    BATCH_SIZE = 128
    GAMMA = 1.0
    REPLAY_MEMORY_SIZE = 1000000
    LEARNING_RATE = 0.00025
    ALPHA = 0.95
    EPS = 0.01

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(50000, 0.1, 1)

    agent = hDQN(
        optimizer_spec=optimizer_spec,
        replay_memory_size=REPLAY_MEMORY_SIZE,
        batch_size=BATCH_SIZE,
    )

    env = StochasticMDPEnv()

    agent, stats, visits = hdqn_learning(
        env=env,
        agent=agent,
        num_episodes=NUM_EPISODES,
        exploration_schedule=exploration_schedule,
        gamma=GAMMA,
Example #16
def atari_learn(env, args, num_timesteps):
    logdir = os.path.join('data', args.exp_name)

    num_iterations = float(num_timesteps) / 4.0

    # lr_multiplier = 1.0
    # lr_schedule = PiecewiseSchedule([
    #     (0, 1e-4 * lr_multiplier),
    #     (num_iterations / 10, 1e-4 * lr_multiplier),
    #     (num_iterations / 2, 5e-5 * lr_multiplier),
    # ],
    #     outside_value=5e-5 * lr_multiplier)
    # optimizer = dqn.OptimizerSpec(
    #     constructor=tf.train.AdamOptimizer,
    #     kwargs=dict(epsilon=1e-4),
    #     lr_schedule=lr_schedule
    # )

    def stopping_criterion(env):
        return get_wrapper_by_name(
            env, "Monitor").get_total_steps() >= num_timesteps

    # optimizer_spec = OptimizerSpec(
    #     constructor=optim.RMSprop,
    #     kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    # )

    optimizer_spec = OptimizerSpec(
        constructor=optim.Adam,
        kwargs=dict(lr=LEARNING_RATE),
    )

    exploration_schedule = LinearSchedule(30000, 0.01)

    # exploration_schedule = PiecewiseSchedule(
    #     [
    #         (0, 1.0),
    #         (1e6, 0.1),
    #         (num_iterations / 2, 0.01),
    #     ], outside_value=0.01
    # )

    logz.configure_output_dir(logdir)

    if args.dueling:
        dqn_learning(
            env=env,
            method=args.method,
            game=args.env,
            q_func=Dueling_DQN,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion,
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=args.batch_size,
            gamma=args.gamma,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ,
            double=args.double,
            dueling=args.dueling,
            logdir=logdir,
            svrl=args.svrl,
            me_type=args.me_type,
            maskp=args.maskp,
            maskstep=args.maskstep,
            maskscheduler=args.maskscheduler,
        )
    else:
        dqn_learning(
            env=env,
            method=args.method,
            game=args.env,
            q_func=DQN,
            optimizer_spec=optimizer_spec,
            exploration=exploration_schedule,
            stopping_criterion=stopping_criterion,
            replay_buffer_size=REPLAY_BUFFER_SIZE,
            batch_size=args.batch_size,
            gamma=args.gamma,
            learning_starts=LEARNING_STARTS,
            learning_freq=LEARNING_FREQ,
            frame_history_len=FRAME_HISTORY_LEN,
            target_update_freq=TARGET_UPDATE_FREQ,
            double=args.double,
            dueling=args.dueling,
            logdir=logdir,
            svrl=args.svrl,
            me_type=args.me_type,
            maskp=args.maskp,
            maskstep=args.maskstep,
            maskscheduler=args.maskscheduler,
        )

    env.close()
Example #17
def dqn_learn(env,
              exploration=LinearSchedule(EPOCHS // 2, 0.1),
              optimizer_spec=optimizer):
    # Initialization
    Q = DQN(STATE_VEC_DIM, DQN_HIDDEN_DIM1, DQN_HIDDEN_DIM2, NUM_ACTIONS)
    Q_target = DQN(STATE_VEC_DIM, DQN_HIDDEN_DIM1, DQN_HIDDEN_DIM2,
                   NUM_ACTIONS)
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)
    replay_buffer = deque()

    loss_func = torch.nn.MSELoss()

    num_param_updates = 0

    for epoch_id in range(EPOCHS):
        print("\n ###### Epoch: %s/%s ######" % (epoch_id, EPOCHS))
        start = time.time()
        total_loss = 0
        # Reset the environment
        obs = env.reset()  # obs is a list
        while True:
            # Select an action
            sample = random.random()
            threshold = exploration.value(epoch_id)
            if sample > threshold:
                observation = torch.tensor(obs).unsqueeze(0).type(
                    DTYPE)  # observation is a tensor
                value = Q(observation).cpu().data.numpy()
                action = value.argmax(-1)[0]
            else:
                action = np.random.randint(NUM_ACTIONS)
            # Execute the action
            reward, new_obs, done, _ = env.step(action)

            replay_buffer.append((obs, action, reward, new_obs, done))
            if len(replay_buffer) > REPLAY_SIZE:
                replay_buffer.popleft()

            obs = new_obs

            if len(replay_buffer) > BATCH_SIZE:
                # print("执行经验回放")

                # 首先准备输入数据
                minibatch = random.sample(replay_buffer, BATCH_SIZE)
                state_batch = [data[0] for data in minibatch]
                action_batch = [data[1] for data in minibatch]
                reward_batch = [data[2] for data in minibatch]
                next_state_batch = [data[3] for data in minibatch]
                done_batch = [data[4] for data in minibatch]
                # 第一维是 batch_size
                state_tensor = Variable(torch.tensor(state_batch).type(DTYPE))
                action_tensor = Variable(
                    torch.tensor(action_batch).type(DLONGTYPE))
                reward_tensor = Variable(
                    torch.tensor(reward_batch).type(DTYPE))
                next_state_tensor = Variable(
                    torch.tensor(next_state_batch).type(DTYPE))
                done_tensor = Variable(torch.tensor(done_batch).type(DTYPE))

                # Q-value estimates from the Q network
                q_values = Q(state_tensor)
                # action_tensor has shape [32]
                # action_tensor.unsqueeze(1) has shape [32, 1]
                # q_values has shape [32, 19]
                # gather picks out the Q value of the chosen action
                q_s_a = q_values.gather(1, action_tensor.unsqueeze(1))
                # q_s_a becomes shape [32]
                q_s_a = q_s_a.squeeze()

                # Target values
                # .max(1) takes the max over dim 1, returning (values, indices),
                # so [0] selects the max action values
                # Q_target(next_state_tensor).max(1)[0]: batch_size
                target_v = reward_tensor + GAMMA * (
                    1 - done_tensor
                ) * Q_target(next_state_tensor).detach().max(1)[0]

                loss = loss_func(q_s_a, target_v)

                total_loss += loss.item()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                num_param_updates += 1
                if num_param_updates % REPLACE_TARGET_FREQ == 0:
                    Q_target.load_state_dict(Q.state_dict())

            if done:
                break

        end = time.time()
        print("Epoch %s  Time: %.2f s  Total Loss: %.2f" %
              (epoch_id, end - start, total_loss))
        # Evaluate every 5 epochs
        if epoch_id % 5 == 0:
            print("--------------------------------------------------------")
            print("-------------- Entering evaluation phase")
            print("--------------------------------------------------------")

            obs = env.reset(False)

            while True:
                observation = torch.tensor(obs).unsqueeze(0).type(DTYPE)
                value = Q(observation).cpu().data.numpy()
                action = value.argmax(-1)[0]
                reward, new_obs, done, info = env.step(action, False)
                obs = new_obs
                if done:
                    gold_results = info[0]
                    pred_results = info[1]
                    break
            acc, p, r, f = get_ner_fmeasure(gold_results, pred_results)
            print("acc: %.4f, p: %.4f, r: %.4f, f: %.4f; \n" % (acc, p, r, f))
Example #18
File: main.py  Project: LQNew/LWDRLD
    action_dim = env.action_space.n

    kwargs = {
        "action_dim": action_dim,
        "discount": args.discount,
        "gradient_clip": args.gradient_clip,
    }

    # Initialize policy
    # ----------------------------------------------
    if args.policy == "DQN":
        kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
        kwargs["learning_rate"] = 1e-4
        policy = DQN.DQN(**kwargs)
        eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
        args.batch_size = 64
    elif args.policy == "Double_DQN":
        kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
        kwargs["learning_rate"] = 1e-4
        policy = Double_DQN.DoubleDQN(**kwargs)
        eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
        args.batch_size = 64
    # ----------------------------------------------
    elif args.policy == "Dueling_DQN":
        kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
        kwargs["learning_rate"] = 1e-4
        policy = Dueling_DQN.DuelingDQN(**kwargs)
        eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
    elif args.policy == "Dueling_Double_DQN":
        kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
Example #19
def main(env, num_timesteps):
    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = random.randint(0, 100)  # Randomize the seed for each run
    env = get_env(task, seed)

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    # empty dict to hold all results
    Stats = {}

    new_lr = 0.001
    new_gamma = 0.999
    exploration_sches = [LinearSchedule(1000000, 0.1), ConstantSchedule(0.05),
                         ConstantSchedule(0.15), LinearSchedule(500000, 0.05)]

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=new_lr, alpha=ALPHA, eps=EPS),
    )

    env = get_env(task, seed)
    Stats["lr=0.001, gamma=0.999"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=new_gamma,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested="lr=0.001, gamma=0.999"
    )

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    env = get_env(task, seed)
    Stats["Default"] = dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        feature_tested=""
    )

    plt.clf()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Reward (past 100 episodes)')
    num_items = len(Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["lr=0.001, gamma=0.999"]["mean_episode_rewards"], label="lr=0.001, gamma=0.999")
    num_items = len(Stats["Default"]["mean_episode_rewards"])
    plt.plot(range(num_items), Stats["Default"]["mean_episode_rewards"], label="Default")
    plt.legend()
    plt.title("Performance")
    plt.savefig('Final-Performance.png')
Example #20
def q_learning(env, num_episodes, discount_factor=1.0, lr=0.00025, exploration_schedule=LinearSchedule(50000, 0.1, 1.0)):
    """
    Q-Learning algorithm: Off-policy TD control. Finds the optimal greedy policy
    while following an epsilon-greedy policy
    Args:
        env: OpenAI environment.
        num_episodes: Number of episodes to run (must be divisible by 1000), e.g. 12000.
        discount_factor: Discount factor (gamma).
        lr: TD learning rate.
        exploration_schedule: Schedule (defined in utils.schedule)
            schedule for probability of choosing a random action.
    Returns:
        A tuple (Q, stats, visits).
        Q is the optimal action-value function, a dictionary mapping state -> action values.
        stats is an EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
        visits is a 2D array indicating how many times each state was visited in each block of 1000 episodes.
    """

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.nA))

    # Keep track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))
    n_thousand_episode = int(np.floor(num_episodes / 1000))

    visits = np.zeros((n_thousand_episode, env.nS))

    total_timestep = 0

    for i_thousand_episode in range(n_thousand_episode):
        for i_episode in range(1000):
            current_state = env.reset()

            visits[i_thousand_episode][current_state-1] += 1
            # Keep track of the number of time-steps per episode (only for plotting)
            for t in itertools.count():
                total_timestep += 1
                # Get the annealed exploration rate (epsilon) from exploration_schedule
                epsilon = exploration_schedule.value(total_timestep)
                # Improve the epsilon-greedy policy using the latest updated Q
                policy = make_epsilon_greedy_policy(Q, epsilon, env.nA)

                # Choose the action based on epsilon greedy policy
                action_probs = policy(current_state)
                action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
                next_state, reward, done, _ = env.step(action)

                visits[i_thousand_episode][next_state-1] += 1

                # Use the greedy action to evaluate Q, not the one we actually follow
                greedy_next_action = Q[next_state].argmax()
                # Evaluate Q using estimated action value of (next_state, greedy_next_action)
                td_target = reward + discount_factor * Q[next_state][greedy_next_action]
                td_error = td_target - Q[current_state][action]
                Q[current_state][action] += lr * td_error

                # Update statistics
                stats.episode_rewards[i_thousand_episode*1000 + i_episode] += reward
                stats.episode_lengths[i_thousand_episode*1000 + i_episode] = t

                if done:
                    break
                else:
                    current_state = next_state

    return Q, stats, visits
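A hedged usage sketch for q_learning above; env, LinearSchedule, and q_learning are assumed to be in scope, with env being a tabular gym-style environment exposing env.nS and env.nA (something like the StochasticMDPEnv of Example #15).

# Illustrative call only; all names below follow the docstring above.
num_episodes = 12000  # must be divisible by 1000, per the docstring

Q, stats, visits = q_learning(
    env,
    num_episodes=num_episodes,
    discount_factor=1.0,
    lr=0.00025,
    exploration_schedule=LinearSchedule(50000, 0.1, 1.0),
)

# Greedy policy extracted from the learned action-value table.
greedy_policy = {state: int(values.argmax()) for state, values in Q.items()}
print("visits per 1000-episode block:", visits.sum(axis=1))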
Example #21
def dqn_learn(env,
              q_func,
              optimizer_spec,
              density,
              cnn_kwargs,
              config,
              exploration=LinearSchedule(1000000, 0.1),
              stopping_criterion=None):
    """
    Run Deep Q-learning algorithm.
    """
    # this is just to make sure that you're operating in the correct environment
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############
    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_shape = env.observation_space.shape
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_shape = (img_h, img_w, config.frame_history_len * img_c)
    num_actions = env.action_space.n

    # define Q network and target network (instantiate 2 DQN's)
    in_channel = input_shape[-1]
    Q = q_func(in_channel, num_actions)
    target_Q = deepcopy(Q)

    # define C network and target C
    C = q_func(in_channel, num_actions)
    target_C = deepcopy(C)

    # call tensorflow wrapper to get density model
    if config.bonus:
        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   log_device_placement=False)
        tf_config.gpu_options.allow_growth = True
        sess = tf.Session(config=tf_config)
        pixel_bonus = density(cnn_kwargs, sess, num_actions)
        tf.initialize_all_variables().run(session=sess)

    if USE_CUDA:
        Q.cuda()
        target_Q.cuda()
        C.cuda()
        target_C.cuda()

    # define eps-greedy exploration strategy
    def select_action(model, bonus_model, obs, t):
        """
        Selects a random action with probability eps; otherwise returns the best action
        :param exploration:
        :param t:
        :return:
        """
        def get_best_action(obs):
            obs = torch.from_numpy(obs).type(FloatTensor).unsqueeze(0) / 255.0
            Q_val = model(Variable(obs, volatile=True))
            C_val = bonus_model(Variable(obs, volatile=True))
            b = C_val
            if config.gaussian_ts:
                # sample Gaussian noise scaled by the count model's output
                b = config.alpha * torch.distributions.normal.Normal(
                    0., C_val).sample()
            return (Q_val + b).data.max(1)[1].view(1, 1)

        if config.egreedy_exploration:
            sample = random.random()
            eps_threshold = exploration.value(t)
            if sample > eps_threshold:
                return get_best_action(obs)
            else:
                # return random action
                return LongTensor([[random.randrange(num_actions)]])
        # no exploration; just take best action
        else:
            return get_best_action(obs)

    # construct torch optimizer
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # C optimizer
    C_optimizer = optimizer_spec.constructor(C.parameters(),
                                             **optimizer_spec.kwargs)

    # construct the replay buffer
    if config.mmc:
        replay_buffer = MMCReplayBuffer(config.replay_buffer_size,
                                        config.frame_history_len)
    else:
        replay_buffer = ReplayBuffer(config.replay_buffer_size,
                                     config.frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    prev = time.time()

    # index trackers for updating mc returns
    episode_indices_in_buffer = []
    reward_each_timestep = []
    timesteps_in_buffer = []
    cur_timestep = 0

    # t denotes frames
    for t in itertools.count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        ### 2. Step the env and store the transition
        # process last_obs to include context from previous frame
        last_idx = replay_buffer.store_frame(last_obs)

        # record where this is in the buffer
        episode_indices_in_buffer.append(last_idx)
        timesteps_in_buffer.append(cur_timestep)
        # one more step in episode
        cur_timestep += 1

        # take latest observation pushed into buffer and compute corresponding input
        # that should be given to a Q network by appending some previous frames
        recent_obs = replay_buffer.encode_recent_observation()
        # recent_obs.shape is also (84, 84, 4)

        # choose random action if not yet started learning
        if t > config.learning_starts:
            action = select_action(Q, C, recent_obs, t)[0][0]
        else:
            action = random.randrange(num_actions)

        # advance one step
        obs, reward, done, _ = env.step(action)
        # clip reward to be in [-1, +1]
        reward = max(-1.0, min(reward, 1.0))

        ###############################################
        # do density model stuff here
        if config.bonus:  # just assume this is true
            intrinsic_reward = pixel_bonus.bonus(obs, action, t, num_actions)
            if t % config.log_freq == 0:
                logging.info('t: {}\t intrinsic reward: {}'.format(
                    t, intrinsic_reward))
                curr = time.time()
                diff = curr - prev
                prev = curr
                logging.info("Timestep %d" % (t, ))
                logging.info("Time elapsed %f" % diff)
                # utils.save_image(pixel_bonus.sample_images(img_dim**2), 'images/iteration_{}.png'.format(t), nrow=img_dim, padding=0)
                # pixel_bonus.sample_images(3, t)
                # utils.save_image(frame / 8.,'images/obs_{}.png'.format(t),padding=0)
            bonus = intrinsic_reward
            # TODO: add bonus/intrinsic_reward to replay buffer
            pixel_bonus.writer.add_scalar('data/bonus', bonus, t)
            # add intrinsic reward to clipped reward
            # NOTE: don't add bonus since we separate Q and C
            reward += intrinsic_reward
            # clip reward to be in [-1, +1] once again
            reward = max(-1.0, min(reward, 1.0))
            assert -1.0 <= reward <= 1.0
        ################################################

        # store reward in list to use for calculating MMC update
        reward_each_timestep.append(reward)
        replay_buffer.store_effect(last_idx, action, reward, done, bonus)

        # reset environment when reaching episode boundary
        if done:
            # only if computing MC return
            if config.mmc:
                # episode has terminated --> need to do MMC update here
                # loop through all transitions of this past episode and add in mc_returns
                print(len(timesteps_in_buffer), len(reward_each_timestep))
                assert len(timesteps_in_buffer) == len(reward_each_timestep)
                mc_returns = np.zeros(len(timesteps_in_buffer))

                # compute mc returns
                r = 0
                for i in reversed(range(len(mc_returns))):
                    r = reward_each_timestep[i] + config.gamma * r
                    mc_returns[i] = r

                # populate replay buffer
                for j in range(len(mc_returns)):
                    # get transition tuple in reward buffer and update
                    update_idx = episode_indices_in_buffer[j]
                    # put mmc return back into replay buffer
                    replay_buffer.mc_return_t[update_idx] = mc_returns[j]
            # reset because end of episode
            episode_indices_in_buffer = []
            timesteps_in_buffer = []
            cur_timestep = 0
            reward_each_timestep = []

            # reset
            obs = env.reset()
        last_obs = obs

        ### 3. Perform experience replay and train the network.
        # note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken

        # perform training
        if (t > config.learning_starts and t % config.learning_freq == 0
                and replay_buffer.can_sample(config.batch_size)):

            # sample batch of transitions
            if config.mmc:
                # also grab MMC batch if computing MMC return
                obs_batch, act_batch, rew_batch, next_obs_batch, bonus_batch, done_mask, mc_batch = \
                replay_buffer.sample(config.batch_size)
                mc_batch = Variable(
                    torch.from_numpy(mc_batch).type(FloatTensor))
            else:
                obs_batch, act_batch, rew_batch, next_obs_batch, bonus_batch, done_mask = \
                replay_buffer.sample(config.batch_size)

            # convert variables to torch tensor variables
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(FloatTensor) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).type(LongTensor))
            rew_batch = Variable(torch.from_numpy(rew_batch).type(FloatTensor))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(FloatTensor) / 255.0)
            bonus_batch = Variable(
                torch.from_numpy(bonus_batch).type(FloatTensor))
            not_done_mask = Variable(
                torch.from_numpy(1 - done_mask).type(FloatTensor))

            # 3.c: train the model: perform gradient step and update the network
            current_Q_values = Q(obs_batch).gather(
                1, act_batch.unsqueeze(1)).squeeze()
            # this gives you a FloatTensor of size 32 // gives values of max
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]

            # torch.FloatTensor of size 32
            next_Q_values = not_done_mask * next_max_q

            # this is [r(x,a) + gamma * max_a' Q(x', a')]
            target_Q_values = rew_batch + (config.gamma * next_Q_values)

            if config.mmc:
                # replace target_Q_values with mixed target
                target_Q_values = ((1 - config.beta) *
                                   target_Q_values) + (config.beta * mc_batch)
            # use huber loss
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)

            # zero out gradient
            optimizer.zero_grad()

            # backward pass
            loss.backward()

            # gradient clipping
            for params in Q.parameters():
                params.grad.data.clamp_(-1, 1)

            # perform param update
            optimizer.step()
            num_param_updates += 1

            # periodically update the target network
            if num_param_updates % config.target_update_freq == 0:
                target_Q = deepcopy(Q)

            ######### REPEAT ABOVE FOR C NETWORK ##################
            current_C_values = C(obs_batch).gather(
                1, act_batch.unsqueeze(1)).squeeze()
            # this gives you a FloatTensor of size 32 // gives values of max
            next_max_c = target_C(next_obs_batch).detach().max(1)[0]

            # torch.FloatTensor of size 32
            next_C_values = not_done_mask * next_max_c

            # this is [r(x,a) + gamma * max_a' Q(x', a')]
            target_C_values = bonus_batch + (config.gamma * next_C_values)

            #if config.mmc:
            # replace target_Q_values with mixed target
            #    target_C_values = ((1-config.beta) * target_C_values) + (config.beta *
            #                                                             mc_batch)
            # use huber loss
            C_loss = F.smooth_l1_loss(current_C_values, target_C_values)

            # zero out gradient
            C_optimizer.zero_grad()

            # backward pass
            C_loss.backward()

            # gradient clipping
            for params in C.parameters():
                params.grad.data.clamp_(-1, 1)

            # perform param update
            C_optimizer.step()
            num_param_updates += 1

            # periodically update the target network
            if num_param_updates % config.target_update_freq == 0:
                target_C = deepcopy(C)

            ### 4. Log progress
            episode_rewards = get_wrapper_by_name(
                env, "Monitor").get_episode_rewards()
            if len(episode_rewards) > 0:
                mean_episode_reward = np.mean(episode_rewards[-100:])
            if len(episode_rewards) > 100:
                best_mean_episode_reward = max(best_mean_episode_reward,
                                               mean_episode_reward)

            # Tensorboard logging
            pixel_bonus.writer.add_scalar('data/bonus', intrinsic_reward, t)
            pixel_bonus.writer.add_scalar('data/Q_loss', loss, t)
            #pixel_bonus.writer.add_scalar('data/C_loss', C_loss, t)
            pixel_bonus.writer.add_scalar('data/episode_reward',
                                          episode_rewards[-1], t)

            # save statistics
            Statistic["mean_episode_rewards"].append(mean_episode_reward)
            Statistic["best_mean_episode_rewards"].append(
                best_mean_episode_reward)
            Statistic["episode_rewards"].append(episode_rewards)

            if t % config.log_freq == 0 and t > config.learning_starts:
                # curr = time.time()
                # diff = curr - prev
                # prev = curr
                # logging.info("Timestep %d" % (t,))
                # logging.info("Time elapsed %f" % diff)
                logging.info("mean reward (100 episodes) %f" %
                             mean_episode_reward)
                logging.info("best mean reward %f" % best_mean_episode_reward)
                logging.info("episodes %d" % len(episode_rewards))
                logging.info("exploration %f" % exploration.value(t))
                sys.stdout.flush()
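The MMC branch above accumulates discounted Monte-Carlo returns backwards over the finished episode (r = reward_each_timestep[i] + config.gamma * r). A small standalone check of that recursion, with made-up rewards and gamma:

import numpy as np


def discounted_mc_returns(rewards, gamma):
    # Backward accumulation, mirroring the loop over reversed(range(...)) above.
    returns = np.zeros(len(rewards))
    r = 0.0
    for i in reversed(range(len(rewards))):
        r = rewards[i] + gamma * r
        returns[i] = r
    return returns


print(discounted_mc_returns([0.0, 0.0, 1.0], gamma=0.99))
# expected: [0.9801, 0.99, 1.0]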
Example #22

if __name__ == '__main__':
    # make env
    args = xworld_args.parser().parse_args()
    args.visible_radius_unit_side = config.visible_radius_unit_side
    args.visible_radius_unit_front = config.visible_radius_unit_front
    args.ego_centric = config.ego_centric
    args.map_config = config.map_config_file
    env = xworld_navi_goal.XWorldNaviGoal(args)
    env.teacher.israndom_goal = False
    env.teacher.goal_id = 0

    # exploration strategy
    exp_schedule = LinearExploration(env, config.eps_begin, 
            config.eps_end, config.eps_nsteps)

    # learning rate schedule
    lr_schedule  = LinearSchedule(config.lr_begin, config.lr_end,
            config.lr_nsteps)

    # train model
    model = DRQN(env, config)
    shutil.copyfile('./configs/drqn_xworld.py', config.output_path+'config.py')
    shutil.copy(os.path.realpath(__file__), config.output_path)
    shutil.copy(config.map_config_file, config.output_path)
    if config.deploy_only:
        model.deploy()
    else:
        model.run(exp_schedule, lr_schedule)
Example #23
    action_dim = env.action_space.n

    kwargs = {
        "action_dim": action_dim,
        "discount": args.discount,
        "gradient_clip": args.gradient_clip,
    }

    # Initialize policy
    # ----------------------------------------------
    if args.policy == "DQN_per":
        kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
        kwargs["learning_rate"] = 1e-4
        policy = DQN_per.DQN_PER(**kwargs)
        eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
        beta_schedule = LinearSchedule(args.beta0_per, 1.0,
                                       args.max_timesteps -
                                       args.start_timesteps)  # annealing beta
        args.batch_size = 64
    elif args.policy == "Double_DQN_per":
        kwargs["policy_freq"] = int(args.policy_freq) // int(args.num_envs)
        kwargs["learning_rate"] = 1e-4
        policy = Double_DQN_per.DoubleDQN_PER(**kwargs)
        eps_schedule = LinearSchedule(1.0, 0.01, 1e6)  # annealing epsilon
        beta_schedule = LinearSchedule(args.beta0_per, 1.0,
                                       args.max_timesteps -
                                       args.start_timesteps)  # annealing beta
        args.batch_size = 64
    # ----------------------------------------------
    elif args.policy == "Dueling_DQN_per":