Example #1
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
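        # A rough sketch of how LinearSchedule anneals (assuming the standard baselines
        # implementation): value(t) = initial_p + min(t / schedule_timesteps, 1.0) *
        # (final_p - initial_p), so exploration.value(0) == 1.0,
        # exploration.value(5000) == 0.51, and exploration.value(t) == 0.02 for any
        # t >= 10000.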

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for t in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
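            # The task counts as solved once the mean return over the last 100
            # completed episodes reaches 200.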
            if is_solved:
                # Show off the result
                env.render()
            else:
Example #2
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          stage1_total_timesteps=None,
          stage2_total_timesteps=None,
          buffer_size=50000,
          exploration_fraction=0.3,
          initial_exploration_p=1.0,
          exploration_final_eps=0.0,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1,
          gamma=1.0,
          target_network_update_freq=100,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          double_q=True,
          obs_dim=None,
          qmdp_expert=None,
          stage1_td_error_threshold=1e-3,
          pretrain_experience=None,
          flatten_belief=False,
          num_experts=None,
          **network_kwargs):
    """Train a bootstrap-dqn model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, it defaults to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.
    load_path: str
        path to load the model from. (default: None)
    qmdp_expert: callable
        takes (obs, bel) and returns QMDP Q-values
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = get_session()
    set_global_seeds(seed)

    nenvs = env.num_envs
    print("{} envs".format(nenvs))

    assert pretrain_experience is not None and qmdp_expert is not None and num_experts is not None

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    # import IPython; IPython.embed()
    #assert isinstance(env.envs[0].env.env.env, ExplicitBayesEnv)
    #belief_space = env.envs[0].env.env.env.belief_space
    #observation_space = env.envs[0].env.env.env.internal_observation_space

    obs_space = env.observation_space

    assert obs_dim is not None

    observation_space = Box(obs_space.low[:obs_dim],
                            obs_space.high[:obs_dim],
                            dtype=np.float32)
    #belief_space = Box(obs_space.low[obs_dim:], obs_space.high[obs_dim:], dtype=np.float32)
    observed_belief_space = Box(obs_space.low[obs_dim:],
                                obs_space.high[obs_dim:],
                                dtype=np.float32)
    belief_space = Box(np.zeros(num_experts),
                       np.ones(num_experts),
                       dtype=np.float32)  # rocksample

    num_experts = belief_space.high.size

    # print("Num experts", num_experts)

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    def make_bel_ph(name):
        return ObservationInput(belief_space, name=name)

    q_func = build_q_func(network, num_experts, **network_kwargs)

    print('=============== got qfunc ============== ')

    if stage1_total_timesteps is None and stage2_total_timesteps is None:
        stage1_total_timesteps = total_timesteps // 2
        stage2_total_timesteps = total_timesteps // 2

    total_timesteps = stage1_total_timesteps + stage2_total_timesteps

    act, train, update_target, debug = rbqnfe_staged.build_train(
        make_obs_ph=make_obs_ph,
        make_bel_ph=make_bel_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise,
        double_q=double_q)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
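    # Roughly, with prioritized replay, sample() draws transition i with probability
    # proportional to priority_i ** alpha and returns importance-sampling weights of
    # the form (N * P(i)) ** (-beta); beta_schedule anneals beta from
    # prioritized_replay_beta0 to 1.0 so the bias correction tightens over training.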
    # Create the schedule for exploration starting from initial_exploration_p.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=initial_exploration_p,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_reward = np.zeros(nenvs, dtype=np.float32)
    saved_mean_reward = None
    reset = True
    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_episodes = 0
    episode_rewards_history = deque(maxlen=1000)
    episode_step = np.zeros(nenvs, dtype=int)
    episodes = 0  #scalar

    # Load model
    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        print("Model will be saved at ", model_file)
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))
            print('Loaded model from {}'.format(load_path))

    t = 0

    accumulated_td_errors = deque(maxlen=100)

    # copy all pre-experiences
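    # Each stored observation is the concatenation [env observation | observed belief],
    # so it is split at -observed_belief_space.shape[0] in the loop below.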
    for expert, experience in enumerate(pretrain_experience):
        obs, val, action, rew, new_obs, done = experience
        obs, bel = (obs[:, :-observed_belief_space.shape[0]],
                    obs[:, -observed_belief_space.shape[0]:])
        if flatten_belief:
            bel = qmdp_expert.flatten_to_belief(bel,
                                                approximate=True).transpose()
        new_obs, new_bel = (new_obs[:, :-observed_belief_space.shape[0]],
                            new_obs[:, -observed_belief_space.shape[0]:])
        if flatten_belief:
            new_bel = qmdp_expert.flatten_to_belief(
                new_bel, approximate=True).transpose()  # rocksample specific
        new_expert_qval = qmdp_expert(new_obs, new_bel)
        expert_qval = qmdp_expert(obs, bel)
        obs = obs.astype(np.float32)
        bel = bel.astype(np.float32)
        expert_qval = expert_qval.astype(np.float32)
        action = action.astype(np.float32)
        rew = rew.astype(np.float32).ravel()
        new_obs = new_obs.astype(np.float32)
        new_bel = new_bel.astype(np.float32)
        new_expert_qval = new_expert_qval.astype(np.float32)
        replay_buffer.add(obs, bel, expert_qval, action, rew, new_obs, new_bel,
                          new_expert_qval, done)
        print("Added {} samples to ReplayBuffer".format(
            len(replay_buffer._storage)))

    # Stage 1: Train Residual without exploration, just with batches from replay buffer
    while t < stage1_total_timesteps:
        if callback is not None:
            if callback(locals(), globals()):
                break

        kwargs = {}
        update_param_noise_threshold = 0.

        obs = env.reset()
        episode_reward = np.zeros(nenvs, dtype=np.float32)
        episode_step[:] = 0
        obs, bel = (obs[:, :-observed_belief_space.shape[0]],
                    obs[:, -observed_belief_space.shape[0]:])

        expert_qval = qmdp_expert(obs, bel)

        t += 1

        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if prioritized_replay:
            experience = replay_buffer.sample(batch_size,
                                              beta=beta_schedule.value(t))
            obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones, weights, batch_idxes = experience
        else:
            experience = replay_buffer.sample(batch_size)

            obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones = experience
            weights, batch_idxes = np.ones_like(rewards), None

        td_errors = train(obses_t, bels_t, expert_qvals, actions, rewards,
                          obses_tp1, bels_tp1, expert_qvals_1, dones, weights)

        if prioritized_replay:
            new_priorities = np.abs(td_errors) + prioritized_replay_eps
            replay_buffer.update_priorities(batch_idxes, new_priorities)

        accumulated_td_errors.append(np.mean(np.abs(td_errors)))
        if np.random.rand() < 0.01:
            print("Stage 1 TD error", np.around(td_errors, 1))

        if t % target_network_update_freq == 0:
            # Update target network periodically.
            print("Update target")
            update_target()

        if len(accumulated_td_errors) == 100 and np.mean(
                np.abs(accumulated_td_errors)) < stage1_td_error_threshold:
            if saved_mean_reward is not None:
                save_variables(model_file)
                print("Breaking due to low td error",
                      np.mean(accumulated_td_errors))
                break

        if t % print_freq == 0:
            # Just to get test rewards

            obs = env.reset()
            episode_reward = np.zeros(nenvs, dtype=np.float32)
            episode_step[:] = 0
            obs, bel = (obs[:, :-observed_belief_space.shape[0]],
                        obs[:, -observed_belief_space.shape[0]:])

            expert_qval = qmdp_expert(obs, bel)

            episode_rewards_history = []
            horizon = 100
            while len(episode_rewards_history) < 1000:
                action, q_values = act(np.array(obs)[None],
                                       np.array(bel)[None],
                                       np.array(expert_qval)[None],
                                       update_eps=0,
                                       **kwargs)
                env_action = action

                new_obs, rew, done, info = env.step(env_action)
                new_obs, new_bel = (new_obs[:, :-observed_belief_space.shape[0]],
                                    new_obs[:, -observed_belief_space.shape[0]:])

                new_expert_qval = qmdp_expert(new_obs, new_bel)

                if flatten_belief:
                    new_bel = qmdp_expert.flatten_to_belief(new_bel)

                obs = new_obs
                bel = new_bel
                expert_qval = new_expert_qval

                episode_reward += 0.95**episode_step * rew
                episode_step += 1

                for d in range(len(done)):
                    if done[d]:
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1

            mean_100ep_reward = round(np.mean(episode_rewards_history), 2)
            num_episodes = episodes

            logger.record_tabular("stage", 1)
            logger.record_tabular("steps", t)
            logger.record_tabular("mean 1000 episode reward",
                                  mean_100ep_reward)
            logger.record_tabular("td errors", np.mean(accumulated_td_errors))

            logger.dump_tabular()
            print("episodes   ", num_episodes,
                  "steps {}/{}".format(t, total_timesteps))
            print("mean reward", mean_100ep_reward)
            print("exploration", int(100 * exploration.value(t)))

        if (checkpoint_freq is not None and t > learning_starts
                and num_episodes > 100 and t % checkpoint_freq == 0):
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                if print_freq is not None:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".
                        format(saved_mean_reward, mean_100ep_reward))
                    print("saving model")
                save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward

    if model_saved:
        if print_freq is not None:
            logger.log("Restored model with mean reward: {}".format(
                saved_mean_reward))
        load_variables(model_file)

    # Post stage1 saving
    stage1_model_file = os.path.join(td, "stage1_model")
    save_variables(stage1_model_file)
    update_target()

    print("===========================================")
    print("              Stage 1 complete             ")
    print("===========================================")

    stage1_total_timesteps = t
    episode_rewards_history = deque(maxlen=1000)

    # Stage 2: Train Residual with exploration
    t = 0
    while t < stage2_total_timesteps:
        if callback is not None:
            if callback(locals(), globals()):
                break
        # Take action and update exploration to the newest value
        kwargs = {}
        update_eps = exploration.value(t)
        update_param_noise_threshold = 0.

        obs = env.reset()
        episode_reward = np.zeros(nenvs, dtype=np.float32)
        episode_step[:] = 0
        obs, bel = (obs[:, :-observed_belief_space.shape[0]],
                    obs[:, -observed_belief_space.shape[0]:])

        expert_qval = qmdp_expert(obs, bel)

        start_time = timer.time()
        horizon = 100
        for m in range(horizon):
            action, q_values = act(np.array(obs)[None],
                                   np.array(bel)[None],
                                   np.array(expert_qval)[None],
                                   update_eps=update_eps,
                                   **kwargs)
            env_action = action

            new_obs, rew, done, info = env.step(env_action)
            new_obs, new_bel = (new_obs[:, :-observed_belief_space.shape[0]],
                                new_obs[:, -observed_belief_space.shape[0]:])

            new_expert_qval = qmdp_expert(new_obs, new_bel)

            if flatten_belief:
                new_bel = qmdp_expert.flatten_to_belief(new_bel)

            # Store transition in the replay buffer.
            replay_buffer.add(obs, bel, expert_qval, action, rew, new_obs,
                              new_bel, new_expert_qval, done)

            # if np.random.rand() < 0.05:
            # #     # write to file
            # #     with open('rbqn_fixed_expert.csv', 'a') as f:
            # #         out = ','.join(str(np.around(x,2)) for x in [bel[0], obs[0], q_values[0]])
            #         # f.write(out + "\n")

            #     print(np.around(bel[-1], 2), rew[-1], np.around(q_values[-1], 1), np.around(expert_qval[-1], 1))

            obs = new_obs
            bel = new_bel
            expert_qval = new_expert_qval

            episode_reward += 0.95**episode_step * rew
            episode_step += 1

            # print(action, done, obs)

            for d in range(len(done)):
                if done[d]:
                    epoch_episode_rewards.append(episode_reward[d])
                    episode_rewards_history.append(episode_reward[d])
                    epoch_episode_steps.append(episode_step[d])
                    episode_reward[d] = 0.
                    episode_step[d] = 0
                    epoch_episodes += 1
                    episodes += 1

        print("Took {}".format(timer.time() - start_time))

        t += 1

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(batch_size,
                                                  beta=beta_schedule.value(t))
                if experience is None:
                    continue
                obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones, weights, batch_idxes = experience
            else:
                experience = replay_buffer.sample(batch_size)
                if experience is None:
                    continue

                obses_t, bels_t, expert_qvals, actions, rewards, obses_tp1, bels_tp1, expert_qvals_1, dones = experience
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = train(obses_t, bels_t, expert_qvals, actions, rewards,
                              obses_tp1, bels_tp1, expert_qvals_1, dones,
                              weights)

            if np.random.rand() < 0.01:
                print("TD error", np.around(td_errors, 1))

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

            accumulated_td_errors.append(np.mean(td_errors))

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            print("Update target")
            update_target()

        mean_100ep_reward = round(np.mean(episode_rewards_history), 2)
        num_episodes = episodes

        if print_freq is not None and num_episodes % print_freq == 0:
            logger.record_tabular("stage", 2)
            logger.record_tabular("steps", t + stage1_total_timesteps)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("mean 1000 episode reward",
                                  mean_100ep_reward)
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.record_tabular("td errors", np.mean(accumulated_td_errors))
            logger.dump_tabular()
            print("episodes   ", num_episodes,
                  "steps {}/{}".format(t, total_timesteps))
            print("mean reward", mean_100ep_reward)
            print("exploration", int(100 * exploration.value(t)))

        if (checkpoint_freq is not None and t > learning_starts
                and num_episodes > 100 and t % checkpoint_freq == 0):
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                if print_freq is not None:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".
                        format(saved_mean_reward, mean_100ep_reward))
                    print("saving model")
                save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    if model_saved:
        if print_freq is not None:
            logger.log("Restored model with mean reward: {}".format(
                saved_mean_reward))
        load_variables(model_file)

    return act
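
A minimal call sketch for this staged variant; my_vec_env, my_qmdp_expert, and
pretrain_batches are hypothetical stand-ins for a vectorized environment, a QMDP
expert, and pre-collected expert transitions, none of which are defined above:

act = learn(env=my_vec_env,
            network='mlp',
            obs_dim=12,
            num_experts=8,
            qmdp_expert=my_qmdp_expert,
            pretrain_experience=pretrain_batches,
            stage1_total_timesteps=50000,
            stage2_total_timesteps=50000,
            lr=5e-4,
            batch_size=32)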
Example #3
File: simple.py  Project: lhm3561/OSS
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, it defaults to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return U.BatchInput(observation_space_shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
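                # For example, with exploration.value(t) = 0.1 and 4 actions this gives
                # -np.log(1. - 0.1 + 0.1 / 4) = -np.log(0.925), roughly 0.078.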
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return act
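
For reference, this simple.py variant is typically driven the same way as the original
baselines CartPole example; a sketch, assuming gym and the old deepq.models API are
available:

import gym
from baselines import deepq

env = gym.make("CartPole-v0")
model = deepq.models.mlp([64])
act = learn(env,
            q_func=model,
            lr=1e-3,
            max_timesteps=100000,
            buffer_size=50000,
            exploration_fraction=0.1,
            exploration_final_eps=0.02,
            print_freq=10)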
Example #4
replay_buffer = ReplayBuffer(args.replay_buffer_size)
# mem = ReplayBuffer(args.memory_capacity)

# schedule of epsilon annealing
exploration = LinearSchedule(args.final_exploration_step,
                             args.final_exploration, 1)
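# Note that epsilon is annealed per episode here (exploration.value(episode) in the
# training loop below), not per environment step.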

# import pdb
# pdb.set_trace()

# Training loop
dqn.online_net.train()
timestamp = 0
for episode in range(args.max_episodes):

    epsilon = exploration.value(episode)

    state, done = env.reset(), False
    if args.agent == 'BootstrappedDQN':
        k = random.randrange(args.nheads)
    elif args.agent == 'VariationalDQN':
        dqn.online_net.freeze_noise()
    elif args.agent == 'BayesBackpropDQN':
        dqn.online_net.reset_noise()
    elif args.agent == 'MNFDQN':
        dqn.online_net.reset_noise()
    while not done:
        timestamp += 1

        if args.agent == 'BootstrappedDQN':
            action = dqn.act_single_head(state[None], k)
Example #5
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          initial_exploration_p=1.0,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=100,
          prioritized_replay=True,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          pretraining_obs=None,
          pretraining_targets=None,
          pretrain_steps=1000,
          pretrain_experience=None,
          pretrain_num_episodes=0,
          double_q=True,
          expert_qfunc=None,
          aggrevate_steps=0,
          pretrain_lr=1e-4,
          sampling_starts=0,
          beb_agent=None,
          qvalue_file="qvalue.csv",
          **network_kwargs):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, it defaults to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.
    load_path: str
        path to load the model from. (default: None)
    beb_agent: object
        takes Q-values and suggests actions after adding a BEB bonus
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = get_session()
    set_global_seeds(seed)

    nenvs = env.num_envs
    print("Bayes-DeepQ:", env.num_envs)
    print("Total timesteps", total_timesteps)
    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, train_target, copy_target_to_q, debug = brl_deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        pretrain_optimizer=tf.train.AdamOptimizer(learning_rate=pretrain_lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise,
        double_q=double_q)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from initial_exploration_p.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=initial_exploration_p,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        print("Model will be saved at ", model_file)
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))
            print('Loaded model from {}'.format(load_path))

    if pretraining_obs is not None:
        # pretrain target and copy to qfunc
        print("Pretrain steps ", pretrain_steps)
        for i in range(pretrain_steps):
            pretrain_errors = train_target(pretraining_obs,
                                           pretraining_targets)
            if i % 500 == 0:
                print("Step {}".format(i), np.mean(pretrain_errors))
            if np.mean(pretrain_errors) < 1e-5:
                break

        min_rew = 0
        # copy all pre-experiences
        if pretrain_experience is not None:
            for obs, action, rew, new_obs, done in zip(*pretrain_experience):
                replay_buffer.add(obs, action, rew, new_obs, float(done))
            print("Added {} samples to ReplayBuffer".format(
                len(replay_buffer._storage)))
            min_rew = min(rew, min_rew)
        print("Pretrain Error", np.mean(pretrain_errors))
    else:
        print("Skipping pretraining")

    update_target()
    print("Save the pretrained model", model_file)
    save_variables(model_file)

    episode_reward = np.zeros(nenvs, dtype=np.float32)
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_episodes = 0
    episode_rewards_history = deque(maxlen=100)
    episode_step = np.zeros(nenvs, dtype=int)
    episodes = 0  #scalar

    start = 0

    if expert_qfunc is None:
        aggrevate_steps = 0

    # if pretraining_obs is None or pretraining_obs.size == 0:
    #     episode_rewards = []
    # else:
    #     episode_rewards = [[0.0]] * pretrain_num_episodes
    #     start = len(pretraining_obs)
    #     if print_freq is not None:
    #         for t in range(0, len(pretraining_obs), print_freq):
    #             logger.record_tabular("steps", t)
    #             logger.record_tabular("episodes", pretrain_num_episodes)
    #             logger.record_tabular("mean 100 episode reward", min_rew)
    #             logger.record_tabular("% time spent exploring", 0)
    #             logger.dump_tabular()
    #             print("pretraining episodes", pretrain_num_episodes, "steps {}/{}".format(t, total_timesteps))

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        print("Aggrevate: Model will be saved at ", model_file)
        model_saved = False

        for i in range(aggrevate_steps):
            obses_t, values = [], []
            for j in range(30):
                # TODO: 30 should be changed to max horizon?
                t = np.random.randint(50) + 1

                obs = env.reset()
                for k in range(t):
                    action, value = act(np.array(obs)[None],
                                        update_eps=exploration.value(i))
                    obs, rew, done, _ = env.step(action)

                obses_t.extend(obs)
                # Roll out expert policy
                episode_reward[:] = 0
                dones = np.array([False] * obs.shape[0])
                for k in range(51 - t):
                    obs, rew, done, _ = env.step(
                        [expert_qfunc.step(o) for o in obs])
                    dones[done] = True
                    rew[dones] = 0
                    episode_reward += 0.95**k * rew
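                # In the loop above, rew[dones] = 0 keeps environments that finished
                # earlier in the rollout from adding to the discounted expert return.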

                # TODO: change this to exploration-savvy action
                # action = np.random.randint(env.action_space.n, size=len(obs))
                # Rocksample specific, take sensing actions
                # prob = np.array([1] * 6 + [2] * (env.action_space.n - 6), dtype=np.float32)
                # prob = prob / np.sum(prob)
                # action = np.random.choice(env.action_space.n, p=prob, size=len(action))
                # new_obs, rew, done, _ = env.step(action)

                # value = rew.copy()
                # value[np.logical_not(done)] += gamma * np.max(expert_qfunc.value(new_obs[np.logical_not(done)]), axis=1)
                # current_value[tuple(np.array([np.arange(len(action)), action]))] = value

                # episode reward
                # episode_reward[np.logical_not(done)] += np.max(current_value[np.logical_not(done)], axis=1)
                # episode_rewards_history.extend(np.max(current_value, axis=1))
                value[tuple([np.arange(len(action)), action])] = episode_reward
                values.extend(value)

            print("Aggrevate got {} / {} new data".format(
                obs.shape[0] * 30, len(obses_t)))
            # print("Mean expected cost at the explored points", np.mean(np.max(values, axis=1)))
            for j in range(1000):
                obs, val = np.array(obses_t), np.array(values)
                # indices = np.random.choice(len(obs), min(1000, len(obses_t)))
                aggrevate_errors = train_target(obs, val)
                if np.mean(aggrevate_errors) < 1e-5:
                    print("Aggrevate Step {}, {}".format(i, j),
                          np.mean(aggrevate_errors))
                    break
            print("Aggrevate Step {}, {}".format(i, j),
                  np.mean(aggrevate_errors))
            update_target()
            print("Save the aggrevate model", i, model_file)

            # Evaluate current policy
            episode_reward[:] = 0
            obs = env.reset()
            num_episodes = 0
            k = np.zeros(len(obs))
            while num_episodes < 100:
                action, _ = act(np.array(obs)[None], update_eps=0.0)
                # print(action)
                obs, rew, done, _ = env.step(action)
                episode_reward += 0.95**k * rew
                k += 1
                for d in range(len(done)):
                    if done[d]:
                        episode_rewards_history.append(episode_reward[d])
                        episode_reward[d] = 0.
                        k[d] = 0
                        num_episodes += 1
            mean_1000ep_reward = round(np.mean(episode_rewards_history), 2)
            print("Mean discounted reward", mean_1000ep_reward)
            logger.record_tabular("mean 100 episode reward",
                                  mean_1000ep_reward)
            logger.dump_tabular()
            save_variables(model_file)

        t = 0  # could start from pretrain-steps
        epoch = 0
        while True:
            epoch += 1
            if t >= total_timesteps:
                break

            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # no randomization
            # update_eps = 0
            print('update_eps', int(100 * exploration.value(t)))
            qv_error = []

            obs = env.reset()
            for m in range(100):

                action, q_values = act(np.array(obs)[None],
                                       update_eps=update_eps,
                                       **kwargs)
                if beb_agent is not None:
                    action = beb_agent.step(obs, action, q_values,
                                            exploration.value(t))
                # if expert_qfunc is not None:
                #     v = expert_qfunc.value(obs)
                #     qv_error += [v - q_values[0]]

                env_action = action
                reset = False
                new_obs, rew, done, info = env.step(env_action)

                if t >= sampling_starts:
                    # Store transition in the replay buffer.
                    replay_buffer.add(obs, action, rew, new_obs, done)
                obs = new_obs

                episode_reward += rew
                episode_step += 1

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.

                        # discount(np.array(rewards), gamma) consider doing discount
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1

            t += 100 * nenvs

            if t > learning_starts:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None

                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)

                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if target_network_update_freq is not None and t > sampling_starts \
                and epoch % target_network_update_freq == 0:
                # Update target network periodically.
                print("Update target")
                update_target()

            mean_1000ep_reward = round(np.mean(episode_rewards_history), 2)
            num_episodes = episodes

            if print_freq is not None:
                logger.record_tabular("steps", t)
                logger.record_tabular("td errors", np.mean(td_errors))
                logger.record_tabular("td errors std",
                                      np.std(np.abs(td_errors)))
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 1000 episode reward",
                                      mean_1000ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()
                print("episodes", num_episodes,
                      "steps {}/{}".format(t, total_timesteps))

            if (checkpoint_freq is not None and t > learning_starts
                    and len(episode_rewards_history) >= 1000):
                if saved_mean_reward is None or mean_1000ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_1000ep_reward))
                        print("saving model")
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_1000ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            load_variables(model_file)

    return act
Example #6
class DQNAgent_Vanila_simple(agent):
    def __init__(self, model, opt, learning=True):
        super().__init__()
        self.memory = ReplayBuffer(3000)
        self.previous_state = None
        self.previous_action = None
        self.previous_legal_actions = None
        self.step = 0
        self.model = model
        self.opt = opt
        self.loss = 0
        self.batch_size = 10
        self.test_q = 0
        self.max_tile = 0
        #self.test_q = 0
        self.epsilon_schedule = LinearSchedule(1000000,
                                               initial_p=0.99,
                                               final_p=0.01)
        self.learning = learning

    def should_explore(self):
        self.epsilon = self.epsilon_schedule.value(self.step)
        return random.random() < self.epsilon

    def action(self):
        if self.learning:
            self.step += 1

        legalActions = self.legal_actions(deepcopy(self.gb.board))
        if len(legalActions) == 0:
            print(111111111111111111111111111111111111111)
        board = deepcopy(self.gb.board)
        board = oneHotMap(board)

        if self.learning and self.should_explore():
            q_values = None
            action = random.choice(legalActions)
            choice = self.actions[action]
        else:
            #mark
            state = torch.from_numpy(board).type(
                torch.FloatTensor).cuda().view(-1, 17, 4, 4)
            action, q_values = self.predict(state, legalActions)
            choice = self.actions[action]
        if self.learning:
            reward = self.gb.currentReward
            if reward != 0:
                reward = np.log2(reward)
            if (self.previous_state is not None
                    and self.previous_action is not None):
                self.memory.add(self.previous_state, self.previous_action,
                                self.previous_legal_actions, reward,
                                legalActions, board, 0)

        self.previous_state = board
        self.previous_action = action
        self.previous_legal_actions = legalActions

        if self.learning:
            self.update()
        return choice

    def enableLearning(self):
        self.model.train()
        self.learning = True
        self.max_tile = 0
        self.reset()

    def disableLearning(self):
        self.model.eval()
        self.learning = False

    def end_episode(self):
        if not self.learning:
            m = np.max(self.gb.board)
            if m > self.max_tile:
                self.max_tile = m
            return
        #print(self.gb.board)

        board = deepcopy(self.gb.board)
        board = oneHotMap(board)

        #legalActions = self.legal_actions(deepcopy(self.gb.board))
        #print(legalActions)
        self.memory.add(self.previous_state, self.previous_action,
                        self.previous_legal_actions, self.gb.currentReward, [],
                        board, 1)
        self.reset()

    def reset(self):

        self.previous_state = None
        self.previous_action = None
        self.previous_legal_actions = None

    def update(self):
        if self.step < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)
        (states, actions, legal_actions, reward, next_legal_actions,
         next_states, is_terminal) = batch

        terminal = torch.tensor(is_terminal).type(torch.cuda.FloatTensor)
        reward = torch.tensor(reward).type(torch.cuda.FloatTensor)
        states = torch.from_numpy(states).type(torch.FloatTensor).cuda().view(
            -1, 17, 4, 4)
        next_states = torch.from_numpy(next_states).type(
            torch.FloatTensor).cuda().view(-1, 17, 4, 4)
        # Current Q Values

        _, q_values = self.predict_batch(states)
        batch_index = torch.arange(self.batch_size, dtype=torch.long)
        #print(actions)
        #print(q_values)

        q_values = q_values[batch_index, actions]
        #print(q_values)
        # Calculate target
        q_actions_next, q_values_next = self.predict_batch(
            next_states, legalActions=next_legal_actions)
        #print(q_values_next)
        q_max = q_values_next.max(1)[0].detach()

        q_max = (1 - terminal) * q_max
        # if sum(terminal == 1) > 0:
        #     print(reward)
        #     print( (terminal == 1).nonzero())
        #     print(terminal)
        #     print(next_legal_actions)
        #     print(q_max)
        #     input()
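        # Bellman target: r + gamma * max_a' Q(s', a') with gamma = 0.99;
        # q_max was already zeroed above for terminal transitions.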
        q_target = reward + 0.99 * q_max
        self.opt.zero_grad()
        loss = self.model.loss_function(q_target, q_values)

        loss.backward()

        self.opt.step()

        #train_loss = loss_vae.item() + loss_dqn.item()

        self.loss += loss.item() / len(states)

    def predict_batch(self, input, legalActions=None):
        #print(legalActions)

        q_values = self.model(input)
        if legalActions is None:
            values, q_actions = q_values.max(1)
        else:
            isNotlegal = True

            # print(legalActions)
            # print(q_values)
            q_values_true = torch.full((self.batch_size, 4), -100000000).cuda()
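            # Mask illegal actions with a very large negative value so the max
            # below can only select from each sample's legal actions.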
            for i, action in enumerate(legalActions):
                q_values_true[i, action] = q_values[i, action]
            values, q_actions = q_values_true.max(1)
            q_values = q_values_true
            #print(q_values_true)
            '''
            while isNotlegal:
                isNotlegal = False
                values, q_actions = q_values.max(1)
                #print(q_values)
                #print(values)
                #print(q_actions)


                for i, action in enumerate(q_actions):
                    #print(legalActions[i])
                    if len(legalActions[i]) == 0:
                        continue

                    if action.item() not in legalActions[i]:
                        isNotlegal = True
                        # print(i)
                        # print(action.item())
                        # print(q_values)
                        q_values[i, action] = -1
                #         print(q_values)
                # print("*********************")
            '''
        return q_actions, q_values

    def predict(self, input, legalActions):
        q_values = self.model(input)
        for action in range(4):
            if action not in legalActions:
                q_values[0, action] = -100000000

        action = torch.argmax(q_values)
        if int(action.item()) not in legalActions:
            print("Selected action is not legal:", action, legalActions, q_values)
        return action.item(), q_values

    def legal_actions(self, copy_gb):
        legalActions = []
        for i in range(4):
            try_gb = gameboard(4, deepcopy(copy_gb))
            changed = try_gb.takeAction(self.actions[i])
            if changed:
                legalActions.append(i)
        return legalActions

    '''
def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):

    # Create all the functions necessary to train the model

    # Returns a session that will use <num_cpu> CPU's only
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    # Creates a placeholder for a batch of tensors of a given shape and dtype
    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    # act, train, update_target are function, debug is dict
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)

    # Choose whether to use a prioritized replay buffer or a normal replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    # Create the schedule for exploration starting from 1
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # The SC2-specific part starts here

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None

    path_memory = np.zeros((64, 64))

    obs = env.reset()

    # Select all marines
    obs = env.step(
        actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

    # obs is a tuple, obs[0] is a 'pysc2.env.environment.TimeStep', and obs[0].observation is a dictionary.
    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    # Use path_memory to remember the trajectory traveled so far
    screen = player_relative + path_memory

    # Get the center position of the two marines
    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
    player = [int(player_x.mean()), int(player_y.mean())]

    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            # np.array()[None] wraps an extra dimension around the array, e.g. [1] -> [[1]]
            action = act(np.array(screen)[None],
                         update_eps=update_eps,
                         **kwargs)[0]
            reset = False

            coord = [player[0], player[1]]
            rew = 0

            # There are only four actions (up/down/left/right). After moving, a strip of -3 is written along the path so it cancels out the mineral shard id (=3), indicating the shard was collected.
            path_memory_ = np.array(path_memory, copy=True)
            if (action == 0):  # UP

                if (player[1] >= 16):
                    coord = [player[0], player[1] - 16]
                    path_memory_[player[1] - 16:player[1], player[0]] = -3
                elif (player[1] > 0):
                    coord = [player[0], 0]
                    path_memory_[0:player[1], player[0]] = -3

            elif (action == 1):  # DOWN

                if (player[1] <= 47):
                    coord = [player[0], player[1] + 16]
                    path_memory_[player[1]:player[1] + 16, player[0]] = -3
                elif (player[1] > 47):
                    coord = [player[0], 63]
                    path_memory_[player[1]:63, player[0]] = -3

            elif (action == 2):  # LEFT

                if (player[0] >= 16):
                    coord = [player[0] - 16, player[1]]
                    path_memory_[player[1], player[0] - 16:player[0]] = -3
                elif (player[0] < 16):
                    coord = [0, player[1]]
                    path_memory_[player[1], 0:player[0]] = -3

            elif (action == 3):  # RIGHT

                if (player[0] <= 47):
                    coord = [player[0] + 16, player[1]]
                    path_memory_[player[1], player[0]:player[0] + 16] = -3
                elif (player[0] > 47):
                    coord = [63, player[1]]
                    path_memory_[player[1], player[0]:63] = -3

            # Update path_memory
            path_memory = np.array(path_memory_)

            # If the marines cannot be moved, they presumably have not been selected yet; select them
            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])

            # Move the marines
            new_action = [
                sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
            ]

            # Get the observation from the environment
            obs = env.step(actions=new_action)

            # Re-fetch player_relative here, because obs from the line above is a tuple carrying multiple pieces of information,
            # while all we want to store in the replay_buffer is the reduced screen image
            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = player_relative + path_memory

            # Get the reward
            rew = obs[0].reward

            # StepType.LAST means done
            done = obs[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffer
            replay_buffer.add(screen, action, rew, new_screen, float(done))

            # Once stored, replace the old screen with the new one
            screen = new_screen

            episode_rewards[-1] += rew

            if done:
                # Re-fetch the friendly/enemy/neutral position map
                obs = env.reset()
                # player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

                # # Still not sure why path_memory is added here
                # screen = player_relative + path_memory

                # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
                # player = [int(player_x.mean()), int(player_y.mean())]

                # # Select all marines (why do this on the done observation?)
                # env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])
                episode_rewards.append(0.0)

                # Clear path_memory
                path_memory = np.zeros((64, 64))

                reset = True

            # Periodically sample experience from the replay buffer for training, and update the target network
            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                # train here comes from deepq.build_train
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            # target network
            if t > learning_starts and t % target_network_update_freq == 0:
                # Also from deepq.build_train
                # Update target network periodically
                update_target()

            # Log to track the reward
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode", mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            # Save the model whenever it improves
            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

        return ActWrapper(act)
    '''
class MemBufferThread(threading.Thread):
    # Note the keyword arguments with default values
    def __init__(self,
                 mem_queue,
                 max_timesteps=1000000,
                 buffer_size=50000,
                 batch_size=32,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6):

        threading.Thread.__init__(self)
        self.mem_queue = mem_queue
        self.prioritized_replay = prioritized_replay
        self.batch_size = batch_size
        self.batch_idxes = None
        self.prioritized_replay_eps = prioritized_replay_eps

        # Create the replay buffer
        if prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                buffer_size, alpha=prioritized_replay_alpha)
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = max_timesteps
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.beta_schedule = None

    def __len__(self):
        return self.replay_buffer.__len__()

    def sample(self, t):
        if self.prioritized_replay:
            experience = self.replay_buffer.sample(
                self.batch_size,
                beta=self.beta_schedule.value(t))  # the choice of t here is debatable
            (obses_t, actions, rewards, obses_tp1, dones, weights,
             self.batch_idxes) = experience
        else:
            obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                self.batch_size)
            #  np.ones_like() : Return an array of ones with the same shape and type as a given array.
            weights, self.batch_idxes = np.ones_like(rewards), None

        return obses_t, actions, rewards, obses_tp1, dones, weights

    def update_priorities(self, td_errors):
        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
        self.replay_buffer.update_priorities(self.batch_idxes, new_priorities)

    def run(self):
        # flag = 1
        while True:
            if self.mem_queue.full():
                print("the mem_queue is full")
            # if self.replay_buffer.__len__() >= 100000 and self.replay_buffer.__len__() % 100 == 0:  # bool(flag):
            #     # print("replay_buffer is 100000 !")
            #     print('')
            #    flag = 0
            if not self.mem_queue.empty():
                single_mem = self.mem_queue.get()
                self.replay_buffer.add(single_mem[0], single_mem[1],
                                       single_mem[2], single_mem[3],
                                       single_mem[4])
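
A minimal sketch of how MemBufferThread might be wired up, assuming the class above is in scope and that producers push the same 5-tuples that replay_buffer.add() expects (obs, action, rew, new_obs, done). The queue type, queue size, and batch size below are illustrative assumptions, not part of the original example:

import queue

mem_queue = queue.Queue(maxsize=10000)
mem_thread = MemBufferThread(mem_queue, buffer_size=50000, batch_size=32)
mem_thread.daemon = True   # run() loops forever, so do not block interpreter exit
mem_thread.start()

# An actor elsewhere would push transitions as they are generated:
#     mem_queue.put((obs, action, rew, new_obs, float(done)))

# A learner can draw batches once enough transitions have accumulated:
if len(mem_thread) >= 32:
    obses_t, actions, rewards, obses_tp1, dones, weights = mem_thread.sample(t=0)
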
Example #9
    def evaluate(self, num_episodes, render=False):
        with U.make_session(NUM_CORES):
            self.t0 = time.time()
            env = self.env.env

            # Create all the functions necessary to train the model
            act, train, update_target, debug = deepq.build_train(
                    make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
                    q_func=model,
                    num_actions=env.action_space.n,
                    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4)
            )
            # Create the replay buffer
            replay_buffer = ReplayBuffer(50000)
            # Create the schedule for exploration starting from 1 (every action is random) down to
            # 0.02 (98% of actions are selected according to values predicted by the model).
            exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)

            # Initialize the parameters and copy them to the target network.
            U.initialize()
            update_target()

            self.episode_count += 1
            state = env.reset()
            self.scores = [0.0]
            episode_q = []

            for t in itertools.count():
                action = act(state[None], update_eps=exploration.value(t))[0]
                observation, reward, done, _ = env.step(action)
                replay_buffer.add(state, action, reward, observation, float(done))

                state = observation
                self.scores[-1] += reward

                episode_q.append(float(debug['q_values'](state[None]).max()))

                if render:
                    env.render()

                if done:
                    print('{0}, score: {1} ({2})'.format(len(self.scores), self.scores[-1], np.mean(self.scores[-100:])))
                    self.evaluation.info['q_values'].append(np.mean(episode_q))

                    if len(self.scores) >= num_episodes:
                        return self.final_evaluation()

                    state = env.reset()
                    episode_q = []
                    self.scores.append(0)

                    if self.env.solved(self.scores):
                        self.evaluation.info['solved'] = len(self.scores)

                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()

            U.reset()
            return self.final_evaluation()
Example #10
def do_agent_exploration(updates_queue: multiprocessing.Queue,
                         q_func_vars_trained_queue: multiprocessing.Queue,
                         network, seed, config, lr, total_timesteps,
                         learning_starts, buffer_size, exploration_fraction,
                         exploration_initial_eps, exploration_final_eps,
                         train_freq, batch_size, print_freq, checkpoint_freq,
                         gamma, target_network_update_freq, prioritized_replay,
                         prioritized_replay_alpha, prioritized_replay_beta0,
                         prioritized_replay_beta_iters, prioritized_replay_eps,
                         experiment_name, load_path, network_kwargs):
    env = DotaEnvironment()

    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, _, _, debug = deepq.build_train(
        scope='deepq_act',
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=exploration_initial_eps,
                                 final_p=exploration_final_eps)

    U.initialize()

    reward_shaper = ActionAdviceRewardShaper(config=config)
    reward_shaper.load()
    reward_shaper.generate_merged_demo()

    full_exp_name = '{}-{}'.format(date.today().strftime('%Y%m%d'),
                                   experiment_name)
    experiment_dir = os.path.join('experiments', full_exp_name)
    os.makedirs(experiment_dir, exist_ok=True)

    summary_dir = os.path.join(experiment_dir, 'summaries')
    os.makedirs(summary_dir, exist_ok=True)
    summary_writer = tf.summary.FileWriter(summary_dir)
    checkpoint_dir = os.path.join(experiment_dir, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)
    stats_dir = os.path.join(experiment_dir, 'stats')
    os.makedirs(stats_dir, exist_ok=True)

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_dir or td

        os.makedirs(td, exist_ok=True)
        model_file = os.path.join(td, "best_model")
        model_saved = False
        saved_mean_reward = None

        # if os.path.exists(model_file):
        #     print('Model is loading')
        #     load_variables(model_file)
        #     logger.log('Loaded model from {}'.format(model_file))
        #     model_saved = True
        # elif load_path is not None:
        #     load_variables(load_path)
        #     logger.log('Loaded model from {}'.format(load_path))

        def synchronize_q_func_vars():
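            # Ask the training process for its latest q_func weights via
            # updates_queue, block until they arrive on q_func_vars_trained_queue,
            # then assign them into this process's 'deepq_act' graph so acting
            # uses up-to-date parameters.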
            updates_queue.put(
                UpdateMessage(UPDATE_STATUS_SEND_WEIGHTS, None, None))
            q_func_vars_trained = q_func_vars_trained_queue.get()
            update_q_func_expr = []
            for var, var_trained in zip(debug['q_func_vars'],
                                        q_func_vars_trained):
                update_q_func_expr.append(var.assign(var_trained))
            update_q_func_expr = tf.group(*update_q_func_expr)
            sess.run(update_q_func_expr)

        synchronize_q_func_vars()

        episode_rewards = []
        act_step_t = 0
        while act_step_t < total_timesteps:
            # Reset the environment
            obs = env.reset()
            obs = StatePreprocessor.process(obs)
            episode_rewards.append(0.0)
            done = False
            # Demo preservation variables
            demo_picked = 0
            demo_picked_step = 0
            # Demo switching statistics
            demo_switching_stats = [(0, 0)]
            # Sample the episode until it is completed
            act_started_step_t = act_step_t
            while not done:
                # Take action and update exploration to the newest value
                biases, demo_indexes = reward_shaper.get_action_potentials_with_indexes(
                    obs, act_step_t)
                update_eps = exploration.value(act_step_t)
                actions, is_randoms = act(np.array(obs)[None],
                                          biases,
                                          update_eps=update_eps)
                action, is_random = actions[0], is_randoms[0]
                if not is_random:
                    bias_demo = demo_indexes[action]
                    if bias_demo != demo_switching_stats[-1][1]:
                        demo_switching_stats.append(
                            (act_step_t - act_started_step_t, bias_demo))
                    if bias_demo != 0 and demo_picked == 0:
                        demo_picked = bias_demo
                        demo_picked_step = act_step_t + 1
                pairs = env.step(action)
                action, (new_obs, rew, done, _) = pairs[-1]
                logger.log(
                    f'{act_step_t}/{total_timesteps} obs {obs} action {action}'
                )

                # Compute state on the real reward but learn from the normalized version
                episode_rewards[-1] += rew
                rew = np.sign(rew) * np.log(1 + np.abs(rew))
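                # e.g. a raw reward of +100 is stored as sign(100) * log(1 + 100) ~= 4.6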
                new_obs = StatePreprocessor.process(new_obs)

                if len(new_obs) == 0:
                    done = True
                else:
                    transition = (obs, action, rew, new_obs, float(done),
                                  act_step_t)
                    obs = new_obs
                    act_step_t += 1
                    if act_step_t - demo_picked_step >= MIN_STEPS_TO_FOLLOW_DEMO_FOR:
                        demo_picked = 0
                    reward_shaper.set_demo_picked(act_step_t, demo_picked)
                    updates_queue.put(
                        UpdateMessage(UPDATE_STATUS_CONTINUE, transition,
                                      demo_picked))
            # Post episode logging
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="rewards",
                                 simple_value=episode_rewards[-1])
            ])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(
                value=[tf.Summary.Value(tag="eps", simple_value=update_eps)])
            summary_writer.add_summary(summary, act_step_t)
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="episode_steps",
                                 simple_value=act_step_t - act_started_step_t)
            ])
            summary_writer.add_summary(summary, act_step_t)
            mean_5ep_reward = round(float(np.mean(episode_rewards[-5:])), 1)
            num_episodes = len(episode_rewards)
            if print_freq is not None and num_episodes % print_freq == 0:
                logger.record_tabular("steps", act_step_t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 5 episode reward", mean_5ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(act_step_t)))
                logger.dump_tabular()
            # Wait for the learning to finish and synchronize
            synchronize_q_func_vars()
            # Record demo_switching_stats
            if num_episodes % 10 == 0:
                save_demo_switching_stats(demo_switching_stats, stats_dir,
                                          num_episodes)
            if checkpoint_freq is not None and num_episodes % checkpoint_freq == 0:
                # Periodically save the model
                rec_model_file = os.path.join(
                    td, "model_{}_{:.2f}".format(num_episodes,
                                                 mean_5ep_reward))
                save_variables(rec_model_file)
                # Check whether the model is the best so far
                if saved_mean_reward is None or mean_5ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_5ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_5ep_reward

        updates_queue.put(UpdateMessage(UPDATE_STATUS_FINISH, None, None))
Example #11
def train(env,
        eval_env,
        q_func,
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None,
        my_skill_set=None,
        log_dir = None,
        num_eval_episodes=10,
        render=False,
        render_eval = False,
        commit_for = 1
        ):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model


    if my_skill_set: assert commit_for>=1, "commit_for >= 1"

    save_idx = 0
    with U.single_threaded_session() as sess:
    

        ## restore
        if my_skill_set:
            action_shape = my_skill_set.len
        else:
            action_shape = env.action_space.n
            
        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph
        observation_space_shape = env.observation_space.shape
        def make_obs_ph(name):
            return U.BatchInput(observation_space_shape, name=name)

        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=make_obs_ph,
            q_func=q_func,
            num_actions=action_shape,
            optimizer=tf.train.AdamOptimizer(learning_rate=lr),
            gamma=gamma,
            grad_norm_clipping=10,
            param_noise=param_noise
        )

        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': q_func,
            'num_actions': action_shape,
        }

        act = ActWrapper(act, act_params)

        # Create the replay buffer
        if prioritized_replay:
            replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = max_timesteps
            beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                           initial_p=prioritized_replay_beta0,
                                           final_p=1.0)
        else:
            replay_buffer = ReplayBuffer(buffer_size)
            beta_schedule = None
        # Create the schedule for exploration starting from 1.
        exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                     initial_p=1.0,
                                     final_p=exploration_final_eps)

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        # sess.run(tf.variables_initializer(new_variables))
        # sess.run(tf.global_variables_initializer())
        update_target()

        if my_skill_set:
            ## restore skills
            my_skill_set.restore_skillset(sess=sess)
            

        episode_rewards = [0.0]
        saved_mean_reward = None
        obs = env.reset()
        reset = True
        
        model_saved = False
        
        model_file = os.path.join(log_dir, "model", "deepq")

        # save the initial act model 
        print("Saving the starting model")
        os.makedirs(os.path.dirname(model_file), exist_ok=True)
        act.save(model_file + '.pkl')

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            paction = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            
            if(my_skill_set):
                skill_obs = obs.copy()
                primitive_id = paction
                rew = 0.
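                # Roll the chosen skill/primitive forward for up to commit_for env
                # steps, accumulating its reward, until the episode ends or the
                # skill's own termination condition fires.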
                for _ in range(commit_for):
                
                    ## break actions into primitives and their params    
                    action = my_skill_set.pi(primitive_id=primitive_id, obs = skill_obs.copy(), primitive_params=None)
                    new_obs, skill_rew, done, _ = env.step(action)
                    if render:
                        # print(action)
                        env.render()
                        sleep(0.1)
                    rew += skill_rew
                    skill_obs = new_obs
                    terminate_skill = my_skill_set.termination(new_obs)
                    if done or terminate_skill:
                        break
                    
            else:
                action= paction

                env_action = action
                reset = False
                new_obs, rew, done, _ = env.step(env_action)
                if render:
                    env.render()
                    sleep(0.1)
              


            # Store transition in the replay buffer for the outer env
            replay_buffer.add(obs, paction, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True
                print("Time:%d, episodes:%d"%(t,len(episode_rewards)))

                # add hindsight experience
            

            if t > learning_starts and t % train_freq == 0:
                # print('Training!')
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # print(len(episode_rewards), episode_rewards[-11:-1])
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
        
            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 50 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    act.save(model_file + '%d.pkl'%save_idx)
                    save_idx += 1
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
                # else:
                #     print(saved_mean_reward, mean_100ep_reward)

            if (eval_env is not None) and t > learning_starts and t % target_network_update_freq == 0:
                
                # dumping other stats
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("%d time spent exploring", int(100 * exploration.value(t)))

                print("Testing!")
                eval_episode_rewards = []
                eval_episode_successes = []

                for i in range(num_eval_episodes):
                    eval_episode_reward = 0.
                    eval_obs = eval_env.reset()
                    eval_obs_start = eval_obs.copy()
                    eval_done = False
                    while(not eval_done):
                        eval_paction = act(np.array(eval_obs)[None])[0]
                        
                        if(my_skill_set):
                            eval_skill_obs = eval_obs.copy()
                            eval_primitive_id = eval_paction
                            eval_r = 0.
                            for _ in range(commit_for):
                            
                                ## break actions into primitives and their params    
                                eval_action, _ = my_skill_set.pi(primitive_id=eval_primitive_id, obs = eval_skill_obs.copy(), primitive_params=None)
                                eval_new_obs, eval_skill_rew, eval_done, eval_info = eval_env.step(eval_action)
                                # print('env reward:%f'%eval_skill_rew)
                                if render_eval:
                                    print("Render!")
                                    
                                    eval_env.render()
                                    print("rendered!")

                                eval_r += eval_skill_rew
                                eval_skill_obs = eval_new_obs
                                
                                eval_terminate_skill = my_skill_set.termination(eval_new_obs)

                                if eval_done or eval_terminate_skill:
                                    break
                                
                        else:
                            eval_action= eval_paction

                            env_action = eval_action
                            reset = False
                            eval_new_obs, eval_r, eval_done, eval_info = eval_env.step(env_action)
                            if render_eval:
                                # print("Render!")
                                
                                eval_env.render()
                                # print("rendered!")


                        
                        eval_episode_reward += eval_r
                        # print("eval_r:%f, eval_episode_reward:%f"%(eval_r, eval_episode_reward))
                        eval_obs = eval_new_obs
                        
                    eval_episode_success = (eval_info["done"]=="goal reached")
                    if(eval_episode_success):
                        logger.info("success, training epoch:%d,starting config:"%t)


                    eval_episode_rewards.append(eval_episode_reward)
                    eval_episode_successes.append(eval_episode_success)

                combined_stats = {}

                # print(eval_episode_successes, np.mean(eval_episode_successes))
                combined_stats['eval/return'] = normal_mean(eval_episode_rewards)
                combined_stats['eval/success'] = normal_mean(eval_episode_successes)
                combined_stats['eval/episodes'] = (len(eval_episode_rewards))

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                
                print("dumping the stats!")
                logger.dump_tabular()

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)
Example #12
def do_network_training(updates_queue: multiprocessing.Queue,
                        weights_queue: multiprocessing.Queue, network, seed,
                        config, lr, total_timesteps, learning_starts,
                        buffer_size, exploration_fraction,
                        exploration_initial_eps, exploration_final_eps,
                        train_freq, batch_size, print_freq, checkpoint_freq,
                        gamma, target_network_update_freq, prioritized_replay,
                        prioritized_replay_alpha, prioritized_replay_beta0,
                        prioritized_replay_beta_iters, prioritized_replay_eps,
                        experiment_name, load_path, network_kwargs):
    _ = get_session()
    set_global_seeds(seed)
    q_func = build_q_func(network, **network_kwargs)

    def make_obs_ph(name):
        return ObservationInput(DotaEnvironment.get_observation_space(),
                                name=name)

    _, train, update_target, debug = deepq.build_train(
        scope='deepq_train',
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=DotaEnvironment.get_action_space().n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
    )

    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None

    U.initialize()
    update_target()

    reward_shaper = ActionAdviceRewardShaper(config=config)
    reward_shaper.load()
    reward_shaper.generate_merged_demo()

    full_exp_name = '{}-{}'.format(date.today().strftime('%Y%m%d'),
                                   experiment_name)
    experiment_dir = os.path.join('experiments', full_exp_name)
    os.makedirs(experiment_dir, exist_ok=True)

    learning_dir = os.path.join(experiment_dir, 'learning')
    learning_summary_writer = tf.summary.FileWriter(learning_dir)

    update_step_t = 0
    should_finish = False
    while not should_finish:
        message = updates_queue.get()
        logger.log(f'do_network_training ← {message}')
        if message.status == UPDATE_STATUS_CONTINUE:
            transition = message.transition
            replay_buffer.add(*transition)
            next_act_step = transition[5] + 1
            reward_shaper.set_demo_picked(next_act_step, message.demo_picked)

            if update_step_t >= learning_starts and update_step_t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(update_step_t))
                    (obses_t, actions, rewards, obses_tp1, dones, ts, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones, ts = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
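                # Recompute the reward shaper's action potentials for each sampled
                # observation at its stored timestep (and at timestep + 1 for the
                # successor state); train() takes them as biases alongside the
                # usual DQN batch.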
                biases_t = []
                for obs_t, timestep in zip(obses_t, ts):
                    biases_t.append(
                        reward_shaper.get_action_potentials(obs_t, timestep))
                biases_tp1 = []
                for obs_tp1, timestep in zip(obses_tp1, ts):
                    biases_tp1.append(
                        reward_shaper.get_action_potentials(
                            obs_tp1, timestep + 1))
                td_errors, weighted_error = train(obses_t, biases_t, actions,
                                                  rewards, obses_tp1,
                                                  biases_tp1, dones, weights)
                # Loss logging
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag='weighted_error',
                                     simple_value=weighted_error)
                ])
                learning_summary_writer.add_summary(summary, update_step_t)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)
            if update_step_t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
            update_step_t += 1
        elif message.status == UPDATE_STATUS_SEND_WEIGHTS:
            q_func_vars = get_session().run(debug['q_func_vars'])
            weights_queue.put(q_func_vars)
        elif message.status == UPDATE_STATUS_FINISH:
            should_finish = True
        else:
            logger.log(f'Unknown status in UpdateMessage: {message.status}')
Example #13
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=3000,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=3000,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs
            ):


    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space
    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)


    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        # gamma=gamma,
        # grad_norm_clipping=10,
        # param_noise=param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(100000),
                                 initial_p=1.0,
                                 final_p=0.02)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    old_state = None

    formula_LTLf_1 = "!F(die)"
    monitoring_RightToLeft = MonitoringSpecification(
        ltlf_formula=formula_LTLf_1,
        r=1,
        c=-10,
        s=1,
        f=-10
    )
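    # "!F(die)" is an LTLf formula equivalent to G(!die): the 'die' event must never
    # occur. The r/c/s/f values are presumably the rewards/penalties the monitor
    # emits while the formula is (temporarily or permanently) satisfied or violated.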



    monitoring_specifications = [monitoring_RightToLeft]

    stepCounter = 0
    done = False

    def RightToLeftConversion(observation) -> TraceStep:
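        # Flag 'die' when the episode terminated before step 199, i.e. the agent
        # failed rather than (presumably) running into the environment's time limit.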

        print(stepCounter)


        if(done and not(stepCounter>=199)):
            die=True
        else:
            die=False


        dictionary={'die': die}
        print(dictionary)
        return dictionary

    multi_monitor = MultiRewardMonitor(
        monitoring_specifications=monitoring_specifications,
        obs_to_trace_step=RightToLeftConversion
    )

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True


    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        episodeCounter=0
        num_episodes=0

        for t in itertools.count():
            
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(t))[0]
            #print(action)
            new_obs, rew, done, _ = env.step(action)
            stepCounter+=1

            rew, is_perm = multi_monitor(new_obs)
            old_state = new_obs

            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew


            is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
            if episodeCounter % 100 == 0 or episodeCounter<1:
                # Show off the result
                #print("coming here Again and Again")
                env.render()


            if done:
                episodeCounter+=1
                num_episodes+=1
                obs = env.reset()
                episode_rewards.append(0)
                multi_monitor.reset()
                stepCounter=0
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if t > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                    train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))

                # Update target network periodically.
                if t % 1000 == 0:
                    update_target()
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean 100 episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 500 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    act.save_act()
                    #save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        # if model_saved:
        #     if print_freq is not None:
        #         logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
        #     load_variables(model_file)

    return act
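
A minimal sketch of the uniform replay-buffer interface the loop above relies on (add a transition, sample a batch). This is an illustration of the interface only, not the baselines implementation; the class name is made up.

import random
import numpy as np

class MinimalReplayBuffer:
    """Illustrative uniform replay buffer exposing the add/sample interface used above."""

    def __init__(self, size):
        self._storage = []
        self._maxsize = size
        self._next_idx = 0

    def add(self, obs_t, action, reward, obs_tp1, done):
        data = (obs_t, action, reward, obs_tp1, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data  # overwrite the oldest entry once full
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def sample(self, batch_size):
        # Uniform sampling with replacement; returns arrays shaped like the train() inputs above.
        idxes = [random.randrange(len(self._storage)) for _ in range(batch_size)]
        obses_t, actions, rewards, obses_tp1, dones = zip(*(self._storage[i] for i in idxes))
        return (np.array(obses_t), np.array(actions), np.array(rewards),
                np.array(obses_tp1), np.array(dones))
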
Example #14
0
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=5,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.
    load_path: str
        path to load the trained model from (default: None; used in the test stage)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.

    """

    # Create all the functions necessary to train the model
    sess = get_session()
    set_global_seeds(seed)
    med_libs = MedLibs()
    '''Define the Q network.
    Inputs: observation placeholder (make_obs_ph), num_actions, scope, reuse.
    Output (a tensor of shape batch_size x num_actions): the value of each action, Q(s, a_i).
    '''
    q_func = build_q_func(network, **network_kwargs)
    '''  To put observations into a placeholder  '''
    # TODO: Can only deal with Discrete and Box observation spaces for now
    # observation_space = env.observation_space (default)
    # Use sub_obs_space instead

    observation_space = med_libs.subobs_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    '''  Customize action  '''
    # TODO: subset of action space.
    action_dim = med_libs.sub_act_dim
    ''' 
    Returns: deepq.build_train()
        act: (tf.Variable, bool, float) -> tf.Variable
            function to select an action given an observation.
            act is computed by [build_act] or [build_act_with_param_noise]
        train: (object, np.array, np.array, object, np.array, np.array) -> np.array
            optimize the error in Bellman's equation.
        update_target: () -> ()
            copy the parameters from optimized Q function to the target Q function. 
        debug: {str: function}
            a bunch of functions to print debug data like q_values.
    '''

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=action_dim,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        double_q=True,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': action_dim,
    }
    '''Construct an act object using ActWrapper'''
    act = ActWrapper(act, act_params)
    ''' Create the replay buffer'''
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    '''Create the schedule for exploration starting from 1.'''
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
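    # Illustration of the schedule (assuming this example's defaults, exploration_fraction=0.1
    # and total_timesteps=100000, i.e. schedule_timesteps=10000):
    #   exploration.value(0)      -> 1.0
    #   exploration.value(5000)   -> ~0.51 (linear interpolation toward exploration_final_eps)
    #   exploration.value(10000+) -> exploration_final_eps, held constant afterwards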
    '''
    Initialize all the uninitialized variables in the global scope and copy them to the target network.
    '''
    U.initialize()
    update_target()
    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    sub_obs = med_libs.custom_obs(obs)  # TODO: customize observations
    pre_obs = obs
    reset = True
    mydict = med_libs.action_dict
    already_starts = False

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td
        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        elif load_path is not None:
            # load_path: a trained model/policy
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))
        ''' Training loop starts'''
        t = 0
        while t < total_timesteps:
            if callback is not None:
                if callback(locals(), globals()):
                    break
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
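                # Worked example of the threshold above (illustrative numbers: if
                # exploration.value(t) = 0.1 and the action space has 3 actions, the
                # threshold is -log(1 - 0.1 + 0.1/3) ~= 0.069). It increases with epsilon,
                # so the parameter noise is scaled to match eps-greedy randomness.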
            ''' Choose action: take action and update exploration to the newest value
            '''
            # TODO: Mixed action strategy
            # Normal status, action is easily determined by rules, use [obs]
            action = med_libs.simple_case_action(obs)
            # Distraction status, action is determined by Q, with [sub_obs]
            if action == -10:
                action = act(np.array(sub_obs)[None],
                             update_eps=update_eps,
                             **kwargs)[0]
                action = med_libs.action_Q_env(
                    action
                )  # TODO:action_Q_env, from Q_action(0~2) to env_action(2~4)

            reset = False
            ''' Step action '''
            new_obs, rew, done, d_info = env.step(action)
            d_att_last = int(pre_obs[0][0])
            d_att_now = int(obs[0][0])
            d_att_next = int(new_obs[0][0])
            #TODO: you can customize reward here.
            ''' Store transition in the replay buffer.'''
            pre_obs = obs
            obs = new_obs
            sub_new_obs = med_libs.custom_obs(new_obs)

            if (d_att_last == 0 and d_att_now == 1) and not already_starts:
                already_starts = True

            if already_starts and d_att_now == 1:
                replay_buffer.add(sub_obs, action, rew, sub_new_obs,
                                  float(done))
                episode_rewards[-1] += rew  # Sum of rewards
                t = t + 1
                print(
                    '>> Iteration:{}, State[d_att,cd_activate,L4_available,ssl4_activate,f_dc]:{}'
                    .format(t, sub_obs))
                print(
                    'Dis_Last:{}, Dis_Now:{}, Dis_Next:{},Reward+Cost:{}, Action:{}'
                    .format(
                        d_att_last, d_att_now, d_att_next, rew,
                        list(mydict.keys())[list(
                            mydict.values()).index(action)]))

            # update sub_obs
            sub_obs = sub_new_obs

            # Done and Reset
            if done:
                print('Done infos: ', d_info)
                print('======= end =======')
                obs = env.reset()
                sub_obs = med_libs.custom_obs(obs)  # TODO: custom obs
                pre_obs = obs  # TODO: save obs at t-1
                already_starts = False
                episode_rewards.append(0.0)
                reset = True

            # Update the Q network parameters
            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None

                # Calculate td-errors
                actions = med_libs.action_env_Q(
                    actions
                )  # TODO:action_env_Q, from env_action(2~4) to Q_action(0~2)
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)

                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)
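                    # The new priority is |TD error| + prioritized_replay_eps; the small
                    # epsilon keeps transitions with near-zero error sampleable. (The buffer
                    # typically applies the alpha exponent internally when storing priorities.)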

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically, copy weights of Q to target Q
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            load_variables(model_file)

    return act
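
The mixed action strategy above (rule-based action first, Q network only when the rules return the sentinel -10) can be summarized as a small dispatcher. A hedged sketch; `rule_action` stands in for the MedLibs-specific helper and is not part of the example:

import numpy as np

RULE_UNDECIDED = -10  # sentinel meaning "no rule fired, fall back to the learned Q policy"

def select_action(obs, sub_obs, rule_action, q_act, update_eps):
    """Prefer the rule-based action; otherwise query the Q policy on the sub-observation."""
    action = rule_action(obs)
    if action == RULE_UNDECIDED:
        action = q_act(np.array(sub_obs)[None], update_eps=update_eps)[0]
    return action
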
Example #15
0
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape
    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise
    )

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    return act
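
Once learn() returns, the act wrapper can be rolled out greedily by forcing update_eps=0. A minimal evaluation sketch, assuming a standard gym-style env with the 4-tuple step API used above (the helper name is illustrative):

import numpy as np

def evaluate(act, env, n_episodes=10):
    """Run greedy (eps=0) rollouts with a trained act wrapper; return the mean episode reward."""
    returns = []
    for _ in range(n_episodes):
        obs, done, total = env.reset(), False, 0.0
        while not done:
            action = act(np.array(obs)[None], update_eps=0.0)[0]
            obs, rew, done, _ = env.step(action)
            total += rew
        returns.append(total)
    return float(np.mean(returns))
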
Example #16
0
File: Learn.py Project: cstan969/SC2
def learn(
        env,
        q_func,
        num_actions=3,
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=1,
        checkpoint_freq=10000,
        learning_starts=1000,
        gamma=1.0,  # no reward discounting
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        num_cpu=16,
        param_noise=False,
        param_noise_threshold=0.05,
        callback=None,
        demo_replay=[]):

    #Create functions necessary to train the model
    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize U, reward, obs, environment
    U.initialize()
    #update_target() #WHAT DOES THIS DO OR HOW DO I DO THIS IF I AM NOT USING DEEPQ.BUILD_TRAIN
    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()

    # Init action_vector
    action_vector = [[]]
    track_unit_vector = np.array([[]])
    #Init feature_vector
    feature_vector = [[]]
    #feature_vector = FeatureObservation.PopulateFeatureVector(env, obs)
    #print(feature_vector)

    # Initialize Unit LastActionTaken Vector - This is for determining which units need to select actions still!
    time_between_actions = 9.0  #frames??
    #ActionVector = np.array([[]], dtype=[('unit_id', 'int'), ('x_pos', 'float'), ('y_pos', 'float'),
    #                                     ('last_action', 'float')])
    #unit_count_for_ActionVector = 0
    #for unit in feature_vector:
    #   if unit[1] == 1: #Add unit identifier to last action taken vector
    #       np.append(ActionVector, [unit_count_for_ActionVector, unit[2], unit[3], (-1) * time_between_actions], axis=1)
    #       unit_count_for_ActionVector = unit_count_for_ActionVector + 1

    reset = True  # WHAT IS RESET

    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        First = True
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            #EXPLORATION SPACE
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            #Populate Observation Vector
            feature_vector = FeatureObservation.PopulateFeatureVector(
                env, obs, feature_vector, action_vector)
            print(feature_vector)
            track_unit_vector = TrackUnits.track(track_unit_vector,
                                                 feature_vector)
            print(track_unit_vector)
            #for each unit in the vector, get an action for that unit...
            for u in range(0, feature_vector.shape[0]):
                if feature_vector[u][1] == 1:
                    xy = UnitAction.take_action(feature_vector, u, q_func)
                    obs = env.step(actions=[
                        sc2_actions.FunctionCall(_SELECT_POINT, [[
                            0
                        ], [feature_vector[u][3], feature_vector[u][2]]])
                    ])
                    #if movement then move

                    #if action then action
                    obs = env.step(actions=[
                        sc2_actions.FunctionCall(_ATTACK_SCREEN, [[0], xy])
                    ])
                    #update  track_unit_vector

            #DO ACTIONS
            obs, screen, player = common.select_marine(env, obs)
            #get action from training model thingy
            #action = act()
            reset = False
            rew = 0
            new_action = None
            #obs, new_action = common.marine_action(env, obs, player, action)

            new_screen = obs[0].observation["screen"][_PLAYER_RELATIVE]
            army_count = env._obs.observation.player_common.army_count
            rew += obs[0].reward / army_count
            game_info = sc_pb.ResponseGameInfo
            feature_vector = FeatureObservation.PopulateFeatureVector(env, obs)

            output = q_func(feature_vector)
            print(output)

            # available_actions = obs[0].observation["available_actions"]
            # for i in available_actions:
            #     print(i)
            # print("")

            # #select marine and see what we can do now...
            # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_POINT, [[0], [feature_vector[0][2],
            #                                                                        feature_vector[0][3]]])])
            # obs = env.step(actions=[sc2_actions.FunctionCall(_ATTACK_SCREEN, [_NOT_QUEUED, [feature_vector[12][2],
            #                                                                                feature_vector[12][3]]])])
            # available_actions = obs[0].observation["available_actions"]
            # for i in available_actions:
            #     print(i)
            # print("")

        for t in range(max_timesteps):
            for unit in feature_vector:
                game_info = sc_pb.ResponseGameInfo

                #obs, screen, player = common.select_marine(env, obs)
                action = act(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]
            #reset = False
            #rew = 0

            #new_action = None

            #obs, new_action = common.marine_action(env, obs, player, action)

    #Make decisions for each ally unit based on Feature Vector fed into
    #for unit in FeatureVector:
    #    if unit[1] == 1: # Then Friendly needs to make decision
    #do things

    #if ally army count > 0 make army actions
    try:
        if army_count > 0 and _ATTACK_SCREEN in obs[0].observation[
                "available_actions"]:
            obs = env.step(actions=new_action)
        else:
            new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
            obs = env.step(actions=new_action)
    except Exception as e:
        print(e)
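
The try/except at the end guards against issuing an attack when it is not currently available. A generic sketch of that fallback pattern (argument names are illustrative, not PySC2 API):

def step_with_fallback(env, preferred_action, noop_action, available_ids, preferred_id):
    """Issue the preferred action only if its function id is listed as available; else no-op."""
    if preferred_id in available_ids:
        return env.step(actions=preferred_action)
    return env.step(actions=noop_action)
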
Example #17
0
def learn(
        env,
        q_func,  # input obs,num od actions etc and obtain q value for each action
        num_actions=16,  # available actions: up down left right
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,  # size of the replay buffer
        exploration_fraction=0.1,  # during the first 10% training period, exploration rate is decreased from 1 to 0.02
        exploration_final_eps=0.02,  # final value of random action probability
        train_freq=1,  # update the model every `train_freq` steps.
        batch_size=32,  # size of a batch sampled from the replay buffer for training
        print_freq=1,
        checkpoint_freq=10000,
        learning_starts=1000,  # time for the model to collect transitions before learning starts
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,  # if None, beta is annealed over max_timesteps
        prioritized_replay_eps=1e-6,
        num_cpu=16,  # number of cpus to use for training
        param_noise=False,  # whether or not to use parameter space noise
        param_noise_threshold=0.05,
        callback=None):
    """Train a deepq model.

  Parameters
  -------
  env: pysc2.env.SC2Env
      environment to train on
  q_func: (tf.Variable, int, str, bool) -> tf.Variable
      the model that takes the following inputs:
          observation_in: object
              the output of observation placeholder
          num_actions: int
              number of actions
          scope: str
          reuse: bool
              should be passed to outer variable scope
      and returns a tensor of shape (batch_size, num_actions) with values of every action.
  lr: float
      learning rate for adam optimizer
  max_timesteps: int
      number of env steps to optimize for
  buffer_size: int
      size of the replay buffer
  exploration_fraction: float
      fraction of entire training period over which the exploration rate is annealed
  exploration_final_eps: float
      final value of random action probability
  train_freq: int
      update the model every `train_freq` steps.
  batch_size: int
      size of a batch sampled from replay buffer for training
  print_freq: int
      how often to print out training progress
      set to None to disable printing
  checkpoint_freq: int
      how often to save the model. This is so that the best version is restored
      at the end of the training. If you do not wish to restore the best version at
      the end of the training set this variable to None.
  learning_starts: int
      how many steps of the model to collect transitions for before learning starts
  gamma: float
      discount factor
  target_network_update_freq: int
      update the target network every `target_network_update_freq` steps.
  prioritized_replay: bool
      if True prioritized replay buffer will be used.
  prioritized_replay_alpha: float
      alpha parameter for prioritized replay buffer
  prioritized_replay_beta0: float
      initial value of beta for prioritized replay buffer
  prioritized_replay_beta_iters: int
      number of iterations over which beta will be annealed from initial value
      to 1.0. If set to None equals to max_timesteps.
  prioritized_replay_eps: float
      epsilon to add to the TD errors when updating priorities.
  num_cpu: int
      number of cpus to use for training
  callback: (locals, globals) -> None
      function called at every step with the state of the algorithm.
      If callback returns true training stops.

  Returns
  -------
  act: ActWrapper
      Wrapper over act function. Adds ability to save it and load it.
      See header of baselines/deepq/categorical.py for details on the act function.
  """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(
        name
    ):  # Creates a placeholder for a batch of tensors of a given shape and dtype
        return U_b.BatchInput((16, 16), name=name)

    act_x, train_x, update_target_x, debug_x = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,  #   clip gradient norms to this value
        scope="deepq_x")

    act_y, train_y, update_target_y, debug_y = deepq.build_train(  #because there are two players in the game
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope="deepq_y")

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer_x = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        replay_buffer_y = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)

        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule_x = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,  # 0.4->1
            final_p=1.0)

        beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
                                         initial_p=prioritized_replay_beta0,
                                         final_p=1.0)
    else:
        replay_buffer_x = ReplayBuffer(buffer_size)
        replay_buffer_y = ReplayBuffer(buffer_size)

        beta_schedule_x = None
        beta_schedule_y = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.  --- environment initialization
    U.initialize()
    update_target_x()
    update_target_y()
    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()  # start a new episode

    # Select all marines first      --- select all units and get the new observation
    obs = env.step(actions=[
        sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
    ])  # Apply actions, step the world forward, and return observations.

    # Inspect the player_relative screen layer in the returned observation: 1 marks our units
    # on the map, 3 marks minerals; this is the matrix printed in the terminal.
    player_relative = obs[0].observation["feature_screen"][
        _PLAYER_RELATIVE]  #obs is a 'TimeStep' whose type is tuple of ['step_type', 'reward', 'discount', 'observation'];step_type.first or mid or last
    # Mineral locations as a 0/1 matrix
    screen = (player_relative == _PLAYER_NEUTRAL).astype(
        int
    )  #+ path_memory   screen=1 or 0  to indicate the location of mineral
    # Friendly unit locations, given as row/column indices
    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero(
    )  #the location of team member: row, col <-> y,x

    # print(player_relative)
    # print('*************')
    # print(screen)
    # print(_PLAYER_FRIENDLY)
    #
    # print(player_x)
    # print(player_y)
    # print('ssss)

    # if (len(player_x) == 0):
    #   player_x = np.array([0])
    #   # print('player_x from null to 0')
    #   # print(player_x)
    # if (len(player_y) == 0):
    #   player_y = np.array([0])
    #   # print('player_y from null to 0')
    #   # print(player_y)

    player = [int(player_x.mean()), int(player_y.mean())]

    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join("model/", "mineral_shards")  #给了一个模型保存路径
        print(model_file)

        for t in range(max_timesteps):
            # print('timestep=',t)
            if callback is not None:
                if callback(locals(), globals()):
                    break

            # Take action and update exploration to the newest value --- update exploration, then act
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)  # yields a value annealed from 1 down to 0.02
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # actions obtained after exploration
            action_x = act_x(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]
            # print('action_x is ',action_x)

            action_y = act_y(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]
            # print('action_y is ',action_y)
            reset = False

            # coord = [player[0], player[1]]
            rew = 0  #reward

            coord = [action_x, action_y]

            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
            # obs = env.step(actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

            new_action = [
                sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
            ]

            # else:
            #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

            obs = env.step(actions=new_action)

            player_relative = obs[0].observation["feature_screen"][
                _PLAYER_RELATIVE]
            # print(player_relative)
            new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

            # print(_PLAYER_FRIENDLY)

            # print(player_x)
            # print(player_y)
            # print('ssssss2')

            # if (len(player_x) == 0):
            #   player_x = np.array([0])
            #   # print('player_x from null to 0')
            #   # print(player_x)
            # if (len(player_y) == 0):
            #   player_y = np.array([0])
            #   # print('player_y from null to 0')
            #   # print(player_y)

            # player = [int(player_x.mean()), int(player_y.mean())]

            rew = obs[0].reward

            done = obs[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffer.
            replay_buffer_x.add(screen, action_x, rew, new_screen, float(done))
            replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

            screen = new_screen

            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                obs = env.reset()
                # player_relative = obs[0].observation["feature_screen"][_PLAYER_RELATIVE]
                # screent = (player_relative == _PLAYER_NEUTRAL).astype(int)
                #
                # player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
                # player = [int(player_x.mean()), int(player_y.mean())]

                # Select all marines first
                env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
                episode_rewards.append(0.0)
                # print("episode_rewards is ", episode_rewards)
                print('num_episodes is', len(episode_rewards))

                #episode_minerals.append(0.0)

                reset = True

            if t > learning_starts and t % train_freq == 0:  #train_freq=1: update the model every `train_freq` steps
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:

                    experience_x = replay_buffer_x.sample(
                        batch_size, beta=beta_schedule_x.value(t))
                    (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x,
                     weights_x, batch_idxes_x) = experience_x

                    experience_y = replay_buffer_y.sample(
                        batch_size, beta=beta_schedule_y.value(t))
                    (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y,
                     weights_y, batch_idxes_y) = experience_y
                else:

                    obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample(
                        batch_size)
                    weights_x, batch_idxes_x = np.ones_like(
                        rewards_x
                    ), None  # weights_x is an array padded with 1 which has the same shape as rewards_x

                    obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(
                        batch_size)
                    weights_y, batch_idxes_y = np.ones_like(rewards_y), None

                td_errors_x = train_x(obses_t_x, actions_x, rewards_x,
                                      obses_tp1_x, dones_x, weights_x)

                td_errors_y = train_y(obses_t_y, actions_y, rewards_y,
                                      obses_tp1_y, dones_y, weights_y)

                if prioritized_replay:
                    new_priorities_x = np.abs(
                        td_errors_x) + prioritized_replay_eps
                    new_priorities_y = np.abs(
                        td_errors_y) + prioritized_replay_eps
                    replay_buffer_x.update_priorities(batch_idxes_x,
                                                      new_priorities_x)
                    replay_buffer_y.update_priorities(batch_idxes_y,
                                                      new_priorities_y)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target_x()
                update_target_y()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)  # rounded to 1 decimal place
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act_x), ActWrapper(act_y)
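
Because the example returns one wrapper per screen axis, evaluation has to combine them into a single coordinate. A hedged sketch, assuming both wrappers keep the baselines-style act signature used above:

import numpy as np

def greedy_coord(act_x, act_y, screen):
    """Pick an (x, y) screen coordinate greedily from the two per-axis policies."""
    x = act_x(np.array(screen)[None], update_eps=0.0)[0]
    y = act_y(np.array(screen)[None], update_eps=0.0)[0]
    return [x, y]
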
Example #18
0
def learn(
    env,
    var_func,
    cvar_func,
    nb_atoms,
    run_alpha=None,
    lr=5e-4,
    max_timesteps=100000,
    buffer_size=50000,
    exploration_fraction=0.1,
    exploration_final_eps=0.01,
    train_freq=1,
    batch_size=32,
    print_freq=1,
    checkpoint_freq=10000,
    learning_starts=1000,
    gamma=0.95,
    target_network_update_freq=500,
    num_cpu=4,
    callback=None,
    periodic_save_freq=1000000,
    periodic_save_path=None,
    grad_norm_clip=None,
):
    """Train a CVaR DQN model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    var_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    cvar_func: function
        same as var_func
    nb_atoms: int
        number of atoms used in CVaR discretization
    run_alpha: float
        optimize CVaR_alpha while running. None if you want random alpha each episode.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the best model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.
    periodic_save_freq: int
        How often do we save the model - periodically
    periodic_save_path: str
        Where do we save the model - periodically
    grad_norm_clip: float
        Clip gradient to this value. No clipping if None
    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/distdeepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = make_session(num_cpu=num_cpu)
    sess.__enter__()

    obs_space_shape = env.observation_space.shape

    def make_obs_ph(name):
        return U.BatchInput(obs_space_shape, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        var_func=var_func,
        cvar_func=cvar_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        nb_atoms=nb_atoms,
        grad_norm_clipping=grad_norm_clip)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'cvar_func': cvar_func,
        'var_func': var_func,
        'num_actions': env.action_space.n,
        'nb_atoms': nb_atoms
    }

    # Create the replay buffer
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    episode = 0
    alpha = 1.

    # --------------------------------- RUN ---------------------------------
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    print('Target reached')
                    model_saved = False
                    break
            # Take action and update exploration to the newest value
            update_eps = exploration.value(t)

            update_param_noise_threshold = 0.

            action = act(np.array(obs)[None], alpha, update_eps=update_eps)[0]
            reset = False
            new_obs, rew, done, _ = env.step(action)

            # ===== DEBUG =====

            # s = np.ones_like(np.array(obs)[None])
            # a = np.ones_like(act(np.array(obs)[None], run_alpha, update_eps=update_eps))
            # r = np.array([0])
            # s_ = np.ones_like(np.array(obs)[None])
            # d = np.array([False])
            # s = obs[None]
            # a = np.array([action])
            # r = np.array([rew])
            # s_ = new_obs[None]
            # d = np.array([done])
            # if t % 100 == 0:
            #     for f in debug:
            #         print(f(s, a, r, s_, d))
            #     print('-------------')
            #
            #     # print([sess.run(v) for v in tf.global_variables('cvar_dqn/cvar_func')])
            #     # print([sess.run(v) for v in tf.global_variables('cvar_dqn/var_func')])

            # =================

            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True
                if run_alpha is None:
                    alpha = np.random.random()

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.

                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

                errors = train(obses_t, actions, rewards, obses_tp1, dones,
                               weights)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # Log results and periodically save the model
            mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])),
                                      1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.record_tabular("(current alpha)", "%.2f" % alpha)
                logger.dump_tabular()

            # save and report best model
            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

            # save periodically
            if periodic_save_freq is not None and periodic_save_path is not None and t > learning_starts:
                if t % periodic_save_freq == 0:
                    ActWrapper(act, act_params).save("{}-{}.pkl".format(
                        periodic_save_path, int(t / periodic_save_freq)))

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
    # Create the replay buffer
    replay_buffer = create_replay_buffer(replay, 50000)
    # Create the schedule for exploration starting from 1 (every action is random)
    # down to 0.02 (the remaining actions follow the learned Q-values).
    exploration = LinearSchedule(schedule_timesteps=300000,
                                 initial_p=1.0,
                                 final_p=0.02)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    obs = env.reset()
    for t in itertools.count():
        # Take action and update exploration to the newest value
        action = act(obs[None], update_eps=exploration.value(t))[0]
        new_obs, rew, done, _ = env.step(action)
        # Store transition in the replay buffer.
        if replay != 'None':
            replay_buffer.add(obs, action, rew, new_obs, float(done))
        obs = new_obs

        episode_rewards[-1] += rew
        if done:
            obs = env.reset()
            episode_rewards.append(0)

        is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
        if is_solved:
            break
        else:
Example #20
def learn(
    policy,
    env,
    seed,
    training,
    use_adda,
    adda_lr,
    adda_batch,
    total_timesteps=int(80e6),
    lrschedule='linear',
    nsteps=20,
    max_grad_norm=None,
    lr=7e-4,
    epsilon=0.1,
    alpha=0.99,
    gamma=0.99,
    log_interval=1000,  #alpha and epsilon for RMSprop used in Model()
    exploration_fraction=0.8,
    exploration_final_eps=0.001,
    target_network_update_freq=10000
):  # Additional arguments for epsilon greedy

    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs  # e.g. 16; determined by the env created in train() in run_doom.py and passed to learn()
    print('Num Envs {}'.format(nenvs))
    ob_space = env.observation_space  # (84,84,1)

    ac_space = env.action_space  # Discrete(6)
    print('RL SEED: ', seed)
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nenvs=nenvs,
                  nsteps=nsteps,
                  use_adda=use_adda,
                  adda_lr=adda_lr,
                  adda_batch=adda_batch,
                  max_grad_norm=max_grad_norm,
                  lr=lr,
                  alpha=alpha,
                  epsilon=epsilon,
                  total_timesteps=total_timesteps,
                  lrschedule=lrschedule,
                  seed=seed)
    print('Model Obj created')
    #import sys; sys.exit()
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

    if training:
        nbatch = nenvs * nsteps  # 16*20

        exploration = LinearSchedule(
            schedule_timesteps=50000000,
            initial_p=1.0,
            final_p=exploration_final_eps
        )  # epsilon should reach its lowest value after 50e6 steps
        model.update_target()
        tstart = time.time()
        save_step = 0
        for update in range(
                1, total_timesteps // nbatch + 1
        ):  # For 100k steps, loop is from 1 to 313 -> runs 312 updates
            update_eps = exploration.value(update * nbatch)
            # Performs 1 update step (320 env timesteps). For 16 envs and nsteps=20, the shapes are (16*20,84,84,1), (320,) and (320,) respectively
            obs, rewards, actions = runner.run(update_eps)
            action_value_loss, cur_lr = model.train(
                obs, rewards, actions, update)  # Computes TD Error
            nseconds = time.time() - tstart

            fps = int((update * nbatch) / nseconds)

            # Save model every 1e6 steps (each iteration of loop makes 320 steps. 320*3125 = 1e6 steps. So update % 3125)
            if update % 3125 == 0 or update == 1:
                model.save_model(save_step)
                save_step += 1
                #print('Model Saved')
            # Update target network roughly every 10k steps (31 updates * 320 steps ≈ 10k)
            if update % 31 == 0:
                #print('Target Network Updated')
                model.update_target()
            if update % log_interval == 0 or update == 1:
                logger.record_tabular("learning rate", cur_lr)
                #logger.record_tabular("adda learning rate", cur_adda_lr)
                logger.record_tabular("epsilon", update_eps)
                logger.record_tabular("nupdates", update)
                logger.record_tabular("total_timesteps", update * nbatch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("action_value_loss",
                                      float(action_value_loss))
                #logger.record_tabular("mapping_loss", float(mapping_loss_val))
                #logger.record_tabular("adversary_loss", float(adversary_loss_val))
                logger.record_tabular("time_elapsed", nseconds)
                logger.dump_tabular()
    else:

        snapshots = [66]
        seeds = [0]
        #snapshot_rewards = np.zeros(shape=(seeds, snapshots+1))
        seed_list = ['Seed 0', 'Seed 1', 'Seed 2']

        for seed in seeds:
            #seed = seed + 1
            snapshot_reward = []
            #snapshot_health = []
            print('################### Seed {}!!! ###################'.format(
                seed))
            tstart = time.time()
            #for snapshot in range(snapshots+1):
            for snapshot in snapshots:
                model.load_model(snapshot, seed, adda_mode=True)
                #print('##################################################')
                print('Evaluating snapshot {}!!!'.format(snapshot))
                reward = runner.runner_eval_parallel(num_episodes=1,
                                                     num_envs=nenvs)
                #reward = runner.runner_eval(num_episodes=1000)
                snapshot_reward.append(reward)
                #snapshot_health.append(health)

            print('Mean Reward of every ST decLR 40e6 snapshot on target: ',
                  snapshot_reward)
            print('Max Reward: ', max(snapshot_reward))
            #snapshot_rewards[seed] = snapshot_reward
            #print('Mean Health of every snapshot: ', snapshot_health)
            print('##################################################')
            nseconds = time.time() - tstart
            print('\n')
            print('Time Elapsed:', nseconds)
            epochs = np.arange(len(snapshot_reward))  # one point per evaluated snapshot

            #plt.figure()
            #plt.plot(epochs, np.array(snapshot_reward), '-o', label = seed_list[seed])
            #plt.legend(loc = 'lower right')
            #plt.xlabel('TimeSteps (1e6)')
            #plt.ylabel('Mean Reward after 1000 episodes')
            #plt.savefig('TargetEnv_on_SourceModel with ADDA every 10 steps after 20e6.png')

        # Create plot of mean reward across all seed values with std deviations.
        # Note: this block expects snapshot_rewards[seed] to have been filled in the
        # loop above (that assignment is currently commented out).
        mean = []
        std = []
        for x, y, z in zip(snapshot_rewards[0], snapshot_rewards[1],
                           snapshot_rewards[2]):
            mean_val = np.mean([x, y, z])
            std_val = np.std([x, y, z])
            mean.append(mean_val)
            std.append(std_val)
        epochs = np.arange(len(mean))
        lower = np.array(mean) - np.array(std)
        upper = np.array(mean) + np.array(std)
        print('Mean of 3 seeds: ', mean)
        print('Std Devn of 3 seeds: ', std)
        '''
Example #21
class DQNEvaluator(Evaluator):
    
    def __init__(self, config, env_creator):
        self.config = config
        self.local_timestep = 0
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]

        if "cartpole" in self.config["env_config"]:
            self.env = env_creator(self.config["env_config"])
        else:
            self.env = wrap_deepmind(
                env_creator(self.config["env_config"]),
                clip_rewards=False, frame_stack=True, scale=True)
        self.obs = self.env.reset()

        self.sess = U.make_session()
        self.sess.__enter__()

        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph
        observation_space_shape = self.env.observation_space.shape
        def make_obs_ph(name):
            return BatchInput(observation_space_shape, name=name)

        if "cartpole" in self.config["env_config"]:
            q_func = models.mlp([64])
        else:
            q_func = models.cnn_to_mlp(
                convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                hiddens=[256],
                dueling=True,
            )

        act, self.train, self.update_target, debug = build_train(
            make_obs_ph=make_obs_ph,
            q_func=q_func,
            num_actions=self.env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.config["lr"]),
            gamma=self.config["gamma"],
            grad_norm_clipping=10,
            param_noise=False
        )

        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': q_func,
            'num_actions': self.env.action_space.n,
        }

        self.act = ActWrapper(act, act_params)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.config["exploration_fraction"] * self.config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=self.config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

    def sample(self):
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            update_eps = self.exploration.value(self.local_timestep)
            action = self.act(
                np.array(self.obs)[None], update_eps=update_eps)[0]
            obs_tp1, reward, done, _ = self.env.step(action)
            obs.append(self.obs)
            actions.append(action)
            rewards.append(np.sign(reward))
            new_obs.append(obs_tp1)
            dones.append(1.0 if done else 0.0)
            self.obs = obs_tp1
            self.episode_rewards[-1] += reward
            self.episode_lengths[-1] += 1
            if done:
                self.obs = self.env.reset()
                self.episode_rewards.append(0.0)
                self.episode_lengths.append(0.0)
            self.local_timestep += 1

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": obs, "actions": actions, "rewards": rewards,
            "new_obs": new_obs, "dones": dones,
            "weights": np.ones_like(rewards)})
        assert batch.count == self.config["sample_batch_size"]

#        td_errors = self.agent.compute_td_error(batch)
        batch.data["obs"] = [pack(o) for o in batch["obs"]]
        batch.data["new_obs"] = [pack(o) for o in batch["new_obs"]]
#        new_priorities = (
#            np.abs(td_errors) + self.config["prioritized_replay_eps"])
#        batch.data["weights"] = new_priorities

        return batch

    def compute_gradients(self, samples):
        raise NotImplementedError

    def apply_gradients(self, grads):
        raise NotImplementedError

    def compute_apply(self, samples):
        return self.train(
            samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])

    def get_weights(self):
        raise NotImplementedError

    def set_weights(self, weights):
        raise NotImplementedError

    def stats(self):
        mean_100ep_reward = round(np.mean(self.episode_rewards[-101:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-101:-1]), 5)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "local_timestep": self.local_timestep,
        }
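# ---------------------------------------------------------------------------
# Illustration (added, not part of the scraped example): the "N-step Q
# adjustments" that DQNEvaluator.sample() delegates to adjust_nstep() fold the
# next n-1 rewards into each stored transition. A minimal sketch of that idea,
# assuming the usual n-step return R_t = sum_{k=0..n-1} gamma**k * r_{t+k} and
# stopping early at terminal steps; the real helper's exact in-place semantics
# may differ.
def nstep_adjust_sketch(n, gamma, rewards, new_obs, dones):
    """Return n-step rewards/new_obs/dones for the first len(rewards)-(n-1) steps."""
    T = len(rewards) - (n - 1)
    out_r, out_obs, out_done = [], [], []
    for t in range(T):
        R, last, done = 0.0, new_obs[t], dones[t]
        for k in range(n):
            R += (gamma ** k) * rewards[t + k]          # accumulate discounted reward
            last, done = new_obs[t + k], dones[t + k]   # bootstrap from the last state reached
            if done:
                break
        out_r.append(R)
        out_obs.append(last)
        out_done.append(done)
    return out_r, out_obs, out_done

# e.g. nstep_adjust_sketch(3, 0.99, [1, 1, 1, 1], [1, 2, 3, 4], [0, 0, 0, 1])
# -> ([2.9701, 2.9701], [3, 4], [0, 1])
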
Example #22
class DDPG(object):
    @store_args
    def __init__(self,
                 input_dims,
                 buffer_size,
                 hidden,
                 layers,
                 network_class,
                 polyak,
                 batch_size,
                 Q_lr,
                 pi_lr,
                 norm_eps,
                 norm_clip,
                 max_u,
                 action_l2,
                 clip_obs,
                 scope,
                 T,
                 rollout_batch_size,
                 subtract_goals,
                 relative_goals,
                 clip_pos_returns,
                 clip_return,
                 sample_transitions,
                 gamma,
                 temperature,
                 prioritization,
                 env_name,
                 alpha,
                 beta0,
                 beta_iters,
                 eps,
                 max_timesteps,
                 rank_method,
                 reuse=False,
                 **kwargs):
        """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

        Args:
            input_dims (dict of ints): dimensions for the observation (o), the goal (g), and the
                actions (u)
            buffer_size (int): number of transitions that are stored in the replay buffer
            hidden (int): number of units in the hidden layers
            layers (int): number of hidden layers
            network_class (str): the network class that should be used (e.g. 'baselines.her.ActorCritic')
            polyak (float): coefficient for Polyak-averaging of the target network
            batch_size (int): batch size for training
            Q_lr (float): learning rate for the Q (critic) network
            pi_lr (float): learning rate for the pi (actor) network
            norm_eps (float): a small value used in the normalizer to avoid numerical instabilities
            norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
            max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
            action_l2 (float): coefficient for L2 penalty on the actions
            clip_obs (float): clip observations before normalization to be in [-clip_obs, clip_obs]
            scope (str): the scope used for the TensorFlow graph
            T (int): the time horizon for rollouts
            rollout_batch_size (int): number of parallel rollouts per DDPG agent
            subtract_goals (function): function that subtracts goals from each other
            relative_goals (boolean): whether or not relative goals should be fed into the network
            clip_pos_returns (boolean): whether or not positive returns should be clipped
            clip_return (float): clip returns to be in [-clip_return, clip_return]
            sample_transitions (function): function that samples from the replay buffer
            gamma (float): gamma used for Q learning updates
            reuse (boolean): whether or not the networks should be reused
        """
        if self.clip_return is None:
            self.clip_return = np.inf

        self.create_actor_critic = import_function(self.network_class)

        input_shapes = dims_to_shapes(self.input_dims)
        self.dimo = self.input_dims['o']
        self.dimg = self.input_dims['g']
        self.dimu = self.input_dims['u']

        self.prioritization = prioritization
        self.env_name = env_name
        self.temperature = temperature
        self.rank_method = rank_method

        # Prepare staging area for feeding data to the model.
        stage_shapes = OrderedDict()
        for key in sorted(self.input_dims.keys()):
            if key.startswith('info_'):
                continue
            stage_shapes[key] = (None, *input_shapes[key])
        for key in ['o', 'g']:
            stage_shapes[key + '_2'] = stage_shapes[key]
        stage_shapes['r'] = (None, )
        stage_shapes['w'] = (None, )
        self.stage_shapes = stage_shapes

        # Create network.
        with tf.variable_scope(self.scope):
            self.staging_tf = StagingArea(
                dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
                shapes=list(self.stage_shapes.values()))
            self.buffer_ph_tf = [
                tf.placeholder(tf.float32, shape=shape)
                for shape in self.stage_shapes.values()
            ]
            self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

            self._create_network(reuse=reuse)

        # Configure the replay buffer.
        buffer_shapes = {
            key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
            for key, val in input_shapes.items()
        }
        buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
        buffer_shapes['ag'] = (self.T + 1, self.dimg)
        buffer_size = (self.buffer_size //
                       self.rollout_batch_size) * self.rollout_batch_size

        if self.prioritization == 'energy':
            self.buffer = ReplayBufferEnergy(buffer_shapes, buffer_size,
                                             self.T, self.sample_transitions,
                                             self.prioritization,
                                             self.env_name)
        elif self.prioritization == 'tderror':
            self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size,
                                                  self.T,
                                                  self.sample_transitions,
                                                  alpha, self.env_name)
            if beta_iters is None:
                beta_iters = max_timesteps
            self.beta_schedule = LinearSchedule(beta_iters,
                                                initial_p=beta0,
                                                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T,
                                       self.sample_transitions)

    def _random_action(self, n):
        return np.random.uniform(low=-self.max_u,
                                 high=self.max_u,
                                 size=(n, self.dimu))

    def _preprocess_og(self, o, ag, g):
        if self.relative_goals:
            g_shape = g.shape
            g = g.reshape(-1, self.dimg)
            ag = ag.reshape(-1, self.dimg)
            g = self.subtract_goals(g, ag)
            g = g.reshape(*g_shape)
        o = np.clip(o, -self.clip_obs, self.clip_obs)
        g = np.clip(g, -self.clip_obs, self.clip_obs)
        return o, g

    def get_actions(self,
                    o,
                    ag,
                    g,
                    noise_eps=0.,
                    random_eps=0.,
                    use_target_net=False,
                    compute_Q=False):
        o, g = self._preprocess_og(o, ag, g)
        policy = self.target if use_target_net else self.main
        # values to compute
        vals = [policy.pi_tf]
        if compute_Q:
            vals += [policy.Q_pi_tf]
        # feed
        feed = {
            policy.o_tf:
            o.reshape(-1, self.dimo),
            policy.g_tf:
            g.reshape(-1, self.dimg),
            policy.u_tf:
            np.zeros((o.size // self.dimo, self.dimu), dtype=np.float32)
        }

        ret = self.sess.run(vals, feed_dict=feed)

        # action postprocessing
        u = ret[0]
        noise = noise_eps * self.max_u * np.random.randn(
            *u.shape)  # gaussian noise
        u += noise
        u = np.clip(u, -self.max_u, self.max_u)
        u += np.random.binomial(1, random_eps, u.shape[0]).reshape(-1, 1) * (
            self._random_action(u.shape[0]) - u)  # eps-greedy
        if u.shape[0] == 1:
            u = u[0]
        u = u.copy()
        ret[0] = u

        if len(ret) == 1:
            return ret[0]
        else:
            return ret

    def get_td_errors(self, o, g, u):
        o, g = self._preprocess_og(o, g, g)
        vals = [self.td_error_tf]
        r = np.ones((o.reshape(-1, self.dimo).shape[0], 1))

        feed = {
            self.target.o_tf: o.reshape(-1, self.dimo),
            self.target.g_tf: g.reshape(-1, self.dimg),
            self.bath_tf_r: r,
            self.main.o_tf: o.reshape(-1, self.dimo),
            self.main.g_tf: g.reshape(-1, self.dimg),
            self.main.u_tf: u.reshape(-1, self.dimu)
        }
        td_errors = self.sess.run(vals, feed_dict=feed)
        td_errors = td_errors.copy()

        return td_errors

    def store_episode(self,
                      episode_batch,
                      dump_buffer,
                      w_potential,
                      w_linear,
                      w_rotational,
                      rank_method,
                      clip_energy,
                      update_stats=True):
        """
        episode_batch: array of batch_size x (T or T+1) x dim_key
                       'o' is of size T+1, others are of size T
        """
        if self.prioritization == 'tderror':
            self.buffer.store_episode(episode_batch, dump_buffer)
        elif self.prioritization == 'energy':
            self.buffer.store_episode(episode_batch, w_potential, w_linear,
                                      w_rotational, rank_method, clip_energy)
        else:
            self.buffer.store_episode(episode_batch)

        if update_stats:
            # add transitions to normalizer
            episode_batch['o_2'] = episode_batch['o'][:, 1:, :]
            episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :]
            num_normalizing_transitions = transitions_in_episode_batch(
                episode_batch)

            if self.prioritization == 'energy':
                if self.buffer.current_size != 0 and len(episode_batch['ag']) != 0:
                    transitions = self.sample_transitions(
                        episode_batch, num_normalizing_transitions, 'none',
                        1.0, True)
            elif self.prioritization == 'tderror':
                transitions, weights, episode_idxs = \
                self.sample_transitions(self.buffer, episode_batch, num_normalizing_transitions, beta=0)
            else:
                transitions = self.sample_transitions(
                    episode_batch, num_normalizing_transitions)

            o, o_2, g, ag = transitions['o'], transitions['o_2'], transitions[
                'g'], transitions['ag']
            transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)

            self.o_stats.update(transitions['o'])
            self.g_stats.update(transitions['g'])

            self.o_stats.recompute_stats()
            self.g_stats.recompute_stats()

    def get_current_buffer_size(self):
        return self.buffer.get_current_size()

    def dump_buffer(self, epoch):
        self.buffer.dump_buffer(epoch)

    def _sync_optimizers(self):
        self.Q_adam.sync()
        self.pi_adam.sync()

    def _grads(self):
        # Avoid feed_dict here for performance!
        critic_loss, actor_loss, Q_grad, pi_grad, td_error = self.sess.run([
            self.Q_loss_tf, self.main.Q_pi_tf, self.Q_grad_tf, self.pi_grad_tf,
            self.td_error_tf
        ])
        return critic_loss, actor_loss, Q_grad, pi_grad, td_error

    def _update(self, Q_grad, pi_grad):
        self.Q_adam.update(Q_grad, self.Q_lr)
        self.pi_adam.update(pi_grad, self.pi_lr)

    def sample_batch(self, t):

        if self.prioritization == 'energy':
            transitions = self.buffer.sample(self.batch_size,
                                             self.rank_method,
                                             temperature=self.temperature)
            weights = np.ones_like(transitions['r']).copy()
        elif self.prioritization == 'tderror':
            transitions, weights, idxs = self.buffer.sample(
                self.batch_size, beta=self.beta_schedule.value(t))
        else:
            transitions = self.buffer.sample(self.batch_size)
            weights = np.ones_like(transitions['r']).copy()

        o, o_2, g = transitions['o'], transitions['o_2'], transitions['g']
        ag, ag_2 = transitions['ag'], transitions['ag_2']
        transitions['o'], transitions['g'] = self._preprocess_og(o, ag, g)
        transitions['o_2'], transitions['g_2'] = self._preprocess_og(
            o_2, ag_2, g)

        transitions['w'] = weights.flatten().copy()  # note: ordered dict
        transitions_batch = [
            transitions[key] for key in self.stage_shapes.keys()
        ]

        if self.prioritization == 'tderror':
            return (transitions_batch, idxs)
        else:
            return transitions_batch

    def stage_batch(self, t, batch=None):  #
        if batch is None:
            if self.prioritization == 'tderror':
                batch, idxs = self.sample_batch(t)
            else:
                batch = self.sample_batch(t)
        assert len(self.buffer_ph_tf) == len(batch)
        self.sess.run(self.stage_op,
                      feed_dict=dict(zip(self.buffer_ph_tf, batch)))

        if self.prioritization == 'tderror':
            return idxs

    def train(self, t, dump_buffer, stage=True):
        if self.buffer.current_size != 0:
            if stage:
                if self.prioritization == 'tderror':
                    idxs = self.stage_batch(t)
                else:
                    self.stage_batch(t)
            critic_loss, actor_loss, Q_grad, pi_grad, td_error = self._grads()
            if self.prioritization == 'tderror':
                new_priorities = np.abs(td_error) + self.eps  # td_error

                if dump_buffer:
                    T = self.buffer.buffers['u'].shape[1]
                    episode_idxs = idxs // T
                    t_samples = idxs % T
                    batch_size = td_error.shape[0]
                    with self.buffer.lock:
                        for i in range(batch_size):
                            self.buffer.buffers['td'][episode_idxs[i]][
                                t_samples[i]] = td_error[i]

                self.buffer.update_priorities(idxs, new_priorities)
            self._update(Q_grad, pi_grad)
            return critic_loss, actor_loss

    def _init_target_net(self):
        self.sess.run(self.init_target_net_op)

    def update_target_net(self):
        self.sess.run(self.update_target_net_op)

    def clear_buffer(self):
        self.buffer.clear_buffer()

    def _vars(self, scope):
        res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope=self.scope + '/' + scope)
        assert len(res) > 0
        return res

    def _global_vars(self, scope):
        res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope=self.scope + '/' + scope)
        return res

    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))

        self.sess = tf.get_default_session()
        if self.sess is None:
            self.sess = tf.InteractiveSession()

        # running averages
        with tf.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)
        with tf.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg,
                                      self.norm_eps,
                                      self.norm_clip,
                                      sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([
            (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())
        ])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])
        batch_tf['w'] = tf.reshape(batch_tf['w'], [-1, 1])

        # networks
        with tf.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf,
                                                 net_type='main',
                                                 **self.__dict__)
            vs.reuse_variables()
        with tf.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(target_batch_tf,
                                                   net_type='target',
                                                   **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(
            batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
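        # target_tf is the one-step TD target r + gamma * Q(s', pi(s')), clipped to
        # clip_range; the TD error below is (target - Q(s, a)) with the target held
        # fixed via stop_gradient.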

        self.td_error_tf = tf.stop_gradient(target_tf) - self.main.Q_tf
        self.errors_tf = tf.square(self.td_error_tf)
        self.errors_tf = tf.reduce_mean(batch_tf['w'] * self.errors_tf)
        self.Q_loss_tf = tf.reduce_mean(self.errors_tf)

        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))
        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'),
                               scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars(
            'g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()

    def logs(self, prefix=''):
        logs = []
        logs += [('stats_o/mean', np.mean(self.sess.run([self.o_stats.mean])))]
        logs += [('stats_o/std', np.mean(self.sess.run([self.o_stats.std])))]
        logs += [('stats_g/mean', np.mean(self.sess.run([self.g_stats.mean])))]
        logs += [('stats_g/std', np.mean(self.sess.run([self.g_stats.std])))]

        if prefix != '' and not prefix.endswith('/'):
            return [(prefix + '/' + key, val) for key, val in logs]
        else:
            return logs

    def __getstate__(self):
        """Our policies can be loaded from pkl, but after unpickling you cannot continue training.
        """
        excluded_subnames = [
            '_tf', '_op', '_vars', '_adam', 'buffer', 'sess', '_stats', 'main',
            'target', 'lock', 'env', 'sample_transitions', 'stage_shapes',
            'create_actor_critic'
        ]

        state = {
            k: v
            for k, v in self.__dict__.items()
            if all(subname not in k for subname in excluded_subnames)
        }
        state['buffer_size'] = self.buffer_size
        state['tf'] = self.sess.run(
            [x for x in self._global_vars('') if 'buffer' not in x.name])
        return state

    def __setstate__(self, state):
        if 'sample_transitions' not in state:
            # We don't need this for playing the policy.
            state['sample_transitions'] = None
        state['env_name'] = None  # No need for playing the policy

        self.__init__(**state)
        # set up stats (they are overwritten in __init__)
        for k, v in state.items():
            if k[-6:] == '_stats':
                self.__dict__[k] = v
        # load TF variables
        vars = [x for x in self._global_vars('') if 'buffer' not in x.name]
        assert (len(vars) == len(state["tf"]))
        node = [tf.assign(var, val) for var, val in zip(vars, state["tf"])]
        self.sess.run(node)
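
# ---------------------------------------------------------------------------
# Illustration (added): the target-network ops built in DDPG._create_network()
# above implement Polyak averaging,
#     theta_target <- polyak * theta_target + (1 - polyak) * theta_main,
# while init_target_net_op simply copies the main weights once. A minimal
# NumPy sketch of the same update, assuming plain arrays in place of the TF
# variables used above:
import numpy as np

def polyak_update(target_params, main_params, polyak=0.95):
    """Move every target parameter a small step towards its main counterpart."""
    return [polyak * t + (1.0 - polyak) * m
            for t, m in zip(target_params, main_params)]

# With polyak=0.95 the target keeps 95% of its old value per call, so it tracks
# the main network over an effective horizon of roughly 1 / (1 - polyak) = 20 updates.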
Example #23
def learn(env,
          q_func,
          num_actions=64 * 64,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

  Parameters
  -------
  env: pysc2.env.SC2Env
      environment to train on
  q_func: (tf.Variable, int, str, bool) -> tf.Variable
      the model that takes the following inputs:
          observation_in: object
              the output of observation placeholder
          num_actions: int
              number of actions
          scope: str
          reuse: bool
              should be passed to outer variable scope
      and returns a tensor of shape (batch_size, num_actions) with values of every action.
  lr: float
      learning rate for adam optimizer
  max_timesteps: int
      number of env steps to optimize for
  buffer_size: int
      size of the replay buffer
  exploration_fraction: float
      fraction of entire training period over which the exploration rate is annealed
  exploration_final_eps: float
      final value of random action probability
  train_freq: int
      update the model every `train_freq` steps.
  batch_size: int
      size of a batched sampled from replay buffer for training
  print_freq: int
      how often to print out training progress
      set to None to disable printing
  checkpoint_freq: int
      how often to save the model. This is so that the best version is restored
      at the end of the training. If you do not wish to restore the best version at
      the end of the training set this variable to None.
  learning_starts: int
      how many steps of the model to collect transitions for before learning starts
  gamma: float
      discount factor
  target_network_update_freq: int
      update the target network every `target_network_update_freq` steps.
  prioritized_replay: bool
      if True prioritized replay buffer will be used.
  prioritized_replay_alpha: float
      alpha parameter for prioritized replay buffer
  prioritized_replay_beta0: float
      initial value of beta for prioritized replay buffer
  prioritized_replay_beta_iters: int
      number of iterations over which beta will be annealed from initial value
      to 1.0. If set to None, it defaults to max_timesteps.
  prioritized_replay_eps: float
      epsilon to add to the TD errors when updating priorities.
  num_cpu: int
      number of cpus to use for training
  callback: (locals, globals) -> None
      function called at every step with the state of the algorithm.
      If callback returns true training stops.

  Returns
  -------
  act: ActWrapper
      Wrapper over act function. Adds ability to save it and load it.
      See header of baselines/deepq/categorical.py for details on the act function.
  """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    episode_minerals = [0.0]
    saved_mean_reward = None

    path_memory = np.zeros((64, 64))

    obs = env.reset()
    # Select all marines first
    step_result = env.step(
        actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    obs = player_relative + path_memory

    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
    player = [int(player_x.mean()), int(player_y.mean())]

    if (player[0] > 32):
        obs = shift(LEFT, player[0] - 32, obs)
    elif (player[0] < 32):
        obs = shift(RIGHT, 32 - player[0], obs)

    if (player[1] > 32):
        obs = shift(UP, player[1] - 32, obs)
    elif (player[1] < 32):
        obs = shift(DOWN, 32 - player[1], obs)

    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
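                    # (For eps-greedy with eps = exploration.value(t),
                    #  KL(greedy || eps-greedy) = -log(1 - eps + eps / num_actions),
                    #  which is exactly the quantity computed above.)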
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            reset = False

            coord = [player[0], player[1]]
            rew = 0

            path_memory_ = np.array(path_memory, copy=True)
            if (action == 0):  #UP

                if (player[1] >= 16):
                    coord = [player[0], player[1] - 16]
                    path_memory_[player[1] - 16:player[1], player[0]] = -1
                elif (player[1] > 0):
                    coord = [player[0], 0]
                    path_memory_[0:player[1], player[0]] = -1
                else:
                    rew -= 1

            elif (action == 1):  #DOWN

                if (player[1] <= 47):
                    coord = [player[0], player[1] + 16]
                    path_memory_[player[1]:player[1] + 16, player[0]] = -1
                elif (player[1] > 47):
                    coord = [player[0], 63]
                    path_memory_[player[1]:63, player[0]] = -1
                else:
                    rew -= 1

            elif (action == 2):  #LEFT

                if (player[0] >= 16):
                    coord = [player[0] - 16, player[1]]
                    path_memory_[player[1], player[0] - 16:player[0]] = -1
                elif (player[0] < 16):
                    coord = [0, player[1]]
                    path_memory_[player[1], 0:player[0]] = -1
                else:
                    rew -= 1

            elif (action == 3):  #RIGHT

                if (player[0] <= 47):
                    coord = [player[0] + 16, player[1]]
                    path_memory_[player[1], player[0]:player[0] + 16] = -1
                elif (player[0] > 47):
                    coord = [63, player[1]]
                    path_memory_[player[1], player[0]:63] = -1
                else:
                    rew -= 1

            else:
                #Cannot move, give minus reward
                rew -= 1

            if (path_memory[coord[1], coord[0]] != 0):
                rew -= 0.5

            path_memory = np.array(path_memory_)
            #print("action : %s Coord : %s" % (action, coord))

            new_action = [
                sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
            ]

            step_result = env.step(actions=new_action)

            player_relative = step_result[0].observation["screen"][
                _PLAYER_RELATIVE]
            new_obs = player_relative + path_memory

            player_y, player_x = (
                player_relative == _PLAYER_FRIENDLY).nonzero()
            player = [int(player_x.mean()), int(player_y.mean())]

            if (player[0] > 32):
                new_obs = shift(LEFT, player[0] - 32, new_obs)
            elif (player[0] < 32):
                new_obs = shift(RIGHT, 32 - player[0], new_obs)

            if (player[1] > 32):
                new_obs = shift(UP, player[1] - 32, new_obs)
            elif (player[1] < 32):
                new_obs = shift(DOWN, 32 - player[1], new_obs)

            rew += step_result[0].reward * 10

            done = step_result[0].step_type == environment.StepType.LAST

            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            episode_minerals[-1] += step_result[0].reward

            if done:
                obs = env.reset()
                player_relative = obs[0].observation["screen"][
                    _PLAYER_RELATIVE]

                obs = player_relative + path_memory

                player_y, player_x = (
                    player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                if (player[0] > 32):
                    obs = shift(LEFT, player[0] - 32, obs)
                elif (player[0] < 32):
                    obs = shift(RIGHT, 32 - player[0], obs)

                if (player[1] > 32):
                    obs = shift(UP, player[1] - 32, obs)
                elif (player[1] < 32):
                    obs = shift(DOWN, 32 - player[1], obs)

                # Select all marines first
                env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
                episode_rewards.append(0.0)
                episode_minerals.append(0.0)

                path_memory = np.zeros((64, 64))

                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            mean_100ep_mineral = round(np.mean(episode_minerals[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("mean 100 episode mineral",
                                      mean_100ep_mineral)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act)
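
# ---------------------------------------------------------------------------
# Illustration (added): each snippet on this page anneals exploration (or the
# prioritized-replay beta) with LinearSchedule(schedule_timesteps, initial_p,
# final_p). A minimal re-implementation consistent with how it is used here --
# linear interpolation that saturates at final_p -- is sketched below; the
# actual class lives in baselines.common.schedules.
def linear_schedule_value(t, schedule_timesteps, initial_p=1.0, final_p=0.02):
    """Value of a linearly annealed quantity at step t."""
    fraction = min(float(t) / schedule_timesteps, 1.0)
    return initial_p + fraction * (final_p - initial_p)

# e.g. with schedule_timesteps=10000: t=0 -> 1.0, t=5000 -> 0.51, t >= 10000 -> 0.02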
Example #24
class DeepqLearner:
    def __init__(self, env, q_func, config=DEEPQ_CONFIG, callback=None):
        self.env = env
        self.q_func = q_func
        self.config = config
        self.callback = callback

        # Create all the functions necessary to train the model
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=config["gpu_memory_fraction"])
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        sess.__enter__()

        # capture the shape outside the closure so that the env object is not serialized
        # by cloudpickle when serializing make_obs_ph

        def make_obs_ph(name):
            return ObservationInput(env.observation_space, name=name)

        act, self.train, self.update_target, self.debug = deepq.build_train(
            make_obs_ph=make_obs_ph,
            q_func=q_func,
            num_actions=env.action_space.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=config["lr"]),
            gamma=config["gamma"],
            grad_norm_clipping=10,
            param_noise=config["param_noise"])

        act_params = {
            # 'make_obs_ph': make_obs_ph,
            # 'q_func': q_func,
            'num_actions': env.action_space.n,
        }

        self.act = ActWrapper(act, act_params)

        # Create the replay buffer
        self.config = config
        self.replay_buffer = None
        self.beta_schedule = None
        self.make_replay_buffer()

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        U.initialize()
        self.update_target()

        self.t = 0
        self.episode_rewards = [0.0]
        self.num_episodes = 1
        self.saved_mean_reward = None
        self.saved_episode_num = None
        self.episode_frames = 0
        self.model_file = None
        self.start_time = 0
        self.episode_start_time = 0

    def make_replay_buffer(self):
        if self.config["prioritized_replay"]:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.config["buffer_size"],
                alpha=self.config["prioritized_replay_alpha"])
            if self.config["prioritized_replay_beta_iters"] is None:
                self.config["prioritized_replay_beta_iters"] = self.config[
                    "max_timesteps"]
            self.beta_schedule = LinearSchedule(
                self.config["prioritized_replay_beta_iters"],
                initial_p=self.config["prioritized_replay_beta0"],
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.config["buffer_size"])
            self.beta_schedule = None

    def run(self):
        reset = True
        obs = self.env.reset()
        self.start_time = time.time()
        self.episode_start_time = time.time()

        with tempfile.TemporaryDirectory() as td:
            td = self.config["checkpoint_path"] or td

            self.model_file = os.path.join(td, "model")
            if tf.train.latest_checkpoint(td) is not None:
                load_state(self.model_file)
                logger.log('Loaded model from {}'.format(self.model_file))

            for self.t in range(self.config["max_timesteps"]):
                if self.callback is not None:
                    if self.callback(locals(), globals()):
                        break

                # Determine next action to take, then take that action and observe results
                action = self._action(obs, reset)
                env_action = action
                new_obs, rew, done, _ = self.env.step(env_action)
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                # Increment typical values
                reset = False
                self.episode_frames += 1
                self.episode_rewards[-1] += rew

                # See if done with episode
                if done:
                    obs = self._reset()
                    reset = True

                # Do training and deepq updating as needed
                if self.t > self.config["learning_starts"]:
                    if self.t % self.config["train_freq"] == 0:
                        self._train()
                    if self.t % self.config["target_network_update_freq"] == 0:
                        self.update_target()

    def _action(self, obs, reset):
        # Take action and update exploration to the newest value
        kwargs = {}
        if not self.config["param_noise"]:
            update_eps = self.exploration.value(self.t)
            # update_param_noise_threshold = 0.
        else:
            update_eps = 0.
            # Compute the threshold such that the KL divergence between perturbed and non-perturbed
            # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
            # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
            # for detailed explanation.
            update_param_noise_threshold = -np.log(
                1. - self.exploration.value(self.t) +
                self.exploration.value(self.t) /
                float(self.env.action_space.n))
            kwargs['reset'] = reset
            kwargs[
                'update_param_noise_threshold'] = update_param_noise_threshold
            kwargs['update_param_noise_scale'] = True
        return self.act(np.array(obs)[None], update_eps=update_eps,
                        **kwargs)[0]

    def _train(self):
        try:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if self.config["prioritized_replay"]:
                experience = self.replay_buffer.sample(
                    self.config["batch_size"],
                    beta=self.beta_schedule.value(self.t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                    self.config["batch_size"])
                weights, batch_idxes = np.ones_like(rewards), None

            # Determine errors
            td_errors = self.train(obses_t, actions, rewards, obses_tp1, dones,
                                   weights)
            if self.config["prioritized_replay"]:
                new_priorities = np.abs(
                    td_errors) + self.config["prioritized_replay_eps"]
                self.replay_buffer.update_priorities(batch_idxes,
                                                     new_priorities)
        except Exception as e:
            self.make_replay_buffer()
            print(e)

    def _reset(self):
        self.attempt_print()
        self.attempt_checkpoint()
        self.episode_rewards.append(0.0)
        self.num_episodes += 1
        self.episode_frames = 0
        self.episode_start_time = time.time()

        return self.env.reset()

    def calc_mean_100ep_reward(self):
        if self.num_episodes <= 1:
            return None
        return round(np.mean(self.episode_rewards[-101:-1]), 1)

    def attempt_print(self):
        p_freq = self.config["print_freq"]
        if p_freq is not None and self.num_episodes % p_freq == 0:
            logger.record_tabular("% time spent exploring",
                                  int(100 * self.exploration.value(self.t)))
            logger.record_tabular("reward - current", self.episode_rewards[-1])
            logger.record_tabular("reward - mean",
                                  self.calc_mean_100ep_reward())
            logger.record_tabular("reward - saved", self.saved_mean_reward)
            logger.record_tabular("episode # - current", self.num_episodes)
            logger.record_tabular("episode # - saved", self.saved_episode_num)
            logger.record_tabular("steps - total", self.t)
            logger.record_tabular("steps - episode", self.episode_frames)
            logger.record_tabular(
                "time - ep duration",
                str(time.time() - self.episode_start_time) + "s")
            logger.record_tabular("time - remaining",
                                  self.estimate_time_remaining())
            logger.dump_tabular()

    def estimate_time_remaining(self):
        duration = time.time() - self.start_time
        if duration <= 0 or self.t <= 0:
            return "Unknown"

        # Extrapolate: remaining steps divided by steps completed per second so far.
        time_remaining = (self.config["max_timesteps"] -
                          self.t) * duration / self.t
        suffix = ""

        # Format based on time
        if time_remaining < MINUTE:
            suffix = " seconds"
        elif time_remaining < HOUR:
            suffix = " minutes"
            time_remaining = time_remaining / MINUTE
        elif time_remaining < DAY:
            suffix = " hours"
            time_remaining = time_remaining / HOUR
        else:
            suffix = " days"
            time_remaining = time_remaining / DAY

        # Round remaining time and return
        time_remaining = round(time_remaining * 100.0) / 100.0
        return str(time_remaining) + suffix

    def attempt_checkpoint(self):
        # Determine if we're going to checkpoint
        c_freq = self.config["checkpoint_freq"]
        if c_freq is not None \
                and self.num_episodes > 100 \
                and self.t > self.config["learning_starts"] \
                and self.num_episodes % c_freq == 0:

            # Determine if reward is growing
            mean_100ep_reward = self.calc_mean_100ep_reward()
            if self.saved_mean_reward is None or mean_100ep_reward > self.saved_mean_reward:
                if self.config["print_freq"] is not None:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".
                        format(self.saved_mean_reward, mean_100ep_reward))
                self.saved_mean_reward = mean_100ep_reward
                self.saved_episode_num = self.num_episodes
                save_state(self.model_file)

    def save(self, save_path):
        print("Saving model to " + save_path)
        self.act.save(save_path)
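
For orientation, here is a minimal, self-contained sketch of the epsilon-greedy behaviour that _action implements when param_noise is disabled. SimpleLinearSchedule mirrors the linear annealing done by baselines' LinearSchedule; eps_greedy_action and the random placeholder Q-values are illustrative stand-ins, not part of the class above.

import numpy as np

class SimpleLinearSchedule:
    """Linearly anneal from initial_p to final_p over schedule_timesteps, then stay at final_p."""
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

def eps_greedy_action(q_values, eps, rng=np.random):
    """With probability eps pick a random action, otherwise the greedy one."""
    if rng.random() < eps:
        return rng.randint(len(q_values))
    return int(np.argmax(q_values))

# Exploration decays exactly as in _action when param_noise is off.
exploration = SimpleLinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
for t in (0, 5000, 20000):
    q = np.random.randn(4)  # stand-in for the Q-values returned by act(obs)
    print(t, round(exploration.value(t), 3), eps_greedy_action(q, exploration.value(t)))
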
def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
    """Train a deepq model.

  Parameters
  -------
  env: pysc2.env.SC2Env
      environment to train on
  q_func: (tf.Variable, int, str, bool) -> tf.Variable
      the model that takes the following inputs:
          observation_in: object
              the output of observation placeholder
          num_actions: int
              number of actions
          scope: str
          reuse: bool
              should be passed to outer variable scope
      and returns a tensor of shape (batch_size, num_actions) with values of every action.
  lr: float
      learning rate for adam optimizer
  max_timesteps: int
      number of env steps to optimize for
  buffer_size: int
      size of the replay buffer
  exploration_fraction: float
      fraction of entire training period over which the exploration rate is annealed
  exploration_final_eps: float
      final value of random action probability
  train_freq: int
      update the model every `train_freq` steps.
  batch_size: int
      size of a batch sampled from the replay buffer for training
  print_freq: int
      how often to print out training progress
      set to None to disable printing
  checkpoint_freq: int
      how often to save the model. This is so that the best version is restored
      at the end of the training. If you do not wish to restore the best version at
      the end of the training set this variable to None.
  learning_starts: int
      how many steps of the model to collect transitions for before learning starts
  gamma: float
      discount factor
  target_network_update_freq: int
      update the target network every `target_network_update_freq` steps.
  prioritized_replay: bool
      if True prioritized replay buffer will be used.
  prioritized_replay_alpha: float
      alpha parameter for prioritized replay buffer
  prioritized_replay_beta0: float
      initial value of beta for prioritized replay buffer
  prioritized_replay_beta_iters: int
      number of iterations over which beta will be annealed from initial value
      to 1.0. If set to None, it defaults to max_timesteps.
  prioritized_replay_eps: float
      epsilon to add to the TD errors when updating priorities.
  num_cpu: int
      number of cpus to use for training
  callback: (locals, globals) -> None
      function called at every step with the state of the algorithm.
      If callback returns true, training stops.

  Returns
  -------
  act: ActWrapper
      Wrapper over act function. Adds ability to save it and load it.
      See header of baselines/deepq/categorical.py for details on the act function.
  """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((16, 16), name=name)

    act_x, train_x, update_target_x, debug_x = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope='deep_x')

    act_y, train_y, update_target_y, debug_y = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        scope='deep_y')

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer_x = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        replay_buffer_y = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)

        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule_x = LinearSchedule(prioritized_replay_beta_iters,
                                         initial_p=prioritized_replay_beta0,
                                         final_p=1.0)

        beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
                                         initial_p=prioritized_replay_beta0,
                                         final_p=1.0)
    else:
        replay_buffer_x = ReplayBuffer(buffer_size)
        replay_buffer_y = ReplayBuffer(buffer_size)

        beta_schedule_x = None
        beta_schedule_y = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    U.initialize()
    update_target_x()
    update_target_y()

    episode_rewards = [0.0]
    episode_beacons = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    # Select marines
    obs = env.step(
        actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

    player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
    player = [int(player_x.mean()), int(player_y.mean())]
    #print(np.array(screen)[None].shape)

    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join("model/", "mineral_shards")
        print(model_file)

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            #print(np.array(screen)[None].shape)
            action_x = act_x(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]

            action_y = act_y(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]

            reset = False

            coord = [action_x, action_y]
            rew = 0

            change_x = coord[0] - player[0]
            change_y = coord[1] - player[1]
            change_m = np.sqrt((change_x**2) + (change_y**2))
            #print(change_y, change_x, change_m)

            # action 0-3
            # path_memory = np.array(path_memory_) # at end of action, edit path_memory
            if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
                obs = env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
            else:
                new_action = [
                    sc2_actions.FunctionCall(_MOVE_SCREEN,
                                             [_NOT_QUEUED, coord])
                ]
                obs = env.step(actions=new_action)

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

            player_y, player_x = (
                player_relative == _PLAYER_FRIENDLY).nonzero()
            player = [int(player_x.mean()), int(player_y.mean())]

            rew = obs[0].reward * 10

            done = obs[0].step_type == environment.StepType.LAST

            replay_buffer_x.add(screen, action_x, rew, new_screen, float(done))
            replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

            screen = new_screen

            episode_rewards[-1] += rew
            episode_beacons[-1] += obs[0].reward
            reward = episode_rewards[-1]

            if done:
                obs = env.reset()
                player_relative = obs[0].observation["screen"][
                    _PLAYER_RELATIVE]
                screen = (player_relative == _PLAYER_NEUTRAL).astype(int)

                player_y, player_x = (
                    player_relative == _PLAYER_FRIENDLY).nonzero()
                player = [int(player_x.mean()), int(player_y.mean())]

                env.step(actions=[
                    sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
                ])
                episode_rewards.append(0.0)
                episode_beacons.append(0.0)

                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:

                    experience_x = replay_buffer_x.sample(
                        batch_size, beta=beta_schedule_x.value(t))
                    (obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x,
                     weights_x, batch_idxes_x) = experience_x

                    experience_y = replay_buffer_y.sample(
                        batch_size, beta=beta_schedule_y.value(t))
                    (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y,
                     weights_y, batch_idxes_y) = experience_y

                else:

                    obses_t_x, actions_x, rewards_x, obses_tp1_x, dones_x = replay_buffer_x.sample(
                        batch_size)
                    weights_x, batch_idxes_x = np.ones_like(rewards_x), None

                    obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(
                        batch_size)
                    weights_y, batch_idxes_y = np.ones_like(rewards_y), None

                td_errors_x = train_x(obses_t_x, actions_x, rewards_x,
                                      obses_tp1_x, dones_x, weights_x)

                td_errors_y = train_y(obses_t_y, actions_y, rewards_y,
                                      obses_tp1_y, dones_y, weights_y)

                if prioritized_replay:
                    new_priorities_x = np.abs(
                        td_errors_x) + prioritized_replay_eps
                    new_priorities_y = np.abs(
                        td_errors_y) + prioritized_replay_eps
                    replay_buffer_x.update_priorities(batch_idxes_x,
                                                      new_priorities_x)
                    replay_buffer_y.update_priorities(batch_idxes_y,
                                                      new_priorities_y)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target_x()
                update_target_y()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            mean_100ep_beacon = round(np.mean(episode_beacons[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("mean 100 episode beacon",
                                      mean_100ep_beacon)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act_x), ActWrapper(act_y)
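
The example above factors the 2-D move target into two independent Q-networks, one over the x coordinate and one over the y coordinate of the 16x16 screen, querying both with the same epsilon. A rough sketch of that combination step; pick_screen_coord is a hypothetical helper and the random vectors stand in for the outputs of act_x / act_y:

import numpy as np

def pick_screen_coord(q_x, q_y, eps, screen_size=16, rng=np.random):
    """Combine two independent 1-D action heads (x and y) into one 2-D screen coordinate."""
    if rng.random() < eps:
        return rng.randint(screen_size), rng.randint(screen_size)
    return int(np.argmax(q_x)), int(np.argmax(q_y))

q_x, q_y = np.random.randn(16), np.random.randn(16)   # stand-ins for the two Q-heads
print("move target:", pick_screen_coord(q_x, q_y, eps=0.1))
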
示例#26
class LineModel_PPO1:
    REWARD_RIVAL_DMG = 250

    def __init__(
            self,
            statesize,
            actionsize,
            hero,
            ob,
            ac,
            policy_func=None,
            update_target_period=100,
            scope="ppo1",
            schedule_timesteps=10000,
            initial_p=0,
            final_p=0,
            gamma=0.99,
            lam=0.95,
            optim_epochs=4,
            optim_stepsize=1e-3,  # optimization hypers
            schedule='linear',
            max_timesteps=40e6):
        self.act = None
        self.train = None
        self.update_target = None
        self.debug = None

        self.state_size = statesize
        self.action_size = actionsize  # 50 = 8*move + 10*attack + 10*skill1 + 10*skill2 + 10*skill3 + recall + hold
        self.gamma = gamma  # discount rate
        self.lam = lam
        self.hero = hero
        self.scope = scope

        # TODO: heroes 1 and 2 have a basic-attack range of 2; revisit this later
        self.att_dist = 2

        self.act_times = 0
        self.train_times = 0
        self.update_target_period = update_target_period

        # Initialize history arrays
        self.obs = np.array([ob for _ in range(update_target_period)])
        self.rews = np.zeros(update_target_period, 'float32')
        self.vpreds = np.zeros(update_target_period, 'float32')
        self.news = np.zeros(update_target_period, 'int32')
        self.acs = np.array([ac for _ in range(update_target_period)])
        self.prevacs = self.acs.copy()
        self.schedule = schedule
        self.max_timesteps = max_timesteps
        self.t = 0

        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize

        self.ep_rets = []
        self.ep_lens = []
        self.cur_ep_ret = 0
        self.cur_ep_len = 0

        self.lenbuffer = deque(
            maxlen=100)  # rolling buffer for episode lengths
        self.rewbuffer = deque(
            maxlen=100)  # rolling buffer for episode rewards
        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.iters_so_far = 0

        self.exploration = LinearSchedule(
            schedule_timesteps=schedule_timesteps,
            initial_p=initial_p,
            final_p=final_p)
        policy_func = LinePPOModel if policy_func is None else policy_func
        self._build_model(input_space=statesize,
                          action_size=actionsize,
                          policy_func=policy_func)

        self.tstart = time.time()

    def _build_model(
            self,
            input_space,
            action_size,
            policy_func,
            clip_param=0.2,
            entcoeff=0.01,  # clipping parameter epsilon, entropy coeff
            adam_epsilon=1e-5):
        sess = U.get_session()
        if sess is None:
            sess = U.make_session(8)
            sess.__enter__()

        # Setup losses and stuff
        # ----------------------------------------
        with tf.variable_scope(self.scope):
            self.pi = policy_func(
                "pi", input_space,
                action_size)  # Construct network for new policy
            self.oldpi = policy_func("oldpi", input_space,
                                     action_size)  # Network for old policy
            atarg = tf.placeholder(
                dtype=tf.float32,
                shape=[None])  # Target advantage function (if applicable)
            ret = tf.placeholder(dtype=tf.float32,
                                 shape=[None])  # Empirical return

            lrmult = tf.placeholder(
                name='lrmult', dtype=tf.float32,
                shape=[])  # learning rate multiplier, updated with schedule
            clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

            ob = U.get_placeholder_cached(name="ob")
            ac = self.pi.pdtype.sample_placeholder([None])

            kloldnew = self.oldpi.pd.kl(self.pi.pd)
            ent = self.pi.pd.entropy()
            meankl = U.mean(kloldnew)
            meanent = U.mean(ent)
            pol_entpen = (-entcoeff) * meanent

            ratio = tf.exp(self.pi.pd.logp(ac) -
                           self.oldpi.pd.logp(ac))  # pnew / pold
            surr1 = ratio * atarg  # surrogate from conservative policy iteration
            surr2 = U.clip(ratio, 1.0 - clip_param,
                           1.0 + clip_param) * atarg  #
            pol_surr = -U.mean(tf.minimum(
                surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
            vf_loss = U.mean(tf.square(self.pi.vpred - ret))
            total_loss = pol_surr + pol_entpen + vf_loss

            var_list = self.pi.get_trainable_variables()

            # more debug info
            debug_atarg = atarg
            pi_ac = self.pi.pd.logp(ac)
            opi_ac = self.oldpi.pd.logp(ac)
            vpred = U.mean(self.pi.vpred)
            pi_pd = U.mean(self.pi.pd.flatparam())
            opi_pd = self.oldpi.pd.flatparam()[0]
            kl_oldnew = kloldnew[0]
            grads = tf.gradients(total_loss, var_list)

            losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
            debugs = [
                debug_atarg, pi_ac, opi_ac, vpred, pi_pd, opi_pd, kl_oldnew,
                total_loss
            ]

            self.lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                                          losses + debugs + [var_list, grads] +
                                          [U.flatgrad(total_loss, var_list)])
            self.adam = MpiAdam(var_list, epsilon=adam_epsilon)

            self.assign_old_eq_new = U.function(
                [], [],
                updates=[
                    tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                        self.oldpi.get_variables(), self.pi.get_variables())
                ])
            self.compute_losses = U.function([ob, ac, atarg, ret, lrmult],
                                             losses)

            U.initialize()
            self.adam.sync()

    def load(self, name):
        saver = tf.train.Saver(var_list=tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope))
        sess = U.get_session()
        saver.restore(sess, name)

    def save(self, name):
        saver = tf.train.Saver(var_list=tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope))
        sess = U.get_session()
        saver.save(sess, name)

    @staticmethod
    def gen_input(cur_state, hero_name, rival_hero):
        cur_line_input = Line_Input_Lite(cur_state, hero_name, rival_hero)
        cur_state_input = cur_line_input.gen_line_input()
        return cur_state_input

    def remember(self, cur_state, new_state, vpred, prevac):
        hero_name = self.hero
        action = cur_state.get_hero_action(hero_name)
        if action is not None:
            selected_action_idx = action.output_index
            reward = action.reward

            # For now, treat the opposing hero as the 1v1 rival_hero
            for hero in cur_state.heros:
                if hero.hero_name != hero_name:
                    rival_hero = hero.hero_name
                    break

            cur_line_input = Line_Input_Lite(cur_state, hero_name, rival_hero)
            cur_state_input = cur_line_input.gen_line_input()

            new_line_input = Line_Input_Lite(new_state, hero_name, rival_hero)
            new_state_input = new_line_input.gen_line_input()

            new = new_state.get_hero(hero_name).hp <= 0

            i = self.t % self.update_target_period
            self.obs[i] = cur_state_input
            self.vpreds[i] = vpred
            self.news[i] = new
            self.acs[i] = selected_action_idx
            self.prevacs[i] = prevac
            self.rews[i] = reward
            self.t += 1

            self.cur_ep_ret += reward
            self.cur_ep_len += 1
            if new:
                self.ep_rets.append(self.cur_ep_ret)
                self.ep_lens.append(self.cur_ep_len)
                self.cur_ep_ret = 0
                self.cur_ep_len = 0

    def get_memory_size(self):
        return self.iters_so_far

    def add_vtarg_and_adv(self, seg, gamma, lam):
        """
        Compute target value using TD(lambda) estimator, and advantage with GAE(lambda)
        """
        new = np.append(
            seg["new"], 0
        )  # last element is only used for last vtarg, but we already zeroed it if last new = 1
        vpred = np.append(seg["vpred"], seg["nextvpred"])
        T = len(seg["rew"])
        seg["adv"] = gaelam = np.empty(T, 'float32')
        rew = seg["rew"]
        lastgaelam = 0
        for t in reversed(range(T)):
            nonterminal = 1 - new[t + 1]
            delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
            gaelam[
                t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
            # print('gaelam', gaelam[t], 'rew', rew[t], 'vpred_t+1', vpred[t+1], 'vpred_t', vpred[t])
        seg["tdlamret"] = seg["adv"] + seg["vpred"]

    # The vpred of the next action is needed, so decide whether to replay only after an act has been executed
    def replay(self, seg_list, batch_size):
        print(self.scope + " training")

        if self.schedule == 'constant':
            cur_lrmult = 1.0
        elif self.schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(self.timesteps_so_far) / self.max_timesteps, 0)

        # Here we do a bunch of optimization epochs over the data
        # Batched optimization: each epoch, collect the gradient g for every battle segment, average them, then take one optimizer step; repeat for several epochs
        newlosses_list = []
        logger.log("Optimizing...")
        loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
        logger.log(fmt_row(13, loss_names))
        for _ in range(self.optim_epochs):
            g_list = []
            for seg in seg_list:

                self.add_vtarg_and_adv(seg, self.gamma, self.lam)

                # print(seg)

                # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg[
                    "adv"], seg["tdlamret"]
                vpredbefore = seg[
                    "vpred"]  # predicted value function before update
                atarg = (atarg - atarg.mean()) / atarg.std(
                )  # standardized advantage function estimate
                d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                            shuffle=not self.pi.recurrent)

                if hasattr(self.pi, "ob_rms"):
                    self.pi.ob_rms.update(
                        ob)  # update running mean/std for policy

                self.assign_old_eq_new(
                )  # set old parameter values to new parameter values

                # Take the whole segment as a single batch
                batch = d.next_batch(d.n)
                # print("ob", batch["ob"], "ac", batch["ac"], "atarg", batch["atarg"], "vtarg", batch["vtarg"])
                *newlosses, debug_atarg, pi_ac, opi_ac, vpred, pi_pd, opi_pd, kl_oldnew, total_loss, var_list, grads, g = \
                    self.lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                # print("debug_atarg", debug_atarg, "pi_ac", pi_ac, "opi_ac", opi_ac, "vpred", vpred, "pi_pd", pi_pd,
                #       "opi_pd", opi_pd, "kl_oldnew", kl_oldnew, "var_mean", np.mean(g), "total_loss", total_loss)
                if np.isnan(np.mean(g)):
                    print('output nan, ignore it!')
                else:
                    g_list.append(g)
                    newlosses_list.append(newlosses)

            # Average the batched gradients, then update the model
            if len(g_list) > 0:
                avg_g = np.mean(g_list, axis=0)
                self.adam.update(avg_g, self.optim_stepsize * cur_lrmult)
                logger.log(fmt_row(13, np.mean(newlosses_list, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for seg in seg_list:
            self.add_vtarg_and_adv(seg, self.gamma, self.lam)

            # print(seg)

            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not self.pi.recurrent)
            # Take the whole segment as a single batch
            batch = d.next_batch(d.n)
            newlosses = self.compute_losses(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
            losses.append(newlosses)
        print(losses)

        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            if np.isinf(lossval):
                debug = True
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(self.flatten_lists, zip(*listoflrpairs))
        self.lenbuffer.extend(lens)
        self.rewbuffer.extend(rews)
        last_rew = self.rewbuffer[-1] if len(self.rewbuffer) > 0 else 0
        logger.record_tabular("LastRew", last_rew)
        logger.record_tabular(
            "LastLen", 0 if len(self.lenbuffer) <= 0 else self.lenbuffer[-1])
        logger.record_tabular("EpLenMean", np.mean(self.lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(self.rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        self.episodes_so_far += len(lens)
        self.timesteps_so_far += sum(lens)
        self.iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", self.episodes_so_far)
        logger.record_tabular("TimestepsSoFar", self.timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - self.tstart)
        logger.record_tabular("IterSoFar", self.iters_so_far)
        logger.record_tabular("CalulateActions", self.act_times)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    def flatten_lists(self, listoflists):
        return [el for list_ in listoflists for el in list_]

    def get_actions(self, state_inputs):
        self.act_times += len(state_inputs)
        stochastic = True
        explor_value = self.exploration.value(self.act_times)
        actions, vpreds = self.pi.acts(stochastic=stochastic,
                                       update_eps=explor_value,
                                       ob=state_inputs)
        return actions, explor_value, vpreds

    def get_action(self, state_input):
        self.act_times += 1
        stochastic = True
        explor_value = self.exploration.value(self.act_times)
        actions, vpred = self.pi.act(stochastic=stochastic,
                                     update_eps=explor_value,
                                     ob=state_input)
        actions = np.array([actions])
        return actions, explor_value, vpred

    @staticmethod
    # Use only the current frame (the decision frame) plus the next frame to compute rewards, so that when the game ends the reward for every earlier action can still be computed instead of being blocked by an n-step delay
    # More importantly, PPO itself does not require the reward to be tied to the previous action
    def cal_target_ppo(prev_state, cur_state, next_state, hero_name,
                       rival_hero_name, line_idx):
        # Only count the current frame's gains and losses: gold gained plus the change in the rival's HP
        # Collect the minions that died and compute their gold value from their unit attributes
        cur_rival_hero = cur_state.get_hero(rival_hero_name)
        rival_team = cur_rival_hero.team
        cur_hero = cur_state.get_hero(hero_name)
        cur_rival_hero = cur_state.get_hero(rival_hero_name)
        next_hero = next_state.get_hero(hero_name)
        next_rival_hero = next_state.get_hero(rival_hero_name)
        # Find enemy minions that died near the hero
        dead_units = StateUtil.get_dead_units_in_line(
            next_state, rival_team, line_idx, cur_hero,
            StateUtil.GOLD_GAIN_RADIUS)
        dead_golds = sum([
            StateUtil.get_unit_value(u.unit_name, u.cfg_id) for u in dead_units
        ])
        dead_unit_str = (','.join([u.unit_name for u in dead_units]))

        # Ignore small passive gold ticks on the hero
        gold_delta = next_hero.gold - cur_hero.gold
        if gold_delta % 10 == 3 or gold_delta % 10 == 8 or gold_delta == int(
                dead_golds / 2) + 3:
            gold_delta -= 3

        # Last hits are hard to attribute directly, so use the gold change: gold beyond half of the dead units' value is credited to the hero
        gold_delta = gold_delta * 2 - dead_golds
        if gold_delta < 0:
            print('kill gold should not be negative', cur_state.tick, 'dead_units', dead_unit_str,
                  'gold_gain', (next_hero.gold - cur_hero.gold))
            gold_delta = 0

        # if dead_golds > 0:
        # print('dead_gold', dead_golds, 'delta_gold', gold_delta, "hero", hero_name, "tick", cur_state.tick)

        # Compute damage dealt to the target rival hero and damage received
        # Damage and hit information arrive with a delay of up to two frames (usually in the same message, occasionally in the next one)
        # Only the damage the hero deals to the rival in the next frame is counted here
        # Amplify the penalty for damage the hero takes
        # Amplify the reward for damaging the rival when its HP is low
        # Increase the weight of attack damage
        # TODO: define defensive/support spells; support spells cast carelessly should be penalized
        dmg = next_state.get_hero_total_dmg(
            hero_name, rival_hero_name) / float(cur_rival_hero.maxhp)
        dmg *= 3 * cur_rival_hero.maxhp / float(cur_rival_hero.hp +
                                                cur_rival_hero.maxhp)

        # When estimating damage taken, only look at the change in the next frame; delayed effects such as tower projectiles in flight are not considered here
        self_hp_loss = (cur_hero.hp -
                        next_hero.hp) / float(cur_hero.maxhp) / 2 if (
                            cur_hero.hp >= next_hero.hp) else 0
        self_hp_loss *= 3 * cur_hero.maxhp / float(cur_hero.hp +
                                                   cur_hero.maxhp)
        dmg_delta = int((dmg - self_hp_loss) * LineModel_PPO1.REWARD_RIVAL_DMG)

        # Track and update stats
        # print('reward debug info, hero: %s, max_gold: %s, gold_gain: %s, dmg: %s, hp_loss: %s, dmg_delta: %s, '
        #       'dead_units: %s'
        #       % (
        #       hero_name, str(dead_golds), str(gold_delta), str(dmg), str(self_hp_loss), str(dmg_delta), dead_unit_str))

        # The maximum reward is the gold from killed minions and towers plus the reward for one full bar of the rival's HP
        # The maximum penalty is taking one full bar of damage from the rival
        # Zero corresponds to collecting all of the death gold
        reward = float(gold_delta + dmg_delta) / 100

        # Special-case handling
        # Encourage attacking enemy minions and towers
        if_hit_unit = next_state.if_hero_hit_any_unit(hero_name,
                                                      rival_hero_name)
        if if_hit_unit is not None:
            # print("物理攻击到了小兵", if_hit_unit)
            reward += 0.01
        if_hit_tower = next_state.if_hero_hit_tower(hero_name)
        if if_hit_tower is not None:
            # print("物理攻击到了塔", if_hit_tower)
            reward += 0.01

        # Scale all rewards down
        final_reward = reward / 10
        final_reward = min(max(final_reward, -1), 1)

        # Special rewards, applied last
        # If the hero lands the killing blow on the rival, give the maximum reward outright (amplified because of gamma)
        if cur_rival_hero.hp > 0 and next_rival_hero.hp <= 0:
            # print('lane rival %s died' % rival_hero_name)
            dmg_hit_rival = next_state.get_hero_total_dmg(
                hero_name, rival_hero_name)
            if dmg_hit_rival > 0:
                # print('hero %s landed the killing blow' % hero_name)
                final_reward = 1
                if cur_hero.hp > 0 and next_hero.hp <= 0:
                    final_reward = 0
        elif cur_hero.hp > 0 and next_hero.hp <= 0:
            print('hero died')
            final_reward = -5
        return final_reward

    @staticmethod
    def assert_tower_in_input(cur_state, hero_name, rival_hero):
        # If an enemy tower is about to attack the hero, check whether the tower's info is present in the model input
        att_info = cur_state.if_tower_attack_hero(hero_name)
        if att_info is not None:
            tower = str(att_info.atker)
            tower_info = cur_state.get_obj(tower)
            hero_info = cur_state.get_hero(hero_name)
            model_input = LineModel_PPO1.gen_input(cur_state, hero_name,
                                                   rival_hero)
            if model_input[44] == Line_Input_Lite.normalize_value_static(
                    int(tower)):
                print('yes found attack tower in input', tower, 'distance',
                      model_input[50], 'cal_distance',
                      StateUtil.cal_distance2(tower_info.pos, hero_info.pos))
            else:
                print('not found attack tower in input', tower, 'distance',
                      model_input[50], 'cal_distance',
                      StateUtil.cal_distance2(tower_info.pos, hero_info.pos))

    @staticmethod
    # Use only the current frame (the decision frame) plus the next frame to compute rewards, so that when the game ends the reward for every earlier action can still be computed instead of being blocked by an n-step delay
    # More importantly, PPO itself does not require the reward to be tied to the previous action
    def cal_target_ppo_2(prev_state, cur_state, next_state, hero_name,
                         rival_hero_name, line_idx):
        LineModel_PPO1.assert_tower_in_input(cur_state, hero_name,
                                             rival_hero_name)

        # Only count the current frame's gains and losses: gold gained plus the change in the rival's HP
        # Collect the minions that died and compute their gold value from their unit attributes
        cur_rival_hero = cur_state.get_hero(rival_hero_name)
        rival_team = cur_rival_hero.team
        cur_hero = cur_state.get_hero(hero_name)
        cur_rival_hero = cur_state.get_hero(rival_hero_name)
        next_hero = next_state.get_hero(hero_name)
        next_rival_hero = next_state.get_hero(rival_hero_name)
        # Find enemy minions that died near the hero
        dead_units = StateUtil.get_dead_units_in_line(
            next_state, rival_team, line_idx, cur_hero,
            StateUtil.GOLD_GAIN_RADIUS)
        dead_golds = sum([
            StateUtil.get_unit_value(u.unit_name, u.cfg_id) for u in dead_units
        ])

        # Ignore small passive gold ticks on the hero
        gold_delta = next_hero.gold - cur_hero.gold
        if gold_delta % 10 == 3 or gold_delta % 10 == 8 or gold_delta == int(
                dead_golds / 2) + 3:
            gold_delta -= 3

        # Last hits are hard to attribute directly, so use the gold change: gold beyond half of the dead units' value is credited to the hero
        if gold_delta > 0:
            gold_delta = gold_delta * 2 - dead_golds
            if gold_delta < 0:
                print('kill gold should not be negative', cur_state.tick, 'dead_golds', dead_golds,
                      'gold_delta', (next_hero.gold - cur_hero.gold))
                gold_delta = 0

        # if dead_golds > 0:
        #     print('dead_gold', dead_golds, 'delta_gold', gold_delta, "hero", hero_name, "tick", cur_state.tick)

        reward = float(gold_delta) / 100

        # Scale all rewards down
        final_reward = reward / 100
        final_reward = min(max(final_reward, -1), 1)

        # Special rewards, applied last
        # If the hero lands the killing blow on the rival, give the maximum reward outright (amplified because of gamma)
        if cur_rival_hero.hp > 0 and next_rival_hero.hp <= 0:
            # print('lane rival %s died' % rival_hero_name)
            dmg_hit_rival = next_state.get_hero_total_dmg(
                hero_name, rival_hero_name)
            if dmg_hit_rival > 0:
                # print('hero %s landed the killing blow' % hero_name)
                final_reward = 1
                if cur_hero.hp > 0 and next_hero.hp <= 0:
                    final_reward = 0
        elif cur_hero.hp > 0 and next_hero.hp <= 0:
            print('hero died')
            final_reward = -1
        return final_reward
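
The add_vtarg_and_adv method above is the GAE(lambda) recursion: delta_t = r_t + gamma * V_{t+1} * (1 - new_{t+1}) - V_t and A_t = delta_t + gamma * lambda * (1 - new_{t+1}) * A_{t+1}, with the TD(lambda) return given by A_t + V_t. A standalone restatement of the same computation for reference; compute_gae and the toy numbers are purely illustrative:

import numpy as np

def compute_gae(rews, vpreds, news, next_vpred, gamma=0.99, lam=0.95):
    """GAE(lambda) advantages and TD(lambda) returns, matching add_vtarg_and_adv above."""
    T = len(rews)
    new = np.append(news, 0)               # "episode started at t" flags, padded like the method above
    vpred = np.append(vpreds, next_vpred)  # bootstrap value after the last step
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = rews[t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    return adv, adv + vpreds               # advantages, tdlamret

# Tiny worked example: three steps with a bootstrap value of 0 after the last one.
adv, tdlamret = compute_gae(rews=np.array([1.0, 0.0, 1.0]),
                            vpreds=np.array([0.5, 0.4, 0.3]),
                            news=np.array([1, 0, 0]),
                            next_vpred=0.0)
print(adv, tdlamret)
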
示例#27
class HRAAdaptive(object):
    """HRAAdaptive using HRA architecture"""
    def __init__(self, name, choices, reward_types, network_config,
                 reinforce_config):
        super(HRAAdaptive, self).__init__()
        self.name = name
        self.choices = choices
        self.network_config = network_config
        self.reinforce_config = reinforce_config
        self.replace_frequency = reinforce_config.replace_frequency
        self.memory = PrioritizedReplayBuffer(
            self.reinforce_config.memory_size, 0.6)
        self.learning = True
        self.reward_types = reward_types
        self.steps = 0
        self.episode = 0
        self.reward_history = []
        self.best_reward_mean = -sys.maxsize

        self.beta_schedule = LinearSchedule(
            self.reinforce_config.beta_timesteps,
            initial_p=self.reinforce_config.beta_initial,
            final_p=self.reinforce_config.beta_final)

        self.epsilon_schedule = LinearSchedule(
            self.reinforce_config.epsilon_timesteps,
            initial_p=self.reinforce_config.starting_epsilon,
            final_p=self.reinforce_config.final_epsilon)

        self.reset()

        self.eval_model = HRAModel(self.name + "_eval", self.network_config,
                                   use_cuda)
        self.target_model = HRAModel(self.name + "_target",
                                     self.network_config, use_cuda)

        reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name

        if not network_config.restore_network:
            clear_summary_path(reinforce_summary_path)
        else:
            self.restore_state()

        self.summary = SummaryWriter(log_dir=reinforce_summary_path)

    def __del__(self):
        self.save()
        self.summary.close()

    def should_explore(self):
        self.epsilon = self.epsilon_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Epsilon' % self.name,
                                scalar_value=self.epsilon,
                                global_step=self.steps)

        return random.random() < self.epsilon

    def predict(self, state):
        self.steps += 1

        if (self.previous_state is not None
                and self.previous_action is not None):
            self.memory.add(self.previous_state, self.previous_action,
                            self.reward_list(), state, 0)

        if self.learning and self.should_explore():
            action = random.choice(list(range(len(self.choices))))
            q_values = None
            combined_q_values = None
            choice = self.choices[action]
        else:
            _state = Tensor(state).unsqueeze(0)
            model_start_time = time.time()
            action, q_values, combined_q_values = self.eval_model.predict(
                _state, self.steps, self.learning)
            choice = self.choices[action]
            self.model_time += time.time() - model_start_time

        if self.learning and self.steps % self.replace_frequency == 0:
            logger.debug("Replacing target model for %s" % self.name)
            self.target_model.replace(self.eval_model)

        if (self.learning and self.steps > self.reinforce_config.update_start
                and self.steps % self.reinforce_config.update_steps == 0):

            update_start_time = time.time()
            self.update()
            self.update_time += time.time() - update_start_time

        self.clear_current_rewards()

        self.previous_state = state
        self.previous_action = action

        return choice, q_values, combined_q_values

    def disable_learning(self):
        logger.info("Disabled Learning for %s agent" % self.name)
        self.save()

        self.learning = False
        self.episode = 0

    def end_episode(self, state):
        if not self.learning:
            return

        self.reward_history.append(self.total_reward)

        logger.info("End of Episode %d with total reward %.2f, epsilon %.2f" %
                    (self.episode + 1, self.total_reward, self.epsilon))

        self.episode += 1
        self.summary.add_scalar(tag='%s/Episode Reward' % self.name,
                                scalar_value=self.total_reward,
                                global_step=self.episode)

        for reward_type in self.reward_types:
            tag = '%s/Decomposed Reward/%s' % (self.name, reward_type)
            value = self.decomposed_total_reward[reward_type]
            self.summary.add_scalar(tag=tag,
                                    scalar_value=value,
                                    global_step=self.episode)

        self.memory.add(self.previous_state, self.previous_action,
                        self.reward_list(), state, 1)

        self.episode_time = time.time() - self.episode_time

        logger.debug("Episode Time: %.2f, "
                     "Model prediction time: %.2f, "
                     "Updated time: %.2f, "
                     "Update fit time: %.2f" %
                     (self.episode_time, self.model_time, self.update_time,
                      self.fit_time))

        self.save()
        self.reset()

    def reset(self):
        self.clear_current_rewards()
        self.clear_episode_rewards()

        self.previous_state = None
        self.previous_action = None
        self.episode_time = time.time()
        self.update_time = 0
        self.fit_time = 0
        self.model_time = 0

    def reward_list(self):
        reward = [0] * len(self.reward_types)

        for i, reward_type in enumerate(sorted(self.reward_types)):
            reward[i] = self.current_reward[reward_type]

        return reward

    def clear_current_rewards(self):
        self.current_reward = {}
        for reward_type in self.reward_types:
            self.current_reward[reward_type] = 0

    def clear_episode_rewards(self):
        self.total_reward = 0
        self.decomposed_total_reward = {}
        for reward_type in self.reward_types:
            self.decomposed_total_reward[reward_type] = 0

    def reward(self, reward_type, value):
        self.current_reward[reward_type] += value
        self.decomposed_total_reward[reward_type] += value
        self.total_reward += value

    def restore_state(self):
        restore_path = self.network_config.network_path + "/adaptive.info"

        if self.network_config.network_path and os.path.exists(restore_path):
            logger.info("Restoring state from %s" %
                        self.network_config.network_path)

            with open(restore_path, "rb") as file:
                info = pickle.load(file)

            self.steps = info["steps"]
            self.best_reward_mean = info["best_reward_mean"]
            self.episode = info["episode"]

            logger.info(
                "Continuing from %d episode (%d steps) with best reward mean %.2f"
                % (self.episode, self.steps, self.best_reward_mean))

    def save(self, force=False):
        info = {
            "steps": self.steps,
            "best_reward_mean": self.best_reward_mean,
            "episode": self.episode
        }

        if force:
            logger.info("Forced to save network")
            self.eval_model.save_network()
            self.target_model.save_network()
            with open(self.network_config.network_path + "/adaptive.info",
                      "wb") as file:
                pickle.dump(info, file, protocol=pickle.HIGHEST_PROTOCOL)

        if (len(self.reward_history) >= self.network_config.save_steps
                and self.episode % self.network_config.save_steps == 0):

            total_reward = sum(
                self.reward_history[-self.network_config.save_steps:])
            current_reward_mean = total_reward / self.network_config.save_steps

            if current_reward_mean >= self.best_reward_mean:
                self.best_reward_mean = current_reward_mean
                info["best_reward_mean"] = current_reward_mean
                logger.info("Saving network. Found new best reward (%.2f)" %
                            current_reward_mean)
                self.eval_model.save_network()
                self.target_model.save_network()
                with open(self.network_config.network_path + "/adaptive.info",
                          "wb") as file:
                    pickle.dump(info, file, protocol=pickle.HIGHEST_PROTOCOL)
            else:
                logger.info("The best reward is still %.2f. Not saving" %
                            current_reward_mean)

    def update(self):
        if len(self.memory) <= self.reinforce_config.batch_size:
            return

        beta = self.beta_schedule.value(self.steps)
        self.summary.add_scalar(tag='%s/Beta' % self.name,
                                scalar_value=beta,
                                global_step=self.steps)

        batch = self.memory.sample(self.reinforce_config.batch_size, beta)

        (states, actions, reward, next_states, is_terminal, weights,
         batch_idxes) = batch

        self.summary.add_histogram(tag='%s/Batch Indices' % self.name,
                                   values=Tensor(batch_idxes),
                                   global_step=self.steps)

        states = Tensor(states)
        next_states = Tensor(next_states)
        terminal = FloatTensor(is_terminal)
        reward = FloatTensor(reward)
        batch_index = torch.arange(self.reinforce_config.batch_size,
                                   dtype=torch.long)

        # Find the target values
        q_actions, q_values, _ = self.eval_model.predict_batch(states)
        q_values = q_values[:, batch_index, actions]
        _, q_next, _ = self.target_model.predict_batch(next_states)
        q_next = q_next.mean(2).detach()
        q_next = (1 - terminal) * q_next
        q_target = reward.t() + self.reinforce_config.discount_factor * q_next

        # Update the model
        fit_start_time = time.time()
        self.eval_model.fit(q_values, q_target, self.steps)
        self.fit_time += time.time() - fit_start_time

        # Update priorities
        td_errors = q_values - q_target
        td_errors = torch.sum(td_errors, 0)
        new_priorities = torch.abs(td_errors) + 1e-6  # prioritized_replay_eps
        self.memory.update_priorities(batch_idxes, new_priorities.data)
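
HRAAdaptive follows the Hybrid Reward Architecture pattern: one Q-head per reward type over a shared action set, combined for action selection, with a per-head one-step target (note that update() bootstraps with the mean over next-state actions rather than the max). A rough numpy sketch of those two ideas; hra_choose_action, hra_targets, and the example numbers are assumptions for illustration, not the HRAModel API:

import numpy as np

def hra_choose_action(q_heads):
    """q_heads: (num_reward_types, num_actions). Act greedily on the summed heads."""
    combined_q = q_heads.sum(axis=0)
    return int(np.argmax(combined_q)), combined_q

def hra_targets(rewards, q_next_heads, terminal, gamma=0.99):
    """Per-head targets r_k + gamma * mean_a Q_k(s', a) * (1 - terminal), as in update() above."""
    return rewards + gamma * (1.0 - terminal) * q_next_heads.mean(axis=1)

q_heads = np.array([[1.0, 0.2, -0.5],   # e.g. a "damage" head
                    [0.1, 0.8,  0.3]])  # e.g. an "economy" head
action, combined = hra_choose_action(q_heads)
targets = hra_targets(rewards=np.array([0.5, -0.1]), q_next_heads=q_heads, terminal=0.0)
print(action, combined, targets)
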
示例#28
File: DQN.py  Project: PwnerHarry/DQN
class DQN(RL_AGENT):
    def __init__(self,
        env, 
        network_policy,
        gamma=1.0,
        exploration_fraction=0.02, exploration_final_eps=0.01, steps_total=50000000,
        size_buffer=1000000, prioritized_replay=True, alpha_prioritized_replay=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6,
        type_optimizer='Adam', lr=5e-4, eps=1.5e-4,
        time_learning_starts=20000, freq_targetnet_update=8000, freq_train=4, size_batch=32,
        callback=None, load_path=None, # for debugging
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        seed=42,
        **network_kwargs):

        super(DQN, self).__init__(env, gamma, seed)
        self.create_replay_buffer(prioritized_replay, prioritized_replay_eps, size_buffer, alpha_prioritized_replay, prioritized_replay_beta0, prioritized_replay_beta_iters, steps_total)
        self.exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * steps_total), initial_p=1.0, final_p=exploration_final_eps)
        
        self.network_policy = network_policy # an instance of DQN_NETWORK, which contains an instance of FEATURE_EXTRACTOR and 1 additional head
        self.optimizer = eval('optim.%s' % type_optimizer)(self.network_policy.parameters(), lr=lr, eps=eps)

        # initialize target network
        self.network_target = copy.deepcopy(self.network_policy)
        for param in self.network_target.parameters():
            param.requires_grad = False
        self.network_target.eval()

        self.size_batch = size_batch
        self.time_learning_starts = time_learning_starts
        self.freq_train = freq_train
        self.freq_targetnet_update = freq_targetnet_update
        self.t, self.steps_total = 0, steps_total
        self.device = device
        self.step_last_print, self.time_last_print = 0, None
    
    def load_checkpoint(self, checkpoint):
        """
        loads checkpoint saved by utils/save_checkpoint
        """
        self.load_state_dict(checkpoint['agent_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.t = checkpoint['t']
        self.gamma = checkpoint['gamma']
        self.seed = checkpoint['seed']
        self.exploration = checkpoint['exploration']
        self.observation_space = checkpoint['observation_space']
        self.action_space = checkpoint['action_space']
        self.beta_schedule = checkpoint['beta_schedule']
        self.replay_buffer = checkpoint['replay_buffer']
        self.size_batch = checkpoint['size_batch']
        self.time_learning_starts = checkpoint['time_learning_starts']
        self.freq_train = checkpoint['freq_train']
        self.freq_targetnet_update = checkpoint['freq_targetnet_update']
        self.steps_total = checkpoint['steps_total']
        self.device = checkpoint['device']
        self.step_last_print = checkpoint['step_last_print']
        self.time_last_print = checkpoint['time_last_print']
        print('checkpoint loaded with replay buffer of size %d' % (len(self.replay_buffer)))

    def create_replay_buffer(self, prioritized_replay, prioritized_replay_eps, size_buffer, alpha_prioritized_replay, prioritized_replay_beta0, prioritized_replay_beta_iters, steps_total):
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        if prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(size_buffer, alpha=alpha_prioritized_replay)
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = steps_total
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(size_buffer)
            self.beta_schedule = None
        pass

    def decide(self, obs, eval=False): # Validated by Harry 17h45 23-11-2019
        """
        input observation and output action
        some through the computations of the policy network
        """
        if eval or random.random() > self.exploration.value(self.t):
            with torch.no_grad():
                return int(torch.argmax(self.network_policy(obs)))
        else: # explore
            return self.action_space.sample()
    
    def step(self, obs_curr, action, reward, obs_next, done, eval=False):
        """
        an agent step: in this step the agent does whatever it needs
        """
        if obs_next is not None:
            self.replay_buffer.add(obs_curr, action, np.sign(reward), obs_next, done) # clip rewards, done is the flag for whether obs_next is terminal
        if self.t >= self.time_learning_starts:
            if len(self.replay_buffer) >= self.size_batch and self.t % self.freq_train == 0:
                self.update()
            if self.t % self.freq_targetnet_update == 0:
                self.sync_parameters()
        self.t += 1

    def update(self):
        """
        update the parameters of the DQN model using the weighted sampled Bellman error
        """
        # sample a batch
        if self.prioritized_replay:
            experience = self.replay_buffer.sample(self.size_batch, beta=self.beta_schedule.value(self.t))
            (batch_obs_curr, batch_action, batch_reward, batch_obs_next, batch_done, weights, batch_idxes) = experience
        else:
            batch_obs_curr, batch_action, batch_reward, batch_obs_next, batch_done = self.replay_buffer.sample(self.size_batch)
            weights, batch_idxes = np.ones_like(batch_reward), None
        batch_action, batch_reward = torch.tensor(batch_action, dtype=torch.int64, device=self.device).view(-1, 1), torch.tensor(batch_reward, dtype=torch.float32, device=self.device)
        weights = torch.tensor(weights, dtype=torch.float32, device=self.device)
        # calculate the weighted Bellman error
        index_nonterm_trans = np.argwhere(batch_done == False).reshape(-1)
        values_next = torch.zeros_like(batch_reward, dtype=torch.float32)
        values_next[index_nonterm_trans] = self.network_target(batch_obs_next[index_nonterm_trans]).max(1)[0].detach()
        values_curr = self.network_policy(batch_obs_curr).gather(1, index=batch_action).view(-1)
        error_bellman = F.smooth_l1_loss(values_curr, batch_reward + self.gamma * values_next, reduction='none') # Huber loss
        error_bellman_weighted = torch.dot(error_bellman, weights)
        # calculate gradients w.r.t. the weighted Bellman error
        self.optimizer.zero_grad()
        error_bellman_weighted.backward()
        # gradient clipping
        for param in self.network_policy.parameters():
            param.grad.data.clamp_(-1, 1)
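            # clamps each gradient element to [-1, 1], a common DQN stabilization trick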
        self.optimizer.step()
        # update prioritized replay, if used
        if self.prioritized_replay:
            new_priorities = np.abs(error_bellman.detach().cpu().numpy()) + self.prioritized_replay_eps
            self.replay_buffer.update_priorities(batch_idxes, new_priorities)

    def sync_parameters(self):
        """
        synchronize the parameters of self.network_policy and self.network_target
        """
        self.network_target.load_state_dict(self.network_policy.state_dict())
        for param in self.network_target.parameters():
            param.requires_grad = False
        self.network_target.eval()

    def reset_parameters(self):
        self.network_policy.reset_parameters()
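
# A minimal interaction-loop sketch for the agent class above (assumptions: `agent` is an
# already-constructed instance of this class and `env` is a Gym environment whose
# observations the agent's networks accept; both names are illustrative and not part of
# the original example):
#
#     obs = env.reset()
#     for _ in range(10000):
#         action = agent.decide(obs)                       # epsilon-greedy action
#         obs_next, reward, done, _ = env.step(action)
#         agent.step(obs, action, reward, obs_next, done)  # store, train, sync targets
#         obs = env.reset() if done else obs_next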
Example #29
0
def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=100000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models
        (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which
        will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that)
    seed: int or None
        prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None, it defaults to total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.
    load_path: str
        path to load the model from. (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = get_session()
    set_global_seeds(seed)

    q_func = build_q_func(network, **network_kwargs)

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph

    observation_space = env.observation_space

    def make_obs_ph(name):
        return ObservationInput(observation_space, name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = total_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = PiecewiseSchedule([(0, 1.0), (int(1e6), 0.1),
                                     (int(1e7), 0.01)],
                                    outside_value=0.01)
    '''exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
    '''
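    # Note: the piecewise schedule above interpolates linearly between its points
    # (e.g. value(0) == 1.0, value(5e5) is roughly 0.55) and returns outside_value
    # (0.01) beyond 1e7 steps, assuming baselines' default linear interpolation.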

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False

        if tf.train.latest_checkpoint(td) is not None:
            load_variables(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True
        elif load_path is not None:
            load_variables(load_path)
            logger.log('Loaded model from {}'.format(load_path))

        for t in range(total_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(1. - exploration.value(
                    t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            load_variables(model_file)

    return act
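
# A minimal usage sketch for the learn() above (assumptions: gym and a CartPole-style
# environment are available and the registered 'mlp' network builder is used; the exact
# environment and hyperparameters are illustrative, not part of the original example):
def _demo_cartpole():
    import gym
    env = gym.make("CartPole-v0")
    act = learn(env, network='mlp', lr=1e-3, total_timesteps=100000, print_freq=10)
    act.save("cartpole_model.pkl")  # ActWrapper adds saving/loading, per the docstring above
    env.close()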
Example #30
0
            reset = False
            new_obs, rew, done, info = env.step(action)
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            if done:
                num_iters_since_reset = 0
                obs = env.reset()
                reset = True

            if (num_iters > max(5 * args.batch_size,
                                args.replay_buffer_size // 20)
                    and num_iters % args.learning_freq == 0):
                # Sample a bunch of transitions from replay buffer
                if args.prioritized:
                    experience = replay_buffer.sample(
                        args.batch_size, beta=beta_schedule.value(num_iters))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        args.batch_size)
                    weights = np.ones_like(rewards)
                # Minimize the error in Bellman's equation and compute TD-error
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                # Update the priorities in the replay buffer
                if args.prioritized:
                    new_priorities = np.abs(td_errors) + args.prioritized_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)
            # Update target network.
def learn(env,
          q_func,
          num_actions=4,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None):
  """Train a deepq model.

Parameters
-------
env: pysc2.env.SC2Env
    environment to train on
q_func: (tf.Variable, int, str, bool) -> tf.Variable
    the model that takes the following inputs:
        observation_in: object
            the output of observation placeholder
        num_actions: int
            number of actions
        scope: str
        reuse: bool
            should be passed to outer variable scope
    and returns a tensor of shape (batch_size, num_actions) with values of every action.
lr: float
    learning rate for adam optimizer
max_timesteps: int
    number of env steps to optimize for
buffer_size: int
    size of the replay buffer
exploration_fraction: float
    fraction of entire training period over which the exploration rate is annealed
exploration_final_eps: float
    final value of random action probability
train_freq: int
    update the model every `train_freq` steps.
batch_size: int
    size of a batch sampled from replay buffer for training
print_freq: int
    how often to print out training progress
    set to None to disable printing
checkpoint_freq: int
    how often to save the model. This is so that the best version is restored
    at the end of the training. If you do not wish to restore the best version at
    the end of the training set this variable to None.
learning_starts: int
    how many steps of the model to collect transitions for before learning starts
gamma: float
    discount factor
target_network_update_freq: int
    update the target network every `target_network_update_freq` steps.
prioritized_replay: bool
    if True prioritized replay buffer will be used.
prioritized_replay_alpha: float
    alpha parameter for prioritized replay buffer
prioritized_replay_beta0: float
    initial value of beta for prioritized replay buffer
prioritized_replay_beta_iters: int
    number of iterations over which beta will be annealed from initial value
    to 1.0. If set to None, it defaults to max_timesteps.
prioritized_replay_eps: float
    epsilon to add to the TD errors when updating priorities.
num_cpu: int
    number of cpus to use for training
callback: (locals, globals) -> None
    function called at every step with the state of the algorithm.
    If the callback returns true, training stops.

Returns
-------
act: ActWrapper
    Wrapper over act function. Adds ability to save it and load it.
    See header of baselines/deepq/categorical.py for details on the act function.
"""
  # Create all the functions necessary to train the model

  sess = U.make_session(num_cpu=num_cpu)
  sess.__enter__()

  def make_obs_ph(name):
    return U.BatchInput((32, 32), name=name)

  act, train, update_target, debug = deepq.build_train(
    make_obs_ph=make_obs_ph,
    q_func=q_func,
    num_actions=num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    gamma=gamma,
    grad_norm_clipping=10,
    scope="deepq")
  #
  # act_y, train_y, update_target_y, debug_y = deepq.build_train(
  #   make_obs_ph=make_obs_ph,
  #   q_func=q_func,
  #   num_actions=num_actions,
  #   optimizer=tf.train.AdamOptimizer(learning_rate=lr),
  #   gamma=gamma,
  #   grad_norm_clipping=10,
  #   scope="deepq_y"
  # )

  act_params = {
    'make_obs_ph': make_obs_ph,
    'q_func': q_func,
    'num_actions': num_actions,
  }

  # Create the replay buffer
  if prioritized_replay:
    replay_buffer = PrioritizedReplayBuffer(
      buffer_size, alpha=prioritized_replay_alpha)
    # replay_buffer_y = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha)

    if prioritized_replay_beta_iters is None:
      prioritized_replay_beta_iters = max_timesteps
    beta_schedule = LinearSchedule(
      prioritized_replay_beta_iters,
      initial_p=prioritized_replay_beta0,
      final_p=1.0)

    # beta_schedule_y = LinearSchedule(prioritized_replay_beta_iters,
    #                                  initial_p=prioritized_replay_beta0,
    #                                  final_p=1.0)
  else:
    replay_buffer = ReplayBuffer(buffer_size)
    # replay_buffer_y = ReplayBuffer(buffer_size)

    beta_schedule = None
    # beta_schedule_y = None
  # Create the schedule for exploration starting from 1.
  exploration = LinearSchedule(
    schedule_timesteps=int(exploration_fraction * max_timesteps),
    initial_p=1.0,
    final_p=exploration_final_eps)

  # Initialize the parameters and copy them to the target network.
  U.initialize()
  update_target()
  # update_target_y()

  episode_rewards = [0.0]
  saved_mean_reward = None

  obs = env.reset()
  # Select all marines first
  obs = env.step(
    actions=[sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

  player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

  screen = (player_relative == _PLAYER_NEUTRAL).astype(int)  #+ path_memory

  player_y, player_x = (player_relative == _PLAYER_FRIENDLY).nonzero()
  player = [int(player_x.mean()), int(player_y.mean())]

  if (player[0] > 16):
    screen = shift(LEFT, player[0] - 16, screen)
  elif (player[0] < 16):
    screen = shift(RIGHT, 16 - player[0], screen)

  if (player[1] > 16):
    screen = shift(UP, player[1] - 16, screen)
  elif (player[1] < 16):
    screen = shift(DOWN, 16 - player[1], screen)
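  # the shifts above recenter the 32x32 screen so the marine group sits at (16, 16);
  # the same recentering is applied to new_screen after every environment step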

  reset = True
  with tempfile.TemporaryDirectory() as td:
    model_saved = False
    model_file = os.path.join("model/", "mineral_shards")
    print(model_file)

    for t in range(max_timesteps):
      if callback is not None:
        if callback(locals(), globals()):
          break
      # Take action and update exploration to the newest value
      kwargs = {}
      if not param_noise:
        update_eps = exploration.value(t)
        update_param_noise_threshold = 0.
      else:
        update_eps = 0.
        if param_noise_threshold >= 0.:
          update_param_noise_threshold = param_noise_threshold
        else:
          # Compute the threshold such that the KL divergence between perturbed and non-perturbed
          # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
          # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
          # for detailed explanation.
          update_param_noise_threshold = -np.log(
            1. - exploration.value(t) +
            exploration.value(t) / float(num_actions))
        kwargs['reset'] = reset
        kwargs[
          'update_param_noise_threshold'] = update_param_noise_threshold
        kwargs['update_param_noise_scale'] = True

      action = act(
        np.array(screen)[None], update_eps=update_eps, **kwargs)[0]

      # action_y = act_y(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]

      reset = False

      coord = [player[0], player[1]]
      rew = 0

      if (action == 0):  #UP

        if (player[1] >= 8):
          coord = [player[0], player[1] - 8]
          #path_memory_[player[1] - 16 : player[1], player[0]] = -1
        elif (player[1] > 0):
          coord = [player[0], 0]
          #path_memory_[0 : player[1], player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 1):  #DOWN

        if (player[1] <= 23):
          coord = [player[0], player[1] + 8]
          #path_memory_[player[1] : player[1] + 16, player[0]] = -1
        elif (player[1] > 23):
          coord = [player[0], 31]
          #path_memory_[player[1] : 63, player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 2):  #LEFT

        if (player[0] >= 8):
          coord = [player[0] - 8, player[1]]
          #path_memory_[player[1], player[0] - 16 : player[0]] = -1
        elif (player[0] < 8):
          coord = [0, player[1]]
          #path_memory_[player[1], 0 : player[0]] = -1
          #else:
          #  rew -= 1

      elif (action == 3):  #RIGHT

        if (player[0] <= 23):
          coord = [player[0] + 8, player[1]]
          #path_memory_[player[1], player[0] : player[0] + 16] = -1
        elif (player[0] > 23):
          coord = [31, player[1]]
          #path_memory_[player[1], player[0] : 63] = -1
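      # actions 0-3 move the selected marines 8 pixels up/down/left/right,
      # clamping the target coordinate to the 32x32 screen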

      if _MOVE_SCREEN not in obs[0].observation["available_actions"]:
        obs = env.step(actions=[
          sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        ])

      new_action = [
        sc2_actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
      ]

      # else:
      #   new_action = [sc2_actions.FunctionCall(_NO_OP, [])]

      obs = env.step(actions=new_action)

      player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
      new_screen = (player_relative == _PLAYER_NEUTRAL).astype(
        int)  #+ path_memory

      player_y, player_x = (
        player_relative == _PLAYER_FRIENDLY).nonzero()
      player = [int(player_x.mean()), int(player_y.mean())]

      if (player[0] > 16):
        new_screen = shift(LEFT, player[0] - 16, new_screen)
      elif (player[0] < 16):
        new_screen = shift(RIGHT, 16 - player[0], new_screen)

      if (player[1] > 16):
        new_screen = shift(UP, player[1] - 16, new_screen)
      elif (player[1] < 16):
        new_screen = shift(DOWN, 16 - player[1], new_screen)

      rew = obs[0].reward

      done = obs[0].step_type == environment.StepType.LAST

      # Store transition in the replay buffer.
      replay_buffer.add(screen, action, rew, new_screen, float(done))
      # replay_buffer_y.add(screen, action_y, rew, new_screen, float(done))

      screen = new_screen

      episode_rewards[-1] += rew
      reward = episode_rewards[-1]

      if done:
        obs = env.reset()
        player_relative = obs[0].observation["screen"][
          _PLAYER_RELATIVE]

        screen = (player_relative == _PLAYER_NEUTRAL).astype(
          int)  #+ path_memory

        player_y, player_x = (
          player_relative == _PLAYER_FRIENDLY).nonzero()
        player = [int(player_x.mean()), int(player_y.mean())]

        # Select all marines first
        env.step(actions=[
          sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        ])
        episode_rewards.append(0.0)
        #episode_minerals.append(0.0)

        reset = True

      if t > learning_starts and t % train_freq == 0:
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if prioritized_replay:

          experience = replay_buffer.sample(
            batch_size, beta=beta_schedule.value(t))
          (obses_t, actions, rewards, obses_tp1, dones, weights,
           batch_idxes) = experience

          # experience_y = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
          # (obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y, batch_idxes_y) = experience_y
        else:

          obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
            batch_size)
          weights, batch_idxes = np.ones_like(rewards), None

          # obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y = replay_buffer_y.sample(batch_size)
          # weights_y, batch_idxes_y = np.ones_like(rewards_y), None

        td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                          weights)

        # td_errors_y = train_x(obses_t_y, actions_y, rewards_y, obses_tp1_y, dones_y, weights_y)

        if prioritized_replay:
          new_priorities = np.abs(td_errors) + prioritized_replay_eps
          # new_priorities = np.abs(td_errors) + prioritized_replay_eps
          replay_buffer.update_priorities(batch_idxes,
                                          new_priorities)
          # replay_buffer.update_priorities(batch_idxes, new_priorities)

      if t > learning_starts and t % target_network_update_freq == 0:
        # Update target network periodically.
        update_target()
        # update_target_y()

      mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
      num_episodes = len(episode_rewards)
      if done and print_freq is not None and len(
          episode_rewards) % print_freq == 0:
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("reward", reward)
        logger.record_tabular("mean 100 episode reward",
                              mean_100ep_reward)
        logger.record_tabular("% time spent exploring",
                              int(100 * exploration.value(t)))
        logger.dump_tabular()

      if (checkpoint_freq is not None and t > learning_starts
          and num_episodes > 100 and t % checkpoint_freq == 0):
        if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
          if print_freq is not None:
            logger.log(
              "Saving model due to mean reward increase: {} -> {}".
                format(saved_mean_reward, mean_100ep_reward))
          U.save_state(model_file)
          model_saved = True
          saved_mean_reward = mean_100ep_reward
    if model_saved:
      if print_freq is not None:
        logger.log("Restored model with mean reward: {}".format(
          saved_mean_reward))
      U.load_state(model_file)

  return ActWrapper(act)
Example #32
0
def learn(env,
          q_func,
          policy_fn,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None):
    # Create all the functions necessary to train the model

    sess = tf.Session()
    sess.__enter__()

    # capture the shape outside the closure so that the env object is not serialized
    # by cloudpickle when serializing make_obs_ph
    observation_space_shape = env.observation_space.shape
    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)
    
    scope = "ampi"
    reuse=None
    grad_norm_clipping=None
    num_actions=env.action_space.n
    optimizer_q=tf.train.AdamOptimizer(learning_rate=lr)
    optimizer_pi=tf.train.AdamOptimizer(learning_rate=lr)
    act = build_act(make_obs_ph, q_func, num_actions=env.action_space.n, scope=scope, reuse=reuse)
    
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")
        
        # add
        ob_space = env.observation_space
        ac_space = env.action_space
        pi, act = policy_fn(obs_t_input.get(), ob_space, ac_space, scope="pi_func") # train pi
        pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/pi_func")
        
        pi_tp1, act_tp1 = policy_fn(obs_tp1_input.get(), ob_space, ac_space, scope="target_pi_func") # target pi
        target_pi_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_pi_func")
 
        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func")

        # target q network evalution
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func")
        
        # Q_{train}(a,s)
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) 
        
        # y_j
        act_best = tf.argmax(pi, axis=1) # argmax \pi(s_{j+1})
        q_tp1_sampled = tf.reduce_sum(q_tp1 * tf.one_hot(act_best, num_actions), 1) # Q_{target}(s_{j+1}, argmax(\pi(s_{j+1}))
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_sampled
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked
        
        # Regression loss
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        
        # z_j = argmax_a Q_{target}(s_{j+1}, a), the greedy action under the target Q network
        z_j = tf.argmax(q_tp1, axis=1)

        # classification loss
        cl_error = tf.nn.sparse_softmax_cross_entropy_with_logits(
                      logits=pi, labels=z_j)
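        # i.e. the policy network pi is trained via cross-entropy to predict the greedy
        # action z_j of the target Q network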
        
        # Q optimization
        if grad_norm_clipping is not None:
            gradients_q = optimizer_q.compute_gradients(weighted_error, var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients_q):
                if grad is not None:
                    gradients_q[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_q = optimizer_q.apply_gradients(gradients_q)
        else:
            optimize_q = optimizer_q.minimize(weighted_error, var_list=q_func_vars)

        # pi optimization
        if grad_norm_clipping is not None:
            gradients_pi = optimizer_pi.compute_gradients(cl_error, var_list=pi_func_vars)
            for i, (grad, var) in enumerate(gradients_pi):
                if grad is not None:
                    gradients_pi[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var)
            optimize_pi = optimizer_pi.apply_gradients(gradients_pi)
        else:
            optimize_pi = optimizer_pi.minimize(cl_error, var_list=pi_func_vars)

        # update_target Q
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # update_target pi
        update_target_pi = []
        for var, var_target in zip(sorted(pi_func_vars, key=lambda v: v.name),
                                   sorted(target_pi_func_vars, key=lambda v: v.name)):
            update_target_pi.append(var_target.assign(var))
        update_target_pi = tf.group(*update_target_pi)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=[td_error, cl_error],
            updates=[optimize_q, optimize_pi]
        )
        update_target = U.function([], [], updates=[update_target_expr, update_target_pi])

        q_values = U.function([obs_t_input], q_t)

        debug = {'q_values': q_values}

    # Create the replay buffer
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            
            action = env.action_space.sample() # not used, just so we have the datatype
            stochastic = True
            ac1, vpred1 = act(stochastic, np.array(obs)[None])
            action = ac1[0]
            #action, _ = pi.act(stochastic, obs)
            
            #action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()
            

            # Log train and res
            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                                   saved_mean_reward, mean_100ep_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)

    return act
Example #33
0
            action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
            reset = False
            new_obs, rew, done, info = env.step(action)
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs
            if done:
                num_iters_since_reset = 0
                obs = env.reset()
                reset = True

            if (num_iters > max(5 * args.batch_size, args.replay_buffer_size // 20) and
                    num_iters % args.learning_freq == 0):
                # Sample a bunch of transitions from replay buffer
                if args.prioritized:
                    experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(num_iters))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size)
                    weights = np.ones_like(rewards)
                # Minimize the error in Bellman's equation and compute TD-error
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                # Update the priorities in the replay buffer
                if args.prioritized:
                    new_priorities = np.abs(td_errors) + args.prioritized_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)
            # Update target network.
            if num_iters % args.target_update_freq == 0:
                update_target()

            if start_time is not None:
Example #34
0
class DQN(BaseAgent):
    def __init__(self,
                 env,
                 name='default',
                 alg_name='dqn',
                 network_type='mini-mlp',
                 total_timesteps=5e7,
                 batch_size=32,
                 lr=1e-3,
                 gamma=0.99,
                 buffer_size=1e6,
                 final_eps=0.05,
                 exploration_fraction=0.1,
                 training_start=1e5,
                 target_update_freq=1e4,
                 optimizer=tf.train.AdamOptimizer,
                 gradient_clipping=None,
                 reward_clipping=False,
                 tau=1.,
                 double_q=False,
                 dueling=False,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.5,
                 prioritized_replay_beta_init=0.4,
                 prioritized_replay_beta_fraction=1.0,
                 prioritized_replay_eps=1e-6,
                 rolling_reward_mean=20,
                 solved_callback=None,
                 render_training=False,
                 **kwargs):
        """
        Implementation of the Deep Q Learning (DQN) algorithm formulated by Mnih et. al.
        Contains some well known improvements over the vanilla DQN.

        Parameters
        ----------
        env: gym.Environment
            (gym) Environment the agent shall learn from and act on

        name: str
            descriptive name of this DQN configuration, e.g. 'atari-breakout'

        network_type: str
            which network architecture from 'networks.py' to use

        total_timesteps: int or float
            number of training timesteps

        batch_size: int
            size of minibatch per backprop

        lr: float
            learning rate

        gamma: float
            discount factor gamma for bellman target

        buffer_size: int or float
            maximum number of transitions in the replay buffer

        final_eps: float
            value to which epsilon is annealed

        exploration_fraction: float
            fraction of training timesteps over which epsilon is annealed

        training_start: int
            timestep at which training of the q network begins

        target_update_freq: int
            frequency of target network updates (in timesteps)

        optimizer: tf.Optimizer
            optimizer class which shall be used such as Adam or RMSprop

        gradient_clipping: int
            if not None, gradients are clipped by this value by norm

        reward_clipping: float
            rewards will be clipped to this value if not None

        tau: float
            interpolation constant for soft update. 1.0 corresponds to
            a full synchronisation of networks weights, as in the original DQN paper

        double_q: bool
            enables Double Q Learning for DQN

        dueling: bool
            splits the network architecture into advantage and value streams. The value
            stream gets more frequent updates, which should stabilize learning

        prioritized_replay: bool
            use (proportional) prioritized replay

        prioritized_replay_alpha: float
            alpha for weighting priorization

        prioritized_replay_beta_init: float
            initial value of beta for prioritized replay buffer

        prioritized_replay_beta_fraction: float
            fraction of total timesteps to anneal beta to 1.0

        prioritized_replay_eps: float
            epsilon to add to the TD errors when updating priorities.

        rolling_reward_mean: int
            window of which the rolling mean in the statistics is computed

        solved_callback: function
            function which takes the episode rewards as an array and must return a bool.
            if it returns True, training is considered solved and is stopped early.

        render_training: bool
            whether to render the environment while training

        """

        # instance name
        self.name = name

        # environment to act on / learn from
        self.env = env

        # basic DQN parameters
        self.total_timesteps = float(total_timesteps)
        self.buffer_size = int(float(buffer_size))
        self.batch_size = batch_size
        self.final_eps = final_eps
        self.lr = float(lr)
        self.gamma = float(gamma)
        self.exploration_fraction = float(exploration_fraction)
        self.training_start = int(float(training_start))
        self.target_update_freq = int(float(target_update_freq))

        # tf.Optimizer
        self.optimizer = optimizer

        # minor changes as suggested in some papers
        self.gradient_clipping = int(
            gradient_clipping) if gradient_clipping is not None else None
        self.reward_clipping = int(
            reward_clipping) if reward_clipping is not None else None

        # enhancements to DQN published in papers
        self.tau = float(tau)
        self.double_q = double_q
        self.dueling = dueling
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = float(prioritized_replay_alpha)
        self.prioritized_replay_beta_init = float(prioritized_replay_beta_init)
        self.prioritized_replay_beta_fraction = float(
            prioritized_replay_beta_fraction)
        self.prioritized_replay_eps = float(prioritized_replay_eps)

        # function to determine whether agent is able to act well enough
        self.solved_callback = solved_callback

        # call env.render() each training step
        self.render_training = render_training

        # sliding window for reward calc
        self.rolling_reward_mean = rolling_reward_mean

        # stores latest measure for best policy, e.g. best mean over last N episodes
        self.latest_best = 0.0

        super().__init__(env, alg_name, name, **kwargs)

        # calculate timestep where epsilon reaches its final value
        self.schedule_timesteps = int(self.total_timesteps *
                                      self.exploration_fraction)

        # sanity checks
        assert 0.0 < self.tau <= 1.0

        # env specific parameter
        self.obs_shape = env.observation_space.shape
        self.num_actions = env.action_space.n

        # tf scopes
        self.Q_SCOPE = 'q_network'
        self.TARGET_SCOPE = 'target_network'

        # build Q and target network; using different scopes to distinguish variables for gradient computation
        self.q_t_in, self.q_t = build_network(self.obs_shape,
                                              self.num_actions,
                                              network_type=network_type,
                                              dueling=self.dueling,
                                              scope=self.Q_SCOPE,
                                              summaries=True)
        self.target_tp1_in, self.target_tp1 = build_network(
            self.obs_shape,
            self.num_actions,
            dueling=self.dueling,
            network_type=network_type,
            scope=self.TARGET_SCOPE)

        # double Q learning needs to pass observations t+1 to the q networks for action selection
        # so we reuse already created q network variables but with different input
        if self.double_q:
            self.q_tp1_in, self.q_tp1 = build_network(
                self.obs_shape,
                self.num_actions,
                dueling=self.dueling,
                network_type=network_type,
                scope=self.Q_SCOPE,
                reuse=True)

        # create replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)

        # list of variables of the different networks. required for copying
        # Q to target network and excluding target network variables from backprop
        self.q_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope=self.Q_SCOPE)
        self.target_net_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope=self.TARGET_SCOPE)

        # placeholders used in loss function
        self._L_r = tf.placeholder(tf.float32, (None, ), name='loss_rewards')
        self._L_a = tf.placeholder(tf.int32, (None, ), name='loss_actions')
        self._L_d = tf.placeholder(tf.float32, (None, ), name='loss_dones')

        # placeholder for the TD-error vector (reassigned to the actual TD-error tensor in _loss())
        self._td_errors = tf.placeholder(tf.float32, (None, ),
                                         name='td_errors')

        # configure prioritized replay
        if self.prioritized_replay:
            self._is_weights = tf.placeholder(
                tf.float32, (None, ), name='importance_sampling_weights')

            # schedule for PR beta
            beta_steps = int(self.total_timesteps *
                             self.prioritized_replay_beta_fraction)
            self.pr_beta = LinearSchedule(
                beta_steps,
                initial_p=prioritized_replay_beta_init,
                final_p=1.0)

        # epsilon schedule
        self.eps = LinearSchedule(self.schedule_timesteps, final_p=final_eps)
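        # anneals epsilon from the schedule's initial value (assumed 1.0 by default) down to final_eps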

        # init optimizer
        self.opt = self.optimizer(self.lr)

        # specify loss function, only include Q network variables for gradient computation
        self.gradients = self.opt.compute_gradients(self._loss(),
                                                    var_list=self.q_net_vars)

        # clip gradients by norm
        if self.gradient_clipping is not None:
            for idx, (grad, var) in enumerate(self.gradients):
                if grad is not None:
                    self.gradients[idx] = (tf.clip_by_norm(
                        grad, self.gradient_clipping), var)

        # create training op
        self.train_op = self.opt.apply_gradients(self.gradients)

        # update_target_fn will be called periodically to copy Q network to target Q network
        # variable lists are sorted by name to ensure that correct values are copied
        self.update_target_ops = []
        for var_q, var_target in zip(
                sorted(self.q_net_vars, key=lambda v: v.name),
                sorted(self.target_net_vars, key=lambda v: v.name)):
            v_update = var_target.assign(self.tau * var_q +
                                         (1 - self.tau) * var_target)
            self.update_target_ops.append(v_update)
        self.update_target_ops = tf.group(*self.update_target_ops)
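        # with tau == 1.0 (the default) this reduces to a hard copy of the Q network
        # weights into the target network, matching the original DQN update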

        # global tf.Session and Graph init
        self.sess = tf.Session()

        # init tensorboard, variables and debug
        self._finalize_init()

        # sync networks before training
        self.sess.run(self.update_target_ops)

    def _setup_tensorboard(self):
        """
        Adds all variables that might help debugging to Tensorboard.
        At the end, the FileWriter is constructed pointing to the specified directory.

        """

        # more placeholders for summarised variables; along with summaries
        self.eps_ph = tf.placeholder(tf.float32, (), name='epsilon')
        self.rew_ph = tf.placeholder(tf.float32, (), name='rolling-reward')

        scalar_summary('epsilon', self.eps_ph)
        scalar_summary('reward', self.rew_ph)

        # display q_values while training
        for a_i in range(self.num_actions):
            scalar_summary('QTa_{}'.format(a_i + 1),
                           tf.reduce_mean(self.target_tp1[:, a_i]),
                           scope='Q-Values')
            scalar_summary('Qa_{}'.format(a_i + 1),
                           tf.reduce_mean(self.q_t[:, a_i]),
                           scope='Q-Values')

        # plot network weights
        with tf.variable_scope('weights'):
            for qv in self.q_net_vars:
                tf.summary.histogram('{}'.format(qv.name), qv)
            for tv in self.target_net_vars:
                tf.summary.histogram('{}'.format(tv.name), tv)

        # gradient histograms
        with tf.variable_scope('gradients'):
            for g in self.gradients:
                tf.summary.histogram('{}-grad'.format(g[1].name), g[0])

    def _loss(self):
        """ Defines loss as layed out in the original Nature paper """

        with tf.variable_scope('loss'):

            # either use maximum target q or use value from target network while the action is chosen by the q net
            if self.double_q:
                act_tp1_idxs = tf.stop_gradient(tf.argmax(self.q_tp1, axis=1))
                q_tp1 = tf.reduce_sum(
                    self.target_tp1 *
                    tf.one_hot(act_tp1_idxs, self.num_actions),
                    axis=1)
            else:

                q_tp1 = tf.reduce_max(self.target_tp1, axis=1)

            # bellman target
            y = self._L_r + (self.gamma * (1.0 - self._L_d) * q_tp1)
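            # (1.0 - dones) zeroes the bootstrap term for terminal transitions,
            # so their target is just the observed reward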

            # select q value of taken action
            qj = tf.reduce_sum(self.q_t *
                               tf.one_hot(self._L_a, self.num_actions),
                               axis=1)

            # TD errors
            self._td_errors = qj - y
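            # these per-sample TD errors are fetched in learn() to refresh the buffer
            # priorities when prioritized replay is enabled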

            # apply huber loss
            loss = tf.losses.huber_loss(y, qj)

        if self.use_tensorboard:
            scalar_summary('target', tf.reduce_mean(y))
            scalar_summary('huber-loss', tf.reduce_mean(loss))
            tf.summary.histogram('selected_Q', qj)

        #  importance sampling weights
        if self.prioritized_replay:
            updates = tf.reduce_mean(self._is_weights * loss)
        else:
            updates = tf.reduce_mean(loss)

        return updates

    def _build_feed_dict(self,
                         obs_t,
                         ac_t,
                         rew_t,
                         obs_tp1,
                         dones,
                         eps,
                         rolling_rew,
                         weights=None):
        """ Takes minibatch and returns feed dict for a tf.Session based on the algorithms configuration. """

        # first, add data required in all DQN configs
        feed_d = {
            self.q_t_in: obs_t,
            self.target_tp1_in: obs_tp1,
            self._L_r: rew_t,
            self._L_a: ac_t,
            self._L_d: dones
        }

        # pass obs t+1 to q network
        if self.double_q:
            feed_d[self.q_tp1_in] = obs_tp1

        # importance sampling weights
        if self.prioritized_replay:
            feed_d[self._is_weights] = weights

        # variables only necessary for TensorBoard visualisation
        if self.use_tensorboard:
            feed_d[self.eps_ph] = eps
            feed_d[self.rew_ph] = rolling_rew

        return feed_d

    def learn(self):
        """ Learns Q function for a given amount of timesteps """

        # reset env, store first observation
        obs_t = self.env.reset()

        # save all episode rewards
        episode_reward_series = [[0.0]]
        episode_rewards = []

        self.logger.info(
            'Starting Exploration, training will start at step {}.'.format(
                self.training_start))

        for t in tqdm(range(int(self.total_timesteps))):

            # decide on action either by policy or chose a random one
            epsilon = self.eps.value(t)
            _rand = np.random.choice([True, False], p=[epsilon, 1 - epsilon])
            if _rand:
                action = self.env.action_space.sample()
            else:
                action = np.argmax(self.sess.run(self.q_t,
                                                 {self.q_t_in: [obs_t]}),
                                   axis=1)
                assert len(action) == 1, 'only one action can be taken!'
                action = action[0]

            # act on environment with chosen action
            obs_tp1, reward, done, _ = self.env.step(action)

            # clip reward
            if self.reward_clipping:
                reward = 1 if reward > 0 else -1 if reward < 0 else 0

            # store new transition
            self.replay_buffer.add(obs_t, action, reward, obs_tp1, float(done))

            # new observation will be current one in next iteration
            obs_t = obs_tp1

            # append current rewards to episode reward series
            episode_reward_series[-1].append(reward)

            if self.render_training:
                self.env.render()

            if t == self.training_start:
                self.logger.info('Training starts now! (t = {})'.format(t))

            # final calculations and env reset
            if done:
                # calculate total reward
                episode_rewards.append(np.sum(episode_reward_series[-1]))
                episode_reward_series.append([0.0])

                # reset env to initial state
                obs_t = self.env.reset()

            # start training after warmup period
            if t >= self.training_start:

                # calculate rolling reward
                rolling_r = np.mean(episode_rewards[-self.rolling_reward_mean:]
                                    ) if len(episode_rewards) > 0 else 0.0

                # post episode stuff: printing and saving
                if done:
                    result_table = [
                        ['t', t],
                        ['episode', len(episode_rewards)],
                        ['mean_reward [{}]'.format(self.rolling_reward_mean),
                         rolling_r],
                        ['epsilon', epsilon],
                    ]
                    print('\n{}'.format(tabulate(result_table)))

                    # if the policy improved, save it as the new best. A good reward
                    # in a single episode is not a reliable metric on its own;
                    # consistently achieving good rewards (the rolling mean below)
                    # is the better indicator.
                    if len(episode_rewards) >= 25:
                        mr = np.mean(
                            episode_rewards[-self.rolling_reward_mean:])
                        if mr >= self.latest_best:
                            self.latest_best = mr
                            self.logger.info(
                                'Saving new best policy with mean[{}]_r = {} ...'
                                .format(self.rolling_reward_mean, mr))
                            self._save('best')

                    # save latest policy
                    self._save()

                    # write current values to csv log
                    self.csvlog.write('{}, {}, {}\n'.format(
                        len(episode_rewards), epsilon, episode_rewards[-1]))
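                    # (CSV columns: episode index, current epsilon, total reward
                    # of the episode that just finished)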

                # sample a batch of transitions from the replay buffer and build
                # the feed dict; prioritized replay additionally needs a beta
                # exponent and returns IS weights and the sampled batch indices.
                if self.prioritized_replay:
                    o_t, a_t, r_t, o_tp1, do, is_ws, batch_idxs = self.replay_buffer.sample(
                        self.batch_size, self.pr_beta.value(t))
                    feed = self._build_feed_dict(o_t,
                                                 a_t,
                                                 r_t,
                                                 o_tp1,
                                                 do,
                                                 epsilon,
                                                 rolling_r,
                                                 weights=is_ws)
                else:
                    o_t, a_t, r_t, o_tp1, do = self.replay_buffer.sample(
                        self.batch_size)
                    feed = self._build_feed_dict(o_t, a_t, r_t, o_tp1, do,
                                                 epsilon, rolling_r)
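                # (assumption: self.pr_beta is a schedule that anneals the
                # importance-sampling exponent beta towards 1.0 over training,
                # as in the prioritized experience replay setup; the returned
                # is_ws weights correct for the non-uniform sampling)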

                # run training (and summary) operations
                if self.use_tensorboard:
                    summary, _, td_errors = self.sess.run(
                        [self.merge_op, self.train_op, self._td_errors],
                        feed_dict=feed)
                    self.writer.add_summary(summary, t)
                else:
                    # also fetch the TD errors here; they are needed below when
                    # prioritized replay is enabled
                    _, td_errors = self.sess.run(
                        [self.train_op, self._td_errors], feed_dict=feed)

                # use the fresh TD errors to update the transition priorities
                if self.prioritized_replay:
                    new_prios = np.abs(td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(batch_idxs, new_prios)
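                    # proportional prioritization: p_i = |td_error_i| + eps,
                    # where the small eps keeps transitions with near-zero TD
                    # error from never being sampled again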

                # sync target network every C steps
                if (t - self.training_start) % self.target_update_freq == 0:
                    self.sess.run(self.update_target_ops)
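                # (assumption, not shown in this excerpt: update_target_ops is
                # typically a list of tf.assign ops that copy every online-network
                # variable into its target-network counterpart, roughly
                #     [tf.assign(v_target, v_online)
                #      for v_online, v_target in zip(online_vars, target_vars)])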

            if self.solved_callback is not None:
                if self.solved_callback(episode_rewards):
                    self.logger.info('Solved!')
                    break

        # total (possibly partial) reward of the last episode
        episode_rewards.append(np.sum(episode_reward_series[-1]))

        # finalize training, e.g. set flags, write done-file
        self._finalize_training()
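
    # Usage sketch (hypothetical; the class name `DQNAgent` and its constructor
    # arguments are not part of this excerpt):
    #     agent = DQNAgent(env=gym.make('CartPole-v1'), ...)
    #     agent.learn()            # train for self.total_timesteps steps
    #     agent.run(render=True)   # roll out the policy with epsilon = final_eps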

    def run(self, render=True):
        """ Runs policy on given environment """

        if not self.is_trained:
            self.logger.warning('Trying to run untrained model!')

        # set necessary parameters to their defaults
        epsilon = self.final_eps
        reward = 0.0
        obs = self.env.reset()

        while True:

            # decide on an action: with probability epsilon (= self.final_eps
            # here) act randomly, otherwise follow the greedy policy
            if np.random.rand() < epsilon:
                action = self.env.action_space.sample()
            else:
                action = np.argmax(self.sess.run(self.q_t,
                                                 {self.q_t_in: [obs]}),
                                   axis=1)
                assert len(action) == 1, 'only one action can be taken!'
                action = action[0]

            # act on environment with chosen action
            obs, rew, done, _ = self.env.step(action)
            reward += rew

            if render:
                self.env.render()

            if done:
                self.logger.info('Done! Reward {}'.format(reward))
                reward = 0.0
                obs = self.env.reset()