Example #1
    def test_replay_buffer_sample(self):
        templates = [('test1', tf.int32, ()), ('test2', tf.int32, (1, )),
                     ('test3', tf.int32, (2, 2))]
        capacity = 5
        rep_buf = replay_buffer.ReplayBuffer(templates, capacity)
        memory = [
            tf.Variable(1, dtype=tf.int32, trainable=False),
            tf.Variable([2], dtype=tf.int32, trainable=False),
            tf.Variable([[1, 2], [3, 4]], dtype=tf.int32, trainable=False)
        ]
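        # The control dependency ensures the increments of `memory` run before
        # the append every time `inc_index_op` is executed.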
        with tf.control_dependencies([v.assign(v + 1) for v in memory]):
            inc_index_op = rep_buf.append(memory)
        samples_t = rep_buf.sample(5)

        sess = tf.InteractiveSession()
        sess.run(tf.global_variables_initializer())
        for _ in range(5):
            sess.run(inc_index_op)
        samples = sess.run(samples_t)

        self.assertEqual(len(samples), 3)
        self.assertEqual(len(samples[0]), 5)
Example #2
    def test_replay_buffer_init(self):
        templates = [('test1', tf.int32, ()), ('test2', tf.int32, (1, )),
                     ('test3', tf.int32, (2, 2))]
        capacity = 2
        rep_buf = replay_buffer.ReplayBuffer(templates, capacity)

        sess = tf.InteractiveSession()
        sess.run(tf.global_variables_initializer())
        buffers = sess.run(rep_buf.buffers)

        self.assertTrue(np.array_equal(buffers['test1'], [0, 0]))
        self.assertTrue(np.array_equal(buffers['test2'], [[0], [0]]))
        self.assertTrue(
            np.array_equal(buffers['test3'],
                           [[[0, 0], [0, 0]], [[0, 0], [0, 0]]]))
Example #3
    def train(self):

        global_step = 0

        if self.prioritized:
            self.memory = replay_buffer.PrioritizedReplayBuffer(
                self.replay_buffer_size, self.prioritized_alpha)
            self.beta_schedule = schedules.LinearSchedule(
                self.num_episodes, initial_p=self.prioritized_beta, final_p=0)
        else:
            self.memory = replay_buffer.ReplayBuffer(self.replay_buffer_size)
            self.beta_schedule = None

        for episode in range(self.num_episodes):

            global_step = self._episode(episode, global_step)

            # Save checkpoint
            if episode % self.save_frequency == 0:
                model_name = 'dqn_checkpoint_' + str(episode) + '.pth'
                torch.save(self.q_fn.state_dict(),
                           os.path.join(self.model_path, model_name))
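For context, the `beta_schedule` built above is normally consumed inside the per-episode update, where the annealed beta sets the strength of the importance-sampling correction for prioritized replay. The sketch below illustrates that pattern; it assumes a baselines-style `sample(batch_size, beta)` / `update_priorities(indices, priorities)` interface plus hypothetical `self.batch_size` and `self._compute_td_error` helpers, and is not this project's `_episode` implementation.

    def _replay_step(self, episode):
        # Illustrative sketch only: the buffer API is assumed to be
        # baselines-style, and `_compute_td_error` is a hypothetical helper.
        if self.prioritized:
            beta = self.beta_schedule.value(episode)  # annealed IS exponent
            (states, actions, rewards, next_states, dones,
             weights, indices) = self.memory.sample(self.batch_size, beta)
        else:
            states, actions, rewards, next_states, dones = self.memory.sample(
                self.batch_size)
            weights, indices = np.ones_like(rewards), None
        td_errors = self._compute_td_error(states, actions, rewards,
                                           next_states, dones, weights)
        if self.prioritized:
            # Keep priorities proportional to the absolute TD error.
            self.memory.update_priorities(indices, np.abs(td_errors) + 1e-6)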
Example #4
    def test_replay_buffer_append(self):
        templates = [('test1', tf.int32, ()), ('test2', tf.int32, (1, )),
                     ('test3', tf.int32, (2, 2))]
        capacity = 2
        rep_buf = replay_buffer.ReplayBuffer(templates, capacity)
        memory = [
            tf.constant(1),
            tf.constant([2]),
            tf.constant([[1, 2], [3, 4]])
        ]
        inc_index_op = rep_buf.append(memory)

        sess = tf.InteractiveSession()
        sess.run(tf.global_variables_initializer())
        sess.run(inc_index_op)
        index, buffers = sess.run([rep_buf.index, rep_buf.buffers])

        self.assertEqual(index, 1)
        self.assertTrue(np.array_equal(buffers['test1'], [1, 0]))
        self.assertTrue(np.array_equal(buffers['test2'], [[2], [0]]))
        self.assertTrue(
            np.array_equal(buffers['test3'],
                           [[[1, 2], [3, 4]], [[0, 0], [0, 0]]]))
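Taken together, the init/append/sample tests above pin down the buffer's interface: a `buffers` dict of zero-initialized variables keyed by template name, a scalar `index`, an `append` op that writes one entry and advances the index, and a `sample` op returning one batch of entries per template. A minimal sketch consistent with those tests follows; the modular write index, `tf.scatter_update`, and uniform sampling are assumptions for illustration, not the project's actual implementation.

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()  # the tests above use TF1-style graph mode


class ReplayBuffer(object):
    """Fixed-capacity buffer backed by zero-initialized TF variables."""

    def __init__(self, templates, capacity):
        self.capacity = capacity
        self._names = [name for name, _, _ in templates]
        self.buffers = {
            name: tf.Variable(tf.zeros((capacity,) + shape, dtype=dtype),
                              trainable=False, name=name)
            for name, dtype, shape in templates
        }
        self.index = tf.Variable(0, dtype=tf.int32, trainable=False,
                                 name='index')

    def append(self, values):
        # Write each value into slot `index % capacity`, then advance the index.
        slot = tf.mod(self.index, self.capacity)
        updates = [tf.scatter_update(self.buffers[name], slot, value)
                   for name, value in zip(self._names, values)]
        with tf.control_dependencies(updates):
            return self.index.assign_add(1)

    def sample(self, batch_size):
        # Sample uniformly from the part of the buffer written so far.
        limit = tf.minimum(self.index, self.capacity)
        ids = tf.random_uniform([batch_size], maxval=limit, dtype=tf.int32)
        return [tf.gather(self.buffers[name], ids) for name in self._names]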
Example #5
    if args.save_model and not os.path.exists("./models"):
        os.makedirs("./models")

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        if not os.path.exists(f"./models/{policy_file}"):
            raise FileNotFoundError(
                f"The model path `./models/{policy_file}` does not exist!")
        policy.load(f"./models/{policy_file}")

    # Setup loggers
    logger_kwargs = setup_logger_kwargs(args.exp_name,
                                        args.seed,
                                        datestamp=False)
    logger = EpochLogger(**logger_kwargs)

    _replay_buffer = replay_buffer.ReplayBuffer(state_dim, action_dim)

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    start_time = time.time()

    for t in range(int(args.max_timesteps)):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < int(args.start_timesteps):
            action = env.action_space.sample()
        else:
            if args.policy.startswith("SAC"):
Example #6
    def __init__(self, sess, state_dim, action_dim, name, env):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.name = name
        self.env = env

        self.config = ConfigParser.ConfigParser()
        self.config.read("./config.ini")

        self.exploitability = 0
        self.iteration = 0

        # init parameters
        self.minibatch_size = int(self.config.get('Agent', 'MiniBatchSize'))
        self.n_hidden = int(self.config.get('Agent', 'HiddenLayer'))
        self.lr_br = float(self.config.get('Agent', 'LearningRateBR'))
        self.lr_ar = float(self.config.get('Agent', 'LearningRateAR'))
        self.epsilon = float(self.config.get('Agent', 'Epsilon'))
        self.epsilon_min = float(self.config.get('Agent', 'EpsilonMin'))
        self.gamma = float(self.config.get('Agent', 'Gamma'))
        self.omega = float(self.config.get('Agent', 'Omega'))
        self.sgd_br = SGD(lr=self.lr_br)
        self.sgd_ar = SGD(lr=self.lr_ar)
        self.target_model_update_rate = int(self.config.get('Agent', 'TargetModelUpdateRate'))

        # Decays from 1 (at iteration 0) as `self.iteration` grows.
        self.temp = (1 + 0.02 * np.sqrt(self.iteration))**(-1)

        # mixed anticipatory parameter
        self.eta = float(self.config.get('Agent', 'Eta'))

        # target network update counter
        self.target_br_model_update_count = 0

        # reinforcement learning memory
        self._rl_memory = ReplayBuffer.ReplayBuffer(int(self.config.get('Utils', 'Buffersize')),
                                                    int(self.config.get('Utils', 'Seed')))

        # supervised learning memory
        self._sl_memory = ReservoirBuffer.ReservoirBuffer(int(self.config.get('Utils', 'Buffersize')),
                                                          int(self.config.get('Utils', 'Seed')))

        # build average strategy model
        self.avg_strategy_model = self._build_avg_response_model()

        # build dqn aka best response model
        self.best_response_model = self._build_best_response_model()
        self.target_br_model = self._build_best_response_model()
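        # Start the target network fully synchronized with the best-response model.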
        self.target_br_model.set_weights(self.best_response_model.get_weights())

        # build supervised learning model
        # self.average_response_model = self._build_avg_response_model()
        self.actions = np.zeros(3)

        self.played = 0
        self.reward = 0
        self.test_reward = 0
        self.game_step = 0

        # tensorBoard
        self.tensorboard_br = TensorBoard(log_dir='./logs/'+self.name+'rl', histogram_freq=0,
                                          write_graph=False, write_images=True)

        self.tensorboard_sl = TensorBoard(log_dir='./logs/'+self.name+'sl', histogram_freq=0,
                                          write_graph=False, write_images=True)
Example #7
def build_actor(agent, env, level_name, action_set, id_actor):
    """Builds the actor loop."""
    # Initial values.
    initial_env_output, initial_env_state = env.initial()
    initial_agent_state = agent.initial_state(1)
    initial_action = tf.zeros([1], dtype=tf.int32)

    # Run agent
    dummy_agent_output, _ = agent(
        (initial_action,
         nest.map_structure(lambda t: tf.expand_dims(t, 0),
                            initial_env_output)), initial_agent_state)
    initial_agent_output = nest.map_structure(
        lambda t: tf.zeros(t.shape, t.dtype), dummy_agent_output)

    # Initialize buffer
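    # `bool_buffer` and `FLAGS` are presumably module-level globals of the
    # original file; neither is defined in this excerpt.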
    if bool_buffer:
        buffer = replay_buffer.ReplayBuffer(id_actor, FLAGS.unroll_length,
                                            FLAGS.buffer_size,
                                            initial_env_output,
                                            initial_agent_output)

    # All state that needs to persist across training iterations. This includes
    # the last environment output, agent state and last agent output. These
    # variables should never go on the parameter servers.
    def create_state(t):
        # Creates a unique variable scope to ensure the variable name is unique.
        with tf.variable_scope(None, default_name='state'):
            return tf.get_local_variable(t.op.name,
                                         initializer=t,
                                         use_resource=True)

    persistent_state = nest.map_structure(
        create_state, (initial_env_state, initial_env_output,
                       initial_agent_state, initial_agent_output))

    def step(input_, unused_i):
        """Steps through the agent and the environment."""
        env_state, env_output, agent_state, agent_output = input_

        # Run agent.
        action = agent_output[0]
        batched_env_output = nest.map_structure(lambda t: tf.expand_dims(t, 0),
                                                env_output)

        # Forward one-step
        agent_output, agent_state = agent((action, batched_env_output),
                                          agent_state)

        # Convert action index to the native action.
        action = agent_output[0][0]
        raw_action = tf.gather(action_set, action)
        env_output, env_state = env.step(raw_action, env_state)

        return env_state, env_output, agent_state, agent_output

    # Run the unroll. `read_value()` is needed to make sure later usage will
    # return the first values and not a new snapshot of the variables.
    first_values = nest.map_structure(lambda v: v.read_value(),
                                      persistent_state)
    _, first_env_output, first_agent_state, first_agent_output = first_values

    # Use scan to apply `step` multiple times, thereby unrolling the agent
    # and environment interaction for `FLAGS.unroll_length`. `tf.scan` forwards
    # the output of each call of `step` as input of the subsequent call of `step`.
    # The unroll sequence is initialized with the agent and environment states
    # and outputs as stored at the end of the previous unroll.
    # `output` stores lists of all states and outputs stacked along the entire
    # unroll. Note that the initial states and outputs (fed through `initializer`)
    # are not in `output` and will need to be added manually later.
    output = tf.scan(step, tf.range(FLAGS.unroll_length), first_values)
    _, env_outputs, _, agent_outputs = output

    # Update persistent state with the last output from the loop.
    assign_ops = nest.map_structure(lambda v, t: v.assign(t[-1]),
                                    persistent_state, output)

    # The control dependency ensures that the final agent and environment states
    # and outputs are stored in `persistent_state` (to initialize next unroll).
    with tf.control_dependencies(nest.flatten(assign_ops)):
        # Remove the batch dimension from the agent state/output.
        first_agent_state = nest.map_structure(lambda t: t[0],
                                               first_agent_state)
        first_agent_output = nest.map_structure(lambda t: t[0],
                                                first_agent_output)
        agent_outputs = nest.map_structure(lambda t: t[:, 0], agent_outputs)

        # Concatenate first output and the unroll along the time dimension.
        full_agent_outputs, full_env_outputs = nest.map_structure(
            lambda first, rest: tf.concat([[first], rest], 0),
            (first_agent_output, first_env_output),
            (agent_outputs, env_outputs))

        # Append buffer
        if bool_buffer:
            op_assign_index = buffer.append(full_env_outputs,
                                            full_agent_outputs)

            with tf.control_dependencies([op_assign_index]):
                # Sample buffer
                env_outputs_vr, agent_outputs_vr = buffer.sample_sequence()
                env_outputs_pc, agent_outputs_pc = buffer.sample_sequence(
                    shift=1)
                env_outputs_rp, agent_outputs_rp = buffer.sample_rp_sequence()
                is_full = buffer.is_full()

                with tf.control_dependencies([is_full]):
                    output = ActorOutput(level_name=level_name,
                                         agent_state=first_agent_state,
                                         env_outputs=full_env_outputs,
                                         agent_outputs=full_agent_outputs,
                                         env_outputs_vr=env_outputs_vr,
                                         agent_outputs_vr=agent_outputs_vr,
                                         env_outputs_pc=env_outputs_pc,
                                         agent_outputs_pc=agent_outputs_pc,
                                         env_outputs_rp=env_outputs_rp,
                                         buffer_full=is_full)

                    # No backpropagation should be done here.
                    return nest.map_structure(tf.stop_gradient, output)
        else:
            output = ActorOutput(level_name=level_name,
                                 agent_state=first_agent_state,
                                 env_outputs=full_env_outputs,
                                 agent_outputs=full_agent_outputs)

            # No backpropagation should be done here.
            return nest.map_structure(tf.stop_gradient, output)
Example #8
File: main.py  Project: LQNew/LWDRLD
    if args.save_model and not os.path.exists("./models"):
        os.makedirs("./models")

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        if not os.path.exists(f"./models/{policy_file}"):
            raise FileNotFoundError(
                f"The model path `./models/{policy_file}` does not exist!")
        policy.load(f"./models/{policy_file}")

    # Setup loggers
    logger_kwargs = setup_logger_kwargs(args.exp_name,
                                        args.seed,
                                        datestamp=False)
    logger = EpochLogger(**logger_kwargs)

    _replay_buffer = replay_buffer.ReplayBuffer(int(args.buffer_size))

    print("Collecting experience...")
    epinfobuf = deque(maxlen=100)  # rolling window of recent episode stats for logging
    start_time = time.time()  # check learning time

    states = np.array(
        env.reset())  # reset the vectorized env; returns one state per parallel env

    step = 0
    for t in range(1, int(args.max_timesteps) // int(args.num_envs) + 1):
        actions = policy.select_action(states, eps_schedule.value)
        next_states, rewards, dones, infos = env.step(
            actions)  # take actions and get next states
        next_states = np.array(next_states)
        # log arrange