Example #1
    def test_memory_buffer_autosave(self):
        print("\n ================= AUTOSAVE TEST ====================")
        # Make sure the folder doesn't exist so the manifest has to be created.
        if os.path.exists("./memory/memory_buffer_test/"):
            shutil.rmtree("./memory/memory_buffer_test/")
        info_set_size = 1 + 1 + 24
        item_size = 64
        max_size = int(1e3)

        # Add autosave params.
        mb = MemoryBuffer(info_set_size,
                          item_size,
                          max_size=max_size,
                          autosave_params=("./memory/memory_buffer_test/",
                                           "test_buffer"))

        for _ in range(max_size):
            mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 1234)
        self.assertTrue(mb.full())

        # This should trigger the save and reset.
        mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 1234)
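
The test above only checks the externally visible contract: after max_size adds the buffer reports full(), and the very next add is expected to save the contents under ./memory/memory_buffer_test/ and reset the buffer. Below is a minimal sketch of that autosave contract, assuming the infosets are already flat tensors; the class and helper names (AutosaveBuffer, _flush) are illustrative, not the project's actual MemoryBuffer API.

import os

import torch


class AutosaveBuffer:
    """Illustrative fixed-capacity buffer that saves to disk and resets on overflow."""

    def __init__(self, info_set_size, item_size, max_size, autosave_params=None):
        self._infosets = torch.zeros(max_size, info_set_size)
        self._items = torch.zeros(max_size, item_size)
        self._weights = torch.zeros(max_size)
        self._next = 0
        self._save_count = 0
        self._autosave_params = autosave_params  # (folder, prefix) or None

    def full(self):
        return self._next >= len(self._weights)

    def add(self, infoset, item, weight):
        # `infoset` is assumed to already be a flat tensor of length info_set_size.
        if self.full() and self._autosave_params is not None:
            self._flush(*self._autosave_params)
            self._next = 0
        self._infosets[self._next] = infoset
        self._items[self._next] = item
        self._weights[self._next] = weight
        self._next += 1

    def _flush(self, folder, prefix):
        # Write the current contents as one file and bump the save counter.
        os.makedirs(folder, exist_ok=True)
        path = os.path.join(folder, "{}_{:05d}.pt".format(prefix, self._save_count))
        torch.save({"infosets": self._infosets[:self._next],
                    "items": self._items[:self._next],
                    "weights": self._weights[:self._next]}, path)
        self._save_count += 1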
Example #2
    def test_resample(self):
        if os.path.exists("./memory/memory_buffer_test/"):
            shutil.rmtree("./memory/memory_buffer_test/")

        # Make a few saved memory buffers.
        info_set_size = 1 + 1 + 16
        item_size = 6
        max_size = int(1e4)
        mb = MemoryBuffer(info_set_size, item_size, max_size=max_size)

        buf1_size = 100
        for i in range(buf1_size):
            mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 0)
        mb.save("./memory/memory_buffer_test/", "advt_mem_0")
        mb.clear()

        buf2_size = 200
        for i in range(buf2_size):
            mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 1)
        mb.save("./memory/memory_buffer_test/", "advt_mem_0")
        mb.clear()

        buf3_size = 300
        for i in range(buf3_size):
            mb.add(make_dummy_ev_infoset(), torch.zeros(item_size), 2)
        mb.save("./memory/memory_buffer_test/", "advt_mem_0")
        mb.clear()

        # Make a dataset using the saved buffers.
        n = 1000
        dataset = MemoryBufferDataset("./memory/memory_buffer_test/",
                                      "advt_mem_0", n)

        # Resampling should always produce exactly n entries.
        dataset.resample()
        self.assertEqual(len(dataset), n)
        self.assertEqual(len(dataset._infosets), n)
        self.assertEqual(len(dataset._items), n)
        self.assertEqual(len(dataset._weights), n)

        # Test iteration over the dataset.
        for inputs in dataset:
            print(inputs.keys())

        print(dataset._weights)
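
Note that n = 1000 exceeds the 100 + 200 + 300 rows actually saved, yet the test expects len(dataset) == n after every resample, so the dataset has to draw with replacement from the pooled buffers. A rough sketch of such a resample is below; the on-disk layout (one .pt file per saved buffer with "infosets"/"items"/"weights" fields) is an assumption carried over from the sketch above, not the project's actual format.

import glob
import os

import torch


def resample_from_saved(folder, prefix, n):
    """Illustrative resample: pool every saved buffer and draw n rows with replacement."""
    paths = sorted(glob.glob(os.path.join(folder, "{}*.pt".format(prefix))))
    infosets, items, weights = [], [], []
    for p in paths:
        saved = torch.load(p)
        infosets.append(saved["infosets"])
        items.append(saved["items"])
        weights.append(saved["weights"])
    infosets = torch.cat(infosets, dim=0)
    items = torch.cat(items, dim=0)
    weights = torch.cat(weights, dim=0)

    # Sampling with replacement keeps the result at exactly n rows even when the
    # saved buffers hold fewer than n entries in total, as in the test above.
    idx = torch.randint(0, len(weights), (n,))
    return infosets[idx], items[idx], weights[idx]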
Example #3
net = DeepQNetwork(env.numActions(), args)
buf = MemoryBuffer(args)

if args.load_weights:
  print "Loading weights from %s" % args.load_weights
  net.load_weights(args.load_weights)

env.gym.monitor.start(args.output_folder, force=True)
avg_reward = 0
num_episodes = args.num_episodes
for i_episode in xrange(num_episodes):
    env.restart()
    observation = env.getScreen()
    buf.reset()
    i_total_reward = 0
    for t in xrange(10000):
        buf.add(observation)
        if t < args.history_length or random.random() < args.exploration_rate_test:
            action = random.randrange(env.numActions())
        else:
            qvalues = net.predict(buf.getStateMinibatch())
            action = np.argmax(qvalues[0])
        reward = env.act(action)
        observation = env.getScreen()
        i_total_reward += reward
        if env.isTerminal():
            avg_reward += i_total_reward
            print "Episode {} finished after {} timesteps with reward {}".format(i_episode+1, t+1, i_total_reward)
            break
print "Avg reward {}".format(avg_reward / float(num_episodes))
env.gym.monitor.close()
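Example #4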
class Agent(AgentBase):
    def __init__(self,
                 load_policy=False,
                 learning_rate=0.001,
                 dim_a=3,
                 fc_layers_neurons=100,
                 loss_function_type='mean_squared',
                 policy_loc='./racing_car_m2/network',
                 image_size=64,
                 action_upper_limits='1,1',
                 action_lower_limits='-1,-1',
                 e='1',
                 show_ae_output=True,
                 show_state=True,
                 resize_observation=True,
                 ae_training_threshold=0.0011,
                 ae_evaluation_frequency=40):

        self.image_size = image_size

        super(Agent, self).__init__(dim_a=dim_a,
                                    policy_loc=policy_loc,
                                    action_upper_limits=action_upper_limits,
                                    action_lower_limits=action_lower_limits,
                                    e=e,
                                    load_policy=load_policy,
                                    loss_function_type=loss_function_type,
                                    learning_rate=learning_rate,
                                    fc_layers_neurons=fc_layers_neurons)

        # High-dimensional state initialization
        self.resize_observation = resize_observation
        self.show_state = show_state
        self.show_ae_output = show_ae_output

        # Autoencoder training control variables
        self.ae_training = True
        self.ae_loss_history = MemoryBuffer(
            min_size=50,
            max_size=50)  # reuse memory buffer for the ae loss history
        self.ae_training_threshold = ae_training_threshold
        self.ae_evaluation_frequency = ae_evaluation_frequency
        self.mean_ae_loss = 1e7

        if self.show_state:
            self.state_plot = FastImagePlot(1,
                                            np.zeros([image_size, image_size]),
                                            image_size,
                                            'Image State',
                                            vmax=0.5)

        if self.show_ae_output:
            self.ae_output_plot = FastImagePlot(2,
                                                np.zeros(
                                                    [image_size, image_size]),
                                                image_size,
                                                'Autoencoder Output',
                                                vmax=0.5)

    def _build_network(self, dim_a, params):
        # Initialize graph
        with tf.variable_scope('base'):
            # Build autoencoder
            ae_inputs = tf.placeholder(
                tf.float32, (None, self.image_size, self.image_size, 1),
                name='input')
            self.loss_ae, latent_space, self.ae_output = autoencoder(ae_inputs)

            # Build fully connected layers
            self.y, loss_policy = fully_connected_layers(
                tf.contrib.layers.flatten(latent_space), dim_a,
                params['fc_layers_neurons'], params['loss_function_type'])

        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'base')
        self.train_policy = tf.train.GradientDescentOptimizer(
            learning_rate=params['learning_rate']).minimize(loss_policy,
                                                            var_list=variables)

        self.train_ae = tf.train.AdamOptimizer(
            learning_rate=params['learning_rate']).minimize(self.loss_ae)

        # Initialize tensorflow
        init = tf.global_variables_initializer()
        self.sess = tf.Session()
        self.sess.run(init)
        self.saver = tf.train.Saver()

    def _preprocess_observation(self, observation):
        if self.resize_observation:
            observation = cv2.resize(observation,
                                     (self.image_size, self.image_size))
        self.high_dim_observation = observation_to_gray(
            observation, self.image_size)
        self.network_input = self.high_dim_observation

    def _batch_update_extra(self, state_batch, y_label_batch):
        # Calculate autoencoder loss and train if necessary
        if self.ae_training:
            _, loss_ae = self.sess.run([self.train_ae, self.loss_ae],
                                       feed_dict={'base/input:0': state_batch})

        else:
            loss_ae = self.sess.run(self.loss_ae,
                                    feed_dict={'base/input:0': state_batch})

        # Append loss to loss buffer
        self.ae_loss_history.add(loss_ae)

    def _evaluate_ae(self, t):
        # Check autoencoder mean loss in history and update ae_training flag
        if t % self.ae_evaluation_frequency == 0:
            self.mean_ae_loss = np.array(self.ae_loss_history.buffer).mean()
            last_ae_training_state = self.ae_training

            if (self.ae_loss_history.initialized()
                    and self.mean_ae_loss < self.ae_training_threshold):
                self.ae_training = False
            else:
                self.ae_training = True

            # If flag changed, print
            if last_ae_training_state != self.ae_training:
                print('\nTraining autoencoder:', self.ae_training, '\n')

    def _refresh_image_plots(self, t):
        if t % 4 == 0 and self.show_state:
            self.state_plot.refresh(self.high_dim_observation)

        if (t + 2) % 4 == 0 and self.show_ae_output:
            self.ae_output_plot.refresh(
                self.ae_output.eval(
                    session=self.sess,
                    feed_dict={'base/input:0': self.high_dim_observation})[0])

    def time_step(self, t):
        self._evaluate_ae(t)
        self._refresh_image_plots(t)

    def new_episode(self):
        print('\nTraining autoencoder:', self.ae_training)
        print('Last autoencoder mean loss:', self.mean_ae_loss, '\n')
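
The autoencoder handling above reduces to a simple gate: keep a fixed-length history of reconstruction losses, re-evaluate it every ae_evaluation_frequency steps, and stop training the autoencoder once the mean loss falls below ae_training_threshold (resuming if it climbs back up). Below is a compact sketch of just that gate, using a plain collections.deque where the agent reuses MemoryBuffer for the loss history; this is an illustrative substitution, not the original class.

from collections import deque

import numpy as np


class AELossGate:
    """Illustrative gate: train the autoencoder only while its mean loss stays high."""

    def __init__(self, history_len=50, threshold=0.0011, eval_every=40):
        self.history = deque(maxlen=history_len)
        self.threshold = threshold
        self.eval_every = eval_every
        self.training = True

    def record(self, loss):
        self.history.append(float(loss))

    def update(self, t):
        # Re-evaluate the flag every `eval_every` steps once the history is full.
        if t % self.eval_every == 0 and len(self.history) == self.history.maxlen:
            was_training = self.training
            self.training = np.mean(self.history) >= self.threshold
            if was_training != self.training:
                print('\nTraining autoencoder:', self.training, '\n')
        return self.training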
Example #5
buf = MemoryBuffer(args)

if args.load_weights:
    print "Loading weights from %s" % args.load_weights
    net.load_weights(args.load_weights)

env.gym.monitor.start(args.output_folder, force=True)
avg_reward = 0
num_episodes = args.num_episodes
for i_episode in xrange(num_episodes):
    env.restart()
    observation = env.getScreen()
    buf.reset()
    i_total_reward = 0
    for t in xrange(10000):
        buf.add(observation)
        if t < args.history_length or random.random() < args.exploration_rate_test:
            action = random.randrange(env.numActions())
        else:
            qvalues = net.predict(buf.getStateMinibatch())
            action = np.argmax(qvalues[0])
        reward = env.act(action)
        observation = env.getScreen()
        i_total_reward += reward
        if env.isTerminal():
            avg_reward += i_total_reward
            print "Episode {} finished after {} timesteps with reward {}".format(
                i_episode + 1, t + 1, i_total_reward)
            break
print "Avg reward {}".format(avg_reward / float(num_episodes))
Example #6
            h = teacher.get_feedback_signal(observation, action, t_counter)
        else:
            h = human_feedback.get_h()
            # print("Received feedback:", h_counter, "; Total timesteps:", t_counter)

        # Update weights
        if train:
            if np.any(h):  # if any element is not 0
                agent.update(h, observation)
                if not use_simulated_teacher:
                    print("feedback", h)
                h_counter += 1
                # Add state action-label pair to memory buffer
                if use_memory_buffer:
                    if agent.last_step() is not None:
                        buffer.add(agent.last_step())

                    # Train sampling from buffer
                    if buffer.initialized():
                        batch = buffer.sample(
                            batch_size=config_buffer.getint('sampling_size'))
                        agent.batch_update(batch)

            # Also train from the buffer every history_training_rate time steps
            if buffer.initialized() and t % history_training_rate == 0:
                batch = buffer.sample(
                    batch_size=config_buffer.getint('sampling_size'))
                agent.batch_update(batch)

        t_counter += 1
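
Both this loop and the Agent class above rely on a small buffer contract: add() new state/action-label pairs, initialized() once enough have accumulated, and sample(batch_size) for minibatch updates. A self-contained sketch of a buffer with that shape is below; the min_size/max_size semantics are inferred from how the buffer is used in these examples, not taken from the actual implementation.

import random


class SimpleMemoryBuffer:
    """Illustrative replay-style buffer: add entries, report readiness, sample minibatches."""

    def __init__(self, min_size=32, max_size=1000):
        self.min_size = min_size
        self.max_size = max_size
        self.buffer = []

    def add(self, entry):
        self.buffer.append(entry)
        if len(self.buffer) > self.max_size:
            self.buffer.pop(0)  # drop the oldest entry once the buffer is full

    def initialized(self):
        return len(self.buffer) >= self.min_size

    def sample(self, batch_size):
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))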