Example #1
 def __init__(self, is_show=False, is_save=False):
     self.env = Environment(is_show=is_show, is_save=is_save)
     self.flap_prob = 0.1
     self.epsilon = 1.0
     self.epsilon_min = 0.1
     self.epsilon_iters = 600000
     self.epsilon_reduce = 1.0 * (self.epsilon - self.epsilon_min) / self.epsilon_iters
     self.image_queue_maxsize = 5
     self.replay_memory = []
     self.replay_memory_maxsize = 20000
     self.batch_size = 32
     self.n_history = self.image_queue_maxsize
     self.image_y_size = 80
     self.image_x_size = 80
     self.action_options = ['flap', 'noflap']
     self.n_action = 2
     self.gamma = 0.95
     self.n_before = 3000
     self.n_update_target = 1000
Example #2
 def __init__(self, index=0, seed=0):
     self.env = Environment()
     self.index = index
     # init variable
     self.actions = self.env.actions
     self.trajectory_list = []
     # init q network
     rng = numpy.random.RandomState(int(random.random() * 100))
     print('%s %s %s' % ('=' * 5, 'Compile Network Start', '=' * 5))
     self.q_network = QNetwork(rng=rng,
                               n_state=5,
                               n_action=len(self.actions))
     self.q_func = self.q_network.get_q_func()
     self.q_update = self.q_network.train_one_batch()
     print('%s %s %s' % ('=' * 5, 'Compile Network End', '=' * 5))
     # init params
     self.gamma = 0.9
     self.epsilon = 0.1
     self.yita = 0.001
Example #3
def observe():
    print(os.getcwd())
    env = Environment()
    env.reset(seed=0)
    for i in range(200):
        state, reward, done = env.step(action='fly' if i % 19 == 0 else 'stay')
        print(state, reward, done)
        env.render()
        # time.sleep(0.5)
        if done:
            break
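Example #3 simply steps the environment with a fixed periodic policy ('fly' on every 19th frame, 'stay' otherwise). If the snippet is run as a script, the usual guard below would invoke it; this entry point is an assumption, since the original module is not shown.

# hypothetical entry point for Example #3 (not part of the original snippet)
if __name__ == '__main__':
    observe()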
Example #4
 def __init__(self, index=0, seed=0, observe=True):
     self.observe = observe
     self.seed = seed
     self.env = Environment(observe=self.observe)
     self.index = index
     # init variable
     self.actions = ['fly', 'stay']
     self.trajectory_list = []
     self.memory_size = 20000
     self.memory_start = 3000
     self.memory = []
     # init params
     self.gamma = 0.95
     self.epsilon = 1.0
     self.epsilon_bound = 0.1
     self.epsilon_decrease = 1e-6
     # init model params
     self.batch_size = 32
     self.state_size = 6
     self.hidden_size = 20
     self.learning_rate = 0.01
     self.model = self._build_model()
     if os.path.exists('../experiments/model.h5'):
         self.model.load_weights('../experiments/model.h5')
Example #5
class QLearning:
    def __init__(self, index=0, seed=0):
        self.env = Environment()
        self.index = index
        # init variable
        self.actions = self.env.actions
        self.trajectory_list = []
        # init q network
        rng = numpy.random.RandomState(int(random.random() * 100))
        print('%s %s %s' % ('=' * 5, 'Compile Network Start', '=' * 5))
        self.q_network = QNetwork(rng=rng,
                                  n_state=5,
                                  n_action=len(self.actions))
        self.q_func = self.q_network.get_q_func()
        self.q_update = self.q_network.train_one_batch()
        print('%s %s %s' % ('=' * 5, 'Compile Network End', '=' * 5))
        # init params
        self.gamma = 0.9
        self.epsilon = 0.1
        self.yita = 0.001

    def train(self, iteration=1000):
        start_time = time.time()
        # log and n_try are used in the summary below, so initialize them here
        self.log, self.n_try = [], 0
        for _ in range(iteration):
            self.n_try += 1
            # initial state
            state = self.env.reset()
            # initial gone
            done, trajectory = False, []
            while not done:
                sample = [state]
                # choose action
                actionid = self._sample_action(state)
                action = self.actions[actionid]
                sample.append(actionid)
                # get information from environment
                done, reward, new_state = self.env.step(action=action)
                sample.append(reward)
                trajectory.append(sample)
                # get y
                if done:
                    y = reward
                else:
                    new_state = numpy.array([new_state],
                                            dtype=theano.config.floatX)
                    q_value = self.q_func(new_state)[0, :]
                    y = reward + self.gamma * max(q_value)
                self.q_update(numpy.array([state], dtype=theano.config.floatX),
                              numpy.array([actionid], dtype=theano.config.floatX),
                              numpy.array([y], dtype=theano.config.floatX), self.yita)
                # render and observe
                self.env.render()
            # save trajectory
            self.trajectory_list.append(trajectory)
        end_time = time.time()
        print('%s consumes %i tries' % ('QLearning', self.n_try))
        self.log.append('%s consumes %i tries' % ('QLearning', self.n_try))
        print('%s consumes %.2f seconds' % ('QLearning', end_time - start_time))
        self.log.append('%s consumes %.2f seconds' %
                        ('QLearning', end_time - start_time))
        # save trajectory
        self._save_trajectory(self.trajectory_list, [])
        self._save_log(self.log, self._get_log_path(self.index))

    def _sample_action(self, state):
        if random.random() < self.epsilon:
            action = random.choice(range(len(self.actions)))
        else:
            state = numpy.array([state], dtype=theano.config.floatX)
            q_value = self.q_func(state)[0, :]
            action = max(enumerate(q_value), key=lambda x: x[1])[0]

        return action

    def _get_image_path(self, index):
        return '../pic/env/flappy_' + str(index) + '.png'

    def _get_log_path(self, index):
        return '../experiments/trajectory/QLearning_' + str(index) + '.txt'
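A minimal driver for Example #5 might look like the following sketch. QLearning, Environment, and QNetwork come from the example itself, but the module layout and the iteration budget are assumptions.

# hypothetical driver for Example #5; module layout and iteration count are assumptions
if __name__ == '__main__':
    agent = QLearning(index=0, seed=0)
    agent.train(iteration=1000)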
Example #6
class QLearning:
    def __init__(self, is_show=False, is_save=False):
        self.env = Environment(is_show=is_show, is_save=is_save)
        self.flap_prob = 0.1
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_iters = 600000
        self.epsilon_reduce = 1.0 * (self.epsilon - self.epsilon_min) / self.epsilon_iters
        self.image_queue_maxsize = 5
        self.replay_memory = []
        self.replay_memory_maxsize = 20000
        self.batch_size = 32
        self.n_history = self.image_queue_maxsize
        self.image_y_size = 80
        self.image_x_size = 80
        self.action_options = ['flap', 'noflap']
        self.n_action = 2
        self.gamma = 0.95
        self.n_before = 3000
        self.n_update_target = 1000

    def init_replay_memory(self):
        n_frame = 0
        while n_frame <= self.n_before:
            init_image = self.env.reset()
            is_end = False
            image_queue = []
            for j in range(self.image_queue_maxsize):
                image_queue.append(copy.deepcopy(init_image)) 
            n_frame += 1
            while not is_end:
                rnd = random.random()
                action = 0 if rnd < self.flap_prob else 1
                next_image, reward, is_end = self.env.render(self.action_options[action])
                state = self._extract_feature(image_queue)
                del image_queue[0]
                image_queue.append(copy.deepcopy(next_image))
                next_state = self._extract_feature(image_queue)
                self.replay_memory.append({
                    'state': state, 'action': action, 'reward': reward, 
                    'is_end': is_end, 'next_state': next_state})
                n_frame += 1

    def init_q_network(self):
        # create placeholders
        self.images = tf.placeholder(
            dtype=tf.float32, shape=[
                None, self.image_y_size, self.image_x_size, self.n_history],
            name='images')
        self.next_images = tf.placeholder(
            dtype=tf.float32, shape=[
                None, self.image_y_size, self.image_x_size, self.n_history],
            name='next_images')
        self.actions = tf.placeholder(
            dtype=tf.float32, shape=[
                self.batch_size, self.n_action],
            name='actions')
        self.rewards = tf.placeholder(
            dtype=tf.float32, shape=[
                self.batch_size, 1],
            name='rewards')
        self.is_terminals = tf.placeholder(
            dtype=tf.float32, shape=[
                self.batch_size, 1],
            name='is_terminals')
        self.global_step = tf.Variable(0, dtype=tf.int32, name='global_step')
        
        # build the session and the Network objects
        gpu_options = tf.GPUOptions(allow_growth=True)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.q_network = Network(
            batch_size=self.batch_size, n_history=self.image_queue_maxsize, 
            image_y_size=self.image_y_size, image_x_size=self.image_x_size,
            n_action=self.n_action, gamma=self.gamma, name='q_network')
        self.target_network = Network(
            batch_size=self.batch_size, n_history=self.image_queue_maxsize, 
            image_y_size=self.image_y_size, image_x_size=self.image_x_size,
            n_action=self.n_action, gamma=self.gamma, name='target_network')
        
        # build the optimizer
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-6, decay=0.9, momentum=0.95)
        self.temp_labels = self.q_network.cal_labels(self.next_images, self.rewards, self.is_terminals)
        self.avg_loss = self.q_network.get_loss(self.images, self.actions, self.temp_labels)
        self.optimizer_handle = self.optimizer.minimize(self.avg_loss, global_step=self.global_step)
        # build the predictor
        self.action_score = self.q_network.get_inference(self.images, batch_size=1)
        
        # model saver
        self.saver = tf.train.Saver(
            var_list=tf.global_variables(), write_version=tf.train.SaverDef.V2, max_to_keep=100)
        # initialize the model variables
        self.sess.run(tf.global_variables_initializer())

    def train(self, n_episodes, backup_dir):        
        self.init_replay_memory()
        self.init_q_network()

        print('\nstart training ...\n')
        n_frame = 0
        max_total_reward = 0
        for n_episode in range(n_episodes):
            # update target_network's parameters from q_network
            if n_frame % self.n_update_target == 0:
                self._update_target(self.q_network, self.target_network)
            
            # initialize the trajectory
            init_image = self.env.reset()
            image_queue = []
            for i in range(self.image_queue_maxsize):
                image_queue.append(copy.deepcopy(init_image)) 
            total_reward = 0.0
            is_end = False
            n_step = 0
            n_frame += 1
            
            while not is_end:
                state = self._extract_feature(image_queue)
                # sample an action
                if random.random() < self.epsilon:
                    action = 0 if random.random() < self.flap_prob else 1
                else:
                    state_np = numpy.array([state], dtype='float32')
                    action_score = self.sess.run(
                        fetches=[self.action_score], 
                        feed_dict={self.images: state_np})
                    action = 0 if numpy.argmax(action_score[0]) == 0 else 1
                
                # update the env
                next_image, reward, is_end = self.env.render(self.action_options[action])
                self.epsilon = max(self.epsilon - self.epsilon_reduce, self.epsilon_min)
                total_reward += reward
                n_step += 1
                n_frame += 1
                del image_queue[0]
                image_queue.append(copy.deepcopy(next_image))
                next_state = self._extract_feature(image_queue)
                self.replay_memory.append({
                    'state': state, 'action': action, 'reward': reward,
                    'is_end': is_end, 'next_state': next_state})
                if len(self.replay_memory) > self.replay_memory_maxsize:
                    del self.replay_memory[0]
                
                # randomly sample one batch from replay_memory
                batch_images = numpy.zeros((
                    self.batch_size, self.image_y_size, self.image_x_size, 
                    self.n_history), dtype='float32')
                batch_next_images = numpy.zeros((
                    self.batch_size, self.image_y_size, self.image_x_size, 
                    self.n_history), dtype='float32')
                batch_actions = numpy.zeros((
                    self.batch_size, self.n_action), dtype='float32')
                batch_rewards = numpy.zeros((
                    self.batch_size, 1), dtype='float32')
                batch_is_terminals = numpy.zeros((
                    self.batch_size, 1), dtype='float32')
                for j in range(self.batch_size):
                    index = random.randint(0, len(self.replay_memory)-1)
                    item = self.replay_memory[index]
                    batch_images[j,:,:,:] = item['state']
                    batch_next_images[j,:,:,:] = item['next_state']
                    batch_actions[j,:] = [1.0, 0.0] if item['action'] == 0 else [0.0, 1.0]
                    batch_rewards[j,:] = [item['reward']]
                    batch_is_terminals[j,:] = [0.0] if item['is_end'] else [1.0]
                [_, avg_loss] = self.sess.run(
                    fetches=[self.optimizer_handle, self.avg_loss],
                    feed_dict={
                        self.images: batch_images, self.next_images: batch_next_images,
                        self.actions: batch_actions, self.rewards: batch_rewards, 
                        self.is_terminals: batch_is_terminals})
            
            print('[%d] avg_loss: %.6f, total_reward: %.1f, n_score: %d' % (
                n_episode, avg_loss, total_reward, self.env.n_score))
            
            # save the model after the trajectory ends
            if n_episode % 1000 == 0:
                total_reward = self.valid()
                print('[%d] valid n_score: %d' % (n_episode, total_reward))
                if total_reward >= max_total_reward:
                    model_path = os.path.join(backup_dir, 'model_best.ckpt')
                    self.saver.save(self.sess, model_path)
                    max_total_reward = total_reward

    def valid(self):
        total_rewards = 0
        n_iters = 50
        for i in range(n_iters):
            init_image = self.env.reset()
            image_queue = []
            for i in range(self.image_queue_maxsize):
                image_queue.append(copy.deepcopy(init_image))
            total_reward = 0.0
            is_end = False
            while not is_end:
                state = self._extract_feature(image_queue)
                state_np = numpy.array([state], dtype='float32')
                action_score = self.sess.run(
                    fetches=[self.action_score],
                    feed_dict={self.images: state_np})
                action = 0 if numpy.argmax(action_score[0]) == 0 else 1
                next_image, reward, is_end = self.env.render(self.action_options[action])
                total_reward += reward
                del image_queue[0]
                image_queue.append(copy.deepcopy(next_image))
            total_rewards += self.env.n_score

        return 1.0 * total_rewards / n_iters

    def test(self, model_path):
        self.init_q_network()
        self.saver.restore(self.sess, model_path)

        for i in range(100):
            init_image = self.env.reset()
            image_queue = []
            for i in range(self.image_queue_maxsize):
                image_queue.append(copy.deepcopy(init_image))
            is_end = False
            while not is_end:
                state = self._extract_feature(image_queue)
                state_np = numpy.array([state], dtype='float32')
                action_score = self.sess.run(
                    fetches=[self.action_score], 
                    feed_dict={self.images: state_np})
                action = 0 if numpy.argmax(action_score[0]) == 0 else 1
                next_image, reward, is_end = self.env.render(self.action_options[action])
                del image_queue[0]
                image_queue.append(copy.deepcopy(next_image))
            total_reward = self.env.n_score
            print('total reward: %d' % (total_reward))

    def _extract_feature(self, images):
        features = []
        for image in images:
            new_image = cv2.resize(image, (self.image_x_size, self.image_y_size))
            new_image = cv2.cvtColor(new_image, cv2.COLOR_BGR2GRAY)
            new_image = numpy.array(new_image/255.0, dtype='float32')
            new_image = numpy.reshape(new_image, (self.image_y_size, self.image_x_size, 1))
            features.append(new_image)
        feature = numpy.concatenate(features, axis=2)
        
        return feature

    def _update_target(self, q_network, target_network):
        for i in range(len(q_network.layers)):
            for j in range(len(q_network.layers[i].variables)):
                # in TF1 graph mode the assign op only takes effect when run by the session
                self.sess.run(target_network.layers[i].variables[j].assign(
                    q_network.layers[i].variables[j]))
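A possible driver for Example #6 is sketched below; train() and test() are the methods defined above, while the episode count and the backup/checkpoint paths are illustrative assumptions.

# hypothetical driver for Example #6; episode count and paths are assumptions
if __name__ == '__main__':
    agent = QLearning(is_show=False, is_save=False)
    agent.train(n_episodes=100000, backup_dir='../experiments/backup')
    # to evaluate a saved checkpoint instead of training:
    # agent.test(model_path='../experiments/backup/model_best.ckpt')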
Example #7
class QLearning:
    def __init__(self, index=0, seed=0, observe=True):
        self.observe = observe
        self.seed = seed
        self.env = Environment(observe=self.observe)
        self.index = index
        # init variable
        self.actions = ['fly', 'stay']
        self.trajectory_list = []
        self.memory_size = 20000
        self.memory_start = 3000
        self.memory = []
        # init params
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_bound = 0.1
        self.epsilon_decrease = 1e-6
        # init model params
        self.batch_size = 32
        self.state_size = 6
        self.hidden_size = 20
        self.learning_rate = 0.01
        self.model = self._build_model()
        if os.path.exists('../experiments/model.h5'):
            self.model.load_weights('../experiments/model.h5')

    def train(self):
        start_time = time.time()
        n_iter = 0
        while True:
            n_iter += 1
            # initial state
            state = self.env.reset(self.seed)
            # initial gone
            done, trajectory = False, []
            while not done:
                # render and observe
                if self.observe:
                    self.env.render()
                # choose action
                sample = [list(state)]
                actionid = self._sample_action(state)
                sample.append(actionid)
                # get information from environment
                new_state, reward, done = self.env.step(
                    action=self.actions[actionid])
                reward = reward if not done else -1000
                sample.append(reward)
                trajectory.append(sample)
                # store memory
                self.memory.append((numpy.reshape(state, [1, self.state_size]), \
                    actionid, reward, \
                    numpy.reshape(new_state, [1, self.state_size]), done))
                if len(self.memory) > self.memory_size:
                    self.memory = self.memory[1:]
                # memory replay
                if len(self.memory) >= self.memory_start:
                    self._memory_replay()
                # update state
                state = copy.deepcopy(new_state)
            # save trajectory
            print('@iter: %i, score: %i, epsilon: %.2f' % (
                n_iter, int(sum([t[2] for t in trajectory[:-1]])), self.epsilon))
            self.trajectory_list.append(trajectory)
            if (n_iter - 1) % 100 == 0:
                self.model.save_weights('../experiments/model.h5')

        end_time = time.time()
        print('%s consumes %i tries' % ('QLearning', self.n_try))
        self.log.append('%s consumes %i tries' % ('QLearning', self.n_try))
        print('%s consumes %.2f seconds' % ('QLearning', end_time - start_time))
        self.log.append('%s consumes %.2f seconds' %
                        ('QLearning', end_time - start_time))
        # save trajectory
        # self._save_trajectory(self.trajectory_list, [])
        # self._save_log(self.log, self._get_log_path(self.index))

    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(
            Dense(self.hidden_size,
                  input_dim=self.state_size,
                  activation='tanh'))
        model.add(
            Dense(self.hidden_size,
                  activation='tanh',
                  kernel_initializer='uniform'))
        model.add(Dense(len(self.actions), activation='linear'))
        model.compile(loss='mse', optimizer=RMSprop(lr=self.learning_rate))
        return model

    def _memory_replay(self):
        batch_data = random.sample(self.memory, self.batch_size)
        X = numpy.zeros((self.batch_size, self.state_size))
        Y = numpy.zeros((self.batch_size, len(self.actions)))
        for i in range(self.batch_size):
            state, action, reward, next_state, done = batch_data[i]
            target = self.model.predict(state)[0]
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * \
                 numpy.amax(self.model.predict(next_state)[0])
            X[i], Y[i] = state, target
        # print X, Y
        # exit()
        self.model.fit(X, Y, batch_size=self.batch_size, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_bound:
            self.epsilon -= self.epsilon_decrease

    def _sample_action(self, state):
        state = numpy.reshape(state, [1, self.state_size])
        if random.random() < self.epsilon:
            action = random.choice(range(len(self.actions)))
        else:
            q_value = self.model.predict(state)[0, :]
            action = max(enumerate(q_value), key=lambda x: x[1])[0]
        return action

    def _get_image_path(self, index):
        return '../pic/env/flappy_' + str(index) + '.png'

    def _get_log_path(self, index):
        return '../experiments/trajectory/QLearning_' + str(index) + '.txt'
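Example #7 decays epsilon by epsilon_decrease on every replay step, so moving from epsilon = 1.0 down to epsilon_bound = 0.1 takes roughly (1.0 - 0.1) / 1e-6 = 900,000 replay steps. A minimal launcher is sketched below; the keyword arguments mirror the constructor above, and passing observe=False to suppress rendering is an assumption about the Environment.

# hypothetical launcher for Example #7; argument values are assumptions
if __name__ == '__main__':
    agent = QLearning(index=0, seed=0, observe=False)
    agent.train()  # runs until interrupted; weights are saved every 100 episodes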