import os

# NOTE: assumed import path; Environment is the project's Flappy Bird environment.
from environment import Environment


def observe():
    print os.getcwd()
    env = Environment()
    env.reset(seed=0)
    for i in range(200):
        # flap roughly once every 19 frames, otherwise glide
        state, reward, done = env.step(action='fly' if i % 19 == 0 else 'stay')
        print state, reward, done
        env.render()
        # time.sleep(0.5)
        if done:
            break
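
# A minimal sketch of how observe() might be invoked as a script; the
# '__main__' guard below is an addition here, not part of the original module.
if __name__ == '__main__':
    observe()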

import random
import time

import numpy
import theano

# NOTE: assumed import paths; Environment and QNetwork are project modules.
from environment import Environment
from q_network import QNetwork


class QLearning:

    def __init__(self, index=0, seed=0):
        self.env = Environment()
        self.index = index
        # init variable
        self.actions = self.env.actions
        self.trajectory_list = []
        self.log = []
        # init q network
        rng = numpy.random.RandomState(int(random.random() * 100))
        print '%s %s %s' % ('=' * 5, 'Compile Network Start', '=' * 5)
        self.q_network = QNetwork(rng=rng, n_state=5, n_action=len(self.actions))
        self.q_func = self.q_network.get_q_func()
        self.q_update = self.q_network.train_one_batch()
        print '%s %s %s' % ('=' * 5, 'Compile Network End', '=' * 5)
        # init params
        self.gamma = 0.9
        self.epsilon = 0.1
        self.yita = 0.001

    def train(self, iteration=1000):
        start_time = time.time()
        n_try = 0
        while n_try < iteration:
            n_try += 1
            # initial state
            state = self.env.reset()
            # init done flag and trajectory
            done, trajectory = False, []
            while not done:
                sample = [state]
                # choose action
                actionid = self._sample_action(state)
                action = self.actions[actionid]
                sample.append(actionid)
                # get information from environment
                done, reward, new_state = self.env.step(action=action)
                sample.append(reward)
                trajectory.append(sample)
                # compute the target value y
                if done:
                    y = reward
                else:
                    next_state_arr = numpy.array([new_state], dtype=theano.config.floatX)
                    q_value = self.q_func(next_state_arr)[0, :]
                    y = reward + self.gamma * max(q_value)
                self.q_update(
                    numpy.array([state], dtype=theano.config.floatX),
                    numpy.array([actionid], dtype=theano.config.floatX),
                    numpy.array([y], dtype=theano.config.floatX), self.yita)
                # render and observe
                self.env.render()
                # move to the next state
                state = new_state
            # save trajectory
            self.trajectory_list.append(trajectory)
        end_time = time.time()
        print '%s consumes %i tries' % ('QLearning', n_try)
        self.log.append('%s consumes %i tries' % ('QLearning', n_try))
        print '%s consumes %.2f seconds' % ('QLearning', end_time - start_time)
        self.log.append('%s consumes %.2f seconds' % ('QLearning', end_time - start_time))
        # save trajectory (helpers assumed to be defined elsewhere in the project)
        self._save_trajectory(self.trajectory_list, [])
        self._save_log(self.log, self._get_log_path(self.index))

    def _sample_action(self, state):
        if random.random() < self.epsilon:
            action = random.choice(range(len(self.actions)))
        else:
            state = numpy.array([state], dtype=theano.config.floatX)
            q_value = self.q_func(state)[0, :]
            # pick the action with the highest Q value
            action = max(enumerate(q_value), key=lambda x: x[1])[0]
        return action

    def _get_image_path(self, index):
        return '../pic/env/flappy_' + str(index) + '.png'

    def _get_log_path(self, index):
        return '../experiments/trajectory/QLearning_' + str(index) + '.txt'
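
# A minimal usage sketch (an assumption, not part of the original file): train the
# Theano-based agent for a fixed number of episodes. It presumes Environment and
# QNetwork compile successfully on this machine and that rendering is available.
if __name__ == '__main__':
    agent = QLearning(index=0)
    agent.train(iteration=1000)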

import copy
import os
import random

import cv2
import numpy
import tensorflow as tf

# NOTE: assumed import paths; Environment and Network are project modules.
from environment import Environment
from network import Network


class QLearning:

    def __init__(self, is_show=False, is_save=False):
        self.env = Environment(is_show=is_show, is_save=is_save)
        self.flap_prob = 0.1
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_iters = 600000
        self.epsilon_reduce = 1.0 * (self.epsilon - self.epsilon_min) / self.epsilon_iters
        self.image_queue_maxsize = 5
        self.replay_memory = []
        self.replay_memory_maxsize = 20000
        self.batch_size = 32
        self.n_history = self.image_queue_maxsize
        self.image_y_size = 80
        self.image_x_size = 80
        self.action_options = ['flap', 'noflap']
        self.n_action = 2
        self.gamma = 0.95
        self.n_before = 3000
        self.n_update_target = 1000

    def init_replay_memory(self):
        # fill the replay memory with random-policy transitions before training
        n_frame = 0
        while n_frame <= self.n_before:
            init_image = self.env.reset()
            is_end = False
            image_queue = []
            for j in range(self.image_queue_maxsize):
                image_queue.append(copy.deepcopy(init_image))
            n_frame += 1
            while not is_end:
                rnd = random.random()
                action = 0 if rnd < self.flap_prob else 1
                next_image, reward, is_end = self.env.render(self.action_options[action])
                state = self._extract_feature(image_queue)
                del image_queue[0]
                image_queue.append(copy.deepcopy(next_image))
                next_state = self._extract_feature(image_queue)
                self.replay_memory.append({
                    'state': state, 'action': action, 'reward': reward,
                    'is_end': is_end, 'next_state': next_state})
                n_frame += 1

    def init_q_network(self):
        # create placeholders
        self.images = tf.placeholder(
            dtype=tf.float32, shape=[
                None, self.image_y_size, self.image_x_size, self.n_history],
            name='images')
        self.next_images = tf.placeholder(
            dtype=tf.float32, shape=[
                None, self.image_y_size, self.image_x_size, self.n_history],
            name='next_images')
        self.actions = tf.placeholder(
            dtype=tf.float32, shape=[self.batch_size, self.n_action], name='actions')
        self.rewards = tf.placeholder(
            dtype=tf.float32, shape=[self.batch_size, 1], name='rewards')
        self.is_terminals = tf.placeholder(
            dtype=tf.float32, shape=[self.batch_size, 1], name='is_terminals')
        self.global_step = tf.Variable(0, dtype=tf.int32, name='global_step')

        # build the session and the Network objects
        gpu_options = tf.GPUOptions(allow_growth=True)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.q_network = Network(
            batch_size=self.batch_size, n_history=self.image_queue_maxsize,
            image_y_size=self.image_y_size, image_x_size=self.image_x_size,
            n_action=self.n_action, gamma=self.gamma, name='q_network')
        self.target_network = Network(
            batch_size=self.batch_size, n_history=self.image_queue_maxsize,
            image_y_size=self.image_y_size, image_x_size=self.image_x_size,
            n_action=self.n_action, gamma=self.gamma, name='target_network')

        # build the optimizer; the labels are computed by the target network,
        # which is what the periodic parameter copy below is for
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=1e-6, decay=0.9, momentum=0.95)
        self.temp_labels = self.target_network.cal_labels(
            self.next_images, self.rewards, self.is_terminals)
        self.avg_loss = self.q_network.get_loss(self.images, self.actions, self.temp_labels)
        self.optimizer_handle = self.optimizer.minimize(self.avg_loss, global_step=self.global_step)

        # build the predictor
        self.action_score = self.q_network.get_inference(self.images, batch_size=1)

        # model saver
        self.saver = tf.train.Saver(
            var_list=tf.global_variables(), write_version=tf.train.SaverDef.V2, max_to_keep=100)

        # initialize the model
        self.sess.run(tf.global_variables_initializer())

    def train(self, n_episodes, backup_dir):
        self.init_replay_memory()
        self.init_q_network()
        print('\nstart training ...\n')
        n_frame = 0
        max_total_reward = 0
        for n_episode in range(n_episodes):
            # copy the q_network parameters into target_network
            if n_frame % self.n_update_target == 0:
                self._update_target(self.q_network, self.target_network)
            # initialize the trajectory
            init_image = self.env.reset()
            image_queue = []
            for i in range(self.image_queue_maxsize):
                image_queue.append(copy.deepcopy(init_image))
            total_reward = 0.0
            is_end = False
            n_step = 0
            n_frame += 1
            while not is_end:
                state = self._extract_feature(image_queue)
                # sample an action (epsilon-greedy)
                if random.random() < self.epsilon:
                    action = 0 if random.random() < self.flap_prob else 1
                else:
                    state_np = numpy.array([state], dtype='float32')
                    action_score = self.sess.run(
                        fetches=[self.action_score],
                        feed_dict={self.images: state_np})
                    action = 0 if numpy.argmax(action_score[0]) == 0 else 1
                # step the environment
                next_image, reward, is_end = self.env.render(self.action_options[action])
                self.epsilon = max(self.epsilon - self.epsilon_reduce, self.epsilon_min)
                total_reward += reward
                n_step += 1
                n_frame += 1
                del image_queue[0]
                image_queue.append(copy.deepcopy(next_image))
                next_state = self._extract_feature(image_queue)
                self.replay_memory.append({
                    'state': state, 'action': action, 'reward': reward,
                    'is_end': is_end, 'next_state': next_state})
                if len(self.replay_memory) > self.replay_memory_maxsize:
                    del self.replay_memory[0]
                # randomly sample one batch from the replay memory
                batch_images = numpy.zeros((
                    self.batch_size, self.image_y_size, self.image_x_size, self.n_history),
                    dtype='float32')
                batch_next_images = numpy.zeros((
                    self.batch_size, self.image_y_size, self.image_x_size, self.n_history),
                    dtype='float32')
                batch_actions = numpy.zeros((self.batch_size, self.n_action), dtype='float32')
                batch_rewards = numpy.zeros((self.batch_size, 1), dtype='float32')
                batch_is_terminals = numpy.zeros((self.batch_size, 1), dtype='float32')
                for j in range(self.batch_size):
                    index = random.randint(0, len(self.replay_memory) - 1)
                    item = self.replay_memory[index]
                    batch_images[j, :, :, :] = item['state']
                    batch_next_images[j, :, :, :] = item['next_state']
                    batch_actions[j, :] = [1.0, 0.0] if item['action'] == 0 else [0.0, 1.0]
                    batch_rewards[j, :] = [item['reward']]
                    batch_is_terminals[j, :] = [0.0] if item['is_end'] else [1.0]
                [_, avg_loss] = self.sess.run(
                    fetches=[self.optimizer_handle, self.avg_loss],
                    feed_dict={
                        self.images: batch_images,
                        self.next_images: batch_next_images,
                        self.actions: batch_actions,
                        self.rewards: batch_rewards,
                        self.is_terminals: batch_is_terminals})
            print('[%d] avg_loss: %.6f, total_reward: %.1f, n_score: %d' % (
                n_episode, avg_loss, total_reward, self.env.n_score))
            # validate and save the model after the trajectory ends
            if n_episode % 1000 == 0:
                total_reward = self.valid()
                print('[%d] valid n_score: %.2f' % (n_episode, total_reward))
                if total_reward >= max_total_reward:
                    model_path = os.path.join(backup_dir, 'model_best.ckpt')
                    self.saver.save(self.sess, model_path)
                    max_total_reward = total_reward

    def valid(self):
        total_rewards = 0
        n_iters = 50
        for i in range(n_iters):
            init_image = self.env.reset()
            image_queue = []
            for j in range(self.image_queue_maxsize):
                image_queue.append(copy.deepcopy(init_image))
            total_reward = 0.0
            is_end = False
            while not is_end:
                state = self._extract_feature(image_queue)
                state_np = numpy.array([state], dtype='float32')
                action_score = self.sess.run(
                    fetches=[self.action_score],
                    feed_dict={self.images: state_np})
                action = 0 if numpy.argmax(action_score[0]) == 0 else 1
                next_image, reward, is_end = self.env.render(self.action_options[action])
                total_reward += reward
                del image_queue[0]
                image_queue.append(copy.deepcopy(next_image))
            total_rewards += self.env.n_score
        return 1.0 * total_rewards / n_iters

    def test(self, model_path):
        self.init_q_network()
        self.saver.restore(self.sess, model_path)
        for i in range(100):
            init_image = self.env.reset()
            image_queue = []
            for j in range(self.image_queue_maxsize):
                image_queue.append(copy.deepcopy(init_image))
            is_end = False
            while not is_end:
                state = self._extract_feature(image_queue)
                state_np = numpy.array([state], dtype='float32')
                action_score = self.sess.run(
                    fetches=[self.action_score],
                    feed_dict={self.images: state_np})
                action = 0 if numpy.argmax(action_score[0]) == 0 else 1
                next_image, reward, is_end = self.env.render(self.action_options[action])
                del image_queue[0]
                image_queue.append(copy.deepcopy(next_image))
            total_reward = self.env.n_score
            print('total reward: %d' % (total_reward))

    def _extract_feature(self, images):
        # resize each frame to 80x80, convert to grayscale, scale to [0, 1],
        # and stack the frames along the channel axis
        features = []
        for image in images:
            new_image = cv2.resize(image, (self.image_x_size, self.image_y_size))
            new_image = cv2.cvtColor(new_image, cv2.COLOR_BGR2GRAY)
            new_image = numpy.array(new_image / 255.0, dtype='float32')
            new_image = numpy.reshape(new_image, (self.image_y_size, self.image_x_size, 1))
            features.append(new_image)
        feature = numpy.concatenate(features, axis=2)
        return feature

    def _update_target(self, q_network, target_network):
        # copy every variable of q_network into target_network; the assign ops
        # must be run in the session to take effect in graph mode
        for i in range(len(q_network.layers)):
            for j in range(len(q_network.layers[i].variables)):
                self.sess.run(target_network.layers[i].variables[j].assign(
                    q_network.layers[i].variables[j]))
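
# A minimal usage sketch (an assumption, not part of the original file): train the
# image-based DQN agent and optionally evaluate the best checkpoint afterwards.
# The 'backup' directory name and the episode count are illustrative only.
if __name__ == '__main__':
    dqn = QLearning(is_show=False, is_save=False)
    dqn.train(n_episodes=100000, backup_dir='backup')
    # dqn.test(model_path='backup/model_best.ckpt')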

import copy
import os
import random
import time

import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import RMSprop

# NOTE: assumed import path; Environment is the project's Flappy Bird environment.
from environment import Environment


class QLearning:

    def __init__(self, index=0, seed=0, observe=True):
        self.observe = observe
        self.seed = seed
        self.env = Environment(observe=self.observe)
        self.index = index
        # init variable
        self.actions = ['fly', 'stay']
        self.trajectory_list = []
        self.log = []
        self.memory_size = 20000
        self.memory_start = 3000
        self.memory = []
        # init params
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_bound = 0.1
        self.epsilon_decrease = 1e-6
        # init model params
        self.batch_size = 32
        self.state_size = 6
        self.hidden_size = 20
        self.learning_rate = 0.01
        self.model = self._build_model()
        if os.path.exists('../experiments/model.h5'):
            self.model.load_weights('../experiments/model.h5')

    def train(self):
        start_time = time.time()
        n_iter = 0
        while True:
            n_iter += 1
            # initial state
            state = self.env.reset(self.seed)
            # init done flag and trajectory
            done, trajectory = False, []
            while not done:
                # render and observe
                if self.observe:
                    self.env.render()
                # choose action
                sample = [list(state)]
                actionid = self._sample_action(state)
                sample.append(actionid)
                # get information from environment
                new_state, reward, done = self.env.step(action=self.actions[actionid])
                reward = reward if not done else -1000
                sample.append(reward)
                trajectory.append(sample)
                # store memory
                self.memory.append((
                    numpy.reshape(state, [1, self.state_size]), actionid, reward,
                    numpy.reshape(new_state, [1, self.state_size]), done))
                if len(self.memory) > self.memory_size:
                    self.memory = self.memory[1:]
                # memory replay
                if len(self.memory) >= self.memory_start:
                    self._memory_replay()
                # update state
                state = copy.deepcopy(new_state)
            # save trajectory
            print '@iter: %i, score: %i, epsilon: %.2f' % (
                n_iter, int(sum([t[2] for t in trajectory[:-1]])), self.epsilon)
            self.trajectory_list.append(trajectory)
            if (n_iter - 1) % 100 == 0:
                self.model.save_weights('../experiments/model.h5')
        end_time = time.time()
        print '%s consumes %i tries' % ('QLearning', n_iter)
        self.log.append('%s consumes %i tries' % ('QLearning', n_iter))
        print '%s consumes %.2f seconds' % ('QLearning', end_time - start_time)
        self.log.append('%s consumes %.2f seconds' % ('QLearning', end_time - start_time))
        # save trajectory
        # self._save_trajectory(self.trajectory_list, [])
        # self._save_log(self.log, self._get_log_path(self.index))

    def _build_model(self):
        # neural net for the Deep-Q learning model
        model = Sequential()
        model.add(Dense(self.hidden_size, input_dim=self.state_size, activation='tanh'))
        model.add(Dense(self.hidden_size, activation='tanh', kernel_initializer='uniform'))
        model.add(Dense(len(self.actions), activation='linear'))
        model.compile(loss='mse', optimizer=RMSprop(lr=self.learning_rate))
        return model

    def _memory_replay(self):
        batch_data = random.sample(self.memory, self.batch_size)
        X = numpy.zeros((self.batch_size, self.state_size))
        Y = numpy.zeros((self.batch_size, len(self.actions)))
        for i in range(self.batch_size):
            state, action, reward, next_state, done = batch_data[i]
            target = self.model.predict(state)[0]
            if done:
                target[action] = reward
            else:
                target[action] = reward + self.gamma * \
                    numpy.amax(self.model.predict(next_state)[0])
            X[i], Y[i] = state, target
        self.model.fit(X, Y, batch_size=self.batch_size, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_bound:
            self.epsilon -= self.epsilon_decrease

    def _sample_action(self, state):
        state = numpy.reshape(state, [1, self.state_size])
        if random.random() < self.epsilon:
            action = random.choice(range(len(self.actions)))
        else:
            q_value = self.model.predict(state)[0, :]
            # pick the action with the highest Q value
            action = max(enumerate(q_value), key=lambda x: x[1])[0]
        return action

    def _get_image_path(self, index):
        return '../pic/env/flappy_' + str(index) + '.png'

    def _get_log_path(self, index):
        return '../experiments/trajectory/QLearning_' + str(index) + '.txt'
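
# A minimal usage sketch (an assumption, not part of the original file): run the
# Keras-based agent with on-screen observation enabled; train() loops until
# interrupted, checkpointing weights every 100 iterations.
if __name__ == '__main__':
    agent = QLearning(index=0, seed=0, observe=True)
    agent.train()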