class GameManager:
    def __init__(self, id):
        self.visualize = False
        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))
        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)
        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        input_laser, rotation = self.process_observation(observation)
        state_map = StateMap(input_laser)
        obs = np.array([[state_map.S_image], [rotation]])
        return obs

    def step(self, action):
        self._update_display()
        if action is None:
            # A None action performs an idle step; its reward and done
            # flag are discarded.
            observation, reward, done, info = self.env.step(0, 0, 20)
            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            # obs = np.array([[state_map.States_map, state_map.Reward_map], [rotation]])
            obs = np.array([[state_map.S_image], [rotation]])
            reward = 0
            done = False
        else:
            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(linear, angular, 20)
            input_laser, rotation = self.process_observation(observation)
            state_map = StateMap(input_laser)
            obs = np.array([[state_map.S_image], [rotation]])
        return obs, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()

    def process_observation(self, observation):
        # Split the raw observation into the laser scan and the
        # target-orientation values appended behind it.
        laser_scan = np.array(observation[:Config.OBSERVATION_SIZE])
        orientation = np.array(observation[Config.OBSERVATION_SIZE:])
        return laser_scan, orientation
class GameManager:
    def __init__(self, id):
        self.visualize = False
        if Config.VISUALIZE and int(id / len(Config.PATH_TO_WORLD)) == 0:
            self.visualize = True
        elif Config.PLAY_MODE:
            self.visualize = True

        world_name = Config.PATH_TO_WORLD[id % len(Config.PATH_TO_WORLD)]
        self.env = Environment(world_name)
        print("Env {} for Agent {} started.".format(world_name, id))
        self.env.set_mode(Config.MODE, Config.TERMINATE_AT_END)
        self.env.set_observation_rotation_size(Config.OBSERVATION_ROTATION_SIZE)
        self.env.use_observation_rotation_size(Config.USE_OBSERVATION_ROTATION)
        self.env.set_cluster_size(Config.CLUSTER_SIZE)
        self.reset()

    def reset(self):
        observation, _, _, _ = self.env.reset()
        return observation

    def step(self, action):
        self._update_display()
        if action is None:
            observation, reward, done, info = self.env.step(0, 0, 20)
            reward = 0
            done = False
        else:
            linear, angular = map_action(action)
            observation, reward, done, info = self.env.step(linear, angular, 20)
        return observation, reward, done, info

    def _update_display(self):
        if self.visualize:
            self.env.visualize()

    def observation_size(self):
        return self.env.observation_size()
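For reference, a minimal loop that drives either GameManager variant could look as follows; this is a sketch assuming only the interface shown above, with the fixed action index 0 standing in for a real policy:

# Minimal driving loop for GameManager (sketch; the constant action
# index 0 is a placeholder policy, not part of the original code).
manager = GameManager(0)  # agent id 0 selects the first world
obs = manager.reset()
done = False
while not done:
    obs, reward, done, info = manager.step(0)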
def _build_graph(self):
    env = Environment(self.world_name)  # @TODO do this properly
    env.set_cluster_size(CLUSTER_SIZE)
    env.use_observation_rotation_size(self.use_target)

    input = tflearn.layers.input_data(shape=(None, env.observation_size()), dtype=tf.float32)
    input = tf.expand_dims(input, -1)

    net = input
    net = tflearn.layers.conv_1d(net, 16, 3, padding='same')
    net = tflearn.layers.max_pool_1d(net, 3)
    net = tflearn.layers.conv_1d(net, 16, 2)
    net = tflearn.layers.max_pool_1d(net, 2)
    net = tflearn.layers.fully_connected(net, 64, activation='relu')
    net = tflearn.layers.fully_connected(net, self.action_mapper.ACTION_SIZE, activation='linear')

    # net = tflearn.layers.fully_connected(net, 512, activation='relu')
    # net = tflearn.layers.fully_connected(net, 256, activation='relu')
    # net = tflearn.layers.fully_connected(net, self.action_size, activation='linear')

    return input, net
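For orientation, a sketch of the tensor shapes flowing through this stack, where N stands for env.observation_size() and tflearn's default 'same' padding is assumed wherever none is given:

# input_data              -> (batch, N)
# expand_dims             -> (batch, N, 1)
# conv_1d(16, 3, 'same')  -> (batch, N, 16)
# max_pool_1d(3)          -> (batch, ceil(N / 3), 16)
# conv_1d(16, 2)          -> (batch, ceil(N / 3), 16)
# max_pool_1d(2)          -> (batch, ceil(N / 6), 16)
# fully_connected(64)          -> (batch, 64)           # input flattened internally
# fully_connected(ACTION_SIZE) -> (batch, ACTION_SIZE)  # linear Q-value head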
class WorkerAgent(threading.Thread):
    def __init__(self, name, graph_ops, update_ops, world_name, use_target, session, saver):
        super().__init__()
        self.name = name
        self.session = session
        self.saver = saver
        self.graph_ops = graph_ops
        self.update_ops = update_ops

        self.env = Environment(world_name)
        self.env.use_observation_rotation_size(use_target)
        self.env.set_cluster_size(CLUSTER_SIZE)

        self.state_size = self.env.observation_size()
        self.action_size = action_mapper.ACTION_SIZE

    def run(self):
        global global_episode, global_step
        print('Thread {} started.'.format(self.name))

        local_episodes = 0
        accumulated_reward = 0
        best_reward = -99999
        epsilon = INITIAL_EPSILON

        state_batch = []
        reward_batch = []
        action_batch = []

        period_start_time = time.time()

        while global_episode <= MAX_EPISODES:
            self.env.reset()
            state, _, _, _ = self.env.step(0, 0)
            state = self.reshape_state(state)

            episode_step = 0
            episode_reward = 0

            while True:
                q_output = self.graph_ops['network']['q_values'].eval(
                    session=self.session,
                    feed_dict={self.graph_ops['network']['input']: [state]})

                # Epsilon-greedy action selection.
                if random() <= epsilon:
                    action_index = randrange(self.action_size)
                else:
                    action_index = np.argmax(q_output)

                a_t = np.zeros([self.action_size])
                a_t[action_index] = 1

                if epsilon > final_epsilon:
                    epsilon -= (INITIAL_EPSILON - final_epsilon) / anneal_epsilon_timesteps

                x1, x2 = action_mapper.map_action(action_index)
                next_state, reward, term, info = self.env.step(x1, x2, 10)
                next_state = self.reshape_state(next_state)

                episode_reward += reward

                if visualize:
                    self.env.visualize()

                next_q_values = self.graph_ops['target_network']['q_values'].eval(
                    session=self.session,
                    feed_dict={self.graph_ops['target_network']['input']: [next_state]})

                # One-step TD target, bootstrapped from the target network.
                if not term:
                    reward = reward + gamma * np.amax(next_q_values)

                state_batch.append(state)
                action_batch.append(a_t)
                reward_batch.append(reward)

                if global_step % target_update_timestep == 0:
                    self.session.run(self.update_ops['reset_target_network'])
                    print("Target network reset.")

                if episode_step % UPDATE_PERIOD == 0 or term:
                    self.session.run(self.update_ops['minimize'], feed_dict={
                        self.update_ops['y']: reward_batch,
                        self.update_ops['a']: action_batch,
                        self.graph_ops['network']['input']: state_batch
                    })
                    state_batch = []
                    action_batch = []
                    reward_batch = []

                if global_step % CHECKPOINT_PERIOD_TIMESTEPS == 0:
                    self.saver.save(self.session, CHECKPOINT_PATH, global_step=global_step)

                global_step += 1
                state = next_state
                episode_step += 1

                if term:
                    break

            accumulated_reward += episode_reward
            best_reward = episode_reward if (episode_reward > best_reward) else best_reward
            local_episodes += 1
            global_episode += 1

            if local_episodes % PRINT_EVERY == 0:
                period_end_time = time.time()
                print("Thread {0:}. Total Episodes {1:}. Reward AVG: {2:.3f}, Best Reward: {3:.3f}, "
                      "Globalstep: {4:6d}, Epsilon: {5:f}, Time: {6:}".format(
                          self.name, global_episode, accumulated_reward / PRINT_EVERY,
                          best_reward, global_step, epsilon,
                          period_end_time - period_start_time))
                accumulated_reward = 0
                best_reward = -99999
                period_start_time = time.time()

    def reshape_state(self, state):
        return np.reshape(state, [self.state_size, 1])
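The value appended to reward_batch above is the standard one-step TD target, bootstrapped from the separate target network. Isolated as a sketch (the helper name td_target is illustrative and not part of the original code):

import numpy as np

def td_target(reward, next_q_values, terminal, gamma):
    # Terminal transitions train toward the raw reward; all others
    # bootstrap: y = r + gamma * max_a' Q_target(s', a').
    if terminal:
        return reward
    return reward + gamma * np.amax(next_q_values)

# e.g. td_target(1.0, np.array([0.2, 0.5]), False, 0.9) == 1.45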
class Worker(object):
    def __init__(self, name, globalAC):
        if MULTIPLE_ROOMS:
            # Distribute the workers over three different worlds.
            if name == "W_0" or name == "W_1" or name == "W_2":
                self.env = Environment(ENV_NAME)
            elif name == "W_3" or name == "W_4" or name == "W_5":
                self.env = Environment(ENV_NAME_2)
            else:
                self.env = Environment(ENV_NAME_3)
        else:
            self.env = Environment(ENV_NAME)

        self.env.set_cluster_size(CLUSTER_SIZE)
        self.env.set_observation_rotation_size(64)  # TODO
        self.env.use_observation_rotation_size(True)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def convert_action(self, action):
        # Map the discrete action index to (linear, angular) velocities.
        if action == 0:
            angular = 1.0
            linear = 0.5
        elif action == 1:
            angular = 0.5
            linear = 0.75
        elif action == 2:
            angular = 0.0
            linear = 1.0
        elif action == 3:
            angular = -0.5
            linear = 0.75
        else:
            angular = -1.0
            linear = 0.5
        return linear, angular

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []

        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s, _, _, _ = self.env.reset()
            s = np.reshape(s, [1, N_S])
            ep_r = 0

            # rnn_state = SESS.run(self.AC.init_state)  # zero rnn state at beginning
            # keep_state = deepcopy(rnn_state)  # keep rnn state for updating global net

            for ep_t in range(MAX_EP_STEP):
                # a, rnn_state_ = self.AC.choose_action(s, rnn_state)  # action and next rnn state
                a = self.AC.choose_action(s)
                b = np.asarray(a)
                b = b[0][0]
                action = np.argmax(b)
                linear, angular = self.convert_action(action)

                s_, r, done, _ = self.env.step(linear, angular, SKIP_LRF)
                s_ = np.reshape(s_, [1, N_S])

                # if (self.name == 'W_0' or self.name == "W_3") and VISUALIZE:
                if (self.name == 'W_0') and VISUALIZE:
                    self.env.visualize()

                done = True if ep_t == MAX_EP_STEP - 1 else done
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(b)
                buffer_r.append(r)
                # buffer_r.append((r + 8) / 8)  # normalize

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # Update the global net and sync it back to the local net.
                    if done:
                        v_s_ = 0  # terminal state has zero value
                    else:
                        # v_s_ = SESS.run(self.AC.v, {self.AC.s: s_, self.AC.init_state: rnn_state_})[0, 0]
                        v_s_ = SESS.run(self.AC.v, {self.AC.s: s_})[0, 0]

                    # Walk the reward buffer backwards to build discounted value targets.
                    buffer_v_target = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                        # self.AC.init_state: keep_state,
                    }
                    self.AC.update_global(feed_dict)

                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                    # keep_state = deepcopy(rnn_state_)  # replace keep_state with the new initial rnn state

                s = s_
                # rnn_state = rnn_state_  # renew rnn state
                total_step += 1

                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    if self.name == "W_0":
                        print(self.name, "Ep:", GLOBAL_EP, "Ep_r:", ep_r)
                    GLOBAL_EP += 1
                    if GLOBAL_EP % SAVE_INTERVAL == 0:
                        print("Trying to save...")
                        self.AC.save_global()
                        print("...saved!")
                    break
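The value targets built in work() walk the reward buffer backwards, bootstrapping from the critic's estimate of the state after the last buffered transition. Extracted as a standalone sketch (the function name compute_v_targets is illustrative):

def compute_v_targets(rewards, bootstrap_value, gamma):
    # bootstrap_value is v_s_: 0 for a terminal state, otherwise the
    # critic's value estimate of the state following the last reward.
    v = bootstrap_value
    targets = []
    for r in reversed(rewards):
        v = r + gamma * v
        targets.append(v)
    targets.reverse()
    return targets

# With GAMMA = 0.9:
# compute_v_targets([1.0, 0.0, -1.0], 0.0, 0.9) -> [0.19, -0.9, -1.0]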
GAMMA = 0.9
ENTROPY_BETA = 0.01
LR_A = 0.0001  # learning rate for actor
LR_C = 0.001   # learning rate for critic
GLOBAL_RUNNING_R = []
GLOBAL_EP = 0

ENV_NAME = "square"
ENV_NAME_2 = "roblab"
ENV_NAME_3 = "room"
CLUSTER_SIZE = 10
SKIP_LRF = 20

env = Environment(ENV_NAME)
env.set_cluster_size(CLUSTER_SIZE)

N_S = env.observation_size() + 64  # state size: laser scan plus rotation observation TODO
N_A = 5  # action size


class ACNet(object):
    def __init__(self, scope, globalAC=None):
        if scope == GLOBAL_NET_SCOPE:  # get global network
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
                self.a_params, self.c_params = self._build_net(scope)[-2:]
        else:  # local net, calculates losses
            with tf.variable_scope(scope):
                self.s = tf.placeholder(tf.float32, [None, N_S], 'S')
class Worker(object):
    def __init__(self, name, globalAC):
        self.env = Environment(ENV_NAME)
        self.env.set_cluster_size(CLUSTER_SIZE)
        self.name = name
        self.AC = ACNet(name, globalAC)

    def convert_action(self, action):
        # Map the discrete action index to (linear, angular) velocities.
        if action == 0:
            angular = 1.0
            linear = 0.5
        elif action == 1:
            angular = 0.5
            linear = 0.75
        elif action == 2:
            angular = 0.0
            linear = 1.0
        elif action == 3:
            angular = -0.5
            linear = 0.75
        else:
            angular = -1.0
            linear = 0.5
        return linear, angular

    def work(self):
        global GLOBAL_RUNNING_R, GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []

        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s, _, _, _ = self.env.reset()
            s = np.reshape(s, [1, N_S])
            ep_r = 0

            rnn_state = SESS.run(self.AC.init_state)  # zero rnn state at the beginning
            keep_state = rnn_state.copy()  # keep rnn state for updating the global net

            for ep_t in range(MAX_EP_STEP):
                if self.name == 'W_0':
                    self.env.visualize()

                a, rnn_state_ = self.AC.choose_action(s, rnn_state)  # get the action and next rnn state
                action = np.argmax(a)
                linear, angular = self.convert_action(action)

                s_, r, done, _ = self.env.step(linear, angular, 10)  # the number is how many laser scans to skip
                s_ = np.reshape(s_, [1, N_S])

                done = True if ep_t == MAX_EP_STEP - 1 else done
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)
                # buffer_r.append((r + 8) / 8)  # normalize

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # Update the global net and sync it back to the local net.
                    if done:
                        v_s_ = 0  # terminal state has zero value
                    else:
                        v_s_ = SESS.run(self.AC.v, {
                            self.AC.s: s_,
                            self.AC.init_state: rnn_state_
                        })[0, 0]

                    # Walk the reward buffer backwards to build discounted value targets.
                    buffer_v_target = []
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        self.AC.s: buffer_s,
                        self.AC.a_his: buffer_a,
                        self.AC.v_target: buffer_v_target,
                        self.AC.init_state: keep_state,
                    }
                    self.AC.update_global(feed_dict)

                    buffer_s, buffer_a, buffer_r = [], [], []
                    self.AC.pull_global()
                    keep_state = rnn_state_.copy()  # replace keep_state with the new initial rnn state

                s = s_
                rnn_state = rnn_state_  # renew rnn state
                total_step += 1

                if self.name == 'W_0':
                    self.env.visualize()

                if done:
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.9 * GLOBAL_RUNNING_R[-1] + 0.1 * ep_r)
                    print(
                        self.name,
                        "Ep:", GLOBAL_EP,
                        "| Ep_r: %i" % GLOBAL_RUNNING_R[-1],
                    )
                    GLOBAL_EP += 1
                    break
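The console log smooths episode rewards with an exponential moving average (decay 0.9 on the running value, weight 0.1 on the new episode). A tiny runnable illustration of the GLOBAL_RUNNING_R update rule:

def running_reward(history, ep_r):
    # Same rule as in work(): seed with the first episode reward,
    # then blend 0.9 * previous + 0.1 * new.
    if not history:
        history.append(ep_r)
    else:
        history.append(0.9 * history[-1] + 0.1 * ep_r)
    return history

# running_reward([], 10.0)    -> [10.0]
# running_reward([10.0], 0.0) -> [10.0, 9.0]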
        angular = 0
        linear = 1.5
    elif action == 3:
        angular = -0.44
        linear = 1.25
    else:
        angular = -0.77
        linear = 0.75
    return linear, angular


if __name__ == "__main__":
    env = Environment("test")
    env.set_cluster_size(10)
    state_size = env.observation_size()  # number of laser scans
    action_size = 5
    agent = RNNAgent(state_size, action_size)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32

    print("START")
    for e in range(EPISODES):
        reward_sum = 0
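The excerpt breaks off at the top of the episode loop. Purely as a hedged sketch of how such a loop commonly continues (RNNAgent's act, remember, replay, and memory are assumed interfaces in the style of the classic Keras DQN example this file resembles; the laser-skip value 10 is also an assumption):

        # Hypothetical continuation, not part of the original excerpt.
        state, _, _, _ = env.reset()
        state = np.reshape(state, [1, state_size])
        while True:
            action = agent.act(state)  # epsilon-greedy action index (assumed interface)
            linear, angular = convert_action(action)
            next_state, reward, done, _ = env.step(linear, angular, 10)
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            reward_sum += reward
            if done:
                break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)  # train on a sampled minibatch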