class SOUL3:
    def __init__(self, env, config, sess):
        # self.Soulsess = tf.InteractiveSession()
        if sess is None:
            self.sess = tf.Session()
        else:
            self.sess = sess
        self.config = config
        self.expert_memory = Memory(capacity=self.config.EXPERT_MEMORY_SIZE, permanent_data=0)
        self.generate_memory = Memory(capacity=self.config.GENERATE_MEMORY_SIZE, permanent_data=0)
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.ob_space = env.observation_space
        self.gamma = 0.95
        self.Policy = Policy_net('policy', env)
        self.Old_Policy = Policy_net('old_policy', env)
        self.PPO = PPOTrain(self.Policy, self.Old_Policy, self.gamma)
        self.D = Discriminator(env)
        self.epsilon = self.config.INITIAL_EPSILON
        self.saver = tf.train.Saver()
        # initialize variables only after the whole graph has been built; running
        # the initializer right after creating the session (as an earlier version
        # did) would miss the variables created by Policy_net / Discriminator above
        if sess is None:
            self.sess.run(tf.global_variables_initializer())

    def add_data_to_memory(self, D, M):
        for t in D:
            M.store(np.array(t, dtype=object))

    def get_data_from_fullmemory(self, MFULL):
        _, D, _ = MFULL.sample(len(MFULL))
        return D

    def copy_AFULL_to_B(self, AFULL, B):
        _, data, _ = AFULL.sample(len(AFULL))
        for t in data:
            B.store(np.array(t, dtype=object))

    def random_action(self):
        return random.randint(0, self.action_dim - 1)

    def perceive(self, transition):
        self.generate_memory.store(np.array(transition))
        # anneal epsilon towards FINAL_EPSILON once the buffer is full
        if self.generate_memory.full():
            self.epsilon = max(self.config.FINAL_EPSILON, self.epsilon * self.config.EPSILIN_DECAY)

    def save_model(self):
        print("SOUL model saved in: {}".format(self.saver.save(self.sess, self.config.SOUL_MODEL_PATH)))

    def restore_model(self):
        self.saver.restore(self.sess, self.config.SOUL_MODEL_PATH)
        print("SOUL model restored.")
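# All of the classes in this file depend on an external `Memory` class with the
# interface used above: store(transition), sample(n) -> (tree_idxes, batch,
# ISWeights), full(), batch_update(tree_idxes, abs_errors), __len__(), and a
# `permanent_data` prefix that is never evicted. The real implementation is
# presumably a prioritized replay buffer built on a sum-tree; the stand-in below
# (UniformMemory, a hypothetical name) only sketches that interface with uniform
# sampling so the rest of the file can be read against something concrete.
class UniformMemory:
    def __init__(self, capacity, permanent_data=0):
        self.capacity = capacity
        self.permanent_data = permanent_data  # the first `permanent_data` items are never overwritten
        self.data = []
        self.insert_idx = 0

    def __len__(self):
        return len(self.data)

    def full(self):
        return len(self.data) >= self.capacity

    def store(self, transition):
        if len(self.data) < self.capacity:
            self.data.append(transition)
        else:
            # overwrite the oldest non-permanent slot
            writable = self.capacity - self.permanent_data
            assert writable > 0, "buffer holds only permanent data"
            idx = self.permanent_data + (self.insert_idx % writable)
            self.data[idx] = transition
            self.insert_idx += 1

    def sample(self, n):
        idxes = np.random.randint(0, len(self.data), size=n)
        batch = [self.data[i] for i in idxes]
        weights = np.ones((n, 1), dtype=np.float32)  # uniform sampling -> unit importance weights
        return idxes, batch, weights

    def batch_update(self, idxes, abs_errors):
        pass  # no priorities to update in the uniform version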
class GAIL:
    def __init__(self, config, env, demo_transitions=None):
        # the configuration object is defined in a separate file
        self.sess = tf.InteractiveSession()
        self.config = config
        self.generate_memory = Memory(capacity=self.config.generate_memory_size, permanent_data=0)
        self.expert_memory = Memory(capacity=self.config.expert_memory_size, permanent_data=0)
        self.add_data_to_generate_memory(source=demo_transitions)
        self.add_data_to_expert_memory(source=demo_transitions)
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.ob_space = env.observation_space
        self.gamma = 0.95
        self.Policy = Policy_net('policy', env)
        self.Old_Policy = Policy_net('old_policy', env)
        self.PPO = PPOTrain(self.Policy, self.Old_Policy, self.gamma)  # was PPOTrain(self.Policy, self.Policy, ...), which never separates old_policy
        self.D = Discriminator(env)
        self.epsilon = self.config.INITIAL_EPSILON
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        print("GAIL initialized.")
        # write an initial checkpoint and immediately restore it
        self.save_model()
        self.restore_model()

    def add_data_to_generate_memory(self, source):
        for t in source:
            self.generate_memory.store(np.array(t, dtype=object))

    def add_data_to_expert_memory(self, source):
        for t in source:
            self.expert_memory.store(np.array(t, dtype=object))

    def perceive(self, transition):
        self.generate_memory.store(np.array(transition))
        # anneal epsilon towards FINAL_EPSILON once the buffer is full
        if self.generate_memory.full():
            self.epsilon = max(self.config.FINAL_EPSILON, self.epsilon * self.config.EPSILIN_DECAY)

    def save_model(self):
        print("Model saved in : {}".format(self.saver.save(self.sess, self.config.MODEL_PATH_soul)))

    def restore_model(self):
        self.saver.restore(self.sess, self.config.MODEL_PATH_soul)
        print("Model restored.")
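# The DQfD class below accesses self.Q_eval, self.loss, etc. as bare attributes
# and decorates the corresponding methods with @lazy_property, which is not
# defined in this snippet. A common implementation of that pattern (memoize the
# property so each graph piece is built exactly once, on first access) looks
# like this; treat it as an assumption about the missing helper:
import functools

def lazy_property(func):
    attribute = '_lazy_' + func.__name__

    @property
    @functools.wraps(func)
    def wrapper(self):
        # build the ops on first access, then return the cached result
        if not hasattr(self, attribute):
            setattr(self, attribute, func(self))
        return getattr(self, attribute)
    return wrapper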
class DQfD:
    def __init__(self, env, config, demo_transitions=None):
        self.sess = tf.InteractiveSession()
        self.config = config
        # replay_memory stores both demo and generated data; demo_memory stores demo data only
        self.replay_memory = Memory(capacity=self.config.replay_buffer_size, permanent_data=len(demo_transitions))
        self.demo_memory = Memory(capacity=self.config.demo_buffer_size, permanent_data=self.config.demo_buffer_size)
        self.add_demo_to_memory(demo_transitions=demo_transitions)  # add demo data to both demo_memory & replay_memory
        self.time_step = 0
        self.epsilon = self.config.INITIAL_EPSILON
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.action_batch = tf.placeholder("int32", [None])
        self.y_input = tf.placeholder("float", [None, self.action_dim])
        self.ISWeights = tf.placeholder("float", [None, 1])
        self.n_step_y_input = tf.placeholder("float", [None, self.action_dim])  # for n-step reward
        self.isdemo = tf.placeholder("float", [None])
        self.eval_input = tf.placeholder("float", [None, self.state_dim])
        self.select_input = tf.placeholder("float", [None, self.state_dim])

        # touching the @lazy_property attributes forces the corresponding graph
        # pieces to be built before the variables are initialized below
        self.Q_eval
        self.Q_select
        self.loss
        self.optimize
        self.update_target_net
        self.abs_errors

        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        self.save_model()
        self.restore_model()

    def add_demo_to_memory(self, demo_transitions):
        # add demo data to both demo_memory & replay_memory
        for t in demo_transitions:
            self.demo_memory.store(np.array(t, dtype=object))
            self.replay_memory.store(np.array(t, dtype=object))
            # transition layout: (state, action, reward, next_state, done,
            # is_demo, n_step_reward, n_step_state, n_step_done, actual_n)
            assert len(t) == 10

    # use the expert demo data to pre-train
    def pre_train(self):
        print('Pre-training ...')
        for i in range(self.config.PRETRAIN_STEPS):
            self.train_Q_network(pre_train=True)
            if i % 200 == 0 and i > 0:
                print('{}th step of pre-training finished ...'.format(i))
        self.time_step = 0
        print('All pre-training finished.')

    # TODO: how to add the variables created by tf.layers.dense to a custom collection?
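    # One possible answer to the TODO above (an assumption, not the author's
    # solution): let tf.layers.dense create the variables, then look them up by
    # scope and add them to the custom collection by hand. Sketch only; the
    # class keeps using the tf.get_variable version below.
    def build_layers_dense(self, state, collection, units_1, units_2, w_i, b_i, regularizer=None):
        dense1 = tf.layers.dense(state, units_1, activation=tf.nn.relu,
                                 kernel_initializer=w_i, bias_initializer=b_i,
                                 kernel_regularizer=regularizer, bias_regularizer=regularizer)
        dense2 = tf.layers.dense(dense1, units_2, activation=tf.nn.relu,
                                 kernel_initializer=w_i, bias_initializer=b_i,
                                 kernel_regularizer=regularizer, bias_regularizer=regularizer)
        dense3 = tf.layers.dense(dense2, self.action_dim,
                                 kernel_initializer=w_i, bias_initializer=b_i,
                                 kernel_regularizer=regularizer, bias_regularizer=regularizer)
        # fetch everything created under the current variable scope and tag it
        scope_name = tf.get_variable_scope().name
        for var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope_name):
            tf.add_to_collection(collection, var)
        return dense3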
    # def build_layers(self, state, collections, units_1, units_2, w_i, b_i, regularizer=None):
    #     with tf.variable_scope('dense1'):
    #         dense1 = tf.layers.dense(tf.contrib.layers.flatten(state), activation=tf.nn.relu, units=units_1,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     with tf.variable_scope('dense2'):
    #         dense2 = tf.layers.dense(dense1, activation=tf.nn.relu, units=units_2,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     with tf.variable_scope('dense3'):
    #         dense3 = tf.layers.dense(dense2, activation=tf.nn.relu, units=self.action_dim,
    #                                  kernel_initializer=w_i, bias_initializer=b_i,
    #                                  kernel_regularizer=regularizer, bias_regularizer=regularizer)
    #     return dense3

    def build_layers(self, state, c_names, units_1, units_2, w_i, b_i, reg=None):
        a_d = self.action_dim
        with tf.variable_scope('l1'):
            w1 = tf.get_variable('w1', [self.state_dim, units_1], initializer=w_i, collections=c_names, regularizer=reg)
            b1 = tf.get_variable('b1', [1, units_1], initializer=b_i, collections=c_names, regularizer=reg)
            dense1 = tf.nn.relu(tf.matmul(state, w1) + b1)
        with tf.variable_scope('l2'):
            w2 = tf.get_variable('w2', [units_1, units_2], initializer=w_i, collections=c_names, regularizer=reg)
            b2 = tf.get_variable('b2', [1, units_2], initializer=b_i, collections=c_names, regularizer=reg)
            dense2 = tf.nn.relu(tf.matmul(dense1, w2) + b2)
        with tf.variable_scope('l3'):
            w3 = tf.get_variable('w3', [units_2, a_d], initializer=w_i, collections=c_names, regularizer=reg)
            b3 = tf.get_variable('b3', [1, a_d], initializer=b_i, collections=c_names, regularizer=reg)
            dense3 = tf.matmul(dense2, w3) + b3
        return dense3

    @lazy_property
    def Q_select(self):
        with tf.variable_scope('select_net'):
            c_names = ['select_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            reg = tf.contrib.layers.l2_regularizer(scale=0.2)  # note: only parameters in select-net need L2
            return self.build_layers(self.select_input, c_names, 24, 24, w_i, b_i, reg)

    @lazy_property
    def Q_eval(self):
        with tf.variable_scope('eval_net'):
            c_names = ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            w_i = tf.random_uniform_initializer(-0.1, 0.1)
            b_i = tf.constant_initializer(0.1)
            return self.build_layers(self.eval_input, c_names, 24, 24, w_i, b_i)

    def loss_l(self, ae, a):
        # large-margin term: 0 when a is the expert action, a positive margin
        # otherwise. ae is a tf scalar, so a graph op is required; the original
        # Python-level `ae == a` compared object identity and always returned False
        return tf.cast(tf.not_equal(ae, a), tf.float32) * 0.8

    def loss_jeq(self, Q_select):
        jeq = 0.0
        for i in range(self.config.BATCH_SIZE):
            ae = self.action_batch[i]
            max_value = float("-inf")
            for a in range(self.action_dim):
                max_value = tf.maximum(Q_select[i][a] + self.loss_l(ae, a), max_value)
            jeq += self.isdemo[i] * (max_value - Q_select[i][ae])
        return jeq

    @lazy_property
    def loss(self):
        l_dq = tf.reduce_mean(tf.squared_difference(self.Q_select, self.y_input))
        l_n_dq = tf.reduce_mean(tf.squared_difference(self.Q_select, self.n_step_y_input))
        # l_n_step_dq = self.loss_n_step_dq(self.Q_select, self.n_step_y_input)
        l_jeq = self.loss_jeq(self.Q_select)
        l_l2 = tf.reduce_sum([tf.reduce_mean(reg_l) for reg_l in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)])
        return self.ISWeights * tf.reduce_sum([l * lam for l, lam in zip([l_dq, l_n_dq, l_jeq, l_l2], self.config.LAMBDA)])

    @lazy_property
    def abs_errors(self):
        return tf.reduce_sum(tf.abs(self.y_input - self.Q_select), axis=1)  # only use 1-step reward to compute abs_errors
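    # For reference, `loss` above implements the combined DQfD objective
    # (Hester et al., 2018):
    #     J(Q) = J_DQ(Q) + lambda_1 * J_n(Q) + lambda_2 * J_E(Q) + lambda_3 * J_L2(Q)
    # where J_DQ is the 1-step double-DQN loss, J_n the n-step TD loss, J_E the
    # large-margin supervised loss (gated by `isdemo`, so it only applies to
    # demonstration transitions), and J_L2 the regularization term; the weights
    # (including one on J_DQ in this implementation) come from self.config.LAMBDA.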
    @lazy_property
    def optimize(self):
        optimizer = tf.train.AdamOptimizer(self.config.LEARNING_RATE)
        return optimizer.minimize(self.loss)  # only parameters in select-net are optimized here

    @lazy_property
    def update_target_net(self):
        select_params = tf.get_collection('select_net_params')
        eval_params = tf.get_collection('eval_net_params')
        return [tf.assign(e, s) for e, s in zip(eval_params, select_params)]

    def save_model(self):
        print("Model saved in : {}".format(self.saver.save(self.sess, self.config.MODEL_PATH)))

    def restore_model(self):
        self.saver.restore(self.sess, self.config.MODEL_PATH)
        print("Model restored.")

    def perceive(self, transition):
        self.replay_memory.store(np.array(transition))
        # anneal epsilon towards FINAL_EPSILON once the buffer is full
        if self.replay_memory.full():
            self.epsilon = max(self.config.FINAL_EPSILON, self.epsilon * self.config.EPSILIN_DECAY)

    def train_Q_network(self, pre_train=False, update=True):
        """
        :param pre_train: True means sampling from demo_buffer instead of replay_buffer
        :param update: True means update_target_net may run inside this function;
                       False means it is executed externally and can be ignored here
        """
        if not pre_train and not self.replay_memory.full():
            # sampling should be executed AFTER replay_memory is filled
            return
        self.time_step += 1
        assert self.replay_memory.full() or pre_train

        actual_memory = self.demo_memory if pre_train else self.replay_memory
        # TODO: plug in other sampling methods here; they depend on other ways of
        # building the tree. The current idea is to expose this interface first.
        tree_idxes, minibatch, ISWeights = actual_memory.sample(self.config.BATCH_SIZE)
        # actual_memory.memory_test()
        # np.random.shuffle(minibatch)

        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]
        done_batch = [data[4] for data in minibatch]
        demo_data = [data[5] for data in minibatch]
        n_step_reward_batch = [data[6] for data in minibatch]
        n_step_state_batch = [data[7] for data in minibatch]
        n_step_done_batch = [data[8] for data in minibatch]
        actual_n = [data[9] for data in minibatch]

        # compute the values that feed the placeholders first
        Q_select = self.Q_select.eval(feed_dict={self.select_input: next_state_batch})
        Q_eval = self.Q_eval.eval(feed_dict={self.eval_input: next_state_batch})
        n_step_Q_select = self.Q_select.eval(feed_dict={self.select_input: n_step_state_batch})
        n_step_Q_eval = self.Q_eval.eval(feed_dict={self.eval_input: n_step_state_batch})

        y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        n_step_y_batch = np.zeros((self.config.BATCH_SIZE, self.action_dim))
        for i in range(self.config.BATCH_SIZE):
            # transition: state, action, reward, next_state, done, demo_data, n_step_reward, n_step_state, n_step_done, actual_n
            temp = self.Q_select.eval(feed_dict={self.select_input: state_batch[i].reshape((-1, self.state_dim))})[0]
            temp_0 = np.copy(temp)
            # add 1-step reward (double-DQN target: select-net picks the action, eval-net scores it)
            action = np.argmax(Q_select[i])
            temp[action_batch[i]] = reward_batch[i] + (1 - int(done_batch[i])) * self.config.GAMMA * Q_eval[i][action]
            y_batch[i] = temp
            # add n-step reward
            action = np.argmax(n_step_Q_select[i])
            q_n_step = (1 - int(n_step_done_batch[i])) * self.config.GAMMA ** actual_n[i] * n_step_Q_eval[i][action]
            temp_0[action_batch[i]] = n_step_reward_batch[i] + q_n_step
            n_step_y_batch[i] = temp_0

        # tf.summary.FileWriter("logs/", self.sess.graph)
        _, abs_errors = self.sess.run(
            [self.optimize, self.abs_errors],
            feed_dict={self.y_input: y_batch,
                       self.n_step_y_input: n_step_y_batch,
                       self.select_input: state_batch,
                       self.action_batch: action_batch,
                       self.isdemo: demo_data,
                       self.ISWeights: ISWeights})
        self.replay_memory.batch_update(tree_idxes, abs_errors)  # update the priorities of the sampled transitions

        # In this task an episode has a bounded number of steps, so the caller may
        # prefer to update the target net once per episode; pass update=False to
        # control the update externally
        if update and self.time_step % self.config.UPDATE_TARGET_NET == 0:
            self.sess.run(self.update_target_net)

    def egreedy_action(self, state):
        if random.random() <= self.epsilon:
            return random.randint(0, self.action_dim - 1)
        return np.argmax(self.Q_select.eval(feed_dict={self.select_input: [state]})[0])
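# A minimal, hypothetical driver showing how the DQfD agent above is meant to be
# used; `Config`, `load_demo_transitions`, and the naive 1-step stand-ins for the
# n-step fields are assumptions, not part of this file.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v0')
    demo_transitions = load_demo_transitions()  # hypothetical loader for recorded expert data
    agent = DQfD(env, Config(), demo_transitions=demo_transitions)
    agent.pre_train()  # supervised warm-up on the demonstrations

    for episode in range(300):
        state = env.reset()
        done = False
        while not done:
            action = agent.egreedy_action(state)
            next_state, reward, done, _ = env.step(action)
            # the real pipeline accumulates proper n-step returns; in this sketch
            # the n-step fields simply repeat the 1-step values (actual_n = 1)
            transition = (state, action, reward, next_state, done,
                          0.0, reward, next_state, done, 1)
            agent.perceive(transition)
            agent.train_Q_network(update=False)
            state = next_state
        agent.sess.run(agent.update_target_net)  # per-episode target update (update=False above)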