import os

import numpy as np
import tensorflow as tf
from tensorflow.contrib import layers

# Project-internal helpers (Agent, Experience, Memory, OrnsteinUhlenbeckActionNoise,
# batch_norm_layer, linear_fan_in, fan_in_initializer, conv2d, linear) are assumed
# to be importable from elsewhere in the repository.


def _save_experience(experience: Experience, memory: Memory,
                     next_state: np.ndarray) -> None:
    """
    Save an experience to memory after filling in its next state.

    :param experience: experience from the action that was taken
    :param memory: the agent's memory holding all saved experiences
    :param next_state: the state the agent ended up in
    """
    experience = experience._replace(next_state=next_state)
    memory.add(experience)
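# Usage sketch (hedged): `Experience` is assumed to be a namedtuple with a
# `next_state` field (the `_replace` call above relies on that); the other
# field names below are illustrative, not taken from the project.
#
#   experience = Experience(state=state, action=action, reward=reward,
#                           next_state=None, terminal=done)
#   _save_experience(experience, memory, new_state)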
class DDPG(Agent):
    def __init__(self, env, monitor_path: str, **usercfg) -> None:
        super(DDPG, self).__init__(**usercfg)
        self.env = env
        self.monitor_path: str = monitor_path
        self.config.update(
            n_episodes=100000,
            n_timesteps=env.spec.tags.get(
                "wrapper_config.TimeLimit.max_episode_steps"),
            actor_learning_rate=1e-4,
            critic_learning_rate=1e-3,
            ou_theta=0.15,
            ou_sigma=0.2,
            gamma=0.99,
            batch_size=64,
            tau=0.001,
            l2_loss_coef=1e-2,
            n_actor_layers=2,
            n_hidden_units=64,
            actor_layer_norm=True,
            critic_layer_norm=False,  # Batch norm for the critic does not seem to work
            replay_buffer_size=1e6,
            replay_start_size=10000  # Required number of replay buffer entries to start training
        )
        self.config.update(usercfg)

        self.state_shape: list = list(env.observation_space.shape)
        self.n_actions: int = env.action_space.shape[0]
        self.states = tf.placeholder(tf.float32, [None] + self.state_shape,
                                     name="states")
        self.actions_taken = tf.placeholder(tf.float32,
                                            [None, self.n_actions],
                                            name="actions_taken")
        self.critic_target = tf.placeholder(tf.float32, [None, 1],
                                            name="critic_target")
        self.is_training = tf.placeholder(tf.bool, name="is_training")

        with tf.variable_scope("actor"):
            self.action_output, self.actor_vars = self.build_actor_network()

        self.target_action_output, actor_target_update = \
            self.build_target_actor_network(self.actor_vars)

        self.q_gradient_input = tf.placeholder("float",
                                               [None, self.n_actions],
                                               name="q_grad_input")
        self.actor_policy_gradients = tf.gradients(self.action_output,
                                                   self.actor_vars,
                                                   -self.q_gradient_input,
                                                   name="actor_gradients")
        self.actor_train_op = tf.train.AdamOptimizer(
            self.config["actor_learning_rate"],
            name="actor_optimizer").apply_gradients(
                list(zip(self.actor_policy_gradients, self.actor_vars)))

        with tf.variable_scope("critic"):
            self.q_value_output, self.critic_vars = self.build_critic_network()

        self.target_q_value_output, critic_target_update = \
            self.build_target_critic_network(self.critic_vars)

        l2_loss = tf.add_n([
            self.config["l2_loss_coef"] * tf.nn.l2_loss(var)
            for var in self.critic_vars
        ])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.critic_target - self.q_value_output)) + l2_loss
        self.critic_train_op = tf.train.AdamOptimizer(
            self.config["critic_learning_rate"],
            name="critic_optimizer").minimize(self.critic_loss)
        self.action_gradients = tf.gradients(self.q_value_output,
                                             self.actions_taken,
                                             name="action_gradients")

        summaries = []
        for v in self.actor_vars + self.critic_vars:
            summaries.append(tf.summary.histogram(v.name, v))
        self.model_summary_op = tf.summary.merge(summaries)

        self.update_targets_op = tf.group(actor_target_update,
                                          critic_target_update,
                                          name="update_targets")
        self.init_op = tf.global_variables_initializer()

        self.action_noise = OrnsteinUhlenbeckActionNoise(
            self.n_actions, self.config["ou_sigma"], self.config["ou_theta"])
        self.replay_buffer = Memory(int(self.config["replay_buffer_size"]))
        self.n_updates = 0

        self.summary_writer = tf.summary.FileWriter(
            os.path.join(self.monitor_path, "summaries"),
            tf.get_default_graph())

    def build_actor_network(self):
        layer1_size = 400
        layer2_size = 300

        x = self.states
        if self.config["actor_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="batch_norm_0",
                                 activation=tf.identity)
        with tf.variable_scope("L1"):
            x, l1_vars = linear_fan_in(x, layer1_size)
            if self.config["actor_layer_norm"]:
                x = batch_norm_layer(x,
                                     training_phase=self.is_training,
                                     scope_bn="batch_norm_1",
                                     activation=tf.nn.relu)
        with tf.variable_scope("L2"):
            x, l2_vars = linear_fan_in(x, layer2_size)
            if self.config["actor_layer_norm"]:
                x = batch_norm_layer(x,
                                     training_phase=self.is_training,
                                     scope_bn="batch_norm_2",
                                     activation=tf.nn.relu)
        with tf.variable_scope("L3"):
            W3 = tf.Variable(tf.random_uniform(
                [layer2_size, self.n_actions], -3e-3, 3e-3),
                             name="w")
            b3 = tf.Variable(tf.random_uniform([self.n_actions], -3e-3, 3e-3),
                             name="b")
            action_output = tf.tanh(tf.nn.xw_plus_b(x, W3, b3))
            l3_vars = [W3, b3]
        return action_output, l1_vars + l2_vars + l3_vars

    def build_target_actor_network(self, actor_vars: list):
        ema = tf.train.ExponentialMovingAverage(decay=1 - self.config["tau"])
        target_update = ema.apply(actor_vars)
        target_net = [ema.average(v) for v in actor_vars]

        x = self.states
        if self.config["actor_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="target_batch_norm_0",
                                 activation=tf.identity)
        x = tf.nn.xw_plus_b(x, target_net[0], target_net[1])
        if self.config["actor_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="target_batch_norm_1",
                                 activation=tf.nn.relu)
        x = tf.nn.xw_plus_b(x, target_net[2], target_net[3])
        if self.config["actor_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="target_batch_norm_2",
                                 activation=tf.nn.relu)
        action_output = tf.tanh(
            tf.nn.xw_plus_b(x, target_net[4], target_net[5]))
        return action_output, target_update

    def build_critic_network(self):
        layer1_size = 400
        layer2_size = 300

        x = self.states
        with tf.variable_scope("L1"):
            if self.config["critic_layer_norm"]:  # Defaults to False (= don't use it)
                x = batch_norm_layer(x,
                                     training_phase=self.is_training,
                                     scope_bn="batch_norm_0",
                                     activation=tf.identity)
            x, l1_vars = linear_fan_in(x, layer1_size)
            x = tf.nn.relu(x)
        with tf.variable_scope("L2"):
            W2 = tf.get_variable(
                "w", [layer1_size, layer2_size],
                initializer=fan_in_initializer(layer1_size + self.n_actions))
            W2_action = tf.get_variable(
                "w_action", [self.n_actions, layer2_size],
                initializer=fan_in_initializer(layer1_size + self.n_actions))
            b2 = tf.get_variable(
                "b", [layer2_size],
                initializer=fan_in_initializer(layer1_size + self.n_actions))
            x = tf.nn.relu(
                tf.matmul(x, W2) + tf.matmul(self.actions_taken, W2_action) +
                b2)
        with tf.variable_scope("L3"):
            W3 = tf.Variable(tf.random_uniform([layer2_size, 1], -3e-3, 3e-3),
                             name="w")
            b3 = tf.Variable(tf.random_uniform([1], -3e-3, 3e-3), name="b")
            q_value_output = tf.nn.xw_plus_b(x, W3, b3, name="q_value")
        return q_value_output, l1_vars + [W2, W2_action, b2, W3, b3]

    def build_target_critic_network(self, critic_vars: list):
        ema = tf.train.ExponentialMovingAverage(decay=1 - self.config["tau"])
        target_update = ema.apply(critic_vars)
        target_net = [ema.average(v) for v in critic_vars]

        x = self.states
        if self.config["critic_layer_norm"]:
            x = batch_norm_layer(x,
                                 training_phase=self.is_training,
                                 scope_bn="batch_norm_0",
                                 activation=tf.identity)
        x = tf.nn.relu(tf.nn.xw_plus_b(x, target_net[0], target_net[1]))
        x = tf.nn.relu(
            tf.matmul(x, target_net[2]) +
            tf.matmul(self.actions_taken, target_net[3]) + target_net[4])
        q_value_output = tf.nn.xw_plus_b(x, target_net[5], target_net[6])
        return q_value_output, target_update

    def actor_gradients(self, state_batch: np.ndarray,
                        action_batch: np.ndarray):
        q, grads = tf.get_default_session().run(
            [self.q_value_output, self.action_gradients],
            feed_dict={
                self.states: state_batch,
                self.actions_taken: action_batch,
                self.is_training: False
            })
        summary = tf.Summary()
        summary.value.add(tag="model/actor_loss",
                          simple_value=float(-np.mean(q)))
        self.summary_writer.add_summary(summary, self.n_updates)
        return grads[0]
    def target_q(self, states: np.ndarray, actions: np.ndarray):
        return tf.get_default_session().run(self.target_q_value_output,
                                            feed_dict={
                                                self.states: states,
                                                self.actions_taken: actions,
                                                self.is_training: False
                                            })

    def q_value(self, states: np.ndarray, actions: np.ndarray):
        return tf.get_default_session().run(self.q_value_output,
                                            feed_dict={
                                                self.states: states,
                                                self.actions_taken: actions,
                                                self.is_training: False
                                            })

    def actions(self, states: np.ndarray) -> np.ndarray:
        """Get the actions for a batch of states."""
        return tf.get_default_session().run(self.action_output,
                                            feed_dict={
                                                self.states: states,
                                                self.is_training: True
                                            })

    def action(self, state: np.ndarray) -> np.ndarray:
        """Get the action for a single state."""
        return tf.get_default_session().run(self.action_output,
                                            feed_dict={
                                                self.states: [state],
                                                self.is_training: False
                                            })[0]

    def target_actions(self, states: np.ndarray) -> np.ndarray:
        """Get the actions for a batch of states using the target actor network."""
        return tf.get_default_session().run(self.target_action_output,
                                            feed_dict={
                                                self.states: states,
                                                self.is_training: True
                                            })

    def train(self):
        sample = self.replay_buffer.get_batch(self.config["batch_size"])

        # Reshape the sampled actions (needed when n_actions == 1)
        action_batch = np.resize(sample["actions"],
                                 [self.config["batch_size"], self.n_actions])

        # Calculate critic targets
        next_action_batch = self.target_actions(sample["states1"])
        q_value_batch = self.target_q(sample["states1"], next_action_batch)
        critic_targets = sample["rewards"] + (1 - sample["terminals1"]) * \
            self.config["gamma"] * q_value_batch.squeeze()
        critic_targets = np.resize(
            critic_targets, [self.config["batch_size"], 1]).astype(np.float32)

        # Update critic weights
        fetches = [self.q_value_output, self.critic_loss, self.critic_train_op]
        predicted_q, critic_loss, _ = tf.get_default_session().run(
            fetches,
            feed_dict={
                self.critic_target: critic_targets,
                self.states: sample["states0"],
                self.actions_taken: action_batch,
                self.is_training: True
            })
        summary = tf.Summary()
        summary.value.add(tag="model/critic_loss",
                          simple_value=float(critic_loss))
        summary.value.add(tag="model/predicted_q_mean",
                          simple_value=np.mean(predicted_q))
        summary.value.add(tag="model/predicted_q_std",
                          simple_value=np.std(predicted_q))
        self.summary_writer.add_summary(summary, self.n_updates)

        # Update the actor using the sampled gradient:
        action_batch_for_gradients = self.actions(sample["states0"])
        q_gradient_batch = self.actor_gradients(sample["states0"],
                                                action_batch_for_gradients)
        tf.get_default_session().run(
            self.actor_train_op,
            feed_dict={
                self.q_gradient_input: q_gradient_batch,
                self.states: sample["states0"],
                self.is_training: True
            })

        # Update the target networks
        tf.get_default_session().run(
            [self.update_targets_op, self.model_summary_op])
        self.n_updates += 1

    def noise_action(self, state: np.ndarray):
        """Choose an action based on the actor and exploration noise."""
        action = self.action(state)
        return action + self.action_noise()

    def learn(self):
        max_action = self.env.action_space.high
        with tf.Session() as sess, sess.as_default():
            sess.run(self.init_op)
            for episode in range(self.config["n_episodes"]):
                state = self.env.reset()
                episode_reward = 0
                episode_length = 0
                for _ in range(self.config["n_timesteps"]):
                    action = self.noise_action(state)
                    new_state, reward, done, _ = self.env.step(action *
                                                               max_action)
                    episode_length += 1
                    episode_reward += reward
                    self.replay_buffer.add(state, action, reward, new_state,
                                           done)
                    if self.replay_buffer.n_entries > self.config[
                            "replay_start_size"]:
                        self.train()
                    state = new_state
                    if done:
                        self.action_noise.reset()
                        summary = tf.Summary()
summary.value.add(tag="global/Episode_length", simple_value=float(episode_length)) summary.value.add(tag="global/Reward", simple_value=float(episode_reward)) self.summary_writer.add_summary(summary, episode) self.summary_writer.flush() break
class MbPA:
    def __init__(self, sess, args):
        with tf.variable_scope(args.model_name):
            self.args = args
            self.learning_rate = args.learning_rate
            self.session = sess
            self.x = tf.placeholder(tf.float32, shape=[None, 784], name="x")
            self.y = tf.placeholder(tf.float32, shape=[None, 10], name="y")
            # self.trainable = tf.placeholder(tf.int32, shape=(), name="trainable")
            self.memory_sample_batch = tf.placeholder(
                tf.int16, shape=(), name="memory_sample_batch")

            self.embed = self.embedding(self.x)
            self.M = Memory(args.memory_size,
                            self.embed.get_shape()[-1],
                            self.y.get_shape()[-1])

            embs_and_values = tf.py_func(self.get_memory_sample,
                                         [self.memory_sample_batch],
                                         [tf.float64, tf.float64])
            self.memory_batch_x = tf.to_float(embs_and_values[0])
            self.memory_batch_y = tf.to_float(embs_and_values[1])
            self.xa = tf.concat(values=[self.embed, self.memory_batch_x],
                                axis=0)
            self.ya = tf.concat(values=[self.y, self.memory_batch_y], axis=0)

            self.y_ = self.output_network(self.xa)

            self.cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(labels=self.ya,
                                                        logits=self.y_))
            self.optim = tf.train.GradientDescentOptimizer(
                self.learning_rate).minimize(self.cross_entropy)
            self.correct_prediction = tf.equal(tf.argmax(self.ya, 1),
                                               tf.argmax(self.y_, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(self.correct_prediction, tf.float32))

            self.session.run(tf.global_variables_initializer())

    def train(self, xs, ys, memory_sample_batch):
        embeds, _ = self.session.run(
            [self.embed, self.optim],
            feed_dict={
                self.x: xs,
                self.y: ys,
                self.memory_sample_batch: memory_sample_batch
            })
        return embeds

    def test(self, xs_test, ys_test):
        acc = self.session.run(self.accuracy,
                               feed_dict={
                                   self.x: xs_test,
                                   self.y: ys_test,
                                   self.memory_sample_batch: 0
                               })
        return acc

    def get_memory_sample(self, batch_size):
        x, y = self.M.sample(batch_size)
        return x, y

    def add_to_memory(self, xs, ys):
        if self.args.sample_add == "normal":
            self.M.add(xs, ys)
        elif self.args.sample_add == "lru":
            self.M.add_lru(xs, ys)
        elif self.args.sample_add == "rand":
            self.M.add_rand(xs, ys)
        elif self.args.sample_add == "knn":
            self.M.add_knn(xs, ys)
        elif self.args.sample_add == "knn_lru":
            self.M.add_knn_lru(xs, ys)
        else:
            raise ValueError(
                "Unknown sample_add type, please choose one of "
                "['normal', 'lru', 'rand', 'knn', 'knn_lru']")

    @staticmethod
    def embedding(x):
        out = tf.reshape(x, [-1, 28, 28, 1])
        # convs = [(16, 8, 4), (32, 4, 2)]
        # with tf.variable_scope("conv1"):
        #     out = layers.convolution2d(inputs=out,
        #                                num_outputs=16,
        #                                kernel_size=8,
        #                                stride=4,
        #                                trainable=trainable)
        #     out = tf.nn.relu(out)
        #     out = tf.nn.max_pool(out, ksize=[1, 2, 3, 1],
        #                          strides=[1, 2, 2, 1], padding="SAME")
        with tf.variable_scope("conv2"):
            # out = layers.convolution2d(inputs=out,
            #                            num_outputs=32,
            #                            kernel_size=4,
            #                            stride=2,
            #                            trainable=trainable)
            # out = tf.nn.relu(out)
            # out = tf.nn.max_pool(out, ksize=[1, 2, 3, 1],
            #                      strides=[1, 2, 2, 1], padding="SAME")
            embed = layers.flatten(out)
        return embed

    @staticmethod
    def output_network(embed):
        out = embed
        with tf.variable_scope("fc_1"):
            out = layers.fully_connected(inputs=out, num_outputs=1024)
            out = tf.nn.relu(out)
        with tf.variable_scope("fc_2"):
            out = layers.fully_connected(inputs=out, num_outputs=10)
        return out
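# Usage sketch for MbPA (hedged): `Memory` and the `args` namespace come from
# elsewhere in the project, and the field values below are illustrative only.
#
#   args = argparse.Namespace(model_name="mbpa", learning_rate=0.01,
#                             memory_size=10000, sample_add="normal")
#   sess = tf.Session()
#   model = MbPA(sess, args)
#   embeds = model.train(batch_xs, batch_ys, memory_sample_batch=32)
#   model.add_to_memory(embeds, batch_ys)
#   print(model.test(test_xs, test_ys))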
class MbPA_KNN_Test:
    def __init__(self, sess, args):
        self.args = args
        self.session = sess
        self.w = {}
        self.eval_w = {}
        with tf.variable_scope(self.args.model_name):
            self.x = tf.placeholder(tf.float32, shape=[None, 784], name="x")
            self.y = tf.placeholder(tf.float32, shape=[None, 10], name="y")
            self.memory_sample_batch = tf.placeholder(
                tf.int16, shape=(), name="memory_sample_batch")

            with tf.variable_scope("training"):
                with tf.variable_scope("embedding"):
                    self.out = tf.reshape(self.x, [-1, 28, 28, 1])
                    with tf.variable_scope("conv"):
                        # self.out, self.w["l1_w"], self.w["l1_b"] = conv2d(
                        #     x=self.out,
                        #     output_dim=16,
                        #     kernel_size=[8, 8],
                        #     stride=[4, 4],
                        #     activation_fn=tf.nn.relu,
                        #     name="conv1")
                        # self.out, self.w["l2_w"], self.w["l2_b"] = conv2d(
                        #     x=self.out,
                        #     output_dim=32,
                        #     kernel_size=[4, 4],
                        #     stride=[2, 2],
                        #     activation_fn=tf.nn.relu,
                        #     name="conv2")
                        self.embed = layers.flatten(self.out)
                        # self.embed_dim = self.embed.get_shape()[-1]

                self.M = Memory(self.args.memory_size,
                                self.x.get_shape()[-1],
                                self.y.get_shape()[-1])
                embs_and_values = tf.py_func(self.get_memory_sample,
                                             [self.memory_sample_batch],
                                             [tf.float64, tf.float64])
                self.memory_batch_x = tf.to_float(embs_and_values[0])
                self.memory_batch_y = tf.to_float(embs_and_values[1])
                self.xa = tf.concat(values=[self.x, self.memory_batch_x],
                                    axis=0)
                self.ya = tf.concat(values=[self.y, self.memory_batch_y],
                                    axis=0)

                with tf.variable_scope("fc"):
                    self.out = self.xa
                    # self.out, self.w["l3_w"], self.w["l3_b"] = linear(
                    #     input_=self.out,
                    #     output_size=1024,
                    #     activation_fn=tf.nn.relu,
                    #     name="fc_1")
                    self.out, self.w["l4_w"], self.w["l4_b"] = linear(
                        input_=self.out, output_size=10, name="fc_2")
                    self.ya_ = self.out

                self.cross_entropy = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits(labels=self.ya,
                                                            logits=self.ya_))
                self.optim = tf.train.GradientDescentOptimizer(
                    self.args.learning_rate).minimize(self.cross_entropy)
                self.correct_prediction = tf.equal(tf.argmax(self.ya, 1),
                                                   tf.argmax(self.ya_, 1))
                self.accuracy = tf.reduce_mean(
                    tf.cast(self.correct_prediction, tf.float32))

        self.session.run(tf.global_variables_initializer())

    def update_training_to_prediction(self):
        # Note: `self.eval_w` is never populated and `self.t_w_assign_op` /
        # `self.t_w_input` are never defined, so this loop never executes;
        # as written the method is a dead no-op (its only call site in
        # `test` is commented out).
        for name in self.eval_w.keys():
            self.t_w_assign_op[name].eval(
                {self.t_w_input[name]: self.w[name].eval()})

    def train(self, xs, ys, memory_sample_batch):
        embeds, _ = self.session.run(
            [self.embed, self.optim],
            feed_dict={
                self.x: xs,
                self.y: ys,
                self.memory_sample_batch: memory_sample_batch
            })
        return embeds

    def get_memory_sample(self, batch_size):
        xs, ys = self.M.sample(batch_size)
        return xs, ys

    def add_to_memory(self, xs, ys):
        if self.args.sample_add == "normal":
            self.M.add(xs, ys)
        elif self.args.sample_add == "lru":
            self.M.add_lru(xs, ys)
        elif self.args.sample_add == "rand":
            self.M.add_rand(xs, ys)
        elif self.args.sample_add == "knn":
            self.M.add_knn(xs, ys)
        elif self.args.sample_add == "knn_lru":
            self.M.add_knn_lru(xs, ys)
        else:
            raise ValueError(
                "Unknown sample_add type, please choose one of "
                "['normal', 'lru', 'rand', 'knn', 'knn_lru']")

    def test(self, xs_test, ys_test):
        # self.update_training_to_prediction()
        acc = self.session.run(self.accuracy,
                               feed_dict={
                                   self.x: xs_test,
                                   self.y: ys_test,
                                   self.memory_sample_batch: 0
                               })
        return acc

    @property
    def memory_length(self):
        return self.M.length
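# The episodic `Memory` used by MbPA and MbPA_KNN_Test is defined elsewhere in
# the project. Below is a minimal, hedged sketch of the interface both classes
# rely on (constructor `Memory(size, key_dim, value_dim)`, `sample`, `add`,
# `length`); the LRU/kNN variants (`add_lru`, `add_rand`, `add_knn`,
# `add_knn_lru`) are omitted, and this FIFO ring buffer is illustrative, not
# the project's actual implementation.
class MemorySketch:
    def __init__(self, size: int, key_dim: int, value_dim: int):
        self.size = size
        self.keys = np.zeros((size, key_dim), dtype=np.float64)
        self.values = np.zeros((size, value_dim), dtype=np.float64)
        self.length = 0   # number of filled slots
        self._cursor = 0  # next slot to overwrite

    def add(self, xs: np.ndarray, ys: np.ndarray) -> None:
        # Insert a batch of (embedding, label) pairs, overwriting the oldest
        # entries once the buffer is full.
        for x, y in zip(xs, ys):
            self.keys[self._cursor] = x
            self.values[self._cursor] = y
            self._cursor = (self._cursor + 1) % self.size
            self.length = min(self.length + 1, self.size)

    def sample(self, batch_size: int):
        # Uniformly sample stored pairs; returns empty float64 arrays when
        # asked for 0 items (as in `test`, which feeds memory_sample_batch=0).
        n = int(batch_size)
        if n == 0 or self.length == 0:
            return (np.empty((0, self.keys.shape[1])),
                    np.empty((0, self.values.shape[1])))
        idx = np.random.randint(0, self.length, size=n)
        return self.keys[idx], self.values[idx]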