def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
def __init__(self, task, learning_rate_actor=0.0002, learning_rate_critic=0.00003,
             gamma=0.99, tau=0.01):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high
    self.action_range = self.action_high - self.action_low

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low,
                             self.action_high, learning_rate=learning_rate_actor)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low,
                              self.action_high, learning_rate=learning_rate_actor)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, learning_rate=learning_rate_critic)
    self.critic_target = Critic(self.state_size, self.action_size, learning_rate=learning_rate_critic)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.noise = OUNoise(size=self.action_size, mu=0, theta=0.15, sigma=0.5)

    # Replay memory
    self.batch_size = 64
    self.memory = ReplayBuffer(batch_size=self.batch_size)

    # Algorithm parameters
    self.gamma = gamma  # discount factor
    self.tau = tau      # for soft update of target parameters

    self.last_state = None
    self.reset_episode()
class DDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Critic
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        # Exploration noise
        self.exploration_mu = 0.1
        self.exploration_sigma = 0.1
        self.exploration_theta = 0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Experience
        self.buffer_size = 100000000
        self.batch_size = 64
        self.buffer = ReplayBuffer(self.buffer_size)

        # Parameters
        self.gamma = 0.99
        self.tau = 0.001

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self):
        # Sample
        states, actions, rewards, dones, next_states = self.buffer.sample(
            self.batch_size, self.action_size, self.state_size)

        # Predict
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        # Train Critic
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train Actor
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Update weights
        self.update_target_weights(self.critic_local.model, self.critic_target.model)
        self.update_target_weights(self.actor_local.model, self.actor_target.model)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.buffer.add(self.last_state, action, reward, next_state, done)
        self.learn()
        self.last_state = next_state

    def update_target_weights(self, local_model, target_model):
        target_model.set_weights(
            self.tau * np.array(local_model.get_weights())
            + (1 - self.tau) * np.array(target_model.get_weights()))
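# Usage sketch for the agent above. The `task` object (reset()/step(action) ->
# (next_state, reward, done)) and the episode budget are assumptions for
# illustration only; they are not part of the original class.
def run_ddpg_example(task, num_episodes=500):
    agent = DDPG(task)
    for _ in range(num_episodes):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
    return agent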
class Agent(): def __init__(self, cfg): # Replay memory self.memory = ReplayBuffer(**cfg['agent']['memory']) # Environment configuration self.action_shape = cfg['env']['action_shape'] # Algorithm parameters self.exploration_mu, self.exploration_theta, self.exploration_sigma = cfg['agent']['noise'] self.noise = OUNoise(self.action_shape, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.gamma = cfg['agent']['gamma'] self.tau = cfg['agent']['tau'] state_flatten_shape = [np.prod(self.memory.flatten_state_shape)] # Actor Model self.actor = Actor(state_flatten_shape, self.action_shape, cfg['env']['action_range'], self.tau, self.memory.batch_size, cfg['actor']) # Critic Model self.critic = Critic(state_flatten_shape, self.action_shape, self.tau, cfg['critic']) # Flag & Counter self.add_noise = True self.episode = 0 self.max_episode_explore = 100 def init_actor_critic(self): # Initialize target model self.critic.copy_local_in_target() self.actor.copy_local_in_target() def reset(self): self.memory.reset_past() self.noise = OUNoise(self.action_shape, self.exploration_mu, self.exploration_theta, self.exploration_sigma) def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) if done: self.reset() def act(self, state): self.last_state = state window_states = self.memory.get_state_vector(state).reshape(1, -1) action = self.actor.predict(window_states) if self.add_noise and self.episode < self.max_episode_explore: p = self.episode / self.max_episode_explore action = np.clip(action*p + (1-p)*self.noise.sample(), a_max=1, a_min=-1) return action def learn(self): if self.memory.is_sufficient(): experiences = self.memory.sample() states = experiences['state'][:, 0].reshape(self.memory.batch_size, -1) actions = experiences['action'] rewards = experiences['reward'] dones = experiences['done'] next_states = experiences['next_state'][:, 0].reshape(self.memory.batch_size, -1) # get predicted next state action and Q values from target models actions_next = self.actor.get_targets(next_states) Q_targets_next = self.critic.get_targets(next_states, actions_next) # Compute Q targets for current states and train critic model Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) critic_summaries = self.critic.fit(states, actions, Q_targets) # Train actor model action_gradients = self.critic.get_actions_grad(states, actions)[0] actor_summaries = self.actor.fit(states, action_gradients) # Soft-update target models self.critic.soft_update() self.actor.soft_update() summary_reward = summary('sample_rewards', rewards) return critic_summaries, actor_summaries, summary_reward
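# Illustrative configuration for the cfg-driven Agent above. Each key mirrors a
# lookup actually performed in __init__; the concrete values and the contents of
# cfg['actor'] / cfg['critic'] are assumptions, since their schemas live in the
# Actor/Critic/ReplayBuffer classes, which are not shown in this file.
example_cfg = {
    'env': {
        'action_shape': (4,),          # assumed 4-dimensional action
        'action_range': (-1.0, 1.0),   # assumed action bounds
    },
    'agent': {
        'memory': {},                  # kwargs forwarded to ReplayBuffer(**...)
        'noise': (0.0, 0.15, 0.2),     # (mu, theta, sigma)
        'gamma': 0.99,
        'tau': 0.01,
    },
    'actor': {},                       # Actor-specific hyperparameters
    'critic': {},                      # Critic-specific hyperparameters
}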
class MyAgent:
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        self.gamma = 0.99
        self.tau = 0.001

    def act(self, state):
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def step(self, action, reward, next_state, done):
        self.memory.add(self.last_state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last_state = next_state

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights)
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
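# The agents in this file assume an OUNoise class with reset() and sample().
# Below is a minimal sketch of a standard Ornstein-Uhlenbeck process exposing
# that interface; the project's actual implementation may differ in detail.
import numpy as np

class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at its long-running mean.
        self.state = np.copy(self.mu)

    def sample(self):
        # Drift toward mu plus Gaussian diffusion.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state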
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task, verbose=False):
        self.verbose = verbose
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        #log_path = '/tmp/logs'
        #self.callback = callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
        #                                      write_images=False, write_grads=True, write_graph=False)
        #self.callback.set_model(self.critic_local.model)
        #log_path = '/tmp/logs'
        #self.writer = tf.summary.FileWriter(log_path)
        #self.learn_counter = 0

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.1
        self.exploration_theta = 0.2
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 512
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.015   # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        #self.learn_counter = 0
        return state

    def mimic(self, experience_to_mimic):
        print("ready to mimic")
        self.memory.memory = experience_to_mimic

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        def save_grads(writer, model):
            for layer in model.layers:
                for weight in layer.weights:
                    mapped_weight_name = weight.name.replace(':', '_')
                    tf.summary.histogram(mapped_weight_name, weight)
                    grads = model.optimizer.get_gradients(model.total_loss, weight)

                    def is_indexed_slices(grad):
                        return type(grad).__name__ == 'IndexedSlices'

                    grads = [grad.values if is_indexed_slices(grad) else grad for grad in grads]
                    tf.summary.histogram('{}_grad'.format(mapped_weight_name), grads)
            merged = tf.summary.merge_all()
            writer.flush()
            writer.close()
        #save_grads(self.writer, self.critic_local.model)

        #def write_log(callback, names, logs, batch_no):
        #    for name, value in zip(names, logs):
        #        summary = tf.Summary()
        #        summary_value = summary.value.add()
        #        summary_value.simple_value = value
        #        summary_value.tag = name
        #        callback.writer.add_summary(summary, batch_no)
        #        callback.writer.flush()
        #train_names = ['train_loss', 'train_mae']
        #print("about to write log")
        #write_log(self.callback, train_names, logs, self.learn_counter)
        #trainable_weights = critic_local.model.trainable_weights
        #gradients = critic_local.model.optimizer.get_gradients(critic_local.model.total_loss, trainable_weights)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)
        #self.learn_counter += 1

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def _save_weight(self, model, directory_name, file_name):
        cwd = os.getcwd()
        directory_path = os.path.join(cwd, directory_name)
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)
        file_path = os.path.join(directory_path, file_name)
        mv_file_to_dir_with_date(file_path, directory_path)
        model.save_weights(file_path)

    def save_weights(self, location='weights_backup'):
        if self.verbose:
            print("start save_weights")
        self._save_weight(self.critic_local.model, location, "critic_local.h5")
        self._save_weight(self.critic_target.model, location, "critic_target.h5")
        self._save_weight(self.actor_local.model, location, "actor_local.h5")
        self._save_weight(self.actor_target.model, location, "actor_target.h5")
        if self.verbose:
            print("done save_weights")

    def _h5(self, model, file_path):
        if os.path.exists(file_path):
            model.load_weights(file_path)
        else:
            print(f'could not find weight to load from [{file_path}]')

    def load_weights(self, location='weights_backup'):
        if self.verbose:
            print("start load_weights")
        cwd = os.getcwd()
        directory_path = os.path.join(cwd, location)
        self._h5(self.critic_local.model, os.path.join(directory_path, "critic_local.h5"))
        self._h5(self.critic_target.model, os.path.join(directory_path, "critic_target.h5"))
        self._h5(self.actor_local.model, os.path.join(directory_path, "actor_local.h5"))
        self._h5(self.actor_target.model, os.path.join(directory_path, "actor_target.h5"))
        if self.verbose:
            print("done load_weights")
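# Most agents here assume a ReplayBuffer with add(), sample() and __len__(),
# whose sampled experiences expose .state, .action, .reward, .next_state and
# .done. A minimal sketch with that interface follows; the original buffer
# implementation is not included in this file and may differ.
import random
from collections import deque, namedtuple

class ReplayBufferSketch:
    """Fixed-size circular buffer of experience tuples."""

    Experience = namedtuple("Experience",
                            ["state", "action", "reward", "next_state", "done"])

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        # Uniformly sample a batch of experiences.
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)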
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task, sess):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Algorithm parameters
        self.gamma = 0.9   # discount factor
        self.tau = 2e-3    # for soft update of target parameters
        self.actor_lr = 2e-3
        self.critic_lr = 2e-3
        # END

        self.reward_variance = RunningVariance(1)
        self.q_values_variance = RunningVariance(1)

        # Actor (Policy) Model
        self.actor_local = Actor(sess, self.state_size, self.action_size,
                                 self.action_low, self.action_high, self.actor_lr)
        self.actor_target = Actor(sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high, self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.8
        self.exploration_sigma = 0.05 * self.action_range
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 100
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # initialize
        sess.run(tf.global_variables_initializer())

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.last_reward = self.task.get_reward()
        self.reward_variance.update(self.last_reward)
        return state

    def step(self, action, reward, next_state, done):
        self.reward_variance.update(reward)

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done, self.last_reward)

        # Learn, if enough samples are available in memory
        experiences = self.memory.sample()
        self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state
        self.last_reward = reward

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_target.model.predict(state)[0]
        noise = self.noise.sample()
        action += noise
        return list(action)  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([self.reward_variance.normalize(e.reward)
                            for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        last_rewards = np.array([self.reward_variance.normalize(e.prev_reward)
                                 for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])
        Q_targets_prev = self.critic_target.model.predict_on_batch([states, actions])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) - last_rewards
        for q in Q_targets:
            self.q_values_variance.update(q)
        Q_targets = np.array([self.q_values_variance.normalize(q) for q in Q_targets])
        #print("\n", Q_targets, "\n")
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        #action_gradients *= Q_targets
        #action_gradients /= self.batch_size
        #self.actor_local.train_fn([states, action_gradients, 1])  # custom training function
        self.actor_local.train(states, action_gradients)

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
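# The variant above normalizes rewards and Q targets with a RunningVariance(1)
# helper exposing update() and normalize(). That class is not part of this file,
# so the sketch below (Welford's online mean/variance, normalize = scale by the
# running standard deviation) is an assumption about its behavior only.
import numpy as np

class RunningVarianceSketch:
    """Tracks a running mean/variance and scales new values by the running std."""

    def __init__(self, size):
        self.n = 0
        self.mean = np.zeros(size)
        self.m2 = np.zeros(size)

    def update(self, x):
        # Welford's online update of mean and sum of squared deviations.
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def normalize(self, x):
        # Divide by the running standard deviation; the agent above handles its
        # own baseline subtraction, so the mean is not removed here.
        std = np.sqrt(self.m2 / self.n) if self.n > 1 else np.ones_like(self.mean)
        return x / (std + 1e-8)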
class DDPG():
    """
    Reinforcement Learning agent that learns using DDPG.
    Deep DPG as described by Lillicrap et al. (2015)
    """

    def __init__(self, task, prioritized_replay=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  #0.15 #0.1
        self.exploration_sigma = 0.2   #0.2 #0.1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        self.buffer_size = 100000
        self.batch_size = 64  # 64
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_alpha = 0.6
        self.prioritized_replay_beta0 = 0.4
        self.prioritized_replay_beta_iters = None
        self.prioritized_replay_eps = 1e-6
        self.max_timesteps = 100000

        # Replay buffer
        if self.prioritized_replay:
            self.memory = PrioritizedReplayBuffer(self.buffer_size,
                                                  alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                self.prioritized_replay_beta_iters = self.max_timesteps
            self.beta_schedule = LinearSchedule(self.prioritized_replay_beta_iters,
                                                initial_p=self.prioritized_replay_beta0,
                                                final_p=1.0)
        else:
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters
        #self.tau = 0.001  # 0.001 per paper

        self.td_errors_list = []
        self.actor_loss_list = []
        self.critic_loss_list = []

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            if self.prioritized_replay:
                samples = self.memory.sample(self.batch_size,
                                             beta=self.beta_schedule.value(len(self.memory)))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = samples
                experiences = []
                for i in range(len(obses_t)):
                    experiences.append(
                        namedtuple("PrioritizedExperience",
                                   field_names=["state", "action", "reward", "next_state",
                                                "done", "weight", "batch_idx"])(
                            obses_t[i:i + 1], actions[i:i + 1], rewards[i:i + 1],
                            obses_tp1[i:i + 1], dones[i:i + 1], weights[i:i + 1],
                            batch_idxes[i:i + 1]))
                self.learn(experiences)
            else:
                experiences = self.memory.sample()
                self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        #actions = list(action + self.noise.sample())
        #print("act {}".format(actions))
        #return actions
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        critic_loss = self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local) using action gradients
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        if self.prioritized_replay:
            # Update replay buffer priorities
            batch_idxes = np.vstack([e.batch_idx[0] for e in experiences if e is not None])
            new_priorities = np.abs(Q_targets) + self.prioritized_replay_eps
            self.memory.update_priorities(batch_idxes, new_priorities)

        self.td_errors_list.append(Q_targets.T)
        self.actor_loss_list.append(actor_loss[0])
        self.critic_loss_list.append(critic_loss)
        #print("states {} next states {} critic_loss {} actor_loss {}".format(states, actions_next, critic_loss, actor_loss))

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)

    def save_weights(self):
        self.actor_local.model.save_weights("DDPG_actor_weights.h5")
        self.critic_local.model.save_weights("DDPG_critic_weights.h5")

    def save_td_errors(self, i_episode):
        with open("DDPG_agent_td_errors_episode_{}.csv".format(i_episode), 'w') as csvfile:
            writer = csv.writer(csvfile)
            for td_errors in self.td_errors_list:
                writer.writerow([td_errors])
        self.td_errors_list.clear()

    def save_losses(self, i_episode):
        with open("DDPG_agent_actor_critic_loss_episode_{}.csv".format(i_episode), 'w') as csvfile:
            writer = csv.writer(csvfile)
            for actor_loss, critic_loss in zip(self.actor_loss_list, self.critic_loss_list):
                writer.writerow([actor_loss, critic_loss])
        self.actor_loss_list.clear()
        self.critic_loss_list.clear()
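# The prioritized-replay variant above relies on a LinearSchedule with a value(t)
# method to anneal beta from beta0 toward 1.0. The sketch below mirrors the
# OpenAI Baselines schedule as an assumption; the original class is not in this file.
class LinearScheduleSketch:
    """Linearly interpolates from initial_p to final_p over schedule_timesteps."""

    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule completed, clipped to [0, 1].
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)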
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor
        self.tau = 0.002   # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.0005  # for soft update of target parameters
        self.noise_scale = 0.1

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.total_reward += reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        #print(len(self.memory), self.batch_size)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
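# Every agent here calls critic_local.get_action_gradients([states, actions, 0])
# and actor_local.train_fn([states, action_gradients, 1]); the trailing 0/1 is the
# Keras learning-phase flag. The Actor/Critic classes are not shown in this file,
# so the sketch below is an assumption about how these two backend functions are
# typically wired in TF 1.x-style Keras, not the original implementation.
from keras import backend as K
from keras import optimizers

def build_actor_train_fn(actor_model, action_size, learning_rate=1e-4):
    # Placeholder for dQ/da supplied by the critic at training time.
    action_gradients = K.placeholder(shape=(None, action_size))
    # Gradient ascent on Q, expressed as minimizing mean(-dQ/da * a).
    loss = K.mean(-action_gradients * actor_model.output)
    optimizer = optimizers.Adam(lr=learning_rate)
    updates_op = optimizer.get_updates(params=actor_model.trainable_weights, loss=loss)
    return K.function(
        inputs=[actor_model.input, action_gradients, K.learning_phase()],
        outputs=[loss],
        updates=updates_op)

def build_action_gradients_fn(critic_model):
    # dQ/d(actions), where the critic takes [states, actions] as its two inputs.
    grads = K.gradients(critic_model.output, critic_model.input[1])
    return K.function(
        inputs=[*critic_model.input, K.learning_phase()],
        outputs=grads)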
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Noise process
        self.max_unsuccessful_episodes_in_a_row = 10
        self.exploration_mu = 0
        # theta units are proportional to the action range, so I set it to 1% instead of the original 15%
        self.exploration_theta = 0.01
        self.exploration_sigma = 0.05

        # Replay memory
        self.buffer_size = 10000
        self.batch_size = 64

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.1     # for soft update of target parameters

        self.best_learning = -np.inf
        self.reset_learning()

    def __evaluate(self):
        state = self.task.reset()
        score = 0.
        count = 0
        done = False
        while not done:
            action = self.actor_local.act(state)
            state, reward, done = self.task.step(action)
            score += reward
            count += 1
        score *= self.task.action_repeat * self.task.sim.dt / self.task.sim.runtime
        self.score = score
        self.count = count
        if self.score > self.best_score:
            self.__save_best()
            return True
        return False

    def __save_best(self):
        self.__best_actor_local = self.actor_local.get_weights()
        self.__best_actor_target = self.actor_target.get_weights()
        self.__best_critic_local = self.critic_local.get_weights()
        self.__best_critic_target = self.critic_target.get_weights()
        self.count_unsuccessful_in_a_row = 0
        self.best_score = self.score
        self.best_score_count = self.count
        if self.best_score > self.best_learning:
            # save best learning
            self.__best_learning_actor_local = np.copy(self.__best_actor_local)
            self.__best_learning_actor_target = np.copy(self.__best_actor_target)
            self.__best_learning_critic_local = np.copy(self.__best_critic_local)
            self.__best_learning_critic_target = np.copy(self.__best_critic_target)
            self.best_learning = self.best_score
            self.best_learning_count = self.best_score_count

    def restore_best(self):
        self.actor_local.set_weights(self.__best_actor_local)
        self.actor_target.set_weights(self.__best_actor_target)
        self.critic_local.set_weights(self.__best_critic_local)
        self.critic_target.set_weights(self.__best_critic_target)
        self.count_unsuccessful_in_a_row = 0
        self.score = self.best_score
        self.count = self.best_score_count

    def restore_learning(self):
        self.actor_local.set_weights(self.__best_learning_actor_local)
        self.actor_target.set_weights(self.__best_learning_actor_target)
        self.critic_local.set_weights(self.__best_learning_critic_local)
        self.critic_target.set_weights(self.__best_learning_critic_target)
        self.count_unsuccessful_in_a_row = 0
        self.score = self.best_learning
        self.count = self.best_learning_count

    def __update_noise(self):
        if self.__evaluate():
            # improvement in score --> less exploration
            self.noise_scale = self.noise.multiply(0.5)
            self.count_unsuccessful_in_a_row = 0
            return
        self.count_unsuccessful_in_a_row += 1
        if self.count_unsuccessful_in_a_row < self.max_unsuccessful_episodes_in_a_row:
            return
        curr_noise_scale = self.noise_scale
        # increasing the action noise, more exploration
        self.noise_scale = self.noise.multiply(1.5)
        if (self.noise_scale > 1.0) and (self.noise_scale < curr_noise_scale + 1e-6):
            # could not increase action noise
            self.reset_learning()
        else:
            self.restore_best()

    def reset_learning(self):
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.set_weights(self.critic_local.get_weights())
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.memory = GoodBadReplayBuffer(self.buffer_size, self.batch_size)
        self.noise = OUNoise(self.action_size, mu=self.exploration_mu,
                             theta=self.exploration_theta, sigma=self.exploration_sigma)
        self.noise_scale = self.noise.calc_scale()
        self.best_score = -np.inf
        self.__evaluate()
        return self.reset_episode()

    def reset_episode(self, use_noise=True):
        self.noise.reset()
        self.use_noise = use_noise
        self.last_state = self.task.reset()
        self.__last_state_reward = self.task.curr_score
        return self.last_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        action = self.actor_local.act(state)
        if self.use_noise:
            ret = list(action + self.noise.sample())  # add some noise for exploration
        else:
            ret = list(action)
        return np.clip(ret, self.task.action_low, self.task.action_high).tolist()

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        if self.use_noise:
            diff_reward = reward - self.__last_state_reward
            self.memory.add(self.last_state, action, diff_reward, reward, next_state, done)
            if self.memory.has_sample():
                # single learn at each step
                self.learn(self.memory.sample())
            if done:
                self.__update_noise()

        # Roll over last state and action
        self.last_state = next_state
        self.__last_state_reward = reward

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.task.action_size)
        rewards = np.array([e.step_reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.task.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)