def train(self,
          n_steps: int = None,
          n_episodes: int = None,
          save_every: int = None,
          save_path: str = None,
          callback: callable = None,
          **kwargs):
    batch_size: int = kwargs.get('batch_size', 128)
    discount_factor: float = kwargs.get('discount_factor', 0.9999)
    learning_rate: float = kwargs.get('learning_rate', 0.0001)
    eps_start: float = kwargs.get('eps_start', 0.9)
    eps_end: float = kwargs.get('eps_end', 0.05)
    eps_decay_steps: int = kwargs.get('eps_decay_steps', 200)
    entropy_c: float = kwargs.get('entropy_c', 0.0001)
    memory_capacity: int = kwargs.get('memory_capacity', 1000)

    memory = ReplayMemory(memory_capacity, transition_type=A2CTransition)
    episode = 0
    steps_done = 0
    stop_training = False

    if n_steps and not n_episodes:
        n_episodes = np.iinfo(np.int32).max

    while episode < n_episodes and not stop_training:
        self.episode_id = str(uuid.uuid4())
        state = self.env.reset()
        done = False

        print('==== EPISODE ID: {} ===='.format(self.episode_id))

        while not done:
            # Epsilon-greedy exploration threshold, annealed exponentially.
            threshold = eps_end + (eps_start - eps_end) * np.exp(
                -steps_done / eps_decay_steps)
            action = self.get_action(state, threshold=threshold)
            next_state, reward, done, _ = self.env.step(action)

            # Store the critic's value estimate alongside the transition so
            # advantages can be computed later during the gradient step.
            value = self.critic_network(state[None, :], training=False)
            value = tf.squeeze(value, axis=1)

            memory.push(state, action, reward, done, value)

            state = next_state
            steps_done += 1

            # Wait until enough transitions are collected before training.
            if len(memory) < batch_size:
                continue

            self._apply_gradient_descent(memory, batch_size, learning_rate,
                                         discount_factor, entropy_c)

            if n_steps and steps_done >= n_steps:
                done = True
                stop_training = True

        is_checkpoint = save_every and episode % save_every == 0

        if save_path and (is_checkpoint or episode == n_episodes - 1):
            self.save(save_path, episode=episode)

        episode += 1
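# A minimal sketch of the ReplayMemory interface the trainers in this module
# assume (push / sample / tail / __len__, with a configurable transition
# type). The field names mirror how the A2C loop above calls `memory.push`;
# the actual implementation may differ.
import random
from collections import deque, namedtuple

A2CTransition = namedtuple('A2CTransition',
                           ['state', 'action', 'reward', 'done', 'value'])


class ReplayMemory:

    def __init__(self, capacity: int, transition_type=A2CTransition):
        self.capacity = capacity
        self.transition_type = transition_type
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        self.memory.append(self.transition_type(*args))

    def sample(self, batch_size: int):
        # Uniform random minibatch (used by the off-policy DQN updates).
        return random.sample(self.memory, batch_size)

    def tail(self, batch_size: int):
        # The most recent transitions (used by the on-policy A2C update).
        return list(self.memory)[-batch_size:]

    def __len__(self):
        return len(self.memory)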
def _apply_gradient_descent(self, memory: ReplayMemory, batch_size: int,
                            learning_rate: float, discount_factor: float):
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.Huber()

    transitions = memory.sample(batch_size)
    batch = DQNTransition(*zip(*transitions))

    state_batch = tf.convert_to_tensor(batch.state)
    action_batch = tf.convert_to_tensor(batch.action)
    reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
    next_state_batch = tf.convert_to_tensor(batch.next_state)
    done_batch = tf.convert_to_tensor(batch.done)

    with tf.GradientTape() as tape:
        # Q(s, a) for the actions actually taken, selected via a one-hot mask.
        state_action_values = tf.math.reduce_sum(
            self.policy_network(state_batch) *
            tf.one_hot(action_batch, self.n_actions),
            axis=1)

        # max_a' Q_target(s', a'), zeroed out for terminal transitions.
        next_state_values = tf.where(
            done_batch, tf.zeros(batch_size),
            tf.math.reduce_max(self.target_network(next_state_batch), axis=1))

        # Bellman target: r + gamma * max_a' Q_target(s', a').
        expected_state_action_values = reward_batch + \
            (discount_factor * next_state_values)
        loss_value = loss(expected_state_action_values, state_action_values)

    variables = self.policy_network.trainable_variables
    gradients = tape.gradient(loss_value, variables)
    optimizer.apply_gradients(zip(gradients, variables))
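# A minimal sketch of the epsilon-greedy `get_action` that every training
# loop in this module calls. `threshold` is the exploration probability
# produced by the decay schedule in `train`; the real agent may reshape the
# state differently before the forward pass, so treat this as illustrative.
def get_action(self, state, threshold: float = 0.0) -> int:
    if np.random.rand() < threshold:
        # Explore: uniform random action.
        return np.random.randint(self.n_actions)
    # Exploit: greedy action from the policy network's Q-values.
    q_values = self.policy_network(state[None, :], training=False)
    return int(tf.argmax(q_values[0]))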
def _apply_gradient_descent(
    self,
    memory: ReplayMemory,
    batch_size: int,
    learning_rate: float,
    discount_factor: float,
    entropy_c: float,
):
    huber_loss = tf.keras.losses.Huber()
    wsce_loss = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # Train on the most recent transitions, since A2C is on-policy.
    transitions = memory.tail(batch_size)
    batch = A2CTransition(*zip(*transitions))

    states = tf.convert_to_tensor(batch.state)
    actions = tf.convert_to_tensor(batch.action)
    rewards = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
    dones = tf.convert_to_tensor(batch.done)
    values = tf.convert_to_tensor(batch.value)

    # Discounted returns, computed backwards; a terminal step resets the
    # running sum so rewards do not leak across episode boundaries.
    returns = []
    exp_weighted_return = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        exp_weighted_return = reward + discount_factor * exp_weighted_return * (
            1 - int(done))
        returns += [exp_weighted_return]
    returns = returns[::-1]

    # Critic update: regress V(s) towards the discounted returns.
    with tf.GradientTape() as tape:
        state_values = self.critic_network(states)
        critic_loss_value = huber_loss(returns, state_values)

    gradients = tape.gradient(critic_loss_value,
                              self.critic_network.trainable_variables)
    optimizer.apply_gradients(
        zip(gradients, self.critic_network.trainable_variables))

    # Actor update: advantage-weighted policy gradient plus an entropy bonus.
    with tf.GradientTape() as tape:
        returns = tf.reshape(returns, [batch_size, 1])
        advantages = returns - values

        actions = tf.cast(actions, tf.int32)
        logits = self.actor_network(states)

        policy_loss_value = wsce_loss(actions,
                                      logits,
                                      sample_weight=advantages)

        probs = tf.nn.softmax(logits)
        # The cross-entropy of a distribution with itself is its entropy;
        # averaged over the batch to keep the loss a scalar.
        entropy_loss_value = tf.reduce_mean(
            tf.keras.losses.categorical_crossentropy(probs, probs))

        policy_total_loss_value = policy_loss_value - \
            entropy_c * entropy_loss_value

    gradients = tape.gradient(policy_total_loss_value,
                              self.actor_network.trainable_variables)
    optimizer.apply_gradients(
        zip(gradients, self.actor_network.trainable_variables))
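# The backwards return computation from `_apply_gradient_descent` above,
# factored into a standalone helper for clarity. This is a sketch, not part
# of the original module; `dones` resets the running sum at episode ends.
def discounted_returns(rewards, dones, discount_factor: float):
    returns = []
    running = 0.0
    for reward, done in zip(reversed(rewards), reversed(dones)):
        running = reward + discount_factor * running * (1 - int(done))
        returns.append(running)
    return returns[::-1]


# Example: a terminal flag stops rewards from leaking across episodes.
# discounted_returns([1, 1, 1], [False, True, False], 0.9) == [1.9, 1.0, 1.0]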
def run(self):
    memory = ReplayMemory(self.memory_capacity, transition_type=DQNTransition)
    optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
    loss_fn = tf.keras.losses.Huber()

    # Keep training until every environment process has reported done.
    while self.done_queue.qsize() < self.n_envs:
        # Drain the shared queue of transitions gathered by the workers.
        while self.memory_queue.qsize() > 0:
            sample = self.memory_queue.get()
            memory.push(*sample)

        if len(memory) < self.batch_size:
            continue

        transitions = memory.sample(self.batch_size)
        batch = DQNTransition(*zip(*transitions))

        state_batch = tf.convert_to_tensor(batch.state)
        action_batch = tf.convert_to_tensor(batch.action)
        reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(batch.next_state)
        done_batch = tf.convert_to_tensor(batch.done)

        with tf.GradientTape() as tape:
            # Q(s, a) for the actions the workers actually took.
            state_action_values = tf.math.reduce_sum(
                self.model.policy_network(state_batch) *
                tf.one_hot(action_batch, self.model.n_actions),
                axis=1)

            # max_a' Q_target(s', a'), zeroed out for terminal transitions.
            next_state_values = tf.where(
                done_batch, tf.zeros(self.batch_size),
                tf.math.reduce_max(
                    self.model.target_network(next_state_batch), axis=1))

            expected_state_action_values = reward_batch + \
                (self.discount_factor * next_state_values)
            loss_value = loss_fn(expected_state_action_values,
                                 state_action_values)

        variables = self.model.policy_network.trainable_variables
        gradients = tape.gradient(loss_value, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        # Publish the updated model for the workers to pick up.
        self.model_update_queue.put(self.model)
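# A sketch of the worker-side loop the parameter server above is paired
# with: each environment process pushes DQN transitions into `memory_queue`
# and signals `done_queue` when it finishes. The names `env`, `agent`, and
# the queue wiring here are assumptions for illustration, not the original
# worker API.
def worker(env, agent, memory_queue, done_queue, n_steps: int):
    state = env.reset()
    for _ in range(n_steps):
        action = agent.get_action(state, threshold=0.1)
        next_state, reward, done, _ = env.step(action)
        # Same field order as DQNTransition / memory.push(*sample).
        memory_queue.put((state, action, reward, next_state, done))
        state = env.reset() if done else next_state
    done_queue.put(1)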
def train(self,
          n_steps: int = None,
          n_episodes: int = None,
          save_every: int = None,
          save_path: str = None,
          callback: callable = None,
          **kwargs) -> float:
    batch_size: int = kwargs.get('batch_size', 128)
    discount_factor: float = kwargs.get('discount_factor', 0.9999)
    learning_rate: float = kwargs.get('learning_rate', 0.0001)
    eps_start: float = kwargs.get('eps_start', 0.9)
    eps_end: float = kwargs.get('eps_end', 0.05)
    eps_decay_steps: int = kwargs.get('eps_decay_steps', 200)
    update_target_every: int = kwargs.get('update_target_every', 1000)
    memory_capacity: int = kwargs.get('memory_capacity', 1000)
    render_interval: int = kwargs.get(
        'render_interval', 50)  # in steps, None for episode-end render only

    memory = ReplayMemory(memory_capacity, transition_type=DQNTransition)
    episode = 0
    total_steps_done = 0
    total_reward = 0
    stop_training = False

    if n_steps and not n_episodes:
        n_episodes = np.iinfo(np.int32).max

    print('==== AGENT ID: {} ===='.format(self.id))

    while episode < n_episodes and not stop_training:
        state = self.env.reset()
        done = False
        steps_done = 0

        while not done:
            # Epsilon-greedy threshold, annealed over the total step count.
            threshold = eps_end + (eps_start - eps_end) * np.exp(
                -total_steps_done / eps_decay_steps)
            action = self.get_action(state, threshold=threshold)
            next_state, reward, done, _ = self.env.step(action)

            memory.push(state, action, reward, next_state, done)

            state = next_state
            total_reward += reward
            steps_done += 1
            total_steps_done += 1

            if len(memory) < batch_size:
                continue

            self._apply_gradient_descent(memory, batch_size, learning_rate,
                                         discount_factor)

            if n_steps and steps_done >= n_steps:
                done = True

            if render_interval is not None and steps_done % render_interval == 0:
                self.env.render(episode=episode,
                                max_episodes=n_episodes,
                                max_steps=n_steps)

            # Periodically sync the frozen target network with the policy
            # network. clone_model copies only the architecture, so the
            # weights are copied over explicitly.
            if steps_done % update_target_every == 0:
                self.target_network = tf.keras.models.clone_model(
                    self.policy_network)
                self.target_network.set_weights(
                    self.policy_network.get_weights())
                self.target_network.trainable = False

        is_checkpoint = save_every and episode % save_every == 0

        if save_path and (is_checkpoint or episode == n_episodes - 1):
            self.save(save_path, episode=episode)

        if not render_interval or steps_done < n_steps:
            self.env.render(
                episode=episode, max_episodes=n_episodes, max_steps=n_steps
            )  # render the final state at episode end if not rendered earlier

        self.env.save()

        episode += 1

    mean_reward = total_reward / steps_done

    return mean_reward
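# The exploration schedule shared by the training loops in this module:
# exponential decay from `eps_start` to `eps_end`. A quick standalone check
# of its shape with the defaults used above:
def eps_threshold(steps_done,
                  eps_start=0.9,
                  eps_end=0.05,
                  eps_decay_steps=200):
    return eps_end + (eps_start - eps_end) * np.exp(
        -steps_done / eps_decay_steps)


# eps_threshold(0)    -> 0.9    (fully exploratory at the start)
# eps_threshold(200)  -> ~0.363 (one decay constant in)
# eps_threshold(2000) -> ~0.05  (effectively greedy)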
def train(self,
          n_steps: int = None,
          n_episodes: int = None,
          save_every: int = None,
          save_path: str = None,
          callback: callable = None,
          **kwargs) -> float:
    batch_size: int = kwargs.get('batch_size', 128)
    discount_factor: float = kwargs.get('discount_factor', 0.9999)
    learning_rate: float = kwargs.get('learning_rate', 0.0001)
    eps_start: float = kwargs.get('eps_start', 0.9)
    eps_end: float = kwargs.get('eps_end', 0.05)
    eps_decay_steps: int = kwargs.get('eps_decay_steps', 400)
    entropy_c: float = kwargs.get('entropy_c', 0.0001)
    memory_capacity: int = kwargs.get('memory_capacity', 1000)
    train_end: float = kwargs.get(
        'train_end', 0.3)  # end an episode if net worth falls below this fraction of the initial net worth

    memory = ReplayMemory(memory_capacity,
                          transition_type=A2C_LSTM_Transition)

    episode = 0
    steps_done = 0
    total_reward = 0
    stop_training = False

    if n_steps and not n_episodes:
        n_episodes = np.iinfo(np.int32).max

    print('==== AGENT ID: {} ===='.format(self.id))

    while episode < n_episodes and not stop_training:
        # Only reset if this is the first episode or the feed is exhausted;
        # otherwise continue through the remaining data.
        if not episode or not self.env.feed.has_next():
            state = self.env.reset()
            steps_done = 0

            if episode:
                # self.LSTM.reset_states()
                memory = ReplayMemory(memory_capacity,
                                      transition_type=A2C_LSTM_Transition)
                self.env.portfolio.reset()

        done = False

        print(self.env.portfolio.balances)
        print('==== TRAIN EPISODE ID ({}/{}): {} ===='.format(
            episode + 1, n_episodes, self.env.episode_id))

        while not done:
            if steps_done % 24 == 0:  # log once per day of hourly steps
                print("step {}/{}".format(steps_done, n_steps))
                print(self.env.portfolio.balances)
                print(self.env.portfolio.net_worth)
                print('exchange:', state[-1][0])

            if not self.env.feed.has_next():
                done = True
                continue

            threshold = eps_end + (eps_start - eps_end) * np.exp(
                -steps_done / eps_decay_steps)
            action = self.get_action(state, threshold=threshold)
            next_state, reward, done, _ = self.env.step(action)

            # The LSTM critic expects a [batch, time, features] input.
            value = self.critic_network(state[None, None, :], training=False)
            value = tf.squeeze(value, axis=-1)

            memory.push(state, action, reward, done, value)

            state = next_state
            total_reward += reward
            steps_done += 1

            # End the episode early if too much of the portfolio is lost.
            if self.env.portfolio.net_worth < \
                    self.env.portfolio.initial_net_worth * train_end:
                done = True
                continue

            if len(memory) < batch_size:
                continue

            # Train on every step once the memory is warm.
            # (Alternative cadence: if steps_done % batch_size == 0.)
            self._apply_gradient_descent(memory, batch_size, learning_rate,
                                         discount_factor, entropy_c)

            if n_steps and steps_done >= n_steps:
                done = True

        # VALIDATION
        test_state = self.test_env.reset()
        self.LSTM.reset_states()
        done = False
        steps_done = 0
        threshold = 0.1  # small fixed exploration during validation

        print(self.env.portfolio.balances)
        print('==== TEST EPISODE ID ({}/{}): {} ===='.format(
            episode + 1, n_episodes, self.test_env.episode_id))

        while not done:
            if steps_done % 24 == 0:  # log once per day of hourly steps
                print("step {}/{}".format(steps_done, n_steps))
                print(self.test_env.portfolio.balances)
                print(self.test_env.portfolio.net_worth)
                print('exchange:', test_state[-1][0])

            if not self.test_env.feed.has_next():
                done = True
                continue

            action = self.get_action(test_state, threshold=threshold)
            next_state, reward, done, _ = self.test_env.step(action)

            value = self.critic_network(test_state[None, None, :],
                                        training=False)
            value = tf.squeeze(value, axis=-1)

            test_state = next_state
            steps_done += 1

            if self.test_env.portfolio.net_worth < \
                    self.test_env.portfolio.initial_net_worth * train_end:
                done = True
                continue

        portfolio_perf = self.test_env.portfolio.performance.values
        np.savetxt(save_path + '/test{}.csv'.format(episode + 1),
                   portfolio_perf,
                   delimiter=',',
                   fmt='%s')

        self.LSTM.reset_states()

        is_checkpoint = save_every and episode % save_every == 0

        if save_path and (is_checkpoint or episode == n_episodes - 1):
            self.save(save_path, episode=episode)

        episode += 1

    mean_reward = total_reward / steps_done

    return mean_reward
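# Shape handling for the stateful LSTM critic above: a single observation is
# expanded to [batch=1, time=1, features] before the forward pass, since
# recurrent layers expect a 3-D input. A quick standalone check (the feature
# count of 8 is illustrative only):
state = np.zeros(8, dtype=np.float32)
assert state[None, None, :].shape == (1, 1, 8)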
def train(self,
          n_steps: int = None,
          n_episodes: int = None,
          save_every: int = None,
          save_path: str = None,
          callback: callable = None,
          **kwargs) -> float:
    batch_size: int = kwargs.get('batch_size', 64)
    discount_factor: float = kwargs.get('discount_factor', 0.9999)
    learning_rate: float = kwargs.get('learning_rate', 0.0001)
    eps_start: float = kwargs.get('eps_start', 0.9)
    eps_end: float = kwargs.get('eps_end', 0.05)
    eps_decay_steps: int = kwargs.get('eps_decay_steps', 600)
    update_target_every: int = kwargs.get('update_target_every', 500)
    memory_capacity: int = kwargs.get('memory_capacity', 1000)
    render_interval: int = kwargs.get(
        'render_interval', 50)  # in steps, None for episode-end render only
    plot_rewards: bool = kwargs.get('plot_rewards', False)

    memory = ReplayMemory(memory_capacity, transition_type=DQNTransition)
    episode = 0
    total_reward = 0
    stop_training = False
    episode_rewards = []
    episode_step_sequences = []

    if n_steps and not n_episodes:
        n_episodes = np.iinfo(np.int32).max

    print('==== AGENT ID: {} ===='.format(self.id))

    self.env.max_episodes = n_episodes
    self.env.max_steps = n_steps

    # Do a certain number of episodes
    while episode < n_episodes and not stop_training:
        # Reset the state at the beginning of every episode
        state = self.env.reset()
        done = False
        steps_done = 0
        episode_reward = 0

        # Repeat until we reach a terminal state
        while not done:
            # Pick some action, take it, and store the SARSA tuple in memory
            threshold = eps_end + (eps_start - eps_end) * np.exp(
                -steps_done / eps_decay_steps)
            action = self.get_action(state, threshold=threshold)
            next_state, reward, done, _ = self.env.step(action)

            memory.push(state, action, reward, next_state, done)

            state = next_state
            total_reward += reward
            episode_reward += reward
            steps_done += 1

            # Don't train until the memory holds enough samples for a batch
            if len(memory) < batch_size:
                continue

            self._apply_gradient_descent(memory, batch_size, learning_rate,
                                         discount_factor)

            if n_steps and steps_done >= n_steps:
                done = True

            if render_interval is not None and steps_done % render_interval == 0:
                self.env.render(episode)

            # Periodically sync the frozen target network with the policy
            # network. clone_model copies only the architecture, so the
            # weights are copied over explicitly.
            if steps_done % update_target_every == 0:
                self.target_network = tf.keras.models.clone_model(
                    self.policy_network)
                self.target_network.set_weights(
                    self.policy_network.get_weights())
                self.target_network.trainable = False

        is_checkpoint = save_every and episode % save_every == 0

        if save_path and (is_checkpoint or episode == n_episodes - 1):
            self.save(save_path, episode=episode)

        if not render_interval or steps_done < n_steps:
            self.env.render(
                episode)  # render final state at episode end if not rendered earlier

        # Plot the rewards: one point per episode (reward vs. steps taken)
        if plot_rewards:
            episode_rewards.append(episode_reward)
            episode_step_sequences.append(steps_done)
            episode_reward = 0

            plt.clf()
            plt.xlabel('Step')
            plt.ylabel('Reward')
            for ed, steps in zip(episode_rewards, episode_step_sequences):
                plt.plot(steps, ed, 'o')
            plt.show() if done else plt.pause(
                0.001)  # pause briefly so the figure updates

        self.env.save()

        episode += 1

    mean_reward = total_reward / steps_done

    return mean_reward
def train(self,
          n_steps: int = None,
          n_episodes: int = None,
          save_every: int = None,
          save_path: str = None,
          callback: callable = None,
          **kwargs) -> float:
    batch_size: int = kwargs.get('batch_size', 128)
    discount_factor: float = kwargs.get('discount_factor', 0.9999)
    learning_rate: float = kwargs.get('learning_rate', 0.0001)
    eps_start: float = kwargs.get('eps_start', 0.9)
    eps_end: float = kwargs.get('eps_end', 0.05)
    eps_decay_steps: int = kwargs.get('eps_decay_steps', 200)
    entropy_c: float = kwargs.get('entropy_c', 0.0001)
    memory_capacity: int = kwargs.get('memory_capacity', 1000)
    render_interval: int = kwargs.get(
        'render_interval', 50)  # in steps, None for episode-end render only

    memory = ReplayMemory(memory_capacity, transition_type=A2CTransition)
    episode = 0
    total_steps_done = 0
    total_reward = 0

    if n_steps and not n_episodes:
        n_episodes = np.iinfo(np.int32).max

    print('==== AGENT ID: {} ===='.format(self.id))

    while episode < n_episodes:
        state = self.env.reset()
        done = False
        steps_done = 0

        print('==== EPISODE ID ({}/{}): {} ===='.format(
            episode + 1, n_episodes, self.env.episode_id))

        while not done:
            # Epsilon-greedy threshold, annealed over the total step count.
            threshold = eps_end + (eps_start - eps_end) * np.exp(
                -total_steps_done / eps_decay_steps)
            action = self.get_action(state, threshold=threshold)
            next_state, reward, done, _ = self.env.step(action)

            # Store the critic's value estimate alongside the transition.
            value = self.critic_network(state[None, :], training=False)
            value = tf.squeeze(value, axis=-1)

            memory.push(state, action, reward, done, value)

            state = next_state
            total_reward += reward
            steps_done += 1
            total_steps_done += 1

            if len(memory) < batch_size:
                continue

            self._apply_gradient_descent(memory, batch_size, learning_rate,
                                         discount_factor, entropy_c)

            if n_steps and steps_done >= n_steps:
                done = True

            if render_interval is not None and steps_done % render_interval == 0:
                self.env.render(episode)

        is_checkpoint = save_every and episode % save_every == 0

        if not render_interval or steps_done < n_steps:
            self.env.render(
                episode)  # render final state at episode end if not rendered earlier

        self.env.save()

        if save_path and (is_checkpoint or episode == n_episodes - 1):
            self.save(save_path, episode=episode)

        episode += 1

    mean_reward = total_reward / steps_done

    return mean_reward
def run(self):
    episode = 0

    # Broadcast the initial model so every worker starts from the same weights.
    for i in range(self.n_envs):
        self.model_update_queue.put(self.model.policy_network.to_json())

    memory = ReplayMemory(self.memory_capacity, transition_type=DQNTransition)
    optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
    loss_fn = tf.keras.losses.Huber()

    while self.done_queue.qsize() < self.n_envs:
        # Wait until every worker has checked in for this round, draining
        # their transitions into the shared replay memory meanwhile.
        while self.sync_queue.qsize() < self.n_envs:
            while self.memory_queue.qsize() > 0:
                sample = self.memory_queue.get()
                memory.push(*sample)

        self.empty_queue(self.sync_queue)

        transitions = memory.sample(self.batch_size)
        batch = DQNTransition(*zip(*transitions))

        state_batch = tf.convert_to_tensor(batch.state)
        action_batch = tf.convert_to_tensor(batch.action)
        reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(batch.next_state)
        done_batch = tf.convert_to_tensor(batch.done)

        with tf.GradientTape() as tape:
            # Q(s, a) for the actions the workers actually took.
            state_action_values = tf.math.reduce_sum(
                self.model.policy_network(state_batch) *
                tf.one_hot(action_batch, self.model.n_actions),
                axis=1)

            # max_a' Q_target(s', a'), zeroed out for terminal transitions.
            next_state_values = tf.where(
                done_batch, tf.zeros(self.batch_size),
                tf.math.reduce_max(
                    self.model.target_network(next_state_batch), axis=1))

            expected_state_action_values = reward_batch + \
                (self.discount_factor * next_state_values)
            loss_value = loss_fn(expected_state_action_values,
                                 state_action_values)

        variables = self.model.policy_network.trainable_variables
        gradients = tape.gradient(loss_value, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        self.model.update_target_network()

        # Broadcast the updated policy and release the workers for the next round.
        json_model = self.model.policy_network.to_json()
        for n in range(self.n_envs):
            self.model_update_queue.put(json_model)
        for n in range(self.n_envs * 2):
            self.sync_queue.put(1)

        episode += 1
        if episode == self.n_episodes:
            break

    # Let the workers consume any pending model updates before shutting down.
    while self.model_update_queue.qsize() > 0:
        time.sleep(.1)

    self.empty_queue(self.sync_queue)
    self.empty_queue(self.model_update_queue)

    time.sleep(2)

    return self.model
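# A minimal sketch of the `update_target_network` call used above, assuming
# the common hard-update scheme: copy the policy weights into the frozen
# target network. The real method may differ.
def update_target_network(self):
    self.target_network.set_weights(self.policy_network.get_weights())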