def learn_Karpathy(self):
    """Learn using updates like in the Karpathy algorithm."""
    iteration = self.start_at_iter
    # Keep executing episodes until the master requests a stop (e.g. using SIGINT)
    while iteration < self.n_iter and not self.master.stop_requested:
        iteration += 1
        trajectory = self.task_runner.get_trajectory()
        reward = sum(trajectory["reward"])
        action_taken = trajectory["action"]
        discounted_episode_rewards = discount_rewards(
            trajectory["reward"], self.config["gamma"])
        # standardize
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        std = std if std > 0 else 1
        discounted_episode_rewards /= std
        feedback = discounted_episode_rewards
        results = self.master.session.run(
            [self.loss, self.apply_grad],
            feed_dict={
                self.master.states: trajectory["state"],
                self.master.action_taken: action_taken,
                self.master.advantage: feedback
            })
        summary = self.master.session.run(
            [self.master.summary_op],
            feed_dict={
                self.master.loss: results[0],
                self.master.reward: reward,
                self.master.episode_length: trajectory["steps"]
            })
        self.writer.add_summary(summary[0], iteration)
        self.writer.flush()
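# Every learn method in this file calls a discount_rewards helper that is
# imported from elsewhere in the repository. The sketch below is an assumed
# implementation (discounted cumulative sums over a 1-D reward sequence,
# matching how it is called here), not necessarily the repository's exact code.
def discount_rewards(rewards, gamma):
    """Return out with out[t] = sum_k gamma**k * rewards[t + k]."""
    discounted = np.zeros(len(rewards))
    running_sum = 0.0
    for t in reversed(range(len(rewards))):
        running_sum = rewards[t] + gamma * running_sum
        discounted[t] = running_sum
    return discounted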
def learn_REINFORCE(self):
    """Learn using updates like in the REINFORCE algorithm."""
    reporter = Reporter()
    total_n_trajectories = 0
    iteration = self.start_at_iter
    while iteration < self.n_iter and not self.master.stop_requested:
        iteration += 1
        # Collect trajectories until we get timesteps_per_batch total timesteps
        trajectories = self.task_runner.get_trajectories()
        total_n_trajectories += len(trajectories)
        all_state = np.concatenate(
            [trajectory["state"] for trajectory in trajectories])
        # Compute discounted sums of rewards
        rets = [
            discount_rewards(trajectory["reward"], self.config["gamma"])
            for trajectory in trajectories
        ]
        max_len = max(len(ret) for ret in rets)
        padded_rets = [
            np.concatenate([ret, np.zeros(max_len - len(ret))])
            for ret in rets
        ]
        # Compute time-dependent baseline
        baseline = np.mean(padded_rets, axis=0)
        # Compute advantage function
        advs = [ret - baseline[:len(ret)] for ret in rets]
        all_action = np.concatenate(
            [trajectory["action"] for trajectory in trajectories])
        all_adv = np.concatenate(advs)
        # Do policy gradient update step
        episode_rewards = np.array([
            trajectory["reward"].sum() for trajectory in trajectories
        ])  # episode total rewards
        episode_lengths = np.array([
            len(trajectory["reward"]) for trajectory in trajectories
        ])  # episode lengths
        results = self.master.session.run(
            [self.loss, self.apply_grad],
            feed_dict={
                self.master.states: all_state,
                self.master.action_taken: all_action,
                self.master.advantage: all_adv
            })
        print("Task:", self.task_id)
        reporter.print_iteration_stats(iteration, episode_rewards,
                                       episode_lengths, total_n_trajectories)
        summary = self.master.session.run(
            [self.master.summary_op],
            feed_dict={
                self.master.loss: results[0],
                self.master.reward: np.mean(episode_rewards),
                self.master.episode_length: np.mean(episode_lengths)
            })
        self.writer.add_summary(summary[0], iteration)
        self.writer.flush()
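# Toy illustration (not part of the original file) of the time-dependent
# baseline used in learn_REINFORCE above: discounted returns are zero-padded
# to the longest episode, averaged per timestep, and each episode's advantage
# is its return minus the baseline truncated to its own length.
def _baseline_demo():
    rets = [np.array([3.0, 2.0, 1.0]), np.array([1.0, 1.0])]
    max_len = max(len(ret) for ret in rets)
    padded_rets = [np.concatenate([ret, np.zeros(max_len - len(ret))])
                   for ret in rets]
    baseline = np.mean(padded_rets, axis=0)  # -> [2.0, 1.5, 0.5]
    advs = [ret - baseline[:len(ret)] for ret in rets]
    # -> [array([1.0, 0.5, 0.5]), array([-1.0, -0.5])]
    return advs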
def learn(self):
    reporter = Reporter()
    self.session.run([self.reset_accumulative_grads])
    iteration = 0  # amount of batches processed
    episode_nr = 0
    episode_lengths = np.zeros(self.config["batch_size"])
    episode_rewards = np.zeros(self.config["batch_size"])
    mean_rewards = []
    while True:  # Keep executing episodes
        trajectory = self.get_trajectory()
        episode_rewards[episode_nr % self.config["batch_size"]] = sum(
            trajectory["reward"])
        episode_lengths[episode_nr % self.config["batch_size"]] = len(
            trajectory["reward"])
        episode_nr += 1
        action_taken = (np.arange(self.nA) ==
                        trajectory["action"][:, None]).astype(
                            np.float32)  # one-hot encoding
        discounted_episode_rewards = discount_rewards(
            trajectory["reward"], self.config["gamma"])
        # standardize
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        std = std if std > 0 else 1
        discounted_episode_rewards /= std
        feedback = np.reshape(
            np.repeat(discounted_episode_rewards, self.nA),
            (len(discounted_episode_rewards), self.nA))
        self.session.run(
            [self.accumulate_grads],
            feed_dict={
                self.states: trajectory["state"],
                self.action_taken: action_taken,
                self.feedback: feedback
            })
        if episode_nr % self.config["batch_size"] == 0:  # batch is done
            iteration += 1
            self.session.run([self.apply_gradients])
            self.session.run([self.reset_accumulative_grads])
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths, episode_nr)
            mean_rewards.append(episode_rewards.mean())
            if episode_nr % self.config["draw_frequency"] == 0:
                reporter.draw_rewards(mean_rewards)
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver.save(self.session,
                        os.path.join(self.monitor_path, "model"))
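# Small illustration (not part of the original file) of the one-hot encoding
# and feedback reshaping used in the learn method above: each action index
# becomes a one-hot row, and each step's standardized return is repeated
# across the nA columns so every action column receives the same per-step value.
def _onehot_feedback_demo():
    nA = 3
    actions = np.array([0, 2, 1])
    returns = np.array([0.5, -1.0, 0.25])
    action_taken = (np.arange(nA) == actions[:, None]).astype(np.float32)
    # -> [[1, 0, 0], [0, 0, 1], [0, 1, 0]]
    feedback = np.reshape(np.repeat(returns, nA), (len(returns), nA))
    # -> [[0.5, 0.5, 0.5], [-1.0, -1.0, -1.0], [0.25, 0.25, 0.25]]
    return action_taken, feedback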
def learn(self):
    """Run learning algorithm"""
    self._initialize()
    reporter = Reporter()
    config = self.config
    total_n_trajectories = 0
    for iteration in range(config["n_iter"]):
        # Collect trajectories until we get timesteps_per_batch total timesteps
        trajectories = self.env_runner.get_trajectories()
        total_n_trajectories += len(trajectories)
        all_state = np.concatenate(
            [trajectory.states for trajectory in trajectories])
        # Compute discounted sums of rewards
        rets = [
            discount_rewards(trajectory.rewards, config["gamma"])
            for trajectory in trajectories
        ]
        max_len = max(len(ret) for ret in rets)
        padded_rets = [
            np.concatenate([ret, np.zeros(max_len - len(ret))])
            for ret in rets
        ]
        # Compute time-dependent baseline
        baseline = np.mean(padded_rets, axis=0)
        # Compute advantage function
        advs = [ret - baseline[:len(ret)] for ret in rets]
        all_action = np.concatenate(
            [trajectory.actions for trajectory in trajectories])
        all_adv = np.concatenate(advs)
        # Do policy gradient update step
        episode_rewards = np.array([
            sum(trajectory.rewards) for trajectory in trajectories
        ])  # episode total rewards
        episode_lengths = np.array([
            len(trajectory.rewards) for trajectory in trajectories
        ])  # episode lengths
        # TODO: deal with RNN state
        summary, _ = self.session.run(
            [self.summary_op, self.train],
            feed_dict={
                self.states: all_state,
                self.actions_taken: all_action,
                self.advantage: all_adv
            })
        self.writer.add_summary(summary, iteration)
        self.writer.flush()
        reporter.print_iteration_stats(iteration, episode_rewards,
                                       episode_lengths, total_n_trajectories)
    if self.config["save_model"]:
        self.saver.save(self.session,
                        os.path.join(self.monitor_path, "model"))
def learn(self):
    # Assume global shared parameter vectors θ and θv and global shared counter T = 0
    # Assume thread-specific parameter vectors θ' and θ'v
    with tf.train.MonitoredTrainingSession(
            master=self.server.target,
            is_chief=(self.task_id == 0),
            config=self.config_proto,
            save_summaries_secs=30,
            scaffold=self.scaffold) as sess:
        self.session = sess
        sess.run(self.sync_net)
        self.runner.start_runner(sess, self.writer)
        while not sess.should_stop() and \
                self.global_step < self.config["T_max"]:
            # Synchronize thread-specific parameters θ' = θ and θ'v = θv
            sess.run(self.sync_net)
            trajectory = self.pull_batch_from_queue()
            v = 0 if trajectory.terminal else self.get_critic_value(
                np.asarray(trajectory.states)[None, -1],
                trajectory.features[-1])
            rewards_plus_v = np.asarray(trajectory.rewards + [v])
            vpred_t = np.asarray(trajectory.values + [v])
            delta_t = trajectory.rewards + \
                self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
            batch_r = discount_rewards(
                rewards_plus_v, self.config["gamma"])[:-1]
            batch_adv = discount_rewards(delta_t, self.config["gamma"])
            fetches = [self.summary_op, self.train_op, self._global_step]
            states = np.asarray(trajectory.states)
            feed_dict = {
                self.states: states,
                self.actions_taken: np.asarray(trajectory.actions),
                self.advantage: batch_adv,
                self.ret: np.asarray(batch_r)
            }
            feature = trajectory.features[0]
            if feature != [] and feature is not None:
                feed_dict[self.local_network.rnn_state_in] = feature
            summary, _, global_step = sess.run(fetches, feed_dict)
            self.writer.add_summary(summary, global_step)
            self.writer.flush()
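# Illustration (assumed semantics, mirroring the update in the method above):
# batch_r holds n-step discounted returns bootstrapped with the critic's value
# v of the final state, and batch_adv discounts the TD residuals
# delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), i.e. generalized advantage
# estimation with lambda = 1. Uses the discount_rewards helper sketched earlier.
def _advantage_demo(rewards, values, v_last, gamma):
    rewards_plus_v = np.asarray(list(rewards) + [v_last])
    vpred_t = np.asarray(list(values) + [v_last])
    delta_t = np.asarray(rewards) + gamma * vpred_t[1:] - vpred_t[:-1]
    batch_r = discount_rewards(rewards_plus_v, gamma)[:-1]
    batch_adv = discount_rewards(delta_t, gamma)
    return batch_r, batch_adv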
def learn(self):
    """Run learning algorithm"""
    self._initialize()
    config = self.config
    for _ in range(int(config["n_iter"])):
        # Collect n_local_steps steps in the environment
        trajectory = self.env_runner.get_steps(
            int(self.config["n_local_steps"]))
        v = 0 if trajectory.terminals[-1] else self.get_critic_value(
            np.asarray(trajectory.states)[None, -1],
            trajectory.features[-1])
        rewards_plus_v = np.asarray(trajectory.rewards + [v])
        vpred_t = np.asarray(trajectory.values + [v])
        delta_t = trajectory.rewards + \
            self.config["gamma"] * vpred_t[1:] - vpred_t[:-1]
        batch_r = discount_rewards(
            rewards_plus_v, self.config["gamma"])[:-1]
        batch_adv = discount_rewards(delta_t, self.config["gamma"])
        fetches = [self.loss_summary_op, self.train_op, self._global_step]
        states = np.asarray(trajectory.states)
        feed_dict = {
            self.states: states,
            self.actions_taken: np.asarray(trajectory.actions),
            self.advantage: batch_adv,
            self.ret: np.asarray(batch_r)
        }
        feature = trajectory.features[0]
        if feature != [] and feature is not None:
            feed_dict[self.ac_net.rnn_state_in] = feature
        summary, _, global_step = self.session.run(fetches, feed_dict)
        self.writer.add_summary(summary, global_step)
        self.writer.flush()
    if self.config["save_model"]:
        tf.add_to_collection("action", self.action)
        tf.add_to_collection("states", self.states)
        self.saver.save(self.session,
                        os.path.join(self.monitor_path, "model"))
def learn(self):
    reporter = Reporter()
    gradient1 = np.zeros_like(self.w1)
    gradient2 = np.zeros_like(self.w2)
    rmsprop1 = np.zeros_like(self.w1)
    rmsprop2 = np.zeros_like(self.w2)
    iteration = 0  # amount of batches processed
    episode_nr = 0
    episode_lengths = np.zeros(self.config["batch_size"])
    episode_rewards = np.zeros(self.config["batch_size"])
    mean_rewards = []
    while True:  # Keep executing episodes
        trajectory = self.get_trajectory(self.config["episode_max_length"])
        episode_rewards[episode_nr % self.config["batch_size"]] = sum(
            trajectory["reward"])
        episode_lengths[episode_nr % self.config["batch_size"]] = len(
            trajectory["reward"])
        episode_nr += 1
        action_taken = (np.arange(self.nA) ==
                        trajectory["action"][:, None]).astype(
                            np.float32)  # one-hot encoding
        epdlogp = action_taken - trajectory["prob"]
        discounted_episode_rewards = discount_rewards(
            trajectory["reward"], self.config["gamma"])
        # standardize (guard against a zero std, as in the other variants)
        discounted_episode_rewards -= np.mean(discounted_episode_rewards)
        std = np.std(discounted_episode_rewards)
        std = std if std > 0 else 1
        discounted_episode_rewards /= std
        epdlogp *= np.reshape(
            np.repeat(discounted_episode_rewards, self.nA),
            (len(discounted_episode_rewards), self.nA))
        change_w1, change_w2 = self.backward_step(trajectory["state"],
                                                  trajectory["x1"], epdlogp)
        gradient1 += change_w1
        gradient2 += change_w2
        if episode_nr % self.config["batch_size"] == 0:  # batch is done
            iteration += 1
            # RMSProp: a decaying average of squared gradients scales the step
            rmsprop1 = self.config["decay_rate"] * rmsprop1 + \
                (1 - self.config["decay_rate"]) * gradient1**2
            rmsprop2 = self.config["decay_rate"] * rmsprop2 + \
                (1 - self.config["decay_rate"]) * gradient2**2
            self.w1 += self.config["learning_rate"] * gradient1 / \
                (np.sqrt(rmsprop1) + 1e-5)
            self.w2 += self.config["learning_rate"] * gradient2 / \
                (np.sqrt(rmsprop2) + 1e-5)
            gradient1 = np.zeros_like(self.w1)
            gradient2 = np.zeros_like(self.w2)
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths, episode_nr)
            mean_rewards.append(episode_rewards.mean())
            if episode_nr % self.config["draw_frequency"] == 0:
                reporter.draw_rewards(mean_rewards)
def learn(self):
    """Run learning algorithm"""
    self._initialize()
    reporter = Reporter()
    config = self.config
    total_n_trajectories = np.zeros(len(self.envs))
    for iteration in range(config["n_iter"]):
        self.session.run([self.reset_accum_grads])
        for i, task_runner in enumerate(self.task_runners):
            if self.config["switch_at_iter"] is not None:
                # Before switch_at_iter, train on every task except the last;
                # afterwards, train only on the last task.
                if iteration >= self.config["switch_at_iter"] and \
                        i != len(self.task_runners) - 1:
                    continue
                elif iteration < self.config["switch_at_iter"] and \
                        i == len(self.task_runners) - 1:
                    continue
            # Collect trajectories until we get timesteps_per_batch total timesteps
            trajectories = task_runner.get_trajectories()
            total_n_trajectories[i] += len(trajectories)
            all_state = np.concatenate(
                [trajectory["state"] for trajectory in trajectories])
            # Compute discounted sums of rewards
            rets = [
                discount_rewards(trajectory["reward"], config["gamma"])
                for trajectory in trajectories
            ]
            max_len = max(len(ret) for ret in rets)
            padded_rets = [
                np.concatenate([ret, np.zeros(max_len - len(ret))])
                for ret in rets
            ]
            # Compute time-dependent baseline
            baseline = np.mean(padded_rets, axis=0)
            # Compute advantage function
            advs = [ret - baseline[:len(ret)] for ret in rets]
            all_action = np.concatenate(
                [trajectory["action"] for trajectory in trajectories])
            all_adv = np.concatenate(advs)
            # Do policy gradient update step
            episode_rewards = np.array([
                trajectory["reward"].sum() for trajectory in trajectories
            ])  # episode total rewards
            episode_lengths = np.array([
                len(trajectory["reward"]) for trajectory in trajectories
            ])  # episode lengths
            results = self.session.run(
                [self.losses[i], self.add_accum_grads[i], self.accum_grads],
                feed_dict={
                    self.states: all_state,
                    self.action_taken: all_action,
                    self.advantage: all_adv
                })
            summary = self.session.run(
                [self.summary_op],
                feed_dict={
                    self.loss: results[0],
                    self.rewards: np.mean(episode_rewards),
                    self.episode_lengths: np.mean(episode_lengths)
                })
            self.writers[i].add_summary(summary[0], iteration)
            self.writers[i].flush()
            print("Task:", i)
            reporter.print_iteration_stats(iteration, episode_rewards,
                                           episode_lengths,
                                           total_n_trajectories[i])
        # Apply the accumulated gradients once those of every task are summed
        self.session.run([self.apply_gradients])
    if self.config["save_model"]:
        if not os.path.exists(self.monitor_path):
            os.makedirs(self.monitor_path)
        self.saver.save(self.session,
                        os.path.join(self.monitor_path, "model"))