def evaluate(self, visualize=False):
    # Try the policy: 5 rollouts of 20 steps each, each on a fresh ConveyorBelt instance
    traj_list = []
    for n_iter in range(5):
        problem = ConveyorBelt()  # different "initial" state
        traj = problem.execute_policy(self, 20, self.v)
        traj_list.append(traj)
        problem.env.Destroy()
        RaveDestroy()
    avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
    std_J = np.std([np.sum(traj['r']) for traj in traj_list])
    return avg_J, std_J
def evaluate(self, visualize=False):
    # Try the policy: 5 rollouts of 20 steps each, each on a fresh ConveyorBelt instance
    traj_list = []
    for n_iter in range(5):
        problem = ConveyorBelt()  # different "initial" state
        traj = problem.execute_policy(self, 20, self.v)
        traj_list.append(traj)
        problem.env.Destroy()
        RaveDestroy()
    avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
    std_J = np.std([np.sum(traj['r']) for traj in traj_list])
    return avg_J, std_J
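# Hedged usage sketch (not from the source): one way evaluate() might be driven from a
# checkpoint-scoring script. The `policy` argument stands in for whichever agent class
# defines evaluate() in this repo; the helper name and n_repeats default are assumptions.
def evaluate_checkpoint(policy, n_repeats=3):
    # Average the already-averaged returns over a few independent evaluate() calls
    # to reduce the variance of the 5-rollout estimate inside evaluate().
    scores = []
    for _ in range(n_repeats):
        avg_J, std_J = policy.evaluate()
        scores.append(avg_J)
    return np.mean(scores), np.std(scores)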
def parallel_rollout(self):
    n_procs = 5
    pool = ThreadPool(n_procs)
    procs = []
    problems = []
    for i in range(n_procs):
        problems.append(ConveyorBelt())  # different "initial" state

    traj_list = []
    for i in range(n_procs):
        print 'applying', i
        procs.append(pool.apply_async(self.rollout_thread,
                                      args=(problems[i], i,)))
    pool.close()
    pool.join()

    print [p.successful() for p in procs]
    for pidx, p in enumerate(procs):
        if not p.successful():
            # Why does it ever fail?
            print pidx, 'Unsuccessful'
            # Fall back to a serial rollout when the async call failed
            traj_list.append(self.rollout_thread(problems[pidx], pidx))
        else:
            traj_list.append(p.get())
    return traj_list
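# Hedged sketch (assumed, not shown in this excerpt): rollout_thread() is referenced by
# parallel_rollout() above but not defined here. A plausible implementation executes the
# policy for the same 20-step horizon used by the serial rollouts and returns the
# trajectory dict, then tears down the per-thread environment. The exact signature and
# cleanup behavior are assumptions.
def rollout_thread(self, problem, thread_idx):
    traj = problem.execute_policy(self, 20, self.v)
    problem.env.Destroy()
    return traj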
def train(self, states, actions, rewards, sprimes, sumR, traj_lengths,
          epochs=500, d_lr=1e-3, g_lr=1e-4):
    states = states.squeeze()
    sprimes = sprimes.squeeze()

    true_performance_list = []
    G_performance_list = []
    mse_list = []

    K.set_value(self.opt_G.lr, g_lr)
    K.set_value(self.opt_D.lr, d_lr)
    print self.opt_G.get_config()

    print "Fitting V..."
    current_best_J = -np.inf
    stime = time.time()

    # Initial updates from the provided (planning) data
    self.update_V(states, sumR)
    adv = self.compute_A(states, actions, sprimes, rewards, traj_lengths)
    self.update_pi(states, actions, adv)
    self.saveWeights(additional_name='epoch_' + str(0))
    print time.time() - stime

    # train pi
    for i in range(1, epochs):
        stime = time.time()
        print 'Completed: %.2f%%' % (i / float(epochs) * 100)

        # Try policy - 5 trajectories, each 20 long
        traj_list = []
        for n_iter in range(5):  # N = 5, T = 20, using the notation from the PPO paper
            problem = ConveyorBelt()  # different "initial" state
            traj = problem.execute_policy(self, 20, visualize=self.visualize)
            traj_list.append(traj)
            problem.env.Destroy()
            RaveDestroy()
        avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
        std_J = np.std([np.sum(traj['r']) for traj in traj_list])

        pfile = open(self.save_folder + '/performance.txt', 'a')
        pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
        pfile.close()
        print 'Score of this policy', avg_J
        print time.time() - stime

        # Add new data to the buffer
        new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = \
            format_RL_data(traj_list)
        new_a = self.a_scaler.transform(new_a)

        # Fit the value baseline, compute advantages, then update the policy
        self.update_V(new_s, new_sumR)
        new_sumA = self.compute_A(new_s, new_a, new_sprime, new_r, new_traj_lengths)
        self.update_pi(new_s, new_a, new_sumA)

        if avg_J > current_best_J:
            current_best_J = avg_J
            theta_star = self.save_folder + '/policy_search_' + str(i) + '.h5'
            self.saveWeights(additional_name='epoch_' + str(i) + '_' + str(avg_J))
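# Hedged sketch (assumed): compute_A() is called by train() above but is not part of this
# excerpt. A common choice, consistent with the sum-of-rewards targets fitted by update_V(),
# is the return-to-go minus a state-value baseline. The discount factor gamma, the
# value-network attribute name self.V, and the per-trajectory slicing via traj_lengths are
# all assumptions; actions and sprimes are unused in this simple baseline variant.
def compute_A(self, states, actions, sprimes, rewards, traj_lengths, gamma=0.99):
    advantages = []
    start = 0
    for T in traj_lengths:
        traj_r = np.asarray(rewards[start:start + T]).squeeze()
        # discounted return-to-go for each timestep of this trajectory
        returns = np.zeros(T)
        running = 0.0
        for t in reversed(range(T)):
            running = traj_r[t] + gamma * running
            returns[t] = running
        baseline = self.V.predict(states[start:start + T]).squeeze()
        advantages.append(returns - baseline)
        start += T
    return np.hstack(advantages)[:, None]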
def __init__(self, problem_idx, n_actions_per_node):
    ConveyorBelt.__init__(self, problem_idx, n_actions_per_node)
    self.set_objects_not_in_goal(self.objects)
def train(self, states, actions,
          epochs=500, d_lr=1e-3, g_lr=1e-4):
    states = states.squeeze()

    true_performance_list = []
    G_performance_list = []
    mse_list = []

    n_data = states.shape[0]
    BATCH_SIZE = np.min([32, int(len(actions) * 0.1)])
    if BATCH_SIZE == 0:
        BATCH_SIZE = 1
    print BATCH_SIZE

    K.set_value(self.opt_G.lr, g_lr)
    K.set_value(self.opt_D.lr, d_lr)
    print self.opt_G.get_config()

    current_best_J = -np.inf
    n_score_train = 1
    performance_list = []

    # Truncate the performance log at the start of training
    pfile = open(self.save_folder + '/performance.txt', 'w')
    pfile.close()

    for i in range(1, epochs):
        stime = time.time()

        # Rollouts: 5 trajectories, each 20 long
        traj_list = []
        for n_iter in range(5):
            problem = ConveyorBelt()  # different "initial" state
            traj = problem.execute_policy(self, 20, self.v)
            traj_list.append(traj)
            problem.env.Destroy()
            RaveDestroy()
        avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
        std_J = np.std([np.sum(traj['r']) for traj in traj_list])

        pfile = open(self.save_folder + '/performance.txt', 'a')
        pfile.write(str(avg_J) + ',' + str(std_J) + '\n')
        pfile.close()
        print 'Score of this policy', avg_J

        # New rollout dataset
        new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = \
            format_RL_data(traj_list)
        new_a = self.a_scaler.transform(new_a)

        # Choose a batch of expert data and a batch of policy data
        indices = np.random.randint(0, actions.shape[0], size=BATCH_SIZE)
        s_batch = np.array(states[indices, :])  # collision vector
        a_batch = np.array(actions[indices, :])

        pi_indices = np.random.randint(0, new_a.shape[0], size=BATCH_SIZE)
        pi_s_batch = np.array(new_s[pi_indices, :])  # collision vector
        pi_a_batch = np.array(new_a[pi_indices, :])

        # Make their scores: policy samples are labeled fake (0), expert samples real (1)
        fake_scores = np.zeros((BATCH_SIZE, 1))
        real_scores = np.ones((BATCH_SIZE, 1))
        batch_x = np.vstack([pi_a_batch, a_batch])
        batch_w = np.vstack([pi_s_batch, s_batch])
        batch_scores = np.vstack([fake_scores, real_scores])

        # Update D
        self.disc.fit({'x': batch_x, 'w': batch_w},
                      batch_scores,
                      epochs=1,
                      verbose=False)

        # Replace environment rewards with discriminator-based rewards
        new_r, new_sumR = self.compute_r_using_D(traj_list)

        # Update value function
        self.update_V(new_s, new_sumR)

        # Update policy
        new_sumA = self.compute_A(new_s, new_a, new_sprime, new_r, new_traj_lengths)
        self.update_pi(new_s, new_a, new_sumA)

        self.saveWeights(additional_name='epoch_' + str(i) + '_' + str(avg_J))
        print 'Completed: %.2f%%' % (i / float(epochs) * 100)
        print "Epoch took: %.2fs" % (time.time() - stime)
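# Hedged sketch (assumed): compute_r_using_D() is called above but not defined in this
# excerpt. In GAIL-style training the environment reward is replaced by a reward derived
# from the discriminator's score on (state, action) pairs; -log(1 - D) is one standard
# choice. The trajectory dict keys 's' and 'a', the scaler call, and the clipping constant
# are assumptions.
def compute_r_using_D(self, traj_list):
    new_r = []
    new_sumR = []
    for traj in traj_list:
        s = np.array(traj['s']).squeeze()
        a = self.a_scaler.transform(np.array(traj['a']))
        d = self.disc.predict({'x': a, 'w': s}).squeeze()
        r = -np.log(np.clip(1.0 - d, 1e-8, 1.0))  # higher D score -> higher reward
        new_r.append(r)
        # undiscounted reward-to-go from each timestep to the end of the trajectory
        new_sumR.append(np.cumsum(r[::-1])[::-1])
    return np.hstack(new_r)[:, None], np.hstack(new_sumR)[:, None]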
def train(self, states, actions, rewards, sprimes,
          epochs=500, d_lr=1e-3, g_lr=1e-4):
    states = states.squeeze()
    sprimes = sprimes.squeeze()

    true_performance_list = []
    G_performance_list = []
    mse_list = []

    n_data = states.shape[0]
    BATCH_SIZE = np.min([32, int(len(actions) * 0.1)])
    if BATCH_SIZE == 0:
        BATCH_SIZE = 1
    print BATCH_SIZE

    K.set_value(self.opt_G.lr, g_lr)
    K.set_value(self.opt_D.lr, d_lr)
    print self.opt_G.get_config()

    current_best_J = -np.inf

    # Truncate the performance log at the start of training
    pfile = open(self.save_folder + '/performance.txt', 'w')
    pfile.close()

    # n_episodes = epochs*5, T = 20, but we only update once all T steps have executed.
    # This is because this is an episodic task - you can only learn meaningful moves
    # if you go deep into the trajectory. So we gather 300*5*20 RL data points.
    for i in range(1, epochs):
        print 'Completed: %.2f%%' % (i / float(epochs) * 100)
        stime = time.time()

        # Mask out terminal next-states (all-zero sprime rows)
        terminal_state_idxs = np.where(
            np.sum(np.sum(sprimes, axis=-1), axis=-1) == 0)[0]
        nonterminal_mask = np.ones((sprimes.shape[0], 1))
        nonterminal_mask[terminal_state_idxs, :] = 0

        # Make the targets: r + D(pi(s'), s') for non-terminal transitions
        fake = self.a_gen.predict([sprimes])  # actions predicted by pi
        real = actions
        real_targets = rewards + np.multiply(
            self.disc.predict([fake, sprimes]), nonterminal_mask)

        stime = time.time()
        self.update_disc(real, states, real_targets, BATCH_SIZE)
        self.update_pi(states, BATCH_SIZE)
        print 'Fitting time', time.time() - stime

        # Technically speaking, we should update the policy every timestep.
        # What if we update it 100 times after executing 5 episodes, each with 20 timesteps?
        stime = time.time()
        traj_list = []
        for n_iter in range(5):
            problem = ConveyorBelt()  # different "initial" state
            traj = problem.execute_policy(self, 20, self.v)
            traj_list.append(traj)
            problem.env.Destroy()
            RaveDestroy()
        avg_J = np.mean([np.sum(traj['r']) for traj in traj_list])
        std_J = np.std([np.sum(traj['r']) for traj in traj_list])

        pfile = open(self.save_folder + '/performance.txt', 'a')
        pfile.write(str(i) + ',' + str(avg_J) + ',' + str(std_J) + '\n')
        pfile.close()
        print 'Score of this policy', avg_J

        # Add new data to the buffer - only if this was a non-zero trajectory
        if avg_J > 1.0:
            new_s, new_a, new_r, new_sprime, new_sumR, _, new_traj_lengths = \
                format_RL_data(traj_list)
            new_a = self.a_scaler.transform(new_a)
            states = np.r_[states, new_s.squeeze()]
            actions = np.r_[actions, new_a]
            rewards = np.r_[rewards, new_r]
            sprimes = np.r_[sprimes, new_sprime.squeeze()]
        print "Rollout time", time.time() - stime

        if avg_J > current_best_J:
            current_best_J = avg_J
            theta_star = self.save_folder + '/policy_search_' + str(i) + '.h5'
            self.saveWeights(additional_name='tau_' + str(self.tau) + 'epoch_' +
                             str(i) + '_' + str(avg_J))
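# Hedged sketch (assumed): format_RL_data() is used by every train() variant above but is
# not part of this excerpt. A plausible implementation flattens a list of trajectory dicts
# into stacked (s, a, r, s') arrays plus per-timestep reward-to-go sums and trajectory
# lengths, matching the 7-tuple the callers unpack. The dict keys ('s', 'a', 'r', 'sprime')
# are assumptions, and the sixth returned value (discarded by every caller) is a placeholder.
def format_RL_data(traj_list):
    all_s, all_a, all_r, all_sprime, all_sumR, traj_lengths = [], [], [], [], [], []
    for traj in traj_list:
        s = np.array(traj['s'])
        a = np.array(traj['a'])
        r = np.array(traj['r']).squeeze()
        sprime = np.array(traj['sprime'])
        sumR = np.cumsum(r[::-1])[::-1]  # undiscounted reward-to-go
        all_s.append(s)
        all_a.append(a)
        all_r.append(r[:, None])
        all_sprime.append(sprime)
        all_sumR.append(sumR[:, None])
        traj_lengths.append(len(r))
    return (np.vstack(all_s), np.vstack(all_a), np.vstack(all_r),
            np.vstack(all_sprime), np.vstack(all_sumR), traj_list, traj_lengths)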