def add(self, state, action, reward, next_state, done):
    self.tuples.append([
        utils.to_numpy(state),
        utils.to_numpy(next_state),
        np.float32(action),
        np.reshape(np.float32(np.array([reward])), (1, 1)),
        np.reshape(np.float32(np.array([float(done)])), (1, 1))
    ])
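# The tuple layout stored by add() above is (state, next_state, action, reward, done),
# with reward and done reshaped to (1, 1) so the fields concatenate cleanly along the
# batch axis at sampling time. The surrounding buffer class is not shown in this
# section, so the container and sample() below are a minimal sketch for illustration.
import random
from collections import deque

import numpy as np


class ReplayMemory:
    """Hypothetical container matching add(): a bounded FIFO of
    [state, next_state, action, reward(1,1), done(1,1)] numpy entries."""

    def __init__(self, capacity):
        self.tuples = deque(maxlen=capacity)

    def __len__(self):
        return len(self.tuples)

    def sample(self, batch_size):
        # Stack each field along axis 0, yielding batched arrays ready for torch
        batch = random.sample(list(self.tuples), batch_size)
        s, ns, a, r, d = [np.concatenate(field, axis=0) for field in zip(*batch)]
        return s, ns, a, r, d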
def evaluate(net, args, replay_memory, dict_all_returns, key, store_transition=True):
    total_reward = 0.0
    env = utils.NormalizedActions(gym.make(env_tag))
    state = env.reset()
    num_frames = 0
    state = utils.to_tensor(state).unsqueeze(0)
    if args.is_cuda:
        state = state.cuda()
    done = False
    while not done:
        if store_transition:
            num_frames += 1
        action = net.forward(state)
        action = action.clamp(-1, 1)  # torch clamp is not in-place; assign the result
        action = utils.to_numpy(action.cpu())
        next_state, reward, done, info = env.step(action.flatten())  # Simulate one step in environment
        next_state = utils.to_tensor(next_state).unsqueeze(0)
        if args.is_cuda:
            next_state = next_state.cuda()
        total_reward += reward
        if store_transition:
            add_experience(state, action, next_state, reward, done, replay_memory, args)
        state = next_state
    dict_all_returns[key] = total_reward  # Record the episode return (inferred: these parameters were otherwise unused)
def evaluate(self, net, is_render, is_action_noise=False, store_transition=True):
    total_reward = 0.0
    state = self.env.reset()
    state = utils.to_tensor(state).unsqueeze(0)
    if self.args.is_cuda:
        state = state.cuda()
    done = False
    while not done:
        if store_transition:
            self.num_frames += 1
            self.gen_frames += 1
        if is_render:
            self.env.render()
        action = net.forward(state)
        action = action.clamp(-1, 1)  # torch clamp is not in-place; assign the result
        action = utils.to_numpy(action.cpu())
        if is_action_noise:
            action += self.ounoise.noise()
        next_state, reward, done, info = self.env.step(action.flatten())  # Simulate one step in environment
        next_state = utils.to_tensor(next_state).unsqueeze(0)
        if self.args.is_cuda:
            next_state = next_state.cuda()
        total_reward += reward
        if store_transition:
            self.add_experience(state, action, next_state, reward, done)
        state = next_state
    if store_transition:
        self.num_games += 1
    return total_reward
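# The exploration noise added above comes from self.ounoise, which is not defined in
# this section. A minimal sketch of the standard Ornstein-Uhlenbeck generator such an
# attribute conventionally refers to; the parameter values are the common DDPG
# defaults, assumed here:
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        # Call at episode boundaries to restart the process at its mean
        self.state = np.ones_like(self.state) * self.mu

    def noise(self):
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state += dx
        return self.state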
def shape_dpp(self, critic, state, action):
    Q1, Q2 = critic.forward(state, action)
    original_Q = torch.min(Q1, Q2)
    all_Q = [original_Q]
    state = utils.to_numpy(state.cpu())
    coupling = 4
    max_ind = 35
    for coupling_mag in range(coupling):
        # Perturb one random sensor entry per row, cumulatively across couplings
        for row_id in range(len(state)):
            random_entry = random.randint(0, max_ind)
            state[row_id, random_entry] = 0.5
        shaped_state = utils.to_tensor(state).cuda()
        Q1, Q2 = critic.forward(shaped_state, action)
        all_Q.append(torch.min(Q1, Q2) / (coupling_mag + 1.0))
    all_Q = torch.cat(all_Q, 1)
    all_qvals = all_Q.cpu().data.numpy()
    all_qvals = np.expand_dims(np.max(all_qvals, 1), 1)
    multiplier = np.divide(all_qvals, original_Q.cpu().data.numpy())
    multiplier = np.clip(multiplier, 1, 100)  # np.clip is not in-place; assign the result
    multiplier = torch.Tensor(multiplier).cuda()
    # Return the shaped estimate for both critic heads
    return original_Q * multiplier, original_Q * multiplier
def update_parameters(self, batch):
    state_batch = torch.cat(batch.state)
    next_state_batch = torch.cat(batch.next_state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    done_batch = torch.cat(batch.done)
    state_batch.volatile = False
    next_state_batch.volatile = True  # Legacy (pre-0.4 PyTorch) flag to skip grad tracking
    action_batch.volatile = False

    # Critic update: one-step TD target r + gamma * V(s') * (1 - done)
    vals = self.critic.forward(state_batch)
    new_vals = self.critic.forward(next_state_batch) * (1 - done_batch)
    targets = reward_batch + self.gamma * new_vals
    self.critic_optim.zero_grad()
    dt = self.loss(vals, targets)
    dt.backward()
    self.critic_optim.step()

    # Actor update
    self.actor_optim.zero_grad()
    # Round-tripping through numpy detaches these tensors from the critic's graph
    state_batch = utils.to_tensor(utils.to_numpy(state_batch))
    targets = utils.to_tensor(utils.to_numpy(targets))
    vals = utils.to_tensor(utils.to_numpy(vals))
    action_logs = self.actor.forward(state_batch)
    entropy_loss = torch.mean(entropy(torch.exp(action_logs)))
    action_logs = F.log_softmax(action_logs, dim=1)
    dt = targets - vals  # Advantage estimate
    alogs = []
    for i, action in enumerate(action_batch):
        action_i = int(action.cpu().data.numpy())
        alogs.append(action_logs[i, action_i])
    alogs = torch.cat(alogs).unsqueeze(0)
    policy_loss = -torch.mean(dt * alogs.t())
    actor_loss = policy_loss - entropy_loss
    actor_loss.backward()
    self.actor_optim.step()
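# Worked example of the one-step TD target used in the critic update above,
# with illustrative numbers only (gamma and the batch values are assumptions):
import torch

gamma = 0.99
reward = torch.tensor([[1.0], [0.5], [2.0]])
next_vals = torch.tensor([[10.0], [4.0], [7.0]])  # critic's V(s') predictions
done = torch.tensor([[0.0], [0.0], [1.0]])        # 1 masks the bootstrap on terminal steps

targets = reward + gamma * next_vals * (1 - done)
# -> [[10.90], [4.46], [2.00]]: the terminal row keeps only its immediate reward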
def shape_dpp(self, critic, actor, state, sensor_model):
    Q1, _, val = critic((state), actor((state)))
    original_T = Q1 - val  # Advantage A(s, a) = Q(s, a) - V(s)
    all_adv = [original_T]
    state = utils.to_numpy(state.cpu())
    coupling = self.args.coupling
    max_ind = int(360 / self.args.angle_res)
    # Indices of sensor bins that currently register a reading (!= -1)
    perturb_index = [
        np.argwhere(state[i, 0:max_ind] != -1).flatten()
        for i in range(len(state))
    ]
    for i, entry in enumerate(perturb_index):
        np.random.shuffle(entry)
        if len(entry) < coupling:
            perturb_index[i] = np.tile(entry, (coupling, 1)).flatten()
    for coupling_mag in range(coupling):
        empty_ind = [int(entry[coupling_mag]) for entry in perturb_index]
        if sensor_model == 'density':
            for i, ind in enumerate(empty_ind):
                state[i, ind] = 1.0
        elif sensor_model == 'closest':
            for i, ind in enumerate(empty_ind):
                state[i, ind] = 1.0
        shaped_state = utils.to_tensor(state).cuda()
        Q1, _, val = critic((shaped_state), actor((shaped_state)))
        adv = (Q1 - val) / (coupling_mag + 1)
        all_adv.append(adv)
    all_adv = torch.cat(all_adv, 1)
    dpp_max = torch.max(all_adv, 1)[0].unsqueeze(1)
    with torch.no_grad():
        normalizer = dpp_max / original_T
    return original_T * normalizer
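# Both shape_dpp variants above follow the same recipe: score the clean state,
# score increasingly perturbed copies discounted by their coupling magnitude,
# then rescale the clean estimate by best/original. A distilled sketch of that
# shared rule (names here are illustrative, not from the source):
import torch

def dpp_rescale(original, perturbed_list):
    """original: [batch, 1] clean estimate; perturbed_list[k]: [batch, 1]
    estimate under k+1 perturbations, already divided by (k + 1)."""
    stacked = torch.cat([original] + perturbed_list, dim=1)  # [batch, 1 + couplings]
    best = torch.max(stacked, dim=1, keepdim=True)[0]
    with torch.no_grad():
        multiplier = best / original  # > 1 only when a perturbation helps
    return original * multiplier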
def rollout_worker(id, second_pid, task_pipe, result_pipe, is_noise, data_bucket,
                   model_bucket, env_name, noise_std, ALGO, trainers=None,
                   Save_net=True, pomdp_adv=False):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

    Parameters:
        task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
        result_pipe (pipe): Sender end of the pipe used to report back results
        is_noise (bool): Use action noise?
        data_bucket (list of shared object): A list of shared object references to s, ns, a, r, done (replay buffer)
            managed by a manager that is used to store experience tuples
        model_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
        env_name (str): Environment name
        noise_std (float): Standard deviation of the Gaussian used to sample noise

    Returns:
        None
    """
    np.random.seed(id)  # Make sure the random seeds across learners are different

    if ALGO == 'dis':
        # Make env with blue and red policy agents inside. If trainers is not None,
        # its first entry is the shared DQN agent, the second the best red policy.
        assert trainers is not None
        dis_env = make_self_play_env(seed=id, return_policy_agent=False,
                                     trainers=trainers)[0]
        env = EnvironmentWrapper(env_name, ALGO, dis_env, 1)  # This is for the red agent
    elif ALGO == 'TD3_tennis':
        pid = id + second_pid * 20  # Should be larger than the number of processes
        tennis_env = make_tennis_env.TennisEnvFactory(
            seed=id, no_graphics=True, pid=pid).getEnv()[0]
        env = EnvironmentWrapper('Tennis', ALGO, tennis_env, 0)
    else:
        print("ALGO is:", ALGO)
        env = EnvironmentWrapper(env_name, ALGO)

    ###LOOP###
    while True:
        identifier, gen = task_pipe.recv()  # Wait until a signal is received to start rollout
        if identifier == 'TERMINATE':
            print('Process:', os.getpid(), ' killed')
            exit(0)  # Kill yourself

        # Get the requisite network
        net = model_bucket[identifier]

        fitness = 0.0
        total_frame = 0
        state = env.reset()
        if pomdp_adv:
            env.try_set_pomdp_adv()

        rollout_trajectory = []
        state = utils.to_tensor(np.array(state)).unsqueeze(0)
        while True:  # unless done
            # For 'dis', the action lies on the simplex; the env step uses the argmax sample
            action = net.Gumbel_softmax_sample_distribution(state, use_cuda=False) \
                if ALGO == 'dis' else net.forward(state)
            action = utils.to_numpy(action)
            if is_noise and not ALGO == 'dis':
                action = (action + np.random.normal(
                    0, noise_std, size=env.env.action_space.shape[0])).clip(
                        env.env.action_space.low, env.env.action_space.high)

            # Simulate one step in environment
            if ALGO == 'dis':
                next_state, reward, done, info = env.step(
                    int(net.turn_max_into_onehot(action).argmax().numpy().__float__()))
            else:
                next_state, reward, done, info = env.step(action.flatten())

            next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
            fitness += reward

            # If storing transitions
            if data_bucket is not None:  # Skip for test set
                rollout_trajectory.append([
                    utils.to_numpy(state),
                    utils.to_numpy(next_state),
                    np.float32(action),
                    np.reshape(np.float32(np.array([reward])), (1, 1)),
                    np.reshape(np.float32(np.array([float(done)])), (1, 1))
                ])
            state = next_state
            total_frame += 1

            # DONE FLAG IS Received
            if done:
                # Push experiences to main; data_bucket is shared across processes
                # (and the training thread)
                for entry in rollout_trajectory:
                    data_bucket.append(entry)
                break

        # Periodically checkpoint the evaluated network
        if gen >= 10 and gen % 5 == 0 and Save_net:
            net.save_net('./pytorch_models/red_' + str(id) + '_net_step_' +
                         str(gen) + '.pth')

        # Send back id, fitness, total length and the blue reward using the result pipe
        result_pipe.send([identifier, fitness, total_frame, env.get_blue_actual_reward()])
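# A hypothetical driver for the worker above (the actual launch code is not in
# this section): one pipe pair per worker, Manager-backed shared lists for the
# replay data and the actor population. The env name, ALGO and noise_std below
# are placeholder values.
from multiprocessing import Manager, Pipe, Process

manager = Manager()
data_bucket = manager.list()
model_bucket = manager.list([net])  # `net` assumed to be a picklable CPU actor

task_master, task_slave = Pipe()
result_slave, result_master = Pipe()

p = Process(target=rollout_worker,
            args=(0, 0, task_slave, result_slave, True, data_bucket,
                  model_bucket, 'Hopper-v2', 0.1, 'TD3'))
p.daemon = True
p.start()

task_master.send([0, 0])                # (identifier, gen): roll out model 0
print(result_master.recv())             # [identifier, fitness, frames, blue_reward]
task_master.send(['TERMINATE', None])   # Shut the worker down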
def rollout_worker(args, id, type, task_pipe, result_pipe, predator_data_bucket,
                   prey_data_bucket, predators_bucket, prey_bucket,
                   store_transitions, config):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

    Parameters:
        args (object): Parameter class
        id (int): Specific Id unique to each worker spun
        type (str): Rollout type ('test', 'pg' or 'evo')
        task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
        result_pipe (pipe): Sender end of the pipe used to report back results
        predator_data_bucket / prey_data_bucket (shared list objects): Shared replay buffers used to store experience tuples
        predators_bucket / prey_bucket (shared list objects): Shared lists managed by a manager used to store the models (actors)
        store_transitions (bool): Log experiences to the data buckets?
        config (str): Environment configuration name

    Returns:
        None
    """
    if type == 'test':
        NUM_EVALS = args.num_test
    elif type == 'pg':
        NUM_EVALS = args.rollout_size
    elif type == 'evo':
        NUM_EVALS = 10
    else:
        sys.exit('Incorrect type')

    if config == 'simple_tag' or config == 'hard_tag':
        from envs.env_wrapper import SimpleTag
        env = SimpleTag(args, NUM_EVALS)
    elif config == 'simple_adversary':
        from envs.env_wrapper import SimpleAdversary
        env = SimpleAdversary(args, NUM_EVALS)
    elif config == 'simple_push':
        from envs.env_wrapper import SimplePush
        env = SimplePush(args, NUM_EVALS)
    else:
        sys.exit('Unknown config in runner.py')
    print('Runner running config ', config)

    np.random.seed(id)
    random.seed(id)

    while True:
        teams_blueprint = task_pipe.recv()  # Wait until a signal is received to start rollout
        if teams_blueprint == 'TERMINATE':
            exit(0)  # Kill yourself

        # Get the current team actors
        if type == 'test' or type == 'pg':
            team = [predators_bucket[0] for _ in range(args.config.num_agents)]
        elif type == 'evo':
            team = [predators_bucket[teams_blueprint[0]]
                    for _ in range(args.config.num_agents)]
        else:
            sys.exit('Incorrect type')

        if args.rollout_size == 0:
            store_transitions = False

        fitness = [None for _ in range(NUM_EVALS)]
        frame = 0
        prey_fitness = [None for _ in range(NUM_EVALS)]
        predator_state, prey_state = env.reset()
        prey_rollout_trajectory = []
        predator_rollout_trajectory = [[] for _ in range(3)]  # One list per predator
        prey_state = utils.to_tensor(np.array(prey_state))
        predator_state = utils.to_tensor(np.array(predator_state))

        while True:  # unless done
            if type == 'pg':
                prey_action = [prey_bucket[0].noisy_action(prey_state[i, :], head=i).detach().numpy()
                               for i in range(len(prey_state))]
                predator_action = [team[i].noisy_action(predator_state[i, :], head=i).detach().numpy()
                                   for i in range(len(predator_state))]
            else:
                prey_action = [prey_bucket[0].clean_action(prey_state[i, :], head=i).detach().numpy()
                               for i in range(len(prey_state))]
                predator_action = [team[i].clean_action(predator_state[i, :], head=i).detach().numpy()
                                   for i in range(len(predator_state))]

            # JOINT ACTION [agent_id, universe_id, action]
            # Bound actions
            prey_action = np.array(prey_action).clip(-1.0, 1.0)
            predator_action = np.array(predator_action).clip(-1.0, 1.0)

            # Simulate one step in environment
            # State --> [agent_id, universe_id, obs]
            # Reward --> [agent_id, universe_id]
            # Done --> [universe_id]
            next_pred_state, next_prey_state, pred_reward, prey_reward, done, global_reward = env.step(
                predator_action, prey_action)

            next_pred_state = utils.to_tensor(np.array(next_pred_state))
            next_prey_state = utils.to_tensor(np.array(next_prey_state))

            # Grab global reward as fitnesses
            for i, grew in enumerate(global_reward):
                if grew[0] != None:
                    fitness[i] = grew[0]
                    prey_fitness[i] = grew[1]

            # PREDATOR: push experiences to memory
            if store_transitions:
                if not args.is_matd3 and not args.is_maddpg:  # Default
                    for agent_id in range(args.config.num_agents):
                        for universe_id in range(NUM_EVALS):
                            if not done:
                                predator_rollout_trajectory[agent_id].append([
                                    np.expand_dims(utils.to_numpy(predator_state)[agent_id, universe_id, :], 0),
                                    np.expand_dims(utils.to_numpy(next_pred_state)[agent_id, universe_id, :], 0),
                                    np.expand_dims(predator_action[agent_id, universe_id, :], 0),
                                    np.expand_dims(np.array([pred_reward[agent_id, universe_id]], dtype="float32"), 0),
                                    np.expand_dims(np.array([done], dtype="float32"), 0),
                                    universe_id, type
                                ])
                else:  # For MATD3
                    for universe_id in range(NUM_EVALS):
                        if not done:
                            predator_rollout_trajectory[0].append([
                                np.expand_dims(utils.to_numpy(predator_state)[:, universe_id, :], 0),
                                np.expand_dims(utils.to_numpy(next_pred_state)[:, universe_id, :], 0),
                                np.expand_dims(predator_action[:, universe_id, :], 0),  # [batch, agent_id, :]
                                np.array([pred_reward[:, universe_id]], dtype="float32"),
                                np.expand_dims(np.array([done], dtype="float32"), 0),
                                universe_id, type
                            ])

                # PREY
                for universe_id in range(NUM_EVALS):
                    if not done:
                        prey_rollout_trajectory.append([
                            np.expand_dims(utils.to_numpy(prey_state)[0, universe_id, :], 0),
                            np.expand_dims(utils.to_numpy(next_prey_state)[0, universe_id, :], 0),
                            np.expand_dims(prey_action[0, universe_id, :], 0),  # [batch, agent_id, :]
                            np.array([prey_reward[:, universe_id]], dtype="float32"),
                            np.expand_dims(np.array([done], dtype="float32"), 0),
                            universe_id, type
                        ])

            predator_state = next_pred_state
            prey_state = next_prey_state
            frame += NUM_EVALS

            # DONE FLAG IS Received
            if done:
                # Push experiences to main
                if store_transitions:
                    for agent_id, buffer in enumerate(predator_data_bucket):
                        for entry in predator_rollout_trajectory[agent_id]:
                            temp_global_reward = fitness[entry[5]]
                            entry[5] = np.expand_dims(np.array([temp_global_reward], dtype="float32"), 0)
                            buffer.append(entry)
                    # PREY
                    for buffer in prey_data_bucket:
                        for entry in prey_rollout_trajectory:
                            temp_global_reward = 0.0
                            entry[5] = np.expand_dims(np.array([temp_global_reward], dtype="float32"), 0)
                            buffer.append(entry)
                break

        # Send back id, fitness and total length using the result pipe
        result_pipe.send([teams_blueprint, [fitness, prey_fitness], frame])
def rollout_worker(worker_id, task_pipe, result_pipe, noise, exp_list, pop,
                   difficulty, use_rs, store_transition=True,
                   use_synthetic_targets=False, xbias=None, zbias=None,
                   phase_len=100, traj_container=None, ep_len=1005, JGS=False,
                   num_trials=5):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

    Parameters:
        worker_id (int): Specific Id unique to each worker spun
        task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
        result_pipe (pipe): Sender end of the pipe used to report back results
        noise (object): A noise generator object
        exp_list (shared list object): A shared list object managed by a manager that is used to store experience tuples
        pop (shared list object): A shared list object managed by a manager used to store all the models (actors)
        difficulty (int): Difficulty of the task
        use_rs (bool): Use behavioral reward shaping?
        store_transition (bool): Log experiences to exp_list?

    Returns:
        None
    """
    env = EnvironmentWrapper(difficulty, rs=use_rs,
                             use_synthetic_targets=use_synthetic_targets,
                             xbias=xbias, zbias=zbias, phase_len=phase_len, jgs=JGS)
    nofault_endstep = phase_len * 4
    if use_rs:
        lfoot = []
        rfoot = []
        if difficulty == 0:
            ltibia = []
            rtibia = []
            pelvis_x = []
            pelvis_y = []
            ltibia_angle = []
            lfemur_angle = []
            rtibia_angle = []
            rfemur_angle = []
            head_x = []

    while True:
        _ = task_pipe.recv()  # Wait until a signal is received to start rollout
        net = pop[worker_id]  # Get the current model state from the population

        fitnesses = []
        total_frames = 0
        for _ in range(num_trials):
            fitness = 0.0
            state = env.reset()
            state = utils.to_tensor(np.array(state)).unsqueeze(0)
            while True:  # unless done
                action = net.forward(state)
                action = utils.to_numpy(action)
                if noise is not None:
                    action += noise.noise()
                next_state, reward, done, info = env.step(action.flatten())  # Simulate one step in environment
                next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
                fitness += reward
                state = next_state

                # DONE FLAG IS Received
                if done:
                    total_frames += env.istep
                    fitnesses.append(fitness)
                    break

        # Aggregate the trials: report the worst trial plus mean/max statistics
        lower_fit = min(fitnesses)
        total_frames /= float(num_trials)
        mean_fit = sum(fitnesses) / float(num_trials)
        max_fit = max(fitnesses)

        # Send back id, fitness, total length and shaped fitness using the result pipe
        result_pipe.send([worker_id, lower_fit, total_frames, [mean_fit, max_fit]])
def rollout_worker(args, id, type, task_pipe, result_pipe, data_bucket,
                   models_bucket, store_transitions, random_baseline):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

    Parameters:
        args (object): Parameter class
        id (int): Specific Id unique to each worker spun
        type (str): Rollout type ('test', 'pg' or 'evo')
        task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
        result_pipe (pipe): Sender end of the pipe used to report back results
        data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
        models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
        store_transitions (bool): Log experiences to the data bucket?
        random_baseline (bool): Act randomly to establish a baseline?

    Returns:
        None
    """
    if type == 'test':
        NUM_EVALS = args.num_test
    elif type == 'pg':
        NUM_EVALS = args.rollout_size
    elif type == 'evo':
        NUM_EVALS = 5
    else:
        sys.exit('Incorrect type')

    # Single-universe environments
    if args.config.env_choice in ('multiwalker', 'cassie', 'hyper', 'motivate', 'pursuit'):
        NUM_EVALS = 1

    if args.config.env_choice in ('rover_tight', 'rover_loose', 'rover_trap'):
        from envs.env_wrapper import RoverDomainPython
        env = RoverDomainPython(args, NUM_EVALS)
    elif args.config.env_choice == 'motivate':
        from envs.env_wrapper import MotivateDomain
        env = MotivateDomain(args, NUM_EVALS)
    elif args.config.env_choice == 'pursuit':
        from envs.env_wrapper import Pursuit
        env = Pursuit(args, NUM_EVALS)
    elif args.config.env_choice == 'multiwalker':
        from envs.env_wrapper import MultiWalker
        env = MultiWalker(args, NUM_EVALS)
    elif args.config.env_choice == 'cassie':
        from envs.env_wrapper import Cassie
        env = Cassie(args, NUM_EVALS)
    elif args.config.env_choice == 'hyper':
        from envs.env_wrapper import PowerPlant
        env = PowerPlant(args, NUM_EVALS)
    else:
        sys.exit('Incorrect env type')

    np.random.seed(id)
    random.seed(id)

    while True:
        teams_blueprint = task_pipe.recv()  # Wait until a signal is received to start rollout
        if teams_blueprint == 'TERMINATE':
            exit(0)  # Kill yourself

        # Get the current team actors
        if args.ps == 'full' or args.ps == 'trunk':
            if type == 'test' or type == 'pg':
                team = [models_bucket[0] for _ in range(args.config.num_agents)]
            elif type == 'evo':
                team = [models_bucket[0][teams_blueprint[0]]
                        for _ in range(args.config.num_agents)]
            else:
                sys.exit('Incorrect type')
        else:  # Heterogeneous
            if type == 'test' or type == 'pg':
                team = models_bucket
            elif type == 'evo':
                team = [models_bucket[agent_id][popn_id]
                        for agent_id, popn_id in enumerate(teams_blueprint)]
            else:
                sys.exit('Incorrect type')

        if args.rollout_size == 0:
            if args.scheme == 'standard':
                store_transitions = False
            elif args.scheme == 'multipoint' and random.random() < 0.1 and store_transitions:
                store_transitions = True

        fitness = [None for _ in range(NUM_EVALS)]
        frame = 0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.config.num_agents)]
        joint_state = utils.to_tensor(np.array(joint_state))

        while True:  # unless done
            if random_baseline:
                joint_action = [np.random.random((NUM_EVALS, args.state_dim))
                                for _ in range(args.config.num_agents)]
            elif type == 'pg':
                if args.ps == 'trunk':
                    joint_action = [team[i][0].noisy_action(joint_state[i, :], head=i).detach().numpy()
                                    for i in range(args.config.num_agents)]
                else:
                    joint_action = [team[i][0].noisy_action(joint_state[i, :]).detach().numpy()
                                    for i in range(args.config.num_agents)]
            else:
                if args.ps == 'trunk':
                    joint_action = [team[i].clean_action(joint_state[i, :], head=i).detach().numpy()
                                    for i in range(args.config.num_agents)]
                else:
                    joint_action = [team[i].clean_action(joint_state[i, :]).detach().numpy()
                                    for i in range(args.config.num_agents)]

            # JOINT ACTION [agent_id, universe_id, action]
            # Bound action
            joint_action = np.array(joint_action).clip(-1.0, 1.0)

            # Simulate one step in environment
            # State --> [agent_id, universe_id, obs]
            # Reward --> [agent_id, universe_id]
            # Done --> [universe_id]
            next_state, reward, done, global_reward = env.step(joint_action)
            if args.config.env_choice == 'motivate' and type == 'test':
                print(['%.2f' % r for r in reward], global_reward)
            next_state = utils.to_tensor(np.array(next_state))

            # Grab global reward as fitnesses
            for i, grew in enumerate(global_reward):
                if grew != None:
                    fitness[i] = grew
                    # Reward shaping
                    if (args.config.env_choice == 'motivate' or
                            args.config.env_choice == 'rover_loose' or
                            args.config.env_choice == 'rover_tight') and type == 'evo':
                        if args.config.is_gsl:  # Global subsumes local?
                            fitness[i] += sum(env.universe[i].cumulative_local)

            # Push experiences to memory
            if store_transitions:
                for agent_id in range(args.config.num_agents):
                    for universe_id in range(NUM_EVALS):
                        if not done[universe_id]:
                            rollout_trajectory[agent_id].append([
                                np.expand_dims(utils.to_numpy(joint_state)[agent_id, universe_id, :], 0),
                                np.expand_dims(utils.to_numpy(next_state)[agent_id, universe_id, :], 0),
                                np.expand_dims(joint_action[agent_id, universe_id, :], 0),
                                np.expand_dims(np.array([reward[agent_id, universe_id]], dtype="float32"), 0),
                                np.expand_dims(np.array([done[universe_id]], dtype="float32"), 0),
                                universe_id, type
                            ])

            joint_state = next_state
            frame += NUM_EVALS

            # DONE FLAG IS Received (all universes finished)
            if sum(done) == len(done):
                # Push experiences to main
                if store_transitions:
                    if args.ps == 'full':  # Full parameter sharing: one shared replay buffer
                        for heap in rollout_trajectory:
                            for entry in heap:
                                temp_global_reward = fitness[entry[5]]
                                entry[5] = np.expand_dims(np.array([temp_global_reward], dtype="float32"), 0)
                                data_bucket[0].append(entry)
                    else:  # Heterogeneous: one buffer per agent
                        for agent_id, buffer in enumerate(data_bucket):
                            for entry in rollout_trajectory[agent_id]:
                                temp_global_reward = fitness[entry[5]]
                                entry[5] = np.expand_dims(np.array([temp_global_reward], dtype="float32"), 0)
                                buffer.append(entry)
                break

        if type == 'test' and (args.config.env_choice == 'rover_tight' or
                               args.config.env_choice == 'rover_loose' or
                               args.config.env_choice == 'motivate'):
            env.render()
            print('Test trajectory lens',
                  [len(world.rover_path[0]) for world in env.universe])

        # Send back id, fitness and total length using the result pipe
        result_pipe.send([teams_blueprint, [fitness], frame])
def rollout_worker(id, task_pipe, result_pipe, is_noise, data_bucket,
                   model_bucket, env_name, noise_std, ALGO):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

    Parameters:
        task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
        result_pipe (pipe): Sender end of the pipe used to report back results
        is_noise (bool): Use noise?
        data_bucket (list of shared object): A list of shared object references to s, ns, a, r, done (replay buffer)
            managed by a manager that is used to store experience tuples
        model_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
        env_name (str): Environment name
        noise_std (float): Standard deviation of Gaussian for sampling noise

    Returns:
        None
    """
    env = EnvironmentWrapper(env_name, ALGO)
    np.random.seed(id)  # Make sure the random seeds across learners are different

    ###LOOP###
    while True:
        identifier = task_pipe.recv()  # Wait until a signal is received to start rollout
        if identifier == 'TERMINATE':
            exit(0)  # Kill yourself

        # Get the requisite network
        net = model_bucket[identifier]

        fitness = 0.0
        total_frame = 0
        state = env.reset()
        rollout_trajectory = []
        state = utils.to_tensor(np.array(state)).unsqueeze(0)
        while True:  # unless done
            action = net.forward(state)
            action = utils.to_numpy(action)
            if is_noise:
                action = (action + np.random.normal(
                    0, noise_std, size=env.env.action_space.shape[0])).clip(
                        env.env.action_space.low, env.env.action_space.high)

            next_state, reward, done, info = env.step(action.flatten())  # Simulate one step in environment
            next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
            fitness += reward

            # If storing transitions
            if data_bucket is not None:  # Skip for test set
                rollout_trajectory.append([
                    utils.to_numpy(state),
                    utils.to_numpy(next_state),
                    np.float32(action),
                    np.reshape(np.float32(np.array([reward])), (1, 1)),
                    np.reshape(np.float32(np.array([float(done)])), (1, 1))
                ])
            state = next_state
            total_frame += 1

            # DONE FLAG IS Received
            if done:
                # Push experiences to main
                for entry in rollout_trajectory:
                    data_bucket.append(entry)
                break

        # Send back id, fitness and total length using the result pipe
        result_pipe.send([identifier, fitness, total_frame])
def compute_intrinsic_reward(self, state, goal):
    # Fraction of state dimensions whose absolute deviation from the goal
    # exceeds the success criterion (a simple distance surrogate).
    # A learned metric is an alternative:
    # distance = self.vector_distance.forward(state, goal)
    distance = torch.sum(
        (torch.abs(state - goal) > self.success_criteria).float()
    ) / self.args.state_dim
    return float(utils.to_numpy(distance))
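# Quick sanity check of the distance above (values assumed for illustration):
# 2 of 4 dimensions deviate from the goal by more than success_criteria = 0.1,
# so the returned "distance" is 2 / 4 = 0.5.
import torch

state = torch.tensor([0.0, 0.5, 1.0, 1.0])
goal = torch.tensor([0.05, 0.0, 1.0, 0.8])
success_criteria, state_dim = 0.1, 4

distance = torch.sum((torch.abs(state - goal) > success_criteria).float()) / state_dim
print(float(distance))  # 0.5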
def rollout_worker(worker_id, task_pipe, result_pipe, noise, exp_list, pop,
                   difficulty, use_rs, store_transition=True,
                   use_synthetic_targets=False, xbias=None, zbias=None,
                   phase_len=100, traj_container=None, ep_len=1005, JGS=False):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

    Parameters:
        worker_id (int): Specific Id unique to each worker spun
        task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
        result_pipe (pipe): Sender end of the pipe used to report back results
        noise (object): A noise generator object
        exp_list (shared list object): A shared list object managed by a manager that is used to store experience tuples
        pop (shared list object): A shared list object managed by a manager used to store all the models (actors)
        difficulty (int): Difficulty of the task
        use_rs (bool): Use behavioral reward shaping?
        store_transition (bool): Log experiences to exp_list?

    Returns:
        None
    """
    env = EnvironmentWrapper(difficulty, rs=use_rs,
                             use_synthetic_targets=use_synthetic_targets,
                             xbias=xbias, zbias=zbias, phase_len=phase_len, jgs=JGS)
    nofault_endstep = phase_len * 4
    if use_rs:
        lfoot = []
        rfoot = []
        if difficulty == 0:
            ltibia = []
            rtibia = []
            pelvis_x = []
            pelvis_y = []
            ltibia_angle = []
            lfemur_angle = []
            rtibia_angle = []
            rfemur_angle = []
            head_x = []

    while True:
        _ = task_pipe.recv()  # Wait until a signal is received to start rollout
        net = pop[worker_id]  # Get the current model state from the population

        fitness = 0.0
        total_frame = 0
        shaped_fitness = 0.0
        state = env.reset()
        rollout_trajectory = []
        state = utils.to_tensor(np.array(state)).unsqueeze(0)
        exit_flag = True
        while True:  # unless done
            action = net.forward(state)
            action = utils.to_numpy(action)
            if noise is not None:
                action += noise.noise()
            next_state, reward, done, info = env.step(action.flatten())  # Simulate one step in environment

            if use_rs:  # If using behavioral reward shaping, track body kinematics
                lfoot.append(env.lfoot_xyz)
                rfoot.append(env.rfoot_xyz)
                if difficulty == 0:
                    ltibia.append(env.ltibia_xyz)
                    rtibia.append(env.rtibia_xyz)
                    pelvis_y.append(env.pelvis_y)
                    pelvis_x.append(env.pelvis_x)
                    lfemur_angle.append(env.lfemur_angle)
                    ltibia_angle.append(env.ltibia_angle)
                    rfemur_angle.append(env.rfemur_angle)
                    rtibia_angle.append(env.rtibia_angle)
                    head_x.append(env.head_x)

            next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
            fitness += reward

            # If storing transitions
            if store_transition:
                rollout_trajectory.append([
                    utils.to_numpy(state),
                    action,
                    utils.to_numpy(next_state),
                    np.reshape(np.array([reward]), (1, 1)),
                    np.reshape(np.array([-1.0]), (1, 1)),  # Placeholder: steps-to-go, filled in below
                    np.reshape(np.array([int(done)]), (1, 1))
                ])
            state = next_state

            # DONE FLAG IS Received
            if done or (use_synthetic_targets and env.istep >= nofault_endstep) or env.istep >= ep_len:
                total_frame += env.istep

                # Proximal Policy Optimization (process and push as trajectories)
                if traj_container is not None:
                    traj_container.append(rollout_trajectory)

                if store_transition:
                    # Forgive trajectories that did not end within 2 steps of the maximum allowed
                    if ((env.istep < 298 and difficulty == 0)
                            or (env.istep < 998 and difficulty != 0 and not use_synthetic_targets)
                            or (env.istep < (nofault_endstep - 2) and difficulty != 0 and use_synthetic_targets)):
                        for i, entry in enumerate(rollout_trajectory):
                            entry[4] = np.reshape(np.array([len(rollout_trajectory) - i]), (1, 1))

                    # Push experiences to main
                    for entry in rollout_trajectory:
                        exp_list.append([entry[0], entry[1], entry[2],
                                         entry[3], entry[4], entry[5]])
                    rollout_trajectory = []

                # Behavioral reward shaping
                if use_rs:
                    if difficulty == 0:  # Round 1
                        ltibia = np.array(ltibia)
                        rtibia = np.array(rtibia)
                        pelvis_y = np.array(pelvis_y)
                        pelvis_x = np.array(pelvis_x)
                        head_x = np.array(head_x)
                        lfemur_angle = np.degrees(np.array(lfemur_angle))
                        rfemur_angle = np.degrees(np.array(rfemur_angle))
                        ltibia_angle = np.degrees(np.array(ltibia_angle))
                        rtibia_angle = np.degrees(np.array(rtibia_angle))

                        # Compute shaped fitness
                        shaped_fitness = env.istep + rs.final_footx(pelvis_x, lfoot, rfoot) * 100.0

                        # Compute trajectory-wide constraints
                        hard_shape_w = (rs.pelvis_height_rs(pelvis_y)
                                        * rs.foot_z_rs(lfoot, rfoot)
                                        * rs.knee_bend(ltibia_angle, lfemur_angle, rtibia_angle, rfemur_angle)
                                        * rs.head_behind_pelvis(head_x))

                        # Apply the constraint to the shaped fitness
                        shaped_fitness = shaped_fitness * hard_shape_w if shaped_fitness > 0 else shaped_fitness

                        # Reset trackers
                        ltibia = []
                        rtibia = []
                        pelvis_x = []
                        pelvis_y = []
                        head_x = []
                        ltibia_angle = []
                        lfemur_angle = []
                        rtibia_angle = []
                        rfemur_angle = []

                    else:  # Round 2
                        if JGS:
                            shaped_fitness = [
                                env.istep + env.xjgs,
                                env.istep + env.zjgs,
                                env.istep - (abs(env.zjgs) * abs(env.xjgs)),
                                env.istep * 10 + env.xjgs + env.zjgs
                            ]

                            # Foot criss-cross penalty
                            lfoot = np.array(lfoot)
                            rfoot = np.array(rfoot)
                            criss_cross = rs.foot_z_rs(lfoot, rfoot)
                            lfoot = []
                            rfoot = []
                            fitness = criss_cross * fitness
                        else:
                            ######## Scalarization RS #######
                            zminus_fitness = fitness - env.zminus_pen * 5 if env.zminus_pen > 0 else 0.0
                            zplus_fitness = fitness - 5 * env.zplus_pen if env.zplus_pen > 0 else 0.0
                            x_fitness = fitness - 5 * env.x_pen

                            # Behavioral RS
                            pelvis_swingx = rs.pelvis_swing(np.array(env.vel_traj),
                                                            use_synthetic_targets, phase_len)
                            pelv_swing_fit = fitness + pelvis_swingx

                            # Foot criss-cross penalty
                            lfoot = np.array(lfoot)
                            rfoot = np.array(rfoot)
                            criss_cross = rs.foot_z_rs(lfoot, rfoot)
                            lfoot = []
                            rfoot = []
                            footz_fit = criss_cross * fitness

                            # Make the scaled fitness list
                            shaped_fitness = [zplus_fitness, zminus_fitness, x_fitness,
                                              pelv_swing_fit, footz_fit]
                else:
                    shaped_fitness = []

                if exit_flag:
                    break
                else:
                    exit_flag = True
                    state = env.reset()
                    state = utils.to_tensor(np.array(state)).unsqueeze(0)

        # Send back id, fitness, total length and shaped fitness using the result pipe
        result_pipe.send([worker_id, fitness, total_frame, shaped_fitness])
def rollout_worker(args, worker_id, task_pipe, result_pipe, noise, data_bucket,
                   models_bucket, model_template):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

    Parameters:
        args (object): Parameter class
        worker_id (int): Specific Id unique to each worker spun
        task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
        result_pipe (pipe): Sender end of the pipe used to report back results
        noise (object): A noise generator object
        data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
        models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
        model_template (object): Template network used to instantiate local copies

    Returns:
        None
    """
    env = Task_Rovers(args)
    # Deep-copy the template so each rover gets its own network; aliasing one
    # object would make every hard_update below overwrite the same model
    models = [copy.deepcopy(model_template) for _ in range(args.num_rover)]
    for m in models:
        m.eval()

    while True:
        RENDER = task_pipe.recv()  # Wait until a signal is received to start rollout

        # Sync the local models with the current population state
        for m, bucket_model in zip(models, models_bucket):
            utils.hard_update(m, bucket_model)

        fitness = 0.0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.num_rover)]
        joint_state = utils.to_tensor(np.array(joint_state))

        while True:  # unless done
            joint_action = [models[i].forward(joint_state[i, :]).detach().numpy()
                            for i in range(args.num_rover)]
            if noise is not None:
                for action in joint_action:
                    action += noise.noise()

            next_state, reward, done, info = env.step(joint_action)  # Simulate one step in environment
            next_state = utils.to_tensor(np.array(next_state))
            fitness += sum(reward) / args.coupling

            # If storing transitions
            for i in range(args.num_rover):
                rollout_trajectory[i].append([
                    np.expand_dims(utils.to_numpy(joint_state)[i, :], 0),
                    np.expand_dims(np.array(joint_action)[i, :], 0),
                    np.expand_dims(utils.to_numpy(next_state)[i, :], 0),
                    np.expand_dims(np.array([reward[i]]), 0),
                    np.expand_dims(np.array([done]), 0)
                ])
            joint_state = next_state

            # DONE FLAG IS Received
            if done:
                if RENDER:
                    env.render()
                # Push experiences to main
                for rover_id in range(args.num_rover):
                    for entry in rollout_trajectory[rover_id]:
                        data_bucket[rover_id].append([entry[0], entry[1], entry[2],
                                                      entry[3], entry[4]])
                break

        # Send back fitness using the result pipe
        result_pipe.send([fitness])
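# utils.hard_update above copies parameters from a bucket model into the local
# copy. Its definition is not in this section; the standard in-place parameter
# copy it conventionally refers to looks like this:
import torch

def hard_update(target, source):
    """Copy every parameter of `source` into `target` in place."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(s_param)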
def rollout_worker(args, id, type, task_pipe, result_pipe, data_bucket,
                   models_bucket, store_transitions, random_baseline):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

    Parameters:
        args (object): Parameter class
        id (int): Specific Id unique to each worker spun
        task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
        result_pipe (pipe): Sender end of the pipe used to report back results
        data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
        models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
        store_transitions (bool): Log experiences to the data bucket?
        random_baseline (bool): Test with random action for baseline purpose?

    Returns:
        None
    """
    if type == 'test':
        NUM_EVALS = args.num_test
    elif type == 'pg':
        NUM_EVALS = args.rollout_size
    elif type == 'evo':
        NUM_EVALS = 10 if not args.config.env_choice == 'motivate' else 1
    else:
        sys.exit('Incorrect type')

    if args.rollout_size == 0:
        store_transitions = False  # Evolution does not need to store data

    env = RoverDomainPython(args, NUM_EVALS)
    np.random.seed(id)
    random.seed(id)

    while True:
        teams_blueprint = task_pipe.recv()  # Wait until a signal is received to start rollout
        if teams_blueprint == 'TERMINATE':
            exit(0)  # Kill yourself

        # Get the current team actors
        if type == 'test' or type == 'pg':
            team = [models_bucket[0] for _ in range(args.config.num_agents)]
        elif type == 'evo':
            team = [models_bucket[0][teams_blueprint[0]]
                    for _ in range(args.config.num_agents)]
        else:
            sys.exit('Incorrect type')

        fitness = [None for _ in range(NUM_EVALS)]
        frame = 0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.config.num_agents)]
        joint_state = utils.to_tensor(np.array(joint_state))

        while True:  # unless done
            if random_baseline:
                joint_action = [np.random.random((NUM_EVALS, args.state_dim))
                                for _ in range(args.config.num_agents)]
            elif type == 'pg':
                joint_action = [team[i][0].noisy_action(joint_state[i, :], head=i).detach().numpy()
                                for i in range(args.config.num_agents)]
            else:
                joint_action = [team[i].clean_action(joint_state[i, :], head=i).detach().numpy()
                                for i in range(args.config.num_agents)]

            # JOINT ACTION [agent_id, universe_id, action]
            # Bound action
            joint_action = np.array(joint_action).clip(-1.0, 1.0)

            # Simulate one step in environment
            # State --> [agent_id, universe_id, obs]
            # Reward --> [agent_id, universe_id]
            # Done --> [universe_id]
            next_state, reward, done, global_reward = env.step(joint_action)
            next_state = utils.to_tensor(np.array(next_state))

            # Grab global reward as fitnesses
            for i, grew in enumerate(global_reward):
                if grew != None:
                    fitness[i] = grew

            # Push experiences to memory
            if store_transitions:
                if not args.is_matd3 and not args.is_maddpg:  # Default for normal MERL
                    for agent_id in range(args.config.num_agents):
                        for universe_id in range(NUM_EVALS):
                            if not done[universe_id]:
                                rollout_trajectory[agent_id].append([
                                    np.expand_dims(utils.to_numpy(joint_state)[agent_id, universe_id, :], 0),
                                    np.expand_dims(utils.to_numpy(next_state)[agent_id, universe_id, :], 0),
                                    np.expand_dims(joint_action[agent_id, universe_id, :], 0),
                                    np.expand_dims(np.array([reward[agent_id, universe_id]], dtype="float32"), 0),
                                    np.expand_dims(np.array([done[universe_id]], dtype="float32"), 0),
                                    universe_id, type
                                ])
                else:  # For MATD3/MADDPG - requires global state [concatenation of all observations and actions]
                    for universe_id in range(NUM_EVALS):
                        if not done[universe_id]:
                            rollout_trajectory[0].append([
                                np.expand_dims(utils.to_numpy(joint_state)[:, universe_id, :], 0),
                                np.expand_dims(utils.to_numpy(next_state)[:, universe_id, :], 0),
                                np.expand_dims(joint_action[:, universe_id, :], 0),  # [batch, agent_id, :]
                                np.array([reward[:, universe_id]], dtype="float32"),
                                np.expand_dims(np.array([done[universe_id]], dtype="float32"), 0),
                                universe_id, type
                            ])

            joint_state = next_state
            frame += NUM_EVALS

            # DONE FLAG IS Received (all universes finished)
            if sum(done) == len(done):
                # Push experiences to main
                if store_transitions:
                    for agent_id, buffer in enumerate(data_bucket):
                        for entry in rollout_trajectory[agent_id]:
                            temp_global_reward = fitness[entry[5]]
                            entry[5] = np.expand_dims(np.array([temp_global_reward], dtype="float32"), 0)
                            buffer.append(entry)
                break

        # Send back id, fitness and total length using the result pipe
        result_pipe.send([teams_blueprint, [fitness], frame])
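# After the fitness rewrite at episode end, every stored entry carries
# [s, ns, a, r, done, global_reward, type]. A hypothetical consumer showing how
# a learner might turn a shared bucket into per-field training batches
# (batch_from_bucket is illustrative, not part of the source):
import numpy as np

def batch_from_bucket(bucket, batch_size):
    idx = np.random.randint(0, len(bucket), size=batch_size)
    # Stack the six numeric fields; the trailing rollout-type tag is dropped
    s, ns, a, r, d, g = [
        np.concatenate([bucket[i][f] for i in idx], axis=0) for f in range(6)
    ]
    return s, ns, a, r, d, g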