Example #1
File: train.py  Project: yinjiangjin/MERL
def evaluate(weights, args, NUM_EVALS=10, render=False):
	"""Rollout Worker runs a simulation in the environment to generate experiences and fitness values

		Parameters:
			args (object): Parameter class
			id (int): Specific Id unique to each worker spun
			task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
			result_pipe (pipe): Sender end of the pipe used to report back results
			data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
			models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
			store_transition (bool): Log experiences to exp_list?
			random_baseline (bool): Test with random action for baseline purpose?

		Returns:
			None
	"""
	env = RoverDomainPython(args, NUM_EVALS)
	model = MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)

	# Load the candidate weights into the model's parameters
	for i, param in enumerate(model.parameters()):
		try:
			param.data = weights[i]
		except Exception:
			param.data = weights[i].data



	fitness = [None for _ in range(NUM_EVALS)]
	frame = 0
	joint_state = env.reset()
	joint_state = utils.to_tensor(np.array(joint_state))

	while True: #unless done

		joint_action = [model.clean_action(joint_state[i, :], head=i).detach().numpy() for i in range(args.config.num_agents)]

		#JOINT ACTION [agent_id, universe_id, action]
		#Bound Action
		joint_action = np.array(joint_action).clip(-1.0, 1.0)
		next_state, reward, done, global_reward = env.step(joint_action)  # Simulate one step in environment
		#State --> [agent_id, universe_id, obs]
		#reward --> [agent_id, universe_id]
		#done --> [universe_id]
		#info --> [universe_id]

		next_state = utils.to_tensor(np.array(next_state))

		#Grab global reward as fitnesses
		for i, grew in enumerate(global_reward):
			if grew is not None:
				fitness[i] = grew


		joint_state = next_state
		frame+=NUM_EVALS

		#DONE FLAG IS Received
		if sum(done)==len(done):
			break

	return sum(fitness)/len(fitness)
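
A minimal usage sketch for this evaluator (illustrative only; it assumes Parameters and MultiHeadActor are importable as in the other examples, and simply feeds an actor's own parameter tensors back in as the candidate weights):

# Illustrative sketch -- not part of the original train.py
args = Parameters()
actor = MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)
weights = [param.data for param in actor.parameters()]   # tensors in the order evaluate() expects
mean_fitness = evaluate(weights, args, NUM_EVALS=10)
print('Mean fitness over 10 evaluation universes:', mean_fitness)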
Example #2
def rollout_worker(args, id, type, task_pipe, result_pipe, data_bucket,
                   models_bucket, store_transitions, random_baseline):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

		Parameters:
			worker_id (int): Specific Id unique to each worker spun
			task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
			result_pipe (pipe): Sender end of the pipe used to report back results
			noise (object): A noise generator object
			exp_list (shared list object): A shared list object managed by a manager that is used to store experience tuples
			pop (shared list object): A shared list object managed by a manager used to store all the models (actors)
			difficulty (int): Difficulty of the task
			use_rs (bool): Use behavioral reward shaping?
			store_transition (bool): Log experiences to exp_list?

		Returns:
			None
	"""

    if type == 'test': NUM_EVALS = args.num_test
    elif type == 'pg': NUM_EVALS = args.rollout_size
    elif type == 'evo': NUM_EVALS = 5
    else: sys.exit('Incorrect type')

    if args.config.env_choice in ('multiwalker', 'cassie', 'hyper', 'motivate', 'pursuit'):
        NUM_EVALS = 1

    if args.config.env_choice in ('rover_tight', 'rover_loose', 'rover_trap'):
        from envs.env_wrapper import RoverDomainPython
        env = RoverDomainPython(args, NUM_EVALS)
    elif args.config.env_choice == 'motivate':
        from envs.env_wrapper import MotivateDomain
        env = MotivateDomain(args, NUM_EVALS)
    elif args.config.env_choice == 'pursuit':
        from envs.env_wrapper import Pursuit
        env = Pursuit(args, NUM_EVALS)
    elif args.config.env_choice == 'multiwalker':
        from envs.env_wrapper import MultiWalker
        env = MultiWalker(args, NUM_EVALS)
    elif args.config.env_choice == 'cassie':
        from envs.env_wrapper import Cassie
        env = Cassie(args, NUM_EVALS)
    elif args.config.env_choice == 'hyper':
        from envs.env_wrapper import PowerPlant
        env = PowerPlant(args, NUM_EVALS)
    else:
        sys.exit('Incorrect env type')
    np.random.seed(id)
    random.seed(id)

    while True:
        teams_blueprint = task_pipe.recv()  # Wait until a signal is received to start a rollout
        if teams_blueprint == 'TERMINATE': exit(0)  # Terminate this worker

        # Get the current team actors
        if args.ps == 'full' or args.ps == 'trunk':
            if type == 'test' or type == 'pg':
                team = [
                    models_bucket[0] for _ in range(args.config.num_agents)
                ]
            elif type == "evo":
                team = [
                    models_bucket[0][teams_blueprint[0]]
                    for _ in range(args.config.num_agents)
                ]
            else:
                sys.exit('Incorrect type')

        else:  #Heterogeneous
            if type == 'test' or type == 'pg': team = models_bucket
            elif type == "evo":
                team = [
                    models_bucket[agent_id][popn_id]
                    for agent_id, popn_id in enumerate(teams_blueprint)
                ]
            else:
                sys.exit('Incorrect type')

        if args.rollout_size == 0:
            if args.scheme == 'standard': store_transitions = False
            elif args.scheme == 'multipoint' and random.random() < 0.1 and store_transitions:
                store_transitions = True
        fitness = [None for _ in range(NUM_EVALS)]
        frame = 0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.config.num_agents)]

        joint_state = utils.to_tensor(np.array(joint_state))

        while True:  #unless done

            if random_baseline:
                joint_action = [
                    np.random.random((NUM_EVALS, args.state_dim))
                    for _ in range(args.config.num_agents)
                ]
            elif type == 'pg':
                if args.ps == 'trunk':
                    joint_action = [
                        team[i][0].noisy_action(joint_state[i, :],
                                                head=i).detach().numpy()
                        for i in range(args.config.num_agents)
                    ]
                else:
                    joint_action = [
                        team[i][0].noisy_action(
                            joint_state[i, :]).detach().numpy()
                        for i in range(args.config.num_agents)
                    ]
            else:
                if args.ps == 'trunk':
                    joint_action = [
                        team[i].clean_action(joint_state[i, :],
                                             head=i).detach().numpy()
                        for i in range(args.config.num_agents)
                    ]
                else:
                    joint_action = [
                        team[i].clean_action(
                            joint_state[i, :]).detach().numpy()
                        for i in range(args.config.num_agents)
                    ]
            #JOINT ACTION [agent_id, universe_id, action]

            #Bound Action
            joint_action = np.array(joint_action).clip(-1.0, 1.0)
            next_state, reward, done, global_reward = env.step(
                joint_action)  # Simulate one step in environment
            #State --> [agent_id, universe_id, obs]
            #reward --> [agent_id, universe_id]
            #done --> [universe_id]
            #info --> [universe_id]

            if args.config.env_choice == 'motivate' and type == "test":
                print(['%.2f' % r for r in reward], global_reward)

            next_state = utils.to_tensor(np.array(next_state))

            #Grab global reward as fitnesses
            for i, grew in enumerate(global_reward):
                if grew is not None:
                    fitness[i] = grew

                    #Reward Shaping
                    if args.config.env_choice in ('motivate', 'rover_loose', 'rover_tight') and type == "evo":
                        if args.config.is_gsl:  # Global subsumes local?
                            fitness[i] += sum(env.universe[i].cumulative_local)

            #Push experiences to memory
            if store_transitions:
                for agent_id in range(args.config.num_agents):
                    for universe_id in range(NUM_EVALS):
                        if not done[universe_id]:
                            rollout_trajectory[agent_id].append([
                                np.expand_dims(
                                    utils.to_numpy(joint_state)[
                                        agent_id, universe_id, :], 0),
                                np.expand_dims(
                                    utils.to_numpy(next_state)[agent_id,
                                                               universe_id, :],
                                    0),
                                np.expand_dims(
                                    joint_action[agent_id, universe_id, :], 0),
                                np.expand_dims(
                                    np.array([reward[agent_id, universe_id]],
                                             dtype="float32"), 0),
                                np.expand_dims(
                                    np.array([done[universe_id]],
                                             dtype="float32"), 0), universe_id,
                                type
                            ])

            joint_state = next_state
            frame += NUM_EVALS

            if sum(done) > 0 and sum(done) != len(done):
                k = None  # no-op placeholder (appears to be a leftover debugging hook for partially-done universes)

            #DONE FLAG IS Received
            if sum(done) == len(done):
                #Push experiences to main
                if store_transitions:
                    if args.ps == 'full':  #Full setup with one replay buffer
                        for heap in rollout_trajectory:
                            for entry in heap:
                                temp_global_reward = fitness[entry[5]]
                                entry[5] = np.expand_dims(
                                    np.array([temp_global_reward],
                                             dtype="float32"), 0)
                                data_bucket[0].append(entry)

                    else:  #Heterogeneous
                        for agent_id, buffer in enumerate(data_bucket):
                            for entry in rollout_trajectory[agent_id]:
                                temp_global_reward = fitness[entry[5]]
                                entry[5] = np.expand_dims(
                                    np.array([temp_global_reward],
                                             dtype="float32"), 0)
                                buffer.append(entry)

                break

        #print(fitness)

        if type == "test" and random.random() < 1.0 and (
                args.config.env_choice == 'rover_tight'
                or args.config.env_choice == 'rover_loose'
                or args.config.env_choice == 'motivate'):
            env.render()
            print('Test trajectory lens',
                  [len(world.rover_path[0]) for world in env.universe])
            #print (type, id, 'Fit of rendered', ['%.2f'%f for f in fitness])

        #Send back the team blueprint, fitness, and total frames using the result pipe

        result_pipe.send([teams_blueprint, [fitness], frame])
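
The worker above is driven entirely through its two pipes and the manager-backed shared lists. A minimal sketch of how a coordinator process might wire one such worker up (illustrative only; the pipe ends, bucket layout, and argument values are placeholders, not the actual MERL trainer code):

# Illustrative coordinator sketch -- assumes args and a populated models_bucket already exist
from multiprocessing import Manager, Pipe, Process

manager = Manager()
data_bucket = [manager.list() for _ in range(args.config.num_agents)]  # one shared experience list per agent
models_bucket = manager.list()  # shared list of actor models, filled elsewhere by the trainer

task_pipe_main, task_pipe_worker = Pipe()      # main sends rollout requests, worker receives
result_pipe_worker, result_pipe_main = Pipe()  # worker sends results, main receives

worker = Process(target=rollout_worker,
                 args=(args, 0, 'evo', task_pipe_worker, result_pipe_worker,
                       data_bucket, models_bucket, True, False))
worker.start()

task_pipe_main.send([0])  # blueprint: roll out member 0 of the population
teams_blueprint, fitness, frames = result_pipe_main.recv()
task_pipe_main.send('TERMINATE')  # ask the worker to exit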
Example #3
File: train.py  Project: yinjiangjin/MERL
		if not os.path.exists(self.metric_save): os.makedirs(self.metric_save)
		if not os.path.exists(self.model_save): os.makedirs(self.model_save)
		if not os.path.exists(self.aux_save): os.makedirs(self.aux_save)

		self.critic_fname = 'critic_' + self.savetag
		self.actor_fname = 'actor_' + self.savetag
		self.log_fname = 'reward_' + self.savetag
		self.best_fname = 'best_' + self.savetag





if __name__ == "__main__":
	args = Parameters()  # Create the Parameters class
	train_env = RoverDomainPython(args, 10)
	test_env = RoverDomainPython(args, 100)


	#test_tracker = utils.Tracker(args.metric_save, [args.log_fname], '.csv')  # Initiate tracker
	torch.manual_seed(args.seed)
	np.random.seed(args.seed)
	random.seed(args.seed)  # Seeds

	total_frames = 0
	all_scores = [-1.0]
	all_test = [-1.0]
	model = MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)


	print_threshold = 1000000
	###### TRAINING LOOP ########
	while True:
Example #4
def test_rollout_worker(args, id, type, task_pipe, result_pipe, data_bucket,
                        models_bucket, store_transitions, random_baseline):
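    """Single-rollout variant of rollout_worker used for testing and visualization.

    Receives one team blueprint over task_pipe, runs a single evaluation episode
    (NUM_EVALS = 1) with the actors in models_bucket, optionally renders it, and
    sends [teams_blueprint, [fitness]] back through result_pipe. Experiences are
    not pushed to data_bucket."""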

    viz_gen = 0

    teams_blueprint = task_pipe.recv()  # Wait until a signal is received to start a rollout
    team = models_bucket

    NUM_EVALS = 1

    if args.config.env_choice in ('rover_tight', 'rover_loose', 'rover_trap'):
        from envs.env_wrapper import RoverDomainPython
        env = RoverDomainPython(args, NUM_EVALS)
    elif args.config.env_choice == 'rover_heterogeneous':
        from envs.env_wrapper import RoverHeterogeneous
        env = RoverHeterogeneous(args, NUM_EVALS)
    else:
        sys.exit('Incorrect env type')
    np.random.seed(id)
    random.seed(id)

    fitness = [None]
    joint_state = env.reset()
    rollout_trajectory = [[] for _ in range(args.config.num_agents)]

    joint_state = utils.to_tensor(np.array(joint_state))

    while True:  #unless done
        if random_baseline:
            joint_action = [
                np.random.random((NUM_EVALS, args.state_dim))
                for _ in range(args.config.num_agents)
            ]
        elif type == 'pg':
            if args.ps == 'trunk':
                joint_action = [
                    team[i][0].noisy_action(joint_state[i, :],
                                            head=i).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
            else:
                joint_action = [
                    team[i][0].noisy_action(
                        joint_state[i, :]).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
        else:
            if args.ps == 'trunk':
                joint_action = [
                    team[i].clean_action(joint_state[i, :],
                                         head=i).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
            else:
                joint_action = [
                    team[i].clean_action(joint_state[i, :]).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
        #JOINT ACTION [agent_id, universe_id, action]

        #Bound Action
        joint_action = np.array(joint_action).clip(-1.0, 1.0)
        next_state, reward, done, global_reward = env.step(
            joint_action)  # Simulate one step in environment

        try:
            next_state = utils.to_tensor(np.array(next_state))
        except Exception:
            print("Problem converting next_state to a tensor:", next_state)

        if type == "test" and args.config.config == 'simple_spread':
            env.render(0)
            print(joint_action[:, 0, :])
            # print(joint_state[0])

        #Grab global reward as fitnesses
        for i, grew in enumerate(global_reward):  # one global reward per evaluation universe
            if grew is not None:
                fitness[i] = grew

                #Reward Shaping
                if args.config.env_choice in ('motivate', 'rover_loose', 'rover_tight', 'rover_heterogeneous') and type == "evo":
                    if args.config.is_gsl:  # Global subsumes local?
                        fitness[i] += sum(env.universe[i].cumulative_local)

        joint_state = next_state

        #DONE FLAG IS Received
        if sum(done) == len(done):  # if done is 1, then exit
            #Push experiences to main
            break

    # Visualization for test sets
    if type == "test" and (args.config.env_choice in ('rover_tight', 'rover_heterogeneous',
                                                      'rover_loose', 'motivate', 'rover_trap')
                           or args.config.config == 'simple_spread'):
        env.render()
        env.viz()

    ######### todo: to integrate here
    #if args.visualization:
    #    visualize(env, test_fits)

    #Send back the team blueprint and fitness using the result pipe
    result_pipe.send([teams_blueprint, [fitness]])
Example #5
def rollout_worker(args, id, type, task_pipe, result_pipe, data_bucket,
                   models_bucket, store_transitions, random_baseline):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

		Parameters:
			args (object): Parameter class
			id (int): Specific Id unique to each worker spun
			task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
			result_pipe (pipe): Sender end of the pipe used to report back results
			data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
			models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
			store_transition (bool): Log experiences to exp_list?
			random_baseline (bool): Test with random action for baseline purpose?

		Returns:
			None
	"""

    if type == 'test': NUM_EVALS = args.num_test
    elif type == 'pg': NUM_EVALS = args.rollout_size
    elif type == 'evo':
        NUM_EVALS = 1 if args.config.env_choice == 'motivate' else 10
    else:
        sys.exit('Incorrect type')

    if args.rollout_size == 0:
        store_transitions = False  # Evolution does not need to store data

    env = RoverDomainPython(args, NUM_EVALS)
    np.random.seed(id)
    random.seed(id)

    while True:
        teams_blueprint = task_pipe.recv()  # Wait until a signal is received to start a rollout
        if teams_blueprint == 'TERMINATE': exit(0)  # Terminate this worker

        # Get the current team actors
        if type == 'test' or type == 'pg':
            team = [models_bucket[0] for _ in range(args.config.num_agents)]
        elif type == "evo":
            team = [
                models_bucket[0][teams_blueprint[0]]
                for _ in range(args.config.num_agents)
            ]
        else:
            sys.exit('Incorrect type')

        fitness = [None for _ in range(NUM_EVALS)]
        frame = 0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.config.num_agents)]
        joint_state = utils.to_tensor(np.array(joint_state))

        while True:  #unless done

            if random_baseline:
                joint_action = [
                    np.random.random((NUM_EVALS, args.state_dim))
                    for _ in range(args.config.num_agents)
                ]
            elif type == 'pg':
                joint_action = [
                    team[i][0].noisy_action(joint_state[i, :],
                                            head=i).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
            else:
                joint_action = [
                    team[i].clean_action(joint_state[i, :],
                                         head=i).detach().numpy()
                    for i in range(args.config.num_agents)
                ]

            #JOINT ACTION [agent_id, universe_id, action]
            #Bound Action
            joint_action = np.array(joint_action).clip(-1.0, 1.0)
            next_state, reward, done, global_reward = env.step(
                joint_action)  # Simulate one step in environment
            #State --> [agent_id, universe_id, obs]
            #reward --> [agent_id, universe_id]
            #done --> [universe_id]
            #info --> [universe_id]

            next_state = utils.to_tensor(np.array(next_state))

            #Grab global reward as fitnesses
            for i, grew in enumerate(global_reward):
                if grew is not None:
                    fitness[i] = grew

            #Push experiences to memory
            if store_transitions:
                if not args.is_matd3 and not args.is_maddpg:  #Default for normal MERL
                    for agent_id in range(args.config.num_agents):
                        for universe_id in range(NUM_EVALS):
                            if not done[universe_id]:
                                rollout_trajectory[agent_id].append([
                                    np.expand_dims(
                                        utils.to_numpy(joint_state)[
                                            agent_id, universe_id, :], 0),
                                    np.expand_dims(
                                        utils.to_numpy(next_state)[
                                            agent_id, universe_id, :], 0),
                                    np.expand_dims(
                                        joint_action[agent_id, universe_id, :],
                                        0),
                                    np.expand_dims(
                                        np.array(
                                            [reward[agent_id, universe_id]],
                                            dtype="float32"), 0),
                                    np.expand_dims(
                                        np.array([done[universe_id]],
                                                 dtype="float32"), 0),
                                    universe_id, type
                                ])

                else:  #FOR MATD3/MADDPG - requires global state [concatenation of all observations and actions]
                    for universe_id in range(NUM_EVALS):
                        if not done[universe_id]:
                            rollout_trajectory[0].append([
                                np.expand_dims(
                                    utils.to_numpy(joint_state)
                                    [:, universe_id, :], 0),
                                np.expand_dims(
                                    utils.to_numpy(next_state)[:,
                                                               universe_id, :],
                                    0),
                                np.expand_dims(joint_action[:, universe_id, :],
                                               0),  #[batch, agent_id, :]
                                np.array([reward[:, universe_id]],
                                         dtype="float32"),
                                np.expand_dims(
                                    np.array([done[universe_id]],
                                             dtype="float32"), 0),
                                universe_id,
                                type
                            ])

            joint_state = next_state
            frame += NUM_EVALS

            #DONE FLAG IS Received
            if sum(done) == len(done):
                #Push experiences to main
                if store_transitions:
                    for agent_id, buffer in enumerate(data_bucket):
                        for entry in rollout_trajectory[agent_id]:
                            temp_global_reward = fitness[entry[5]]
                            entry[5] = np.expand_dims(
                                np.array([temp_global_reward],
                                         dtype="float32"), 0)
                            buffer.append(entry)

                break

        #Send back the team blueprint, fitness, and total frames using the result pipe
        result_pipe.send([teams_blueprint, [fitness], frame])
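
Each entry pushed into data_bucket by this worker is, after the global-reward substitution at episode end, a list of the form [state, next_state, action, local_reward, done, global_reward, type]. A minimal sketch of how the main process might drain these shared lists into per-agent replay buffers (illustrative; the buffer's add() signature here is an assumption, not MERL's actual API):

# Illustrative consumer sketch -- drains manager-backed lists into hypothetical replay buffers
def drain_data_bucket(data_bucket, replay_buffers):
    for agent_id, shared_list in enumerate(data_bucket):
        while len(shared_list) > 0:
            state, next_state, action, reward, done, global_reward, tag = shared_list.pop(0)
            replay_buffers[agent_id].add(state, next_state, action, reward, done, global_reward)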
Example #6
File: rover_viz.py  Project: wsg1873/MERL
# Initialize Policy weights
def weights_init_policy_fn(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        torch.nn.init.xavier_uniform_(m.weight, gain=0.5)
        torch.nn.init.constant_(m.bias, 0)


# Initialize Value Fn weights
def weights_init_value_fn(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        torch.nn.init.xavier_uniform_(m.weight, gain=1)
        torch.nn.init.constant_(m.bias, 0)


args = Parameters()
NUM_EVALS = 1
env = RoverDomainPython(args, NUM_EVALS)

path = 'nets/0_actor_pop20_roll50_envrover_tight_6_3_seed2023-reward'
buffer = torch.load(path)
net = MultiHeadActor(args.state_dim, args.action_dim, 100,
                     args.config.num_agents)
net.load_state_dict(buffer)
net.eval()

joint_state = env.reset()
joint_state = utils.to_tensor(np.array(joint_state))
fitness = [0 for _ in range(NUM_EVALS)]
local_fitness = [0 for _ in range(NUM_EVALS)]

while True:  #unless done
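    # Minimal sketch of the loop body, following the rollout pattern used in the
    # examples above (illustrative only -- not the original rover_viz.py code):
    joint_action = [net.clean_action(joint_state[i, :], head=i).detach().numpy()
                    for i in range(args.config.num_agents)]
    joint_action = np.array(joint_action).clip(-1.0, 1.0)
    next_state, reward, done, global_reward = env.step(joint_action)
    joint_state = utils.to_tensor(np.array(next_state))

    # Grab the global reward as fitness, as in the rollout workers above
    for i, grew in enumerate(global_reward):
        if grew is not None:
            fitness[i] = grew

    if sum(done) == len(done):
        env.render()
        break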