Example #1
import os
import json
# NOTE: `logger` below is assumed to come from the project's logging utility
# (a baselines-style logger module); its import is not shown in this snippet.

def main(args):
    render = args.render
    if not render:
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
    from utils.utils import TabularPolicy, TabularValueFun
    from part1.tabular_value_iteration import ValueIteration
    from envs import Grid1DEnv, GridWorldEnv
    envs = [GridWorldEnv(seed=0), GridWorldEnv(seed=1)]

    for env in envs:
        env_name = env.__name__
        exp_dir = os.getcwd() + '/data/part1/%s/policy_type%s_temperature%s/' % (env_name, args.policy_type, args.temperature)
        logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'])
        args_dict = vars(args)
        args_dict['env'] = env_name
        json.dump(vars(args), open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True)

        policy = TabularPolicy(env)
        value_fun = TabularValueFun(env)
        algo = ValueIteration(env,
                              value_fun,
                              policy,
                              policy_type=args.policy_type,
                              render=render,
                              temperature=args.temperature)
        algo.train()
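The snippet reads `render`, `policy_type`, and `temperature` off a pre-parsed `args` object but does not show the parser. A minimal sketch of a matching argparse entry point; the flag names mirror the attributes used above, while the defaults are assumptions:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--policy_type', type=str, default='deterministic')  # assumed default
    parser.add_argument('--temperature', type=float, default=1.0)            # assumed default
    main(parser.parse_args())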
Example #2
    # Fragment: the initializer of a tabular agent (cf. FriendQAgent in Example #7);
    # assumes `from config import get_config` and `from envs import GridWorldEnv`.
    def __init__(self):
        super().__init__()
        self.gamma = 0.9
        self.epsilon = 1.0
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.999995

        args = get_config()
        args.env_name = "EscalationGW"
        self.env = GridWorldEnv(args)
        self.no_iter = 200000
        self.alpha = 1.0
        self.alpha_min = 0.001
        self.alpha_decay = 0.999995
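For reference, `epsilon_decay = 0.999995` applied per step and floored at `epsilon_min` gives a slow multiplicative anneal; a minimal sketch of how that schedule behaves over the 200000 iterations configured above:

# Per-step multiplicative decay, clipped at the floor value.
epsilon, epsilon_min, epsilon_decay = 1.0, 0.001, 0.999995
for step in range(200000):
    epsilon = max(epsilon_min, epsilon * epsilon_decay)
# 0.999995 ** 200000 ~= exp(-1), so epsilon ends near 0.37 here.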
Example #3
def init_env():
    if args.env_name == "StagHunt":
        assert args.num_agents == 2, (
            "only 2 agents are supported, check config.py.")
        env = MGEnv(args)
    elif args.env_name == "StagHuntGW":
        assert args.num_agents == 2, (
            "only 2 agents are supported in single navigation, check config.py."
        )
        env = GridWorldEnv(args)
    else:
        print("Cannot support the " + args.env_name + " environment.")
        raise NotImplementedError
    env.seed(args.seed + rank * 1000)
    return env
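`args` and `rank` are free variables in `init_env`, which suggests it is the inner closure of a per-worker environment factory. A plausible enclosing factory for a subprocess-vectorized wrapper; the surrounding names (`make_env`, `SubprocVecEnv`, `n_rollout_threads`) are assumptions, not taken from this snippet:

def make_env(args, rank):
    def init_env():
        env = GridWorldEnv(args)           # env dispatch on args.env_name elided
        env.seed(args.seed + rank * 1000)  # distinct seed per worker process
        return env
    return init_env

# envs = SubprocVecEnv([make_env(args, rank) for rank in range(args.n_rollout_threads)])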
Example #4
# Imports assumed by this example; `get_config`, `MGEnv`, and `GridWorldEnv`
# follow Example #8's imports, while `utility_funcs` (used for gif assembly
# below) is a project module whose source is not shown.
import os
import shutil
import numpy as np
import torch
from pathlib import Path
from tensorboardX import SummaryWriter
from config import get_config
from envs import GridWorldEnv, MGEnv

def main():
    args = get_config()
    
    # cuda
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(1)
    else:
        device = torch.device("cpu")
        torch.set_num_threads(args.n_training_threads)
    
    run_dir = Path(args.model_dir) / ("run" + str(args.seed)) / 'eval_finetune'
    if os.path.exists(run_dir):
        shutil.rmtree(run_dir)
        os.mkdir(run_dir)
    log_dir = run_dir / 'logs'
    os.makedirs(str(log_dir))
    logger = SummaryWriter(str(log_dir))
    gifs_dir = run_dir / 'gifs'
    os.makedirs(str(gifs_dir))

    # env
    if args.env_name == "StagHunt":
        assert args.num_agents == 2, ("only 2 agents are supported, check config.py.")
        env = MGEnv(args)
    elif args.env_name == "StagHuntGW" or args.env_name == "EscalationGW":
        assert args.num_agents == 2, ("only 2 agents are supported in single navigation, check config.py.")
        env = GridWorldEnv(args)
    else:
        print("Cannot support the " + args.env_name + " environment.")
        raise NotImplementedError

    # Policy network
    actor_critic = []
    for i in range(args.num_agents):
        ac = torch.load(str(args.model_dir) + 'run' + str(args.seed) + "/finetune/models/agent%i_model" % i + ".pt")['model'].to(device)
        actor_critic.append(ac)
        
    coop_num = []
    defect_num = []
    coopdefect_num = []
    defectcoop_num = []
    gore1_num = []
    gore2_num = []
    hare1_num = []
    hare2_num = []
    collective_return = []
    apple_consumption = []
    waste_cleared = []
    sustainability = []
    fire = []
    eval_rewards = []
    frames = []
        
    for episode in range(args.eval_episodes):  
        print("Episode %i of %i" % (episode, args.eval_episodes))
        state = env.reset()	
        state = np.array([state])	
        
        share_obs = []	
        obs = []	
        recurrent_hidden_statess = []	
        recurrent_hidden_statess_critic = []	
        recurrent_c_statess = []	
        recurrent_c_statess_critic = []	
        masks = []	
        policy_reward = 0	

        # rollout	
        for i in range(args.num_agents):	
            if len(env.observation_space[0]) == 1:	
                share_obs.append((torch.tensor(state.reshape(1, -1),dtype=torch.float32)).to(device))	
                obs.append((torch.tensor(state[:,i,:],dtype=torch.float32)).to(device))	
            else:	
                raise NotImplementedError	
            recurrent_hidden_statess.append(torch.zeros(1, actor_critic[i].recurrent_hidden_state_size).to(device))	
            recurrent_hidden_statess_critic.append(torch.zeros(1, actor_critic[i].recurrent_hidden_state_size).to(device))	
            recurrent_c_statess.append(torch.zeros(1, actor_critic[i].recurrent_hidden_state_size).to(device))	
            recurrent_c_statess_critic.append(torch.zeros(1, actor_critic[i].recurrent_hidden_state_size).to(device))	
            masks.append(torch.ones(1,1).to(device))	
            
        frames_dir = str(gifs_dir) + '/episode%i/'%episode + 'frames/'
        for step in range(args.episode_length):	   
            print("step %i of %i" % (step, args.episode_length))	
            # Sample actions
            
            if not os.path.exists(frames_dir):
                os.makedirs(frames_dir)
                
            if args.save_gifs:
                frame = env.render(filename= frames_dir + str(step).zfill(6) + '.png')
                frames.append(frame)	
            	
            one_hot_actions = []	
            for i in range(args.num_agents):
                one_hot_action = np.zeros(env.action_space[0].n)	
                with torch.no_grad():	
                    value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic,recurrent_c_states, recurrent_c_states_critic = actor_critic[i].act(share_obs[i], obs[i], recurrent_hidden_statess[i], recurrent_hidden_statess_critic[i], recurrent_c_statess[i], recurrent_c_statess_critic[i], masks[i])	
                recurrent_hidden_statess[i].copy_(recurrent_hidden_states)	
                recurrent_hidden_statess_critic[i].copy_(recurrent_hidden_states_critic) 	
                recurrent_c_statess[i].copy_(recurrent_c_states)	
                recurrent_c_statess_critic[i].copy_(recurrent_c_states_critic)              	
                one_hot_action[action] = 1	
                one_hot_actions.append(one_hot_action)	

            # Observe reward and next obs
            state, reward, done, infos = env.step(one_hot_actions)
            if any(done):
                break	

            for i in range(args.num_agents):	
                print("Reward of agent%i: " %i + str(reward[i]))	
                policy_reward += reward[i]	
            state = np.array([state])	

            for i in range(args.num_agents):	
                if len(env.observation_space[0]) == 1:	
                    share_obs[i].copy_(torch.tensor(state.reshape(1, -1),dtype=torch.float32))	
                    obs[i].copy_(torch.tensor(state[:,i,:],dtype=torch.float32))	
                 
        eval_rewards.append(policy_reward)
        if args.save_gifs:
            utility_funcs.make_gif_from_image_dir(str(gifs_dir) + '/episode%i/'%episode, frames_dir, gif_name=args.env_name + '_trajectory')
        if args.env_name == "StagHunt":
            if 'coop&coop_num' in infos.keys():
                coop_num.append(infos['coop&coop_num'])
            if 'defect&defect_num' in infos.keys():
                defect_num.append(infos['defect&defect_num'])
            if 'coop&defect_num' in infos.keys():
                coopdefect_num.append(infos['coop&defect_num'])
            if 'defect&coop_num' in infos.keys():
                defectcoop_num.append(infos['defect&coop_num'])
                        
            logger.add_scalars('coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[episode]},
                    episode)
            logger.add_scalars('defect&defect_num_per_episode',
                    {'defect&defect_num_per_episode': defect_num[episode]},
                    episode)
            logger.add_scalars('coop&defect_num_per_episode',
                    {'coop&defect_num_per_episode': coopdefect_num[episode]},
                    episode)
            logger.add_scalars('defect&coop_num_per_episode',
                    {'defect&coop_num_per_episode': defectcoop_num[episode]},
                    episode)
            logger.add_scalars('collective_return',
                    {'collective_return': eval_rewards[episode]*10},
                    episode)
        elif args.env_name == "StagHuntGW":
            if 'collective_return' in infos.keys(): 
                collective_return.append(infos['collective_return']) 
            if 'coop&coop_num' in infos.keys():
                coop_num.append(infos['coop&coop_num'])
            if 'gore1_num' in infos.keys(): 
                gore1_num.append(infos['gore1_num']) 
            if 'gore2_num' in infos.keys():
                gore2_num.append(infos['gore2_num'])
            if 'hare1_num' in infos.keys(): 
                hare1_num.append(infos['hare1_num']) 
            if 'hare2_num' in infos.keys():
                hare2_num.append(infos['hare2_num'])

            logger.add_scalars('collective_return',
                {'collective_return': collective_return[episode]},
                episode)
            logger.add_scalars('coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[episode]},
                    episode)
            logger.add_scalars('gore1_num_per_episode',
                    {'gore1_num_per_episode': gore1_num[episode]},
                    episode)
            logger.add_scalars('gore2_num_per_episode',
                    {'gore2_num_per_episode': gore2_num[episode]},
                    episode)
            logger.add_scalars('hare1_num_per_episode',
                    {'hare1_num_per_episode': hare1_num[episode]},
                    episode)
            logger.add_scalars('hare2_num_per_episode',
                    {'hare2_num_per_episode': hare2_num[episode]},
                    episode)
                    
        elif args.env_name == "EscalationGW":
            if 'collective_return' in infos.keys(): 
                collective_return.append(infos['collective_return']) 
            if 'coop&coop_num' in infos.keys():
                coop_num.append(infos['coop&coop_num'])

            logger.add_scalars('collective_return',
                {'collective_return': collective_return[episode]},
                episode)
            logger.add_scalars('coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[episode]},
                    episode)
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()   
    r = np.mean(np.array(eval_rewards))
    print("Mean reward is %i" % r)
Example #5
# Imports assumed by this example; the project-specific names (Policy, PPO,
# RolloutStorage, make_parallel_env, update_linear_schedule, MGEnv,
# GridWorldEnv, multi_GridWorldEnv) come from modules not shown here.
import os
import time
import numpy as np
import torch
from pathlib import Path
from tensorboardX import SummaryWriter
from config import get_config

def main():
    args = get_config()

    # seed
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    # cuda
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(args.n_training_threads)
        if args.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(args.n_training_threads)

    # path
    model_dir = Path('./results') / args.env_name / args.algorithm_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    log_dir = run_dir / 'logs'
    save_dir = run_dir / 'models'
    os.makedirs(str(log_dir))
    os.makedirs(str(save_dir))
    logger = SummaryWriter(str(log_dir))

    # env
    envs = make_parallel_env(args)
    # Policy network
    actor_critic = []
    if args.share_policy:
        ac = Policy(envs.observation_space[0],
                    envs.action_space[0],
                    num_agents=args.num_agents,
                    base_kwargs={
                        'lstm': args.lstm,
                        'naive_recurrent': args.naive_recurrent_policy,
                        'recurrent': args.recurrent_policy,
                        'hidden_size': args.hidden_size
                    })
        ac.to(device)
        for agent_id in range(args.num_agents):
            actor_critic.append(ac)
    else:
        for agent_id in range(args.num_agents):
            ac = Policy(envs.observation_space[0],
                        envs.action_space[0],
                        num_agents=args.num_agents,
                        base_kwargs={
                            'naive_recurrent': args.naive_recurrent_policy,
                            'recurrent': args.recurrent_policy,
                            'hidden_size': args.hidden_size
                        })
            ac.to(device)
            actor_critic.append(ac)

    agents = []
    rollouts = []
    for agent_id in range(args.num_agents):
        # algorithm
        agent = PPO(actor_critic[agent_id],
                    agent_id,
                    args.clip_param,
                    args.ppo_epoch,
                    args.num_mini_batch,
                    args.data_chunk_length,
                    args.value_loss_coef,
                    args.entropy_coef,
                    logger,
                    lr=args.lr,
                    eps=args.eps,
                    max_grad_norm=args.max_grad_norm,
                    use_clipped_value_loss=args.use_clipped_value_loss)
        # replay buffer
        ro = RolloutStorage(args.num_agents, agent_id, args.episode_length,
                            args.n_rollout_threads,
                            envs.observation_space[agent_id],
                            envs.action_space[agent_id],
                            actor_critic[agent_id].recurrent_hidden_state_size)
        agents.append(agent)
        rollouts.append(ro)

    # reset env
    obs = envs.reset()
    # rollout
    for i in range(args.num_agents):
        rollouts[i].share_obs[0].copy_(
            torch.tensor(obs.reshape(args.n_rollout_threads, -1)))
        rollouts[i].obs[0].copy_(torch.tensor(obs[:, i, :]))
        rollouts[i].recurrent_hidden_states.zero_()
        rollouts[i].recurrent_hidden_states_critic.zero_()
        rollouts[i].recurrent_c_states.zero_()
        rollouts[i].recurrent_c_states_critic.zero_()
        rollouts[i].to(device)

    # run
    coop_num = []
    defect_num = []
    coopdefect_num = []
    defectcoop_num = []
    gore1_num = []
    gore2_num = []
    gore3_num = []
    hare1_num = []
    hare2_num = []
    hare3_num = []
    collective_return = []
    apple_consumption = []
    waste_cleared = []
    sustainability = []
    fire = []

    start = time.time()
    episodes = int(
        args.num_env_steps) // args.episode_length // args.n_rollout_threads
    all_episode = 0

    for episode in range(episodes):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            for i in range(args.num_agents):
                update_linear_schedule(agents[i].optimizer, episode, episodes,
                                       args.lr)

        for step in range(args.episode_length):
            # Sample actions
            values = []
            actions = []
            action_log_probs = []
            recurrent_hidden_statess = []
            recurrent_hidden_statess_critic = []
            recurrent_c_statess = []
            recurrent_c_statess_critic = []

            with torch.no_grad():
                for i in range(args.num_agents):
                    value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic,\
                        recurrent_c_states, recurrent_c_states_critic =\
                            actor_critic[i].act(rollouts[i].share_obs[step],
                                                        rollouts[i].obs[step],
                                                        rollouts[i].recurrent_hidden_states[step],
                                                        rollouts[i].recurrent_hidden_states_critic[step],
                                                        rollouts[i].recurrent_c_states[step],
                                                        rollouts[i].recurrent_c_states_critic[step],
                                                        rollouts[i].masks[step])
                    values.append(value)
                    actions.append(action)
                    action_log_probs.append(action_log_prob)
                    recurrent_hidden_statess.append(recurrent_hidden_states)
                    recurrent_hidden_statess_critic.append(
                        recurrent_hidden_states_critic)
                    recurrent_c_statess.append(recurrent_c_states)
                    recurrent_c_statess_critic.append(
                        recurrent_c_states_critic)

            # rearrange action
            actions_env = []
            for i in range(args.n_rollout_threads):
                one_hot_action_env = []
                for k in range(args.num_agents):
                    one_hot_action = np.zeros(envs.action_space[0].n)
                    one_hot_action[actions[k][i]] = 1
                    one_hot_action_env.append(one_hot_action)
                actions_env.append(one_hot_action_env)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(actions_env)

            # If done then clean the history of observations.
            # insert data in buffer
            masks = []
            bad_masks = []
            for i in range(args.num_agents):
                mask = []
                bad_mask = []
                for done_ in done:
                    if done_[i]:
                        mask.append([0.0])
                        bad_mask.append([1.0])
                    else:
                        mask.append([1.0])
                        bad_mask.append([1.0])
                masks.append(torch.FloatTensor(mask))
                bad_masks.append(torch.FloatTensor(bad_mask))

            for i in range(args.num_agents):
                rollouts[i].insert(
                    torch.tensor(obs.reshape(args.n_rollout_threads, -1)),
                    torch.tensor(obs[:, i, :]), recurrent_hidden_statess[i],
                    recurrent_hidden_statess_critic[i], recurrent_c_statess[i],
                    recurrent_c_statess_critic[i], actions[i],
                    action_log_probs[i], values[i],
                    torch.tensor(reward[:,
                                        i].reshape(-1,
                                                   1)), masks[i], bad_masks[i])

        with torch.no_grad():
            next_values = []
            for i in range(args.num_agents):
                next_value = actor_critic[i].get_value(
                    rollouts[i].share_obs[-1], rollouts[i].obs[-1],
                    rollouts[i].recurrent_hidden_states[-1],
                    rollouts[i].recurrent_hidden_states_critic[-1],
                    rollouts[i].recurrent_c_states[-1],
                    rollouts[i].recurrent_c_states_critic[-1],
                    rollouts[i].masks[-1]).detach()
                next_values.append(next_value)

        for i in range(args.num_agents):
            rollouts[i].compute_returns(next_values[i], args.use_gae,
                                        args.gamma, args.gae_lambda,
                                        args.use_proper_time_limits)

        # update the network
        value_losses = []
        action_losses = []
        dist_entropies = []
        for i in range(args.num_agents):
            value_loss, action_loss, dist_entropy = agents[i].update(
                rollouts[i])
            value_losses.append(value_loss)
            action_losses.append(action_loss)
            dist_entropies.append(dist_entropy)

        if args.env_name == "StagHunt":
            for info in infos:
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'defect&defect_num' in info.keys():
                    defect_num.append(info['defect&defect_num'])
                if 'coop&defect_num' in info.keys():
                    coopdefect_num.append(info['coop&defect_num'])
                if 'defect&coop_num' in info.keys():
                    defectcoop_num.append(info['defect&coop_num'])

            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'defect&defect_num_per_episode',
                    {'defect&defect_num_per_episode': defect_num[all_episode]},
                    all_episode)
                logger.add_scalars('coop&defect_num_per_episode', {
                    'coop&defect_num_per_episode':
                    coopdefect_num[all_episode]
                }, all_episode)
                logger.add_scalars('defect&coop_num_per_episode', {
                    'defect&coop_num_per_episode':
                    defectcoop_num[all_episode]
                }, all_episode)
                all_episode += 1
        elif args.env_name == "StagHuntGW":
            for info in infos:
                if 'collective_return' in info.keys():
                    collective_return.append(info['collective_return'])
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'gore1_num' in info.keys():
                    gore1_num.append(info['gore1_num'])
                if 'gore2_num' in info.keys():
                    gore2_num.append(info['gore2_num'])
                if 'hare1_num' in info.keys():
                    hare1_num.append(info['hare1_num'])
                if 'hare2_num' in info.keys():
                    hare2_num.append(info['hare2_num'])
            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'collective_return',
                    {'collective_return': collective_return[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore1_num_per_episode',
                    {'gore1_num_per_episode': gore1_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore2_num_per_episode',
                    {'gore2_num_per_episode': gore2_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare1_num_per_episode',
                    {'hare1_num_per_episode': hare1_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare2_num_per_episode',
                    {'hare2_num_per_episode': hare2_num[all_episode]},
                    all_episode)
                all_episode += 1
        elif args.env_name == "EscalationGW":
            for info in infos:
                if 'collective_return' in info.keys():
                    collective_return.append(info['collective_return'])
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'collective_return',
                    {'collective_return': collective_return[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                all_episode += 1
        elif args.env_name == "multi_StagHuntGW":
            for info in infos:
                if 'collective_return' in info.keys():
                    collective_return.append(info['collective_return'])
                if 'coop&coop_num' in info.keys():
                    coop_num.append(info['coop&coop_num'])
                if 'gore0_num' in info.keys():
                    gore1_num.append(info['gore0_num'])
                if 'gore1_num' in info.keys():
                    gore2_num.append(info['gore1_num'])
                if 'gore2_num' in info.keys():
                    gore3_num.append(info['gore2_num'])
                if 'hare0_num' in info.keys():
                    hare1_num.append(info['hare0_num'])
                if 'hare1_num' in info.keys():
                    hare2_num.append(info['hare1_num'])
                if 'hare2_num' in info.keys():
                    hare3_num.append(info['hare2_num'])
            for i in range(args.n_rollout_threads):
                logger.add_scalars(
                    'collective_return',
                    {'collective_return': collective_return[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore1_num_per_episode',
                    {'gore1_num_per_episode': gore1_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore2_num_per_episode',
                    {'gore2_num_per_episode': gore2_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'gore3_num_per_episode',
                    {'gore3_num_per_episode': gore3_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare1_num_per_episode',
                    {'hare1_num_per_episode': hare1_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare2_num_per_episode',
                    {'hare2_num_per_episode': hare2_num[all_episode]},
                    all_episode)
                logger.add_scalars(
                    'hare3_num_per_episode',
                    {'hare3_num_per_episode': hare3_num[all_episode]},
                    all_episode)
                all_episode += 1

        # clean the buffer and reset
        obs = envs.reset()
        for i in range(args.num_agents):
            rollouts[i].share_obs[0].copy_(
                torch.tensor(obs.reshape(args.n_rollout_threads, -1)))
            rollouts[i].obs[0].copy_(torch.tensor(obs[:, i, :]))
            rollouts[i].recurrent_hidden_states.zero_()
            rollouts[i].recurrent_hidden_states_critic.zero_()
            rollouts[i].recurrent_c_states.zero_()
            rollouts[i].recurrent_c_states_critic.zero_()
            rollouts[i].masks[0].copy_(torch.ones(args.n_rollout_threads, 1))
            rollouts[i].bad_masks[0].copy_(
                torch.ones(args.n_rollout_threads, 1))
            rollouts[i].to(device)

        for i in range(args.num_agents):
            # save for every interval-th episode or for the last epoch
            if (episode % args.save_interval == 0 or episode == episodes - 1):
                torch.save({'model': actor_critic[i]},
                           str(save_dir) + "/agent%i_model" % i + ".pt")

        # log information
        if episode % args.log_interval == 0:
            total_num_steps = (
                episode + 1) * args.episode_length * args.n_rollout_threads
            end = time.time()
            print(
                "\n Updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
                .format(episode, episodes, total_num_steps, args.num_env_steps,
                        int(total_num_steps / (end - start))))
            for i in range(args.num_agents):
                print("value loss of agent%i: " % i + str(value_losses[i]))
    logger.export_scalars_to_json(str(log_dir / 'summary.json'))
    logger.close()

    ###----------------------------------------------------------###
    ###----------------------------------------------------------###
    ###----------------------------------------------------------###
    if args.eval:
        eval_dir = run_dir / 'eval'
        log_dir = eval_dir / 'logs'
        os.makedirs(str(log_dir))
        logger = SummaryWriter(str(log_dir))

        # eval best policy
        eval_rewards = []
        # env
        if args.env_name == "StagHunt":
            assert args.num_agents == 2, (
                "only 2 agents are supported, check config.py.")
            env = MGEnv(args)
        elif args.env_name == "StagHuntGW" or args.env_name == "EscalationGW":
            assert args.num_agents == 2, (
                "only 2 agents are supported in single navigation, check config.py."
            )
            env = GridWorldEnv(args)
        elif args.env_name == "multi_StagHuntGW":
            env = multi_GridWorldEnv(args)
        else:
            print("Cannot support the " + args.env_name + " environment.")
            raise NotImplementedError

        # per-episode metric accumulators
        coop_num = []
        defect_num = []
        coopdefect_num = []
        defectcoop_num = []
        gore1_num = []
        gore2_num = []
        gore3_num = []
        hare1_num = []
        hare2_num = []
        hare3_num = []
        collective_return = []
        apple_consumption = []
        waste_cleared = []
        sustainability = []
        fire = []

        for episode in range(args.eval_episodes):
            print("Episode %i of %i" % (episode, args.eval_episodes))
            state = env.reset()
            state = np.array([state])

            share_obs = []
            obs = []
            recurrent_hidden_statess = []
            recurrent_hidden_statess_critic = []
            recurrent_c_statess = []
            recurrent_c_statess_critic = []
            masks = []
            policy_reward = 0

            # rollout
            for i in range(args.num_agents):
                share_obs.append(
                    (torch.tensor(state.reshape(1, -1),
                                  dtype=torch.float32)).to(device))
                obs.append((torch.tensor(state[:, i, :],
                                         dtype=torch.float32)).to(device))
                recurrent_hidden_statess.append(
                    torch.zeros(
                        1, actor_critic[i].recurrent_hidden_state_size).to(
                            device))
                recurrent_hidden_statess_critic.append(
                    torch.zeros(
                        1, actor_critic[i].recurrent_hidden_state_size).to(
                            device))
                recurrent_c_statess.append(
                    torch.zeros(
                        1, actor_critic[i].recurrent_hidden_state_size).to(
                            device))
                recurrent_c_statess_critic.append(
                    torch.zeros(
                        1, actor_critic[i].recurrent_hidden_state_size).to(
                            device))
                masks.append(torch.ones(1, 1).to(device))

            for step in range(args.episode_length):
                print("step %i of %i" % (step, args.episode_length))
                # Sample actions
                one_hot_actions = []
                for i in range(args.num_agents):
                    one_hot_action = np.zeros(env.action_space[0].n)
                    with torch.no_grad():
                        value, action, action_log_prob, recurrent_hidden_states, recurrent_hidden_states_critic, recurrent_c_states, recurrent_c_states_critic = actor_critic[
                            i].act(share_obs[i], obs[i],
                                   recurrent_hidden_statess[i],
                                   recurrent_hidden_statess_critic[i],
                                   recurrent_c_statess[i],
                                   recurrent_c_statess_critic[i], masks[i])
                    recurrent_hidden_statess[i].copy_(recurrent_hidden_states)
                    recurrent_hidden_statess_critic[i].copy_(
                        recurrent_hidden_states_critic)
                    recurrent_c_statess[i].copy_(recurrent_c_states)
                    recurrent_c_statess_critic[i].copy_(
                        recurrent_c_states_critic)
                    one_hot_action[action] = 1
                    one_hot_actions.append(one_hot_action)

                # Observe reward and next obs
                state, reward, done, infos = env.step(one_hot_actions)

                for i in range(args.num_agents):
                    print("Reward of agent%i: " % i + str(reward[i]))
                    policy_reward += reward[i]

                if all(done):
                    break

                state = np.array([state])

                for i in range(args.num_agents):
                    if len(env.observation_space[0]) == 1:
                        share_obs[i].copy_(
                            torch.tensor(state.reshape(1, -1),
                                         dtype=torch.float32))
                        obs[i].copy_(
                            torch.tensor(state[:, i, :], dtype=torch.float32))
                    elif len(env.observation_space[0]) == 3:
                        share_obs[i].copy_(
                            torch.tensor(state.reshape(
                                1, -1, env.observation_space[0][1],
                                env.observation_space[0][2]),
                                         dtype=torch.float32))
                        obs[i].copy_(
                            torch.tensor(state[:, i, :, :, :],
                                         dtype=torch.float32))

            eval_rewards.append(policy_reward)

            if args.env_name == "StagHunt":
                if 'coop&coop_num' in infos.keys():
                    coop_num.append(infos['coop&coop_num'])
                if 'defect&defect_num' in infos.keys():
                    defect_num.append(infos['defect&defect_num'])
                if 'coop&defect_num' in infos.keys():
                    coopdefect_num.append(infos['coop&defect_num'])
                if 'defect&coop_num' in infos.keys():
                    defectcoop_num.append(infos['defect&coop_num'])

                logger.add_scalars(
                    'coop&coop_num_per_episode',
                    {'coop&coop_num_per_episode': coop_num[episode]}, episode)
                logger.add_scalars(
                    'defect&defect_num_per_episode',
                    {'defect&defect_num_per_episode': defect_num[episode]},
                    episode)
                logger.add_scalars(
                    'coop&defect_num_per_episode',
                    {'coop&defect_num_per_episode': coopdefect_num[episode]},
                    episode)
                logger.add_scalars(
                    'defect&coop_num_per_episode',
                    {'defect&coop_num_per_episode': defectcoop_num[episode]},
                    episode)

            elif args.env_name == "StagHuntGW":
                if 'collective_return' in infos.keys():
                    collective_return.append(infos['collective_return'])
                    logger.add_scalars(
                        'collective_return',
                        {'collective_return': collective_return[episode]},
                        episode)
                if 'coop&coop_num' in infos.keys():
                    coop_num.append(infos['coop&coop_num'])
                    logger.add_scalars(
                        'coop&coop_num_per_episode',
                        {'coop&coop_num_per_episode': coop_num[episode]},
                        episode)
                if 'gore1_num' in infos.keys():
                    gore1_num.append(infos['gore1_num'])
                    logger.add_scalars(
                        'gore1_num_per_episode',
                        {'gore1_num_per_episode': gore1_num[episode]}, episode)
                if 'gore2_num' in infos.keys():
                    gore2_num.append(infos['gore2_num'])
                    logger.add_scalars(
                        'gore2_num_per_episode',
                        {'gore2_num_per_episode': gore2_num[episode]}, episode)
                if 'hare1_num' in infos.keys():
                    hare1_num.append(infos['hare1_num'])
                    logger.add_scalars(
                        'hare1_num_per_episode',
                        {'hare1_num_per_episode': hare1_num[episode]}, episode)
                if 'hare2_num' in infos.keys():
                    hare2_num.append(infos['hare2_num'])
                    logger.add_scalars(
                        'hare2_num_per_episode',
                        {'hare2_num_per_episode': hare2_num[episode]}, episode)
            elif args.env_name == "EscalationGW":
                if 'collective_return' in infos.keys():
                    collective_return.append(infos['collective_return'])
                    logger.add_scalars(
                        'collective_return',
                        {'collective_return': collective_return[episode]},
                        episode)
                if 'coop&coop_num' in infos.keys():
                    coop_num.append(infos['coop&coop_num'])
                    logger.add_scalars(
                        'coop&coop_num_per_episode',
                        {'coop&coop_num_per_episode': coop_num[episode]},
                        episode)
            elif args.env_name == "multi_StagHuntGW":
                if 'collective_return' in infos.keys():
                    collective_return.append(infos['collective_return'])
                    logger.add_scalars(
                        'collective_return',
                        {'collective_return': collective_return[episode]},
                        episode)
                if 'coop&coop_num' in infos.keys():
                    coop_num.append(infos['coop&coop_num'])
                    logger.add_scalars(
                        'coop&coop_num_per_episode',
                        {'coop&coop_num_per_episode': coop_num[episode]},
                        episode)
                if 'gore0_num' in infos.keys():
                    gore1_num.append(infos['gore0_num'])
                    logger.add_scalars(
                        'gore1_num_per_episode',
                        {'gore1_num_per_episode': gore1_num[episode]}, episode)
                if 'gore1_num' in infos.keys():
                    gore2_num.append(infos['gore1_num'])
                    logger.add_scalars(
                        'gore2_num_per_episode',
                        {'gore2_num_per_episode': gore2_num[episode]}, episode)
                if 'gore2_num' in infos.keys():
                    gore3_num.append(infos['gore2_num'])
                    logger.add_scalars(
                        'gore3_num_per_episode',
                        {'gore3_num_per_episode': gore3_num[episode]}, episode)
                if 'hare0_num' in infos.keys():
                    hare1_num.append(infos['hare0_num'])
                    logger.add_scalars(
                        'hare1_num_per_episode',
                        {'hare1_num_per_episode': hare1_num[episode]}, episode)
                if 'hare1_num' in infos.keys():
                    hare2_num.append(infos['hare1_num'])
                    logger.add_scalars(
                        'hare2_num_per_episode',
                        {'hare2_num_per_episode': hare2_num[episode]}, episode)
                if 'hare2_num' in infos.keys():
                    hare3_num.append(infos['hare2_num'])
                    logger.add_scalars(
                        'hare3_num_per_episode',
                        {'hare3_num_per_episode': hare3_num[episode]}, episode)
        logger.export_scalars_to_json(str(log_dir / 'summary.json'))
        logger.close()
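`make_parallel_env` is referenced at the top of `main` but never shown. A minimal sketch of what such a factory typically looks like in this codebase family; the `SubprocVecEnv` wrapper, its import, and the single-env dispatch are assumptions:

def make_parallel_env(args):
    def get_env_fn(rank):
        def init_env():
            env = GridWorldEnv(args)           # dispatch on args.env_name elided
            env.seed(args.seed + rank * 1000)  # distinct seed per worker
            return env
        return init_env
    # Runs each env in its own process; the wrapper class is a guess.
    return SubprocVecEnv([get_env_fn(i) for i in range(args.n_rollout_threads)])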
Example #6
# Imports assumed by this fragment (which is the tail of a ReinforcementLearning
# method followed by the script entry point):
import pandas as pd
from datetime import datetime
from config import get_config
from envs import GridWorldEnv

        # save statistics in dataframe
        total_reward_episodes_df = pd.DataFrame(total_reward_episodes, columns=['episode', 'reward1', 'reward2', 'length', 'coop', 'totalreward', 'epsilon'])

        return Q1, Q2, policy1, policy2, total_reward_episodes_df


if __name__ == '__main__':

    # Init the environment
    args = get_config()
    print("Using Environment: {}".format(args.env_name))
    NUM_EPISODE = args.num_episodes

    # We only consider the two-agent case.
    assert args.num_agents == 2
    env = GridWorldEnv(args)
    print("Number of Agents: {}".format(env.num_agents))

    rl = ReinforcementLearning(env=env)

    now_time = datetime.now()
    time_name = str(now_time.month)+"_"+str(now_time.day)+"_"+str(now_time.hour)+"_"+str(now_time.minute)

    epsilon_set = [0.0001]
    alpha_set = [0.1]
    beta_set = [0.01]
    repeat_num = 1

    if args.algorithm_name == 'Q':
        print("Using algorithm Q-Learning ...")
        # Test Q-Learning
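The `epsilon_set`, `alpha_set`, and `beta_set` lists suggest the truncated remainder of this script sweeps hyperparameters over `repeat_num` repeats. A minimal sketch of such a grid loop; the `rl.q_learning` call and its signature are hypothetical, since the actual method is cut off:

for epsilon in epsilon_set:
    for alpha in alpha_set:
        for beta in beta_set:
            for repeat in range(repeat_num):
                # Hypothetical call -- the real method name/arguments are not shown.
                results = rl.q_learning(num_episodes=NUM_EPISODE, epsilon=epsilon,
                                        alpha=alpha, beta=beta)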
Example #7
# Imports assumed by this example; `plot_error` is a project plotting helper
# whose source is not shown.
import numpy as np
from config import get_config
from envs import GridWorldEnv

class FriendQAgent(object):
    def __init__(self):
        super().__init__()
        self.gamma = 0.9
        self.epsilon = 1.0
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.999995

        args = get_config()
        args.env_name = "EscalationGW"
        self.env = GridWorldEnv(args)
        self.no_iter = 200000
        self.alpha = 1.0
        self.alpha_min = 0.001
        self.alpha_decay = 0.999995
        # stateSpace = self.env.observation_space
        # actSpace = self.env.action_space
        # dimOfQ = np.concatenate((stateSpace, actSpace))
        # self.Q = np.ones(dimOfQ)

    def state_encode(self, pos):
        return pos[0] * self.env.length + pos[1]

    def act(self, Q, state, epsilon):
        state_a = self.state_encode(state[0:2])
        state_b = self.state_encode(state[2:4])
        state_esc = self.state_encode(state[4:6])
        temp = np.random.random()
        if temp < epsilon:
            # The Q-tables allot 5 actions (0-4, incl. STAY); the original
            # sampled randint(0, 4), which never explores action 4.
            action = np.random.randint(0, 5)
        else:
            actions = np.where(
                Q[state_a, state_b,
                  state_esc, :, :] == np.max(Q[state_a, state_b,
                                               state_esc, :, :]))
            action = actions[0][np.random.choice(range(len(actions[0])), 1)[0]]
        return action

    def learn(self):
        errors = []
        n_states = self.env.length * self.env.length
        Q_a = np.zeros((n_states, n_states, n_states, 5, 5))
        Q_b = np.zeros((n_states, n_states, n_states, 5, 5))
        epsilon = self.epsilon
        alpha = self.alpha
        gamma = self.gamma

        i = 0
        while i < self.no_iter:
            states_a, states_b = self.env.reset()
            state = [
                self.state_encode(states_a[0:2]),
                self.state_encode(states_a[2:4]),
                self.state_encode(states_a[4:6])
            ]
            while True:
                if i % 10000 == 1:
                    print(str(errors[-1]))
                before_value = Q_a[2][1][1][2][4]
                actions = [
                    self.act(Q_a, states_a, epsilon),
                    self.act(Q_b, states_b, epsilon)
                ]
                states_new, rewards, done, infos = self.env.step(actions)
                states_a_new, states_b_new = states_new
                state_new = [
                    self.state_encode(states_a_new[0:2]),
                    self.state_encode(states_a_new[2:4]),
                    self.state_encode(states_a_new[4:6])
                ]
                i += 1
                if done:
                    # Terminal update: no bootstrap term.
                    Q_a[state[0], state[1], state[2], actions[0], actions[1]] += \
                        alpha * (rewards[0] -
                                 Q_a[state[0], state[1], state[2], actions[0], actions[1]])
                    Q_b[state[0], state[1], state[2], actions[1], actions[0]] += \
                        alpha * (rewards[1] -
                                 Q_b[state[0], state[1], state[2], actions[1], actions[0]])
                    after_value = Q_a[2][1][1][2][4]
                    errors.append(abs(before_value - after_value))
                    break
                else:
                    # Friend-Q update: bootstrap with the max over the joint action space.
                    Q_a[state[0], state[1], state[2], actions[0], actions[1]] += \
                        alpha * (rewards[0] +
                                 gamma * np.max(Q_a[state_new[0], state_new[1], state_new[2], :, :]) -
                                 Q_a[state[0], state[1], state[2], actions[0], actions[1]])
                    Q_b[state[0], state[1], state[2], actions[1], actions[0]] += \
                        alpha * (rewards[1] +
                                 gamma * np.max(Q_b[state_new[0], state_new[1], state_new[2], :, :]) -
                                 Q_b[state[0], state[1], state[2], actions[1], actions[0]])
                    after_value = Q_a[2][1][1][2][4]
                    errors.append(abs(before_value - after_value))
                    state = state_new
                epsilon *= self.epsilon_decay
                epsilon = max(self.epsilon_min, epsilon)
                # alpha *= self.alpha_decay
                # alpha = max(self.alpha_min, alpha)
                alpha = 1 / (i / self.alpha_min / self.no_iter + 1)
        plot_error(errors, "friend_q_learning_3_2")
        return
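For reference, the update implemented in `learn` is the Friend-Q rule (Littman's friend-or-foe Q-learning with the other agent treated as a friend): each agent bootstraps from the best value achievable over the joint action space,

    Q_i(s, a_i, a_j) <- Q_i(s, a_i, a_j) + alpha * (r_i + gamma * max over (a_i', a_j') of Q_i(s', a_i', a_j') - Q_i(s, a_i, a_j))

which is exactly what the `np.max(Q[state_new[0], state_new[1], state_new[2], :, :])` term computes above.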
Example #8
import os
import time
import numpy as np
import matplotlib.pyplot as plt
from envs import GridWorldEnv, MGEnv, Agent
from config import get_config

Reward_returned = []
Terminated_returned = []
Info_returned = []

args = get_config()
print(args.env_name)

assert args.num_agents == 2, ("only 2 agents are supported")
env = GridWorldEnv(args)
'''Definition of actions: see AGENT_ACTIONS below.'''
# Five actions in total (0-4), each represented as a one-hot array,
# e.g. action 1 ---------------> [0, 1, 0, 0, 0]
'''
AGENT_ACTIONS = {
    0: 'MOVE_LEFT',         # Move left
    1: 'MOVE_RIGHT',        # Move right
    2: 'MOVE_UP',           # Move up
    3: 'MOVE_DOWN',         # Move down
    4: 'STAY'               # don't move
}

ACTIONS = {
    'MOVE_LEFT': [0, -1],   # Move left
    'MOVE_RIGHT': [0, 1],   # Move right
    'MOVE_UP': [-1, 0],     # Move up
    'MOVE_DOWN': [1, 0],    # Move down
    'STAY': [0, 0]          # don't move
}
'''
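The module-level `Reward_returned`, `Terminated_returned`, and `Info_returned` lists suggest the truncated remainder of this script steps the environment and records the outputs. A minimal sketch of such a rollout under that assumption, using the one-hot action encoding described above (the horizon and action choice are arbitrary):

state = env.reset()
for step in range(100):
    # Both agents take the one-hot 'STAY' action (index 4) for illustration.
    actions = [np.array([0, 0, 0, 0, 1]) for _ in range(args.num_agents)]
    state, reward, done, info = env.step(actions)
    Reward_returned.append(reward)
    Terminated_returned.append(done)
    Info_returned.append(info)
    if all(done):
        break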