Example #1
    def __init__(self, env, args):
        self.env = env
        self.args = args

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.qmix_pg_learner = QMIX_PG(self.agents, args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these 3 algorithms are on-policy
            self.actor_critic_buffer = ReplayBuffer(args, args.buffer_size)
            # self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Used to save the plt figures and pkl files
        tmp = f'clamp2-5_rewardscale10_' + f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \
                                           f'{args.actor_update_delay}_{args.critic_lr}_{args.n_epoch}_{args.temp}'  # f'clamp2-5_'+ rewardscale10_
        self.save_path = self.args.result_dir + '/linear_mix/' + 'mcsac' + '/' + tmp + '/' + args.map  # _gradclip0.5

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example #2
    def __init__(self, env, args):
        self.env = env

        # Used to evaluate the algorithm in a sparse-reward environment: win = 1, defeat = -1, every other ordinary step = 0
        '''
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        '''
        self.env_evaluate = MeetEnv()

        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
        if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Used to save the plt figures and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example #3
    def __init__(self, env, args):
        self.env = env

        # Used to evaluate the algorithm in a sparse-reward environment: win = 1, defeat = -1, every other ordinary step = 0
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)

        if args.alg == 'commnet_coma':
            self.agents = CommNetAgents(args)
            self.rolloutWorker = CommNetRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommNetRolloutWorker(self.env_evaluate,
                                                       self.agents, args)
        else:
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                                args)
        if args.alg != 'coma' and args.alg != 'commnet_coma':
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Used to save the plt figures and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example #4
    def __init__(self, env, args):
        self.env = env

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these 3 algorithms are on-policy
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Used to save the plt figures and pkl files
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(
            args.n_agents) + '_' + str(args.map_size) + '_' + args.name_time
Example #5
class PlayerTrainer(object):
    def __init__(self,actor,critic,buffersize,game,player,batch_size,gamma):
        self.actor = actor
        self.critic = critic
        self.replay = ReplayBuffer(buffersize)
        self.game =game
        self.player = player
        self.batch_size = batch_size
        self.gamma = gamma


    def noisyMaxQMove(self):
        state = self.game.space
        As = self.actor.predict(np.reshape(state, (1, *state.shape)))
        avail = self.game.avail()
        availQ = {}
        availP = []
        for k in avail:
            availQ[k] = As[0][k]
            availP.append(As[0][k])
        # if sum(availP)> 0:
        availP = np.array(availP)

        availP = [round(i, 5) if i >= 0 else (-.001 * round(i, 5)) for i in availP]
        availNorm = [i / sum(availP) for i in availP]

        a = np.random.choice(avail, p=availNorm)

        self.game.move(a,self.player)
        next_state, reward = self.game.step(self.player)

        self.bufferAdd(state,As,reward,self.game.game_over,next_state)
        if self.replay.size() > self.batch_size:
            s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay.sample_batch(self.batch_size)
            target_q = self.critic.predict_target(s2_batch,self.actor.predict_target(s2_batch))
            y_i = []
            for k in range(self.batch_size):
                if t_batch[k]:
                    y_i.append(r_batch[k])
                else:
                    y_i.append(r_batch[k] + self.gamma * target_q[k])

            predicted_q_value, _ = self.critic.train(
                s_batch, a_batch, np.reshape(y_i, (self.batch_size, 1)))

            #ep_ave_max_q += np.amax(predicted_q_value)

            # Update the actor policy using the sampled gradient
            a_outs = self.actor.predict(s_batch)
            grads = self.critic.action_gradients(s_batch, a_outs)
            self.actor.train(s_batch, grads[0])

            # Update target networks
            self.actor.update_target_network()
            self.critic.update_target_network()
        return self.game.space , reward

    def bufferAdd(self,state,Qs,reward,terminal,next_state):
        self.replay.add(np.reshape(state,(self.actor.s_dim,)),np.reshape(Qs,(self.actor.a_dim,)),reward,terminal,np.reshape(next_state,(self.actor.s_dim,)))
Example #6
 def __init__(self,actor,critic,buffersize,game,player,batch_size,gamma):
     self.actor = actor
     self.critic = critic
     self.replay = ReplayBuffer(buffersize)
     self.game =game
     self.player = player
     self.batch_size = batch_size
     self.gamma = gamma
Example #7
    def test_random_sampling(self):
        rb = ReplayBuffer(3)
        rb.add(Transitions[0]).add(Transitions[1]).add(Transitions[1]).add(
            Transitions[2])

        samples = rb.sample(100)
        n_1, n_2 = 0, 0
        for sample in samples:
            if sample == Transitions[1]:
                n_1 += 1
            elif sample == Transitions[2]:
                n_2 += 1
            else:
                pytest.fail()

        assert n_1 > n_2
Example #8
    def __init__(self, env, gamma=0.99, tau=0.005, hidden_size=256, device=None):
        super(NAF, self).__init__(env, device=device)
        self.action_space = self.act_dim
        self.num_inputs = self.obs_dim
        num_inputs = self.obs_dim
        action_space = self.act_dim
        self.model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.target_model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.optimizer = Adam(self.model.parameters(), lr=1e-3)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim)
        self.c_loss, self.a_loss = [], []
        self.gamma = gamma
        self.tau = tau

        hard_update(self.target_model, self.model)
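The hard_update and soft_update helpers used here (and in the full NAF class in Example #19 below) are not shown in the snippet. A minimal sketch of the usual definitions, assuming both networks share the same architecture:

import torch

def hard_update(target: torch.nn.Module, source: torch.nn.Module) -> None:
    # Copy every parameter of `source` into `target` (typically done once at initialization).
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(t_param.data * (1.0 - tau) + s_param.data * tau)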
Example #9
    def __init__(self, env, args):
        self.env = env

        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Used to save the plt figures and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example #10
    def __init__(self, env, args, itr, seed):
        # Set the random seed
        if seed is not None:
            self.setup_seed(seed)
        self.args = args

        # Get the environment
        self.env = env
        # Process index
        self.pid = itr

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Here, episode_reward is the cumulative reward of a single episode,
        episodes_reward is the cumulative rewards of multiple episodes,
        and episodes_rewards is the cumulative rewards of multiple episodes over several evaluations
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0
        self.time_steps = 0

        # Where results and models are saved; the index makes it easy to run several instances at once
        alg_dir = self.args.alg + '_' + str(self.args.epsilon_anneal_steps // 10000) + 'w' + '_' + \
                  str(self.args.target_update_period)
        self.alg_tag = '_' + self.args.optim

        if self.args.her:
            self.alg_tag += str(self.args.her)
            alg_dir += '_her=' + str(self.args.her)

        # self.save_path = self.args.result_dir + '/' + alg_dir + '/' + self.args.map + '/' + itr
        self.save_path = self.args.result_dir + '/' + self.args.map + '/' + alg_dir + '/' + itr
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.args.model_dir = args.model_dir + '/' + args.map + '/' + alg_dir + '/' + itr

        self.agents = Agents(args, itr=itr)
        print('step runner initialized')
        if self.args.her:
            print('using HER')
Example #11
    def test_circular_buffer(self):
        rb = ReplayBuffer(4)
        rb.add(Transitions[0])
        rb.add(Transitions[1])
        rb.add(Transitions[2])
        rb.add(Transitions[3])
        rb.add(Transitions[4])
        rb.add(Transitions[5])

        assert (rb._storage == [
            Transitions[4], Transitions[5], Transitions[2], Transitions[3]
        ]).all()
Example #12
    def __init__(self, env, args):
        self.env = env
        self.agents = Agents(args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        self.buffer = ReplayBuffer(args)
        self.args = args
        self.epsilon = args.epsilon

        # Used to evaluate the algorithm in a sparse-reward environment: win = 1, defeat = -1, every other ordinary step = 0
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents,
                                            args)
Example #13
    def __init__(self, env, args):
        self.env = env
        self.args = args

        self.agents = Agents(args)
        self.qmix_pg_learner = QMIX_PG(self.agents, args)
        self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find(
                'reinforce') == -1:  # these 3 algorithms are on-policy
            self.critic_buffer = ReplayBuffer(args, args.critic_buffer_size)
            self.actor_buffer = ReplayBuffer(args, args.actor_buffer_size)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        tmp = f'clamp2-5_' + f'{args.loss_coeff_entropy}_' + f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \
                                                             f'{args.actor_update_delay}_{args.critic_lr}'  # f'clamp2-5_'+  anneal_epsilon
        self.save_path = self.args.result_dir + '/linear_mix/' + 'qmix_ac_total_cf' + '/' + tmp + '/' + args.map

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example #14
 def test_len(self):
     rb = ReplayBuffer(5)
     rb.add(Transitions[0]).add(Transitions[1]).add(Transitions[2])
     assert len(rb) == 3
     for i in range(8):
         rb.add(Transitions[i])
     assert len(rb) == 5
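The three tests above (Examples #7, #11 and #14) pin down the expected buffer behaviour: add returns the buffer so calls can be chained, storage is a fixed-capacity array that wraps around, len is capped at the capacity, and sampling is uniform with replacement. A minimal sketch consistent with those tests (the NumPy object-array storage and attribute names are assumptions, not the tested implementation):

import numpy as np

class ReplayBuffer:
    def __init__(self, capacity):
        self._capacity = capacity
        self._storage = np.empty(capacity, dtype=object)
        self._count = 0  # total number of transitions ever added

    def add(self, transition):
        # Overwrite the oldest slot once the buffer is full; return self so calls can be chained.
        self._storage[self._count % self._capacity] = transition
        self._count += 1
        return self

    def __len__(self):
        return min(self._count, self._capacity)

    def sample(self, n):
        # Uniform sampling with replacement over the filled part of the buffer.
        indices = np.random.randint(0, len(self), size=n)
        return [self._storage[i] for i in indices]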
Example #15
    def __init__(self, env, args):
        self.env = env

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args
        self.plt_success = []
        self.episode_rewards = []

        # Used to save the plt figures and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.env_name
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
Example #16
class DQN(Trainer):

    def __init__(self, parameters):
        super(DQN, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            batch_size)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        q_values = self.current_model(state)
        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        next_q_values = self.current_model(next_state)
        next_q_state_values = self.target_model(next_state)
        next_q_value = next_q_state_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)

        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        loss = (q_value - Variable(expected_q_value.data)).abs()
        loss[loss.le(1)] = loss[loss.le(1)].pow(2)
        loss[loss.gt(1)] = 1 #(loss[loss.gt(1)] + 1) / 2
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss
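The piecewise loss at the end of compute_td_loss squares TD errors whose magnitude is at most 1 and clips larger ones to a constant 1, so they contribute no gradient. A compact equivalent using torch.where, shown only as a standalone sketch:

import torch

def clipped_td_loss(q_value: torch.Tensor, expected_q_value: torch.Tensor) -> torch.Tensor:
    # Squared error for |delta| <= 1, constant 1 (zero gradient) beyond that.
    delta = (q_value - expected_q_value.detach()).abs()
    return torch.where(delta <= 1, delta.pow(2), torch.ones_like(delta)).mean()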
Example #17
    def __init__(self, env, args, itr):
        # Get parameters
        # self.args = get_common_args()
        self.args = args

        # Get the environment
        self.env = env
        # Process index
        self.pid = itr

        self.agents = Agents(args, itr=itr)
        # If the network is not reused there are multiple agents; sharing parameters during training means a single network
        # if not self.args.reuse_network:
        #     self.agents = []
        #     for i in range(self.args.n_agents):
        #         self.agents.append(Agents(self.args, i))

        # self.rollout = RollOut(self.agents, self.args)

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Here, episode_reward is the cumulative reward of a single episode,
        episodes_reward is the cumulative rewards of multiple episodes,
        and episodes_rewards is the cumulative rewards of multiple episodes over several evaluations
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0

        # Where results and models are saved; the index makes it easy to run several instances at once
        self.save_path = self.args.result_dir + '/' + self.args.alg + '/' + self.args.map + '/' + str(
            itr)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        print('runner initialized')
Example #18
    def __init__(self, config):
        self.writer = SummaryWriter() 
        self.device = 'cuda' if T.cuda.is_available() else 'cpu'

        self.dqn_type = config["dqn-type"]
        self.run_title = config["run-title"]
        self.env = gym.make(config["environment"])

        self.num_states  = np.prod(self.env.observation_space.shape)
        self.num_actions = self.env.action_space.n

        layers = [
            self.num_states, 
            *config["architecture"], 
            self.num_actions
        ]

        self.policy_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net = Q_Network(self.dqn_type, layers).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        capacity = config["max-experiences"]
        self.p_replay_eps = config["p-eps"]
        self.prioritized_replay = config["prioritized-replay"]
        self.replay_buffer = PrioritizedReplayBuffer(capacity, config["p-alpha"]) if self.prioritized_replay \
                        else ReplayBuffer(capacity)

        self.beta_scheduler = LinearSchedule(config["episodes"], initial_p=config["p-beta-init"], final_p=1.0)
        self.epsilon_decay = lambda e: max(config["epsilon-min"], e * config["epsilon-decay"])

        self.train_freq = config["train-freq"]
        self.use_soft_update = config["use-soft-update"]
        self.target_update = config["target-update"]
        self.tau = config["tau"]
        self.gamma = config["gamma"]
        self.batch_size = config["batch-size"]
        self.time_step = 0

        self.optim = T.optim.AdamW(self.policy_net.parameters(), lr=config["lr-init"], weight_decay=config["weight-decay"])
        self.lr_scheduler = T.optim.lr_scheduler.StepLR(self.optim, step_size=config["lr-step"], gamma=config["lr-gamma"])
        self.criterion = nn.SmoothL1Loss(reduction="none") # Huber Loss
        self.min_experiences = max(config["min-experiences"], config["batch-size"])

        self.save_path = config["save-path"]
Example #19
class NAF(BaseAgent):

    def __init__(self, env, gamma=0.99, tau=0.005, hidden_size=256, device=None):
        super(NAF, self).__init__(env, device=device)
        self.action_space = self.act_dim
        self.num_inputs = self.obs_dim
        num_inputs = self.obs_dim
        action_space = self.act_dim
        self.model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.target_model = Policy(hidden_size, num_inputs, action_space).to(self.device)
        self.optimizer = Adam(self.model.parameters(), lr=1e-3)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim)
        self.c_loss, self.a_loss = [], []
        self.gamma = gamma
        self.tau = tau

        hard_update(self.target_model, self.model)

    def act(self, state, action_noise=None, param_noise=None):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        state = state.reshape(1, -1)
        self.model.eval()
        mu, _, _ = self.model((Variable(state), None))
        self.model.train()
        mu = mu.data
        if action_noise is not None:
            mu += torch.Tensor(action_noise.noise())

        return mu.clamp(-1, 1).cpu().data.numpy().flatten()

    def train(self):
        #state_batch = Variable(torch.cat(batch.state))
        #action_batch = Variable(torch.cat(batch.action))
        #reward_batch = Variable(torch.cat(batch.reward))
        #mask_batch = Variable(torch.cat(batch.mask))
        #next_state_batch = Variable(torch.cat(batch.next_state))

        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.replay_buffer.sample(128)
        _, _, next_state_values = self.target_model((next_state_batch, None))

        reward_batch = reward_batch.unsqueeze(1)
        mask_batch = mask_batch.unsqueeze(1)
        expected_state_action_values = reward_batch + (self.gamma * (1 - mask_batch) * next_state_values)

        _, state_action_values, _ = self.model((state_batch, action_batch))

        loss = MSELoss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
        self.optimizer.step()

        soft_update(self.target_model, self.model, self.tau)

        return loss.item(), 0

    def step(self, t):
        c, a = self.train()
        self.c_loss.append(c)
        self.a_loss.append(a)
        if t % 5000 == 0:
            # self.evaluate(self.env)
            print(f'Iteration {t}: Critic Loss: {np.mean(self.c_loss)}, Actor Loss: {np.mean(self.a_loss) * 2}')
            self.c_loss, self.a_loss = [], []
        self.episode_timesteps += 1

    def save_model(self, env_name, suffix="", model_path=None):
        if not os.path.exists('models/'):
            os.makedirs('models/')

        if model_path is None:
            model_path = "models/naf_{}_{}".format(env_name, suffix)
        print('Saving model to {}'.format(model_path))
        torch.save(self.model.state_dict(), model_path)

    def load_model(self, model_path):
        print('Loading model from {}'.format(model_path))
        self.model.load_state_dict(torch.load(model_path))
Example #20
    quant_idx = quant_idx.cpu().data
    batch_idx = np.arange(batch_size)
    tau = tau_hat[:, quant_idx][batch_idx, batch_idx]
        
    return tau, expected_quant

num_quant = 51
Vmin = -10
Vmax = 10

current_model = QRDQN(env.observation_space.shape[0], env.action_space.n, num_quant)
target_model  = QRDQN(env.observation_space.shape[0], env.action_space.n, num_quant)
    
optimizer = optim.Adam(current_model.parameters())

replay_buffer = ReplayBuffer(10000)

def update_target(current_model, target_model):
    target_model.load_state_dict(current_model.state_dict())
    
update_target(current_model, target_model)

def compute_td_loss(batch_size):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size) 

    state      = autograd.Variable(torch.FloatTensor(np.float32(state)))
    next_state = autograd.Variable(torch.FloatTensor(np.float32(next_state)), volatile=True)
    action     = autograd.Variable(torch.LongTensor(action))
    reward     = torch.FloatTensor(reward)
    done       = torch.FloatTensor(np.float32(done))
Example #21
])

# Use soft updates to update the target networks
target_update = tf.group([
    tf.assign(v_targ, DECAY * v_targ + (1 - DECAY) * v_main)
    for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
])

init = tf.global_variables_initializer()
session = tf.Session()
session.run(init)
session.run(target_init)

#%% Replay Buffer

replay_buffer = ReplayBuffer(observation_shape=env.observation_space.shape,
                             action_shape=(1, ))


# %% Play
def sample_action(env, observation, epsilon):
    if np.random.random() < epsilon:
        return env.action_space.sample()
    else:
        q_s_a = session.run(q, feed_dict={x: np.atleast_2d(observation)})[0]
        return np.argmax(q_s_a)


def play_once(env, epsilon, render=False):
    observation = env.reset()
    done = False
    steps = 0
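The target-update snippet above relies on a get_vars(scope) helper that is not shown. A common TF1-style definition, assuming the online and target networks are built under variable scopes named 'main' and 'target':

import tensorflow as tf  # TF1 graph mode, matching the snippet above

def get_vars(scope):
    # Collect every global variable whose name falls under the given scope prefix.
    return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope)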
Example #22
def learn(env,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16):
    torch.set_num_threads(num_cpu)
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)
    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = player_relative

    obs, xy_per_marine = common.init(env, obs)

    group_id = 0
    reset = True
    dqn = DQN(num_actions, lr, cuda)

    print('\nCollecting experience...')
    checkpoint_path = 'models/deepq/checkpoint.pth.tar'
    if os.path.exists(checkpoint_path):
        dqn, saved_mean_reward = load_checkpoint(dqn, cuda, filename=checkpoint_path)
    for t in range(max_timesteps):
        # Take action and update exploration to the newest value
        # custom process for DefeatZerglingsAndBanelings
        obs, screen, player = common.select_marine(env, obs)
        # action = act(
        #     np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
        action = dqn.choose_action(np.array(screen)[None])
        reset = False
        rew = 0
        new_action = None
        obs, new_action = common.marine_action(env, obs, player, action)
        army_count = env._obs[0].observation.player_common.army_count
        try:
            if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
                obs = env.step(actions=new_action)
            else:
                new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                obs = env.step(actions=new_action)
        except Exception as e:
            # print(e)
            pass  # Do nothing
        player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
        new_screen = player_relative
        rew += obs[0].reward
        done = obs[0].step_type == environment.StepType.LAST
        selected = obs[0].observation["screen"][_SELECTED]
        player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
        if len(player_y) > 0:
            player = [int(player_x.mean()), int(player_y.mean())]
        if len(player) == 2:
            if player[0] > 32:
                new_screen = common.shift(LEFT, player[0] - 32, new_screen)
            elif player[0] < 32:
                new_screen = common.shift(RIGHT, 32 - player[0],
                                          new_screen)
            if player[1] > 32:
                new_screen = common.shift(UP, player[1] - 32, new_screen)
            elif player[1] < 32:
                new_screen = common.shift(DOWN, 32 - player[1], new_screen)
        # Store transition in the replay buffer.
        replay_buffer.add(screen, action, rew, new_screen, float(done))
        screen = new_screen
        episode_rewards[-1] += rew
        reward = episode_rewards[-1]
        if done:
            print("Episode Reward : %s" % episode_rewards[-1])
            obs = env.reset()
            player_relative = obs[0].observation["screen"][
                _PLAYER_RELATIVE]
            screen = player_relative
            group_list = common.init(env, obs)
            # Select all marines first
            # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
            episode_rewards.append(0.0)
            reset = True

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(
                    batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = dqn.learn(obses_t, actions, rewards, obses_tp1, gamma, batch_size)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes,
                                                new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            dqn.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("reward", reward)
            logger.record_tabular("mean 100 episode reward",
                                  mean_100ep_reward)
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()

        if (checkpoint_freq is not None and t > learning_starts
                and num_episodes > 100 and t % checkpoint_freq == 0):
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                if print_freq is not None:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward,
                            mean_100ep_reward))
                save_checkpoint({
                    'epoch': t + 1,
                    'state_dict': dqn.save_state_dict(),
                    'best_accuracy': mean_100ep_reward
                }, checkpoint_path)
                saved_mean_reward = mean_100ep_reward
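Both this example and Example #18 anneal a quantity with a LinearSchedule that exposes a value(t) method. A minimal sketch matching the constructor arguments used above (a plain re-implementation for illustration, not the library version):

class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Linearly interpolate from initial_p to final_p over schedule_timesteps, then stay at final_p.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)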
Example #23
 def __init__(self, parameters):
     super(Rainbow, self).__init__(parameters)
     self.replay_buffer = ReplayBuffer(self.buffersize)
Example #24
class Rainbow(Trainer):
    def __init__(self, parameters):
        super(Rainbow, self).__init__(parameters)
        self.replay_buffer = ReplayBuffer(self.buffersize)

    def push_to_buffer(self, state, action, reward, next_state, done):
        self.replay_buffer.push(state, action, reward, next_state, done)

    def load_model(self):
        self.current_model = RainbowDQN(self.env.observation_space.shape[0],
                                        self.env.action_space.n, num_atoms,
                                        Vmin,
                                        Vmax)  # input:(1,84,84), output:6
        self.target_model = RainbowDQN(self.env.observation_space.shape[0],
                                       self.env.action_space.n, num_atoms,
                                       Vmin, Vmax)

        if USE_CUDA:
            self.current_model = self.current_model.cuda()
            self.target_model = self.target_model.cuda()

        self.update_target(self.current_model, self.target_model)  # sync nets

    def projection_distribution(self, next_state, rewards, dones):
        batch_size = next_state.size(0)

        delta_z = float(Vmax - Vmin) / (num_atoms - 1)
        support = torch.linspace(Vmin, Vmax, num_atoms)

        next_dist = self.target_model(next_state).data.cpu()
        # Greedy next action by expected value sum_i z_i * p_i; the projection below uses the raw probabilities
        next_action = (next_dist * support).sum(2).max(1)[1]
        next_action = next_action.unsqueeze(1).unsqueeze(1).expand(
            next_dist.size(0), 1, next_dist.size(2))
        next_dist = next_dist.gather(1, next_action).squeeze(1)

        rewards = rewards.unsqueeze(1).expand_as(next_dist)
        dones = dones.unsqueeze(1).expand_as(next_dist)
        support = support.unsqueeze(0).expand_as(next_dist)

        Tz = rewards + (1 - dones) * 0.99 * support
        Tz = Tz.clamp(min=Vmin, max=Vmax)
        b = (Tz - Vmin) / delta_z
        l = b.floor().long()
        u = b.ceil().long()

        offset = torch.linspace(0, (batch_size - 1) * num_atoms, batch_size).long()\
            .unsqueeze(1).expand(batch_size, num_atoms)

        proj_dist = torch.zeros(next_dist.size())
        proj_dist.view(-1).index_add_(0, (l + offset).view(-1),
                                      (next_dist * (u.float() - b)).view(-1))
        proj_dist.view(-1).index_add_(0, (u + offset).view(-1),
                                      (next_dist * (b - l.float())).view(-1))

        return proj_dist

    def compute_td_loss(self, batch_size, *args):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            batch_size)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = torch.FloatTensor(reward)
        done = torch.FloatTensor(np.float32(done))

        proj_dist = self.projection_distribution(next_state, reward, done)

        dist = self.current_model(state)
        action = action.unsqueeze(1).unsqueeze(1).expand(
            batch_size, 1, num_atoms)
        dist = dist.gather(1, action).squeeze(1)
        dist.data.clamp_(0.01, 0.99)
        loss = -(Variable(proj_dist) * dist.log()).sum(1)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.current_model.reset_noise()
        self.target_model.reset_noise()

        return loss
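For reference, projection_distribution implements the categorical (C51-style) projection of the Bellman-updated return distribution back onto the fixed support z_1, ..., z_N with spacing \Delta z = (V_{max} - V_{min}) / (N - 1); the discount is hard-coded to 0.99 in the code above. In the usual notation,

\hat{\mathcal{T}} z_j = \bigl[\, r + \gamma (1 - d)\, z_j \,\bigr]_{V_{\min}}^{V_{\max}}, \qquad
\bigl(\Phi \hat{\mathcal{T}} Z\bigr)_i = \sum_{j=1}^{N} \Bigl[ 1 - \frac{\lvert \hat{\mathcal{T}} z_j - z_i \rvert}{\Delta z} \Bigr]_0^1 \, p_j(x', a^{*})

where p_j(x', a^*) is the target network's probability mass for atom j at the greedy next action a^*.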
Example #25
            noise=config.get("noise", None)
            if m != config["target_map"] else None,
            vsn=config.get("vsn", None) if m != config["target_map"] else None,
            ally_indices=ally_indices,
            enemy_indices=enemy_indices,
        ) for m, d in zip(config["map_names"], difficulties)
    ]

    for env in train_envs:
        env_info = env.get_env_info()
        target_info = target_env.get_env_info()
        env.buffer = ReplayBuffer(
            n_actions=target_info['n_actions'],
            n_agents=env_info['n_agents'],
            obs_shape=target_info['obs_shape'],
            state_shape=target_info['state_shape'],
            episode_limit=env_info['episode_limit'],
            size=args.buffer_size,
            alg=args.alg,
            dtype=np.float16,
        )
        logging.info(env_info)
    # change args to accommodate largest possible env
    # assures the widths of the created neural networks are sufficient
    env_info = target_env.get_env_info()
    args.n_actions = env_info["n_actions"]
    args.n_agents = env_info["n_agents"]
    args.state_shape = env_info["state_shape"]
    args.obs_shape = env_info["obs_shape"]
    args.episode_limit = env_info["episode_limit"]

    runner = Runner(None, args, target_env)
Example #26
class Runner:
    def __init__(self, env, args):
        self.env = env

        # Used to evaluate the algorithm in a sparse-reward environment: win = 1, defeat = -1, every other ordinary step = 0
        '''
        self.env_evaluate = StarCraft2Env(map_name=args.map,
                                          step_mul=args.step_mul,
                                          difficulty=args.difficulty,
                                          game_version=args.game_version,
                                          seed=args.seed,
                                          replay_dir=args.replay_dir,
                                          reward_sparse=True,
                                          reward_scale=False)
        '''
        self.env_evaluate = MeetEnv()

        if args.alg.find('commnet') > -1 or args.alg.find('g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
            self.evaluateWorker = CommRolloutWorker(self.env_evaluate, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
            self.evaluateWorker = RolloutWorker(self.env_evaluate, self.agents, args)
        if args.alg.find('coma') == -1 and args.alg.find('central_v') == -1 and args.alg.find('reinforce') == -1:  # these 3 algorithms are on-policy
            self.buffer = ReplayBuffer(args)
        self.args = args

        # Used to save the plt figures and pkl files
        self.save_path = self.args.result_dir + '/' + args.alg + '/' + args.map
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

    def run(self, num):
        plt.figure()
        plt.axis([0, self.args.n_epoch, 0, 100])
        win_rates = []
        episode_rewards = []
        train_steps = 0
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            print('Run {}, train epoch {}'.format(num, epoch))
            if epoch % self.args.evaluate_cycle == 0:
                win_rate, episode_reward = self.evaluate()
                # print('win_rate is ', win_rate)
                win_rates.append(win_rate)
                episode_rewards.append(episode_reward)
                plt.cla()
                plt.subplot(2, 1, 1)
                plt.plot(range(len(win_rates)), win_rates)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('win_rate')

                plt.subplot(2, 1, 2)
                plt.plot(range(len(episode_rewards)), episode_rewards)
                plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
                plt.ylabel('episode_rewards')

                plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
                np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
                np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)

            episodes = []
            # Collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                episode, _ = self.rolloutWorker.generate_episode(episode_idx)
                episodes.append(episode)
                # print(_)
            # Each item of an episode is a 4-D array of shape (1, episode_len, n_agents, feature_dim); below, the obs of all episodes are concatenated together
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate((episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find('central_v') > -1 or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps, self.rolloutWorker.epsilon)
                train_steps += 1
            else:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    self.agents.train(mini_batch, train_steps)
                    train_steps += 1


        plt.cla()
        plt.subplot(2, 1, 1)
        plt.plot(range(len(win_rates)), win_rates)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('win_rate')

        plt.subplot(2, 1, 2)
        plt.plot(range(len(episode_rewards)), episode_rewards)
        plt.xlabel('epoch*{}'.format(self.args.evaluate_cycle))
        plt.ylabel('episode_rewards')

        plt.savefig(self.save_path + '/plt_{}.png'.format(num), format='png')
        np.save(self.save_path + '/win_rates_{}'.format(num), win_rates)
        np.save(self.save_path + '/episode_rewards_{}'.format(num), episode_rewards)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.rolloutWorker.generate_episode(evaluate=True)
            episode_rewards += episode_reward
            if episode_reward > self.args.threshold:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch

    def evaluate_sparse(self):
        win_number = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward = self.evaluateWorker.generate_episode(evaluate=True)
            result = 'win' if episode_reward > 0 else 'defeat'
            print('Epoch {}: {}'.format(epoch, result))
            if episode_reward > 0:
                win_number += 1
        self.env_evaluate.close()
        return win_number / self.args.evaluate_epoch
Example #27
def train(train_env, agent_action_fn, eval_mode=False):
    action_space = train_env.action_space
    obs_space = train_env.observation_space

    ######### instantiate actor, critic, replay buffer, OU process #########
    ## feed online with state. feed target with next_state.
    online_state_inputs = tf.placeholder(tf.float32,
                                         shape=(None, obs_space.shape[0]),
                                         name="online_state_inputs")

    target_state_inputs = tf.placeholder(tf.float32,
                                         shape=online_state_inputs.shape,
                                         name="target_state_inputs")

    ## inputs to q_net for training q.
    online_action_inputs_training_q = tf.placeholder(
        tf.float32,
        shape=(None, action_space.shape[0]),
        name='online_action_batch_inputs')
    # condition bool scalar to switch action inputs to online q.
    # feed True: training q.
    # feed False: training policy.
    cond_training_q = tf.placeholder(tf.bool, shape=[], name='cond_training_q')

    terminated_inputs = tf.placeholder(tf.float32,
                                       shape=(None),
                                       name='terminated_inputs')
    reward_inputs = tf.placeholder(tf.float32,
                                   shape=(None),
                                   name='rewards_inputs')

    # for summary text
    summary_text_tensor = tf.convert_to_tensor(str('summary_text'),
                                               preferred_dtype=tf.string)
    tf.summary.text(name='summary_text',
                    tensor=summary_text_tensor,
                    collections=[DDPG_CFG.log_summary_keys])

    ##instantiate actor, critic.
    actor = Actor(
        action_dim=action_space.shape[0],
        online_state_inputs=online_state_inputs,
        target_state_inputs=target_state_inputs,
        input_normalizer=DDPG_CFG.actor_input_normalizer,
        input_norm_params=DDPG_CFG.actor_input_norm_params,
        n_fc_units=DDPG_CFG.actor_n_fc_units,
        fc_activations=DDPG_CFG.actor_fc_activations,
        fc_initializers=DDPG_CFG.actor_fc_initializers,
        fc_normalizers=DDPG_CFG.actor_fc_normalizers,
        fc_norm_params=DDPG_CFG.actor_fc_norm_params,
        fc_regularizers=DDPG_CFG.actor_fc_regularizers,
        output_layer_initializer=DDPG_CFG.actor_output_layer_initializer,
        output_layer_regularizer=None,
        output_normalizers=DDPG_CFG.actor_output_layer_normalizers,
        output_norm_params=DDPG_CFG.actor_output_layer_norm_params,
        output_bound_fns=DDPG_CFG.actor_output_bound_fns,
        learning_rate=DDPG_CFG.actor_learning_rate,
        is_training=is_training)

    critic = Critic(
        online_state_inputs=online_state_inputs,
        target_state_inputs=target_state_inputs,
        input_normalizer=DDPG_CFG.critic_input_normalizer,
        input_norm_params=DDPG_CFG.critic_input_norm_params,
        online_action_inputs_training_q=online_action_inputs_training_q,
        online_action_inputs_training_policy=actor.
        online_action_outputs_tensor,
        cond_training_q=cond_training_q,
        target_action_inputs=actor.target_action_outputs_tensor,
        n_fc_units=DDPG_CFG.critic_n_fc_units,
        fc_activations=DDPG_CFG.critic_fc_activations,
        fc_initializers=DDPG_CFG.critic_fc_initializers,
        fc_normalizers=DDPG_CFG.critic_fc_normalizers,
        fc_norm_params=DDPG_CFG.critic_fc_norm_params,
        fc_regularizers=DDPG_CFG.critic_fc_regularizers,
        output_layer_initializer=DDPG_CFG.critic_output_layer_initializer,
        output_layer_regularizer=None,
        learning_rate=DDPG_CFG.critic_learning_rate)

    ## track updates.
    global_step_tensor = tf.train.create_global_step()

    ## build whole graph
    copy_online_to_target_op, train_online_policy_op, train_online_q_op, update_target_op, saver \
      = build_ddpg_graph(actor, critic, reward_inputs, terminated_inputs, global_step_tensor)

    #we save the replay buffer data to files.
    replay_buffer = ReplayBuffer(
        buffer_size=DDPG_CFG.replay_buff_size,
        save_segment_size=DDPG_CFG.replay_buff_save_segment_size,
        save_path=DDPG_CFG.replay_buffer_file_path,
        seed=DDPG_CFG.random_seed)
    if DDPG_CFG.load_replay_buffer_set:
        replay_buffer.load(DDPG_CFG.replay_buffer_file_path)

    sess = tf.Session(graph=tf.get_default_graph())
    summary_writer = tf.summary.FileWriter(logdir=os.path.join(
        DDPG_CFG.log_dir, "train"),
                                           graph=sess.graph)
    log_summary_op = tf.summary.merge_all(key=DDPG_CFG.log_summary_keys)

    sess.run(fetches=[tf.global_variables_initializer()])

    #copy init params from online to target
    sess.run(fetches=[copy_online_to_target_op])

    # Load a previous checkpoint if it exists
    latest_checkpoint = tf.train.latest_checkpoint(DDPG_CFG.checkpoint_dir)
    if latest_checkpoint:
        tf.logging.info(
            "==== Loading model checkpoint: {}".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    elif eval_mode:
        raise FileNotFoundError(
            '== in evaluation mode, we need check point file which can not be found.==='
        )

    ####### start training #########
    obs = train_env.reset()
    transition = preprocess_low_dim(obs)

    n_episodes = 1

    if not eval_mode:
        for step in range(1, DDPG_CFG.num_training_steps):
            #replace with new transition
            policy_out = sess.run(fetches=[actor.online_action_outputs_tensor],
                                  feed_dict={
                                      online_state_inputs:
                                      transition.next_state[np.newaxis, :],
                                      is_training:
                                      False
                                  })[0]
            transition = agent_action_fn(policy_out, replay_buffer, train_env)
            if step % 200 == 0:
                tf.logging.info(' +++++++++++++++++++ global_step:{} action:{}'
                                '  reward:{} term:{}'.format(
                                    step, transition.action, transition.reward,
                                    transition.terminated))
            if step < 10:
                #feed some transitions in buffer.
                continue
            ## ++++ sample mini-batch and train.++++
            state_batch, action_batch, reward_batch, next_state_batch, terminated_batch = \
             replay_buffer.sample_batch(DDPG_CFG.batch_size)

            # ---- 1. train policy.-----------
            sess.run(
                fetches=[train_online_policy_op],
                feed_dict={
                    online_state_inputs: state_batch,
                    cond_training_q: False,
                    online_action_inputs_training_q:
                    action_batch,  # feed but not used.
                    is_training: True
                })

            # ---- 2. train q. --------------
            sess.run(fetches=[train_online_q_op],
                     feed_dict={
                         online_state_inputs: state_batch,
                         cond_training_q: True,
                         online_action_inputs_training_q: action_batch,
                         target_state_inputs: next_state_batch,
                         reward_inputs: reward_batch,
                         terminated_inputs: terminated_batch,
                         is_training: True
                     })

            # ----- 3. update target ---------
            sess.run(fetches=[update_target_op], feed_dict=None)

            # do evaluation after eval_freq steps:
            if step % DDPG_CFG.eval_freq == 0:  ##and step > DDPG_CFG.eval_freq:
                evaluate(env=train_env,
                         num_eval_steps=DDPG_CFG.num_eval_steps,
                         preprocess_fn=preprocess_low_dim,
                         estimate_fn=lambda state: sess.run(
                             fetches=[actor.online_action_outputs_tensor],
                             feed_dict={
                                 online_state_inputs: state,
                                 is_training: False
                             }),
                         summary_writer=summary_writer,
                         saver=saver,
                         sess=sess,
                         global_step=step,
                         log_summary_op=log_summary_op,
                         summary_text_tensor=summary_text_tensor)

            if transition.terminated:
                transition = preprocess_low_dim(train_env.reset())
                n_episodes += 1
                continue  # begin new episode

    else:  #eval mode
        evaluate(env=train_env,
                 num_eval_steps=DDPG_CFG.eval_steps_after_training,
                 preprocess_fn=preprocess_low_dim,
                 estimate_fn=lambda state: sess.run(
                     fetches=[actor.online_action_outputs_tensor],
                     feed_dict={
                         online_state_inputs: state,
                         is_training: False
                     }),
                 summary_writer=summary_writer,
                 saver=None,
                 sess=sess,
                 global_step=0,
                 log_summary_op=log_summary_op,
                 summary_text_tensor=summary_text_tensor)

    sess.close()
    train_env.close()
Example #28
class Runner:
    def __init__(self, env, args):
        self.env = env

        if args.alg.find('commnet') > -1 or args.alg.find(
                'g2anet') > -1:  # communication agent
            self.agents = CommAgents(args)
            self.rolloutWorker = CommRolloutWorker(env, self.agents, args)
        else:  # no communication agent
            self.agents = Agents(args)
            self.rolloutWorker = RolloutWorker(env, self.agents, args)
        if args.learn and args.alg.find('coma') == -1 and args.alg.find(
                'central_v') == -1 and args.alg.find(
                    'reinforce') == -1:  # these 3 algorithms are on-policy
            if args.use_per:
                self.buffer = PrioritizedReplayBuffer(args)
            else:
                self.buffer = ReplayBuffer(args)
        self.args = args
        self.win_rates = []
        self.episode_rewards = []

        # Used to save the plt figures and pkl files
        self.save_path = self.args.result_dir + '/' + args.map + '/'
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.file_name = self.save_path + str(args.env_name) + '_' + str(
            args.n_agents) + '_' + str(args.map_size) + '_' + args.name_time

    def run(self, num):
        train_steps = 0
        episode_rewards = 0
        fixed_rewards = 0
        st = time.time()
        plot_rewards = []
        # print('Run {} start'.format(num))
        for epoch in range(self.args.n_epoch):
            # print('Run {}, train epoch {}'.format(num, epoch))
            # if epoch % self.args.evaluate_cycle == 0:
            #     win_rate, episode_reward = self.evaluate()
            #     # print('win_rate is ', win_rate)
            #     self.win_rates.append(win_rate)
            #     self.episode_rewards.append(episode_reward)
            #     print(episode_reward)
            #     # self.plt(num)

            episodes = []
            # Collect self.args.n_episodes episodes
            for episode_idx in range(self.args.n_episodes):
                if self.args.use_ja:
                    if self.args.use_v1:
                        episode, episode_reward, rate, fixed_reward = self.rolloutWorker.generate_episode_ja_v2(
                            episode_idx)
                    else:
                        episode, episode_reward, rate, fixed_reward = self.rolloutWorker.generate_episode_ja_v3(
                            episode_idx)
                else:
                    episode, episode_reward, rate, fixed_reward = self.rolloutWorker.generate_episode(
                        episode_idx)
                episodes.append(episode)
                episode_rewards += episode_reward
                fixed_rewards += fixed_reward
                plot_rewards.append(episode_reward)
                if epoch % self.args.evaluate_cycle == 0:
                    t = time.time() - st
                    st = time.time()
                    epr = round(episode_rewards / self.args.evaluate_cycle, 2)
                    fr = round(fixed_rewards / self.args.evaluate_cycle, 2)
                    print('train epoch {}, reward {}, time {}, rate {}'.format(
                        epoch, [epr, fr], t, rate))
                    # wandb.log({"reward": epr, "test_reward": epr})
                    episode_rewards = 0
                    fixed_rewards = 0
                    with open(self.file_name, 'wb') as fp:
                        pickle.dump(plot_rewards, fp)
            # Each item of an episode is a 4-D array of shape (1, episode_len, n_agents, feature_dim); below, the obs of all episodes are concatenated together
            episode_batch = episodes[0]
            episodes.pop(0)
            for episode in episodes:
                for key in episode_batch.keys():
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)
            if self.args.alg.find('coma') > -1 or self.args.alg.find(
                    'central_v') > -1 or self.args.alg.find('reinforce') > -1:
                self.agents.train(episode_batch, train_steps,
                                  self.rolloutWorker.epsilon)
                train_steps += 1
            elif not self.args.load_model:
                self.buffer.store_episode(episode_batch)
                for train_step in range(self.args.train_steps):
                    # mini_batch = self.buffer.sample(min(self.buffer.current_size, self.args.batch_size))
                    # # print(mini_batch['terminated'])
                    # # print(train_steps)
                    # dq = self.agents.train(mini_batch, train_steps)
                    if self.args.use_per:
                        mini_batch, idxs = self.buffer.sample(
                            min(self.buffer.current_size,
                                self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                        self.buffer.update_priorities(idxs, dq)
                    else:
                        mini_batch = self.buffer.sample(
                            min(self.buffer.current_size,
                                self.args.batch_size))
                        dq = self.agents.train(mini_batch, train_steps)
                    train_steps += 1
        # self.plt(num)

    def evaluate(self):
        win_number = 0
        episode_rewards = 0
        for epoch in range(self.args.evaluate_epoch):
            _, episode_reward, win_tag = self.rolloutWorker.generate_episode(
                epoch, evaluate=True)
            episode_rewards += episode_reward
            if win_tag:
                win_number += 1
        return win_number / self.args.evaluate_epoch, episode_rewards / self.args.evaluate_epoch
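
The use_per branch in run() above assumes a buffer whose sample() returns both a mini-batch and the sampled indices, and whose update_priorities() consumes the dq values returned by agents.train(). The repository's PrioritizedReplayBuffer is not shown here, so the following is only a minimal proportional-priority sketch of that assumed interface; the class name, alpha, eps, and list-based storage are illustrative, not taken from the source.

import numpy as np

class SimplePrioritizedBuffer:
    """Minimal proportional prioritized replay matching the interface assumed above."""

    def __init__(self, capacity, alpha=0.6, eps=1e-5):
        self.capacity = capacity
        self.alpha = alpha        # how strongly priorities skew sampling
        self.eps = eps            # keeps every priority strictly positive
        self.data = []            # stored episode batches (dicts of arrays)
        self.priorities = []      # one priority per stored entry
        self.next_idx = 0

    @property
    def current_size(self):
        return len(self.data)

    def store_episode(self, episode_batch):
        max_p = max(self.priorities, default=1.0)    # new data gets the current max priority
        if len(self.data) < self.capacity:
            self.data.append(episode_batch)
            self.priorities.append(max_p)
        else:                                        # ring buffer: overwrite the oldest entry
            self.data[self.next_idx] = episode_batch
            self.priorities[self.next_idx] = max_p
        self.next_idx = (self.next_idx + 1) % self.capacity

    def sample(self, batch_size):
        probs = np.asarray(self.priorities) ** self.alpha
        probs = probs / probs.sum()
        idxs = np.random.choice(len(self.data), size=batch_size, p=probs)
        mini_batch = [self.data[i] for i in idxs]
        return mini_batch, idxs

    def update_priorities(self, idxs, td_errors):
        # dq from agents.train() is treated here as a per-sample TD error
        for i, err in zip(idxs, np.abs(np.asarray(td_errors)).flatten()):
            self.priorities[i] = float(err) + self.eps
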
Exemplo n.º 29
0
    def __init__(self, config):
        self.config = config

        self.network_freq = 125  # self.config.conf['HLC-frequency']

        self.reward_decay = 1.0
        self.reward_scale = config.conf['reward-scale']

        self.max_time_per_train_episode = 10  # self.config.conf['max-train-time']
        self.max_step_per_train_episode = int(self.max_time_per_train_episode * self.network_freq)
        self.max_time_per_test_episode = 10  # self.config.conf['max-test-time']  # 16
        self.max_step_per_test_episode = int(self.max_time_per_test_episode * self.network_freq)

        env_name = 'Walker2DBulletEnv-v0'  # alternatives: 'AntBulletEnv-v0', 'HumanoidBulletEnv-v0'
        self.env = gym.make(env_name)
        # self.env.render()

        print(self.env.observation_space)
        print(self.env.action_space)
        self.config.conf['state-dim'] = self.env.observation_space.shape[0]
        self.config.conf['action-dim'] = self.env.action_space.shape[0]

        self.config.conf['actor-logstd-initial'] = np.zeros((1, self.config.conf['action-dim']))
        self.config.conf['actor-logstd-bounds'] = np.ones((2, self.config.conf['action-dim']))
        self.config.conf['actor-output-bounds'] = np.ones((2, self.config.conf['action-dim']))
        self.config.conf['actor-output-bounds'][0][:] = -1 * np.ones(self.config.conf['action-dim'],)
        self.config.conf['actor-output-bounds'][1][:] = 1 * np.ones(self.config.conf['action-dim'],)

        self.config.conf['actor-logstd-initial'] *= np.log(1.0)  # np.log(min(std*0.25, 1.0))  # 0.5
        self.config.conf['actor-logstd-bounds'][0] *= np.log(0.2)
        self.config.conf['actor-logstd-bounds'][1] *= np.log(1.0)  # 0.6

        self.agent = Agent(self.env, self.config)

        self.episode_count = 0
        self.step_count = 0
        self.train_iter_count = 0

        self.best_reward = 0
        self.best_episode = 0
        self.best_train_iter = 0

        # load weight from previous network
        # dir_path = 'record/2017_12_04_15.20.44/no_force'  # '2017_05_29_18.23.49/with_force'

        # create new network
        dir_path = 'TRPO/record/' + '3D/' + env_name +'/' + datetime.now().strftime('%Y_%m_%d_%H.%M.%S')
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        if not os.path.exists(dir_path + '/saved_actor_networks'):
            os.makedirs(dir_path + '/saved_actor_networks')
        if not os.path.exists(dir_path + '/saved_critic_networks'):
            os.makedirs(dir_path + '/saved_critic_networks')
        self.logging = logger(dir_path)
        config.save_configuration(dir_path)
        config.record_configuration(dir_path)
        config.print_configuration()
        self.agent.load_weight(dir_path)
        self.dir_path = dir_path

        self.on_policy_paths = []
        self.off_policy_paths = []
        self.buffer = ReplayBuffer(self.config.conf['replay-buffer-size'])

        self.force = [0, 0, 0]
        self.force_chest = [0, 0, 0]  # max(0,force_chest[1]-300*1.0 / EXPLORE)]
        self.force_pelvis = [0, 0, 0]
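
The constructor above only fills in config entries; how the actor actually uses 'actor-logstd-bounds' and 'actor-output-bounds' lives inside Agent and is not shown. The snippet below is a plausible minimal sketch, assuming the common pattern of clamping the policy's log standard deviation to [log(0.2), log(1.0)] before sampling and clipping the resulting action to the [-1, 1] output bounds; the function name and example numbers are illustrative only.

import numpy as np

def sample_action(mean, logstd, logstd_bounds, output_bounds):
    # clamp the log standard deviation into the configured bounds
    logstd = np.clip(logstd, logstd_bounds[0], logstd_bounds[1])
    action = mean + np.exp(logstd) * np.random.randn(*mean.shape)
    # clip the sampled action into the output bounds
    return np.clip(action, output_bounds[0], output_bounds[1])

action_dim = 6                                    # Walker2DBulletEnv-v0 has a 6-D action space
mean = np.zeros((1, action_dim))
logstd = np.log(1.5) * np.ones((1, action_dim))   # deliberately above the upper bound
logstd_bounds = np.array([np.log(0.2), np.log(1.0)])
output_bounds = np.array([-1.0, 1.0])
print(sample_action(mean, logstd, logstd_bounds, output_bounds))
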
Exemplo n.º 30
0
    def __init__(self, config):
        self.config = config

        self.PD_freq = self.config.conf['LLC-frequency']
        self.Physics_freq = self.config.conf['Physics-frequency']
        self.network_freq = self.config.conf['HLC-frequency']
        self.sampling_skip = int(self.PD_freq / self.network_freq)

        self.reward_decay = 1.0
        self.reward_scale = config.conf['reward-scale']
        self.reward_scale = self.reward_scale / float(self.sampling_skip)  # /10.0  # normalizing reward to 1

        self.max_time_per_train_episode = self.config.conf['max-train-time']
        self.max_step_per_train_episode = int(self.max_time_per_train_episode *
                                              self.network_freq)
        self.max_time_per_test_episode = self.config.conf['max-test-time']  #16
        self.max_step_per_test_episode = int(self.max_time_per_test_episode *
                                             self.network_freq)
        self.train_external_force_disturbance = True
        if self.train_external_force_disturbance:
            path_str = 'with_external_force_disturbance/'
        else:
            path_str = 'without_external_force_disturbance/'
        self.test_external_force_disturbance = True

        self.env = Valkyrie(
            max_time=self.max_time_per_train_episode,
            renders=False,
            initial_gap_time=0.5,
            PD_freq=self.PD_freq,
            Physics_freq=self.Physics_freq,
            Kp=config.conf['Kp'],
            Kd=config.conf['Kd'],
            bullet_default_PD=config.conf['bullet-default-PD'],
            controlled_joints_list=config.conf['controlled-joints'])

        config.conf['state-dim'] = self.env.stateNumber
        self.agent = Agent(self.env, self.config)

        self.episode_count = 0
        self.step_count = 0
        self.train_iter_count = 0

        self.best_reward = 0
        self.best_episode = 0
        self.best_train_iter = 0

        self.control = Control(self.config, self.env)

        # load weight from previous network
        # dir_path = 'record/2017_12_04_15.20.44/no_force'  # '2017_05_29_18.23.49/with_force'

        # create new network
        dir_path = 'TRPO/record/' + '3D_push/' + path_str + datetime.now().strftime('%Y_%m_%d_%H.%M.%S')
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        if not os.path.exists(dir_path + '/saved_actor_networks'):
            os.makedirs(dir_path + '/saved_actor_networks')
        if not os.path.exists(dir_path + '/saved_critic_networks'):
            os.makedirs(dir_path + '/saved_critic_networks')
        self.logging = logger(dir_path)
        config.save_configuration(dir_path)
        config.record_configuration(dir_path)
        config.print_configuration()
        self.agent.load_weight(dir_path)
        self.dir_path = dir_path

        self.on_policy_paths = []
        self.off_policy_paths = []
        self.buffer = ReplayBuffer(self.config.conf['replay-buffer-size'])

        self.force = [0, 0, 0]
        self.force_chest = [0, 0, 0]  # max(0,force_chest[1]-300*1.0 / EXPLORE)]
        self.force_pelvis = [0, 0, 0]
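
To make the frequency bookkeeping in this constructor concrete, here is a short worked example with made-up numbers; the real LLC/HLC frequencies, reward scale, and episode times come from config.conf and are not shown in the snippet above.

PD_freq = 500                                   # assumed LLC-frequency (Hz)
network_freq = 25                               # assumed HLC-frequency (Hz)
sampling_skip = int(PD_freq / network_freq)     # 20 low-level PD steps per network step

reward_scale = 10.0                             # assumed conf['reward-scale']
reward_scale = reward_scale / float(sampling_skip)   # 0.5, mirroring the division in __init__

max_time_per_train_episode = 16                 # seconds (assumed max-train-time)
max_step_per_train_episode = int(max_time_per_train_episode * network_freq)  # 400 network steps
print(sampling_skip, reward_scale, max_step_per_train_episode)
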