Example #1
 def __init__(self, env, args, gpu_id):
     self.results_filename = "./results"
     self.env = env
     self.models = [
         A3Clstm(self.env.observation_space.shape[0],
                 self.env.action_space),
         A3Clstm(self.env.observation_space.shape[0], self.env.action_space)
     ]
     self.state = None
     self.hx = None
     self.cx = None
     self.eps_len = 0
     self.args = args
     self.values = []
     self.log_probs = []
     self.rewards = []
     self.entropies = []
     self.done = True
     self.info = None
     self.reward = 0
     self.gpu_id = gpu_id
     self.episodic_reward = 0
     self.life_counter = 5
     self.model_sequence = []
     self.curr_model_id = 0
     self.first_time_changeover = True
     self.fire_action_next = True
     if self.gpu_id >= 0:
         with torch.cuda.device(self.gpu_id):
             self.models[0] = self.models[0].cuda()
             self.models[1] = self.models[1].cuda()
     with open(self.results_filename, 'w'):
         pass
Example #2
def test(args, shared_model, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = atari_env(args.env, env_conf)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.eval()

    for t in itertools.count():
        if player.done:
            player.model.load_state_dict(shared_model.state_dict())

        player.action_test(t)
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            player.current_life = 0
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if reward_sum > args.save_score_level:
                player.model.load_state_dict(shared_model.state_dict())
                state_to_save = player.model.state_dict()
                torch.save(state_to_save,
                           '{0}{1}.dat'.format(args.save_model_dir, args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
Example #3
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')  # multiprocessing

    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]

    env = atari_env(args.env, env_conf, args)
    shared_model = A3Clstm(env.observation_space.shape[0],
                           env.action_space)  # main A3C

    if args.load:  # if --load is True, load the .dat file.
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
Example #4
for i in setup_json.keys():
    if i in args.env:
        env_conf = setup_json[i]
torch.set_default_tensor_type('torch.FloatTensor')

saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir, args.env),
                         map_location=lambda storage, loc: storage)

log = {}
setup_logger('{}_mon_log'.format(args.env),
             r'{0}{1}_mon_log'.format(args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))

env = atari_env("{}".format(args.env), env_conf)
model = A3Clstm(env.observation_space.shape[0], env.action_space)

num_tests = 0
reward_total_sum = 0
player = Agent(model, env, args, state=None)
player.env = gym.wrappers.Monitor(player.env,
                                  "{}_monitor".format(args.env),
                                  force=True)
player.model.eval()
for i_episode in range(args.num_episodes):
    state = player.env.reset()
    player.state = torch.from_numpy(state).float()
    player.eps_len = 0
    reward_sum = 0
    while True:
        if args.render:
Example #5
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    print("prank:", rank, "os.pid:", os.getpid())
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = AllowBacktracking(
        make_local_env(env_conf['game'],
                       env_conf['level'],
                       stack=False,
                       scale_rew=False))
    print("Got a local env; obs space:", env.observation_space)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    print("player.state.shape:", player.state.shape)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            # if player.info['ale.lives'] == 0 or player.max_length:
            #    player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 100.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
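
Every training loop in these examples hands the worker's freshly computed gradients to the shared model via ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0) just before optimizer.step(). A minimal sketch of what such a helper typically does, assuming the local and shared models expose their parameters in the same order (an illustration, not necessarily the repository's exact code):

# Sketch (assumption): attach the local gradients to the CPU-resident shared model.
def ensure_shared_grads(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # Hogwild: another worker has already attached a gradient; keep it.
            return
        if not gpu:
            shared_param._grad = param.grad
        else:
            # gradients computed on the GPU are copied back to the shared CPU model
            shared_param._grad = param.grad.cpu()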
Example #6
def train(rank, args, shared_model, optimizer, optimizer_r, env_conf, lock,
          counter):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
                    player.hx = [
                        Variable(torch.zeros(1, 512).cuda()),
                        Variable(torch.zeros(1, 512).cuda())
                    ]
            else:
                player.cx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
                player.hx = [
                    Variable(torch.zeros(1, 512)),
                    Variable(torch.zeros(1, 512))
                ]
        else:
            player.cx = [
                Variable(player.cx[0].data),
                Variable(player.cx[1].data)
            ]
            player.hx = [
                Variable(player.hx[0].data),
                Variable(player.hx[1].data)
            ]

        # check whether updates to r_net have affected things here
        # ps = list(player.model.r_net.named_parameters())
        # n, v = ps[6]
        # print(v.sum())
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)),
                 (player.hx[0], player.cx[0]), (player.hx[1], player.cx[1])))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        with lock:
            counter.value += 1
        # rnet
        player.model.r_net.zero_grad()
        (args.actor_weight * policy_loss +
         (1 - args.actor_weight) * value_loss).backward(retain_graph=True)
        ensure_shared_grads(player.model.r_net,
                            shared_model.r_net,
                            gpu=gpu_id >= 0)
        optimizer_r.step()

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        player.model.r_net.zero_grad()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
Example #7
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)

    env = atari_env(args.env_name, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    _ = env.reset()
    action = env.action_space.sample()
    _, _, _, info = env.step(action)
    start_lives = info['ale.lives']

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()
    env.seed(args.seed + rank)
    state = env.reset()
    state = torch.from_numpy(state).float()
    done = True
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 512))
            hx = Variable(torch.zeros(1, 512))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):

            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, info = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            if args.count_lives:
                if start_lives > info['ale.lives']:
                    done = True
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state).float()
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:

            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
Example #8
def train(args, envs, observation_space, action_space):
    gpu_id = 0
    # each worker runs independently, with its own environment and model, on CUDA
    player = Agent(envs, args)

    player.model = A3Clstm(observation_space, action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
        player.state = player.state.cuda()
        player.cx = torch.zeros(args.workers, 512).cuda()
        player.hx = torch.zeros(args.workers, 512).cuda()

    optimizer = torch.optim.Adam(player.model.parameters(),
                                 lr=args.lr,
                                 amsgrad=args.amsgrad)

    # switch to training mode
    player.model.train()
    while True:
        # train for up to args.num_steps (20) steps, or stop when the game is over
        for step in range(args.num_steps):
            # during training, save each step's data to lists
            player.env.get_images()
            player.action_train()
            if player.dones[-1][0]:
                break

        if not player.dones[-1][0]:
            value, _, _ = player.model((player.state, (player.hx, player.cx)))
            R = value.detach()
        else:
            R = torch.zeros(args.workers, 1)
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(R)

        for j in range(args.num_ppo_train):
            policy_loss = 0
            value_loss = 0
            gae = 0

            for i in reversed(range(len(player.rewards))):
                value, logit, _ = player.model(
                    (player.states[i], (player.hxs[i], player.cxs[i])))
                prob = F.softmax(logit, dim=1)
                log_prob = F.log_softmax(logit, dim=1)
                entropy = -(log_prob * prob).sum(1)
                log_probs_current = log_prob.gather(1, player.actions[i])

                R = args.gamma * R + player.rewards[i]

                advantage = R - value
                value_loss = value_loss + 0.5 * advantage.pow(2)

                # Generalized Advantage Estimation
                delta_t = player.rewards[i] + args.gamma * player.values[
                    i + 1].detach() - player.values[i].detach()
                gae = gae * args.gamma * args.tau + delta_t

                ratio = torch.exp(log_probs_current - player.log_probs[i])
                surr1 = ratio
                surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                    1.0 + args.clip_param)

                policy_loss = policy_loss - torch.min(
                    surr1, surr2) * gae - 0.01 * entropy

            optimizer.zero_grad()
            (policy_loss + 0.5 * value_loss).mean().backward()
            optimizer.step()

        # reset the LSTM memory when the game is over
        if player.dones[-1][0]:
            with torch.cuda.device(gpu_id):
                player.cx = torch.zeros(args.workers, 512).cuda()
                player.hx = torch.zeros(args.workers, 512).cuda()
        else:
            player.cx = player.cx.detach()
            player.hx = player.hx.detach()

        player.clear_actions()


# advantage[0:n]
# value-estimate differences from steps 0,1,2,...,n to n+1: r[0..n], r[1..n], r[2..n], ..., r[n], plus Value(N+1), traversed in reverse:
# steps n, n-1, n-2, n-3, ..., 3, 2, 1
# r[n] + Value(N+1) - Value(N)
# r[n:n-1] + Value(N+1) - Value(N-1)
# ...
# r[n:2] + Value(N + 1) - Value(2)
# r[n:1] + Value(N + 1) - Value(1)
# R = args.gamma * R + player.rewards[i]
# advantage = R - player.values[i]
# value_loss = value_loss + 0.5 * advantage.pow(2)
# value_loss = 0.5 * advantage.pow(2)
# advantage = args.gamma * R + player.rewards[i] - player.values[i]

# entropy = -(log_prob * prob).sum(1)
# self.entropies.append(entropy)
# sample the action and its log_prob from prob
# at each step, compute the action probabilities and the entropy (entropies) and the entropy sum; the sum is the entropy over all action probabilities at that step
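
The comment block above paraphrases the reverse-order return and GAE recursion that every training loop in these examples uses. A minimal, runnable sketch of that recursion on dummy numbers (gamma, tau and the 0.01 entropy weight are assumed values, matching the examples):

# Sketch: backward n-step return / generalized advantage estimation on dummy data.
import torch

gamma, tau = 0.99, 1.0
rewards = [0.0, 1.0, 0.0]                                # r_0 .. r_2
values = [torch.tensor([[0.5]]), torch.tensor([[0.6]]),
          torch.tensor([[0.4]]), torch.tensor([[0.3]])]  # V(s_0)..V(s_3); last entry is the bootstrap value

R = values[-1]                      # R starts from the bootstrapped V(s_3)
gae = torch.zeros(1, 1)
value_loss = 0
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]      # discounted n-step return
    value_loss = value_loss + 0.5 * (R - values[i]).pow(2)
    delta_t = rewards[i] + gamma * values[i + 1] - values[i]  # TD residual
    gae = gae * gamma * tau + delta_t                         # generalized advantage estimate
    # the policy loss would accumulate: -log_prob[i] * gae - 0.01 * entropy[i]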
Example #9
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = Environment()  # create the environment
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    # env.seed(args.seed + rank)
    player = Agent(None, env, args, None)  # create the agent
    player.gpu_id = gpu_id
    num_actions = env.get_num_actions()

    player.model = A3Clstm(
        Config.STACKED_FRAMES,  # A3C model
        num_actions)

    player.state, available = player.env.reset()  # initialize the environment
    player.state = torch.from_numpy(player.state).float()
    player.available = torch.from_numpy(available).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
            player.available = player.available.cuda()
    player.model.train()  # training mode
    player.eps_len += 1
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())  # update the local network
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))  # reinitialize the LSTM state after an episode
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):  # T-max = 20
            player.action_train()
            if player.done:
                break

        if player.done:
            state, available = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.available = torch.from_numpy(available).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
                    player.available = player.available.cuda()

        R = torch.zeros(1, 1)  # if done : R_t-max = 0
        if not player.done:
            value, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data  # R_t-max = V(s)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
Example #10
# Implemented multiprocessing using locks but was not beneficial. Hogwild
# training was far superior

if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)

    obs_shape = setup_json["Spaces"]["observation_channels"]
    action_space = gym.spaces.Discrete(setup_json["Spaces"]["action_shape"])
    shared_model = A3Clstm(obs_shape, action_space)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
        optimizer.share_memory()
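
Main blocks like the one above typically finish by launching one test process and args.workers Hogwild training processes that all operate on the share_memory() model without any locking, as the comment at the top of this example notes. A sketch of that launch step, assuming the train/test signatures used in most of the other examples (the exact argument tuples vary between them):

# Sketch (assumption): spawn lock-free Hogwild workers around the shared model.
import torch.multiprocessing as mp

processes = []
p = mp.Process(target=test, args=(args, shared_model, env_conf))
p.start()
processes.append(p)
for rank in range(args.workers):
    p = mp.Process(target=train,
                   args=(rank, args, shared_model, optimizer, env_conf))
    p.start()
    processes.append(p)
for p in processes:
    p.join()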
Example #11
def test(args, shared_model, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env), r'{0}{1}_log'.format(
        args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)
    model.eval()

    state = env.reset()
    state = torch.from_numpy(state).float()
    reward_sum = 0
    done = True
    start_time = time.time()
    episode_length = 0
    num_tests = 0
    reward_total_sum = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 512), volatile=True)
            hx = Variable(torch.zeros(1, 512), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}".
                format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length, reward_mean))

            if reward_sum > args.save_score_level:
                model.load_state_dict(shared_model.state_dict())
                state_to_save = model.state_dict()
                torch.save(state_to_save, '{0}{1}.dat'.format(
                    args.save_model_dir, args.env))

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state).float()
Example #12
def train(rank, args, shared_model, optimizer, env_conf, iters,
          checkpoint_path):
    iters = dill.loads(iters)
    if args.enable_gavel_iterator and rank == 0:
        iters._init_logger()
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    elapsed_time = 0
    start_time = time.time()

    for i in iters:
        if i % 100 == 0:
            print('GPU %d finished step %d' % (rank, i), flush=True)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
        elapsed_time += time.time() - start_time
        start_time = time.time()

        if (args.throughput_estimation_interval is not None
                and i % args.throughput_estimation_interval == 0
                and rank == 0):
            print('[THROUGHPUT_ESTIMATION]\t%s\t%d' % (time.time(), i))

        if (args.max_duration is not None
                and elapsed_time >= args.max_duration):
            break
    if args.enable_gavel_iterator and rank == 0:
        state = shared_model.state_dict()
        iters.save_checkpoint(state, checkpoint_path)
        iters.complete()
Example #13
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = atari_env(args.env, env_conf, args)
    shared_model = A3Clstm(env.observation_space.shape[0], env.action_space,
                           args.terminal_prediction, args.reward_prediction
                           )  # this is global NN copy workers sync to-from ...
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
Example #14
def train(rank, args, shared_model, optimizer):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]

    writer = SummaryWriter(log_dir=args.log_dir + 'tb_train')
    log = {}
    setup_logger('{}_train_log'.format(rank),
                 r'{0}{1}_train_log'.format(args.log_dir, rank))
    log['{}_train_log'.format(rank)] = logging.getLogger(
        '{}_train_log'.format(rank))
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(env_id=rank, args=args, type='train')
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id

    player.model = A3Clstm(player.env.observation_space.shape[2],
                           player.env.action_space.n)

    player.state = player.env.reset()
    player.state = normalize_rgb_obs(player.state)
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    num_trains = 0

    if not os.path.exists(args.log_dir + "images/"):
        os.makedirs(args.log_dir + "images/")

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()

            if player.done:
                break

        if player.done:
            num_trains += 1
            log['{}_train_log'.format(rank)].info('entropy:{0}'.format(
                player.entropy.data[0]))
            writer.add_scalar("data/entropy_" + str(rank),
                              player.entropy.data[0], num_trains)
            writer.add_image('FCN_' + str(rank), player.fcn, num_trains)
            writer.add_image('Depth_GroundTruth_' + str(rank), player.depth,
                             num_trains)
            writer.add_image('RGB_' + str(rank), player.env.get_rgb(),
                             num_trains)

            save_image(
                player.fcn.data, args.log_dir + "images/" + str(rank) + "_" +
                str(num_trains) + "_fcn.png")
            # print("player.fcn.data:", player.fcn.data)
            save_image(
                player.depth.data, args.log_dir + "images/" + str(rank) + "_" +
                str(num_trains) + "_depth.png")
            cv2.imwrite(
                args.log_dir + "images/" + str(rank) + "_" + str(num_trains) +
                "_rgb.png", player.env.get_rgb())
            # print("player.depth.data:", player.depth.data)

            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            state = normalize_rgb_obs(state)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            with torch.cuda.device(gpu_id):
                value, _, _, _ = player.model(
                    (Variable(player.state.unsqueeze(0)), (player.hx,
                                                           player.cx),
                     Variable(
                         torch.from_numpy(player.env.target).type(
                             torch.FloatTensor).cuda())))
                R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = args.gamma * player.values[
                i + 1].data + player.rewards[i] - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            # policy_loss =  policy_loss - \
            #     player.log_probs[i] * \
            #     Variable(gae) - 0.01 * player.entropies[i] \
            #     + player.fcn_losses[i] # FCN

            policy_loss =  policy_loss - 1e-5*(player.log_probs[i] * Variable(gae)) - 1e-5*(0.01 * player.entropies[i]) \
                + player.fcn_losses[i] * DEPTH_LOSS_DISCOUNT # FCN

            # policy_loss = policy_loss + player.fcn_losses[i]  # FCN

        writer.add_scalar("data/value_loss_" + str(rank), value_loss,
                          num_trains)
        writer.add_scalar("data/policy_loss_" + str(rank), policy_loss,
                          num_trains)

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40.0)
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
Example #15
def test(args, shared_model, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)

    state = env.reset()
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(model, env, args, state)
    player.state = torch.from_numpy(state).float()
    player.model.eval()
    while True:
        if player.starter and player.flag:
            player = player_start(player)
        else:
            player.flag = False
        if player.done and not player.flag:
            player.model.load_state_dict(shared_model.state_dict())
            player.cx = Variable(torch.zeros(1, 512), volatile=True)
            player.hx = Variable(torch.zeros(1, 512), volatile=True)
            player.flag = False
        elif not player.flag:
            player.cx = Variable(player.cx.data, volatile=True)
            player.hx = Variable(player.hx.data, volatile=True)
            player.flag = False
        if not player.flag:
            player, reward = player_act(player, train=False)
            reward_sum += reward

        if not player.done:
            if player.current_life > player.info['ale.lives']:
                player.flag = True
                player.current_life = player.info['ale.lives']
            else:
                player.current_life = player.info['ale.lives']
                player.flag = False

        if player.done:
            num_tests += 1
            player.current_life = 0
            player.flag = True
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if reward_sum > args.save_score_level:
                player.model.load_state_dict(shared_model.state_dict())
                state_to_save = player.model.state_dict()
                torch.save(state_to_save,
                           '{0}{1}.dat'.format(args.save_model_dir, args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
Example #16
def test(args, shared_model, env_conf, shared_counter):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu')

    log = {}
    setup_logger(
        '{}_log'.format(args.env),
        os.path.join(args.log_dir, '{}-{}_log'.format(args.env,
                                                      args.exp_name)))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None, gpu_id=gpu_id)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.model.apply(weights_init)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).to(torch.float32)

    player.model = player.model.to(device)
    player.state = player.state.to(device)

    flag = True
    max_score = 0
    while True:
        if flag:
            player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}, alpha {4:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean,
                    player.model.log_alpha.exp().detach().item()))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                torch.save(
                    player.model.state_dict(),
                    os.path.join(args.save_model_dir,
                                 '{}-{}.dat'.format(args.env, args.exp_name)))

            with shared_counter.get_lock():
                shared_counter.value += player.eps_len
                if shared_counter.value > args.interact_steps:
                    break
            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)

            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)
Example #17
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))

d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

env = Environment(True) #atari_env(True)
#env = atari_env("{}".format(args.env), env_conf, args)
num_tests = 0
start_time = time.time()
reward_total_sum = 0
player = Agent(None, env, args, None)

num_actions = env.get_num_actions()
player.model = A3Clstm(Config.STACKED_FRAMES,
                       num_actions)

player.gpu_id = gpu_id
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()

if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model.load_state_dict(saved_state)
else:
    player.model.load_state_dict(saved_state)

player.model.eval()   # switch the model to evaluation mode
for i_episode in range(args.num_episodes):
    player.state, _ = player.env.reset()
Example #18
def train_rep(args, shared_model, env_conf):
    batch_size = 16
    train_times = args.rep_train_time
    trace = []
    td_class = [(0, 1), (1, 2), (2, 3), (3, 5), (5, 7), (7, 9)]
    loss_fn = nn.CrossEntropyLoss()
    optimizer_r = Adam(shared_model.r_net.parameters(), lr=args.rl_r)
    optimizer_c = Adam(shared_model.c_net.parameters(), lr=args.rl_r)
    ptitle('Train rep')
    gpu_id = args.gpu_ids[-1]

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
            # player.model.r_net = player.model.r_net.cuda()
            # player.model.c_net = player.model.c_net.cuda()
    flag = True
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.train()
            flag = False

        player.action_test()
        trace.append(player.state)
        if len(trace) > args.trace_length:
            # train a few hundred times
            for _ in range(train_times):
                range_c = np.random.randint(0, len(td_class))
                TD = np.random.randint(td_class[range_c][0],
                                       td_class[range_c][1])
                begin = np.random.randint(0, len(trace) - TD - batch_size)
                former = torch.stack(trace[begin:begin + batch_size], dim=0)
                latter = torch.stack(trace[begin + TD:begin + TD + batch_size],
                                     dim=0)
                target = torch.zeros(batch_size, dtype=torch.long) + range_c
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        former = former.cuda()
                        latter = latter.cuda()
                        target = target.cuda()

                rep_f, rep_l = player.model.r_net(former), player.model.r_net(
                    latter)
                output = player.model.c_net(rep_f, rep_l, False)
                loss = loss_fn(output, target)
                optimizer_r.zero_grad()
                optimizer_c.zero_grad()
                loss.backward()
                ensure_shared_grads(player.model.r_net,
                                    shared_model.r_net,
                                    gpu=gpu_id >= 0)
                ensure_shared_grads(player.model.c_net,
                                    shared_model.c_net,
                                    gpu=gpu_id >= 0)
                optimizer_r.step()
                optimizer_c.step()
            trace = []
        if player.done and not player.info:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True

            state = player.env.reset()
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
Example #19
# Based on
# https://github.com/pytorch/examples/tree/master/mnist_hogwild
# Training settings
# Implemented multiprocessing using locks but was not beneficial. Hogwild
# training was far superior

if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')

    shared_model = A3Clstm(OBSERVATION_SPACE_SHAPE[3], ACTION_SIZE)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(
            args.load_model_dir, args.env), map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
        optimizer.share_memory()
    else:
        optimizer = None
Example #20
def train(rank, args, shared_model, optimizer, env_conf, shared_counter,
          targ_shared):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    device = torch.device('cuda:{}'.format(gpu_id) if gpu_id >= 0 else 'cpu')

    torch.manual_seed(args.seed + rank)
    torch.cuda.manual_seed(args.seed + rank)

    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None, gpu_id=gpu_id)

    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)
    player.model.apply(weights_init)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).to(torch.float32)
    player.state = player.state.to(device)
    player.model = player.model.to(device)
    #player.targ_model = copy.deepcopy(player.model)

    player.model.train()
    #player.targ_model.eval()
    player.eps_len += 2
    while True:
        player.model.load_state_dict(shared_model.state_dict())
        #player.targ_model.load_state_dict(targ_shared.state_dict())
        if player.done:
            player.cx = torch.zeros(1, 512).to(device)
            player.hx = torch.zeros(1, 512).to(device)
            #player.targ_cx = copy.deepcopy(player.cx).detach()
            #player.targ_hx = copy.deepcopy(player.hx).detach()
        else:
            player.cx = player.cx.detach()
            player.hx = player.hx.detach()

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).to(torch.float32)
            player.state = player.state.to(device)

        #alpha = player.model.log_alpha.exp().detach()
        alpha = .01
        #alpha = 0
        x_R = torch.zeros(1, 1)
        if not player.done:
            with torch.no_grad():
                action, value, logit, q_value, _ = player.model(
                    (player.state.unsqueeze(0), (player.hx, player.cx)))
                x_R = q_value[1].detach() - alpha * F.log_softmax(
                    logit, -1).gather(-1, action)
        x_R = x_R.to(device)
        policy_loss = 0
        adv_gae_loss = 0
        for i in reversed(range(len(player.rewards))):
            x_R = args.gamma * x_R + player.rewards[i]
            adv_gae_loss = adv_gae_loss + (player.tra_adv_gae[i][1] -
                                           x_R.detach()).pow(2) * .5
            #policy_loss = policy_loss - player.log_probs[i] * player.tra_adv_gae[i][0].detach() + alpha * player.log_probs[i] * player.log_probs[i].detach()

            policy_loss = policy_loss - (F.softmax(
                player.values[i], -1) * player.tra_adv_gae[i][0].detach()).sum(
                    -1) - alpha * player.entropies[i].unsqueeze(-1)
            #policy_loss = policy_loss - player.log_probs[i] * (x_R - (F.softmax(player.values[i], -1) *
            #        player.tra_adv_gae[i][0]).sum(-1) - alpha * player.entropies[i]).detach() + alpha * player.log_probs[i] * player.log_probs[i].detach()
            #prob = F.softmax(player.values[i], -1)
            #ent_alpha = alpha * player.entropies[i].unsqueeze(-1)
            #advs = (player.tra_adv_gae[i][0] -
            #        ((player.tra_adv_gae[i][0] * prob).sum(-1, True) +
            #         ent_alpha)).detach()
            #policy_loss = policy_loss - (prob * advs).sum(-1) - ent_alpha
            x_R = x_R - alpha * player.log_probs[i].detach()
        player.model.zero_grad()
        (policy_loss + .5 * adv_gae_loss).backward(retain_graph=False)

        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        with shared_counter.get_lock():
            shared_counter.value += len(player.rewards)
            if shared_counter.value > args.interact_steps:
                break
Example #21
def test(args, shared_model, env_conf):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    print("test proc:")
    env = AllowBacktracking(make_local_env(env_conf['game'], env_conf['level'], stack=False, scale_rew=False))
    print("test got env:", env.observation_space)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        """
        if player.done and player.info['ale.lives'] > 0 and not player.max_length:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        """
        if player.done or player.max_length:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}".
                format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(state_to_save, '{0}{1}.dat'.format(args.save_model_dir, args.env))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
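
Both training loops in this collection copy worker gradients onto the shared model through ensure_shared_grads(...) before calling optimizer.step(). The helper itself is not shown in these snippets; below is a minimal sketch of how it is commonly written in A3C codebases of this style (an assumption, not taken from the code above).

def ensure_shared_grads(model, shared_model, gpu=False):
    # Point the shared model's .grad slots at this worker's gradients so that
    # optimizer.step() on the shared parameters uses them. GPU workers copy
    # their gradients to CPU first, since the shared model lives in CPU memory.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            return
        elif not gpu:
            shared_param._grad = param.grad
        else:
            shared_param._grad = param.grad.cpu()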
Example #22
def train(rank, args, shared_model, optimizer, env_conf, num_tau_samples=32, num_tau_prime_samples=32, kappa=1.0, num_quantiles=32):
    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(
                shared_model.parameters(), lr=args.lr, amsgrad=args.amsgrad)
    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()
    player.eps_len += 2
    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 512).cuda())
                    player.hx = Variable(torch.zeros(1, 512).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 512))
                player.hx = Variable(torch.zeros(1, 512))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        
        R = torch.zeros(1, num_tau_prime_samples)
        if not player.done:
            # Bootstrap from the greedy action's predicted quantiles; a single
            # forward pass suffices and the target must not carry gradients.
            with torch.no_grad():
                logit, _, _ = player.model((Variable(
                    player.state.unsqueeze(0)), (player.hx, player.cx)))
                q_vals = torch.mean(logit, 0)
                _, action = torch.max(q_vals, 0)
                R = logit[:, action]

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()
        R = R.detach()
        
        value_loss = 0
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]

            advantage = R.repeat(num_tau_samples,1) - player.logits_array[i].repeat(1, num_tau_prime_samples)
            #print("Ad: ",advantage)
            loss = (torch.abs(advantage) <= kappa).float() * 0.5 * advantage ** 2
            #print("loss: ",loss.sum(0).sum(0), loss)
            loss += (torch.abs(advantage) > kappa).float() * kappa * (torch.abs(advantage) - 0.5 * kappa)
            #print("loss: ",loss.sum(0).sum(0), loss)
            # Keep the quantile fractions on the same device as the loss instead
            # of unconditionally calling .cuda(), so CPU-only runs also work.
            quantiles = player.quantiles_array[i].to(advantage.device)
            step_loss = torch.abs(quantiles - (advantage.detach() < 0).float()) * loss / kappa
            value_loss += step_loss.sum(0).mean(0)

        
        player.model.zero_grad()
        value_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()
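
The inner loop of the example above is a quantile-regression Huber loss with threshold kappa. Written as a standalone helper it looks roughly like the sketch below; the function name is illustrative, and the argument shapes follow R, player.logits_array[i] and player.quantiles_array[i] from the loop.

import torch

def quantile_huber_loss(target, prediction, taus, kappa=1.0):
    """target: (1, N') target quantile samples; prediction: (N, 1) predicted
    quantiles; taus: (N, 1) quantile fractions in (0, 1)."""
    # Pairwise TD errors between every target sample and predicted quantile.
    td_error = target.repeat(prediction.size(0), 1) - prediction.repeat(1, target.size(1))
    abs_td = td_error.abs()
    # Huber part: quadratic inside |u| <= kappa, linear outside.
    huber = torch.where(abs_td <= kappa,
                        0.5 * td_error ** 2,
                        kappa * (abs_td - 0.5 * kappa))
    # Asymmetric quantile weighting |tau - 1{u < 0}|; the indicator is detached.
    weight = (taus - (td_error.detach() < 0).float()).abs()
    return (weight * huber / kappa).sum(0).mean()

Accumulating quantile_huber_loss(R, player.logits_array[i], player.quantiles_array[i], kappa) over the rollout reproduces the value_loss built in the loop above.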
Example #23
log = {}
setup_logger('{}_mon_log'.format(args.env), r'{0}{1}_mon_log'.format(
    args.log_dir, args.env))
log['{}_mon_log'.format(args.env)] = logging.getLogger('{}_mon_log'.format(
    args.env))

d_args = vars(args)
for k in d_args.keys():
    log['{}_mon_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

env = atari_env("{}".format(args.env), env_conf, args)
num_tests = 0
start_time = time.time()
reward_total_sum = 0
player = Agent(None, env, args, None)
player.model = A3Clstm(player.env.observation_space.shape[0],
                       player.env.action_space)
player.gpu_id = gpu_id
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()
if args.new_gym_eval:
    player.env = gym.wrappers.Monitor(
        player.env, "{}_monitor".format(args.env), force=True)

if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.model.load_state_dict(saved_state)
else:
    player.model.load_state_dict(saved_state)

player.model.eval()
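
The snippet above only constructs and loads the evaluation model. A hedged sketch of the rollout loop that would typically follow, mirroring the test() functions elsewhere in this collection (the reset and logging details here are assumptions):

state = player.env.reset()
player.state = torch.from_numpy(state).float()
if gpu_id >= 0:
    with torch.cuda.device(gpu_id):
        player.state = player.state.cuda()
reward_sum = 0
while True:
    player.action_test()
    reward_sum += player.reward
    if player.done:
        num_tests += 1
        reward_total_sum += reward_sum
        log['{}_mon_log'.format(args.env)].info(
            "episode reward {0}, episode length {1}, reward mean {2:.4f}".format(
                reward_sum, player.eps_len, reward_total_sum / num_tests))
        break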
Example #24
def train(rank, args, shared_model, optimizer, env_conf):

    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    while True:
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            if args.count_lives:
                player.check_state()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
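
The backward pass above is the standard A3C update: a 0.5-weighted value loss, a policy-gradient term weighted by Generalized Advantage Estimation, and a 0.01 entropy bonus. A minimal sketch of the same recursion as a standalone function (the name is illustrative; the inputs mirror player.rewards, player.values, player.log_probs and player.entropies, with the detached bootstrap value R already appended to values):

import torch

def a3c_losses(rewards, values, log_probs, entropies, R, gamma, tau, entropy_beta=0.01):
    # R is the detached bootstrap value (value.data in the code above).
    policy_loss = 0
    value_loss = 0
    gae = torch.zeros_like(values[0])  # same shape and device as the value estimates
    for i in reversed(range(len(rewards))):
        # n-step return drives the value loss.
        R = gamma * R + rewards[i]
        advantage = R - values[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)

        # Generalized Advantage Estimation: delta_t = r_t + gamma*V(s_{t+1}) - V(s_t).
        delta_t = rewards[i] + gamma * values[i + 1].detach() - values[i].detach()
        gae = gae * gamma * tau + delta_t

        policy_loss = policy_loss - log_probs[i] * gae - entropy_beta * entropies[i]
    return policy_loss, value_loss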
Example #25
def test(args, shared_model, env_conf, lock, counter):
    ptitle('Test Agent')
    gpu_id = args.gpu_ids[-1]
    log = {}
    setup_logger(
        '{}_log'.format(args.env),
        r'{0}{1}-{2}_log'.format(args.log_dir, args.env, args.log_target))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)
    env = atari_env(args.env, env_conf, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.eps_len += 2
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    flag = True
    max_score = 0
    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward

        if player.done and not player.info:
            state = player.env.reset()
            player.eps_len += 2
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
        elif player.info:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            with lock:
                counter.value += 1
            log['{}_log'.format(args.env)].info(
                "UpdateStep {0} Time {1}, episode reward {2}, episode length {3}, reward mean {4:.4f}"
                .format(
                    counter.value,
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if args.save_max and reward_sum >= max_score:
                max_score = reward_sum
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(
                            state_to_save,
                            '{0}{1}_{2}.dat'.format(args.save_model_dir,
                                                    args.env, args.log_target))
                else:
                    state_to_save = player.model.state_dict()
                    torch.save(
                        state_to_save,
                        '{0}{1}_{2}.dat'.format(args.save_model_dir, args.env,
                                                args.log_target))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            player.eps_len += 2
            time.sleep(10)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
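
test() above expects a lock and a counter that the main process creates and shares across all workers. A minimal sketch of that setup (the Process wiring is an assumption about how the pieces fit together):

import torch.multiprocessing as mp

counter = mp.Value('i', 0)   # process-shared count of policy updates
lock = mp.Lock()             # guards increments from multiple workers

# Hypothetical wiring, following test()'s signature above:
# p = mp.Process(target=test, args=(args, shared_model, env_conf, lock, counter))
# p.start()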
Example #26
# python main.py --env Pong-v0 --workers 7 --gpu-ids 0 --amsgrad True --pre-rnet 1wsam --rep-train-time 10 --trace-length 50 --log-target name
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = atari_env(args.env, env_conf, args)
    shared_model = A3Clstm(env.observation_space.shape[0], env.action_space,
                           args.pre_rnet)
    if args.load:
        saved_state = torch.load('{0}{1}_{2}.dat'.format(
            args.load_model_dir, args.env, args.log_target),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
            optimizer_r = SharedAdam(shared_model.r_net.parameters(),
                                     lr=args.lr,
                                     amsgrad=args.amsgrad)
Example #27
if __name__ == '__main__':
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    if args.gpu_ids == -1:
        args.gpu_ids = [-1]
    else:
        torch.cuda.manual_seed(args.seed)
        mp.set_start_method('spawn')
    setup_json = read_config(args.env_config)
    env_conf = setup_json["Default"]
    for i in setup_json.keys():
        if i in args.env:
            env_conf = setup_json[i]
    env = Environment()
    num_actions = env.get_num_actions()
    shared_model = A3Clstm(Config.STACKED_FRAMES, num_actions)
    if args.load:
        saved_state = torch.load('{0}{1}.dat'.format(args.load_model_dir,
                                                     args.env),
                                 map_location=lambda storage, loc: storage)
        shared_model.load_state_dict(saved_state)
    shared_model.share_memory()

    if args.shared_optimizer:
        if args.optimizer == 'RMSprop':
            optimizer = SharedRMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = SharedAdam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
        optimizer.share_memory()
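
SharedRMSprop and SharedAdam differ from the stock optimizers in that their running statistics live in shared memory, so every worker process advances the same moment estimates. A minimal SharedAdam-style sketch of that idea (an approximation of the classes used above, which may differ in details; amsgrad state is omitted here):

import math
import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0, amsgrad=False):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay, amsgrad=amsgrad)
        # Pre-allocate optimizer state so it exists before share_memory() runs.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # Move the state tensors into shared memory so all workers see them.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        # Plain Adam update written against the shared state tensors
        # (amsgrad is accepted for signature compatibility but not applied).
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                step_t = state['step'].item()
                bias_correction1 = 1 - beta1 ** step_t
                bias_correction2 = 1 - beta2 ** step_t
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss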
Example #28
def train(rank, reward_type, args, shared_model, optimizer, env_conf):
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(
        args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf)
    env.seed(args.seed + rank)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None, reward_type)
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    for i in itertools.count():
        if i % 10 == 0:
            print("reward type {0}, iter {1}".format(reward_type, i))
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args.num_steps):
            player.action_train()
            reward_sum += player.reward
            if args.count_lives:
                player.check_state()
            if player.done:
                break

        if player.done:
            num_tests += 1
            player.current_life = 0
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()
Example #29
def train(rank, args, shared_model, optimizer, env_conf):
    torch.manual_seed(args.seed + rank)

    env = atari_env(args.env, env_conf)
    model = A3Clstm(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    env.seed(args.seed + rank)
    state = env.reset()
    player = Agent(model, env, args, state)
    player.state = torch.from_numpy(state).float()
    player.model.train()
    epoch = 0
    while True:

        player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            player.cx = Variable(torch.zeros(1, 512))
            player.hx = Variable(torch.zeros(1, 512))
            if player.starter:
                player = player_start(player, train=True)
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):

            player = player_act(player, train=True)

            if player.done:
                break

            if player.current_life > player.info['ale.lives']:
                player.flag = True
                player.current_life = player.info['ale.lives']
            else:
                player.current_life = player.info['ale.lives']
                player.flag = False
            if args.count_lives:
                if player.flag:
                    player.done = True
                    break

            if player.starter and player.flag:
                player = player_start(player, train=True)
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            player.flag = False

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss += 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()

        (policy_loss + value_loss).backward()

        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.values = []
        player.log_probs = []
        player.rewards = []
        player.entropies = []
Example #30
def train(rank, args, shared_model, optimizer, env_conf):

    ptitle('Training Agent: {}'.format(rank))
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = atari_env(args.env, env_conf, args)
    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(),
                                   lr=args.lr,
                                   amsgrad=args.amsgrad)
    env.seed(args.seed + rank)

    tp_weight = args.tp

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = A3Clstm(player.env.observation_space.shape[0],
                           player.env.action_space, args.terminal_prediction,
                           args.reward_prediction)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    # Below is where the worker cores run episodes continuously ...
    average_ep_length = 0

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.eps_len += 1
            player.action_train()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        reward_pred_loss = 0
        terminal_loss = 0

        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)  # TODO: why is this needed here?

        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * player.values[
                i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - player.log_probs[i] * Variable(
                gae) - 0.01 * player.entropies[i]

            if args.reward_prediction:
                reward_pred_loss = reward_pred_loss + (
                    player.reward_predictions[i] - player.rewards[i]).pow(2)

        if args.terminal_prediction:  # new way of using the empirical episode length as a proxy for the current episode's length.
            if player.average_episode_length is None:
                end_predict_labels = np.arange(
                    player.eps_len - len(player.terminal_predictions),
                    player.eps_len) / player.eps_len  # heuristic
            else:
                end_predict_labels = np.arange(
                    player.eps_len - len(player.terminal_predictions),
                    player.eps_len) / player.average_episode_length

            for i in range(len(player.terminal_predictions)):
                terminal_loss = terminal_loss + (
                    player.terminal_predictions[i] -
                    end_predict_labels[i]).pow(2)

            terminal_loss = terminal_loss / len(player.terminal_predictions)

        player.model.zero_grad()
        #print(f"policy loss {policy_loss} and value loss {value_loss} and terminal loss {terminal_loss} and reward pred loss {reward_pred_loss}")

        total_loss = policy_loss + 0.5 * value_loss + tp_weight * terminal_loss + 0.5 * reward_pred_loss

        total_loss.backward()  # will free memory ...

        # Visualize Computation Graph
        #graph = make_dot(total_loss)
        #from graphviz import Source
        #Source.view(graph)

        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if player.done:
            if player.average_episode_length is None:  # initial one
                player.average_episode_length = player.eps_len
            else:
                player.average_episode_length = int(
                    0.99 * player.average_episode_length +
                    0.01 * player.eps_len)
            #print(player.average_episode_length, 'current one is ', player.eps_len)
            player.eps_len = 0  # reset here
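
The terminal-prediction targets above are a heuristic: each step's label is its approximate progress through the episode, divided by the running average episode length once that estimate exists. The same construction as a small standalone helper (a sketch; the name is illustrative):

import numpy as np

def terminal_prediction_targets(eps_len, num_preds, average_episode_length=None):
    # Steps covered by this rollout, expressed as indices into the episode.
    steps = np.arange(eps_len - num_preds, eps_len)
    # Fall back to the current episode length until a running average exists.
    denom = eps_len if average_episode_length is None else average_episode_length
    return steps / denom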