Example #1
def main():
    #env
    args = config()
    mp.set_start_method("spawn")
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(env.observation_space.shape[0],
                               env.action_space)
    shared_model.share_memory()

    optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()

    processes = []

    counter = mp.Value('i', 0)
    lock = mp.Lock()

    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, counter,
                         "./log/"))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, counter, lock,
                             optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
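Example #1 depends on my_optim.SharedAdam, an Adam variant whose statistics live in shared memory so every worker process updates the same optimizer state. That module is not reproduced on this page; the following is a minimal sketch of what it typically looks like in these A3C repositories (names, defaults and the step() details are assumptions, not code taken from the example above).

import math

import torch
import torch.optim as optim


class SharedAdam(optim.Adam):
    """Adam whose state tensors are allocated in shared memory (sketch)."""

    def __init__(self, params, lr=1e-4, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # pre-create the Adam statistics so they exist before share_memory()
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        # move the statistics into shared memory so all workers see one copy
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_c1 = 1 - beta1 ** state['step'].item()
                bias_c2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * math.sqrt(bias_c2) / bias_c1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss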
Example #2
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

        if done and counter.value > args.max_steps:
            test_final(shared_model, env, args)
            save_model(shared_model, args)
            exit()

        with torch.no_grad():
            value, logit = model(state.unsqueeze(0))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print(
                "Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    counter.value, counter.value / (time.time() - start_time),
                    reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
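Examples #1, #2 and #23 share a single global step counter between processes through a multiprocessing Value guarded by a Lock. The pattern is easy to test in isolation; the snippet below is a standalone illustration, not part of any of the examples.

import multiprocessing as mp


def worker(counter, lock, n):
    for _ in range(n):
        with lock:                 # guard the increment, exactly as the train loops do
            counter.value += 1


if __name__ == '__main__':
    counter = mp.Value('i', 0)     # shared integer step counter
    lock = mp.Lock()
    processes = [mp.Process(target=worker, args=(counter, lock, 1000))
                 for _ in range(4)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
    print(counter.value)           # prints 4000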
Example #3
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank) # shifting the seed with rank so each test agent is desynchronized from the training agents
    env = create_atari_env(params.env_name, video=True) # running an environment with a video
    env.seed(params.seed + rank) # seeding the environment the same way
    model = ActorCritic(env.observation_space.shape[0], env.action_space) # creating one model
    model.eval() # putting the model in "eval" mode because it won't be trained
    state = env.reset() # getting the input images as numpy arrays
    state = torch.from_numpy(state) # converting them into torch tensors
    reward_sum = 0 # initializing the sum of rewards to 0
    done = True # initializing done to True
    start_time = time.time() # getting the starting time to measure the computation time
    actions = deque(maxlen=100) # cf https://pymotw.com/2/collections/deque.html
    episode_length = 0 # initializing the episode length to 0
    while True: # repeat
        episode_length += 1 # incrementing the episode length by one
        if done: # synchronizing with the shared model (same as train.py)
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy() # the test agent does not explore, it directly plays the best action
        state, reward, done, _ = env.step(action[0, 0]) # done = done or episode_length >= params.max_episode_length
        reward_sum += reward
        if done: # printing the results at the end of each part
            print("Time {}, episode reward {}, episode length {}".format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length))
            reward_sum = 0 # reinitializing the sum of rewards
            episode_length = 0 # reinitializing the episode length
            actions.clear() # reinitializing the actions
            state = env.reset() # reinitializing the environment
            time.sleep(60) # doing a one minute break to let the other agents practice (if the game is done)
        state = torch.from_numpy(state) # new state and we continue
Example #4
    def __init__(self,
                 sess,
                 GameName,
                 name,
                 seed,
                 rank,
                 globalAC,
                 OPT,
                 coord,
                 UPDATE_GLOBAL_ITER,
                 MAX_GLOBAL_EP,
                 GAMMA,
                 ENTROPY_BETA,
                 lstm=True):
        """Initialize the object of the class Worker.
        Args:
            sess: the running session
            GameName: name of game
            name: name of the worker/scope
            seed: base random seed
            rank: worker index, added to the seed
            globalAC: the global net
            OPT: the shared optimizer
            coord: training coordinator
            UPDATE_GLOBAL_ITER: number of local steps between global updates
            MAX_GLOBAL_EP: the maximum number of global episodes
            GAMMA: discount factor applied to future rewards
            ENTROPY_BETA: coefficient of the entropy loss
            lstm: whether to use LSTM
        """

        self.sess = sess
        self.name = name
        self.lstm = lstm
        self.env = create_atari_env(GameName)
        self.env.seed(seed + rank)
        shape = self.env.observation_space.shape
        if lstm:
            self.AC = A3CNet_LSTM(
                self.env,
                name,
                sess,
                OPT,
                ENTROPY_BETA,
                globalAC,
                input_shape=[None, shape[1], shape[2], shape[0]])
        else:
            self.AC = A3CNet_CONV(
                self.env,
                name,
                sess,
                OPT,
                ENTROPY_BETA,
                globalAC,
                input_shape=[None, shape[1], shape[2], shape[0]])
        self.coord = coord
        self.T0 = time.time()
        self.UPDATE_GLOBAL_ITER = UPDATE_GLOBAL_ITER
        self.MAX_GLOBAL_EP = MAX_GLOBAL_EP
        self.GAMMA = GAMMA
Example #5
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
Example #6
def monitor(rank, args, shared_model):

    env = create_atari_env(args.env_name)
    env = wrappers.Monitor(env,
                           './video/pong-a3c',
                           video_callable=lambda count: count % 30 == 0,
                           force=True)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    # eval mode
    model.eval()

    # init
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    episode_length = 0
    done = True
    start_time = time.time()

    while True:
        env.render()
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)  # lstm's param
            hx = Variable(torch.zeros(1, 256), volatile=True)  # lstm's param
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        # after unsqueeze(0) the tensor size goes from 1x42x42 to 1x1x42x42
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # directly pick the action with the highest probability
        action = prob.max(1, keepdim=True)[1].data.numpy()

        state, reward, done, _ = env.step(action[0][0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            # reset
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
Example #7
def test(shared_model, render=0):
    # torch.manual_seed(rank)

    env = create_atari_env(args.rom)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0).type(FloatTensor), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # print logit.data.numpy()
        action = prob.max(1, keepdim=True)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        if render == 1:
            env.render()
            time.sleep(0.03)
        done = done or episode_length >= 10000
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".
                  format(get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
Example #8
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)  # shift the seed by rank so each test agent is desynchronized
    env = create_atari_env(params.env_name,
                           video=True)  # run the environment with video recording
    env.seed(params.seed + rank)  # seed the environment the same way

    model = ActorCritic(env.observation_space.shape[0],
                        env.action_space)  # build the model
    model.eval()  # the model is not trained here

    state = env.reset()  # get the input image as a numpy array
    state = torch.from_numpy(state)  # convert it into a torch tensor
    reward_sum = 0
    done = True
    start_time = time.time()  # starting time
    actions = deque(maxlen=100)  # https://pymotw.com/2/collections/deque.html
    episode_length = 0

    while True:
        episode_length += 1  # increment the episode length by one
        if done:  # synchronize with the shared model, as in training
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy(
        )  # the test agent does not explore; it directly plays the best action
        state, reward, done, _ = env.step(action[
            0,
            0])  # done = done or episode_length >= params.max_episode_length
        reward_sum += reward

        if done:  # print the results at the end of each episode
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)  # wait one minute so the other agents can keep training
        state = torch.from_numpy(
            state)  # new state, and the loop continues
Example #9
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            save(model, 'brain.pkl')
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward
        if done:
            f = open("Statistics.txt", 'a')
            f.write(str(reward_sum) + " " + str(episode_length) + "\n")
            f.close()
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
Example #10
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(
                60
            )  # 60 seconds break to allow the other agents to test the environment
        state = torch.from_numpy(state)
Example #11
def initEnv():
  env = create_atari_env('PongDeterministic-v4')
  state = env.reset()
  return env, state
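Every example on this page imports create_atari_env from a local envs module that is not shown here. In the repositories these snippets come from (universe-starter-agent and its pytorch-a3c port), it wraps a Gym Atari environment so observations become 1x42x42 grayscale floats. The sketch below is an approximation under that assumption, using the older Gym API the examples rely on; the real module usually also adds a running-mean observation normalizer, and the video keyword used by some examples is omitted here.

import cv2
import gym
import numpy as np


def _process_frame42(frame):
    # crop the playing area, downscale to 42x42, convert to grayscale in [0, 1]
    frame = frame[34:34 + 160, :160]
    frame = cv2.resize(frame, (80, 80))
    frame = cv2.resize(frame, (42, 42))
    frame = frame.mean(2, keepdims=True).astype(np.float32) / 255.0
    return np.moveaxis(frame, -1, 0)          # shape (1, 42, 42)


class AtariRescale42x42(gym.ObservationWrapper):
    def __init__(self, env):
        super(AtariRescale42x42, self).__init__(env)
        self.observation_space = gym.spaces.Box(0.0, 1.0, [1, 42, 42],
                                                dtype=np.float32)

    def observation(self, observation):
        return _process_frame42(observation)


def create_atari_env(env_id, video=False):
    # the (1, 42, 42) shape is why env.observation_space.shape[0] == 1 is
    # passed to the ActorCritic constructors above; video recording omitted
    return AtariRescale42x42(gym.make(env_id))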
Example #12
class Params():
    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'
params = Params()
torch.manual_seed(params.seed)
env = create_atari_env(params.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()
processes = []
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
Example #13
def train(rank, shared_model, optimizer):
    """
    :param rank: worker-ID
    :param shared_model: model to sync between workers
    :param optimizer:
    :return:
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20
    max_episode_length = 10000
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0
    checkpoint_n = 20

    env = create_atari_env(romname)
    env.seed(SEED + rank)
    state = env.reset()
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    t = 0
    done = True
    episodes = 0
    reward_sum = 0
    reward_sum1 = 0
    start_time = time.time()
    best_reward = -999
    isbest = 0
    cx = hx = None
    while True:
        model.load_state_dict(shared_model.state_dict())
        if done:  # need to reset LSTM cell's input
            cx = Variable(torch.zeros(1, 256)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)  # basically this is to detach from previous comp graph

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):
            t += 1
            v, logit, (hx, cx) = model((state, (hx, cx)))
            states.append(state)
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial().detach()  # detach -- so backprop will NOT go through multinomial()
            log_prob = log_prob.gather(1, action)
            action = action.data[0, 0]
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward
            done = done or t >= max_episode_length
            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1
                if episodes % 10 == 0:
                    time_str = time.strftime(
                        "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".
                          format(time_str, rank, episodes, reward_sum / 10.0, t_))
                    reward_sum = 0.0

                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    if best_reward < ave_reward:
                        isbest = 1
                        best_reward = ave_reward

                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".
                          format(get_elapsed_time_str(), rank, episodes, ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint({'epoch': episodes,
                                     'average_reward': ave_reward,
                                     'time': time.time(),
                                     'state_dict': model.state_dict(),
                                     'optimizer': optimizer.state_dict(),
                                     }, isbest, checkpoint_fname)
                    reward_sum1 = 0.0

            state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor), requires_grad=False)
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        # We reach here because either
        # i) an episode ends, such as game over
        # ii) we have explored a certain number of steps into the future and now it is
        #     time to look back and summarise the experience gathered so far
        if done:
            R = torch.zeros(1, 1).type(FloatTensor)
        else:
            value, _, _ = model((state, (hx, cx)))
            R = value.data

        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]  # type: Variable
            critic_loss += 0.5 * advantage.pow(2)
            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad()
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()  # backpropagate the combined loss
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
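Examples #13, #20, #23 and #25 all call ensure_shared_grads(model, shared_model) after backward() and before optimizer.step(). The helper is not shown on this page; in the pytorch-a3c code these snippets descend from, it simply points the shared model's gradient slots at the local worker's gradients, roughly as sketched below.

def ensure_shared_grads(model, shared_model):
    # copy the worker's gradients into the shared model; if the shared
    # gradients are already populated, leave them alone
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad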
Example #14
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".
                  format(get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)


if __name__ == '__main__':
    env = create_atari_env(args.rom)
    # torch.manual_seed(SEED)
    shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()
    # print (shared_model.conv1._parameters['weight'].data.is_cuda)
    optimizer = SharedAdam(shared_model.parameters(), lr=0.0001)
    optimizer.share_memory()

    if args.play:
        if os.path.isfile(args.play):
            print("=> loading checkpoint '{}'".format(args.play))
            checkpoint = torch.load(args.play)
            #            args.start_epoch = checkpoint['epoch']
            #            best_prec1 = checkpoint['best_prec1']
            shared_model.load_state_dict(checkpoint['state_dict'])
            #optimizer.load_state_dict(checkpoint['optimizer'])
Example #15
def train(rank, params, shared_model, optimizer):
    torch.manual_seed(params.seed + rank) # shifting the seed with rank to asynchronize each training agent
    env = create_atari_env(params.env_name) # creating an optimized environment thanks to the create_atari_env function
    env.seed(params.seed + rank) # aligning the seed of the environment on the seed of the agent
    model = ActorCritic(env.observation_space.shape[0], env.action_space) # creating the model from the ActorCritic class
    state = env.reset() # state is a numpy array of size 1*42*42, in black & white
    state = torch.from_numpy(state) # converting the numpy array into a torch tensor
    done = True # starting as done so the LSTM state gets initialized on the first iteration
    episode_length = 0 # initializing the length of an episode to 0
    while True: # repeat
        episode_length += 1 # incrementing the episode length by one
        model.load_state_dict(shared_model.state_dict()) # synchronizing with the shared model - the agent gets the shared model to do an exploration on num_steps
        if done: # if it is the first iteration of the while loop or if the game was just done, then:
            cx = Variable(torch.zeros(1, 256)) # the cell states of the LSTM are reinitialized to zero
            hx = Variable(torch.zeros(1, 256)) # the hidden states of the LSTM are reinitialized to zero
        else: # else:
            cx = Variable(cx.data) # we keep the old cell states, making sure they are in a torch variable
            hx = Variable(hx.data) # we keep the old hidden states, making sure they are in a torch variable
        values = [] # initializing the list of values (V(S))
        log_probs = [] # initializing the list of log probabilities
        rewards = [] # initializing the list of rewards
        entropies = [] # initializing the list of entropies
        for step in range(params.num_steps): # going through the num_steps exploration steps
            value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx))) # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
            prob = F.softmax(action_values) # generating a distribution of probabilities of the Q-values according to the softmax: prob(a) = exp(prob(a))/sum_b(exp(prob(b)))
            log_prob = F.log_softmax(action_values) # generating a distribution of log probabilities of the Q-values according to the log softmax: log_prob(a) = log(prob(a))
            entropy = -(log_prob * prob).sum(1) # H(p) = - sum_x p(x).log(p(x))
            entropies.append(entropy) # storing the computed entropy
            action = prob.multinomial().data # selecting an action by taking a random draw from the prob distribution
            log_prob = log_prob.gather(1, Variable(action)) # getting the log prob associated to this selected action
            values.append(value) # storing the value V(S) of the state
            log_probs.append(log_prob) # storing the log prob of the action
            state, reward, done, _ = env.step(action.numpy()) # playing the selected action, reaching the new state, and getting the new reward
            done = (done or episode_length >= params.max_episode_length) # if the episode lasts too long (the agent is stuck), then it is done
            reward = max(min(reward, 1), -1) # clamping the reward between -1 and +1
            if done: # if the episode is done:
                episode_length = 0 # we restart the environment
                state = env.reset() # we restart the environment
            state = torch.from_numpy(state) # tensorizing the new state
            rewards.append(reward) # storing the new observed reward
            if done: # if we are done
                break # we stop the exploration and we directly move on to the next step: the update of the shared model
        R = torch.zeros(1, 1) # initializing the cumulative reward
        if not done: # if we are not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx))) # we initialize the cumulative reward with the value of the last shared state
            R = value.data # we initialize the cumulative reward with the value of the last shared state
        values.append(Variable(R)) # storing the value V(S) of the last reached state S
        policy_loss = 0 # initializing the policy loss
        value_loss = 0 # initializing the value loss
        R = Variable(R) # making sure the cumulative reward R is a torch Variable
        gae = torch.zeros(1, 1) # initializing the Generalized Advantage Estimation to 0
        for i in reversed(range(len(rewards))): # starting from the last exploration step and going back in time
            R = params.gamma * R + rewards[i] # R = gamma*R + r_t = r_0 + gamma r_1 + gamma^2 * r_2 ... + gamma^(n-1)*r_(n-1) + gamma^nb_step * V(last_state)
            advantage = R - values[i] # R is an estimator of Q at time t = i so advantage_i = Q_i - V(state_i) = R - value[i]
            value_loss = value_loss + 0.5 * advantage.pow(2) # computing the value loss
            TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data # computing the temporal difference
            gae = gae * params.gamma * params.tau + TD # gae = sum_i (gamma*tau)^i * TD(i) with gae_i = gae_(i+1)*gamma*tau + (r_i + gamma*V(state_i+1) - V(state_i))
            policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i] # computing the policy loss
        optimizer.zero_grad() # resetting the gradients before backpropagation
        (policy_loss + 0.5 * value_loss).backward() # we give 2x more importance to the policy loss than the value loss because the policy loss is smaller
        torch.nn.utils.clip_grad_norm(model.parameters(), 40) # rescaling the gradients so their norm does not exceed 40, preventing them from blowing up and destabilizing the algorithm
        ensure_shared_grads(model, shared_model) # making sure the model of the agent and the shared model share the same gradient
        optimizer.step() # running the optimization step
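The backward loop in Example #15 (and in Examples #13, #20, #23 and #25) implements the return recursion R = gamma*R + r_i and the GAE recursion gae = gamma*tau*gae + TD_i. A tiny standalone check of those two recursions, with made-up rewards and values, looks like this.

gamma, tau = 0.99, 1.0
rewards = [0.0, 0.0, 1.0]        # hypothetical clipped rewards for a 3-step rollout
values = [0.5, 0.6, 0.7, 0.0]    # V(s_0), V(s_1), V(s_2) plus the bootstrap value

R = values[-1]
gae = 0.0
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]                              # discounted return
    td = rewards[i] + gamma * values[i + 1] - values[i]     # temporal-difference error
    gae = gamma * tau * gae + td                            # generalized advantage
    print(i, round(R, 4), round(td, 4), round(gae, 4))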
Example #16
def make_env():
  env = create_atari_env(FLAGS.env)
  return env
Example #17
        ap = model.output(s)
        a = int(np.argmax(ap))
        s_, r, done, info = env.step(a)

        score += r
        frame += 1
        s = s_
        if done:
            break
    return score, frame


#game_Name = 'PongDeterministic-v4'
#game_Name = 'AlienDeterministic-v4'
game_Name = 'YarsRevengeDeterministic-v4'
env = create_atari_env(game_Name)
shape = env.observation_space.shape
N_A = env.action_space.n

# saver = tf.train.import_meta_graph('modellstm/model/model.ckpt.meta')
SESS = tf.Session()
# saver.restore(SESS, "./model15k/model.ckpt") # note the path format
# graph = tf.get_default_graph()
ckpt_path = "./modellstm/model/model.ckpt"
input = tf.placeholder(tf.float32, [1, 42, 42, 1],
                       's')  #graph.get_operation_by_name('s_1').outputs[0]#

# var_to_shape_map = reader.get_variable_to_shape_map()
# for key in var_to_shape_map:
#     if key.find('Global_Net/actor') != -1:
#         print("tensor_name: ", key)
Example #18
parser.add_argument('--num-processes', type=int, default=4, metavar='N',
                    help='how many training processes to use (default: 4)')
parser.add_argument('--num-steps', type=int, default=20, metavar='NS',
                    help='number of forward steps in A3C (default: 20)')
parser.add_argument('--max-episode-length', type=int, default=10000, metavar='M',
                    help='maximum length of an episode (default: 10000)')
parser.add_argument('--env-name', default='PongDeterministic-v3', metavar='ENV',
                    help='environment to train on (default: PongDeterministic-v3)')


if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    env = create_atari_env(args.env_name)
    shared_model = ActorCritic(
        env.observation_space.shape[0], env.action_space)
    shared_model.share_memory()

    processes = []

    p = mp.Process(target=test, args=(args.num_processes, args, shared_model))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=train, args=(rank, args, shared_model))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
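Example #18 shows only four of the parser.add_argument calls; the parser object itself and other arguments the script uses (for example args.seed) are defined earlier in the original file. A plausible preamble, with defaults that are typical of these A3C scripts rather than taken from this page, would be:

import argparse

parser = argparse.ArgumentParser(description='A3C')
parser.add_argument('--lr', type=float, default=0.0001, metavar='LR',
                    help='learning rate (default: 0.0001)')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor for rewards (default: 0.99)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')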
Example #19
File: test.py  Project: 404akhan/research
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    if not os.path.exists('models-a3c'):
        os.makedirs('models-a3c')
    path = 'models-a3c/model-{}.pth'.format(args.model_name)
    print('saving directory is', path)

    model = ActorCritic(env.action_space.n, args.num_atoms, args.gamma)
    model.eval()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    action_stat = [0] * model.num_outputs

    start_time = time.time()
    episode_length = 0

    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

            torch.save(shared_model.state_dict(), path)
            print('saved model')

        atoms_logit, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        action_np = action[0, 0]
        action_stat[action_np] += 1

        state_new, reward, done, info = env.step(action_np)
        dead = is_dead(info)

        if args.testing:
            atoms_prob = F.softmax(atoms_logit)
            value = model.get_v(atoms_prob, batch=False)
            atoms_prob = atoms_prob.squeeze().data.numpy()

            print('episode', episode_length, 'normal action', action_np,
                  'lives', info['ale.lives'], 'value', value)
            env.render()

            if ep_counter % 100 == 0:
                plt.plot(model.z, atoms_prob)
                plt.title('average v is {}'.format(value))
                plt.show()
        state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
        done = done or episode_length >= args.max_episode_length

        reward_sum += reward
        episode_length += 1

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}".format(
                action_stat[:model.num_outputs]))

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            env.seed(args.seed + rank + (args.num_processes + 1) * ep_counter)
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * model.num_outputs
            if not args.testing: time.sleep(60)

        state = torch.from_numpy(state)
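Example #19 feeds the network a rolling stack of the last four preprocessed frames: the stack is initialized by repeating the first observation and then updated by dropping the oldest frame and appending the newest one. A standalone illustration of that update with dummy 1x42x42 arrays (the per-frame shape is assumed from the other examples):

import numpy as np


def make_frame(value):
    # dummy 1x42x42 frame standing in for a preprocessed observation
    return np.full((1, 42, 42), value, dtype=np.float32)


state = np.concatenate([make_frame(0)] * 4, axis=0)        # initial stack, shape (4, 42, 42)
new_frame = make_frame(1)
state = np.append(state[1:, :, :], new_frame, axis=0)      # drop the oldest, append the newest
print(state.shape)       # (4, 42, 42)
print(state[:, 0, 0])    # [0. 0. 0. 1.]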
Example #20
def train(rank, shared_model, optimizer):
    """
    :param rank: worker-ID
    :param shared_model: model to sync between workers
    :param optimizer:
    :return:
    """
    # torch.manual_seed(SEED + rank)
    ac_steps = 20  # number of forward steps per update (the rollout length)
    max_episode_length = 10000  # an episode is cut off after this many steps
    gamma = 0.99
    tau = 1.0
    max_grad_norm = 50.0  # rescale gradients whose norm exceeds this value, to keep updates stable
    checkpoint_n = 20  # save a checkpoint every this many episodes; increase it on a slower machine

    env = create_atari_env(
        romname
    )  # create the game environment; romname selects which game to play
    env.seed(SEED + rank)  # seed the environment so each worker's run is reproducible
    state = env.reset()
    # convert the pixel observation to a torch FloatTensor and add a batch dimension with unsqueeze(0)
    state = Variable(torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                     requires_grad=False)
    # build the actor-critic model for this observation shape and action space
    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    t = 0
    done = True  # start as 'done' so the LSTM state is initialized on the first iteration
    episodes = 0
    reward_sum = 0
    reward_sum1 = 0
    start_time = time.time()
    best_reward = -999
    isbest = 0
    cx = hx = None
    while True:
        model.load_state_dict(shared_model.state_dict(
        ))  # Pull the up to date model from the shared model
        if done:  # need to reset LSTM cell's input
            # the LSTM units need their own output to feed into next step
            # input (hence the name of the kind: recurrent neural nets).
            # At the beginning of an episode, to get things started,
            # we need to allocate some initial values in the required format,
            # i.e. the same size as the output of the layer.
            #
            # see http://pytorch.org/docs/master/_modules/torch/nn/modules/rnn.html#LSTM
            # for details
            #
            # Optionally, you can remove LSTM to simplify the code
            # Think: what is the possible loss?
            cx = Variable(torch.zeros(1, 256)).type(
                FloatTensor
            )  # torch.zeros - setting the values to all zeros since there's nothing there yet
            hx = Variable(torch.zeros(1, 256)).type(FloatTensor)
        else:
            cx = Variable(
                cx.data)  # takes the last computed value for the next input
            hx = Variable(
                hx.data
            )  # basically this is to detach from previous comp graph

        states = []
        values = []
        log_probs = []
        rewards = []
        entropies = []

        for i in range(ac_steps):  # Running through the 20 steps
            t += 1
            v, logit, (hx, cx) = model(
                (state, (hx, cx))
            )  # the model returns the value estimate, the action logits and the new LSTM state
            states.append(state)
            prob = F.softmax(logit)  # turn the logits into a probability distribution over actions
            log_prob = F.log_softmax(
                logit)  # log-probabilities, used for the policy-gradient loss
            entropy = -(log_prob * prob).sum(
                1, keepdim=True
            )  # entropy bonus, used later to encourage exploration
            entropies.append(entropy)

            # detach() cuts the sampled action out of the computation graph, so
            # backprop will NOT go through multinomial(); gradients reach the
            # policy only through log_prob below
            action = prob.multinomial().detach()
            # use the current action as an index to get the
            # corresponding log probability
            log_prob = log_prob.gather(
                1, action
            )  # gather works batch-wise, so it can pick out the probabilities of many actions at once

            action = action.data[
                0,
                0]  # extract the action index as a plain Python integer
            # step the environment with the chosen action; env.step returns the next
            # state, the reward, the done flag and an info dict (ignored here via _)
            state, reward, done, _ = env.step(action)
            reward_sum += reward
            reward_sum1 += reward  # reward_sum is reported every 10 episodes, reward_sum1 every checkpoint_n episodes
            done = (done or t >= max_episode_length)
            if done:
                t_ = t
                t = 0
                state = env.reset()
                episodes += 1
                if episodes % 10 == 0:
                    time_str = time.strftime(
                        "%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                    print("Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {}".format(time_str, rank, episodes,
                                                     reward_sum / 10.0, t_))
                    reward_sum = 0.0

                if episodes % checkpoint_n == 0:
                    ave_reward = reward_sum1 / checkpoint_n
                    if best_reward < ave_reward:
                        isbest = 1
                        best_reward = ave_reward

                    print("Saving checkpoint Time {}, worker-{} episode {} "
                          "mean episode reward {}, "
                          "episode length {} best_reward {}".format(
                              get_elapsed_time_str(), rank, episodes,
                              ave_reward, t_, best_reward))
                    checkpoint_fname = os.path.join(
                        args.savedir,
                        args.rom + '_worker' + str(rank) + '_' + str(episodes))
                    save_checkpoint(
                        {
                            'epoch': episodes,
                            'average_reward': ave_reward,
                            'time': time.time(),
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        }, isbest, checkpoint_fname)
                    reward_sum1 = 0.0

            state = Variable(
                torch.from_numpy(state).unsqueeze(0).type(FloatTensor),
                requires_grad=False)
            reward = max(min(reward, 1), -1)
            values.append(v)
            log_probs.append(log_prob)  # Keep record
            rewards.append(reward)

            if done:
                break

        # We reach here because either
        # i) an episode ends, such as game over
        # ii) we have explored a certain number of steps into the future and now it is
        #     time to look back and summarise the experience gathered so far
        if done:
            R = torch.zeros(1, 1).type(
                FloatTensor
            )  # if the episode ended, the value of the terminal state is 0
        else:
            value, _, _ = model(
                (state, (hx, cx))
            )  # otherwise bootstrap from the model's value estimate of the current state
            R = value.data

        values.append(Variable(R))
        critic_loss = 0
        actor_loss = 0
        R = Variable(R)
        gae = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]  # R - longterm reward
            advantage = R - values[
                i]  # type: Variable, advantage against the average

            # Compare the actual long-term reward. Note: we are reversing the
            # experience of a complete trajectory. If the full length is 100
            # (time indexes are among 0, 1, 2, ..., 99), and now i=50, that means
            # we have processed all information in steps, 51, 52, ..., 99
            # and R will contain the actual long term reward at time step 51 at
            # the beginning of this step. The above computation injects the reward
            # information in step 50 to R. Now R is the long-term reward at this
            # step.
            #
            # So-called advantage is then the "unexpected gain/loss". It forms the base
            # of evaluating the action taken at this step (50).
            #
            # critic_loss accumulates those "exceptional gain/loss" so that later we will
            # adjust our expectation for each state and reduce future exceptions (to better
            # evaluate actions, say, the advantage agains expectation is only meaningful
            # when the expectation itself is meaningful).
            critic_loss += 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            # see https://arxiv.org/abs/1506.02438
            # we can use advantage in the computation of the direction to adjust policy,
            # but the manipulation here improves stability (as claims by the paper).
            #
            # Note advantage implicitly contributes to GAE, since it helps
            # achieve a good estimation of state-values.
            td_error = rewards[i] + gamma * values[i + 1].data - values[i].data
            gae = gae * gamma * tau + td_error

            # log_probs[i] is the log-probability(action-taken). If GAE is great, that
            # means the choice we had made was great, and we want to make the same
            # action decision in future -- make log_probs[i] large. Otherwise,
            # we add log_probs to our regret and will be less likely to take the same
            # action in future.
            #
            # entropy means the variety in a probabilistic distribution,
            # to encourage big entropies is to make more exploration.
            actor_loss -= (Variable(gae) * log_probs[i] + 0.01 * entropies[i])

        optimizer.zero_grad(
        )  # clear any previously accumulated gradients before backpropagation
        total_loss = actor_loss + critic_loss * 0.5  # type: Variable
        total_loss.backward()  # backpropagate the combined loss
        # this is to improve stability
        torch.nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        ensure_shared_grads(
            model, shared_model)  # Push each updated model to the shared model
        optimizer.step()
Example #21
def test(shared_model, render=0):
    env = create_atari_env(args.rom)
    if render == 1:
        env.render()

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    # a quick hack to prevent the agent from getting stuck
    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model(
            (Variable(state.unsqueeze(0).type(FloatTensor),
                      volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # print logit.data.numpy()
        action = prob.max(1, keepdim=True)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        if render:
            #env.render()
            # Spits out images in the selected path
            img = env.render('rgb_array')
            imsave(
                '/opt/tmp/img/pac-20000/frame_{:06d}.png'.format(
                    episode_length), img)
        """    
        TEST-DEMO-ONLY
        state_im = state.numpy()
        state_im.transpose()
        scipy.misc.imageio.saveim(state_im, filename-with-time-step-number)
        #ffmpeg 
        END-WORKZONE
        """

        done = done or episode_length >= 10000
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
Example #22
def get_env_ac_space(env_id):
    from envs import create_atari_env
    return create_atari_env(env_id).action_space.n
Example #23
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()
    avg_rew_win_size = 25
    avg_rew = 0
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    avg_rew_cnt = 0
    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward
            reward = max(min(reward, 1), -1)
            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True
                with lock:
                    counter.value += 1

            if done:
                avg_rew = avg_rew + reward_sum
                if avg_rew_cnt % avg_rew_win_size == 0:
                    print(" avg. episode reward {}".format(avg_rew /
                                                           avg_rew_win_size))
                    avg_rew = 0
                print("Time {},  episode reward {}, episode length {}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length))
                episode_length = 0
                reward_sum = 0
                actions.clear()
                state = env.reset()
                avg_rew_cnt = avg_rew_cnt + 1

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
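A pattern worth noticing across these examples: the train loops sample an action from the softmax distribution (prob.multinomial) to keep exploring, while the test and monitor loops act greedily (prob.max). The contrast can be seen in isolation with made-up logits:

import torch
import torch.nn.functional as F

logit = torch.tensor([[2.0, 0.5, 0.1]])             # made-up logits for three actions
prob = F.softmax(logit, dim=-1)

greedy_action = prob.max(1, keepdim=True)[1]        # what the test/monitor loops do
sampled_action = prob.multinomial(num_samples=1)    # what the train loops do
print(greedy_action.item(), sampled_action.item())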
Example #24
def test(rank, args, shared_model, counter, logger):
    console_f = logger.init_console_log_file()

    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    max_score = 0

    start_time = time.time()

    while True:
        if args.max_counter_num != 0 and counter.value > args.max_counter_num:
            if args.save_policy_models:
                logger.save_policy_model(shared_model, counter.value + 1)
            exit(0)
        # monitor counter value
        if counter.value % args.testing_every_counter > 1:
            continue
        counter_value = counter.value
        model.load_state_dict(shared_model.state_dict())

        if args.save_policy_models:
            if counter_value % args.save_policy_models_every <= 5:
                logger.save_policy_model(shared_model, counter_value)

        state = env.reset()
        state = torch.from_numpy(state)
        reward_sum = 0
        done = True

        # a quick hack to prevent the agent from getting stuck
        # actions = deque(maxlen=100)
        # actions = deque(maxlen=500)
        actions = deque(maxlen=1000)
        episode_length = 0
        episode_count = 0
        episode_rewards_sum = 0
        episode_length_sum = 0
        while True:
            episode_length += 1
            # Sync with the shared model
            with torch.no_grad():
                if done:
                    cx = Variable(torch.zeros(1, 256))
                    hx = Variable(torch.zeros(1, 256))
                else:
                    cx = Variable(cx.data)
                    hx = Variable(hx.data)

                value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
                prob = F.softmax(logit, dim=1)
                action = prob.max(1, keepdim=True)[1].data.numpy()

            state, reward, done, _ = env.step(action[0, 0])
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward

            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            if done:
                episode_count += 1
                episode_rewards_sum += reward_sum
                episode_length_sum += episode_length
                if episode_count == args.testing_episodes_num:
                    print("Time {}, num steps {}, FPS {:.0f}, avg episode reward {}, avg episode length {}".format(
                        time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                        counter_value, counter_value / (time.time() - start_time),
                        episode_rewards_sum/args.testing_episodes_num, episode_length_sum/args.testing_episodes_num))
                    logger.write_results_log(console_f,
                                             time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                                             counter_value,
                                             counter_value / (time.time() - start_time),
                                             episode_rewards_sum / args.testing_episodes_num,
                                             episode_length_sum / args.testing_episodes_num)
                    if args.save_max and (episode_rewards_sum / args.testing_episodes_num) >= max_score:
                        max_score = episode_rewards_sum / args.testing_episodes_num
                        logger.save_policy_model(shared_model, count="max_reward")
                    break

                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()

            state = torch.from_numpy(state)
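
The `logger` object handed to this test worker is never defined in the snippet; the calls it makes (`init_console_log_file`, `write_results_log`, `save_policy_model`) only require something like the stand-in below, which is an assumed interface rather than the original class:

import os
import torch

class SimpleLogger:
    # Hypothetical stand-in that satisfies the logger calls made by the test worker above.
    def __init__(self, log_dir="./log"):
        self.log_dir = log_dir
        os.makedirs(log_dir, exist_ok=True)

    def init_console_log_file(self):
        # Return an open handle that write_results_log appends to.
        return open(os.path.join(self.log_dir, "results.csv"), "a")

    def write_results_log(self, f, wall_time, num_steps, fps, avg_reward, avg_length):
        f.write("{},{},{:.0f},{},{}\n".format(wall_time, num_steps, fps, avg_reward, avg_length))
        f.flush()

    def save_policy_model(self, model, count):
        # `count` is either a step count or a tag such as "max_reward".
        torch.save(model.state_dict(), os.path.join(self.log_dir, "policy_{}.pth".format(count)))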
Example #25
0
def train(rank, args, shared_model, counter, lock, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            episode_length += 1
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)
            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).detach()
            log_prob = log_prob.gather(1, action)

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            with lock:
                counter.value += 1

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((state.unsqueeze(0), (hx, cx)))
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        (policy_loss + args.value_loss_coef * value_loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
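
Every training example calls `ensure_shared_grads` without showing it. A minimal version of the usual helper (as found in common pytorch-a3c implementations; included here as a reference sketch) copies the worker's gradients onto the shared model before the optimizer step:

def ensure_shared_grads(model, shared_model):
    # Copy the local worker's gradients to the shared parameters so that
    # optimizer.step() on the shared model uses this worker's update.
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            # The shared gradient was already set for this step; keep the first one.
            return
        shared_param.grad = param.grad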
Example #26
0
class Params():
    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'

# Main run
os.environ['OMP_NUM_THREADS'] = '1' # limit each process to a single OpenMP thread
params = Params() # creating the params object from the Params class, that sets all the model parameters
torch.manual_seed(params.seed) # setting the seed (not essential)
env = create_atari_env(params.env_name) # we create an optimized environment thanks to universe
# shared_model is the model shared by the different agents (different threads in different cores)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
# share_memory() places the model's parameters in shared memory, so every worker process
#  (even one running on a different core) reads and updates the same weights
# the optimizer is also shared because it acts on the shared model
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
# likewise, the optimizer's state is stored in shared memory so that all the agents
# can use it to update the shared model
optimizer.share_memory()
processes = []  # initializing the processes with an empty list
p = mp.Process(target=test, args=(params.num_processes, params, shared_model)) # creating the 'test' process
# the 'test' process evaluates the shared model without updating it - torch.multiprocessing.Process runs the target function in a separate process
p.start() # starting the created process p
processes.append(p) # adding the created process p to the list of processes
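
`my_optim.SharedAdam` is not shown here. It is an Adam variant that preallocates its step and moment buffers and moves them into shared memory so that every worker process updates the same optimizer state. A condensed sketch of that idea, modeled on the widely used pytorch-a3c helper (details are assumptions, not this snippet's actual module):

import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    # Adam whose optimizer state is preallocated so it can live in shared memory.
    def __init__(self, params, lr=1e-4, betas=(0.9, 0.999), eps=1e-8):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        # A plain Adam update written against the shared state buffers.
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** state['step'].item()
                bias_correction2 = 1 - beta2 ** state['step'].item()
                step_size = group['lr'] * (bias_correction2 ** 0.5) / bias_correction1
                p.data.addcdiv_(exp_avg, denom, value=-step_size)
        return loss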
Example #27
0
def train(rank, params, shared_model, optimizer):
	torch.manual_seed(params.seed + rank)
	env = create_atari_env(params.env_name) #getting the environment
	env.seed(params.seed + rank)
	model = ActorCritic(env.observation_space.shape[0], env.action_space)
	state = env.reset()
	state = torch.from_numpy(state)
	done = True 
	episode_length = 0
	while True:
		episode_length+=1
		model.load_state_dict(shared_model.state_dict())
		if done:
			cx = Variable(torch.zeros(1,256))
			hx = Variable(torch.zeros(1,256))
		else:
			cx = Variable(cx.data)
			hx = Variable(hx.data)
		values = []
		log_probs = []
		rewards = []
		entropies = []
		for step in range(params.num_steps):
			value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
			prob = F.softmax(action_values)
			log_prob = F.log_softmax(action_values)
			entropy = -(log_prob * prob).sum(1)
			entropies.append(entropy)
			action = prob.multinomial().data
			log_prob = log_prob.gather(1, Variable(action))
			values.append(value)
			log_probs.append(log_prob)
			state, reward, done, _ = env.step(action.numpy())
			done = (done or episode_length >= params.max_episode_length)
			reward = max(min(reward,1), -1)
			if done:
				episode_length = 0
				state = env.reset()
			state = torch.from_numpy(state)
			rewards.append(reward)
			if done:
				break 
		R = torch.zeros(1,1)
		if not done:
			value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
			R = value.data
		values.append(Variable(R))
		policy_loss = 0
		value_loss = 0
		R = Variable(R)
		gae = torch.zeros(1,1)
		for i in reversed(range(len(rewards))):
			R = params.gamma*R + rewards[i]
			advantage = R - values[i]
			value_loss = value_loss + 0.5 * advantage.pow(2)
			TD = rewards[i] + params.gamma * values[i+1].data - values[i].data
			gae = gae * params.gamma * params.tau + TD 
			policy_loss = policy_loss - log_probs[i]*Variable(gae) - 0.01*entropies[i]
		optimizer.zero_grad()
		(policy_loss + 0.5 * value_loss).backward()
		torch.nn.utils.clip_grad_norm(model.parameters(), 40)
		ensure_shared_grads(model, shared_model)
		optimizer.step()
Example #28
0
def train(
    rank, args, shared_model, shared_curiosity,
    counter, lock, pids, optimizer, train_policy_losses,
    train_value_losses, train_rewards
):
    pids.append(os.getpid())

    torch.manual_seed(args.seed + rank)

    if args.game == 'doom':
        env = create_doom_env(
            args.env_name, rank,
            num_skip=args.num_skip, num_stack=args.num_stack)
    elif args.game == 'atari':
        env = create_atari_env(args.env_name)
    elif args.game == 'picolmaze':
        env = create_picolmaze_env(args.num_rooms)
    env.seed(args.seed + rank)

    model = ActorCritic(
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)
    curiosity = IntrinsicCuriosityModule(  # ICM
        # env.observation_space.shape[0],
        args.num_stack,
        env.action_space)

    if optimizer is None:
        # optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
        optimizer = optim.Adam(  # ICM
            chain(shared_model.parameters(), shared_curiosity.parameters()),
            lr=args.lr)

    model.train()
    curiosity.train()  # ICM

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0

    killer = Killer()
    while not killer.kill_now:
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        curiosity.load_state_dict(shared_curiosity.state_dict())  # ICM

        if done:
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        values = []
        log_probs = []
        rewards = []
        entropies = []

        inv_loss = torch.tensor(0.0)   # ICM
        forw_loss = torch.tensor(0.0)  # ICM

        for step in range(args.num_steps):
            if done:
                episode_length = 0
                state = env.reset()
                state = torch.from_numpy(state)
            episode_length += 1

            value, logit, (hx, cx) = model(state.unsqueeze(0),
                                           hx, cx)
            prob = F.softmax(logit, dim=-1)
            log_prob = F.log_softmax(logit, dim=-1)
            entropy = -(log_prob * prob).sum(1, keepdim=True)

            # Entropy trick
            if 'sparse' in args.env_name.lower():
                max_entropy = torch.log(
                    torch.tensor(logit.size()[1], dtype=torch.float))
                entropy = entropy \
                    if entropy <= args.max_entropy_coef * max_entropy \
                    else torch.tensor(0.0)

            entropies.append(entropy)

            action = prob.multinomial(num_samples=1).flatten().detach()
            log_prob = log_prob.gather(1, action.view(1, 1))

            state_old = state  # ICM

            state, external_reward, done, _ = env.step(action)
            state = torch.from_numpy(state)

            # external reward = 0 if ICM-only mode
            external_reward = external_reward * (1 - args.icm_only)

            # <---ICM---
            inv_out, forw_out, curiosity_reward = \
                curiosity(
                    state_old.unsqueeze(0), action,
                    state.unsqueeze(0))
            # In noreward-rl:
            # self.invloss = tf.reduce_mean(
            #     tf.nn.sparse_softmax_cross_entropy_with_logits(logits, aindex),
            #     name="invloss")
            # self.forwardloss = 0.5 * tf.reduce_mean(tf.square(tf.subtract(f, phi2)), name='forwardloss')
            # self.forwardloss = self.forwardloss * 288.0 # lenFeatures=288. Factored out to make hyperparams not depend on it.
            current_inv_loss = F.nll_loss(F.log_softmax(inv_out, dim=-1), action)
            current_forw_loss = curiosity_reward
            inv_loss += current_inv_loss
            forw_loss += current_forw_loss

            curiosity_reward = args.eta * curiosity_reward

            reward = max(min(external_reward, args.clip), -args.clip) + \
                max(min(curiosity_reward.detach(), args.clip), -args.clip)
            # ---ICM--->

            done = done or episode_length >= args.max_episode_length

            with lock:
                counter.value += 1

            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        train_rewards[rank - 1] = sum(rewards)

        # <---ICM---
        inv_loss = inv_loss / episode_length
        forw_loss = forw_loss * (32 * 3 * 3) * 0.5 / episode_length

        curiosity_loss = args.lambda_1 * (
            (1 - args.beta) * inv_loss + args.beta * forw_loss)
        # ---ICM--->

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model(state.unsqueeze(0), hx, cx)
            R = value.detach()

        values.append(R)
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1] - values[i]
            gae = gae * args.gamma * args.gae_lambda + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * gae.detach() - args.entropy_coef * entropies[i]

        optimizer.zero_grad()

        train_policy_losses[rank - 1] = float((policy_loss).detach().item())
        train_value_losses[rank - 1] = float((value_loss).detach().item())

        (policy_loss + args.value_loss_coef * value_loss +
            curiosity_loss).backward()  # ICM
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        torch.nn.utils.clip_grad_norm_(curiosity.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        ensure_shared_grads(curiosity, shared_curiosity)
        optimizer.step()

    env.close()
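
`IntrinsicCuriosityModule` is only used through its call signature above. The following is a rough sketch of an ICM that satisfies that interface; the conv layout, 288-dim feature size (assuming the 42x42 frames produced by create_atari_env), and reward definition are assumptions, not the module actually used in this example:

import torch
import torch.nn as nn
import torch.nn.functional as F

class IntrinsicCuriosityModule(nn.Module):
    # Sketch of an ICM matching: inv_out, forw_out, curiosity_reward = curiosity(s_old, action, s_new)
    def __init__(self, num_inputs, action_space, feat_size=288):
        super(IntrinsicCuriosityModule, self).__init__()
        self.num_actions = action_space.n
        # phi: observation encoder
        self.encoder = nn.Sequential(
            nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1), nn.ELU(),
            nn.Conv2d(32, 32, 3, stride=2, padding=1), nn.ELU(),
            nn.Conv2d(32, 32, 3, stride=2, padding=1), nn.ELU(),
            nn.Conv2d(32, 32, 3, stride=2, padding=1), nn.ELU())
        # inverse model: predict the action from (phi(s_t), phi(s_t+1))
        self.inverse = nn.Sequential(
            nn.Linear(2 * feat_size, 256), nn.ReLU(),
            nn.Linear(256, self.num_actions))
        # forward model: predict phi(s_t+1) from (phi(s_t), one-hot action)
        self.forward_model = nn.Sequential(
            nn.Linear(feat_size + self.num_actions, 256), nn.ReLU(),
            nn.Linear(256, feat_size))

    def forward(self, state_old, action, state_new):
        phi1 = self.encoder(state_old).view(state_old.size(0), -1)
        phi2 = self.encoder(state_new).view(state_new.size(0), -1)
        inv_out = self.inverse(torch.cat([phi1, phi2], dim=1))  # logits over actions
        action_onehot = F.one_hot(action.view(-1), self.num_actions).float()
        forw_out = self.forward_model(torch.cat([phi1, action_onehot], dim=1))
        # curiosity reward: squared prediction error of the forward model
        curiosity_reward = 0.5 * (forw_out - phi2.detach()).pow(2).sum(dim=1).mean()
        return inv_out, forw_out, curiosity_reward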
Example #29
0
def train(rank, args, shared_model, optimizer=None):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    state = env.reset()
    state = torch.from_numpy(state)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256))
            hx = Variable(torch.zeros(1, 256))
        else:
            cx = Variable(cx.data)
            hx = Variable(hx.data)

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.numpy())
            done = done or episode_length >= args.max_episode_length
            reward = max(min(reward, 1), -1)

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - 0.01 * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()

def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.lstm_size)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    #actions=deque(maxlen=100)
    episode_length = 0

    currentPath = os.getcwd()
    File = open(currentPath + '/record.txt', 'a+')
    print("\n\n\n\n------------------------------\n\n\n\n\n")
    File.write("\n\n\n\n------------------------------\n\n\n\n\n")
    File.close()

    cnt = 0
    episode_number = 0

    while True:
        env.render()
        cnt = cnt + 1
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
            cx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
        else:
            hx = Variable(hx.data, volatile=True)
            cx = Variable(cx.data, volatile=True)

        #print(state)
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        #action=prob.max(1)[1].data.numpy()
        action = prob.multinomial().data

        #if(args.env_name=='Breakout-v3'):
        #    state,reward,done,_=env.step(1)
        #     reward_sum+=reward
        #state,reward,done,_ =env.step(action[0,0])
        state, reward, done, _ = env.step(action.numpy())
        done = done  #or episode_length >= args.max_episode_length
        if episode_length >= args.max_episode_length:
            done = True
            reward_sum -= 30
        reward_sum += reward

        #actions.append(action[0,0])
        #if actions.count(actions[0])==actions.maxlen:
        #    done=True
        #if reward!=0:
        #  print("ep %d : game finished,reward: %d " %(episode_number,reward))+('' if reward == #-1 else ' !!!!!!!!')

        if done:
            hour = int(
                time.strftime("%H", time.gmtime(time.time() - start_time)))
            _min = int(
                time.strftime("%M", time.gmtime(time.time() - start_time)))

            print("Time {},episode reward {}, episode length {} ".format(
                hour * 60 + _min + args.starttime, reward_sum, episode_length))

            File = open(currentPath + '/record.txt', 'a+')
            File.write(
                "Time {},episode reward {}, episode length {} \n".format(
                    hour * 60 + _min + args.starttime, reward_sum,
                    episode_length))
            File.close()

            reward_sum = 0
            episode_length = 0
            #actions.clear()
            state = env.reset()

            torch.save(model.state_dict(), currentPath + '/A3C.t7')
            episode_number += 1
            time.sleep(60)

        state = torch.from_numpy(state)