Example #1
def test(args, model, env):
    torch.manual_seed(args.seed)

    # env = create_atari_env(args.env_name)
    # env = create_car_racing_env()
    env.seed(args.seed)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        #env.render()
        episode_length += 1
        # Sync with the shared model
        if done:
            # model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, model.lstm_size), volatile=True)
            hx = Variable(torch.zeros(1, model.lstm_size), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            return
            # time.sleep(60)

        state = torch.from_numpy(state)
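
A note on the API used in these examples: Variable(..., volatile=True) belongs to the pre-0.4 PyTorch API and has since been removed. Below is a minimal sketch, not part of the original code, of the same inference step written against the current API, assuming the same model, state, done and model.lstm_size as in Example #1:

import torch
import torch.nn.functional as F

# Sketch only: current-API equivalent of the volatile-Variable block above.
with torch.no_grad():
    if done:
        cx = torch.zeros(1, model.lstm_size)
        hx = torch.zeros(1, model.lstm_size)
    value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
    prob = F.softmax(logit, dim=-1)
    action = prob.max(1, keepdim=True)[1].numpy()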
Example #2
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

        if done and counter.value > args.max_steps:
            test_final(shared_model, env, args)
            save_model(shared_model, args)
            exit()

        with torch.no_grad():
            value, logit = model(state.unsqueeze(0))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print(
                "Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    counter.value, counter.value / (time.time() - start_time),
                    reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
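
The counter passed into this test() is a step counter shared with the training processes, and test_final and save_model are project-specific helpers. As a rough illustration (the names and layout below are assumptions, not taken from this project), such a counter is typically created in the launcher with torch.multiprocessing:

import torch.multiprocessing as mp

# Sketch only: a shared step counter of the kind passed into test() above.
counter = mp.Value('i', 0)   # shared integer, starts at 0
lock = mp.Lock()             # guards increments from the training workers

# inside a training worker, after each environment step:
with lock:
    counter.value += 1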
Example #3
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank) # seeding the test agent independently of the training agents
    env = create_atari_env(params.env_name, video=True) # running an environment with a video
    env.seed(params.seed + rank) # seeding the environment independently
    model = ActorCritic(env.observation_space.shape[0], env.action_space) # creating one model
    model.eval() # putting the model in "eval" mode because it won't be trained
    state = env.reset() # getting the input images as numpy arrays
    state = torch.from_numpy(state) # converting them into torch tensors
    reward_sum = 0 # initializing the sum of rewards to 0
    done = True # initializing done to True
    start_time = time.time() # getting the starting time to measure the computation time
    actions = deque(maxlen=100) # cf https://pymotw.com/2/collections/deque.html
    episode_length = 0 # initializing the episode length to 0
    while True: # repeat
        episode_length += 1 # incrementing the episode length by one
        if done: # synchronizing with the shared model (same as train.py)
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model((Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy() # the test agent does not explore, it directly plays the best action
        state, reward, done, _ = env.step(action[0, 0]) # done = done or episode_length >= params.max_episode_length
        reward_sum += reward
        if done: # printing the results at the end of each part
            print("Time {}, episode reward {}, episode length {}".format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), reward_sum, episode_length))
            reward_sum = 0 # reinitializing the sum of rewards
            episode_length = 0 # reinitializing the episode length
            actions.clear() # reinitializing the actions
            state = env.reset() # reinitializing the environment
            time.sleep(60) # take a one-minute break to let the other agents practice (if the game is done)
        state = torch.from_numpy(state) # new state and we continue
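
Several of these examples rely on the same "anti-stuck" trick: a bounded deque of recent actions, with the episode cut short once every slot holds the same action. A tiny self-contained sketch of the check (maxlen=5 is only for illustration; the examples above use 100 or more):

from collections import deque

actions = deque(maxlen=5)      # keeps only the 5 most recent actions
for a in [2, 2, 2, 2, 2]:
    actions.append(a)
# True once maxlen identical actions have been taken in a row
print(actions.count(actions[0]) == actions.maxlen)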
Example #4
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)
    torch.save(shared_model.state_dict(), 't.pkl')

    env = Env(args.seed + rank)
    model = ActorCritic(1, env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    # env.visual()

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=500)
    episode_length = 0
    while True:

        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())


        with torch.no_grad():
            value, logit = model((state.unsqueeze(0)).type(torch.FloatTensor))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()
        print(action)

        state, reward, done = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            # env.visual()
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()

            time.sleep(60)

        state = torch.from_numpy(state)
Example #5
File: main.py Project: JasonTang99/A3C
def run(args):
    device = torch.device("cpu")
    env = gym.make('SpaceInvaders-v0')
    state_size = env.observation_space.shape
    action_size = env.action_space.n

    model = ActorCritic([1, 4, 84, 84], action_size).to(device)
    opt = SharedRMSprop(model.parameters(),
                        lr=args.lr,
                        alpha=args.alpha,
                        eps=1e-8,
                        weight_decay=args.weight_decay,
                        momentum=args.momentum,
                        centered=False)
    opt_lock = mp.Lock()
    scheduler = LRScheduler(args)

    if args.load_fp:
        checkpoint = torch.load(args.load_fp)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])

    if args.train:
        start = time.time()

        model.share_memory()
        model.train()

        step_counter, max_reward, ma_reward, ma_loss = [
            mp.Value('d', 0.0) for _ in range(4)
        ]

        processes = []
        if args.num_procs == -1:
            args.num_procs = mp.cpu_count()
        for rank in range(args.num_procs):
            p = mp.Process(target=train,
                           args=(rank, args, device, model, opt, opt_lock,
                                 scheduler, step_counter, max_reward,
                                 ma_reward, ma_loss))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()

        if args.verbose > 0:
            print(f"Seconds taken: {time.time() - start}")
        if args.save_fp:
            torch.save(
                {
                    'model_state_dict': model.state_dict(),
                    # 'optimizer_state_dict': opt.state_dict(),
                },
                args.save_fp)

    if args.test:
        model.eval()
        test(args, device, model)
Example #6
def test(shared_model, render=0):
    # torch.manual_seed(rank)

    env = create_atari_env(args.rom)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(FloatTensor), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0).type(FloatTensor), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        # print logit.data.numpy()
        action = prob.max(1, keepdim=True)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        if render == 1:
            env.render()
            time.sleep(0.03)
        done = done or episode_length >= 10000
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".
                  format(get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
Example #7
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)  # seed the test agent independently of the trainers
    env = create_atari_env(params.env_name, video=True)  # run the environment with video recording
    env.seed(params.seed + rank)  # seed the environment independently

    model = ActorCritic(env.observation_space.shape[0], env.action_space)  # create the model
    model.eval()  # the model will not be trained here

    state = env.reset()  # get the input image as a numpy array
    state = torch.from_numpy(state)  # convert it into a torch tensor
    reward_sum = 0
    done = True
    start_time = time.time()  # starting time
    actions = deque(maxlen=100)  # https://pymotw.com/2/collections/deque.html
    episode_length = 0

    while True:
        episode_length += 1  # increment the episode length by one
        if done:  # synchronize with the shared model, as in training mode
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()  # the test agent does not explore; it plays the best action directly
        state, reward, done, _ = env.step(action[0, 0])  # done = done or episode_length >= params.max_episode_length
        reward_sum += reward

        if done:  # print the results at the end of each episode
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)  # wait one minute for the other agents
        state = torch.from_numpy(state)  # new state, and the loop continues
Example #8
def test(rank, args, model_path,
         all_cooked_time, all_cooked_bw, all_vp_time, all_vp_unit, num):
    torch.manual_seed(args.seed + rank)

    env = Environment(args, all_cooked_time, all_cooked_bw, all_vp_time, all_vp_unit, random_seed=args.seed + rank)

    model = ActorCritic()
    model.load_state_dict(torch.load(model_path))
    model.eval()
    state = env.reset()
    state_time = time.time()
    episode_length = 0
    # log = open('new-result-1/test-vp-log20000.txt', 'w')
    # log = open('results-3/log20000.txt', 'w')
    # log = open('train_norway_result-2/test_log3000.txt', 'w')
    log = open('result-1/log-' + str(num) + '.txt', 'w')
    while True:
        episode_length += 1
        state = Variable(torch.FloatTensor(state))
        # print('state', state)
        logit, value = model(state.view(-1, 11, 8))
        prob = F.softmax(logit, dim=1)
        _, action = torch.max(prob, 1)
        state, reward, done, (action, vp_quality, ad_quality, out_quality, rebuf, cv, blank_ratio, reward, real_vp_bitrate, smooth) \
            = env.step(action.data.numpy()[0])
        update = True

        if update:
            print("Time {}, action {}, ({},{},{}), bitrate {:.3f}, rebuf {:.3f}, cv {:.3f}, smooth {:.3f}, reward {:.3f}, episode {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - state_time)),
                action, vp_quality, ad_quality, out_quality, real_vp_bitrate, rebuf, cv, smooth,
                reward, episode_length))
            log.write('action: ' + str(action) + ' (' + str(vp_quality) + ',' + str(ad_quality) + ',' + str(out_quality)
                      + ') rebuf: ' + str(rebuf) + ' cv: ' + str(cv) + ' bitrate: ' + str(real_vp_bitrate) + ' smooth: ' + str(smooth) + ' reward: ' + str(reward)
                      + ' episode: ' + str(episode_length) + '\n')
            # log.write(str())
            # print('Time {}'.format(time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - state_time))))
            # print('time: ', time.gmtime(time.time() - state_time))
            # time.sleep(0.5)
        if done:
            state = env.reset()
        if episode_length == 50000:
            log.close()
            break
Example #9
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            save(model, 'brain.pkl')
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0])
        reward_sum += reward
        if done:
            f = open("Statistics.txt", 'a')
            f.write(str(reward_sum) + " " + str(episode_length) + "\n")
            f.close()
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
Example #10
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage,
                                                    opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)

    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())

        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()

        logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)

        if curr_step > opt.num_global_steps or actions.count(
                actions[0]) == actions.maxlen:
            done = True

        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()

        state = torch.from_numpy(state)
Example #11
def test(rank, params, shared_model):
    torch.manual_seed(params.seed + rank)
    env = create_atari_env(params.env_name, video=True)
    env.seed(params.seed + rank)
    model = ActorCritic(env.observation_space.shape[0], env.action_space)
    model.eval()
    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    start_time = time.time()
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        value, action_value, (hx, cx) = model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(action_value)
        action = prob.max(1)[1].data.numpy()
        state, reward, done, _ = env.step(action[0, 0])
        reward_sum += reward
        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(
                60
            )  # 60-second break to allow the other agents to test the environment
        state = torch.from_numpy(state)
Example #12
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.n
    print('state size:', num_inputs)
    print('action size:', num_actions)

    net = ActorCritic(num_inputs, num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()
    running_score = 0
    steps = 0

    for e in range(5):
        done = False

        score = 0
        state = env.reset()
        state = torch.Tensor(state).to(device)
        state = state.unsqueeze(0)

        while not done:
            env.render()

            steps += 1
            policy, value = net(state)
            action = get_action(policy, num_actions)
            next_state, reward, done, _ = env.step(action)

            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)

            score += reward
            state = next_state

        print('{} episode | score: {:.2f}'.format(e, score))
Example #13
File: agent.py Project: jxzhn/reversi-ai
class Agent:
    def __init__(self):
        self.net = ActorCritic()
        self.net.load_state_dict(
            torch.load('models/good.pt', map_location='cpu'))
        self.net.eval()
        torch.no_grad().__enter__()  # disable gradient tracking

    def brain(self, reversi: Reversi, who: int) -> Coordinate:
        # assert reversi.next == who
        state = torch.Tensor(getBoardState(reversi)).unsqueeze(0)
        policy = self.net(state)[1][0]

        # make sure the chosen position is legal
        for y, x in itertools.product(range(SIZE), repeat=2):
            if not reversi.good[y][x]:
                policy[y * SIZE + x] = 0.
            else:
                policy[y * SIZE + x] += 1e-8  # avoid all probabilities being 0

        action = policy.max(dim=-1).indices.item()
        return (action // SIZE, action % SIZE)
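
Calling torch.no_grad().__enter__() in __init__ disables gradient tracking for the rest of the process. A more conventional pattern is to scope it per call, for example with the decorator form. The sketch below assumes the same ActorCritic, getBoardState, Reversi, Coordinate and SIZE as above and omits the legality masking for brevity; it is an illustration, not the original code:

import torch

class Agent:
    def __init__(self):
        self.net = ActorCritic()
        self.net.load_state_dict(
            torch.load('models/good.pt', map_location='cpu'))
        self.net.eval()

    @torch.no_grad()  # gradients are disabled only while brain() runs
    def brain(self, reversi: Reversi, who: int) -> Coordinate:
        state = torch.Tensor(getBoardState(reversi)).unsqueeze(0)
        policy = self.net(state)[1][0]
        action = policy.max(dim=-1).indices.item()
        return (action // SIZE, action % SIZE)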
Example #14
def test(args):
    args.device = torch.device(
        'cuda:1') if torch.cuda.is_available() else torch.device('cpu')
    scorer = Scorer(args)

    if args.use_tensorboard:
        args.runs_path = os.path.join(args.output_dir, 'runs_test')
        summary_writer = SummaryWriter(args.runs_path)

    model = ActorCritic(args).to(args.device)
    model.eval()

    for epoch_id in range(args.n_epochs):
        cur_reward, used_steps, not_finish, status = run_one_epoch(
            args, scorer, model)
        summary_writer.add_scalar('aver_reward', cur_reward, epoch_id)
        summary_writer.add_scalar('used_steps', used_steps, epoch_id)
        if (epoch_id + 1) % args.add_image_per_epoch == 0:
            (origin_img, cropped_bbox, score_diff) = status
            # import pdb; pdb.set_trace();
            (xmin, ymin, xmax, ymax) = cropped_bbox
            cropped_img = np.ones_like(origin_img) * 255
            cropped_img[ymin:ymax, xmin:xmax, :] = origin_img[ymin:ymax,
                                                              xmin:xmax, :]
            # cropped_img = transform.resize(cropped_img, (origin_img.shape[0], origin_img.shape[1]))
            [origin_img, cropped_img] = map(lambda x: x.transpose((2, 0, 1)),
                                            [origin_img, cropped_img])
            # summary_writer.add_image('origin_img {}'.format(epoch_id), origin_img, epoch_id)
            # summary_writer.add_image('cropped_img {}'.format(epoch_id), cropped_img, epoch_id)
            stacked_img = torchvision.utils.make_grid(torch.from_numpy(
                np.stack((origin_img, cropped_img))),
                                                      nrow=1,
                                                      padding=2)
            summary_writer.add_image('origin_cropped {}'.format(epoch_id),
                                     stacked_img)
            summary_writer.add_scalar('score_diff {}'.format(epoch_id),
                                      score_diff)
        print("epoch : {:03f},  aver_reward: {:03f}, used_steps: {:03d}, not_finish: {:d}".\
                    format(epoch_id, cur_reward, used_steps, int(not_finish)))
Example #15
def load_checkpoint(filepath):
    #    checkpoint = torch.load(filepath)
    #    model = checkpoint['model']
    #    model.load_state_dict(checkpoint['state_dict'])
    #    for parameter in model.parameters():
    #        parameter.requires_grad = False
    #    model.eval()
    #####################
    model = ActorCritic(len(state), params.output_space)
    optimizer = my_optim.SharedAdam(model.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    model.eval()

    model_test = ActorCritic(len(state), params.output_space)
    optimizer_test = my_optim.SharedAdam(model_test.parameters(), lr=params.lr)
    checkpoint = torch.load(params.file_path_shared_model_test)
    model_test.load_state_dict(checkpoint['state_dict'])
    optimizer_test.load_state_dict(checkpoint['optimizer'])
    model_test.eval()
    ###########################
    return model
Example #16
def test(rank, args, shared_model, counter, loggers, kill):
    counter, steps, max_episodes = counter

    torch.manual_seed(args.seed + rank)

    env = create_vizdoom_env(args.config_path, args.test_scenario_path)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.spaces[0].shape[0],
                        env.action_space, args.topology)

    model.eval()

    state = env.reset()
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)), (torch.zeros(1, 256),
                                                         torch.zeros(1, 256)))
    actions = deque(maxlen=100)
    episode_length = 0
    episode_counter = 0

    obs_index = 0
    obs_history = []
    pose_history = []
    goal_loc = env.goal()

    model.load_state_dict(shared_model.state_dict())

    while not kill.is_set():
        if steps.value > args.max_episode_steps:
            break

        if episode_counter > max_episodes:
            break

        try:
            episode_start_time = time.time()
            episode_length += 1

            value, logit, _, _, hidden = model((state_to_torch(state), hidden))
            prob = F.softmax(logit)
            action = prob.max(1, keepdim=True)[1].data.numpy()

            for i in range(4):
                state, reward, done, _ = env.step(action[0, 0], steps=1)
                reward_sum += reward

                if done:
                    break
                else:
                    obs_frame = (np.moveaxis(state[0], 0, -1) * 255).astype(
                        np.uint8)

                    if isinstance(obs_history, list):
                        obs_history.append(obs_frame)
                    else:
                        obs_history[obs_index, :, :, :] = obs_frame
                        obs_index += 1

                    pose_history.append(env.pose())

            # a quick hack to prevent the agent from getting stuck
            # actions.append(action[0, 0])
            # if actions.count(actions[0]) == actions.maxlen:
            #     done = True

            if done:
                if isinstance(obs_history, list):
                    obs_history = np.array(obs_history)

                if loggers:
                    loggers['test_reward'](env.game.get_total_reward(),
                                           episode_counter)
                    loggers['video'](video(env.wad, env.current_map, goal_loc,
                                           obs_history, pose_history),
                                     episode_counter)
                    loggers['test_time'](time.time() - episode_start_time,
                                         episode_counter)

                print(
                    "Time {}, num episodes {}, FPS {:.0f}, episode reward {}, episode length {}".
                    format(
                        time.strftime("%Hh %Mm %Ss",
                                      time.gmtime(time.time() - start_time)),
                        counter.value,
                        counter.value / (time.time() - start_time), reward_sum,
                        episode_length))
                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()

                obs_index = 0
                pose_history = []
                goal_loc = env.goal()

                hidden = ((torch.zeros(1, 64), torch.zeros(1, 64)),
                          (torch.zeros(1, 256), torch.zeros(1, 256)))

                time.sleep(args.eval_interval)

                model.load_state_dict(shared_model.state_dict())

                episode_counter += 1
        except Exception as err:
            kill.set()
            raise err
Example #17
def test(name, backend, env_name, rank, args, shared_model, counter, docker, train_mode=True):
    torch.manual_seed(args.seed + rank)

    if backend == 'unity3d':
        if docker:
            os.chdir('/mnt/code/')
        env = create_unity3d_env(train_mode=train_mode,\
         file_name=env_name, \
         worker_id=rank, seed=args.seed, \
         docker_training=docker)
    elif backend == 'gym':
        env = create_atari_env(env_name)
        env.seed(args.seed + rank)
    else:
        print(f' [!]: {backend} is not a valid backend')
        raise ValueError

    print(env.action_space)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state).float()
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    history = {'num-steps': [], 'times': [], 'rewards': [], 'episode-length': []}
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1, keepdim=True)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            end = time.time() - start_time
            history['num-steps'].append(counter.value)
            history['times'].append(end)
            history['rewards'].append(reward_sum)
            history['episode-length'].append(episode_length)
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(end)), counter.value, counter.value / (end),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()

            if train_mode:
                history['weights'] = shared_model.state_dict()
                torch.save(history, f'{name}-history.t7')
                time.sleep(60)

        state = torch.from_numpy(state).float()

    env.close()
Example #18
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()

    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    # actions = deque(maxlen=100)
    episode_length = 0
    while True:
        env.render()
        print('here')
        # env.render()
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)
        print('there')
        value, logit, (hx, cx) = model((Variable(
            state.unsqueeze(0), volatile=True), (hx, cx)))
        print('hi')
        prob = F.softmax(logit)
        # print(prob)
        action = prob.max(1, keepdim=True)[1].data.numpy()
        print(action)

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
Example #19
def test(rank, args, shared_model, counter, logger):
    console_f = logger.init_console_log_file()

    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    max_score = 0

    start_time = time.time()

    while True:
        if args.max_counter_num != 0 and counter.value > args.max_counter_num:
            if args.save_policy_models:
                logger.save_policy_model(shared_model, counter.value + 1)
            exit(0)
        # monitor counter value
        if counter.value % args.testing_every_counter > 1:
            continue
        counter_value = counter.value
        model.load_state_dict(shared_model.state_dict())

        if args.save_policy_models:
            if counter_value % args.save_policy_models_every <= 5:
                logger.save_policy_model(shared_model, counter_value)

        state = env.reset()
        state = torch.from_numpy(state)
        reward_sum = 0
        done = True

        # a quick hack to prevent the agent from getting stuck
        # actions = deque(maxlen=100)
        # actions = deque(maxlen=500)
        actions = deque(maxlen=1000)
        episode_length = 0
        episode_count = 0
        episode_rewards_sum = 0
        episode_length_sum = 0
        while True:
            episode_length += 1
            # Sync with the shared model
            with torch.no_grad():
                if done:
                    cx = Variable(torch.zeros(1, 256))
                    hx = Variable(torch.zeros(1, 256))
                else:
                    cx = Variable(cx.data)
                    hx = Variable(hx.data)

                value, logit, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
                prob = F.softmax(logit, dim=1)
                action = prob.max(1, keepdim=True)[1].data.numpy()

            state, reward, done, _ = env.step(action[0, 0])
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward

            # a quick hack to prevent the agent from getting stuck
            actions.append(action[0, 0])
            if actions.count(actions[0]) == actions.maxlen:
                done = True

            if done:
                episode_count += 1
                episode_rewards_sum += reward_sum
                episode_length_sum += episode_length
                if episode_count == args.testing_episodes_num:
                    print("Time {}, num steps {}, FPS {:.0f}, avg episode reward {}, avg episode length {}".format(
                        time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                        counter_value, counter_value / (time.time() - start_time),
                        episode_rewards_sum/args.testing_episodes_num, episode_length_sum/args.testing_episodes_num))
                    logger.write_results_log(console_f,
                                             time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)),
                                             counter_value,
                                             counter_value / (time.time() - start_time),
                                             episode_rewards_sum / args.testing_episodes_num,
                                             episode_length_sum / args.testing_episodes_num)
                    if args.save_max and (episode_rewards_sum / args.testing_episodes_num) >= max_score:
                        max_score = episode_rewards_sum / args.testing_episodes_num
                        logger.save_policy_model(shared_model, count="max_reward")
                    break

                reward_sum = 0
                episode_length = 0
                actions.clear()
                state = env.reset()

            state = torch.from_numpy(state)
Example #20
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space.n,
                        args.lstm_size)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    #actions=deque(maxlen=100)
    episode_length = 0

    currentPath = os.getcwd()
    File = open(currentPath + '/record.txt', 'a+')
    print("\n\n\n\n------------------------------\n\n\n\n\n")
    File.write("\n\n\n\n------------------------------\n\n\n\n\n")
    File.close()

    cnt = 0
    episode_number = 0

    while True:
        env.render()
        cnt = cnt + 1
        episode_length += 1
        if done:
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
            cx = Variable(torch.zeros(1, args.lstm_size), volatile=True)
        else:
            hx = Variable(hx.data, volatile=True)
            cx = Variable(cx.data, volatile=True)

        #print(state)
        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        #action=prob.max(1)[1].data.numpy()
        action = prob.multinomial().data

        #if(args.env_name=='Breakout-v3'):
        #    state,reward,done,_=env.step(1)
        #     reward_sum+=reward
        #state,reward,done,_ =env.step(action[0,0])
        state, reward, done, _ = env.step(action.numpy())
        done = done  #or episode_length >= args.max_episode_length
        if episode_length >= args.max_episode_length:
            done = True
            reward_sum -= 30
        reward_sum += reward

        #actions.append(action[0,0])
        #if actions.count(actions[0])==actions.maxlen:
        #    done=True
        #if reward!=0:
        #  print("ep %d : game finished,reward: %d " %(episode_number,reward))+('' if reward == #-1 else ' !!!!!!!!')

        if done:
            hour = int(
                time.strftime("%H", time.gmtime(time.time() - start_time)))
            _min = int(
                time.strftime("%M", time.gmtime(time.time() - start_time)))

            print("Time {},episode reward {}, episode length {} ".format(
                hour * 60 + _min + args.starttime, reward_sum, episode_length))

            File = open(currentPath + '/record.txt', 'a+')
            File.write(
                "Time {},episode reward {}, episode length {} \n".format(
                    hour * 60 + _min + args.starttime, reward_sum,
                    episode_length))
            File.close()

            reward_sum = 0
            episode_length = 0
            #actions.clear()
            state = env.reset()

            torch.save(model.state_dict(), currentPath + '/A3C.t7')
            episode_number += 1
            time.sleep(60)

        state = torch.from_numpy(state)
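
The call prob.multinomial().data above uses the old sampling API; current PyTorch requires an explicit num_samples, and torch.distributions.Categorical is another common choice. A minimal sketch of the equivalent sampling step, assuming prob as computed above:

from torch.distributions import Categorical

# Sketch only: current-API equivalents of prob.multinomial().data above.
action = prob.multinomial(num_samples=1).detach()   # shape (1, 1)
# or, sampling through a distribution object:
action = Categorical(prob).sample()                 # shape (1,)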
Example #21
def test(rank, args, shared_model, gl_step_cnt):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    local_episode_num = 0

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256), volatile=True)
            hx = Variable(torch.zeros(1, 256), volatile=True)
        else:
            cx = Variable(cx.data, volatile=True)
            hx = Variable(hx.data, volatile=True)

        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            passed_time = time.time() - start_time
            local_episode_num += 1
            global_step_count = gl_step_cnt.get_value()

            logger.info("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss", time.gmtime(passed_time)),
                reward_sum, episode_length))
            tb.log_value('steps_second', global_step_count / passed_time,
                         global_step_count)
            tb.log_value('reward', reward_sum, global_step_count)

            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()
            time.sleep(60)

        state = torch.from_numpy(state)
Example #22
def test(rank, args, shared_model, counter):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env_name)
    env.seed(args.seed + rank)

    model = ActorCritic(env.observation_space.shape[2], env.action_space)

    model.eval()

    state = env.reset()

    state = np.transpose(state, (2,0,1))
    state = np.ascontiguousarray(state, dtype=np.float32) / 255
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    start_time = time.time()

    # a quick hack to prevent the agent from getting stuck
    actions = deque(maxlen=100)
    episode_length = 0

    while True:
        episode_length += 1

        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256)
            hx = torch.zeros(1, 256)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))

        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        state, reward, done, _ = env.step(action[0, 0])

        state = np.transpose(state, (2, 0, 1))
        state = np.ascontiguousarray(state, dtype=np.float32) / 255

        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        actions.append(action[0, 0])
        if actions.count(actions[0]) == actions.maxlen:
            done = True

        if done:
            print("Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                counter.value, counter.value / (time.time() - start_time),
                reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            actions.clear()
            state = env.reset()

            state = np.transpose(state, (2,0,1))
            state = np.ascontiguousarray(state, dtype=np.float32) / 255

            time.sleep(60)

        state = torch.from_numpy(state)
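
The HWC-to-CHW transpose and the division by 255 are repeated three times in this example. As a sketch, a small helper (the name to_tensor is illustrative, not part of the original project) keeps the loop shorter:

import numpy as np
import torch

def to_tensor(frame):
    frame = np.transpose(frame, (2, 0, 1))                        # HWC -> CHW
    frame = np.ascontiguousarray(frame, dtype=np.float32) / 255   # scale to [0, 1]
    return torch.from_numpy(frame)

state = to_tensor(env.reset())   # and likewise after each env.step() / env.reset()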
Example #23
File: test.py Project: 404akhan/a3c-dlab
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = WrapEnv(args.env_name)
    model = ActorCritic(4, env.num_actions, args.num_skips)

    model.eval()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    action_stat = [0] * (model.n_real_acts + model.n_aux_acts)

    start_time = time.time()
    episode_length = 0

    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

            if not os.path.exists('model-a3c-aux'):
                os.makedirs('model-a3c-aux')
            torch.save(shared_model.state_dict(),
                       'model-a3c-aux/model-{}.pth'.format(args.model_name))
            print('saved model')

        value, logit = model(Variable(state.unsqueeze(0), volatile=True))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()

        action_np = action[0, 0]
        action_stat[action_np] += 1

        if action_np < model.n_real_acts:
            state_new, reward, done, info = env.step(action_np)

            if args.testing:
                print('episode', episode_length, 'normal action', action_np,
                      'lives', info['ale.lives'])
                env.render()
            state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
            done = done or episode_length >= args.max_episode_length

            reward_sum += reward
            episode_length += 1
        else:
            state = state.numpy()

            for _ in range(action_np - model.n_real_acts + 2):
                state_new, rew, done, info = env.step(
                    0)  # instead of random perform NOOP=0

                if args.testing:
                    print('episode', episode_length, 'no_op action', action_np,
                          'lives', info['ale.lives'])
                    # env.render()
                state = np.append(state[1:, :, :], state_new, axis=0)
                done = done or episode_length >= args.max_episode_length

                reward_sum += rew
                episode_length += 1
                if done:
                    break

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}, aux {}".format(
                action_stat[:model.n_real_acts],
                action_stat[model.n_real_acts:]))

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * (model.n_real_acts + model.n_aux_acts)
            if not args.testing: time.sleep(60)

        state = torch.from_numpy(state)
Example #24
def test(args, shared_model):
    action_map = _set_action_map()

    env = FixedEnvWrap()

    # time.sleep(10)
    model = ActorCritic()
    model.load_state_dict(shared_model.state_dict())
    model.eval()

    state = env.reset()

    training_time = 0
    vis = visdom.Visdom(env='final')
    line_plot = vis.line(Y=np.array([0]),
                         opts=dict(xlabel='testing count',
                                   ylabel='average reward',
                                   title='ali-v1'))

    start = time.time()
    vis_count = 0
    while True:
        video_count = 1
        reward_all_sum = 0
        reward_all = 0
        reward_all_ave = 0
        reward_gop = 0
        action = 3
        last_action = 3
        # update model before testing all trace files
        # time.sleep(5)
        print('load updated model')
        model.load_state_dict(shared_model.state_dict())
        while True:
            # get the reward for one gop
            while True:
                _, done, decision_flag = env.step_gop(action)
                if decision_flag or done:
                    reward_gop = env.get_reward_gop()
                    state = env.get_state_gop()
                    break
                else:
                    continue
            # print('testing')
            # get action from model
            last_action = action
            with torch.no_grad():
                state = torch.FloatTensor(state)
                logit, _ = model(
                    state.view(-1, args.s_gop_info, args.s_gop_len))
                prob = F.softmax(logit, dim=1)
                _, action = torch.max(prob, 1)
                action = action.data.numpy()[0]

            bitrate, target_buffer = action_map[last_action]
            # print('bitrate: %d, target_buffer: %d, reward is %s' % (bitrate, target_buffer, reward_gop))
            if done:
                print("video count %d, reward is %.5f" %
                      (video_count, reward_all))
                # reward_all_sum += reward_all / 100
                reward_all_sum += reward_all
                video_count += 1
                if reward_all < 0:
                    print('bad model ! just break this loop')
                    reward_all_ave = 0
                    break
                if video_count > env.traces_len * 2:
                    reward_all_ave = reward_all_sum / video_count
                    break
                action = 3
                last_action = 3
                reward_all = 0

            reward_all += reward_gop

        # update the figure of average reward of all testing files
        vis_count += 1
        reward_all_ave = max(reward_all_ave, 0)
        vis.line(Y=np.array([reward_all_ave]),
                 X=np.array([vis_count]),
                 win=line_plot,
                 update='append')
        path = 'ali-v1/actor.pt-' + str(vis_count)
        torch.save(model.state_dict(), path)

        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)

        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                              seconds))
        print("average reward of traces are: ", reward_all_ave)
        print('saved one model in epoch:', vis_count)
Example #25
File: test.py Project: Luo1996/ACER
def test(rank, args, T, shared_model):
  torch.manual_seed(args.seed + rank)

  env = gym.make(args.env)
  env.seed(args.seed + rank)
  model = ActorCritic(env.observation_space, env.action_space, args.hidden_size)
  model.eval()

  can_test = True  # Test flag
  t_start = 1  # Test step counter to check against global counter
  rewards, steps = [], []  # Rewards and steps for plotting
  l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
  done = True  # Start new episode

  while T.value() <= args.T_max:
    if can_test:
      t_start = T.value()  # Reset counter

      # Evaluate over several episodes and average results
      avg_rewards, avg_episode_lengths = [], []
      for _ in range(args.evaluation_episodes):
        while True:
          # Reset or pass on hidden state
          if done:
            # Sync with shared model every episode
            model.load_state_dict(shared_model.state_dict())
            hx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            cx = Variable(torch.zeros(1, args.hidden_size), volatile=True)
            # Reset environment and done flag
            state = state_to_tensor(env.reset())
            done, episode_length = False, 0
            reward_sum = 0

          # Optionally render validation states
          if args.render:
            env.render()

          # Calculate policy
          policy, _, _, (hx, cx) = model(Variable(state, volatile=True), (hx.detach(), cx.detach()))  # Break graph for memory efficiency

          # Choose action greedily
          action = policy.max(1)[1].data[0, 0]

          # Step
          state, reward, done, _ = env.step(action)
          state = state_to_tensor(state)
          reward_sum += reward
          done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
          episode_length += 1  # Increase episode counter

          # Log and reset statistics at the end of every episode
          if done:
            avg_rewards.append(reward_sum)
            avg_episode_lengths.append(episode_length)
            break

      print(('[{}] Step: {:<' + l + '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
            datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
            t_start,
            sum(avg_rewards) / args.evaluation_episodes,
            sum(avg_episode_lengths) / args.evaluation_episodes))

      if args.evaluate:
        return

      rewards.append(avg_rewards)  # Keep all evaluations
      steps.append(t_start)
      plot_line(steps, rewards)  # Plot rewards
      torch.save(model.state_dict(), 'model.pth')  # Save model params
      can_test = False  # Finish testing
    else:
      if T.value() - t_start >= args.evaluation_interval:
        can_test = True

    time.sleep(0.001)  # Check if available to test every millisecond

  env.close()
Example #26
def test(rank, args, T, shared_model):
    torch.manual_seed(args.seed + rank)

    env = gym.make(args.env)
    env.seed(args.seed + rank)
    model = ActorCritic(env.observation_space, env.action_space,
                        args.hidden_size)
    model.eval()

    save_dir = os.path.join('results', args.name)

    can_test = True  # Test flag
    t_start = 1  # Test step counter to check against global counter
    rewards, steps = [], []  # Rewards and steps for plotting
    l = str(len(str(args.T_max)))  # Max num. of digits for logging steps
    done = True  # Start new episode

    # stores step, reward, avg_steps and time
    results_dict = {'t': [], 'reward': [], 'avg_steps': [], 'time': []}

    while T.value() <= args.T_max:
        if can_test:
            t_start = T.value()  # Reset counter

            # Evaluate over several episodes and average results
            avg_rewards, avg_episode_lengths = [], []
            for _ in range(args.evaluation_episodes):
                while True:
                    # Reset or pass on hidden state
                    if done:
                        # Sync with shared model every episode
                        model.load_state_dict(shared_model.state_dict())
                        hx = torch.zeros(1, args.hidden_size)
                        cx = torch.zeros(1, args.hidden_size)
                        # Reset environment and done flag
                        state = state_to_tensor(env.reset())
                        done, episode_length = False, 0
                        reward_sum = 0

                    # Optionally render validation states
                    if args.render:
                        env.render()

                    # Calculate policy
                    with torch.no_grad():
                        policy, _, _, (hx, cx), _ = model(state, (hx, cx))

                    # Choose action greedily
                    action = policy.max(1)[1][0]

                    # Step
                    state, reward, done, _ = env.step(action.item())
                    state = state_to_tensor(state)
                    reward_sum += reward
                    done = done or episode_length >= args.max_episode_length  # Stop episodes at a max length
                    episode_length += 1  # Increase episode counter

                    # Log and reset statistics at the end of every episode
                    if done:
                        avg_rewards.append(reward_sum)
                        avg_episode_lengths.append(episode_length)
                        break
            print(('[{}] Step: {:<' + l +
                   '} Avg. Reward: {:<8} Avg. Episode Length: {:<8}').format(
                       datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S,%f')[:-3],
                       t_start,
                       sum(avg_rewards) / args.evaluation_episodes,
                       sum(avg_episode_lengths) / args.evaluation_episodes))
            fields = [
                t_start,
                sum(avg_rewards) / args.evaluation_episodes,
                sum(avg_episode_lengths) / args.evaluation_episodes,
                str(datetime.now())
            ]

            # storing data in the dictionary.
            results_dict['t'].append(t_start)
            results_dict['reward'].append(
                sum(avg_rewards) / args.evaluation_episodes)
            results_dict['avg_steps'].append(
                sum(avg_episode_lengths) / args.evaluation_episodes)
            results_dict['time'].append(str(datetime.now()))

            # Dumping the results in pickle format
            with open(os.path.join(save_dir, 'results.pck'), 'wb') as f:
                pickle.dump(results_dict, f)

            # Saving the data in csv format
            with open(os.path.join(save_dir, 'results.csv'), 'a') as f:
                writer = csv.writer(f)
                writer.writerow(fields)

            if args.evaluate:
                return

            rewards.append(avg_rewards)  # Keep all evaluations
            steps.append(t_start)
            plot_line(steps, rewards, save_dir)  # Plot rewards
            torch.save(model.state_dict(),
                       os.path.join(save_dir,
                                    'model.pth'))  # Save model params
            #   torch.save(model.state_dict(), os.path.join(save_dir, 'model_{}.pth'.format(t_start)))  # Save model params
            can_test = False  # Finish testing
        else:
            if T.value() - t_start >= args.evaluation_interval:
                can_test = True

        time.sleep(0.001)  # Poll roughly every millisecond for the next evaluation window

    # Dumping the results in pickle format
    with open(os.path.join(save_dir, 'results.pck'), 'wb') as f:
        pickle.dump(results_dict, f)

    env.close()
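
The global step counter T passed into this test function only needs to expose a value() method that reflects the steps taken by the training processes. Its actual class lives in the training code; a minimal process-safe counter along these lines (a sketch, not the project's implementation) would behave the same way from the test process's point of view.

from multiprocessing import Lock, Value

class Counter:
    # Sketch of a process-safe global step counter with a value() accessor.
    def __init__(self):
        self._value = Value('i', 0)
        self._lock = Lock()

    def increment(self):
        # Called by the training processes after every environment step.
        with self._lock:
            self._value.value += 1

    def value(self):
        with self._lock:
            return self._value.value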
Example #27
0
def main():
    # Select the device for network computation
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build the neural network
    net = ActorCritic()
    net = net.to(device)

    # Set up the optimizer
    optimizer = torch.optim.Adam(net.parameters(), lr=3e-4)

    # Set up the environments
    envs = Envs(NUM_WORKERS, gamma=GAMMA)

    # Start training
    for episode in range(EPISODES):

        # Collect one episode of data from the parallel environments
        net.eval()
        with torch.no_grad():
            states = envs.reset()
            done = False
            while not done:
                states = states.to(device)
                _, policys = net(states)
                policys = policys.cpu()  # Easier to post-process on the CPU
                # Zero out the probabilities of illegal moves
                for i in range(NUM_WORKERS):
                    if envs.reversis[i].next != 0:
                        for y, x in itertools.product(range(SIZE), repeat=2):
                            if not envs.reversis[i].good[y][x]:
                                policys[i][y * SIZE + x] = 0.
                            else:
                                policys[i][y * SIZE + x] += 1e-8  # Keep the distribution from being all zeros
                actions = Categorical(probs=policys).sample()
                done, states = envs.step(actions)

        envs.setReturn()
        data = EpisodeData(envs.readHistory())
        loader = DataLoader(data,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=2)

        # Train the network
        net.train()

        # Tracking metrics
        value_loss_total = 0.
        entropy_total = 0.

        for states, actions, Returns in loader:
            states, actions, Returns = states.to(device), actions.to(
                device), Returns.to(device)
            values, policys = net(states)

            dist = Categorical(probs=policys)
            action_log_probs = dist.log_prob(actions).view(-1, 1)
            dist_entropy = dist.entropy().mean()  # Encourage higher entropy to keep the policy exploratory

            advantages = Returns.view(-1, 1) - values

            value_loss = advantages.pow(2).mean()
            action_loss = -(advantages.detach() * action_log_probs).mean()

            optimizer.zero_grad()
            (VALUE_LOSS_COEF * value_loss + action_loss -
             ENTROPY_LOSS_COEF * dist_entropy).backward()
            optimizer.step()

            value_loss_total += value_loss.item()
            entropy_total += dist_entropy.item()

        print('Episode: {:>10d}, Value Loss: {:g}, Entropy: {:g}'.format(
            episode, value_loss_total / len(loader),
            entropy_total / len(loader)),
              flush=True)

        if episode != 0 and episode % SAVE_INTERVAL == 0:
            if not os.path.isdir('models'):
                os.mkdir('models')
            torch.save(net.state_dict(),
                       'models/{}.pt'.format(episode // SAVE_INTERVAL))
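
The inner loop above zeroes the probability of illegal Reversi moves cell by cell before sampling, and adds a tiny epsilon to legal cells so the distribution never becomes all zeros. Assuming each environment can expose its legality grid as a boolean tensor, the same masking can be written in a vectorized form; this is a sketch under that assumption, not the code used in the example.

def mask_illegal_moves(policys, legal_masks, eps=1e-8):
    # policys: (num_workers, SIZE * SIZE) action probabilities from the network
    # legal_masks: (num_workers, SIZE * SIZE) boolean tensor, True where a move is legal
    legal = legal_masks.float()
    masked = policys * legal        # zero out illegal moves
    masked = masked + eps * legal   # keep at least eps on legal moves so no row is all zeros
    return masked

The masked probabilities can then be passed to Categorical(probs=masked) exactly as in the example.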
Example #28
0
    def do_test(self, rank, args, shared_model, counter):
        torch.manual_seed(args.seed + rank)
        if args.run_name is None:
            rn = None
        else:
            rn = 'runs/' + args.run_name
        writer = SummaryWriter(log_dir=rn, flush_secs=60)

        cnt = 0
        env = grid2op.make(args.env_name,
                           test=args.for_test,
                           reward_class=L2RPNReward)
        env.seed(args.seed + rank)

        model = ActorCritic(env.observation_space.size(), self.action_space,
                            args.hidden_size)

        model.eval()

        state = self.convert_obs(env.reset())
        state = torch.from_numpy(state)
        reward_sum = 0
        done = True

        start_time = time.time()

        # a quick hack to prevent the agent from getting stuck
        #actions = deque(maxlen=100)
        episode_length = 0
        while True:
            episode_length += 1
            # Sync with the shared model
            if done:
                model.load_state_dict(shared_model.state_dict())
                cx = torch.zeros(1, args.hidden_size)
                hx = torch.zeros(1, args.hidden_size)
            else:
                cx = cx.detach()
                hx = hx.detach()

            with torch.no_grad():
                _, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=-1)
            action = prob.max(1, keepdim=True)[1].numpy()

            state, reward, done, _ = env.step(self.convert_act(action[0, 0]))
            state = self.convert_obs(state)
            done = done or episode_length >= args.max_episode_length
            reward_sum += reward

            # a quick hack to prevent the agent from getting stuck
            #actions.append(action[0, 0])
            #if actions.count(actions[0]) == actions.maxlen:
            #    done = True

            if done:
                print(
                    "Time {}, num steps {}, episode reward {}, episode length {}"
                    .format(
                        time.strftime("%Hh %Mm %Ss",
                                      time.gmtime(time.time() - start_time)),
                        counter.value, reward_sum, episode_length),
                    flush=True)

                writer.add_scalar('Main/Reward', reward_sum, cnt)
                writer.add_scalar('Main/Episode Length', episode_length, cnt)
                writer.add_scalar('Stats/Global steps', counter.value, cnt)
                cnt += 1

                reward_sum = 0
                episode_length = 0
                #actions.clear()
                state = self.convert_obs(env.reset())
                time.sleep(args.test_interval)

            state = torch.from_numpy(state)
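
convert_obs and convert_act come from the surrounding agent class (note the self parameter); in grid2op-based agents they typically flatten the observation into a vector for the network and map a discrete action index back to a grid action. A rough sketch of the observation side, assuming the standard grid2op observation API, might look like this; the project's real converter may do more filtering.

import numpy as np

def convert_obs(self, observation):
    # Hypothetical sketch: flatten the grid2op observation into a float32 vector
    # so its length matches env.observation_space.size() used to build the model.
    return observation.to_vect().astype(np.float32)

The scalars logged through SummaryWriter can be inspected afterwards by pointing TensorBoard at the log directory, e.g. by running tensorboard --logdir runs.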
Example #29
0
File: test.py  Project: 404akhan/research
def test(rank, args, shared_model):
    torch.manual_seed(args.seed + rank)

    env = create_atari_env(args.env_name)
    env.seed(args.seed + rank)

    if not os.path.exists('models-a3c'):
        os.makedirs('models-a3c')
    path = 'models-a3c/model-{}.pth'.format(args.model_name)
    print('saving directory is', path)

    model = ActorCritic(env.action_space.n, args.num_atoms, args.gamma)
    model.eval()

    state = env.reset()
    state = np.concatenate([state] * 4, axis=0)
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True
    action_stat = [0] * model.num_outputs

    start_time = time.time()
    episode_length = 0

    for ep_counter in itertools.count(1):
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())

            torch.save(shared_model.state_dict(), path)
            print('saved model')

        with torch.no_grad():
            atoms_logit, logit = model(state.unsqueeze(0))
        prob = F.softmax(logit, dim=-1)
        action = prob.max(1, keepdim=True)[1].numpy()

        action_np = action[0, 0]
        action_stat[action_np] += 1

        state_new, reward, done, info = env.step(action_np)
        dead = is_dead(info)

        if args.testing:
            atoms_prob = F.softmax(atoms_logit, dim=-1)
            value = model.get_v(atoms_prob, batch=False)
            atoms_prob = atoms_prob.squeeze().data.numpy()

            print('episode', episode_length, 'normal action', action_np,
                  'lives', info['ale.lives'], 'value', value)
            env.render()

            if ep_counter % 100 == 0:
                plt.plot(model.z, atoms_prob)
                plt.title('average v is {}'.format(value))
                plt.show()
        state = np.append(state.numpy()[1:, :, :], state_new, axis=0)
        done = done or episode_length >= args.max_episode_length

        reward_sum += reward
        episode_length += 1

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                time.strftime("%Hh %Mm %Ss",
                              time.gmtime(time.time() - start_time)),
                reward_sum, episode_length))
            print("actions stats real {}".format(
                action_stat[:model.num_outputs]))

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            env.seed(args.seed + rank + (args.num_processes + 1) * ep_counter)
            state = np.concatenate([state] * 4, axis=0)
            action_stat = [0] * model.num_outputs
            if not args.testing: time.sleep(60)

        state = torch.from_numpy(state)
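
This example keeps a stack of the four most recent frames by hand: np.concatenate([state] * 4, axis=0) on reset and np.append(state.numpy()[1:, :, :], state_new, axis=0) after each step. A small helper along these lines (a sketch, assuming frames are NumPy arrays stacked along the channel axis) expresses the same idea more explicitly.

import numpy as np
from collections import deque

class FrameStack:
    # Sketch of the 4-frame stacking done inline in the example above.
    def __init__(self, k=4):
        self.k = k
        self.frames = deque(maxlen=k)

    def reset(self, frame):
        # Fill the stack with copies of the first frame.
        for _ in range(self.k):
            self.frames.append(frame)
        return np.concatenate(self.frames, axis=0)

    def step(self, frame):
        # Drop the oldest frame and append the newest one.
        self.frames.append(frame)
        return np.concatenate(self.frames, axis=0)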
Example #30
0
def test(shared_model, render=0):
    env = create_atari_env(args.rom)
    if render == 1:
        env.render()

    model = ActorCritic(env.observation_space.shape[0], env.action_space)

    model.eval()

    state = env.reset()
    state = torch.from_numpy(state)
    reward_sum = 0
    done = True

    # a quick hack to prevent the agent from getting stuck
    episode_length = 0
    cx = hx = None
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = torch.zeros(1, 256).type(FloatTensor)
            hx = torch.zeros(1, 256).type(FloatTensor)
        else:
            cx = cx.detach()
            hx = hx.detach()

        with torch.no_grad():
            value, logit, (hx, cx) = model(
                (state.unsqueeze(0).type(FloatTensor), (hx, cx)))
        prob = F.softmax(logit, dim=-1)
        # print logit.data.numpy()
        action = prob.max(1, keepdim=True)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        if render:
            #env.render()
            # Spits out images in the selected path
            img = env.render('rgb_array')
            imsave(
                '/opt/tmp/img/pac-20000/frame_{:06d}.png'.format(
                    episode_length), img)
        """    
        TEST-DEMO-ONLY
        state_im = state.numpy()
        state_im.transpose()
        scipy.misc.imageio.saveim(state_im, filename-with-time-step-number)
        #ffmpeg 
        END-WORKZONE
        """

        done = done or episode_length >= 10000
        reward_sum += reward

        # a quick hack to prevent the agent from getting stuck
        # actions.append(action[0, 0])
        # if actions.count(actions[0]) == actions.maxlen:
        #     done = True

        if done:
            print("Time {}, episode reward {}, episode length {}".format(
                get_elapsed_time_str(), reward_sum, episode_length))
            reward_sum = 0
            episode_length = 0
            state = env.reset()
            time.sleep(60)
        state = torch.from_numpy(state)
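
The rendering branch above dumps individual PNG frames to disk, and the commented-out notes mention ffmpeg for turning them into a video. One way to do that from Python, sketched here with the frame directory and output path as placeholders you would adjust, is to call ffmpeg directly:

import subprocess

def frames_to_video(frame_dir, out_path, fps=30):
    # Assumes frames named frame_000000.png, frame_000001.png, ... as saved above.
    subprocess.run([
        'ffmpeg', '-y',
        '-framerate', str(fps),
        '-i', frame_dir + '/frame_%06d.png',
        '-pix_fmt', 'yuv420p',
        out_path,
    ], check=True)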