        policy_stable = False
        # print(Policy)
        if policy_stable:
            break

        # Estimate the policy's win probability with Monte Carlo rollouts
        win = 0
        draw = 0  # number of drawn (tied) games
        win_rate = 0

        n_episodes = 200000
        for i in range(n_episodes):
            env.reset()
            while True:
                d, p = env.get_state()
                action = Policy[d - 1, p - 1]
                next_state, reward = env.step(action)

                if next_state == 'terminal':
                    if reward > 0:
                        win += 1
                    elif reward == 0:
                        draw += 1
                    break
        win_rate = win / n_episodes
        print(epi, win, draw, win_rate)
        result.append(win_rate)

        # Append this iteration's results to the 'result' file (created if it does not exist)
        with open('result', 'a') as f:
            f.write(
                str(epi) + '\t' + str(win) + '\t' + str(draw) + '\t' +
                str(win_rate) + '\n')
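The evaluation loop above assumes an Easy21/blackjack-style environment exposing reset(), get_state() -> (dealer, player) and step(action) -> (next_state, reward), with the string 'terminal' marking the end of an episode; the environment class itself is not part of this fragment. A minimal sketch of a compatible interface follows. The class name SimpleBlackjackEnv and its simplified card rules are illustrative assumptions, not the original environment.

import random


class SimpleBlackjackEnv:
    """Minimal stand-in with the interface used above: reset(), get_state()
    and step(action) -> (next_state, reward). The rules are simplified."""

    HIT, STICK = 0, 1

    def reset(self):
        self.dealer = random.randint(1, 10)  # dealer's visible card
        self.player = random.randint(1, 10)  # player's current sum
        return self.get_state()

    def get_state(self):
        return self.dealer, self.player

    def step(self, action):
        if action == self.HIT:
            self.player += random.randint(1, 10)
            if self.player > 21:             # player busts
                return 'terminal', -1
            return self.get_state(), 0
        # STICK: dealer draws until reaching 17 or more, then the sums decide
        while self.dealer < 17:
            self.dealer += random.randint(1, 10)
        if self.dealer > 21 or self.player > self.dealer:
            return 'terminal', 1
        if self.player == self.dealer:
            return 'terminal', 0
        return 'terminal', -1

Against such an environment, Policy would be an integer array of shape (10, 21), indexed by (dealer - 1, player - 1) and holding 0 (hit) or 1 (stick) per state.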
Example #2
        # clear the explosion and bomb sprites from the canvas and drop the references
        self.canvas.delete(self.explosionBH)
        self.canvas.delete(self.bombA)
        self.canvas.delete(self.bombB)
        self.explosionAV = None
        self.explosionAH = None
        self.explosionBV = None
        self.explosionBH = None
        self.bombA = None
        self.bombB = None


if __name__ == '__main__':
    env = Game()
    vis = Visualiser(env, 80)
    # drive both players with random actions for 100 steps and redraw each time
    for i in range(100):
        env.step(random.randint(0, 4), random.randint(0, 4))
        vis.update_canvas(env)

    # env.step(1,1)
    # vis.update_canvas(env)
    # env.step(1,1)
    # vis.update_canvas(env)
    # env.step(2,1)
    # vis.update_canvas(env)
    # env.step(2,1)
    # vis.update_canvas(env)
    # env.step(4,1)
    # vis.update_canvas(env)
    # env.step(1,1)
    # vis.update_canvas(env)
    # env.step(2,1)
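The __main__ block above only exercises the Game and Visualiser interface: Game() with no arguments, Game.step(action_a, action_b) for two players with actions in 0-4, and Visualiser(env, cell_size) with update_canvas(env); the classes themselves sit above the truncated fragment. A minimal skeleton of that interface is sketched below; the grid size, movement rules and Tkinter drawing details are placeholders for illustration (the real game also tracks bombs and explosions), not the original implementation.

import random
import tkinter as tk


class Game:
    """Two-player grid game; step() takes one action per player (0-4)."""

    def __init__(self, size=10):
        self.size = size
        self.pos_a = (0, 0)
        self.pos_b = (size - 1, size - 1)

    def step(self, action_a, action_b):
        # placeholder: move each player on a wrapping grid; the original game
        # also places bombs and resolves explosions
        moves = {0: (0, 0), 1: (0, -1), 2: (0, 1), 3: (-1, 0), 4: (1, 0)}
        for attr, action in (('pos_a', action_a), ('pos_b', action_b)):
            x, y = getattr(self, attr)
            dx, dy = moves[action]
            setattr(self, attr, ((x + dx) % self.size, (y + dy) % self.size))


class Visualiser:
    """Draws the game state on a Tkinter canvas; cell_size is pixels per cell."""

    def __init__(self, env, cell_size):
        self.cell_size = cell_size
        self.root = tk.Tk()
        self.canvas = tk.Canvas(self.root,
                                width=env.size * cell_size,
                                height=env.size * cell_size)
        self.canvas.pack()

    def update_canvas(self, env):
        self.canvas.delete('all')
        for pos, colour in ((env.pos_a, 'red'), (env.pos_b, 'blue')):
            x, y = pos
            self.canvas.create_rectangle(x * self.cell_size, y * self.cell_size,
                                         (x + 1) * self.cell_size,
                                         (y + 1) * self.cell_size, fill=colour)
        self.root.update()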
Example #3
def train(net, rank):
    torch.set_num_threads(1)  # also set MKL_NUM_THREADS=1 in the environment

    net.reset()
    env = Game(True, 4000 + rank + 1, max_steps=250)

    target_net = Net(1254, 6, 36)
    target_net.load_state_dict(net.state_dict())
    target_net.reset()

    epsilon = epsilon1

    optimizer = optim.RMSprop(net.parameters(), lr=learning_rate)
    last_save = time.time()
    last_notify = time.time()
    last_sync = time.time()
    episode_number = 0
    terminal = True
    prev_value = None
    available_objects = None
    num_objects = len(env.objects)
    recent_rewards_of_episodes = []
    recent_steps_of_episodes = []

    quest1_reward_cnt = 0
    quest2_reward_cnt = 0
    quest3_reward_cnt = 0
    quest4_reward_cnt = 0
    quest1_rewards = np.zeros(100)
    quest2_rewards = np.zeros(100)
    quest3_rewards = np.zeros(100)
    quest4_rewards = np.zeros(100)

    if rank == 0:
        stats = []

    while True:
        if terminal:
            student_saw_obelisk = False
            quest1_rewards[episode_number % len(quest1_rewards)] = 0
            quest2_rewards[episode_number % len(quest2_rewards)] = 0
            quest3_rewards[episode_number % len(quest3_rewards)] = 0
            quest4_rewards[episode_number % len(quest4_rewards)] = 0
            prev_value = None
            num_steps = 0
            net.reset()
            target_net.reset()
            state, reward, terminal, available_objects = env.reset()
            sum_rewards = reward

        state = torch.LongTensor(state)
        objects_probs = net(Variable(state.unsqueeze(0)))

        _objects_probs = objects_probs.data.numpy()

        # Choose action: epsilon-greedy over the currently available objects
        if random.random() < epsilon:
            if available_objects is None:
                objects = list(enumerate(env.objects))
            else:
                objects = [
                    obj for obj in enumerate(env.objects)
                    if obj[0] in available_objects
                ]

            _object = random.choice(objects)[0]
        else:
            if available_objects is not None:
                mask = np.zeros(num_objects)
                for e in available_objects:
                    mask[e] = 1
                _objects_probs = objects_probs.data.numpy() * mask
                _objects_probs = _objects_probs + (_objects_probs == 0) * -1e30
            _object = int(np.argmax(_objects_probs))

        prev_value = objects_probs[0, _object]

        # step the environment and get new measurements
        state, reward, terminal, available_objects = env.step(5, _object)
        sum_rewards += reward
        num_steps += 1

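        # Reward magnitudes mark quest completions (quest 4 ~ 10, quest 3 ~ 8,
        # quest 2 ~ 7, quest 1 ~ 5). With the curriculum enabled, later-quest
        # progress may be penalised or the episode ended early until the
        # earlier quests are being solved reliably.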
        if reward > 10 - 0.0001:
            quest4_reward_cnt = quest4_reward_cnt + 1
            quest4_rewards[episode_number % len(quest4_rewards)] = 1
        elif reward > 8 - 0.0001:
            quest3_reward_cnt = quest3_reward_cnt + 1
            quest3_rewards[episode_number % len(quest3_rewards)] = 1
            if not disable_curriculum:
                if not student_saw_obelisk:
                    reward = -8
                    terminal = True
        elif reward > 7 - 0.0001:
            student_saw_obelisk = True
            quest2_reward_cnt = quest2_reward_cnt + 1
            quest2_rewards[episode_number % len(quest2_rewards)] = 1
            if not disable_curriculum:
                if np.mean(quest2_rewards) < 0.75 and random.random() < 0.9:
                    terminal = True
        elif reward > 5 - 0.0001:
            quest1_reward_cnt = quest1_reward_cnt + 1
            quest1_rewards[episode_number % len(quest1_rewards)] = 1
            if not disable_curriculum:
                if np.mean(quest1_rewards) < 0.9 and random.random() < 0.85:
                    terminal = True

        # Anneal exploration from epsilon1 down to epsilon2 once quests 1-3 are
        # being solved almost every episode
        if 2 * epsilon > (epsilon1 + epsilon2):
            if (np.mean(quest3_rewards) > .98 and np.mean(quest2_rewards) > .98
                    and np.mean(quest1_rewards) > .98):
                epsilon = epsilon2
                if rank == 0:
                    notify("Epsilon is now:" + str(epsilon))

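        # Bootstrap value for the TD target: 0 at terminal states; otherwise the
        # maximum masked Q-value, re-evaluated with the periodically synced
        # target network when target_q_ts is set.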
        if terminal:
            next_value = 0
        else:
            if target_q_ts is None:
                next_value = float(np.max(_objects_probs))
            else:
                state = torch.LongTensor(state)
                objects_probs = target_net(Variable(state.unsqueeze(0)))
                _objects_probs = objects_probs.data.numpy()
                if available_objects is not None:
                    mask = np.zeros(num_objects)
                    for e in available_objects:
                        mask[e] = 1
                    _objects_probs = _objects_probs * mask
                    _objects_probs = _objects_probs + (_objects_probs
                                                       == 0) * -1e30
                next_value = float(np.max(_objects_probs))

        loss = (reward + gamma * next_value - prev_value)**2

        # Update on every important step (|reward| > 4) and on ~5% of the rest
        if abs(reward) > 4 or random.random() < 0.05:
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            nn.utils.clip_grad_norm_(net.parameters(), 1)
            optimizer.step()

        if terminal:
            recent_rewards_of_episodes.append(sum_rewards)
            recent_steps_of_episodes.append(num_steps)
            if len(recent_rewards_of_episodes) > 100:
                recent_rewards_of_episodes.pop(0)
            if len(recent_steps_of_episodes) > 100:
                recent_steps_of_episodes.pop(0)

            episode_number += 1
            if (target_q_ts is not None
                    and time.time() - last_sync > target_q_ts):
                if rank == 0:
                    print("Update target")
                target_net.load_state_dict(net.state_dict())
                last_sync = time.time()

            if rank == 0:
                stats.append({})
                stats[-1]["episode_number"] = episode_number
                stats[-1]["sum_rewards"] = sum_rewards
                stats[-1]["num_steps"] = num_steps
                stats[-1]["mean_recent_rewards_of_episodes"] = np.mean(
                    recent_rewards_of_episodes)
                stats[-1]["mean_recent_steps_of_episodes"] = np.mean(
                    recent_steps_of_episodes)
                stats[-1]["quest1_reward_cnt"] = quest1_reward_cnt
                stats[-1]["quest2_reward_cnt"] = quest2_reward_cnt
                stats[-1]["quest3_reward_cnt"] = quest3_reward_cnt
                stats[-1]["quest4_reward_cnt"] = quest4_reward_cnt
                stats[-1]["mean_quest1_rewards"] = np.mean(quest1_rewards)
                stats[-1]["mean_quest2_rewards"] = np.mean(quest2_rewards)
                stats[-1]["mean_quest3_rewards"] = np.mean(quest3_rewards)
                stats[-1]["mean_quest4_rewards"] = np.mean(quest4_rewards)

                summary = "{} {:.4} {} {:.4} {:.4} Qc: {} {} {} {} Q: {} {} {} {}".format(
                    episode_number, sum_rewards, num_steps,
                    np.mean(recent_rewards_of_episodes),
                    np.mean(recent_steps_of_episodes), quest1_reward_cnt,
                    quest2_reward_cnt, quest3_reward_cnt, quest4_reward_cnt,
                    np.mean(quest1_rewards), np.mean(quest2_rewards),
                    np.mean(quest3_rewards), np.mean(quest4_rewards))
                print(summary)

                if save_every is not None:
                    if time.time() - last_save > save_every:
                        print("Saving..")
                        torch.save(net.state_dict(), name)
                        with open(name_stats, "wb") as _fh:
                            pickle.dump(stats, _fh)
                        last_save = time.time()

                if notify_every is not None:
                    if time.time() - last_notify > notify_every:
                        print("Notify..")
                        notify(summary)
                        last_notify = time.time()

                if max_episodes is not None and episode_number == max_episodes:
                    torch.save(net.state_dict(), name)
                    with open(name_stats, "wb") as _fh:
                        pickle.dump(stats, _fh)
                    notify(summary)
                    notify("Done.")
                    print("Done.")
                    sys.exit()
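train() also depends on module-level configuration (epsilon1, epsilon2, learning_rate, gamma, disable_curriculum, target_q_ts, save_every, notify_every, max_episodes, name, name_stats), a notify() helper, and the Net and Game classes, none of which are shown in this fragment. The sketch below shows one plausible way to fill that in and launch several workers; the concrete values, the notify() body and the torch.multiprocessing launch are assumptions for illustration, not the original setup.

import pickle
import random
import sys
import time

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# hypothetical hyperparameters and paths (values are illustrative only)
learning_rate = 1e-4             # RMSprop step size
gamma = 0.99                     # discount factor
epsilon1, epsilon2 = 1.0, 0.1    # exploration rate before/after the quests are mastered
disable_curriculum = False
target_q_ts = 300                # seconds between target-net syncs (None = bootstrap from the online net)
save_every = 600                 # seconds between checkpoints (None = never save)
notify_every = None              # seconds between notifications (None = never notify)
max_episodes = None              # stop after this many episodes (None = run forever)
name = "model.pt"                # checkpoint path
name_stats = "stats.pkl"         # pickled per-episode statistics


def notify(message):
    # placeholder notifier; the original presumably pushes to an external service
    print("[notify]", message)


if __name__ == '__main__':
    import torch.multiprocessing as mp

    shared_net = Net(1254, 6, 36)  # same constructor arguments as target_net above
    shared_net.share_memory()      # Hogwild-style parameter sharing across workers
    workers = [mp.Process(target=train, args=(shared_net, rank)) for rank in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()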