Example #1
    def __init__(self, model_path, dtype, seed=451):
        self._seed = seed
        self._idx = 0
        self._dtype = dtype
        self.env = LoveLetterEnv(AgentRandom(seed), seed)
        state = self.env.reset()

        self._model = ActorCritic(state.shape[0],
                                  self.env.action_space).type(dtype)
        if torch.cuda.is_available():
            self._model.load_state_dict(torch.load(model_path))
Example #2
    def __init__(self, model_path, dtype, seed=451):
        self._seed = seed
        self._idx = 0
        self._dtype = dtype
        self.env = LoveLetterEnv(AgentRandom(seed), seed)
        state = self.env.reset()

        self._model = ActorCritic(state.shape[0],
                                  self.env.action_space).type(dtype)
        self._model.load_state_dict(
            torch.load(model_path, map_location={'cuda:0': 'cpu'}))
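Examples #1 and #2 differ only in how the checkpoint is restored: the first loads the weights only when CUDA is available, the second remaps CUDA tensors onto the CPU via map_location. A device-agnostic sketch covering both cases (an assumption, not the project's code; it relies on torch.load accepting a map_location string, available since PyTorch 0.4):

        # Pick the target device once and let torch.load remap the tensors.
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self._model.load_state_dict(
            torch.load(model_path, map_location=device))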
Example #3
    def __init__(self, model_path, seed=451):
        self._seed = seed
        self._idx = 0
        self.env = LoveLetterEnv(AgentRandom(seed), seed)
        self.vec_env = DummyVecEnv([
            lambda: self.env
        ])  # The algorithms require a vectorized environment to run

        state = self.env.reset()

        self._model = PPO2(MlpPolicy,
                           self.vec_env,
                           verbose=0,
                           tensorboard_log="./tensorboard/")
        self._model.load(model_path)
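One caveat for Example #3 (and the identical constructor in Example #5): in stable-baselines, PPO2.load is a classmethod that returns a new model rather than mutating the instance it is called on, so self._model.load(model_path) leaves self._model with freshly initialised weights. A sketch of the more conventional pattern (an assumption, not the project's code):

        # load() returns a new model; keep the result and re-attach the env.
        self._model = PPO2.load(model_path, env=self.vec_env)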
Example #4
File: a3c.py  Project: user01/love-letter
class AgentA3C(Agent):
    '''Agent which leverages Actor Critic Learning'''

    def __init__(self,
                 model_path,
                 dtype,
                 seed=451):
        self._seed = seed
        self._idx = 0
        self._dtype = dtype
        self.env = LoveLetterEnv(AgentRandom(seed), seed)
        state = self.env.reset()

        self._model = ActorCritic(
            state.shape[0], self.env.action_space).type(dtype)
        self._model.load_state_dict(torch.load(model_path))

    def _move(self, game):
        '''Return the action chosen by the trained model'''
        assert game.active()
        self._idx += 1

        state = self.env.force(game)
        state = torch.from_numpy(state).type(self._dtype)
        cx = Variable(torch.zeros(1, 256).type(self._dtype), volatile=True)
        hx = Variable(torch.zeros(1, 256).type(self._dtype), volatile=True)

        _, logit, (hx, cx) = self._model(
            (Variable(state.unsqueeze(0), volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action_idx = prob.max(1)[1].data.cpu().numpy()[0, 0]

        player_action = self.env.action_from_index(action_idx, game)
        if player_action is None:
            # print("ouch")
            options = Agent.valid_actions(game, self._seed + self._idx)
            if len(options) < 1:
                raise Exception("Unable to play without actions")

            random.seed(self._seed + self._idx)
            return random.choice(options)

        # print("playing ", self._idx, player_action)
        return player_action
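Example #7 below evaluates a saved policy by pitting AgentA3C against a random opponent through Arena.compare_agents_float. A minimal usage sketch along those lines (the checkpoint path is a placeholder):

dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
win_rate = Arena.compare_agents_float(
    lambda seed: AgentA3C('models/a3c_max', dtype, seed),  # placeholder path
    lambda seed: AgentRandom(seed),
    800)  # number of games, as in Example #7
print("Win rate vs random: {:.1%}".format(win_rate))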
Example #5
class TFAgent(Agent):
    '''Agent which leverages TensorFlow'''
    def __init__(self, model_path, seed=451):
        self._seed = seed
        self._idx = 0
        self.env = LoveLetterEnv(AgentRandom(seed), seed)
        self.vec_env = DummyVecEnv([
            lambda: self.env
        ])  # The algorithms require a vectorized environment to run

        state = self.env.reset()

        self._model = PPO2(MlpPolicy,
                           self.vec_env,
                           verbose=0,
                           tensorboard_log="./tensorboard/")
        self._model.load(model_path)

    def _move(self, game):
        '''Return the action chosen by the trained model'''
        assert game.active()
        self._idx += 1

        state = self.env.force(game)
        action_idx = self._model.predict(state, deterministic=True)[0]

        player_action = self.env.action_from_index(action_idx, game)
        if player_action is None:
            # print("ouch")
            options = Agent.valid_actions(game, self._seed + self._idx)
            if len(options) < 1:
                raise Exception("Unable to play without actions")

            random.seed(self._seed + self._idx)
            return random.choice(options)

        # print("playing ", self._idx, player_action)
        return player_action
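A note on the [0] in Example #5's _move: stable-baselines' predict returns an (action, state) pair, where the second element is the recurrent hidden state (None for MlpPolicy). The indexing can be made explicit (a trivial rewrite, not from the project):

        action_idx, _hidden_state = self._model.predict(state,
                                                        deterministic=True)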
Example #6
    help='path/prefix for the filename to save shared model\'s parameters')
parser.add_argument(
    '--load-name',
    default=None,
    metavar='SN',
    help='path/prefix for the filename to load shared model\'s parameters')

if __name__ == '__main__':
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    dtype = (torch.cuda.FloatTensor if torch.cuda.is_available()
             else torch.FloatTensor)

    env = LoveLetterEnv(AgentRandom(args.seed), args.seed)
    state = env.reset()
    shared_model = ActorCritic(state.shape[0], env.action_space).type(dtype)
    if args.load_name is not None:
        shared_model.load_state_dict(torch.load(args.load_name))
    shared_model.share_memory()

    # train(1,args,shared_model,dtype)
    processes = []

    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, dtype))
    p.start()
    processes.append(p)

    if not args.evaluate:
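The excerpt is cut off at the final if statement. Based on the commented-out call to train above and the train(rank, args, shared_model, dtype) signature in Example #8, the block that follows presumably spawns the training workers in the usual A3C fashion; a sketch of that continuation (an assumption, not the project's exact code):

    if not args.evaluate:
        # one training worker per process, all sharing shared_model
        for rank in range(args.num_processes):
            p = mp.Process(target=train,
                           args=(rank, args, shared_model, dtype))
            p.start()
            processes.append(p)

    for p in processes:
        p.join()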
Example #7
def test(rank, args, shared_model, dtype):
    test_ctr = 0
    torch.manual_seed(args.seed + rank)

    # set up logger
    timestring = str(date.today()) + '_' + \
        time.strftime("%Hh-%Mm-%Ss", time.localtime(time.time()))
    run_name = args.save_name + '_' + timestring
    configure("logs/run_" + run_name, flush_secs=5)

    env = LoveLetterEnv(AgentRandom(args.seed + rank), args.seed + rank)
    env.seed(args.seed + rank)
    state = env.reset()

    model = ActorCritic(state.shape[0], env.action_space).type(dtype)

    model.eval()

    state = torch.from_numpy(state).type(dtype)
    reward_sum = 0
    max_reward = -99999999
    max_winrate = 0
    rewards_recent = deque([], 100)
    done = True

    start_time = time.time()

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        if done:
            model.load_state_dict(shared_model.state_dict())
            cx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
            hx = Variable(torch.zeros(1, 256).type(dtype), volatile=True)
        else:
            cx = Variable(cx.data.type(dtype), volatile=True)
            hx = Variable(hx.data.type(dtype), volatile=True)

        value, logit, (hx, cx) = model((Variable(state.unsqueeze(0),
                                                 volatile=True), (hx, cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.cpu().numpy()

        state, reward, done, _ = env.step(action[0, 0])
        done = done or episode_length >= args.max_episode_length
        reward_sum += reward

        if done:
            rewards_recent.append(reward_sum)
            rewards_recent_avg = sum(rewards_recent) / len(rewards_recent)
            print(
                "{} | Episode Reward {: >4}, Length {: >2} | Avg Reward {:0.2f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, episode_length, rewards_recent_avg))

            # if not stuck or args.evaluate:
            log_value('Reward', reward_sum, test_ctr)
            log_value('Reward Average', rewards_recent_avg, test_ctr)
            log_value('Episode length', episode_length, test_ctr)

            if reward_sum >= max_reward:
                # pickle.dump(shared_model.state_dict(), open(args.save_name + '_max' + '.p', 'wb'))
                path_output = args.save_name + '_max'
                torch.save(shared_model.state_dict(), path_output)
                path_now = "{}_{}".format(args.save_name,
                                          datetime.datetime.now().isoformat())
                torch.save(shared_model.state_dict(), path_now)
                max_reward = reward_sum

                win_rate_v_random = Arena.compare_agents_float(
                    lambda seed: AgentA3C(path_output, dtype, seed),
                    lambda seed: AgentRandom(seed), 800)
                msg = " {} | VsRandom: {: >4}%".format(
                    datetime.datetime.now().strftime("%c"),
                    round(win_rate_v_random * 100, 2))
                print(msg)
                log_value('Win Rate vs Random', win_rate_v_random, test_ctr)
                if win_rate_v_random > max_winrate:
                    print("Found superior model at {}".format(
                        datetime.datetime.now().isoformat()))
                    torch.save(
                        shared_model.state_dict(), "{}_{}_best_{}".format(
                            args.save_name,
                            datetime.datetime.now().isoformat(),
                            win_rate_v_random))
                    max_winrate = win_rate_v_random

            reward_sum = 0
            episode_length = 0
            state = env.reset()
            test_ctr += 1

            if test_ctr % 10 == 0 and not args.evaluate:
                # pickle.dump(shared_model.state_dict(), open(args.save_name + '.p', 'wb'))
                torch.save(shared_model.state_dict(), args.save_name)
            if not args.evaluate:
                time.sleep(60)
            elif test_ctr == evaluation_episodes:
                # Ensure the environment is closed so we can complete the
                # submission
                env.close()
                # gym.upload('monitor/' + run_name, api_key=api_key)

        state = torch.from_numpy(state).type(dtype)
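The volatile=True Variables in Example #7 date the code to PyTorch 0.3; from 0.4 onward the same inference-only forward pass would sit inside torch.no_grad() (a sketch of the modern equivalent, not the project's code):

        with torch.no_grad():
            value, logit, (hx, cx) = model((state.unsqueeze(0), (hx, cx)))
            prob = F.softmax(logit, dim=1)
            action = prob.max(1)[1].item()  # already a plain int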
Example #8
def train(rank, args, shared_model, dtype):
    torch.manual_seed(args.seed + rank)

    env = LoveLetterEnv(AgentRandom(args.seed + rank), args.seed + rank)
    env.seed(args.seed + rank)
    state = env.reset()

    model = ActorCritic(state.shape[0], env.action_space).type(dtype)

    optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()

    values = []
    log_probs = []

    state = torch.from_numpy(state).type(dtype)
    done = True

    episode_length = 0
    while True:
        episode_length += 1
        # Sync with the shared model
        model.load_state_dict(shared_model.state_dict())
        if done:
            cx = Variable(torch.zeros(1, 256).type(dtype))
            hx = Variable(torch.zeros(1, 256).type(dtype))
        else:
            cx = Variable(cx.data.type(dtype))
            hx = Variable(hx.data.type(dtype))

        values = []
        log_probs = []
        rewards = []
        entropies = []

        for step in range(args.num_steps):
            value, logit, (hx, cx) = model(
                (Variable(state.unsqueeze(0)), (hx, cx)))
            prob = F.softmax(logit)
            log_prob = F.log_softmax(logit)
            entropy = -(log_prob * prob).sum(1)
            entropies.append(entropy)

            action = prob.multinomial().data
            log_prob = log_prob.gather(1, Variable(action))

            state, reward, done, _ = env.step(action.cpu().numpy()[0][0])
            done = done or episode_length >= args.max_episode_length

            if done:
                episode_length = 0
                state = env.reset()

            state = torch.from_numpy(state).type(dtype)
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                break

        R = torch.zeros(1, 1).type(dtype)
        if not done:
            value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
            R = value.data

        values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1).type(dtype)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae) - args.beta * entropies[i]

        optimizer.zero_grad()

        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 40)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
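The backward loop in Example #8 implements Generalized Advantage Estimation: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), accumulated as gae = gamma * tau * gae + delta_t. The same recurrence as a standalone sketch on plain floats (names are illustrative, not from the project):

def compute_gae(rewards, values, gamma, tau):
    """Generalized Advantage Estimation over one rollout.

    `values` must contain one more entry than `rewards`: the bootstrap
    value R appended after the rollout, exactly as in the loop above.
    """
    advantages = [0.0] * len(rewards)
    gae = 0.0
    for i in reversed(range(len(rewards))):
        delta_t = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gamma * tau * gae + delta_t
        advantages[i] = gae
    return advantages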
Example #9
class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(
            *args,
            **kwargs,
            net_arch=[512, dict(pi=[256, 128], vf=[256, 128])],
            feature_extraction="mlp")


if __name__ == '__main__':
    args = parser.parse_args()

    if args.load_name:
        env = SubprocVecEnv([
            # bind i per lambda; a bare closure would hand every worker the
            # seed from the last loop iteration
            lambda i=i: LoveLetterEnv(TFAgent(args.load_name, args.seed + i))
            for i in range(args.num_processes)
        ])
    else:
        env = SubprocVecEnv([
            lambda i=i: LoveLetterEnv(AgentRandom(args.seed + i))
            for i in range(args.num_processes)
        ])

    model = PPO2(CustomPolicy,
                 env,
                 verbose=0,
                 tensorboard_log=args.log_dir,
                 learning_rate=args.lr,
                 n_steps=args.num_steps,
                 nminibatches=5)
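Example #9 stops right after the model is constructed. In stable-baselines the script would typically continue by calling learn() and save(); a sketch of that continuation with placeholder argument names (an assumption, not the project's code):

    # Hypothetical continuation: the argument names below are placeholders.
    model.learn(total_timesteps=args.total_timesteps,
                tb_log_name="ppo2_love_letter")
    model.save(args.save_name)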