Example #1
File: train.py Project: takatoy/mine-rl
def main():
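    """Train a MineRL ObtainDiamondDense agent: pretrain with behavioral cloning on
    Treechop demonstrations, then alternate environment rollouts with discriminator
    and PPO policy updates, logging progress to TensorBoard."""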
    writer = SummaryWriter()

    env = gym.make('MineRLObtainDiamondDense-v0')
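    # Wrappers: optional frame skipping, observation preprocessing, channel-first
    # image layout (MoveAxis), and a combined/simplified action space.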
    if FRAME_SKIP > 0:
        env = FrameSkip(env, FRAME_SKIP)
    env = ObsWrapper(env)
    env = MoveAxisWrapper(env, -1, 0)
    env = CombineActionWrapper(env)

    agent = Agent(env.observation_space, env.action_space)
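    # Expert demonstrations from MineRLTreechop-v0: sarsd_iter yields
    # (state, action, reward, next_state, done) batches of up to DATA_BATCH_SIZE
    # transitions and loops over the dataset indefinitely (num_epochs=-1).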
    data = minerl.data.make('MineRLTreechop-v0', data_dir=MINERL_DATA_ROOT)
    data_source = data.sarsd_iter(num_epochs=-1, max_sequence_len=DATA_BATCH_SIZE)

    # data_2 = minerl.data.make('MineRLObtainDiamond-v0', data_dir=MINERL_DATA_ROOT)
    # data_2_source = data.sarsd_iter(num_epochs=-1, max_sequence_len=128)

    # behavioral cloning
    train_from_expert(agent, data_source)

    net_steps = 0
    n_episode = 0
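    # Rollout loop: act in the environment, add the agent's bonus reward to the
    # environment reward, store each transition in the agent's memory, and train
    # at regular intervals and at the end of each episode.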
    while True:
        obs = env.reset()
        done = False
        netr = 0
        net_bonus_r = 0
        nobs = None
        step = 0
        while not done:
            action = agent.act(obs)
            nobs, reward, done, info = env.step(action)
            netr += reward
            reward += agent.bonus_reward(obs, action, nobs)
            net_bonus_r += reward
            agent.add_data(obs, action, reward, nobs, done)
            obs = nobs

            # For better visibility during training, it is suggested to register
            # progress continuously, e.g. when 54% is complete:
            # aicrowd_helper.register_progress(0.54)

            # To fetch the latest information from the instance manager, run the
            # following whenever you want to know the current state:
            #   parser.update_information()
            #   print(parser.payload)
            # .payload: the AIcrowd-generated JSON
            # Example: {'state': 'RUNNING', 'score': {'score': 0.0, 'score_secondary': 0.0}, 'instances': {'1': {'totalNumberSteps': 2001, 'totalNumberEpisodes': 0, 'currentEnvironment': 'MineRLObtainDiamond-v0', 'state': 'IN_PROGRESS', 'episodes': [{'numTicks': 2001, 'environment': 'MineRLObtainDiamond-v0', 'rewards': 0.0, 'state': 'IN_PROGRESS'}], 'score': {'score': 0.0, 'score_secondary': 0.0}}}}
            # .current_state: in-depth state information available as a dictionary (key: instance id)

            step += 1
            net_steps += 1

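            # Every TRAIN_INTERVAL steps (and at episode end), train until the
            # agent's memory is empty: each pass pulls an expert batch, converts it
            # to the wrapped env's format, trains the discriminator on it, and runs
            # a PPO policy update.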
            if (TRAIN_INTERVAL != 0 and step % TRAIN_INTERVAL == 0) or done:
                total_discrim_loss = 0.0
                total_value = total_ppo_loss = total_value_loss = total_entropy = 0
                n_epoch = 0
                while not agent.is_memory_empty():
                    s, a, _, _, _ = next(data_source)
                    s = data_state_wrapper(s)
                    a = data_action_wrapper(a)
                    total_discrim_loss += agent.train_discriminator(s, a)
                    value, ppo_loss, value_loss, entropy = agent.train_policy()

                    total_value += value
                    total_ppo_loss += ppo_loss
                    total_value_loss += value_loss
                    total_entropy += entropy
                    n_epoch += 1

                writer.add_scalar('Train/Value', total_value / n_epoch, net_steps)
                writer.add_scalar('Train/PolicyLoss', total_ppo_loss / n_epoch, net_steps)
                writer.add_scalar('Train/ValueLoss', total_value_loss / n_epoch, net_steps)
                writer.add_scalar('Train/Entropy', total_entropy / n_epoch, net_steps)
                writer.add_scalar('Train/DiscriminatorLoss', total_discrim_loss / n_epoch, net_steps)
                agent.save_model()

        writer.add_scalar('Reward/ExternalReward', netr, n_episode)
        writer.add_scalar('Reward/TotalReward', net_bonus_r, n_episode)
        n_episode += 1

        agent.save_model()

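    # NOTE: the `while True` loop above never breaks, so the final save and cleanup
    # below are only reached if a termination condition is added to the loop.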
    agent.save_model()

    aicrowd_helper.register_progress(1)
    env.close()