Example #1
def CFagent(defaults):
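    # Jointly train a Mover that acts on teleporter-modified boards, a
    # Teleporter that proposes interventions, and a counterfactual CFAgent,
    # each updated from its own replay buffer.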
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)
    CFagent = CFAgent(env, **defaults)
    CFbuffer = CFReplayBuffer(**defaults)
    collector = Collector(**defaults)

    with Save(env, collector, mover, teleporter, CFagent, **defaults) as save:
        intervention_idx, modified_board = teleporter.pre_process(env)
        dones = CFagent.pre_process(env)
        CF_dones, cfs = None, None
        for frame in loop(env, collector, save, teleporter):
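            # Each frame: apply the current intervention to the board, act,
            # step the environment, then update the mover, the teleporter and
            # the CFAgent from their respective buffers.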
            CFagent.counterfact(env, dones, teleporter, CF_dones, cfs)
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
            CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
            CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
            CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
            CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)
Example #2
def metateleport(defaults):
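    # Two-level hierarchy: a MetaTeleporter intervenes on the raw board and a
    # second Teleporter intervenes on top of the meta-modified board; each
    # level learns from its own replay buffer.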
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter1 = Teleporter(env, _extra_dim=1, **defaults)
    teleporter2 = MetaTeleporter(env, **defaults)
    buffer1 = ReplayBuffer(**defaults)
    buffer2 = ReplayBuffer(**defaults)

    with Save(env, collector, mover, teleporter1, teleporter2, **defaults) as save:
        intervention_idx2, modified_board2 = teleporter2.pre_process(env)
        intervention_idx1, _ = teleporter1.pre_process(env)
        for frame in loop(env, collector, save, teleporter1, teleporter2):
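            # The meta level (teleporter2) intervenes first; teleporter1's
            # intervention is then applied on top of the meta-modified board.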
            modified_board2 = teleporter2.interveen(env.board, intervention_idx2, modified_board2)
            modified_board1 = teleporter1.interveen(env.board, intervention_idx1, modified_board2)
            actions = mover(modified_board1)
            observations, rewards, dones, info = env.step(actions)
            modified_board1, modified_board2, modified_rewards1, modified_rewards2, modified_dones1, modified_dones2, tele_rewards, intervention_idx1, intervention_idx2 = teleporter2.metamodify(observations, rewards, dones, info, teleporter1.interventions)
            buffer1.teleporter_save_data(teleporter1.boards, modified_board2, teleporter1.interventions, modified_rewards2, modified_dones2, intervention_idx1)
            buffer2.teleporter_save_data(teleporter2.boards, observations, teleporter2.interventions, tele_rewards, dones, intervention_idx2)
            mover.learn(modified_board1, actions, modified_rewards1, modified_dones1)
            board_before, board_after, intervention, tel_rewards, tele_dones = buffer1.sample_data()
            teleporter1.learn(board_after, intervention, tel_rewards, tele_dones, board_before)
            board_before, board_after, intervention, tel_rewards, tele_dones = buffer2.sample_data()
            teleporter2.learn(board_after, intervention, tel_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards1, modified_rewards2, tele_rewards], [dones, modified_dones1, modified_dones2])
Example #3
def Load_Cfagent(defaults):
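    # Resume a saved counterfactual run: restore the trained components with
    # Load, then continue with the same training loop as the CFagent example.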
    with Load(defaults["load_name"], num=defaults['num']) as load:
        collector, env, mover, teleporter, CFagent = load.items(Collector, Game, Mover, Teleporter, CFAgent)
        buffer = ReplayBuffer(**defaults)
        CFbuffer = CFReplayBuffer(**defaults)

        with Save(env, collector, mover, teleporter, CFagent, **defaults) as save:
            intervention_idx, modified_board = teleporter.pre_process(env)
            dones = CFagent.pre_process(env)
            CF_dones, cfs = None, None
            CFagent.CF_count = 0
            for frame in loop(env, collector, save, teleporter):
                CFagent.counterfact(env, dones, teleporter, CF_dones, cfs)
                modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
                actions = mover(modified_board)
                observations, rewards, dones, info = env.step(actions)
                modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
                buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
                mover.learn(modified_board, actions, modified_rewards, modified_dones)
                board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
                teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
                collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
                CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
                CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
                CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
                CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)
Example #4
def simple(defaults):
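    # Baseline: a single Mover learns directly from raw observations, rewards
    # and dones, with no interventions.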
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, **defaults)

    with Save(env, collector, mover, **defaults) as save:
        for frame in loop(env, collector, save):
            actions = mover(env.board)
            observations, rewards, dones, info = env.step(actions)
            mover.learn(observations, actions, rewards, dones)
            collector.collect([rewards], [dones])
Example #5
def teleport(defaults):
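    # The Mover acts on teleporter-modified boards while the Teleporter learns
    # from a replay buffer which interventions to propose.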
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)

    with Save(env, collector, mover, teleporter, **defaults) as save:
        intervention_idx, modified_board = teleporter.pre_process(env)
        for frame in loop(env, collector, save, teleporter):
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
Example #6
def option_critic_run(defaults):
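    # Batched option-critic training: a convolutional option-critic network
    # plus a periodically re-synced prime copy for more stable Q-values;
    # options are re-picked epsilon-greedily whenever the current one ends.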
    collector = Collector(**defaults)
    env = Game(**defaults)
    buffer = ReplayBuffer(**defaults)
    batch = env.batch
    num_options = len(env.layers.layers) - 3
    option_critic = OptionCriticConv(in_features=env.board.shape[1],
                                     num_actions=4,
                                     num_options=num_options,
                                     width=env.board.shape[2],
                                     height=env.board.shape[3],
                                     temperature=0.005,
                                     eps_start=2000000,
                                     eps_min=0.2,
                                     eps_decay=2000000,
                                     eps_test=0.05,
                                     device=device)
    # Create a prime network for more stable Q values
    option_critic_prime = deepcopy(option_critic)
    optim = torch.optim.RMSprop(option_critic.parameters(), lr=0.0005)

    with Save(env, collector, **defaults) as save:
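        # Per-environment bookkeeping: active option, termination flag and the
        # latest action / log-prob / entropy for each of the `batch` games.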
        states = option_critic.get_state(env.board)
        greedy_options = [
            e.item() for e in list(option_critic.greedy_option(states))
        ]
        current_options = [0 for _ in range(batch)]
        option_terminations = [True for _ in range(batch)]
        dones = [False for _ in range(batch)]
        actions = [None for _ in range(batch)]
        logps = [None for _ in range(batch)]
        entropys = [None for _ in range(batch)]
        for frame in loop(env, collector, save):

            epsilon = option_critic.epsilon
            states = option_critic.get_state(env.board)
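            # Reset the bookkeeping of environments whose episode just ended.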
            for i, done in enumerate(dones):
                if done:
                    greedy_options[i] = option_critic.greedy_option(
                        states[i]).item()
                    current_options[i] = 0
                    option_terminations[i] = True
                    actions[i] = None
                    logps[i] = None
                    entropys[i] = None

            for i, (option_termination, current_option, state,
                    greedy_option) in enumerate(
                        zip(option_terminations, current_options, states,
                            greedy_options)):
                if option_termination:
                    current_options[i] = np.random.choice(
                        num_options
                    ) if np.random.rand() < epsilon else greedy_option

                # Sample an action (plus log-prob and entropy) from the policy
                # of the currently active (possibly re-picked) option.
                actions[i], logps[i], entropys[i] = option_critic.get_action(
                    state, current_options[i])
            old_obses = env.board
            next_obses, rewards, dones, _ = env.step(torch.tensor(actions))
            collector.collect([rewards], [dones])
            buffer.save_option_critic(old_obses, current_options, rewards,
                                      next_obses, dones)
            states = option_critic.get_state(next_obses)
            loss = 0
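            # Accumulate the per-environment actor loss; the critic loss is
            # added every update_frequency frames.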
            for i, (next_obs, reward, done, state, current_option, old_obs,
                    logp, entropy) in enumerate(
                        zip(next_obses, rewards, dones, states,
                            current_options, old_obses, logps, entropys)):
                option_terminations[i], greedy_options[i] = option_critic.predict_option_termination(
                    state.unsqueeze(0), current_option)
                loss += actor_loss_fn(old_obs, current_option, logp, entropy,
                                      reward, done, next_obs, option_critic,
                                      option_critic_prime)
            if frame % update_frequency == 0:
                data_batch = buffer.sample_option_critic()
                loss += critic_loss_fn(option_critic, option_critic_prime,
                                       data_batch)
            optim.zero_grad()
            loss.backward()
            optim.step()
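            # Periodically re-sync the prime network with the online option-critic.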
            if frame % freeze_interval == 0:
                option_critic_prime = deepcopy(option_critic)
Example #7
def graphTrain(defaults):
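    # Model/graph-based intervention selection: state-transition information is
    # accumulated in Data (and, when use_model is set, a BayesionNN) and used
    # to choose the teleporter intervention the Mover is trained against.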
    layers: List[LayerType] = environments[defaults['level']][2]
    explorationN = defaults['K1']
    data = Data(layers, defaults['graphMode'])
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    collector = Collector(**defaults)
    model = BayesionNN(layers,
                       depth=defaults['depth'],
                       exploration=defaults['model_explore'],
                       samples=defaults['samples'])
    use_model = defaults['use_model']
    with Save(env, collector, mover, data, **defaults) as save:
        convert = [env.layers.types.index(layer) for layer in layers]
        player = env.layers.types.index(LayerType.Player)
        goal = env.layers.types.index(LayerType.Goal)
        old_states = [state for state in states(env.board, convert, layers)]
        dones = tensor([0 for _ in range(env.batch)])
        rewards = tensor([0 for _ in range(env.batch)])
        eatCheese, interventions = ([True] * env.batch, [None] * env.batch)
        for frame in loop(env, collector, save, teleporter=teleporter):
            data.t = frame
            new_states = [
                state for state in states(env.board, convert, layers)
            ]
            loss = transform(old_states,
                             new_states,
                             dones,
                             rewards,
                             data,
                             layers,
                             model,
                             use_model=use_model)
            loss = transformNot(env.board,
                                new_states,
                                player,
                                goal,
                                convert,
                                data,
                                layers,
                                model,
                                use_model=use_model)
            stateChanged = [
                old != new for old, new in zip(old_states, new_states)
            ]
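            # Pick a new intervention when the abstract state changed or the
            # player reached the previous intervention target.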
            shouldInterviene = [
                cond1 or cond2 for cond1, cond2 in zip(stateChanged, eatCheese)
            ]
            exploration = max((explorationN - frame) / explorationN,
                              defaults['softmax_cap'])
            if use_model:
                interventions = [
                    (getInterventionsmodel(
                        state, env.layers.types, layers, model, env,
                        data.layers_not_in(state), frame) if should else old)
                    for state, should, old in zip(new_states, shouldInterviene,
                                                  interventions)
                ]
            else:
                interventions = [
                    (getInterventions(env, state, data, exploration)
                     if should else old) for state, should, old in zip(
                         new_states, shouldInterviene, interventions)
                ]
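            # Encode the chosen intervention as an extra board channel for the
            # mover (hence Mover(..., _extra_dim=1)).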
            modification = env.board[tensor(interventions)].unsqueeze(1)
            teleporter.interventions = tensor(
                [m.flatten().argmax().item() for m in list(modification)],
                device=device)
            modified_board = cat((env.board, modification), dim=1)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, _, _ = teleporter.modify(
                observations, rewards, dones, info)
            mover.learn(modified_board, actions, modified_rewards,
                        modified_dones)
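            # Flatten each player's (x, y) position into a single board index.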
            playerPositions = [
                (t := env.layers.dict[LayerType.Player].positions[i][0])[1] *
                env.layers.width + t[0] for i in range(env.batch)
            ]
            eatCheese = [
                intervention == player_pos for intervention, player_pos in zip(
                    teleporter.interventions, playerPositions)
            ]
            old_states = new_states
            collector.collect([rewards, modified_rewards],
                              [dones, modified_dones])
            collector.collect_loss(loss)