def CFagent(defaults):
    # Counterfactual-agent variant: the Mover acts on teleporter-modified boards
    # while a CFAgent proposes counterfactual interventions and learns from its
    # own replay buffer.
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)
    CFagent = CFAgent(env, **defaults)
    CFbuffer = CFReplayBuffer(**defaults)
    collector = Collector(**defaults)
    with Save(env, collector, mover, teleporter, CFagent, **defaults) as save:
        intervention_idx, modified_board = teleporter.pre_process(env)
        dones = CFagent.pre_process(env)
        CF_dones, cfs = None, None
        for frame in loop(env, collector, save, teleporter):
            CFagent.counterfact(env, dones, teleporter, CF_dones, cfs)
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
            CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
            CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
            CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
            CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)
def metateleport(defaults):
    # Two-level teleporter: a MetaTeleporter chooses high-level interventions that a
    # lower-level Teleporter refines before the Mover acts on the resulting board.
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter1 = Teleporter(env, _extra_dim=1, **defaults)
    teleporter2 = MetaTeleporter(env, **defaults)
    buffer1 = ReplayBuffer(**defaults)
    buffer2 = ReplayBuffer(**defaults)
    with Save(env, collector, mover, teleporter1, teleporter2, **defaults) as save:
        intervention_idx2, modified_board2 = teleporter2.pre_process(env)
        intervention_idx1, _ = teleporter1.pre_process(env)
        for frame in loop(env, collector, save, teleporter1, teleporter2):
            modified_board2 = teleporter2.interveen(env.board, intervention_idx2, modified_board2)
            modified_board1 = teleporter1.interveen(env.board, intervention_idx1, modified_board2)
            actions = mover(modified_board1)
            observations, rewards, dones, info = env.step(actions)
            modified_board1, modified_board2, modified_rewards1, modified_rewards2, modified_dones1, modified_dones2, tele_rewards, intervention_idx1, intervention_idx2 = teleporter2.metamodify(observations, rewards, dones, info, teleporter1.interventions)
            buffer1.teleporter_save_data(teleporter1.boards, modified_board2, teleporter1.interventions, modified_rewards2, modified_dones2, intervention_idx1)
            buffer2.teleporter_save_data(teleporter2.boards, observations, teleporter2.interventions, tele_rewards, dones, intervention_idx2)
            mover.learn(modified_board1, actions, modified_rewards1, modified_dones1)
            board_before, board_after, intervention, tel_rewards, tele_dones = buffer1.sample_data()
            teleporter1.learn(board_after, intervention, tel_rewards, tele_dones, board_before)
            board_before, board_after, intervention, tel_rewards, tele_dones = buffer2.sample_data()
            teleporter2.learn(board_after, intervention, tel_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards1, modified_rewards2, tele_rewards], [dones, modified_dones1, modified_dones2])
def Load_Cfagent(defaults):
    # Same training loop as CFagent, but the environment, agents, and collector are
    # restored from a saved checkpoint instead of being created from scratch.
    with Load(defaults["load_name"], num=defaults['num']) as load:
        collector, env, mover, teleporter, CFagent = load.items(Collector, Game, Mover, Teleporter, CFAgent)
        buffer = ReplayBuffer(**defaults)
        CFbuffer = CFReplayBuffer(**defaults)
        with Save(env, collector, mover, teleporter, CFagent, **defaults) as save:
            intervention_idx, modified_board = teleporter.pre_process(env)
            dones = CFagent.pre_process(env)
            CF_dones, cfs = None, None
            CFagent.CF_count = 0
            for frame in loop(env, collector, save, teleporter):
                CFagent.counterfact(env, dones, teleporter, CF_dones, cfs)
                modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
                actions = mover(modified_board)
                observations, rewards, dones, info = env.step(actions)
                modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
                buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
                mover.learn(modified_board, actions, modified_rewards, modified_dones)
                board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
                teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
                collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
                CF_dones, cfs = CFagent.counterfact_check(dones, env, **defaults)
                CFbuffer.CF_save_data(CFagent.boards, observations, CFagent.counterfactuals, rewards, dones, CF_dones)
                CFboard, CFobs, cf, CFrewards, CFdones1 = CFbuffer.sample_data()
                CFagent.learn(CFobs, cf, CFrewards, CFdones1, CFboard)
def simple(defaults):
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, **defaults)
    with Save(env, collector, mover, **defaults) as save:
        for frame in loop(env, collector, save):
            actions = mover(env.board)
            observations, rewards, dones, info = env.step(actions)
            mover.learn(observations, actions, rewards, dones)
            collector.collect([rewards], [dones])
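# Note: every runner in this module follows the same calling convention -- a single
# `defaults` dict whose entries are forwarded as keyword arguments to Game, Mover,
# Collector, and the other components. The sketch below is illustrative only: the
# actual keys and values come from the project's configuration, and `get_defaults`
# is a hypothetical stand-in for however that dict is built.
#
#     defaults = get_defaults()        # hypothetical config loader
#     simple(defaults)                 # baseline mover
#     teleport(defaults)               # teleporter-assisted variant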
def teleport(defaults):
    # Single teleporter: the Teleporter intervenes on the board before the Mover
    # acts; both learn from transitions stored in the replay buffer.
    collector = Collector(**defaults)
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    buffer = ReplayBuffer(**defaults)
    with Save(env, collector, mover, teleporter, **defaults) as save:
        intervention_idx, modified_board = teleporter.pre_process(env)
        for frame in loop(env, collector, save, teleporter):
            modified_board = teleporter.interveen(env.board, intervention_idx, modified_board)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, teleport_rewards, intervention_idx = teleporter.modify(observations, rewards, dones, info)
            buffer.teleporter_save_data(teleporter.boards, observations, teleporter.interventions, teleport_rewards, dones, intervention_idx)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            board_before, board_after, intervention, tele_rewards, tele_dones = buffer.sample_data()
            teleporter.learn(board_after, intervention, tele_rewards, tele_dones, board_before)
            collector.collect([rewards, modified_rewards, teleport_rewards], [dones, modified_dones])
def option_critic_run(defaults):
    # Option-critic baseline: a convolutional option-critic network selects options
    # and primitive actions per batched environment; a frozen "prime" copy provides
    # stable Q targets for the critic loss.
    collector = Collector(**defaults)
    env = Game(**defaults)
    buffer = ReplayBuffer(**defaults)
    batch = env.batch
    num_options = len(env.layers.layers) - 3
    option_critic = OptionCriticConv(in_features=env.board.shape[1], num_actions=4, num_options=num_options,
                                     width=env.board.shape[2], height=env.board.shape[3], temperature=0.005,
                                     eps_start=2000000, eps_min=0.2, eps_decay=2000000, eps_test=0.05, device=device)
    # Create a prime network for more stable Q values
    option_critic_prime = deepcopy(option_critic)
    optim = torch.optim.RMSprop(option_critic.parameters(), lr=0.0005)
    with Save(env, collector, **defaults) as save:
        states = option_critic.get_state(env.board)
        greedy_options = [e.item() for e in list(option_critic.greedy_option(states))]
        current_options = [0 for _ in range(batch)]
        option_terminations = [True for _ in range(batch)]
        dones = [False for _ in range(batch)]
        actions, logps, entropys = [None for _ in range(batch)], [None for _ in range(batch)], [None for _ in range(batch)]
        for frame in loop(env, collector, save):
            epsilon = option_critic.epsilon
            states = option_critic.get_state(env.board)
            # Reset per-environment option state wherever an episode just ended.
            for i, done in enumerate(dones):
                if done:
                    greedy_options[i] = option_critic.greedy_option(states[i]).item()
                    current_options[i] = 0
                    option_terminations[i] = True
                    actions[i] = None
                    logps[i] = None
                    entropys[i] = None
            for i, (option_termination, current_option, state, greedy_option) in enumerate(
                    zip(option_terminations, current_options, states, greedy_options)):
                if option_termination:
                    # Epsilon-greedy choice of a new option when the previous one terminated.
                    current_options[i] = np.random.choice(num_options) if np.random.rand() < epsilon else greedy_option
                # Sample a primitive action from the (possibly newly chosen) option's intra-option policy.
                actions[i], logps[i], entropys[i] = option_critic.get_action(state, current_options[i])
            old_obses = env.board
            next_obses, rewards, dones, _ = env.step(torch.tensor(actions))
            collector.collect([rewards], [dones])
            buffer.save_option_critic(old_obses, current_options, rewards, next_obses, dones)
            states = option_critic.get_state(next_obses)
            loss = 0
            for i, (next_obs, reward, done, state, current_option, old_obs, logp, entropy) in enumerate(
                    zip(next_obses, rewards, dones, states, current_options, old_obses, logps, entropys)):
                option_terminations[i], greedy_options[i] = option_critic.predict_option_termination(state.unsqueeze(0), current_option)
                loss += actor_loss_fn(old_obs, current_option, logp, entropy, reward, done, next_obs, option_critic, option_critic_prime)
            if frame % update_frequency == 0:
                data_batch = buffer.sample_option_critic()
                loss += critic_loss_fn(option_critic, option_critic_prime, data_batch)
            optim.zero_grad()
            loss.backward()
            optim.step()
            if frame % freeze_interval == 0:
                option_critic_prime = deepcopy(option_critic)
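# The option-critic runner above uses several names that are assumed to be defined
# at module level (they do not appear in this excerpt): `device`, `update_frequency`,
# `freeze_interval`, `actor_loss_fn`, and `critic_loss_fn`, along with `torch`,
# `np` (numpy), and `deepcopy` from the imports. A minimal sketch of the constants,
# with placeholder values rather than the project's actual settings:
#
#     update_frequency = 4      # frames between critic-loss updates (placeholder)
#     freeze_interval = 200     # frames between prime-network refreshes (placeholder)
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")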
def graphTrain(defaults):
    # Graph-based variant: a causal-graph Data structure (optionally backed by a
    # Bayesian network model) picks intervention targets for the teleporter, while
    # the Mover learns from the modified boards.
    layers: List[LayerType] = environments[defaults['level']][2]
    explorationN = defaults['K1']
    data = Data(layers, defaults['graphMode'])
    env = Game(**defaults)
    mover = Mover(env, _extra_dim=1, **defaults)
    teleporter = Teleporter(env, **defaults)
    collector = Collector(**defaults)
    model = BayesionNN(layers, depth=defaults['depth'], exploration=defaults['model_explore'], samples=defaults['samples'])
    use_model = defaults['use_model']
    with Save(env, collector, mover, data, **defaults) as save:
        convert = [env.layers.types.index(layer) for layer in layers]
        player = env.layers.types.index(LayerType.Player)
        goal = env.layers.types.index(LayerType.Goal)
        old_states = [state for state in states(env.board, convert, layers)]
        dones = tensor([0 for _ in range(env.batch)])
        rewards = tensor([0 for _ in range(env.batch)])
        eatCheese, interventions = [True] * env.batch, [None] * env.batch
        for frame in loop(env, collector, save, teleporter=teleporter):
            data.t = frame
            new_states = [state for state in states(env.board, convert, layers)]
            loss = transform(old_states, new_states, dones, rewards, data, layers, model, use_model=use_model)
            loss = transformNot(env.board, new_states, player, goal, convert, data, layers, model, use_model=use_model)
            # Re-intervene only where the abstract state changed or the previous intervention was reached.
            stateChanged = [old != new for old, new in zip(old_states, new_states)]
            shouldInterviene = [cond1 or cond2 for cond1, cond2 in zip(stateChanged, eatCheese)]
            exploration = max((explorationN - frame) / explorationN, defaults['softmax_cap'])
            if use_model:
                interventions = [
                    getInterventionsmodel(state, env.layers.types, layers, model, env, data.layers_not_in(state), frame) if should else old
                    for state, should, old in zip(new_states, shouldInterviene, interventions)
                ]
            else:
                interventions = [
                    getInterventions(env, state, data, exploration) if should else old
                    for state, should, old in zip(new_states, shouldInterviene, interventions)
                ]
            modification = env.board[tensor(interventions)].unsqueeze(1)
            teleporter.interventions = tensor([m.flatten().argmax().item() for m in list(modification)], device=device)
            modified_board = cat((env.board, modification), dim=1)
            actions = mover(modified_board)
            observations, rewards, dones, info = env.step(actions)
            modified_board, modified_rewards, modified_dones, _, _ = teleporter.modify(observations, rewards, dones, info)
            mover.learn(modified_board, actions, modified_rewards, modified_dones)
            playerPositions = [
                (t := env.layers.dict[LayerType.Player].positions[i][0])[1] * env.layers.width + t[0]
                for i in range(env.batch)
            ]
            eatCheese = [
                intervention == player_pos
                for intervention, player_pos in zip(teleporter.interventions, playerPositions)
            ]
            old_states = new_states
            collector.collect([rewards, modified_rewards], [dones, modified_dones])
            collector.collect_loss(loss)