def test_save_and_load(self):
    torch.manual_seed(42)
    config = ModelConfig()
    model = Model.create(config, logger=NullLogger())
    model_dir = "local_tests/local_model_test"
    model.save(model_dir)
    assert os.path.exists(f"{model_dir}/config.json")
    assert os.path.exists(f"{model_dir}/model.pt")

    model = Model.load(model_dir).to(gpu)
    assert next(model.parameters()).device.type == gpu.type
def _update_gen_net(self, generator_net: Model, net: Model):
    """Create a network with parameters weighted by self.tau"""
    self.tt.profile("Creating generator network")
    genparams, netparams = generator_net.state_dict(), net.state_dict()
    new_genparams = dict(genparams)
    for pname, param in netparams.items():
        new_genparams[pname].data.copy_(
            self.tau * param.data.to(gpu) + (1 - self.tau) * new_genparams[pname].data.to(gpu)
        )
    generator_net.load_state_dict(new_genparams)
    self.tt.end_profile("Creating generator network")
    return generator_net.to(gpu)
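# Illustrative sketch, not part of the original module: the loop above is a soft (Polyak)
# update, theta_gen <- tau * theta_net + (1 - tau) * theta_gen. The helper below is
# hypothetical and only demonstrates the arithmetic on two toy torch.nn.Linear modules.
def _soft_update_example(tau: float = 0.1):
    import torch
    gen, net = torch.nn.Linear(2, 2), torch.nn.Linear(2, 2)
    gen_sd, net_sd = gen.state_dict(), net.state_dict()
    for name, param in net_sd.items():
        # With tau = 0.1 the generator keeps 90 % of its old weights
        gen_sd[name].copy_(tau * param + (1 - tau) * gen_sd[name])
    gen.load_state_dict(gen_sd)
    return gen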
def rollout(self, net: Model, rollout: int, value_targets: torch.Tensor):
    """Saves statistics after a rollout has been performed, to help understand the loss development.

    :param torch.nn.Model net: The current net, used for saving values and policies of the first 12 states
    :param int rollout: The rollout number. Used to determine whether it is evaluation time => check targets
    :param torch.Tensor value_targets: Used for visualizing value change
    """
    # First time
    if self.params is None:
        self.params = net.get_params()

    # Keep track of the entropy of the 12-dimensional log-probability policy output
    entropies = [entropy(policy, axis=1) for policy in self.rollout_policy]
    # Currently: mean over all games in the entire rollout. Maybe we want it more fine-grained later.
    self.policy_entropies.append(np.mean([np.nanmean(entropy) for entropy in entropies]))
    self.rollout_policy = list()  # Reset for next rollout

    if rollout in self.evaluations:
        net.eval()

        # Calculate value targets
        targets = value_targets.cpu().numpy().reshape((-1, self.depth))
        self.avg_value_targets.append(targets.mean(axis=0))

        # Calculate model change
        model_change = torch.sqrt((net.get_params() - self.params) ** 2).mean().cpu()
        model_total_change = torch.sqrt((net.get_params() - self.orig_params) ** 2).mean().cpu()
        self.params = net.get_params()
        self.param_changes.append(float(model_change))
        self.param_total_changes.append(model_total_change)

        # In the beginning: calculate the values given to the first 12 substates
        if rollout <= self.extra_evals:
            self.first_state_values.append(
                net(self.first_states, policy=False, value=True).detach().cpu().numpy()
            )
        net.train()
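# Illustrative sketch, not part of the original module: the entropy bookkeeping above applies
# scipy.stats.entropy row-wise to softmaxed policy outputs and then averages. The helper
# below is hypothetical and shows the same computation on a single random batch.
def _policy_entropy_example():
    import numpy as np
    from scipy.stats import entropy
    rng = np.random.default_rng(0)
    logits = rng.normal(size=(8, 12))                                    # 8 states, 12 actions
    policy = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # softmax over actions
    per_state = entropy(policy, axis=1)                                  # one entropy per state
    return float(np.nanmean(per_state))                                  # mean entropy, as logged above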
def test_train(self):
    torch.manual_seed(42)
    # The standard test
    net = Model.create(ModelConfig())
    evaluator = Evaluator(2, max_time=.02, max_states=None, scrambling_depths=[2])
    train = Train(
        rollouts=2,
        batch_size=2,
        tau=0.1,
        alpha_update=.5,
        gamma=1,
        rollout_games=2,
        rollout_depth=3,
        optim_fn=torch.optim.Adam,
        agent=PolicySearch(None),
        lr=1e-6,
        evaluation_interval=1,
        evaluator=evaluator,
        update_interval=1,
        with_analysis=True,
        reward_method='schultzfix',
    )
    # Trained net and the best net found during evaluation
    net, min_net = train.train(net)
    train.plot_training("local_tests/local_train_test", "test")
    assert os.path.exists("local_tests/local_train_test/training_test.png")
def test_resnet(self):
    config = ModelConfig(architecture='res_big')
    model = Model.create(config)
    assert next(model.parameters()).device.type == gpu.type
    model.eval()
    x = torch.randn(2, 480).to(gpu)
    model(x)
    model.train()
    model(x)
def test_model(self):
    config = ModelConfig()
    model = Model.create(config)
    assert next(model.parameters()).device.type == gpu.type
    model.eval()
    x = torch.randn(2, 480).to(gpu)
    model(x)
    model.train()
    model(x)
def test_agent_optim(self, agents=['MCTS', 'AStar', 'EGVM']):
    run_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'librubiks', 'solving', 'hyper_optim.py'
    )
    location = 'local_tests/optim'
    net = Model(ModelConfig())
    net.save(location)
    for agent in agents:
        run_settings = {
            'location': location,
            'agent': agent,
            'iterations': 1,
            'eval_games': 1,
            'depth': 2,
            'save_optimal': True,
            'use_best': True,
            'optimizer': 'BO',
        }
        args = [sys.executable, run_path]
        for k, v in run_settings.items():
            args.extend([f'--{k}', str(v)])
        subprocess.check_call(args)  # Raises an error on problems in the call

        expected_files = [f'{agent}_optimization.log', f'{agent}_params.json']
        for fname in expected_files:
            assert fname in os.listdir(location)
    return location
def test_cost(self):
    net = Model.create(ModelConfig()).eval()
    games = 5
    states, _ = cube.sequence_scrambler(games, 1, True)

    agent = AStar(net, lambda_=1, expansions=2)
    agent.reset(1, 1)
    i = []
    for i, _ in enumerate(states):
        agent.G[i] = 1
    cost = agent.cost(states, i)
    assert cost.shape == (games,)
def test_agents(self):
    net = Model.create(ModelConfig())
    agents = [
        RandomSearch(),
        BFS(),
        PolicySearch(net, sample_policy=False),
        PolicySearch(net, sample_policy=True),
        ValueSearch(net),
        EGVM(net, 0.1, 4, 12),
    ]
    for s in agents:
        self._test_agents(s)
def test_expansion(self):
    net = Model.create(ModelConfig()).eval()
    init_state, _, _ = cube.scramble(3)
    agent = AStar(net, lambda_=0.1, expansions=5)
    agent.search(init_state, time_limit=1)
    init_idx = agent.indices[init_state.tostring()]
    assert init_idx == 1
    assert agent.G[init_idx] == 0
    for action in cube.action_space:
        substate = cube.rotate(init_state, *action)
        idx = agent.indices[substate.tostring()]
        assert agent.G[idx] == 1
        assert agent.parents[idx] == init_idx
def test_agent(self):
    test_params = {
        (0, 10),
        (0.5, 2),
        (1, 1),
    }
    net = Model.create(ModelConfig()).eval()
    for params in test_params:
        agent = AStar(net, *params)
        self._can_win_all_easy_games(agent)
        agent.reset("Tue", "Herlau")
        assert not len(agent.indices)
        assert not len(agent.open_queue)
def _mcts_test(self, state: np.ndarray, search_graph: bool):
    agent = MCTS(Model.create(ModelConfig()), c=1, search_graph=search_graph)
    solved = agent.search(state, .2)

    # Indices
    assert agent.indices[state.tostring()] == 1
    for s, i in agent.indices.items():
        assert agent.states[i].tostring() == s
    assert sorted(agent.indices.values())[0] == 1
    assert np.all(np.diff(sorted(agent.indices.values())) == 1)

    used_idcs = np.array(list(agent.indices.values()))

    # States
    assert np.all(agent.states[1] == state)
    for i, s in enumerate(agent.states):
        if i not in used_idcs:
            continue
        assert s.tostring() in agent.indices
        assert agent.indices[s.tostring()] == i

    # Neighbors
    if not search_graph:
        for i, neighs in enumerate(agent.neighbors):
            if i not in used_idcs:
                continue
            state = agent.states[i]
            for j, neighbor_index in enumerate(neighs):
                assert neighbor_index == 0 or neighbor_index in agent.indices.values()
                if neighbor_index == 0:
                    continue
                substate = cube.rotate(state, *cube.action_space[j])
                assert np.all(agent.states[neighbor_index] == substate)

    # Policy and value
    with torch.no_grad():
        p, v = agent.net(cube.as_oh(agent.states[used_idcs]))
    p, v = p.softmax(dim=1).cpu().numpy(), v.squeeze().cpu().numpy()
    assert np.all(np.isclose(agent.P[used_idcs], p, atol=1e-5))
    assert np.all(np.isclose(agent.V[used_idcs], v, atol=1e-5))

    # Leaves
    if not search_graph:
        assert np.all(agent.neighbors.all(axis=1) != agent.leaves)

    # W
    assert agent.W[used_idcs].all()

    return agent, solved
import matplotlib.pyplot as plt
import numpy as np
import torch

from librubiks import gpu, no_grad
from librubiks import cube
from librubiks.model import Model
from librubiks.utils import TickTock, Logger

tt = TickTock()
log = Logger("data/local_analyses/net.log", "Analyzing MCTS")
net = Model.load("data/local_method_comparison/asgerfix").eval().to(gpu)

def _get_adi_ff_slices(b, n):
    slice_size = n // b + 1
    # The final slice may overflow, but this is simply ignored when indexing
    slices = [slice(i * slice_size, (i + 1) * slice_size) for i in range(b)]
    return slices

def _ff(oh_states, value=True, policy=True):
    batches = 1
    while True:
        try:
            value_parts = [
                net(oh_states[slice_], policy=policy, value=value).squeeze()
                for slice_ in _get_adi_ff_slices(batches, len(oh_states))
            ]
            values = torch.cat(value_parts).cpu()
            break
        except RuntimeError as e:
            # Usually caused by running out of VRAM. If not, the error is re-raised;
            # otherwise the batch size is reduced by doubling the number of batches
            if "alloc" not in str(e):
                raise e
            batches *= 2
    return values
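# Illustrative check, not part of the original script: with b=3 batches and n=10 states,
# the last slice from _get_adi_ff_slices overshoots, which indexing silently truncates.
def _slices_example():
    parts = _get_adi_ff_slices(3, 10)
    assert parts == [slice(0, 4), slice(4, 8), slice(8, 12)]
    assert len(np.arange(10)[parts[-1]]) == 2  # overflow is ignored when indexing
    return parts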
def from_saved(cls, loc: str, use_best: bool, sample_policy=False):
    net = Model.load(loc, load_best=use_best)
    net.to(gpu)
    return cls(net, sample_policy)
def agent_optimize():
    """
    Main way to run optimization. Hard-coded to run optimization at 1 s per game, but other behaviour can be set
    with CLI arguments, seen by running `python librubiks/solving/hyper_optim.py --help`.
    Does not support config arguments.

    NB: The path here is different from the one in runeval and runtrain: It needs to be the folder containing
    model.pt! It doesn't work with the parent folder.

    Can work with runeval through
    ```
    python librubiks/solving/hyper_optim.py --location example/net1/
    python runeval.py --location example/ --optimized_params True
    ```
    """
    set_seeds()

    # Lots of overhead just for default-argument niceness: the latest model is the latest from runeval
    from runeval import train_folders

    model_path = ''
    if train_folders:
        for folder in [train_folders[-1]] + glob(f"{train_folders[-1]}/*/"):
            if os.path.isfile(os.path.join(folder, 'model.pt')):
                model_path = os.path.join(folder)
                break

    parser = argparse.ArgumentParser(description='Optimize Monte Carlo Tree Search for one model')
    parser.add_argument('--location', help='Folder which includes model.pt. Results will also be saved here',
        type=str, default=model_path)
    parser.add_argument('--iterations', help='Number of iterations of Bayesian Optimization',
        type=int, default=125)
    parser.add_argument('--agent', help='Name of agent corresponding to agent class in librubiks.solving.agents',
        type=str, default='AStar', choices=['AStar', 'MCTS', 'EGVM'])
    parser.add_argument('--depth', help='Single number corresponding to the depth at which to test. If 0: run deep evaluation',
        type=int, default=0)
    parser.add_argument('--eval_games', help='Number of games to evaluate at depth',
        type=int, default=100)
    parser.add_argument('--save_optimal', help='If True, saves a JSON of optimal hyperparameters usable for runeval',
        type=literal_eval, default=True, choices=[True, False])
    parser.add_argument('--use_best', help="Set to True to use model-best.pt instead of model.pt.",
        type=literal_eval, default=True, choices=[True, False])
    parser.add_argument('--optim_lengths', help="Set to True to optimize against solve percentage / solution length. Else, simply use solve percentage",
        type=literal_eval, default=True, choices=[True, False])
    parser.add_argument('--optimizer', help="Either BO or grid",
        type=str, default="grid", choices=("grid", "BO"))

    args = parser.parse_args()

    agent_name = args.agent
    if agent_name == 'MCTS':
        params = {
            'c': (0.1, 100),
        }
        def prepper(params):
            return params

        persistent_params = {
            'net': Model.load(args.location, load_best=args.use_best),
            'search_graph': True,
        }
    elif agent_name == 'AStar':
        params = {
            'lambda_': (0, 0.4),
            'expansions': (1, 1000),
        }
        def prepper(params):
            params['expansions'] = int(params['expansions'])
            return params

        persistent_params = {
            'net': Model.load(args.location, load_best=args.use_best),
        }
    elif agent_name == 'EGVM':
        params = {
            'epsilon': (0, 0.5),
            'workers': (1, 500),
            'depth': (1, 250),
        }
        def prepper(params):
            params['workers'] = int(params['workers'])
            params['depth'] = int(params['depth'])
            return params

        persistent_params = {
            'net': Model.load(args.location, load_best=args.use_best),
        }
    else:
        raise NameError(f"{agent_name} does not correspond to a known agent, please pick either AStar, MCTS or EGVM")

    logger = Logger(os.path.join(args.location, f'{agent_name}_optimization.log'), 'Optimization')
    logger.log(f"{agent_name} optimization.\nUsing network from {model_path}.")
    logger.log(f"Received arguments: {vars(args)}")

    agent = getattr(agents, agent_name)

    evaluator = Evaluator(n_games=args.eval_games, max_time=5,
        scrambling_depths=range(0) if args.depth == 0 else [args.depth])
    assert args.optimizer in ["BO", "grid"], f"Optimizer should be 'BO' or 'grid', not '{args.optimizer}'"
    if args.optimizer == "BO":
        optimizer = BayesianOptimizer(target_function=None, parameters=params, logger=logger)
    else:
        optimizer = GridSearch(target_function=None, parameters=params, logger=logger)
    optimizer.objective_from_evaluator(evaluator, agent, persistent_params, param_prepper=prepper,
        optim_lengths=args.optim_lengths)
    optimizer.optimize(args.iterations)

    if args.save_optimal:
        with open(os.path.join(args.location, f'{agent_name}_params.json'), 'w') as outfile:
            json.dump(prepper(copy(optimizer.optimal)), outfile)
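# Illustrative sketch, not part of the original script: the optimizers above propose
# continuous values inside each (low, high) range, and `prepper` casts the discrete
# hyperparameters to ints before the agent is constructed. The helper below is
# hypothetical and mirrors the AStar prepper on a sample suggestion.
def _prepper_example():
    suggestion = {'lambda_': 0.23, 'expansions': 412.7}
    suggestion['expansions'] = int(suggestion['expansions'])
    return suggestion  # {'lambda_': 0.23, 'expansions': 412}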
def execute(self):
    # Sets representation
    self.logger.section(
        f"Starting job:\n{self.name} with {'20x24' if get_is2024() else '6x8x6'} representation\nLocation {self.location}\nCommit: {get_commit()}"
    )

    train = Train(
        self.rollouts,
        batch_size=self.batch_size,
        rollout_games=self.rollout_games,
        rollout_depth=self.rollout_depth,
        optim_fn=self.optim_fn,
        alpha_update=self.alpha_update,
        lr=self.lr,
        gamma=self.gamma,
        tau=self.tau,
        reward_method=self.reward_method,
        update_interval=self.update_interval,
        agent=self.agent,
        logger=self.logger,
        evaluation_interval=self.evaluation_interval,
        evaluator=self.evaluator,
        with_analysis=self.analysis,
    )
    self.logger(
        f"Rough upper bound on total evaluation time during training: "
        f"{len(train.evaluation_rollouts) * self.evaluator.approximate_time() / 60:.2f} min"
    )

    net = Model.create(self.model_cfg, self.logger)
    net, min_net = train.train(net)
    net.save(self.location)
    if self.evaluation_interval:
        min_net.save(self.location, True)

    train.plot_training(self.location, name=self.name)
    analysispath = os.path.join(self.location, "analysis")
    datapath = os.path.join(self.location, "train-data")
    os.mkdir(datapath)
    os.mkdir(analysispath)

    if self.analysis:
        train.analysis.plot_substate_distributions(analysispath)
        train.analysis.plot_value_targets(analysispath)
        train.analysis.plot_net_changes(analysispath)
        train.analysis.visualize_first_states(analysispath)
        np.save(f"{datapath}/avg_target_values.npy", train.analysis.avg_value_targets)
        np.save(f"{datapath}/policy_entropies.npy", train.analysis.policy_entropies)
        np.save(f"{datapath}/substate_val_stds.npy", train.analysis.substate_val_stds)

    np.save(f"{datapath}/rollouts.npy", train.train_rollouts)
    np.save(f"{datapath}/policy_losses.npy", train.policy_losses)
    np.save(f"{datapath}/value_losses.npy", train.value_losses)
    np.save(f"{datapath}/losses.npy", train.train_losses)
    np.save(f"{datapath}/evaluation_rollouts.npy", train.evaluation_rollouts)
    np.save(f"{datapath}/evaluations.npy", train.sol_percents)

    return train.train_rollouts, train.train_losses
def train(self, net: Model) -> (Model, Model):
    """Training loop: generates data, optimizes parameters, evaluates (sometimes) and repeats.

    Trains `net` for `self.rollouts` rollouts, each consisting of `self.rollout_games` games scrambled to a
    depth of `self.rollout_depth`. The network is evaluated for each rollout number in `self.evaluations`
    according to `self.evaluator`. Stores multiple performance and training results.

    :param torch.nn.Model net: The network to be trained. Must accept input consistent with cube.get_oh_size()
    :return: The network after all evaluations and the network with the best evaluation score (win fraction)
    :rtype: (torch.nn.Model, torch.nn.Model)
    """
    self.tt.reset()
    self.tt.tick()
    self.states_per_rollout = self.rollout_depth * self.rollout_games
    self.log(f"Beginning training. Optimization is performed in batches of {self.batch_size}")
    self.log("\n".join([
        f"Rollouts: {self.rollouts}",
        f"Each consisting of {self.rollout_games} games with a depth of {self.rollout_depth}",
        f"Evaluations: {len(self.evaluation_rollouts)}",
    ]))
    best_solve = 0
    best_net = net.clone()
    self.agent.net = net
    if self.with_analysis:
        self.analysis.orig_params = net.get_params()

    generator_net = net.clone()

    alpha = 1 if self.alpha_update == 1 else 0
    optimizer = self.optim(net.parameters(), lr=self.lr)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, self.gamma)
    self.policy_losses = np.zeros(self.rollouts)
    self.value_losses = np.zeros(self.rollouts)
    self.train_losses = np.empty(self.rollouts)
    self.sol_percents = list()

    for rollout in range(self.rollouts):
        reset_cuda()

        generator_net = self._update_gen_net(generator_net, net) if self.tau != 1 else net

        self.tt.profile("ADI training data")
        training_data, policy_targets, value_targets, loss_weights = self.ADI_traindata(generator_net, alpha)
        self.tt.profile("To cuda")
        training_data = training_data.to(gpu)
        policy_targets = policy_targets.to(gpu)
        value_targets = value_targets.to(gpu)
        loss_weights = loss_weights.to(gpu)
        self.tt.end_profile("To cuda")
        self.tt.end_profile("ADI training data")

        reset_cuda()

        self.tt.profile("Training loop")
        net.train()
        batches = self._get_batches(self.states_per_rollout, self.batch_size)
        for i, batch in enumerate(batches):
            optimizer.zero_grad()
            policy_pred, value_pred = net(training_data[batch], policy=True, value=True)

            # Use loss on both policy and value
            policy_loss = self.policy_criterion(policy_pred, policy_targets[batch]) * loss_weights[batch]
            value_loss = self.value_criterion(value_pred.squeeze(), value_targets[batch]) * loss_weights[batch]
            loss = torch.mean(policy_loss + value_loss)
            loss.backward()
            optimizer.step()

            self.policy_losses[rollout] += policy_loss.detach().cpu().numpy().mean() / len(batches)
            self.value_losses[rollout] += value_loss.detach().cpu().numpy().mean() / len(batches)

            if self.with_analysis:
                # Save policy output to compute entropy
                with torch.no_grad():
                    self.analysis.rollout_policy.append(
                        torch.nn.functional.softmax(policy_pred.detach(), dim=0).cpu().numpy()
                    )

        self.train_losses[rollout] = self.policy_losses[rollout] + self.value_losses[rollout]
        self.tt.end_profile("Training loop")

        # Update learning rate and alpha
        if rollout and self.update_interval and rollout % self.update_interval == 0:
            if self.gamma != 1:
                lr_scheduler.step()
                lr = optimizer.param_groups[0]["lr"]
                self.log(f"Updated learning rate from {lr/self.gamma:.2e} to {lr:.2e}")
            if (alpha + self.alpha_update <= 1 or np.isclose(alpha + self.alpha_update, 1)) and self.alpha_update:
                alpha += self.alpha_update
                self.log(f"Updated alpha from {alpha-self.alpha_update:.2f} to {alpha:.2f}")
            elif alpha < 1 and alpha + self.alpha_update > 1 and self.alpha_update:
                self.log(f"Updated alpha from {alpha:.2f} to 1")
                alpha = 1

        if self.log.is_verbose() or rollout in (np.linspace(0, 1, 20) * self.rollouts).astype(int):
            self.log(f"Rollout {rollout} completed with mean loss {self.train_losses[rollout]}")

        if self.with_analysis:
            self.tt.profile("Analysis of rollout")
            self.analysis.rollout(net, rollout, value_targets)
            self.tt.end_profile("Analysis of rollout")

        if rollout in self.evaluation_rollouts:
            net.eval()

            self.agent.net = net
            self.tt.profile(f"Evaluating using agent {self.agent}")
            with unverbose:
                eval_results, _, _ = self.evaluator.eval(self.agent)
            eval_reward = (eval_results != -1).mean()
            self.sol_percents.append(eval_reward)
            self.tt.end_profile(f"Evaluating using agent {self.agent}")

            if eval_reward > best_solve:
                best_solve = eval_reward
                best_net = net.clone()
                self.log(f"Updated best net with solve rate {eval_reward*100:.2f} % at depth {self.evaluator.scrambling_depths}")

    self.log.section("Finished training")
    if len(self.evaluation_rollouts):
        self.log(f"Best net solves {best_solve*100:.2f} % of games at depth {self.evaluator.scrambling_depths}")
    self.log.verbose("Training time distribution")
    self.log.verbose(self.tt)
    total_time = self.tt.tock()
    eval_time = self.tt.profiles[f'Evaluating using agent {self.agent}'].sum() if len(self.evaluation_rollouts) else 0
    train_time = self.tt.profiles["Training loop"].sum()
    adi_time = self.tt.profiles["ADI training data"].sum()
    nstates = self.rollouts * self.rollout_games * self.rollout_depth * cube.action_dim
    states_per_sec = int(nstates / (adi_time + train_time))
    self.log("\n".join([
        f"Total running time: {self.tt.stringify_time(total_time, TimeUnit.second)}",
        f"- Training data for ADI: {self.tt.stringify_time(adi_time, TimeUnit.second)} or {adi_time/total_time*100:.2f} %",
        f"- Training time: {self.tt.stringify_time(train_time, TimeUnit.second)} or {train_time/total_time*100:.2f} %",
        f"- Evaluation time: {self.tt.stringify_time(eval_time, TimeUnit.second)} or {eval_time/total_time*100:.2f} %",
        f"States witnessed incl. substates: {TickTock.thousand_seps(nstates)}",
        f"- Per training second: {TickTock.thousand_seps(states_per_sec)}",
    ]))
    return net, best_net
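# Illustrative sketch, not part of the original class: the per-batch loss above combines a
# policy loss and a value loss, each scaled by per-state loss weights before averaging.
# The criteria used here (cross-entropy and MSE) are assumptions for the sake of the
# example; the real policy_criterion and value_criterion are set elsewhere in Train.
def _weighted_joint_loss_example():
    import torch
    policy_pred = torch.randn(4, 12)             # logits for 4 states, 12 actions
    value_pred = torch.randn(4, 1)
    policy_targets = torch.randint(0, 12, (4,))  # target action indices
    value_targets = torch.randn(4)
    loss_weights = torch.ones(4)                 # e.g. depth-dependent weighting from ADI

    policy_loss = torch.nn.functional.cross_entropy(policy_pred, policy_targets, reduction="none") * loss_weights
    value_loss = torch.nn.functional.mse_loss(value_pred.squeeze(), value_targets, reduction="none") * loss_weights
    return torch.mean(policy_loss + value_loss)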
def from_saved(cls, loc: str, use_best: bool, lambda_: float, expansions: int) -> DeepAgent:
    net = Model.load(loc, load_best=use_best).to(gpu)
    return cls(net, lambda_=lambda_, expansions=expansions)
def from_saved(cls, loc: str, use_best: bool, c: float, search_graph: bool):
    net = Model.load(loc, load_best=use_best)
    net.to(gpu)
    return cls(net, c=c, search_graph=search_graph)
def from_saved(cls, loc: str, use_best: bool, epsilon: float, workers: int, depth: int):
    net = Model.load(loc, load_best=use_best).to(gpu)
    return cls(net, epsilon=epsilon, workers=workers, depth=depth)
def from_saved(cls, loc: str, use_best: bool):
    net = Model.load(loc, load_best=use_best)
    net.to(gpu)
    return cls(net)
import matplotlib.pyplot as plt
import numpy as np

from librubiks import gpu, cube, rc_params
from librubiks.model import Model
from librubiks.solving.agents import MCTS
from librubiks.utils import set_seeds, Logger, TickTock, TimeUnit

np.set_printoptions(precision=4, threshold=np.inf)
plt.rcParams.update(rc_params)

tt = TickTock()
log = Logger("data/local_analyses/mcts.log", "Analyzing MCTS")
net = Model.load("local_net").eval().to(gpu)

def solve(depth: int, c: float, time_limit: float):
    state, f, d = cube.scramble(depth, True)
    searcher = MCTS(net, c=c, search_graph=False)
    is_solved = searcher.search(state, time_limit)
    assert is_solved == (cube.get_solved().tostring() in searcher.indices)
    return is_solved, len(searcher.indices)

def analyze_var(var: str, values: np.ndarray, other_vars: dict):
    x = values
    y = []
    tree_sizes = []
    log.section(
        f"Optimizing {var}\nExpected runtime: {len(x)*time_limit*n:.2f} s\nGames per evaluation: {n}"
    )
def test_init(self):
    for init in ['glorot', 'he', 0, 1.123123123e-3]:
        cf = ModelConfig(init=init)
        model = Model.create(cf)
        x = torch.randn(2, 480).to(gpu)
        model(x)