def main(args):
    """Solve every ``.txt`` problem in a directory and report statistics.

    For each problem the solver is timed, the solution is printed and the
    value ``T = 1 - iterations / n!`` is recorded. After all problems are
    processed the mean ``T`` is printed and, if requested, the per-problem
    results are dumped as JSON.

    Args:
        args: Namespace with ``problems_dir`` (directory scanned for ``.txt``
            files), ``branch_strategy`` (forwarded to
            ``solve_transportation``) and ``output_log`` (path of the JSON log;
            falsy to skip writing it).

    Raises:
        AssertionError: If a computed value differs from the problem's
            ``opt_value``.
    """
    # Sorted so that the processing order (and the log) is deterministic.
    problems = [
        load_problem(os.path.join(args.problems_dir, file))
        for file in sorted(os.listdir(args.problems_dir))
        if file.endswith('.txt')
    ]

    results = []
    t_values = []
    for problem in problems:
        print('-' * 100)
        print('Problem: ' + problem.name)

        start = time.time()
        value, order, iters = solve_transportation(problem,
                                                   args.branch_strategy)
        end = time.time()

        # Fraction of the n! candidate orderings that did not need an iteration.
        t_values.append(1. - float(iters) / math.factorial(problem.n))

        print('Found solution: {}'.format(order))
        print('Criterion value: {}'.format(value))
        print('T={}'.format(t_values[-1]))
        print('Iterations performed {}'.format(iters))
        print('Time elapsed {}'.format(end - start))

        results.append({
            'problem_name': problem.name,
            'opt_value': value,
            'solution': order,
            'iterations': iters,
            'T': t_values[-1]
        })
        assert value == problem.opt_value

    if t_values:
        t_avg = sum(t_values) / len(t_values)
        print('T_avg={}'.format(t_avg))
    else:
        # Without this guard an empty directory crashed with ZeroDivisionError.
        print('No problems found in {}'.format(args.problems_dir))

    if args.output_log:
        with open(args.output_log, 'w') as f:
            json.dump(results, f)
def _run_rl(opts):
    """Set up and run reinforcement-learning training (or evaluation only).

    Builds the model, baseline, optimizer and learning-rate scheduler from
    ``opts``, optionally restores them from a checkpoint, and then either
    validates once (``opts.eval_only``) or trains for ``opts.n_epochs`` epochs.

    Args:
        opts: Run options namespace. It is mutated: ``device`` and
            ``val_size`` are set, and ``epoch_start`` is set when resuming.

    Raises:
        AssertionError: If both ``load_path`` and ``resume`` are given, or if
            the model, encoder or baseline name is unknown.
    """
    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found.
    # This must happen before opts.device is attached below.
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    # Report the requested name; the lookup result is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    encoder_class = {
        'gat': GraphAttentionEncoder,
        'gcn': GCNEncoder,
        'mlp': MLPEncoder
    }.get(opts.encoder, None)
    assert encoder_class is not None, "Unknown encoder: {}".format(
        opts.encoder)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        encoder_class,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Compute number of network parameters
    print(model)
    nb_param = 0
    for param in model.parameters():
        nb_param += np.prod(list(param.data.size()))
    print('Number of parameters: ', nb_param)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 encoder_class, 2, opts.embedding_dim, opts.hidden_dim,
                 opts.n_encode_layers, opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer; the baseline gets its own parameter group (and
    # learning rate) only when it has learnable parameters.
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # The checkpoint was loaded on CPU; move tensor state to the run device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset)
    opts.val_size = val_dataset.size

    if opts.resume:
        # The epoch number is parsed from a checkpoint name like "epoch-12.pt".
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start,
                           opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, tb_logger, opts)
def _run_sl(opts):
    """Set up and run supervised-learning training (or evaluation only).

    Builds the model, optimizer and learning-rate scheduler from ``opts``,
    optionally restores them from a checkpoint, loads the training and
    validation datasets from file, and then either validates once
    (``opts.eval_only``) or trains for ``opts.n_epochs`` epochs.

    Args:
        opts: Run options namespace. It is mutated: ``device``,
            ``epoch_size`` and ``val_size`` are set, and ``epoch_start`` is
            set when resuming.

    Raises:
        AssertionError: If the problem is not ``'tspsl'``, if both
            ``load_path`` and ``resume`` are given, or if the model or encoder
            name is unknown.
    """
    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found.
    # This must happen before opts.device is attached below.
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)
    assert opts.problem == 'tspsl', "Only TSP is supported for supervised learning"

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {'attention': AttentionModel}.get(opts.model, None)
    # Report the requested name; the lookup result is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    encoder_class = {
        'gat': GraphAttentionEncoder,
        'gcn': GCNEncoder,
        'mlp': MLPEncoder
    }.get(opts.encoder, None)
    assert encoder_class is not None, "Unknown encoder: {}".format(
        opts.encoder)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        encoder_class,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size,
                        use_cuda=opts.use_cuda).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Compute number of network parameters
    print(model)
    nb_param = 0
    for param in model.parameters():
        nb_param += np.prod(list(param.data.size()))
    print('Number of parameters: ', nb_param)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize optimizer
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }])

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # The checkpoint was loaded on CPU; move tensor state to the run device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    train_dataset = problem.make_dataset(size=opts.graph_size,
                                         filename=opts.train_dataset)
    opts.epoch_size = train_dataset.size
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       filename=opts.val_dataset)
    opts.val_size = val_dataset.size

    if opts.resume:
        # The epoch number is parsed from a checkpoint name like "epoch-12.pt".
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start,
                           opts.epoch_start + opts.n_epochs):
            train_epoch_sl(model, optimizer, lr_scheduler, epoch,
                           train_dataset, val_dataset, problem, tb_logger,
                           opts)
def run(opts):
    """Set up and run training (or evaluation only) and log the run summary.

    Builds the model, baseline, optimizer and learning-rate scheduler from
    ``opts``, optionally restores them from a checkpoint, and then either
    validates once (``opts.eval_only``) or trains for ``opts.n_epochs`` epochs.
    After training, one summary line is appended to ``experiments.log``.

    Args:
        opts: Run options namespace. It is mutated: ``device`` is set, and
            ``epoch_start`` is set when resuming.

    Raises:
        AssertionError: If both ``load_path`` and ``resume`` are given, or if
            the model or baseline name is unknown or unsupported for the
            problem.
        KeyError: If ``opts.baseline == 'critic_lp'`` and ``opts.awe_steps``
            is outside 2..8.
    """
    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found.
    # This must happen before opts.device is attached below.
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    # Report the requested name; the lookup result is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size,
                        steps=opts.awe_steps,
                        graph_size=opts.graph_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'constant':
        baseline = ConstantBaseline()
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    elif opts.baseline == 'critic_lp':
        assert problem.NAME == 'lp'
        # Critic input size per number of steps; only 2..8 steps are supported.
        dim_vocab = {2: 2, 3: 5, 4: 15, 5: 52, 6: 203, 7: 877, 8: 4140}
        baseline = CriticBaseline(
            (CriticNetworkLP(dim_vocab[opts.awe_steps], opts.embedding_dim,
                             opts.hidden_dim, opts.n_encode_layers,
                             opts.normalization)).to(opts.device))
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer; the baseline gets its own parameter group (and
    # learning rate) only when it has learnable parameters.
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # The checkpoint was loaded on CPU; move tensor state to the run device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution,
                                       size=opts.graph_size,
                                       degree=opts.degree,
                                       steps=opts.awe_steps,
                                       awe_samples=opts.awe_samples)

    if opts.resume:
        # The epoch number is parsed from a checkpoint name like "epoch-12.pt".
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        # Mutable record handed to train_epoch; its 'avg_reward' and
        # 'best_epoch' entries are reported in the summary line below.
        extra = {'updates': 0, 'avg_reward': 10**8, "best_epoch": -1}
        start = time.time()
        for epoch in range(opts.epoch_start,
                           opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                        val_dataset, problem, tb_logger, opts, extra)
        finish = time.time()

        # Append "<last two path parts of train_dataset> <avg_reward>
        # <best_epoch> <seconds>" to the shared experiment log.
        with open("experiments.log", "a+") as f:
            f.write("{} {:.4f} {} {:.2f}\n".format(
                '-'.join(opts.train_dataset.split('/')[-2:]),
                extra["avg_reward"], extra["best_epoch"], finish - start))
        print("Took {:.2f} sec for {} epochs".format(finish - start,
                                                     opts.n_epochs))
def run(opts):
    """Set up and run distributed training (or evaluation only).

    Seeds the random generators per rank, optionally initialises wandb on
    rank 0, builds the model (wrapped in ``DDP`` when CUDA is used), baseline,
    optimizer, optional gradient scaler and learning-rate scheduler, optionally
    restores state from a checkpoint, and then either validates once
    (``opts.eval_only``) or trains for ``opts.n_epochs`` epochs.

    Args:
        opts: Run options namespace. It is mutated: ``device`` is set, and
            ``epoch_start`` is set when resuming.

    Raises:
        AssertionError: If both ``load_path`` and ``resume`` are given, or if
            the model or baseline name is unknown or unsupported for the
            problem.
    """
    rank = opts.local_rank if torch.cuda.device_count() > 1 else 0

    # Set the random seed (offset by rank so each process differs)
    torch.manual_seed(opts.seed + rank)
    random.seed(opts.seed + rank)
    np.random.seed(opts.seed + rank)

    # Only rank 0 creates the directory; exist_ok avoids the check-then-create
    # race of a separate os.path.exists test.
    if rank == 0:
        os.makedirs(opts.save_dir, exist_ok=True)

    # Optionally configure wandb
    if not opts.no_wandb and rank == 0:
        # SECURITY NOTE(review): an API key is hard-coded below and therefore
        # lives in version control; it should be rotated and removed. Until
        # then the WANDB_API_KEY environment variable, when set, takes
        # precedence over the embedded value.
        wandb_key = os.environ.get(
            'WANDB_API_KEY', '31ce01e4120061694da54a54ab0dafbee1262420')
        wandb.login('never', wandb_key)
        wandb.init(dir=opts.save_dir,
                   config=opts,
                   project='large_scale_tsp',
                   name=opts.run_name,
                   sync_tensorboard=True,
                   save_code=True)

    # Set the device
    if opts.use_cuda:
        torch.cuda.set_device(rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        opts.device = torch.device("cuda", rank)
    else:
        opts.device = torch.device("cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        if rank == 0:
            print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    # Report the requested name; the lookup result is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    model: torch.nn.Module = model_class(
        opts.embedding_dim,
        opts.hidden_dim,
        problem,
        attention_type=opts.attention_type,
        n_encode_layers=opts.n_encode_layers,
        n_heads=opts.n_heads,
        feed_forward_dim=opts.feed_forward_dim,
        encoding_knn_size=opts.encoding_knn_size,
        decoding_knn_size=opts.decoding_knn_size,
        mask_inner=True,
        mask_logits=True,
        normalization=opts.normalization,
        tanh_clipping=opts.tanh_clipping,
        checkpoint_encoder=opts.checkpoint_encoder,
        shrink_size=opts.shrink_size).to(opts.device)

    if opts.init_normalization_parameters:
        for m in model.modules():
            if isinstance(m, Normalization):
                m.init_parameters()

    if opts.use_cuda:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(
            opts.device)
        model = DDP(model, device_ids=[rank])

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer; the baseline gets its own parameter group (and
    # learning rate) only when it has learnable parameters.
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))
    # A gradient scaler is only created for 16-bit precision runs.
    scaler = torch.cuda.amp.GradScaler() if opts.precision == 16 else None

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # The checkpoint was loaded on CPU; move tensor state to the run device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        # The epoch number is parsed from a checkpoint name like "epoch-12.pt".
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        if rank == 0:
            print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start,
                           opts.epoch_start + opts.n_epochs):
            train_epoch(model, optimizer, scaler, baseline, lr_scheduler,
                        epoch, val_dataset, problem, opts)
def run(opts):
    """Set up and run training with wall-clock based snapshots.

    Builds the model, baseline, optimizer and learning-rate scheduler from
    ``opts``, optionally restores them from a checkpoint, and then either
    validates once (``opts.eval_only``) or trains for ``opts.n_epochs`` epochs.
    After each epoch, if the elapsed time has passed one or more of the hour
    marks in ``opts.save_hrs``, those marks are consumed and a snapshot
    (training history, state dicts and the pickled model) is written under
    ``../models/att``.

    Args:
        opts: Run options namespace. It is mutated: ``save_hrs`` is sorted and
            consumed, ``device`` is set, and ``epoch_start`` is set when
            resuming.

    Raises:
        AssertionError: If both ``load_path`` and ``resume`` are given, or if
            the model or baseline name is unknown or unsupported for the
            problem.
    """
    # start time
    start_time = time()
    train_run = []
    opts.save_hrs.sort()
    run_name = opts.run_name

    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found.
    # This must happen before opts.device is attached below.
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    # Report the requested name; the lookup result is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer; the baseline gets its own parameter group (and
    # learning rate) only when it has learnable parameters.
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # The checkpoint was loaded on CPU; move tensor state to the run device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        # The epoch number is parsed from a checkpoint name like "epoch-12.pt".
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    # Pickle the model object as it is before any training epoch runs.
    torch.save(model, os.path.join('.', 'empty.pt'))

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        for epoch in range(opts.epoch_start,
                           opts.epoch_start + opts.n_epochs):
            avg_time = train_epoch(model, optimizer, baseline, lr_scheduler,
                                   epoch, val_dataset, problem, tb_logger,
                                   opts, start_time)
            train_run.append(avg_time)

            # Collect every hour mark that has been passed BEFORE touching
            # opts.save_hrs: removing items from a list while iterating over
            # it skips the element after each removal.
            elapsed = time() - start_time
            due = [hr for hr in opts.save_hrs if elapsed > hr * 3600]
            if not due:
                continue
            opts.save_hrs[:] = [hr for hr in opts.save_hrs if hr not in due]

            # One snapshot covers all marks passed during this epoch; the
            # file names use the elapsed time rounded to whole hours.
            print('Saving model and state...')
            hr_time = int(round((time() - start_time) / 3600))
            with open(
                    '../models/att/hist_{}_{}hr.pickle'.format(
                        run_name, hr_time), 'wb') as handle:
                pickle.dump(train_run,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
            torch.save(
                {
                    'model': get_inner_model(model).state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'rng_state': torch.get_rng_state(),
                    'cuda_rng_state': torch.cuda.get_rng_state_all(),
                    'baseline': baseline.state_dict()
                },
                os.path.join(
                    '../models/att',
                    '{}_{}hr-model-att-only.pt'.format(run_name, hr_time)))
            torch.save(
                model,
                os.path.join('../models/att',
                             '{}_{}hr-model.pt'.format(run_name, hr_time)))
#get_ipython().run_line_magic('run', 'options --graph_size 100 --eval_only --seed 1234 --steps 1000 --load_path outputs/tsp_100/run/epoch-199.pt') opts = get_options() opts.graph_size = 100 ########################################################change opts.eval_only = True opts.seed = 1234 opts.steps = 1000 opts.load_path = 'outputs/tsp_100/run/epoch-199.pt' # In[3]: # Set the random seed torch.manual_seed(opts.seed) # Figure out what's the problem problem = load_problem(opts.problem) val_dataset = problem.make_dataset(size=opts.graph_size, num_samples=opts.val_size, filename=opts.val_dataset) torch.save(val_dataset, 'myval_test100.pt') val_dataset = torch.load('myval_test100.pt') val_dataset = val_dataset[0:200] # In[4]: # Load data from load_path load_data = {} assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given" load_path = opts.load_path if opts.load_path is not None else opts.resume
def DecimalToAnyBaseArray(decimal, base):
    """Return the list filled by ``__DecimalToAnyBaseArrayRecur__``, reversed.

    A fresh list is handed to the recursive helper together with ``decimal``
    and ``base``; the helper populates it in place and a reversed copy is
    returned.
    NOTE(review): presumably the result is the digit sequence of ``decimal``
    in ``base`` — confirm against the helper.
    """
    digits = []
    __DecimalToAnyBaseArrayRecur__(digits, decimal, base)
    return list(reversed(digits))


# Options of a previous run, read back from its saved args.json.
file_path = os.path.abspath(
    '../../outputs/ll_24/run_20210301T185411/args.json')
print(file_path)
with open(file_path) as f:
    opts = json.load(f)

problem = load_problem(opts['problem'])

SNR_THR_DB = 5  # Hardcoded SNR threshold for URLLC users in db
SNR_THR = 10**(SNR_THR_DB / 10)  # same threshold on a linear scale
CHANNEL_USE = dict(enumerate((24., 48., 96.)))
EPSILON = 1e-4
num_test_samples = 1000
#test_dataset = problem.make_dataset(
def run(opts):
    """Set up and run training bounded by an interaction budget.

    Builds the logger, model, baseline, optimizer and learning-rate scheduler
    from ``opts``, optionally restores them from a checkpoint, wraps the model
    in ``dirpg.DirPG`` unless ``opts.no_dirpg`` is set, and then either
    validates once (``opts.eval_only``) or trains epoch after epoch until the
    accumulated interaction count reaches ``opts.total_interactions``.

    Args:
        opts: Run options namespace. It is mutated: ``device`` is set, and
            ``epoch_start`` is set when resuming.

    Raises:
        AssertionError: If both ``load_path`` and ``resume`` are given, or if
            the model or baseline name is unknown or unsupported for the
            problem.
    """
    # Pretty print the run args
    pp.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard: TbLogger for plain runs, a
    # SummaryWriter (plus a tracked Task) for DirPG runs.
    tb_logger = None
    if not opts.no_tensorboard and opts.no_dirpg:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))
    if not opts.no_dirpg:
        task = Task.init(project_name='DirPG-TSP', task_name=opts.run_name)
        tb_logger = SummaryWriter(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))
        tb_logger.add_text('Comment', opts.comment, 0)

    os.makedirs(opts.save_dir)
    # Save arguments so exact configuration can always be found.
    # This must happen before opts.device is attached below.
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda:0" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print('  [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
        'pointer': PointerNetwork
    }.get(opts.model, None)
    # Report the requested name; the lookup result is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    model = model_class(opts.embedding_dim,
                        opts.hidden_dim,
                        problem,
                        n_encode_layers=opts.n_encode_layers,
                        mask_inner=True,
                        mask_logits=True,
                        normalization=opts.normalization,
                        tanh_clipping=opts.tanh_clipping,
                        checkpoint_encoder=opts.checkpoint_encoder,
                        shrink_size=opts.shrink_size).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })
    print(model_)

    # Initialize baseline
    if opts.baseline == 'exponential':
        baseline = ExponentialBaseline(opts.exp_beta)
    elif opts.baseline == 'critic' or opts.baseline == 'critic_lstm':
        assert problem.NAME == 'tsp', "Critic only supported for TSP"
        baseline = CriticBaseline(
            (CriticNetworkLSTM(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.tanh_clipping)
             if opts.baseline == 'critic_lstm' else CriticNetwork(
                 2, opts.embedding_dim, opts.hidden_dim, opts.n_encode_layers,
                 opts.normalization)).to(opts.device))
    elif opts.baseline == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
        print(" rollout" * 30)
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    if opts.bl_warmup_epochs > 0:
        print(opts.bl_warmup_epochs)
        baseline = WarmupBaseline(baseline,
                                  opts.bl_warmup_epochs,
                                  warmup_exp_beta=opts.exp_beta)
        print(" WarmupBaseline" * 30)

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer; the baseline gets its own parameter group (and
    # learning rate) only when it has learnable parameters.
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # The checkpoint was loaded on CPU; move tensor state to the run device.
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    # Start the actual training loop
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    if opts.resume:
        # The epoch number is parsed from a checkpoint name like "epoch-12.pt".
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # Dumping of state was done before epoch callback, so do that now (model is loaded)
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    model = dirpg.DirPG(model, opts) if not opts.no_dirpg else model

    if opts.eval_only:
        validate(model, val_dataset, opts)
    else:
        # Interactions attributed to the epochs before epoch_start.
        interactions_count = opts.epoch_start * opts.epoch_size * opts.max_interactions
        epoch = opts.epoch_start
        # Train until the interaction budget is used up (not a fixed number
        # of epochs).
        while interactions_count < opts.total_interactions:
            train_epoch(
                model,
                optimizer,
                baseline,
                lr_scheduler,
                epoch,
                interactions_count,
                val_dataset,
                problem,
                tb_logger,
                opts,
            )
            print("interactions_count model so far ", interactions_count)

            # With DirPG the wrapped network lives at model.model; otherwise
            # unwrap a possible DataParallel container.
            if opts.no_dirpg:
                n_interactions = get_inner_model(
                    model).get_and_reset_interactions(opts.use_cuda,
                                                      opts.no_dirpg)
            else:
                n_interactions = model.model.get_and_reset_interactions(
                    opts.use_cuda, opts.no_dirpg)
            interactions_count += n_interactions
            print("interactions_count model new", n_interactions)

            # Interactions of the baseline's own model count as well.
            # NOTE(review): this reaches for baseline.baseline.model for
            # every baseline except NoBaseline — confirm all other
            # configurations expose that attribute chain.
            if baseline.__class__.__name__ != "NoBaseline":
                interactions_count += get_inner_model(
                    baseline.baseline.model).get_and_reset_interactions(
                        opts.use_cuda, opts.no_dirpg)
            else:
                interactions_count += 0
            print("interactions_count baseline ", interactions_count)
            print("interactions_count: {} out of {} ".format(
                interactions_count, opts.total_interactions))
            epoch += 1
def _print_usage_and_exit(status=None):
    """Print the command-line usage text and terminate the program.

    Args:
        status: Value handed to ``sys.exit``; ``None`` keeps the historical
            behaviour of the usage path (plain ``sys.exit()``).
    """
    print(
        f'Usage: {sys.argv[0]} <search algorithm> <problem file name> <solution file name>')
    print('- search algorithms: depth (Depth First), breadth (Breadth First), best (Best First), astar (A*)')
    sys.exit(status)


def _default_solution_path(method, input_file):
    """Return ``./solutions/<method>-<file name>`` for the given problem file.

    Both back-slashes and forward-slashes are treated as path separators, so
    the file name is extracted the same way on every platform.
    """
    file_name = input_file.replace('\\', '/').split('/')[-1]
    return './solutions/' + method + '-' + file_name


def main():
    """Run a search on a problem file given on the command line.

    ``sys.argv`` must hold ``<search algorithm> <problem file name>`` and,
    optionally, ``<solution file name>``. When the solution file name is
    omitted, the solution is written to
    ``./solutions/<search algorithm>-<problem file name>``.

    The function prints the parsed objects, the initial and goal states and,
    if ``search`` returns a node, the solution, its number of moves and the
    elapsed time before writing the moves with ``utils.write_solution``.

    Raises:
        SystemExit: on a wrong number of arguments, on an unknown search
            algorithm, or when ``search`` returns ``None``.
    """
    start = time.time()  # Start time.
    os.system('cls' if os.name == 'nt' else 'clear')  # Clears the terminal.

    # Handles the arguments.
    cli_args = sys.argv[1:]
    if len(cli_args) == 2:
        # The output file name was not specified: derive it up front instead
        # of probing path separators through a failed write later on.
        method, input_file = cli_args
        output_file = _default_solution_path(method, input_file)
    elif len(cli_args) == 3:
        # The output file name was specified.
        method, input_file, output_file = cli_args
    else:
        _print_usage_and_exit()

    # Initializes the type of queue based on the search method.
    try:
        search_queue = utils.METHODS[method]
    except KeyError:
        # Previously an unknown method surfaced as a raw KeyError traceback;
        # keep the non-zero exit status but tell the user what went wrong.
        print(f'Unknown search algorithm: {method}')
        _print_usage_and_exit(1)

    # Parse the data and get the objects (blocks), initial state and the goal state.
    data = utils.load_problem(input_file)
    objects = utils.get_objects_from_file(data)
    initial_state = utils.get_initial_state(data)
    goal_state = utils.get_goal_state(data)

    print('OBJECTS:', objects)

    print('\n#################### INITIAL STATE ####################\n')
    print(initial_state)
    i_blocks = utils.initialize_blocks(objects, initial_state)

    print('\n#################### GOAL STATE ####################\n')
    print(goal_state)
    g_blocks = utils.initialize_blocks(objects, goal_state)

    solution_node = search(search_queue, method, i_blocks, g_blocks)

    if solution_node is None:
        # NOTE(review): the message implies `search` gives up after a time
        # limit; that limit is not visible in this function - confirm there.
        print('Took: ', time.time() - start)
        print('############ ONE MINUTE PASSED AND NO SOLUTION WAS FOUND ############')
        sys.exit()

    # A solution was found.
    print('\n#################### SOLUTION ####################\n')
    solution_node.print_state()
    print(f'Number of moves: {solution_node.g}')

    # Calculates the time it took to find the solution.
    print('Took: ', time.time() - start)

    solution_path = solution_node.get_moves_to_solution()
    utils.write_solution(output_file, solution_path)
def run(opts):
    """Evaluate or train an attention model with a critic baseline.

    The function prints the options, seeds torch, optionally creates a
    tensorboard logger, writes ``args.json`` into ``opts.save_dir``, builds
    the problem and model, optionally restores state from ``opts.load_path``
    or ``opts.resume``, and then either validates once (``opts.eval_only``)
    or trains for ``opts.n_epochs`` epochs starting at ``opts.epoch_start``.

    Args:
        opts: Options namespace. It is mutated: ``opts.device`` is always
            set, and ``opts.epoch_start`` is overwritten when resuming.

    Raises:
        AssertionError: if both ``opts.load_path`` and ``opts.resume`` are
            given, or if ``opts.model`` is not a known model name.
    """
    # Pretty print the run args
    pprint.pprint(vars(opts))

    # Set the random seed
    torch.manual_seed(opts.seed)

    # Optionally configure tensorboard
    tb_logger = None
    if not opts.no_tensorboard:
        tb_logger = TbLogger(
            os.path.join(opts.log_dir,
                         "{}_{}".format(opts.problem, opts.graph_size),
                         opts.run_name))

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(opts.save_dir, exist_ok=True)

    # Save arguments so exact configuration can always be found.
    # opts.device is only assigned afterwards, so it is not part of the dump.
    with open(os.path.join(opts.save_dir, "args.json"), 'w') as f:
        json.dump(vars(opts), f, indent=True)

    # Set the device
    opts.device = torch.device("cuda" if opts.use_cuda else "cpu")

    # Figure out what's the problem
    problem = load_problem(opts.problem)(p_size=opts.graph_size,
                                         with_assert=not opts.no_assert)

    # Load data from load_path
    load_data = {}
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.load_path if opts.load_path is not None else opts.resume
    if load_path is not None:
        print(' [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Initialize model
    model_class = {
        'attention': AttentionModel,
    }.get(opts.model, None)
    # Report the requested name; model_class itself is always None here.
    assert model_class is not None, "Unknown model: {}".format(opts.model)
    model = model_class(problem=problem,
                        embedding_dim=opts.embedding_dim,
                        hidden_dim=opts.hidden_dim,
                        n_heads=opts.n_heads_encoder,
                        n_layers=opts.n_encode_layers,
                        normalization=opts.normalization,
                        device=opts.device).to(opts.device)

    if opts.use_cuda and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Overwrite model parameters by parameters to load
    model_ = get_inner_model(model)
    model_.load_state_dict({
        **model_.state_dict(),
        **load_data.get('model', {})
    })

    # Load the validation datasets
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset)

    # Do validation only
    if opts.eval_only:
        validate(problem, model, val_dataset, tb_logger, opts, _id=0)
        return

    # Initialize baseline
    baseline = CriticBaseline(
        CriticNetwork(problem=problem,
                      embedding_dim=opts.embedding_dim,
                      hidden_dim=opts.hidden_dim,
                      n_heads=opts.n_heads_decoder,
                      n_layers=opts.n_encode_layers,
                      normalization=opts.normalization,
                      device=opts.device).to(opts.device))

    # Load baseline from data, make sure script is called with same type of baseline
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Initialize optimizer; the critic gets its own learning rate, and its
    # parameter group is only added when it has learnable parameters.
    optimizer = optim.Adam([{
        'params': model.parameters(),
        'lr': opts.lr_model
    }] + ([{
        'params': baseline.get_learnable_parameters(),
        'lr': opts.lr_critic
    }] if len(baseline.get_learnable_parameters()) > 0 else []))

    # Load optimizer state
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        # Move every tensor in the restored optimizer state to the run device
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(opts.device)

    # Initialize learning rate scheduler, decay by lr_decay once per epoch!
    lr_scheduler = optim.lr_scheduler.LambdaLR(
        optimizer, lambda epoch: opts.lr_decay**epoch)

    if opts.resume:
        # The epoch is the second "-"-separated field of the checkpoint's
        # file name without extension.
        epoch_resume = int(
            os.path.splitext(os.path.split(opts.resume)[-1])[0].split("-")[1])

        # Set the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # NOTE(review): an earlier comment here mentioned running the
        # baseline epoch callback after loading; no such call is made in
        # this function - confirm that is intended for the critic baseline.
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    # Start the actual training loop
    for epoch in range(opts.epoch_start, opts.epoch_start + opts.n_epochs):
        train_epoch(problem, model, optimizer, baseline, lr_scheduler,
                    epoch, val_dataset, tb_logger, opts)
def run(opts):
    """Evaluate or train an attention model and record a Held-Karp reference cost.

    Steps, in order: print the options, seed torch, optionally build a
    tensorboard logger, create ``opts.save_dir`` and dump ``args.json``,
    build the problem/model/baseline/optimizer (restoring any state found in
    ``opts.load_path`` or ``opts.resume``), build the validation set, store
    the mean ``held_karp`` cost over it in ``problem.DP_cost``, and finally
    validate once (``opts.eval_only``) or train for ``opts.n_epochs`` epochs.

    Args:
        opts: Options namespace. ``opts.device`` is set here and
            ``opts.epoch_start`` is overwritten when resuming.

    Raises:
        AssertionError: if both ``opts.load_path`` and ``opts.resume`` are
            given, or if ``opts.baseline`` is an unrecognised non-``None``
            value.
    """
    # Echo the full run configuration
    pp.pprint(vars(opts))

    # Seed torch's random number generator
    torch.manual_seed(opts.seed)

    # Tensorboard logging is on unless explicitly disabled
    if opts.no_tensorboard:
        tb_logger = None
    else:
        run_group = "{}_{}".format(opts.problem, opts.graph_size)
        tb_logger = TbLogger(
            os.path.join(opts.log_dir, run_group, opts.run_name))

    # Persist the arguments so the exact configuration can be recovered
    os.makedirs(opts.save_dir)
    args_path = os.path.join(opts.save_dir, "args.json")
    with open(args_path, 'w') as args_file:
        json.dump(vars(opts), args_file, indent=True)

    # Pick the device
    device_name = "cuda:0" if opts.use_cuda else "cpu"
    opts.device = torch.device(device_name)

    # Resolve the problem
    problem = load_problem(opts.problem)

    # Restore saved data from load_path or resume (at most one may be given)
    assert opts.load_path is None or opts.resume is None, "Only one of load path and resume can be given"
    load_path = opts.resume if opts.load_path is None else opts.load_path
    if load_path is None:
        load_data = {}
    else:
        print(' [*] Loading data from {}'.format(load_path))
        load_data = torch_load_cpu(load_path)

    # Build the model, wrapping it for multi-GPU use when available
    model = AttentionModel(
        opts.embedding_dim,
        opts.hidden_dim,
        problem,
        n_encode_layers=opts.n_encode_layers,
        mask_inner=True,
        mask_logits=True,
        normalization=opts.normalization,
        tanh_clipping=opts.tanh_clipping,
        checkpoint_encoder=opts.checkpoint_encoder,
        shrink_size=opts.shrink_size,
    ).to(opts.device)
    multi_gpu = opts.use_cuda and torch.cuda.device_count() > 1
    if multi_gpu:
        model = torch.nn.DataParallel(model)

    # Loaded parameters take precedence over the freshly initialised ones
    inner_model = get_inner_model(model)
    merged_state = dict(inner_model.state_dict())
    merged_state.update(load_data.get('model', {}))
    inner_model.load_state_dict(merged_state)

    # Select the baseline
    baseline_name = opts.baseline
    if baseline_name == 'critic':
        critic = CriticNetwork(2, opts.embedding_dim, opts.hidden_dim,
                               opts.n_encode_layers, opts.normalization)
        baseline = CriticBaseline(critic.to(opts.device))
    elif baseline_name == 'rollout':
        baseline = RolloutBaseline(model, problem, opts)
    elif baseline_name == 'oracle':
        baseline = OracleBaseline()
    else:
        assert opts.baseline is None, "Unknown baseline: {}".format(
            opts.baseline)
        baseline = NoBaseline()

    # Restore the baseline; the script must be called with the same baseline type
    if 'baseline' in load_data:
        baseline.load_state_dict(load_data['baseline'])

    # Optimizer: one group for the model, plus one for the baseline when it
    # has learnable parameters
    param_groups = [{'params': model.parameters(), 'lr': opts.lr_model}]
    if len(baseline.get_learnable_parameters()) > 0:
        param_groups.append({
            'params': baseline.get_learnable_parameters(),
            'lr': opts.lr_critic
        })
    optimizer = optim.Adam(param_groups)

    # Restore the optimizer state and move its tensors to the run device
    if 'optimizer' in load_data:
        optimizer.load_state_dict(load_data['optimizer'])
        for param_state in optimizer.state.values():
            for key, value in param_state.items():
                if torch.is_tensor(value):
                    param_state[key] = value.to(opts.device)

    # Learning rate is multiplied by lr_decay once per epoch
    def decay_factor(epoch):
        return opts.lr_decay**epoch

    lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, decay_factor)

    # Validation data
    val_dataset = problem.make_dataset(size=opts.graph_size,
                                       num_samples=opts.val_size,
                                       filename=opts.val_dataset,
                                       distribution=opts.data_distribution)

    # Pairwise L2 distances between the points of every validation instance,
    # reshaped to (val_size, graph_size, graph_size).
    # assumes the stacked data is (val_size, graph_size, coords) - TODO confirm
    n_nodes = opts.graph_size
    val_dataset_tensor = torch.stack(val_dataset.data)
    each_point_repeated = val_dataset_tensor.transpose(1, 2).repeat_interleave(
        n_nodes, 2).transpose(1, 2).float()
    all_points_tiled = val_dataset_tensor.repeat(1, n_nodes, 1).float()
    dist = (each_point_repeated - all_points_tiled).norm(p=2, dim=2).view(
        opts.val_size, n_nodes, n_nodes)

    # Mean of the first element returned by held_karp over all instances
    dp_costs = []
    for idx in range(opts.val_size):
        dp_costs.append(held_karp(dist[idx])[0])
    mean_dp_cost = torch.tensor(dp_costs).mean()
    problem.DP_cost = mean_dp_cost
    print('problem_DPCost = ', mean_dp_cost)

    if opts.resume:
        # Epoch number is the second "-"-separated field of the checkpoint's
        # file name without extension
        checkpoint_name = os.path.split(opts.resume)[-1]
        checkpoint_stem = os.path.splitext(checkpoint_name)[0]
        epoch_resume = int(checkpoint_stem.split("-")[1])

        # Restore the random states
        torch.set_rng_state(load_data['rng_state'])
        if opts.use_cuda:
            torch.cuda.set_rng_state_all(load_data['cuda_rng_state'])
        # State was dumped before the epoch callback, so run it now that the
        # model is loaded
        baseline.epoch_callback(model, epoch_resume)
        print("Resuming after {}".format(epoch_resume))
        opts.epoch_start = epoch_resume + 1

    if opts.eval_only:
        validate(model, val_dataset, opts)
        return

    # Training loop
    end_epoch = opts.epoch_start + opts.n_epochs
    for epoch in range(opts.epoch_start, end_epoch):
        train_epoch(model, optimizer, baseline, lr_scheduler, epoch,
                    val_dataset, problem, tb_logger, opts)