def main(exp):
    """Iterate over a dataset's training split and move every batch to the device.

    No model is involved: each iteration only loads a batch, logs progress
    through the iteration wrapper, and transfers the batch items to the
    configured torch device.
    """
    # Number of examples per batch
    batch_size: Argument & int = default(256)

    # Dataset to load
    dataset: Argument

    settings = init_torch()
    train_split = exp.get_dataset(dataset).train

    batches = torch.utils.data.DataLoader(
        train_split,
        batch_size=batch_size,
        shuffle=True,
        num_workers=settings.workers,
        pin_memory=True,
    )

    timer = iteration_wrapper(exp, sync=None)

    # Warm up a bit
    # NOTE(review): the nesting of this `break` could not be recovered from
    # the collapsed source; it is placed after the inner loop, so only the
    # first batch is used for warm-up. Confirm against the upstream file.
    for _, warm_batch in zip(range(10), batches):
        for tensor in warm_batch:
            tensor.to(settings.device)
        break

    for it, batch in dataloop(batches, wrapper=timer):
        it.set_count(batch_size)
        it.log(eta=True)
        batch = [tensor.to(settings.device) for tensor in batch]
        # Only synchronize when the torch settings provide a sync callable.
        if settings.sync:
            settings.sync()
def main(exp):
    """Train `Net` on the training split with an MSE loss and the Adam optimizer.

    Each iteration moves one (input, target) pair of batches to the device,
    performs a single optimization step and logs the loss.
    """
    # Dataset to use
    dataset: Argument

    # super resolution upscale factor
    upscale_factor: Argument & int = default(2)

    # # testing batch size (default: 10)
    # test_batch_size: Argument & int = default(10)

    # Learning rate (default: 0.1)
    lr: Argument & float = default(0.1)

    # Batch size (default: 64)
    batch_size: Argument & int = default(64)

    settings = init_torch()
    device = settings.device

    print('===> Loading datasets')
    train_split = get_dataset(exp, dataset, upscale_factor).train
    train_batches = DataLoader(
        dataset=train_split,
        num_workers=settings.workers,
        batch_size=batch_size,
        shuffle=True,
    )

    print('===> Building model')
    net = Net(upscale_factor=upscale_factor).to(device)
    net.train()
    mse = nn.MSELoss()
    adam = optim.Adam(net.parameters(), lr=lr)

    timer = iteration_wrapper(exp, sync=settings.sync)
    for it, (inputs, targets) in dataloop(train_batches, wrapper=timer):
        it.set_count(batch_size)

        inputs = inputs.to(device)
        targets = targets.to(device)

        adam.zero_grad()
        loss = mse(net(inputs), targets)
        # The loss is logged before the backward pass, as in the original.
        it.log(loss=loss.item())
        loss.backward()
        adam.step()
def main(exp):
    """Train the `Sequence` model to predict the next value of generated waves.

    The generated array is split so that, for every training row, the model
    input is the row without its last column and the target is the row
    without its first column (i.e. the input shifted by one step).  Each
    iteration performs one closure-based SGD step over the whole training
    set and logs the loss.
    """
    # Model float type
    dtype: Argument & str = default("float32")

    # Number of samples
    samples: Argument & int = default(100)

    torch_settings = init_torch()
    device = torch_settings.device

    data = generate_wave_data(20, 1000, samples)
    _dtype = to_type[dtype]

    # The first three rows are left out of training; the benchmark never
    # evaluates on them, so no tensors are created for them.
    inputs = torch.from_numpy(data[3:, :-1]).to(device=device, dtype=_dtype)
    targets = torch.from_numpy(data[3:, 1:]).to(device=device, dtype=_dtype)

    # build the model
    seq = Sequence().to(device=device, dtype=_dtype)
    criterion = nn.MSELoss().to(device=device, dtype=_dtype)
    optimizer = optim.SGD(seq.parameters(), lr=0.01)

    seq.train()
    wrapper = iteration_wrapper(exp, sync=torch_settings.sync)
    for it, _ in dataloop(count(), wrapper=wrapper):
        it.set_count(samples)

        def closure():
            """Recompute the loss and its gradients for `optimizer.step`."""
            optimizer.zero_grad()
            # `inputs` already lives on `device` with dtype `_dtype`, so no
            # further conversion is needed here.
            out = seq(inputs)
            loss = criterion(out, targets)
            loss.backward()
            it.log(loss=loss.item())
            return loss

        optimizer.step(closure)
def main(exp):
    """Collect the command-line options and hand them to `train300_mlperf_coco`.

    The options, the torch settings and an iteration wrapper are bundled in
    a single `NS` namespace that is passed along with `exp`.
    """
    # dataset to use
    dataset: Argument

    # batch size
    batch_size: Argument & int = default(32)

    # path to model checkpoint file
    checkpoint: Argument = default(None)

    settings = init_torch()
    timer = iteration_wrapper(exp, sync=settings.sync)

    options = NS(
        dataset=dataset,
        checkpoint=checkpoint,
        batch_size=batch_size,
        torch_settings=settings,
        wrapper=timer,
    )
    train300_mlperf_coco(exp, options)
def main(exp):
    """Train `TransformerNet` against a fixed style image using VGG16 features.

    For every batch the transformer output and the original images are
    normalized and passed through `Vgg16`; the loss is the weighted sum of a
    content term (MSE between the `relu2_2` features) and a style term (MSE
    between Gram matrices of the output features and of the style image).
    """
    # dataset to use
    dataset: Argument & str

    # Number of examples per batch
    batch_size: Argument & int = default(64)

    # path to style-image
    style_image: Argument & str = default(
        os.path.join(repo_base, "neural-style-images/style-images/candy.jpg"))

    # size of training images, default is 256 X 256
    image_size: Argument & int = default(256)

    # size of style-image, default is the original size of style image
    style_size: Argument & int = default(None)

    # weight for content-loss, default is 1e5
    content_weight: Argument & float = default(1e5)

    # weight for style-loss, default is 1e10
    style_weight: Argument & float = default(1e10)

    # learning rate, default is 1e-3
    lr: Argument & float = default(1e-3)

    settings = init_torch()
    device = settings.device

    # Training images: resize, center-crop, convert to a tensor and multiply
    # the values by 255.
    image_pipeline = transforms.Compose([
        transforms.Resize(image_size),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.mul(255)),
    ])
    train_split = exp.get_dataset(dataset, image_pipeline).train
    train_batches = DataLoader(
        train_split,
        batch_size=batch_size,
        num_workers=settings.workers,
    )

    transformer = TransformerNet().to(device)
    optimizer = Adam(transformer.parameters(), lr)
    mse_loss = torch.nn.MSELoss()

    vgg = Vgg16(requires_grad=False).to(device)

    footprint = memory_size(
        vgg, batch_size=batch_size, input_size=(3, image_size, image_size))
    print(footprint * 4)

    # Style image: loaded once, repeated `batch_size` times, and reduced to
    # one Gram matrix per VGG feature map.
    style_pipeline = transforms.Compose([
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.mul(255)),
    ])
    style = style_pipeline(utils.load_image(style_image, size=style_size))
    style = style.repeat(batch_size, 1, 1, 1).to(device)

    features_style = vgg(utils.normalize_batch(style))
    gram_style = [utils.gram_matrix(feature) for feature in features_style]

    timer = iteration_wrapper(exp, sync=settings.sync)
    transformer.train()
    for it, (images, _) in dataloop(train_batches, wrapper=timer):
        n_batch = len(images)
        it.set_count(n_batch)

        images = images.to(device)
        stylized = utils.normalize_batch(transformer(images))
        images = utils.normalize_batch(images)

        optimizer.zero_grad()

        features_y = vgg(stylized)
        features_x = vgg(images)

        content_loss = content_weight * mse_loss(
            features_y.relu2_2, features_x.relu2_2)

        style_loss = 0.
        for ft_y, gm_s in zip(features_y, gram_style):
            # The style Gram matrices are sliced to the current batch length,
            # which may be shorter than `batch_size`.
            style_loss += mse_loss(
                utils.gram_matrix(ft_y), gm_s[:n_batch, :, :])
        style_loss *= style_weight

        total_loss = content_loss + style_loss
        total_loss.backward()

        it.log(loss=total_loss.item())
        optimizer.step()
def main(exp, argv):
    """Run PPO updates on a BabyAI environment until the iteration wrapper is done.

    Args:
        exp: experiment object; provides `results_directory()` and is passed
            to `iteration_wrapper`.
        argv: list of command-line arguments handed to the argument parser.

    Raises:
        ValueError: if `--algo` is anything other than ``"ppo"``.

    NOTE(review): several options read below (`seed`, `procs`, `env`, `arch`,
    `instr_arch`, `lr`, ...) are not declared in this function; presumably
    `ArgumentParser` is the BabyAI parser that pre-declares them -- confirm.
    """
    os.environ["BABYAI_STORAGE"] = exp.results_directory()

    # Parse arguments
    parser = ArgumentParser()
    parser.add_argument("--algo", default='ppo',
                        help="algorithm to use (default: ppo)")
    parser.add_argument("--discount", type=float, default=0.99,
                        help="discount factor (default: 0.99)")
    parser.add_argument("--reward-scale", type=float, default=20.,
                        help="Reward scale multiplier")
    parser.add_argument(
        "--gae-lambda", type=float, default=0.99,
        help="lambda coefficient in GAE formula (default: 0.99, 1 means no gae)")
    parser.add_argument("--value-loss-coef", type=float, default=0.5,
                        help="value loss term coefficient (default: 0.5)")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
                        help="maximum norm of gradient (default: 0.5)")
    parser.add_argument("--clip-eps", type=float, default=0.2,
                        help="clipping epsilon for PPO (default: 0.2)")
    parser.add_argument("--ppo-epochs", type=int, default=4,
                        help="number of epochs for PPO (default: 4)")
    parser.add_argument(
        "--save-interval", type=int, default=50,
        help="number of updates between two saves (default: 50, 0 means no saving)")
    parser.add_argument("--workers", type=int, default=8,
                        help="number of workers for PyTorch (default: 8)")
    parser.add_argument("--max-count", type=int, default=1000,
                        help="maximum number of frames to run for")
    parser.add_argument("--sample_duration", type=float, default=0.5,
                        help="sampling duration")
    parser.add_argument("--cuda", action="store_true", default=False,
                        help="whether to use cuda")
    args = parser.parse_args(argv)

    utils.seed(args.seed)

    torch_settings = init_torch(
        seed=args.seed,
        cuda=args.cuda,
        workers=args.workers,
    )

    # Generate environments
    envs = []
    for i in range(args.procs):
        env = gym.make(args.env)
        env.seed(100 * args.seed + i)
        envs.append(env)

    # Define model name
    suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    instr = args.instr_arch if args.instr_arch else "noinstr"
    mem = "mem" if not args.no_mem else "nomem"
    model_name_parts = {
        'env': args.env,
        'algo': args.algo,
        'arch': args.arch,
        'instr': instr,
        'mem': mem,
        'seed': args.seed,
        'info': '',
        'coef': '',
        'suffix': suffix,
    }
    default_model_name = "{env}_{algo}_{arch}_{instr}_{mem}_seed{seed}{info}{coef}_{suffix}".format(
        **model_name_parts)
    if args.pretrained_model:
        default_model_name = args.pretrained_model + '_pretrained_' + default_model_name
    args.model = args.model.format(
        **model_name_parts) if args.model else default_model_name

    utils.configure_logging(args.model)
    logger = logging.getLogger(__name__)

    # Define obss preprocessor
    if 'emb' in args.arch:
        obss_preprocessor = utils.IntObssPreprocessor(
            args.model, envs[0].observation_space, args.pretrained_model)
    else:
        obss_preprocessor = utils.ObssPreprocessor(
            args.model, envs[0].observation_space, args.pretrained_model)

    # Define actor-critic model
    # Loading a previously saved `args.model` is disabled, so the model is
    # always either the pretrained one or a freshly built one.
    # acmodel = utils.load_model(args.model, raise_not_found=False)
    acmodel = None
    if acmodel is None:
        if args.pretrained_model:
            acmodel = utils.load_model(args.pretrained_model,
                                       raise_not_found=True)
        else:
            acmodel = ACModel(obss_preprocessor.obs_space,
                              envs[0].action_space, args.image_dim,
                              args.memory_dim, args.instr_dim,
                              not args.no_instr, args.instr_arch,
                              not args.no_mem, args.arch)

    obss_preprocessor.vocab.save()
    # utils.save_model(acmodel, args.model)

    if torch_settings.cuda:
        acmodel.cuda()

    # Define actor-critic algo
    def reshape_reward(_0, _1, reward, _2):
        """Scale the reward by `--reward-scale`; the other arguments are ignored."""
        return args.reward_scale * reward

    if args.algo == "ppo":
        algo = babyai.rl.PPOAlgo(
            envs, acmodel, args.frames_per_proc, args.discount, args.lr,
            args.beta1, args.beta2, args.gae_lambda, args.entropy_coef,
            args.value_loss_coef, args.max_grad_norm, args.recurrence,
            args.optim_eps, args.clip_eps, args.ppo_epochs, args.batch_size,
            obss_preprocessor, reshape_reward)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # When using extra binary information, more tensors (model params) are initialized compared to when we don't use that.
    # Thus, there starts to be a difference in the random state. If we want to avoid it, in order to make sure that
    # the results of supervised-loss-coef=0. and extra-binary-info=0 match, we need to reseed here.
    utils.seed(args.seed)

    # Restore training status
    # NOTE(review): `status` is loaded but not read by the loop below.
    status_path = os.path.join(utils.get_log_dir(args.model), 'status.json')
    if os.path.exists(status_path):
        with open(status_path, 'r') as src:
            status = json.load(src)
    else:
        status = {'i': 0, 'num_episodes': 0, 'num_frames': 0}

    # Log code state, command, availability of CUDA and model
    babyai_code = list(babyai.__path__)[0]

    def git_output(*git_args):
        """Run `git <git_args>` inside the babyai package directory.

        The command is passed as an argument list with `cwd=` rather than a
        shell string, so a path containing spaces or shell metacharacters
        cannot change the command that is executed.
        """
        return subprocess.check_output(
            ('git',) + git_args, cwd=babyai_code).decode('utf-8')

    # OSError covers a missing `git` executable or package directory, which
    # the shell-based invocation used to report as a non-zero exit status.
    try:
        last_commit = git_output('log', '-n1')
        logger.info('LAST COMMIT INFO:')
        logger.info(last_commit)
    except (subprocess.CalledProcessError, OSError):
        logger.info('Could not figure out the last commit')
    try:
        diff = git_output('diff')
        if diff:
            logger.info('GIT DIFF:')
            logger.info(diff)
    except (subprocess.CalledProcessError, OSError):
        logger.info('Could not figure out the git diff')
    logger.info('COMMAND LINE ARGS:')
    logger.info(args)
    logger.info("CUDA available: {}".format(torch.cuda.is_available()))
    logger.info(acmodel)

    # Train model
    wrapper = iteration_wrapper(
        exp,
        sync=torch_settings.sync,
        max_count=args.max_count,
        sample_duration=args.sample_duration,
    )

    # while status['num_frames'] < args.frames:
    while True:
        with wrapper() as it:
            # Update parameters
            if wrapper.done():
                break
            logs = algo.update_parameters()
            it.set_count(logs["num_frames"])
            it.log(loss=logs["loss"])
def main(exp):
    """Train the `NeuMF` recommendation model with Adam and a BCE-with-logits loss.

    The training split is loaded (timed under ``'loading_data'``), the model
    description is written to ``model.txt`` in the results directory, and
    every iteration performs one optimization step on a
    (user, item, label) batch.
    """
    # dataset to use
    dataset: Argument & str

    # batch size
    batch_size: Argument & int = default(128)

    # number of predictive factors
    # [alias: -f]
    factors: Argument & int = default(8)

    # size of hidden layers for MLP
    layers: Argument = default("64,32,16,8")

    # number of negative examples per interaction
    # [alias: -n]
    negative_samples: Argument & int = default(4)

    # learning rate for optimizer
    # [alias: -l]
    learning_rate: Argument & float = default(0.001)

    # rank for test examples to be considered a hit
    # [alias: -k]
    topk: Argument & int = default(10)

    layer_sizes = [int(x) for x in layers.split(",")]

    torch_settings = init_torch()
    device = torch_settings.device

    # Load Data
    # NOTE(review): the extent of this `with` block was reconstructed from
    # collapsed source; it is assumed to cover everything up to the
    # "Load data done" message.
    print('Loading data')
    with exp.time('loading_data'):
        t1 = time.time()

        train_dataset = exp.get_dataset(dataset, nb_neg=negative_samples).train

        train_dataloader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=torch_settings.workers,
            pin_memory=True)

        nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items

        print('Load data done [%.1f s]. #user=%d, #item=%d, #train=%d' %
              (time.time() - t1, nb_users, nb_items, train_dataset.mat.nnz))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=factors,
                  mf_reg=0.,
                  mlp_layer_sizes=layer_sizes,
                  # One zero regularization coefficient per MLP layer.
                  mlp_layer_regs=[0.] * len(layer_sizes)).to(device)
    print(model)
    print("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    run_dir = exp.results_directory()
    with open(os.path.join(run_dir, 'model.txt'), 'w') as handle:
        handle.write(str(model))

    # Add optimizer and loss to graph
    beta1, beta2, epsilon = 0.9, 0.999, 1e-8

    optimizer = torch.optim.Adam(model.parameters(),
                                 betas=(beta1, beta2),
                                 lr=learning_rate,
                                 eps=epsilon)

    criterion = nn.BCEWithLogitsLoss().to(device)

    model.train()

    wrapper = iteration_wrapper(exp, sync=None)

    for it, (user, item, label) in dataloop(train_dataloader, wrapper=wrapper):
        it.set_count(batch_size)

        # Tensors from the loader are moved directly: the deprecated
        # `torch.autograd.Variable` wrapper is no longer needed.
        user = user.to(device)
        item = item.to(device)
        label = label.to(device)

        outputs = model(user, item)
        loss = criterion(outputs, label)
        it.log(loss=loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
def main(exp):
    """Run REINFORCE policy-gradient updates on the ``CartPole-v0`` environment.

    Every iteration plays `episode_length` environment steps (resetting the
    environment whenever it reports `done`), logs an exponentially smoothed
    reward and then performs one policy update from the collected rewards
    and log-probabilities.
    """
    # discount factor (default: 0.99)
    gamma: Argument & float = default(0.99)

    # render the environment
    render: Argument & bool = default(False)

    # seed for the environment
    seed: Argument & int = default(1234)

    # length of one episode
    episode_length: Argument & int = default(500)

    torch_settings = init_torch()
    device = torch_settings.device

    env = gym.make('CartPole-v0')
    env.seed(seed)

    policy = Policy()
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    # Added to the standard deviation below to avoid dividing by zero.
    eps = np.finfo(np.float32).eps.item()

    print(torch_settings)

    def select_action(state):
        """Sample an action for `state` and remember its log-probability."""
        observation = torch.from_numpy(state).float().unsqueeze(0)
        distribution = Categorical(policy(observation))
        choice = distribution.sample()
        policy.saved_log_probs.append(distribution.log_prob(choice))
        return choice.item()

    def finish_episode():
        """Apply one policy-gradient step and clear the episode buffers."""
        # Discounted returns, accumulated from the last reward backwards and
        # then put back in chronological order.
        discounted = []
        running = 0
        for reward in reversed(policy.rewards):
            running = reward + gamma * running
            discounted.append(running)
        discounted.reverse()

        returns = torch.tensor(discounted)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        terms = [
            -log_prob * ret
            for log_prob, ret in zip(policy.saved_log_probs, returns)
        ]

        optimizer.zero_grad()
        objective = torch.cat(terms).sum()
        objective.backward()
        optimizer.step()

        del policy.rewards[:]
        del policy.saved_log_probs[:]

    running_reward = 10

    timer = iteration_wrapper(exp, sync=torch_settings.sync)

    # NOTE(review): the loop nesting below was reconstructed from collapsed
    # source (smoothing, logging and the update run once per outer
    # iteration) -- confirm against the upstream file.
    for it, _ in dataloop(count(), wrapper=timer):
        it.set_count(episode_length)

        state = env.reset()
        ep_reward = 0

        for _step in range(episode_length):
            action = select_action(state)
            state, reward, done, _ = env.step(action)

            policy.rewards.append(reward)
            ep_reward += reward

            # we actually do not care about solving the thing
            if done:
                state = env.reset()
                ep_reward = 0

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        it.log(reward=running_reward)
        finish_episode()
def main(exp):
    """Train an RNN language model, optionally in pseudo-fp16 mode on CUDA.

    The corpus is arranged into `batch_size` columns, the model is trained
    in chunks of `bptt` rows until the rate chronometer reports it is done,
    and the validation loss is stored in ``exp.metrics["val_loss"]`` after
    each training pass.  When the validation loss does not improve, the
    learning rate is divided by four.  The test loss is printed at the end.
    """
    # dataset to use
    dataset: Argument & str

    # type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)
    model_name: Argument & str = default('LSTM')

    # size of word embeddings
    emsize: Argument & int = default(200)

    # number of hidden units per layer
    nhid: Argument & int = default(200)

    # number of layers
    nlayers: Argument & int = default(2)

    # initial learning rate
    lr: Argument & float = default(20)

    # gradient clipping
    clip: Argument & float = default(0.25)

    # upper epoch limit
    epochs: Argument & int = default(40)

    # sequence length
    bptt: Argument & int = default(35)

    # dropout applied to layers (0 = no dropout)
    dropout: Argument & float = default(0.2)

    # tie the word embedding and softmax weights
    tied: Argument & bool = default(False)

    # report interval
    log_interval: Argument & int = default(200)

    # Run model in pseudo-fp16 mode (fp16 storage fp32 math).
    fp16: Argument & bool = default(True)

    # Static loss scale, positive power of 2 values can improve fp16 convergence.
    static_loss_scale: Argument & float = default(128.0)

    # Use dynamic loss scaling.
    # If supplied, this argument supersedes --static-loss-scale.
    dynamic_loss_scale: Argument & bool = default(False)

    # path to save the final model
    save: Argument & str = default(None)

    # batch size
    batch_size: Argument & int = default(64)

    # Maximum count before stopping
    max_count: Argument & int = default(1000)

    # Number of seconds for sampling items/second
    sample_duration: Argument & float = default(0.5)

    torch_settings = init_torch()
    device = torch_settings.device

    ###########################################################################
    # Load data
    ###########################################################################

    # Ensure that the dictionary length is a multiple of 8,
    # so that the decoder's GEMMs will use Tensor Cores.
    corpus = exp.get_dataset(dataset, pad_to_multiple_of=8).corpus

    # Starting from sequential data, batchify arranges the dataset into columns.
    # For instance, with the alphabet as the sequence and batch size 4, we'd get
    # ┌ a g m s ┐
    # │ b h n t │
    # │ c i o u │
    # │ d j p v │
    # │ e k q w │
    # └ f l r x ┘.
    # These columns are treated as independent by the model, which means that the
    # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
    # batch processing.
    def batchify(data, bsz):
        """Trim `data` to a multiple of `bsz` and lay it out as `bsz` columns."""
        # Work out how cleanly we can divide the dataset into bsz parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the bsz batches.
        data = data.view(bsz, -1).t().contiguous()
        if torch_settings.cuda:
            data = data.cuda()
        return data

    eval_batch_size = 10
    train_data = batchify(corpus.train, batch_size)
    val_data = batchify(corpus.valid, eval_batch_size)
    test_data = batchify(corpus.test, eval_batch_size)

    ###########################################################################
    # Build the model
    ###########################################################################

    ntokens = len(corpus.dictionary)
    use_fp16 = fp16 and torch_settings.cuda

    if use_fp16:
        if ntokens % 8 != 0:
            print(
                "Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
                "Tensor Core use for the decoder's GEMMs.".format(ntokens))
        if emsize % 8 != 0 or nhid % 8 != 0 or batch_size % 8 != 0:
            print(
                "Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
                "to ensure Tensor Core use for the RNN's GEMMs.".format(
                    emsize, nhid, batch_size))

    model = model_module.RNNModel(model_name, ntokens, emsize, nhid, nlayers,
                                  dropout, tied).to(device)

    if use_fp16:
        model.type(torch.cuda.HalfTensor)

    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    ###########################################################################
    # Create the FP16_Optimizer instance
    ###########################################################################

    if use_fp16:
        # If dynamic_loss_scale is False, static_loss_scale will be used.
        # If dynamic_loss_scale is True, it will take precedence over static_loss_scale.
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=static_loss_scale,
                                   dynamic_loss_scale=dynamic_loss_scale)

    ###########################################################################
    # Training code
    ###########################################################################

    def repackage_hidden(h):
        """Detaches hidden states from their history."""
        if torch.is_tensor(h):
            return h.detach()
        return tuple(repackage_hidden(v) for v in h)

    # get_batch subdivides the source data into chunks of length bptt.
    # If source is equal to the example output of the batchify function, with
    # a bptt-limit of 2, we'd get the following two Variables for i = 0:
    # ┌ a g m s ┐ ┌ b h n t ┐
    # └ b h n t ┘ └ c i o u ┘
    # Note that despite the name of the function, the subdivison of data is not
    # done along the batch dimension (i.e. dimension 1), since that was handled
    # by the batchify function. The chunks are along dimension 0, corresponding
    # to the seq_len dimension in the LSTM.
    def get_batch(source, i):
        """Return rows `i..i+seq_len` of `source` and the flattened next-row targets."""
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].view(-1)
        return data, target

    def evaluate(data_source):
        """Return the mean per-row loss of the model over `data_source`."""
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_loss = 0
        hidden = model.init_hidden(eval_batch_size)
        with torch.no_grad():
            for i in range(0, data_source.size(0) - 1, bptt):
                data, targets = get_batch(data_source, i)
                output, hidden = model(data, hidden)
                output_flat = output.view(-1, ntokens)
                # total loss can overflow if accumulated in fp16.
                total_loss += len(data) * criterion(output_flat,
                                                    targets).data.float()
                hidden = repackage_hidden(hidden)
        # The chunk lengths sum to len(data_source) - 1 (the last row has no
        # target), so that is the number of rows to average over.
        return to_python_float(total_loss) / (len(data_source) - 1)

    def train(chrono):
        """Run one pass over the training data, stopping early when `chrono` is done."""
        # Turn on training mode which enables dropout.
        model.train()
        hidden = model.init_hidden(batch_size)
        for i in range(0, len(train_data), bptt):
            if chrono.done():
                break

            with chrono(count=batch_size) as it:
                data, targets = get_batch(train_data, i)
                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden = repackage_hidden(hidden)
                model.zero_grad()
                output, hidden = model(data, hidden)
                loss = criterion(output.view(-1, ntokens), targets)

                # Clipping gradients helps prevent the exploding gradient problem in RNNs / LSTMs.
                if use_fp16:
                    optimizer.backward(loss)
                    optimizer.clip_master_grads(clip)
                else:
                    loss.backward()
                    # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                    # apex.fp16_utils.clip_grad_norm selects between "torch.nn.utils.clip_grad_norm"
                    # and "torch.nn.utils.clip_grad_norm_" based on Pytorch version.
                    # It's not FP16-specific, just a small fix to avoid deprecation warnings.
                    clip_grad_norm(model.parameters(), clip)

                optimizer.step()
                it.log(loss=loss.item())

    # Loop over epochs.
    best_val_loss = None

    chrono = exp.chronos.create(
        "train",
        type="rate",
        sync=torch_settings.sync,
        sample_duration=sample_duration,
        max_count=max_count,
    )

    while not chrono.done():
        train(chrono)
        val_loss = evaluate(val_data)
        exp.metrics["val_loss"] = val_loss
        # Keep track of the best validation loss seen so far.
        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
            # The optimizer was built with the initial value, so the new
            # rate has to be written into its parameter groups to take effect.
            for group in optimizer.param_groups:
                group['lr'] = lr

    # Run on test data.
    test_loss = evaluate(test_data)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
def main(exp):
    """Train an RNN language model with manual SGD updates.

    The corpus is arranged into `batch_size` columns, the model is trained
    in chunks of `bptt` rows until the rate chronometer reports it is done,
    and the validation loss is stored in ``exp.metrics["val_loss"]`` after
    each training pass.  When the validation loss does not improve, the
    learning rate is divided by four.  The test loss is printed at the end.
    """
    # dataset to use
    dataset: Argument & str

    # type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)
    model_name: Argument & str = default('LSTM')

    # size of word embeddings
    emsize: Argument & int = default(200)

    # number of hidden units per layer
    nhid: Argument & int = default(200)

    # number of layers
    nlayers: Argument & int = default(2)

    # initial learning rate
    lr: Argument & float = default(20)

    # gradient clipping
    clip: Argument & float = default(0.25)

    # upper epoch limit
    epochs: Argument & int = default(40)

    # sequence length
    bptt: Argument & int = default(35)

    # dropout applied to layers (0 = no dropout)
    dropout: Argument & float = default(0.2)

    # tie the word embedding and softmax weights
    tied: Argument & bool = default(False)

    # report interval
    log_interval: Argument & int = default(200)

    # path to save the final model
    save: Argument & str = default(None)

    # path to export the final model in onnx format
    onnx_export: Argument & str = default('')

    # batch size
    batch_size: Argument & int = default(64)

    # Maximum count before stopping
    max_count: Argument & int = default(1000)

    # Number of seconds for sampling items/second
    sample_duration: Argument & float = default(0.5)

    torch_settings = init_torch()
    device = torch_settings.device

    ###########################################################################
    # Load data
    ###########################################################################

    corpus = exp.get_dataset(dataset).corpus

    # Starting from sequential data, batchify arranges the dataset into columns.
    # For instance, with the alphabet as the sequence and batch size 4, we'd get
    # ┌ a g m s ┐
    # │ b h n t │
    # │ c i o u │
    # │ d j p v │
    # │ e k q w │
    # └ f l r x ┘.
    # These columns are treated as independent by the model, which means that the
    # dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
    # batch processing.
    def batchify(data, bsz):
        """Trim `data` to a multiple of `bsz` and lay it out as `bsz` columns."""
        # Work out how cleanly we can divide the dataset into bsz parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the bsz batches.
        data = data.view(bsz, -1).t().contiguous()
        return data.to(device)

    eval_batch_size = 10
    train_data = batchify(corpus.train, batch_size)
    val_data = batchify(corpus.valid, eval_batch_size)
    test_data = batchify(corpus.test, eval_batch_size)

    ###########################################################################
    # Build the model
    ###########################################################################

    ntokens = len(corpus.dictionary)
    model = model_module.RNNModel(model_name, ntokens, emsize, nhid, nlayers,
                                  dropout, tied).to(device)

    criterion = nn.CrossEntropyLoss()

    ###########################################################################
    # Training code
    ###########################################################################

    def repackage_hidden(h):
        """Wraps hidden states in new Tensors, to detach them from their history."""
        if isinstance(h, torch.Tensor):
            return h.detach()
        return tuple(repackage_hidden(v) for v in h)

    # get_batch subdivides the source data into chunks of length bptt.
    # If source is equal to the example output of the batchify function, with
    # a bptt-limit of 2, we'd get the following two Variables for i = 0:
    # ┌ a g m s ┐ ┌ b h n t ┐
    # └ b h n t ┘ └ c i o u ┘
    # Note that despite the name of the function, the subdivison of data is not
    # done along the batch dimension (i.e. dimension 1), since that was handled
    # by the batchify function. The chunks are along dimension 0, corresponding
    # to the seq_len dimension in the LSTM.
    def get_batch(source, i):
        """Return rows `i..i+seq_len` of `source` and the flattened next-row targets."""
        seq_len = min(bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].view(-1)
        return data, target

    def evaluate(data_source):
        """Return the mean per-row loss of the model over `data_source`."""
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_loss = 0.
        hidden = model.init_hidden(eval_batch_size)
        with torch.no_grad():
            for i in range(0, data_source.size(0) - 1, bptt):
                data, targets = get_batch(data_source, i)
                output, hidden = model(data, hidden)
                output_flat = output.view(-1, ntokens)
                total_loss += len(data) * criterion(output_flat,
                                                    targets).item()
                hidden = repackage_hidden(hidden)
        return total_loss / (len(data_source) - 1)

    def train(chrono):
        """Run one pass over the training data, stopping early when `chrono` is done."""
        # Turn on training mode which enables dropout.
        model.train()
        hidden = model.init_hidden(batch_size)
        for i in range(0, len(train_data), bptt):
            if chrono.done():
                break

            with chrono(count=batch_size) as it:
                data, targets = get_batch(train_data, i)
                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden = repackage_hidden(hidden)
                model.zero_grad()
                output, hidden = model(data, hidden)
                loss = criterion(output.view(-1, ntokens), targets)
                loss.backward()

                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
                # Plain SGD step.  `lr` is read from the enclosing scope, so
                # the annealing done in the epoch loop is picked up here.
                # The keyword form replaces the deprecated
                # `add_(scalar, tensor)` overload.
                for p in model.parameters():
                    p.data.add_(p.grad.data, alpha=-lr)

                it.log(loss=loss.item())

    # NOTE(review): `export_onnx` is defined but never called in this
    # function, so `--onnx-export` currently has no effect.
    def export_onnx(path, batch_size, seq_len):
        """Export the model in ONNX format to `path` using an all-zero dummy input."""
        print('The model is also exported in ONNX format at {}'.format(
            os.path.realpath(path)))
        model.eval()
        dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(
            -1, batch_size).to(device)
        hidden = model.init_hidden(batch_size)
        torch.onnx.export(model, (dummy_input, hidden), path)

    # Loop over epochs.
    best_val_loss = None

    chrono = exp.chronos.create(
        "train",
        type="rate",
        sync=torch_settings.sync,
        sample_duration=sample_duration,
        max_count=max_count,
    )

    while not chrono.done():
        train(chrono)
        val_loss = evaluate(val_data)
        exp.metrics["val_loss"] = val_loss
        # Keep track of the best validation loss seen so far.
        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0

    # Run on test data.
    test_loss = evaluate(test_data)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
def main(exp):
    """Fit a single linear layer to a randomly drawn polynomial.

    Batches of ``(features, f(features))`` pairs are generated endlessly and
    consumed through ``dataloop``; each iteration logs the smooth-L1 loss and
    applies a manual gradient step with a fixed step size of 0.01. When the
    loop ends, the learned and the target polynomials are printed.

    Args:
        exp: experiment object handed to ``iteration_wrapper``.
    """
    # Degree of the polynomial
    poly_degree: Argument & int = default(4)

    # Number of examples per batch
    batch_size: Argument & int = default(64)

    torch_settings = init_torch()
    device = torch_settings.device

    W_target = torch.randn(poly_degree, 1) * 5
    b_target = torch.randn(1) * 5

    def make_features(x):
        """Builds features i.e. a matrix with columns [x, x^2, ..., x^poly_degree]."""
        x = x.unsqueeze(1)
        return torch.cat([x ** i for i in range(1, poly_degree + 1)], 1)

    def f(x):
        """Approximated function."""
        return x.mm(W_target) + b_target.item()

    def poly_desc(W, b):
        """Creates a string description of a polynomial.

        Weight ``W[i]`` multiplies feature column ``i``, which holds
        ``x^(i + 1)``, so exponents are numbered from 1 upward.
        """
        terms = ''.join(
            '{:+.2f} x^{} '.format(w, power)
            for power, w in enumerate(W, start=1)
        )
        return 'y = ' + terms + '{:+.2f}'.format(b[0])

    def get_batch():
        """Builds a batch i.e. (x, f(x)) pair."""
        random = torch.randn(batch_size)
        x = make_features(random)
        y = f(x)
        return x, y

    def dataset():
        """Yield freshly sampled batches forever; the consumer decides when to stop."""
        while True:
            yield get_batch()

    # Define model
    fc = torch.nn.Linear(W_target.size(0), 1)
    fc.to(device)

    wrapper = iteration_wrapper(exp, sync=torch_settings.sync)

    for it, (batch_x, batch_y) in dataloop(dataset(), wrapper=wrapper):
        it.set_count(batch_size)

        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # Reset gradients
        fc.zero_grad()

        # Forward pass
        output = F.smooth_l1_loss(fc(batch_x), batch_y)
        loss = output.item()
        it.log(loss=loss)

        # Backward pass
        output.backward()

        # Apply gradients
        for param in fc.parameters():
            param.data.add_(-0.01 * param.grad.data)

    print('==> Learned function:\t', poly_desc(fc.weight.view(-1), fc.bias))
    print('==> Actual function:\t', poly_desc(W_target.view(-1), b_target))
def main(exp):
    """Collect the benchmark options and hand them to ``run_benchmarking``.

    Parses the device-id list, bundles the distributed-training settings into
    a dict, checks that all of them are present when
    ``distributed_dataparallel`` is requested, and then starts the benchmark.

    Args:
        exp: experiment object passed to ``iteration_wrapper`` and
            ``run_benchmarking``.
    """
    # NOTE(review): this list is not read anywhere in this function; it looks
    # like a catalogue of accepted --network values -- confirm before removing.
    models = [
        'alexnet',
        'vgg11', 'vgg13', 'vgg16', 'vgg19',
        'vgg11_bn', 'vgg13_bn', 'vgg16_bn', 'vgg19_bn',
        'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152',
        'shufflenet', 'shufflenet_v2_x05', 'shufflenet_v2_x10', 'shufflenet_v2_x15',
        'SqueezeNet', 'SqueezeNet1.1',
        'densenet121', 'densenet169', 'densenet201', 'densenet161',
        'inception', 'inception_v3',
        'resnext50', 'resnext101',
        'mobilenet_v2',
        'googlenet',
        'deeplabv3_resnet50', 'deeplabv3_resnet101',
        'fcn_resnet50', 'fcn_resnet101'
    ]

    # Network to run.
    network: Argument & str

    # Batch size (will be split among devices used by this invocation)
    batch_size: Argument & int = default(64)

    # FP16 mixed precision benchmarking
    fp16: Argument & int = default(0)

    # Use torch.nn.DataParallel api to run single process on multiple devices. Use only one of --dataparallel or --distributed_dataparallel
    dataparallel: Argument & bool = default(False)

    # Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Use only one of --dataparallel or --distributed_dataparallel
    distributed_dataparallel: Argument & bool = default(False)

    # Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run dataparallel or distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.
    device_ids: Argument & str = default(None)

    # Rank of this process. Required for --distributed_dataparallel
    rank: Argument & int = default(None)

    # Total number of ranks/processes. Required for --distributed_dataparallel
    world_size: Argument & int = default(None)

    # Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel
    dist_backend: Argument & str = default(None)

    # url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. Required for --distributed_dataparallel
    dist_url: Argument & str = default(None)

    torch_settings = init_torch()

    # "0,1,3" -> [0, 1, 3]; an empty or missing value means "no explicit list".
    device_ids_values = (
        [int(token) for token in device_ids.split(",")] if device_ids else None
    )

    distributed_parameters = {
        'rank': rank,
        'world_size': world_size,
        'dist_backend': dist_backend,
        'dist_url': dist_url,
    }

    # Some arguments are required for distributed_dataparallel
    if distributed_dataparallel:
        assert all(
            setting is not None for setting in distributed_parameters.values()
        ), "rank, world-size, dist-backend and dist-url are required arguments for distributed_dataparallel"

    wrapper = iteration_wrapper(exp, sync=torch_settings.sync)

    run_benchmarking(
        exp,
        wrapper,
        network,
        batch_size,
        fp16,
        dataparallel,
        distributed_dataparallel,
        device_ids_values,
        distributed_parameters,
    )
def main(exp):
    """Train a ``VAE`` on the requested dataset, logging the loss per batch.

    Builds train and test data loaders, an Adam optimizer with learning rate
    1e-3, and iterates over the training loader through ``dataloop``. The
    nested ``test`` helper is defined but not called from this function.

    Args:
        exp: experiment object used to resolve the dataset and to build the
            iteration wrapper.
    """
    # Batch size
    batch_size: Argument & int = default(256)

    # Dataset to use
    dataset: Argument

    torch_settings = init_torch()
    device = torch_settings.device

    dataset = exp.get_dataset(dataset)

    # Worker/pinning options are only passed to the loaders when CUDA is on.
    kwargs = {
        'num_workers': 1,
        'pin_memory': True
    } if torch_settings.cuda else {}

    train_loader = torch.utils.data.DataLoader(
        dataset.train,
        batch_size=batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        dataset.test,
        batch_size=batch_size,
        shuffle=True,
        **kwargs,
    )

    model = VAE().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Reconstruction + KL divergence losses summed over all elements and batch
    def loss_function(recon_x, x, mu, logvar):
        """Return summed binary cross-entropy plus the KL divergence term."""
        BCE = F.binary_cross_entropy(recon_x, x.view(-1, 784), reduction='sum')

        # see Appendix B from VAE paper:
        # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
        # https://arxiv.org/abs/1312.6114
        # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

        return BCE + KLD

    def test(epoch):
        """Print the mean test loss and save a comparison image for ``epoch``.

        The image stacks up to 8 inputs of the first test batch on top of
        their reconstructions.
        """
        # Not tested
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for i, (data, _) in enumerate(test_loader):
                data = data.to(device)
                recon_batch, mu, logvar = model(data)
                test_loss += loss_function(recon_batch, data, mu, logvar).item()
                if i == 0:
                    n = min(data.size(0), 8)
                    # Infer the leading dimension with -1: the batch may hold
                    # fewer than `batch_size` samples, in which case a
                    # hard-coded batch_size would make view() fail.
                    comparison = torch.cat([
                        data[:n],
                        recon_batch.view(-1, 1, 28, 28)[:n]
                    ])
                    save_image(comparison.cpu(),
                               'results/reconstruction_' + str(epoch) + '.png', nrow=n)

        test_loss /= len(test_loader.dataset)
        print('====> Test set loss: {:.4f}'.format(test_loss))

    model.train()

    wrapper = iteration_wrapper(exp, sync=torch_settings.sync)

    # Labels are not used by the VAE objective.
    for it, (data, _) in dataloop(train_loader, wrapper=wrapper):
        # Count the real batch length; the last batch can be short.
        it.set_count(len(data))

        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_function(recon_batch, data, mu, logvar)
        loss.backward()
        it.log(loss=loss.item())
        optimizer.step()
def main(exp):
    """Train an actor-critic agent (a2c, ppo or acktr) on vectorized environments.

    Every ``dataloop`` iteration collects ``num_steps`` transitions from each
    of the ``num_processes`` environments, computes returns, performs one
    agent update and logs the value and action losses. The environments are
    closed when the loop finishes, whether it ends normally or by raising.

    Args:
        exp: experiment object handed to ``iteration_wrapper``.
    """
    # Algorithm to use: a2c | ppo | acktr
    algorithm: Argument = default("a2c")

    # Gail epochs (default: 5)
    gail_epoch: Argument & int = default(5)

    # Learning rate (default: 7e-4)
    lr: Argument & float = default(7e-4)

    # Directory that contains expert demonstrations for gail
    gail_experts_dir: Argument = default("./gail_experts")

    # Gail batch size (default: 128)
    gail_batch_size: Argument & int = default(128)

    # Do imitation learning with gail
    gail: Argument & bool = default(False)

    # RMSprop optimizer epsilon (default: 1e-5)
    eps: Argument & float = default(1e-5)

    # RMSprop optimizer apha (default: 0.99)
    alpha: Argument & float = default(0.99)

    # discount factor for rewards (default: 0.99)
    gamma: Argument & float = default(0.99)

    # use generalized advantage estimation
    use_gae: Argument & bool = default(False)

    # gae lambda parameter (default: 0.95)
    gae_lambda: Argument & float = default(0.95)

    # entropy term coefficient (default: 0.01)
    entropy_coef: Argument & float = default(0.01)

    # value loss coefficient (default: 0.5)
    value_loss_coef: Argument & float = default(0.5)

    # max norm of gradients (default: 0.5)
    max_grad_norm: Argument & float = default(0.5)

    # sets flags for determinism when using CUDA (potentially slow!)
    cuda_deterministic: Argument & bool = default(False)

    # how many training CPU processes to use (default: 16)
    num_processes: Argument & int = default(16)

    # number of forward steps in A2C (default: 5)
    num_steps: Argument & int = default(5)

    # number of ppo epochs (default: 4)
    ppo_epoch: Argument & int = default(4)

    # number of batches for ppo (default: 32)
    num_mini_batch: Argument & int = default(32)

    # ppo clip parameter (default: 0.2)
    clip_param: Argument & float = default(0.2)

    # # log interval, one log per n updates (default: 10)
    # log_interval: Argument & int = default(10)

    # # save interval, one save per n updates (default: 100)
    # save_interval: Argument & int = default(100)

    # # eval interval, one eval per n updates (default: None)
    # eval_interval: Argument & int = default(None)

    # number of environment steps to train (default: 10e6)
    num_env_steps: Argument & int = default(10e6)

    # environment to train on (default: PongNoFrameskip-v4)
    env_name: Argument = default('PongNoFrameskip-v4')

    # directory to save agent logs (default: /tmp/gym)
    log_dir: Argument = default(None)

    # directory to save agent logs (default: ./trained_models/)
    save_dir: Argument = default('./trained_models/')

    # compute returns taking into account time limits
    use_proper_time_limits: Argument & bool = default(False)

    # use a recurrent policy
    recurrent_policy: Argument & bool = default(False)

    # use a linear schedule on the learning rate')
    use_linear_lr_decay: Argument & bool = default(False)

    # Seed to use
    seed: Argument & int = default(1234)

    # Number of iterations
    iterations: Argument & int = default(10)

    # we compute steps/sec
    batch_size = num_processes

    torch_settings = init_torch()
    device = torch_settings.device

    assert algorithm in ['a2c', 'ppo', 'acktr']

    if recurrent_policy:
        assert algorithm in ['a2c', 'ppo'], \
            'Recurrent policy is not implemented for ACKTR'

    # Number of updates covering num_env_steps; only consulted by the linear
    # learning-rate schedule below. int() is needed because the default for
    # num_env_steps is written as the float 10e6.
    num_updates = int(num_env_steps) // num_steps // num_processes

    envs = make_vec_envs(env_name, seed, num_processes, gamma, log_dir, device, False)

    # Everything that uses the environments runs under try/finally so that
    # envs.close() is reached even when training raises.
    try:
        actor_critic = Policy(
            envs.observation_space.shape,
            envs.action_space,
            base_kwargs={'recurrent': recurrent_policy})
        actor_critic.to(device)

        if algorithm == 'a2c':
            agent = algo.A2C_ACKTR(
                actor_critic,
                value_loss_coef,
                entropy_coef,
                lr=lr,
                eps=eps,
                alpha=alpha,
                max_grad_norm=max_grad_norm)
        elif algorithm == 'ppo':
            agent = algo.PPO(
                actor_critic,
                clip_param,
                ppo_epoch,
                num_mini_batch,
                value_loss_coef,
                entropy_coef,
                lr=lr,
                eps=eps,
                max_grad_norm=max_grad_norm)
        elif algorithm == 'acktr':
            agent = algo.A2C_ACKTR(
                actor_critic,
                value_loss_coef,
                entropy_coef,
                acktr=True)

        rollouts = RolloutStorage(
            num_steps,
            num_processes,
            envs.observation_space.shape,
            envs.action_space,
            actor_critic.recurrent_hidden_state_size)

        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)

        # Rewards of the 10 most recently finished episodes. Nothing in this
        # function reads the deque; it is kept for ad-hoc inspection.
        episode_rewards = deque(maxlen=10)

        wrapper = iteration_wrapper(exp, sync=torch_settings.sync)

        for it, j in dataloop(count(), wrapper=wrapper):
            it.set_count(batch_size)

            if use_linear_lr_decay:
                utils.update_linear_schedule(
                    agent.optimizer, j, num_updates,
                    agent.optimizer.lr if algorithm == "acktr" else lr)

            for step in range(num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.obs[step],
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = envs.step(action)

                for info in infos:
                    if 'episode' in info.keys():
                        episode_rewards.append(info['episode']['r'])

                # If done then clean the history of observations.
                masks = torch.FloatTensor(
                    [[0.0] if done_ else [1.0] for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                rollouts.insert(obs, recurrent_hidden_states, action,
                                action_log_prob, value, reward, masks, bad_masks)

            with torch.no_grad():
                next_value = actor_critic.get_value(
                    rollouts.obs[-1],
                    rollouts.recurrent_hidden_states[-1],
                    rollouts.masks[-1]).detach()

            rollouts.compute_returns(next_value, use_gae, gamma,
                                     gae_lambda, use_proper_time_limits)

            # dist_entropy is returned by the update but not reported.
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
            it.log(
                value_loss=value_loss,
                action_loss=action_loss,
            )

            rollouts.after_update()
    finally:
        envs.close()