def apply_gradients(self, grads_and_vars):
    self._iterations += 1
    grads, var_list = list(zip(*grads_and_vars))
    new_grads = []

    if self._summaries:
        summary.scalar("optimizer/scale", self._scale,
                       utils.get_global_step())

    for grad in grads:
        if grad is None:
            new_grads.append(None)
            continue

        norm = grad.data.norm()

        if not torch.isfinite(norm):
            self._update_if_not_finite_grads()
            return
        else:
            # Rescale gradients
            new_grads.append(grad.data.float().mul_(1.0 / self._scale))

    self._update_if_finite_grads()
    self._optimizer.apply_gradients(zip(new_grads, var_list))
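# apply_gradients above divides each gradient by the loss scale and skips the
# step entirely when a non-finite gradient norm appears. Below is a minimal,
# self-contained sketch of that same loss-scaling arithmetic with plain
# PyTorch; `scaled_backward_sketch`, `loss_fn`, `params` and the fixed `scale`
# are illustrative names only, not part of this module's API.
import torch


def scaled_backward_sketch(loss_fn, params, scale=2.0 ** 15):
    """Scale the loss before backward, then unscale the gradients."""
    loss = loss_fn() * scale          # amplify small fp16 gradients
    loss.backward()

    grads = [p.grad for p in params if p.grad is not None]
    total_norm = torch.norm(torch.stack([g.norm() for g in grads]))

    if not torch.isfinite(total_norm):
        # Overflow: drop this step (a dynamic scaler would also lower `scale`).
        for p in params:
            p.grad = None
        return None

    for g in grads:
        g.mul_(1.0 / scale)           # undo the scaling before the update
    return total_norm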
def __call__(self, step):
    if step <= self._warmup_steps:
        lr_step = self._maximum_learning_rate - self._initial_learning_rate
        lr_step /= self._warmup_steps
        lr = self._initial_learning_rate + lr_step * step
    else:
        lr = self._maximum_learning_rate

        if self._warmup_steps != 0:
            # approximately hidden_size ** -0.5
            lr = lr * self._warmup_steps ** 0.5

        lr = lr * (step ** -0.5)

    if self._summary:
        summary.scalar("learning_rate", lr, utils.get_global_step())

    return lr
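# The schedule above warms up linearly to the maximum learning rate and then
# decays proportionally to step ** -0.5, rescaled by warmup_steps ** 0.5 so the
# two branches meet at the warmup boundary. A standalone sketch of the same
# arithmetic follows; the function name and hyper-parameter values are
# illustrative, not this codebase's defaults.


def warmup_rsqrt_lr(step, init_lr=1e-7, max_lr=7e-4, warmup_steps=4000):
    if step <= warmup_steps:
        return init_lr + (max_lr - init_lr) / warmup_steps * step
    return max_lr * warmup_steps ** 0.5 * step ** -0.5


# Example: the decay branch continues smoothly from the warmup peak.
# warmup_rsqrt_lr(4000) == 7e-4, and warmup_rsqrt_lr(16000) == 3.5e-4 (half the
# peak, since the step count quadrupled).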
def __call__(self, step):
    # See reference: The Best of Both Worlds: Combining Recent Advances
    # in Neural Machine Translation
    n = self._n
    p = self._warmup_steps / n
    s = n * self._start_decay_step
    e = n * self._end_decay_step

    learning_rate = self._learning_rate
    learning_rate *= min(
        1.0 + (n - 1) * step / float(n * p),
        n,
        n * ((2 * n) ** (float(s - n * step) / float(e - s))))

    if self._summary:
        summary.scalar("learning_rate", learning_rate,
                       utils.get_global_step())

    return learning_rate
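# The schedule above (from "The Best of Both Worlds", Chen et al., 2018)
# multiplies the base rate by the minimum of a linear warmup term, a constant
# plateau n, and an exponential decay term. The standalone sketch below uses
# illustrative hyper-parameter values (not this codebase's defaults) to make
# the three phases visible.


def bow_lr(step, base_lr=1e-4, n=2, warmup_steps=8000,
           start_decay_step=50000, end_decay_step=300000):
    p = warmup_steps / n
    s = n * start_decay_step
    e = n * end_decay_step
    return base_lr * min(
        1.0 + (n - 1) * step / float(n * p),
        n,
        n * ((2 * n) ** (float(s - n * step) / float(e - s))))


# bow_lr(0) == base_lr, the rate reaches n * base_lr at step == warmup_steps,
# stays there until start_decay_step, and decays to base_lr / 2 by
# end_decay_step.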
def __call__(self, step):
    boundaries = self._boundaries
    values = self._values
    learning_rate = values[0]

    if step <= boundaries[0]:
        learning_rate = values[0]
    elif step > boundaries[-1]:
        learning_rate = values[-1]
    else:
        for low, high, v in zip(boundaries[:-1], boundaries[1:],
                                values[1:-1]):
            if step > low and step <= high:
                learning_rate = v
                break

    if self._summary:
        summary.scalar("learning_rate", learning_rate,
                       utils.get_global_step())

    return learning_rate
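# The piecewise-constant schedule above returns values[i] on the interval
# (boundaries[i-1], boundaries[i]] and values[-1] after the last boundary. The
# same lookup can be written with the standard library's bisect module; this is
# a sketch for clarity, not the implementation used here.
import bisect


def piecewise_constant_lr(step, boundaries, values):
    # bisect_left counts how many boundaries are strictly below `step`, which
    # is exactly the index of the matching learning-rate value.
    return values[bisect.bisect_left(boundaries, step)]


# Example: piecewise_constant_lr(90000, [50000, 100000], [1e-3, 1e-4, 1e-5])
# returns 1e-4.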
def _save_summary(grads_and_vars):
    total_norm = 0.0

    for grad, var in grads_and_vars:
        if grad is None:
            continue

        _, var = var
        grad_norm = grad.data.norm()
        total_norm += grad_norm ** 2
        summary.histogram(var.tensor_name, var,
                          utils.get_global_step())
        summary.scalar("norm/" + var.tensor_name, var.norm(),
                       utils.get_global_step())
        summary.scalar("grad_norm/" + var.tensor_name, grad_norm,
                       utils.get_global_step())

    total_norm = total_norm ** 0.5
    summary.scalar("grad_norm", total_norm, utils.get_global_step())

    return float(total_norm)
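# _save_summary accumulates squared per-parameter gradient norms and takes a
# square root, i.e. the global L2 norm over all gradients. The same quantity
# can be computed directly with PyTorch; a self-contained sketch, where `model`
# is any nn.Module whose .grad fields are already populated.
import torch


def global_grad_norm(model):
    norms = [p.grad.detach().norm() for p in model.parameters()
             if p.grad is not None]
    if not norms:
        return 0.0
    return float(torch.norm(torch.stack(norms)))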
def main(args):
    model_cls = models.get_model(args.model)

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = default_params()
    params = merge_params(params, model_cls.default_params(args.hparam_set))
    params = import_params(args.output, args.model, params)
    params = override_params(params, args)

    # Initialize distributed utility
    if args.distributed:
        dist.init_process_group("nccl")
        torch.cuda.set_device(args.local_rank)
        torch.set_default_tensor_type(torch.cuda.FloatTensor)
    else:
        dist.init_process_group("nccl", init_method=args.url,
                                rank=args.local_rank,
                                world_size=len(params.device_list))
        torch.cuda.set_device(params.device_list[args.local_rank])
        torch.set_default_tensor_type(torch.cuda.FloatTensor)

    # Export parameters
    if dist.get_rank() == 0:
        export_params(params.output, "params.json", params)
        export_params(params.output, "%s.json" % params.model,
                      collect_params(params, model_cls.default_params()))

    model = model_cls(params).cuda()

    if args.half:
        model = model.half()
        torch.set_default_dtype(torch.half)
        torch.set_default_tensor_type(torch.cuda.HalfTensor)

    model.train()

    # Init tensorboard
    summary.init(params.output, params.save_summary)

    schedule = get_learning_rate_schedule(params)
    clipper = get_clipper(params)
    optimizer = get_optimizer(params, schedule, clipper)

    if args.half:
        optimizer = optimizers.LossScalingOptimizer(optimizer)

    optimizer = optimizers.MultiStepOptimizer(optimizer, params.update_cycle)

    trainable_flags = print_variables(model, params.pattern,
                                      dist.get_rank() == 0)

    dataset = data.get_dataset(params.input, "train", params)

    if params.validation:
        sorted_key, eval_dataset = data.get_dataset(
            params.validation, "infer", params)
        references = load_references(params.references)
    else:
        sorted_key = None
        eval_dataset = None
        references = None

    # Load checkpoint
    checkpoint = utils.latest_checkpoint(params.output)

    if args.checkpoint is not None:
        # Load pre-trained models
        state = torch.load(args.checkpoint, map_location="cpu")
        model.load_state_dict(state["model"])
        step = params.initial_step
        epoch = 0
        broadcast(model)
    elif checkpoint is not None:
        state = torch.load(checkpoint, map_location="cpu")
        step = state["step"]
        epoch = state["epoch"]
        model.load_state_dict(state["model"])

        if "optimizer" in state:
            optimizer.load_state_dict(state["optimizer"])
    else:
        step = 0
        epoch = 0
        broadcast(model)

    def train_fn(inputs):
        features, labels = inputs
        loss = model(features, labels)
        return loss

    counter = 0

    while True:
        for features in dataset:
            if counter % params.update_cycle == 0:
                step += 1
                utils.set_global_step(step)

            counter += 1
            t = time.time()
            features = data.lookup(features, "train", params)
            loss = train_fn(features)
            gradients = optimizer.compute_gradients(loss,
                                                    list(model.parameters()))
            grads_and_vars = exclude_variables(
                trainable_flags,
                zip(gradients, list(model.named_parameters())))
            optimizer.apply_gradients(grads_and_vars)

            t = time.time() - t
            summary.scalar("loss", loss, step, write_every_n_steps=1)
            summary.scalar("global_step/sec", t, step)

            print("epoch = %d, step = %d, loss = %.3f (%.3f sec)" %
                  (epoch + 1, step, float(loss), t))

            if counter % params.update_cycle == 0:
                if step >= params.train_steps:
                    utils.evaluate(model, sorted_key, eval_dataset,
                                   params.output, references, params)
                    save_checkpoint(step, epoch, model, optimizer, params)

                    if dist.get_rank() == 0:
                        summary.close()

                    return

                if step % params.eval_steps == 0:
                    utils.evaluate(model, sorted_key, eval_dataset,
                                   params.output, references, params)

                if step % params.save_checkpoint_steps == 0:
                    save_checkpoint(step, epoch, model, optimizer, params)

        epoch += 1
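# In the training loop above, `step` advances only once every
# `params.update_cycle` micro-batches, so the MultiStepOptimizer wrapper
# presumably accumulates gradients between cycle boundaries and applies them on
# the last one. A minimal sketch of that accumulation pattern with a plain
# torch optimizer; names here are hypothetical, not this repository's
# MultiStepOptimizer.
import torch


def accumulate_and_step(optimizer, loss, counter, update_cycle):
    # Scale each micro-batch loss so the accumulated gradient matches the
    # average over the full effective batch.
    (loss / update_cycle).backward()

    if (counter + 1) % update_cycle == 0:
        optimizer.step()
        optimizer.zero_grad()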
def evaluate(model, sorted_key, dataset, base_dir, references, params):
    if not references:
        return

    base_dir = base_dir.rstrip("/")
    save_path = os.path.join(base_dir, "eval")
    record_name = os.path.join(save_path, "record")
    log_name = os.path.join(save_path, "log")
    max_to_keep = params.keep_top_checkpoint_max

    if dist.get_rank() == 0:
        # Create directory and copy files
        if not os.path.exists(save_path):
            print("Making dir: %s" % save_path)
            os.makedirs(save_path)

            params_pattern = os.path.join(base_dir, "*.json")
            params_files = glob.glob(params_pattern)

            for name in params_files:
                new_name = name.replace(base_dir, save_path)
                shutil.copy(name, new_name)

    # Do validation here
    global_step = get_global_step()

    if dist.get_rank() == 0:
        print("-" * 90)
        print("Validating model at step %d" % global_step)

    score = _evaluate_model(model, sorted_key, dataset, references, params)

    # Save records
    if dist.get_rank() == 0:
        scalar("BLEU/score", score, global_step, write_every_n_steps=1)
        print("BLEU at step %d: %f" % (global_step, score))

        # Save checkpoint to save_path
        save({"model": model.state_dict(), "step": global_step}, save_path)

        _save_log(log_name, ("BLEU", global_step, score))
        records = _read_score_record(record_name)
        record = [latest_checkpoint(save_path).split("/")[-1], score]

        added, removed, records = _add_to_record(records, record, max_to_keep)

        if added is None:
            # Remove latest checkpoint
            filename = latest_checkpoint(save_path)
            print("Removing %s" % filename)
            files = glob.glob(filename + "*")

            for name in files:
                os.remove(name)

        if removed is not None:
            filename = os.path.join(save_path, removed)
            print("Removing %s" % filename)
            files = glob.glob(filename + "*")

            for name in files:
                os.remove(name)

        _save_score_record(record_name, records)

        best_score = records[0][1]
        print("Best score at step %d: %f" % (global_step, best_score))
        print("-" * 90)
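# evaluate() keeps at most `keep_top_checkpoint_max` (checkpoint, BLEU) records
# and deletes any checkpoint that falls off the list: `added is None` means the
# new checkpoint did not make the cut, `removed` names one that did but has now
# been displaced. A sketch of that bookkeeping under those assumptions; this is
# a hypothetical helper, not the _add_to_record used above.


def add_to_record_sketch(records, new_record, max_to_keep):
    """records: list of [checkpoint_name, score], sorted by score descending."""
    added, removed = new_record, None
    records = sorted(records + [new_record], key=lambda r: r[1], reverse=True)

    if len(records) > max_to_keep:
        dropped = records.pop()
        if dropped is new_record:
            added = None          # new checkpoint did not make the cut
        else:
            removed = dropped[0]  # name of the checkpoint to delete

    return added, removed, records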
def main(args):
    model_cls = models.get_model(args.model)

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = default_params()
    params = merge_params(params, model_cls.default_params(args.hparam_set))
    params = import_params(args.output, args.model, params)
    params = override_params(params, args)

    # Initialize distributed utility
    if args.distributed:
        dist.init_process_group("nccl")
        torch.cuda.set_device(args.local_rank)
        torch.set_default_tensor_type(torch.cuda.FloatTensor)
    else:
        dist.init_process_group("nccl", init_method=args.url,
                                rank=args.local_rank,
                                world_size=len(params.device_list))
        torch.cuda.set_device(params.device_list[args.local_rank])
        torch.set_default_tensor_type(torch.cuda.FloatTensor)

    # Export parameters
    if dist.get_rank() == 0:
        export_params(params.output, "params.json", params)
        export_params(params.output, "%s.json" % params.model,
                      collect_params(params, model_cls.default_params()))

    model = model_cls(params).cuda()

    if args.half:
        model = model.half()
        torch.set_default_dtype(torch.half)
        torch.set_default_tensor_type(torch.cuda.HalfTensor)

    model.train()

    # Init tensorboard
    summary.init(params.output, params.save_summary)

    schedule = get_learning_rate_schedule(params)
    clipper = get_clipper(params)

    if params.optimizer.lower() == "adam":
        optimizer = optimizers.AdamOptimizer(learning_rate=schedule,
                                             beta_1=params.adam_beta1,
                                             beta_2=params.adam_beta2,
                                             epsilon=params.adam_epsilon,
                                             clipper=clipper,
                                             summaries=params.save_summary)
    elif params.optimizer.lower() == "adadelta":
        optimizer = optimizers.AdadeltaOptimizer(
            learning_rate=schedule, rho=params.adadelta_rho,
            epsilon=params.adadelta_epsilon, clipper=clipper,
            summaries=params.save_summary)
    elif params.optimizer.lower() == "sgd":
        optimizer = optimizers.SGDOptimizer(learning_rate=schedule,
                                            clipper=clipper,
                                            summaries=params.save_summary)
    else:
        raise ValueError("Unknown optimizer %s" % params.optimizer)

    if args.half:
        optimizer = optimizers.LossScalingOptimizer(optimizer)

    optimizer = optimizers.MultiStepOptimizer(optimizer, params.update_cycle)

    if dist.get_rank() == 0:
        print_variables(model)

    if params.from_torchtext:
        dataset = data.get_dataset_torchtext(params.input, "train", params)
    else:
        dataset = data.get_dataset(params.input, "train", params)

    if params.validation:
        if params.from_torchtext:
            eval_dataset = data.get_dataset_torchtext(params.validation,
                                                      "infer", params)
        else:
            eval_dataset = data.get_dataset(params.validation, "infer", params)
        references = load_references(params.references)
    else:
        eval_dataset = None
        references = None

    # Load checkpoint
    checkpoint = utils.latest_checkpoint(params.output)

    if args.checkpoint is not None:
        # Load pre-trained models
        state = torch.load(args.checkpoint, map_location="cpu")
        model.load_state_dict(state["model"], strict=False)
        step = params.initial_step
        epoch = 0
        broadcast(model)
    elif checkpoint is not None:
        state = torch.load(checkpoint, map_location="cpu")
        step = state["step"]
        epoch = state["epoch"]
        model.load_state_dict(state["model"])

        if "optimizer" in state:
            optimizer.load_state_dict(state["optimizer"])
    else:
        step = 0
        epoch = 0
        broadcast(model)

    def train_fn(inputs):
        features, labels = inputs
        loss, state = model(features, labels)
        return loss, state

    counter = 0
    state = None

    if params.model == "cachedtransformer":
        last_feature = None

    while True:
        start_time = time.time()

        for features in dataset:
            if counter % params.update_cycle == 0:
                step += 1
                utils.set_global_step(step)

            counter += 1
            t = time.time()
            features = data.lookup(features, "train", params,
                                   from_torchtext=params.from_torchtext)

            if model.name == "cachedtransformer":
                features = utils.update_cache(model, features, state,
                                              last_feature)
                last_feature = features[0]

            loss, state = train_fn(features)
            gradients = optimizer.compute_gradients(loss,
                                                    list(model.parameters()))
            grads_and_vars = optimizers.exclude_variables(
                params.pattern,
                zip(gradients, list(model.named_parameters())))
            optimizer.apply_gradients(grads_and_vars)

            t = time.time() - t
            summary.scalar("loss", loss, step, write_every_n_steps=1)
            summary.scalar("global_step/sec", t, step)

            if counter % params.update_cycle == 0:
                if step > 0 and step % args.log_interval == 0:
                    elapsed = time.time() - start_time
                    print('| epoch {:2d} | step {:8d} | lr {:02.2e} | '
                          'ms/step {:4.0f} | loss {:8.4f} '.format(
                              epoch + 1, step,
                              optimizer._optimizer._learning_rate(step),
                              elapsed * 1000 / args.log_interval,
                              loss.item()))
                    start_time = time.time()

                if step >= params.train_steps:
                    utils.evaluate(model, eval_dataset, params.output,
                                   references, params)
                    save_checkpoint(step, epoch, model, optimizer, params)

                    if dist.get_rank() == 0:
                        summary.close()

                    return

                if step % params.eval_steps == 0:
                    utils.evaluate(model, eval_dataset, params.output,
                                   references, params)
                    start_time = time.time()

                if step % params.save_checkpoint_steps == 0:
                    save_checkpoint(step, epoch, model, optimizer, params)
                    start_time = time.time()

        epoch += 1
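# Both trainers call broadcast(model) after initializing or loading weights so
# that every rank starts from identical parameters. A minimal sketch of such a
# broadcast with torch.distributed; this is an illustration of the idea, not
# this repository's broadcast helper.
import torch.distributed as dist


def broadcast_sketch(model, src=0):
    # Copy rank `src`'s parameters to all other ranks, tensor by tensor.
    for param in model.parameters():
        dist.broadcast(param.data, src=src)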