示例#1
0
    def apply_gradients(self, grads_and_vars):
        """Unscale gradients and delegate the update to the inner optimizer.

        Args:
            grads_and_vars: iterable of ``(grad, var)`` pairs; ``grad`` may
                be ``None`` for variables without gradients.

        Skips the update entirely (and shrinks the loss scale via
        ``_update_if_not_finite_grads``) when any gradient norm is
        non-finite, which signals fp16 overflow.
        """
        self._iterations += 1
        grads, var_list = list(zip(*grads_and_vars))
        new_grads = []

        if self._summaries:
            summary.scalar("optimizer/scale", self._scale,
                           utils.get_global_step())

        for grad in grads:
            if grad is None:
                new_grads.append(None)
                continue

            norm = grad.data.norm()

            if not torch.isfinite(norm):
                # Overflow detected: adjust the scale and skip this step.
                self._update_if_not_finite_grads()
                return

            # Rescale gradients. Use out-of-place `mul` instead of `mul_`:
            # for a gradient that is already float32, `.float()` returns
            # the same tensor (no copy), so `mul_` would silently mutate
            # the model's own gradient buffer.
            new_grads.append(grad.data.float().mul(1.0 / self._scale))

        self._update_if_finite_grads()
        self._optimizer.apply_gradients(zip(new_grads, var_list))
示例#2
0
    def __call__(self, step):
        """Return the learning rate for ``step``.

        Linear warmup from the initial to the maximum rate over
        ``_warmup_steps``, then inverse-square-root decay.
        """
        if step <= self._warmup_steps:
            # Linear interpolation between initial and maximum rate.
            span = self._maximum_learning_rate - self._initial_learning_rate
            lr = self._initial_learning_rate + span / self._warmup_steps * step
        else:
            lr = self._maximum_learning_rate

            if self._warmup_steps != 0:
                # approximately hidden_size ** -0.5
                lr *= self._warmup_steps**0.5

            lr *= step**-0.5

        if self._summary:
            summary.scalar("learning_rate", lr, utils.get_global_step())

        return lr
示例#3
0
    def __call__(self, step):
        """Return the learning rate for ``step``; optionally logs it.

        Warmup / plateau / decay schedule scaled by the factor ``n``.
        """
        # See reference: The Best of Both Worlds: Combining Recent Advances
        # in Neural Machine Translation
        n = self._n
        p = self._warmup_steps / n
        s = n * self._start_decay_step
        e = n * self._end_decay_step

        learning_rate = self._learning_rate

        # min() selects, presumably in order as training progresses:
        #   1 + (n - 1) * step / (n * p): linear warmup ramp toward n,
        #   n: constant plateau,
        #   n * (2n)^((s - n*step)/(e - s)): exponential decay between the
        #       scaled start/end decay steps.
        # NOTE(review): assumes e != s, otherwise this divides by zero —
        # confirm callers always set distinct decay boundaries.
        learning_rate *= min(
            1.0 + (n - 1) * step / float(n * p), n,
            n * ((2 * n)**(float(s - n * step) / float(e - s))))

        if self._summary:
            summary.scalar("learning_rate", learning_rate,
                           utils.get_global_step())

        return learning_rate
示例#4
0
    def __call__(self, step):
        """Return the learning rate for ``step``.

        Piecewise-constant schedule: ``values[i]`` applies between
        ``boundaries[i-1]`` (exclusive) and ``boundaries[i]`` (inclusive);
        the first/last values cover the open ends.
        """
        bounds = self._boundaries
        rates = self._values

        if step <= bounds[0]:
            lr = rates[0]
        elif step > bounds[-1]:
            lr = rates[-1]
        else:
            # Default in case no interval matches (should not happen with
            # well-formed boundaries).
            lr = rates[0]

            for lo, hi, rate in zip(bounds[:-1], bounds[1:], rates[1:-1]):
                if lo < step <= hi:
                    lr = rate
                    break

        if self._summary:
            summary.scalar("learning_rate", lr, utils.get_global_step())

        return lr
示例#5
0
def _save_summary(grads_and_vars):
    """Record per-variable summaries and return the global gradient norm.

    Args:
        grads_and_vars: iterable of ``(grad, (name, var))`` pairs, where
            ``grad`` may be ``None``.

    Returns:
        The 2-norm over all gradients, as a Python float.
    """
    squared_sum = 0.0

    for grad, var in grads_and_vars:
        # Variables without gradients contribute nothing.
        if grad is None:
            continue

        # ``var`` arrives as a (name, parameter) pair; keep the parameter.
        _, var = var
        grad_norm = grad.data.norm()
        squared_sum = squared_sum + grad_norm**2
        summary.histogram(var.tensor_name, var, utils.get_global_step())
        summary.scalar("norm/" + var.tensor_name, var.norm(),
                       utils.get_global_step())
        summary.scalar("grad_norm/" + var.tensor_name, grad_norm,
                       utils.get_global_step())

    global_norm = squared_sum**0.5
    summary.scalar("grad_norm", global_norm, utils.get_global_step())

    return float(global_norm)
示例#6
0
def main(args):
    """Entry point for training.

    Builds hyper-parameters, the model, optimizer and datasets, restores
    the latest (or a user-supplied) checkpoint, then runs the training
    loop with periodic evaluation and checkpointing.
    """
    model_cls = models.get_model(args.model)

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = default_params()
    params = merge_params(params, model_cls.default_params(args.hparam_set))
    params = import_params(args.output, args.model, params)
    params = override_params(params, args)

    # Initialize distributed utility
    if args.distributed:
        # Launched by an external launcher: rank/world size come from the
        # environment.
        dist.init_process_group("nccl")
        torch.cuda.set_device(args.local_rank)
        torch.set_default_tensor_type(torch.cuda.FloatTensor)
    else:
        # Single-node setup: a process group is still created so the rest
        # of the code can use the same collective API.
        dist.init_process_group("nccl",
                                init_method=args.url,
                                rank=args.local_rank,
                                world_size=len(params.device_list))
        torch.cuda.set_device(params.device_list[args.local_rank])
        torch.set_default_tensor_type(torch.cuda.FloatTensor)

    # Export parameters
    # Only rank 0 writes to the output directory.
    if dist.get_rank() == 0:
        export_params(params.output, "params.json", params)
        export_params(params.output, "%s.json" % params.model,
                      collect_params(params, model_cls.default_params()))

    model = model_cls(params).cuda()

    if args.half:
        # Half-precision training: convert weights and make new tensors
        # default to fp16.
        model = model.half()
        torch.set_default_dtype(torch.half)
        torch.set_default_tensor_type(torch.cuda.HalfTensor)

    model.train()

    # Init tensorboard
    summary.init(params.output, params.save_summary)

    schedule = get_learning_rate_schedule(params)
    clipper = get_clipper(params)
    optimizer = get_optimizer(params, schedule, clipper)

    if args.half:
        # Loss scaling guards against fp16 gradient underflow.
        optimizer = optimizers.LossScalingOptimizer(optimizer)

    # Accumulate gradients over `update_cycle` batches per optimizer step.
    optimizer = optimizers.MultiStepOptimizer(optimizer, params.update_cycle)

    trainable_flags = print_variables(model, params.pattern,
                                      dist.get_rank() == 0)

    dataset = data.get_dataset(params.input, "train", params)

    if params.validation:
        sorted_key, eval_dataset = data.get_dataset(params.validation, "infer",
                                                    params)
        references = load_references(params.references)
    else:
        sorted_key = None
        eval_dataset = None
        references = None

    # Load checkpoint
    checkpoint = utils.latest_checkpoint(params.output)

    if args.checkpoint is not None:
        # Load pre-trained models
        state = torch.load(args.checkpoint, map_location="cpu")
        model.load_state_dict(state["model"])
        step = params.initial_step
        epoch = 0
        broadcast(model)
    elif checkpoint is not None:
        # Resume from the most recent checkpoint in the output directory,
        # including optimizer state when present.
        state = torch.load(checkpoint, map_location="cpu")
        step = state["step"]
        epoch = state["epoch"]
        model.load_state_dict(state["model"])

        if "optimizer" in state:
            optimizer.load_state_dict(state["optimizer"])
    else:
        # Fresh start: rank 0's weights are broadcast to all workers.
        step = 0
        epoch = 0
        broadcast(model)

    def train_fn(inputs):
        # One forward pass returning the training loss.
        features, labels = inputs
        loss = model(features, labels)
        return loss

    counter = 0

    while True:
        for features in dataset:
            # A global "step" advances once per accumulation cycle.
            if counter % params.update_cycle == 0:
                step += 1
                utils.set_global_step(step)

            counter += 1
            t = time.time()
            features = data.lookup(features, "train", params)
            loss = train_fn(features)
            gradients = optimizer.compute_gradients(loss,
                                                    list(model.parameters()))
            # Drop gradients of variables frozen via `params.pattern`.
            grads_and_vars = exclude_variables(
                trainable_flags, zip(gradients,
                                     list(model.named_parameters())))
            optimizer.apply_gradients(grads_and_vars)

            t = time.time() - t

            summary.scalar("loss", loss, step, write_every_n_steps=1)
            summary.scalar("global_step/sec", t, step)

            print("epoch = %d, step = %d, loss = %.3f (%.3f sec)" %
                  (epoch + 1, step, float(loss), t))

            if counter % params.update_cycle == 0:
                if step >= params.train_steps:
                    # Final evaluation and checkpoint before exiting.
                    utils.evaluate(model, sorted_key, eval_dataset,
                                   params.output, references, params)
                    save_checkpoint(step, epoch, model, optimizer, params)

                    if dist.get_rank() == 0:
                        summary.close()

                    return

                if step % params.eval_steps == 0:
                    utils.evaluate(model, sorted_key, eval_dataset,
                                   params.output, references, params)

                if step % params.save_checkpoint_steps == 0:
                    save_checkpoint(step, epoch, model, optimizer, params)

        epoch += 1
示例#7
0
def evaluate(model, sorted_key, dataset, base_dir, references, params):
    """Validate the model, record the BLEU score and manage the pool of
    best checkpoints under ``<base_dir>/eval``.

    Does nothing when no references are supplied. Every rank runs the
    actual evaluation; only rank 0 touches the filesystem and summaries.
    """
    if not references:
        return

    base_dir = base_dir.rstrip("/")
    save_path = os.path.join(base_dir, "eval")
    record_name = os.path.join(save_path, "record")
    log_name = os.path.join(save_path, "log")
    max_to_keep = params.keep_top_checkpoint_max

    if dist.get_rank() == 0:
        # Create directory and copy files
        if not os.path.exists(save_path):
            print("Making dir: %s" % save_path)
            os.makedirs(save_path)

            # Copy the *.json hyper-parameter files next to the eval
            # checkpoints so the directory stays self-contained.
            params_pattern = os.path.join(base_dir, "*.json")
            params_files = glob.glob(params_pattern)

            for name in params_files:
                new_name = name.replace(base_dir, save_path)
                shutil.copy(name, new_name)

    # Do validation here
    global_step = get_global_step()

    if dist.get_rank() == 0:
        print("-" * 90)
        print("Validating model at step %d" % global_step)

    score = _evaluate_model(model, sorted_key, dataset, references, params)

    # Save records
    if dist.get_rank() == 0:
        scalar("BLEU/score", score, global_step, write_every_n_steps=1)
        print("BLEU at step %d: %f" % (global_step, score))

        # Save checkpoint to save_path
        save({"model": model.state_dict(), "step": global_step}, save_path)

        _save_log(log_name, ("BLEU", global_step, score))
        records = _read_score_record(record_name)
        record = [latest_checkpoint(save_path).split("/")[-1], score]

        # Keep only the `max_to_keep` best-scoring checkpoints.
        added, removed, records = _add_to_record(records, record, max_to_keep)

        if added is None:
            # Remove latest checkpoint
            # (the new checkpoint did not make the top list).
            filename = latest_checkpoint(save_path)
            print("Removing %s" % filename)
            files = glob.glob(filename + "*")

            for name in files:
                os.remove(name)

        if removed is not None:
            # An older checkpoint fell out of the top list; delete it.
            filename = os.path.join(save_path, removed)
            print("Removing %s" % filename)
            files = glob.glob(filename + "*")

            for name in files:
                os.remove(name)

        _save_score_record(record_name, records)

        best_score = records[0][1]
        print("Best score at step %d: %f" % (global_step, best_score))
        print("-" * 90)
示例#8
0
def main(args):
    """Entry point for training (torchtext-aware variant).

    Builds hyper-parameters, the model, optimizer and datasets, restores
    a checkpoint if available, then runs the training loop with periodic
    logging, evaluation and checkpointing.
    """
    model_cls = models.get_model(args.model)

    # Import and override parameters
    # Priorities (low -> high):
    # default -> saved -> command
    params = default_params()
    params = merge_params(params, model_cls.default_params(args.hparam_set))
    params = import_params(args.output, args.model, params)
    params = override_params(params, args)

    # Initialize distributed utility
    if args.distributed:
        # Launched by an external launcher: rank/world size come from the
        # environment.
        dist.init_process_group("nccl")
        torch.cuda.set_device(args.local_rank)
        torch.set_default_tensor_type(torch.cuda.FloatTensor)
    else:
        # Single-node setup: a process group is still created so the rest
        # of the code can use the same collective API.
        dist.init_process_group("nccl",
                                init_method=args.url,
                                rank=args.local_rank,
                                world_size=len(params.device_list))
        torch.cuda.set_device(params.device_list[args.local_rank])
        torch.set_default_tensor_type(torch.cuda.FloatTensor)

    # Export parameters
    # Only rank 0 writes to the output directory.
    if dist.get_rank() == 0:
        export_params(params.output, "params.json", params)
        export_params(params.output, "%s.json" % params.model,
                      collect_params(params, model_cls.default_params()))

    model = model_cls(params).cuda()

    if args.half:
        # Half-precision training: convert weights and make new tensors
        # default to fp16.
        model = model.half()
        torch.set_default_dtype(torch.half)
        torch.set_default_tensor_type(torch.cuda.HalfTensor)

    model.train()

    # Init tensorboard
    summary.init(params.output, params.save_summary)

    schedule = get_learning_rate_schedule(params)
    clipper = get_clipper(params)

    # Select the optimizer implementation by name.
    if params.optimizer.lower() == "adam":
        optimizer = optimizers.AdamOptimizer(learning_rate=schedule,
                                             beta_1=params.adam_beta1,
                                             beta_2=params.adam_beta2,
                                             epsilon=params.adam_epsilon,
                                             clipper=clipper,
                                             summaries=params.save_summary)
    elif params.optimizer.lower() == "adadelta":
        optimizer = optimizers.AdadeltaOptimizer(
            learning_rate=schedule,
            rho=params.adadelta_rho,
            epsilon=params.adadelta_epsilon,
            clipper=clipper,
            summaries=params.save_summary)
    elif params.optimizer.lower() == "sgd":
        optimizer = optimizers.SGDOptimizer(learning_rate=schedule,
                                            clipper=clipper,
                                            summaries=params.save_summary)
    else:
        raise ValueError("Unknown optimizer %s" % params.optimizer)

    if args.half:
        # Loss scaling guards against fp16 gradient underflow.
        optimizer = optimizers.LossScalingOptimizer(optimizer)

    # Accumulate gradients over `update_cycle` batches per optimizer step.
    optimizer = optimizers.MultiStepOptimizer(optimizer, params.update_cycle)

    if dist.get_rank() == 0:
        print_variables(model)

    if params.from_torchtext:
        dataset = data.get_dataset_torchtext(params.input, "train", params)
    else:
        dataset = data.get_dataset(params.input, "train", params)

    if params.validation:
        if params.from_torchtext:
            eval_dataset = data.get_dataset_torchtext(params.validation,
                                                      "infer", params)
        else:
            eval_dataset = data.get_dataset(params.validation, "infer", params)
        references = load_references(params.references)
    else:
        eval_dataset = None
        references = None

    # Load checkpoint
    checkpoint = utils.latest_checkpoint(params.output)

    if args.checkpoint is not None:
        # Load pre-trained models
        # (strict=False tolerates missing/extra keys in the state dict).
        state = torch.load(args.checkpoint, map_location="cpu")
        model.load_state_dict(state["model"], strict=False)
        step = params.initial_step
        epoch = 0
        broadcast(model)
    elif checkpoint is not None:
        # Resume from the most recent checkpoint in the output directory,
        # including optimizer state when present.
        state = torch.load(checkpoint, map_location="cpu")
        step = state["step"]
        epoch = state["epoch"]
        model.load_state_dict(state["model"])

        if "optimizer" in state:
            optimizer.load_state_dict(state["optimizer"])
    else:
        # Fresh start: rank 0's weights are broadcast to all workers.
        step = 0
        epoch = 0
        broadcast(model)

    def train_fn(inputs):
        # One forward pass returning the training loss and model state.
        features, labels = inputs
        loss, state = model(features, labels)
        return loss, state

    counter = 0
    state = None  # model state carried between batches
    if params.model == "cachedtransformer":
        last_feature = None

    while True:
        start_time = time.time()

        for features in dataset:
            # A global "step" advances once per accumulation cycle.
            if counter % params.update_cycle == 0:
                step += 1
                utils.set_global_step(step)

            counter += 1
            t = time.time()
            features = data.lookup(features,
                                   "train",
                                   params,
                                   from_torchtext=params.from_torchtext)
            # NOTE(review): `last_feature` is only initialized when
            # params.model == "cachedtransformer" but is used whenever
            # model.name == "cachedtransformer"; if those two ever differ
            # this raises NameError — confirm they always agree.
            if model.name == "cachedtransformer":
                features = utils.update_cache(model, features, state,
                                              last_feature)
                last_feature = features[0]
            loss, state = train_fn(features)
            gradients = optimizer.compute_gradients(loss,
                                                    list(model.parameters()))
            # Drop gradients of variables frozen via `params.pattern`.
            grads_and_vars = optimizers.exclude_variables(
                params.pattern, zip(gradients, list(model.named_parameters())))
            optimizer.apply_gradients(grads_and_vars)

            t = time.time() - t

            summary.scalar("loss", loss, step, write_every_n_steps=1)
            summary.scalar("global_step/sec", t, step)

            if counter % params.update_cycle == 0:
                if step > 0 and step % args.log_interval == 0:
                    elapsed = time.time() - start_time
                    print('| epoch {:2d} | step {:8d} | lr {:02.2e} | '
                          'ms/step {:4.0f} | loss {:8.4f} '.format(
                              epoch + 1, step,
                              optimizer._optimizer._learning_rate(step),
                              elapsed * 1000 / args.log_interval, loss.item()))
                    start_time = time.time()

                if step >= params.train_steps:
                    # Final evaluation and checkpoint before exiting.
                    utils.evaluate(model, eval_dataset, params.output,
                                   references, params)
                    save_checkpoint(step, epoch, model, optimizer, params)

                    if dist.get_rank() == 0:
                        summary.close()

                    return

                if step % params.eval_steps == 0:
                    utils.evaluate(model, eval_dataset, params.output,
                                   references, params)
                    start_time = time.time()

                if step % params.save_checkpoint_steps == 0:
                    save_checkpoint(step, epoch, model, optimizer, params)
                    start_time = time.time()

        epoch += 1