Exemplo n.º 1
0
    def evaluate(self, report_individual=False):
        """Evaluate the language model over ``self.nb_rounds`` passes of ``self.data``.

        Args:
            report_individual: if True, additionally log per-round total loss,
                per-token loss and perplexity.

        Returns:
            EvaluationReport aggregated over all rounds.
        """
        overall_total_loss = 0.0
        overall_total_timesteps = 0
        ppls = []

        for round_no in range(self.nb_rounds):
            self.lm.eval()

            total_loss = 0.0
            total_timesteps = 0
            hidden = self.lm.model.init_hidden(self.batch_size)

            for X, targets in self.data:
                # Detach the recurrent state so gradients do not flow across batches.
                hidden = repackage_hidden(hidden)

                output, hidden = self.lm.model(X, hidden)
                losses = self.lm.decoder.neg_log_prob_raw(output, targets)

                total_loss += losses.sum().detach()
                total_timesteps += targets.numel()

            # float() accepts both a scalar tensor and the 0.0 initializer, so an
            # empty data stream no longer crashes on `.item()`.
            eval_report = EvaluationReport(float(total_loss), total_timesteps, 1.0)
            ppl = math.exp(eval_report.loss_per_token)
            if report_individual:
                self.logger.info('total loss {:.1f} | per token loss {:5.2f} | ppl {:8.2f}'.format(eval_report.total_loss, eval_report.loss_per_token, ppl))

            overall_total_loss += total_loss
            overall_total_timesteps += total_timesteps
            ppls.append(ppl)

        ppls = np.asarray(ppls)
        self.logger.info(f'PPLs summary: {np.min(ppls):.2f} / {np.mean(ppls):.2f} / {np.max(ppls):.2f} , stddev: {np.std(ppls):.3f}')
        return EvaluationReport(float(overall_total_loss), overall_total_timesteps, 1.0)
Exemplo n.º 2
0
    def evaluate(self):
        """Run one full pass over ``self.data`` and return an EvaluationReport."""
        self.lm.eval()

        hidden = self.lm.model.init_hidden(self.batch_size)
        loss_sum = 0.0
        n_tokens = 0

        for inputs, gold in self.data:
            # Cut the autograd history carried in the recurrent state.
            hidden = repackage_hidden(hidden)

            output, hidden = self.lm.model(inputs, hidden)
            raw_losses = self.lm.decoder.neg_log_prob_raw(output, gold)

            loss_sum = loss_sum + raw_losses.sum().detach()
            n_tokens += gold.numel()

        return EvaluationReport(loss_sum.item(), n_tokens, 1.0)
Exemplo n.º 3
0
def main(args):
    """Fine-tune a pre-trained LM on a single pass over a training stream.

    Loads the model from ``args.load``, consumes the (possibly YAML-configured)
    training stream once, validates periodically through ValidationWatcher, and
    saves the model to ``args.save`` if the final validation loss improves on
    the initial one.
    """
    print(args)

    init_seeds(args.seed, args.cuda)

    print("loading model...")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')
    lm = torch.load(args.load).to(device)
    print(lm.model)

    print("preparing training data...")

    if args.train_yaml:
        train_data_stream, single_stream_len = yaml_factory_noepoch(
            args.train_yaml, lm, device)
    else:
        train_data_stream, single_stream_len = plain_factory_noepoch(
            data_fn=args.train,
            lm=lm,
            tokenize_regime=args.tokenize_regime,
            batch_size=args.batch_size,
            device=device,
            target_seq_len=args.target_seq_len,
        )

    print("preparing validation data...")
    evaluator = EnblockEvaluator(lm,
                                 args.valid,
                                 10,  # validation batch size
                                 args.target_seq_len,
                                 tokenize_regime=args.tokenize_regime)

    def val_loss_fn():
        return evaluator.evaluate().loss_per_token

    print("computing initial PPL...")
    initial_val_loss = val_loss_fn()
    print('Initial perplexity {:.2f}'.format(math.exp(initial_val_loss)))

    print("training...")
    lr = args.lr

    val_watcher = ValidationWatcher(val_loss_fn, initial_val_loss,
                                    args.val_interval, args.workdir, lm)
    best_val_loss = initial_val_loss

    optim = torch.optim.SGD(lm.parameters(), lr, weight_decay=args.beta)
    patience_ticks = 0

    logger = InfinityLogger(0, args.log_interval, lr)

    hidden = None
    for X, targets in train_data_stream:
        if hidden is None:
            # Lazily initialized once the first batch arrives.
            hidden = lm.model.init_hidden(args.batch_size)

        # Detach the recurrent state: truncate BPTT at batch boundaries.
        hidden = repackage_hidden(hidden)

        lm.train()
        output, hidden = lm.model(X, hidden)
        loss, nb_words = lm.decoder.neg_log_prob(output, targets)
        loss /= nb_words

        val_watcher.log_training_update(loss.data, nb_words)

        optim.zero_grad()
        loss.backward()
        # clip_grad_norm_ is the in-place variant; the un-underscored form is
        # deprecated and removed in current PyTorch.
        torch.nn.utils.clip_grad_norm_(lm.parameters(), args.clip)

        optim.step()
        logger.log(loss.data)

    val_loss = val_loss_fn()

    # Save the model if the validation loss is the best we've seen so far.
    if val_loss < best_val_loss:
        torch.save(lm, args.save)
        best_val_loss = val_loss
        patience_ticks = 0
    else:
        patience_ticks += 1
        if patience_ticks > args.patience:
            lr /= 2.0
            if lr < args.min_lr:
                print(
                    f"Learning has reached {lr}, training was supposed to stop at {args.min_lr}, stopping."
                )
                # Honor the message: previously the code printed "stopping" but
                # carried on to update the optimizer's lr anyway.
                return
            for p in optim.param_groups:
                p['lr'] = lr
            patience_ticks = 0
Exemplo n.º 4
0
def main(args):
    """Train an LM with input/target corruption for ``args.epochs`` epochs.

    Loads the model from ``args.load``, applies label smoothing, streams
    corrupted training batches, validates after every epoch, saves the best
    model to ``args.save``, and halves the learning rate after
    ``args.patience`` non-improving epochs.
    """
    print(args)
    logging.basicConfig(level=logging.INFO, format='[%(levelname)s::%(name)s] %(message)s')

    init_seeds(args.seed, args.cuda)

    print("loading model...")
    lm = torch.load(args.load)
    if args.cuda:
        lm.cuda()
    lm.decoder.core_loss.amount = args.label_smoothing

    print(lm.model)
    print('Label smoothing power', lm.decoder.core_loss.amount)

    tokenize_regime = 'words'

    print("preparing training data...")
    train_ids = tokens_from_fn(args.train, lm.vocab, randomize=False, regime=tokenize_regime)
    train_streams = form_input_targets(train_ids)
    corrupted_provider = InputTargetCorruptor(train_streams, args.subs_rate, args.target_subs_rate, len(lm.vocab), args.del_rate, args.ins_rate, protected=[lm.vocab['</s>']])
    batch_former = LazyBatcher(args.batch_size, corrupted_provider)
    train_data = TemplSplitterClean(args.target_seq_len, batch_former)
    train_data_stream = OndemandDataProvider(TransposeWrapper(train_data), args.cuda)

    print("preparing validation data...")
    evaluator = EnblockEvaluator(lm, args.valid, 10, args.target_seq_len)
    # Evaluation (de facto LR scheduling) with input corruption did not
    # help during the CHiMe-6 evaluation
    # evaluator = SubstitutionalEnblockEvaluator(
    #     lm, args.valid,
    #     batch_size=10, target_seq_len=args.target_seq_len,
    #     corruptor=lambda data: Corruptor(data, args.corruption_rate, len(lm.vocab)),
    #     nb_rounds=args.eval_rounds,
    # )

    def val_loss_fn():
        return evaluator.evaluate().loss_per_token

    print("computing initial PPL...")
    initial_val_loss = val_loss_fn()
    print('Initial perplexity {:.2f}'.format(math.exp(initial_val_loss)))

    print("training...")
    lr = args.lr
    best_val_loss = None

    val_watcher = ValidationWatcher(val_loss_fn, initial_val_loss, args.val_interval, args.workdir, lm)

    optim = torch.optim.SGD(lm.parameters(), lr, weight_decay=args.beta)
    # Initialize before the loop: previously only assigned on the
    # "improved" branch, leaving it formally unbound otherwise.
    patience_ticks = 0
    for epoch in range(1, args.epochs + 1):
        logger = ProgressLogger(epoch, args.log_interval, lr, len(list(train_data)) // args.target_seq_len)

        hidden = None
        for X, targets in train_data_stream:
            if hidden is None:
                # Lazily initialized once the first batch arrives.
                hidden = lm.model.init_hidden(args.batch_size)

            # Detach the recurrent state: truncate BPTT at batch boundaries.
            hidden = repackage_hidden(hidden)

            lm.train()
            output, hidden = lm.model(X, hidden)
            loss, nb_words = lm.decoder.neg_log_prob(output, targets)
            loss /= nb_words

            val_watcher.log_training_update(loss.data, nb_words)

            optim.zero_grad()
            loss.backward()
            # clip_grad_norm_ is the in-place variant; the un-underscored form
            # is deprecated and removed in current PyTorch.
            torch.nn.utils.clip_grad_norm_(lm.parameters(), args.clip)

            optim.step()
            logger.log(loss.data)

        val_loss = val_loss_fn()
        print(epoch_summary(epoch, logger.nb_updates(), logger.time_since_creation(), val_loss))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            torch.save(lm, args.save)
            best_val_loss = val_loss
            patience_ticks = 0
        else:
            patience_ticks += 1
            if patience_ticks > args.patience:
                lr /= 2.0
                # Actually propagate the decayed lr to the optimizer; it was
                # previously only tracked locally and never applied.
                for p in optim.param_groups:
                    p['lr'] = lr
                patience_ticks = 0