示例#1
0
    def __init__(self, lm, data_fn, batch_size, target_seq_len, corruptor, nb_rounds, logger=None, tokenize_regime='words'):
        """Set up an evaluation pipeline over a corrupted token stream.

        Tokenizes ``data_fn`` with the LM's vocabulary, logs the OOV rate,
        then chains corruption, lazy batching, fixed-length splitting and a
        CUDA-backed transpose wrapper into ``self.data``.
        """
        self.logger = logger if logger else logging.getLogger('SubstitutionalEnblockEvaluator_v2')
        self.lm = lm
        self.batch_size = batch_size
        self.nb_rounds = nb_rounds

        ids = tokens_from_fn(data_fn, lm.vocab, regime=tokenize_regime, randomize=False)

        # Out-of-vocabulary statistics: warn when more than 5 % of the
        # tokens map to the unknown symbol, otherwise just log at info.
        nb_oovs = (ids == lm.vocab.unk_ind).sum().item()
        nb_tokens = len(ids)
        oov_msg = 'Nb oovs: {} / {} ({:.2f} %)\n'.format(nb_oovs, len(ids), 100.0 * nb_oovs/nb_tokens)
        report = self.logger.warning if nb_oovs / nb_tokens > 0.05 else self.logger.info
        report(oov_msg)

        # Pipeline: raw (input, target) streams -> corruption -> lazy
        # batching -> target_seq_len splits -> transposed batches on GPU.
        splits = TemplSplitterClean(
            target_seq_len,
            LazyBatcher(batch_size, corruptor(form_input_targets(ids))))
        self.data = CudaStream(TransposeWrapper(splits))
示例#2
0
    # Move the language model to GPU when requested; print the module tree
    # as a quick architecture sanity check.
    if args.cuda:
        lm.cuda()
    print(lm.model)

    print("loading SMM iVector extractor ...")
    with open(args.ivec_extractor, 'rb') as f:
        ivec_extractor = smm_ivec_extractor.load(f)
    # Optional CLI override of the extractor's iteration count.
    # NOTE(review): pokes a private attribute (_nb_iters) — presumably the
    # extractor exposes no public setter; confirm against its class.
    if args.ivec_nb_iters is not None:
        ivec_extractor._nb_iters = args.ivec_nb_iters
    print(ivec_extractor)

    print("preparing data...")

    def ts_from_file(f):
        # Wrap one open file into a tokenized split that yields temporal
        # (input, target) windows sized by the model's input length.
        return TokenizedSplitFFBase(
            f, lm.vocab, lambda seq: TemporalSplits(seq, lm.model.in_len, args.
                                                    target_seq_len))

    tss = filelist_to_objects(args.file_list, ts_from_file)
    # discard_h controls whether hidden state carries across articles:
    # it is dropped unless --concat_articles was given.
    data = BatchBuilder(tss,
                        args.batch_size,
                        discard_h=not args.concat_articles)
    if args.cuda:
        data = CudaStream(data)
    # Attach iVectors produced alongside the main data stream.
    data_ivecs = ivec_appenders.ParalelIvecAppender(
        data, ivec_extractor, ivec_extractor.build_translator(lm.vocab))

    print("evaluating...")
    loss = evaluate(lm, data_ivecs, use_ivecs=True)
    # Report mean loss and its perplexity (exp of the loss).
    print('loss {:5.2f} | ppl {:8.2f}'.format(loss, math.exp(loss)))
示例#3
0
    def ivec_ts_from_file(f):
        """Build a temporal-split reader over *f* and attach oracle iVectors."""
        def make_splits(seq):
            # Windows sized by the model's input length and the CLI target length.
            return TemporalSplits(seq, lm.model.in_len, args.target_seq_len)

        token_split = TokenizedSplitFFBase(f, lm.vocab, make_splits)
        return ivec_appenders.CheatingIvecAppender(token_split, ivec_extractor)

    train_data_ivecs = filelist_to_objects(args.train_list, ivec_ts_from_file)

    print("\tvalidation...")
    valid_data_ivecs = filelist_to_objects(args.valid_list, ivec_ts_from_file)
    # discard_h drops hidden state between articles unless --concat_articles
    # was given; the validation pipeline is built once and reused.
    valid_data = BatchBuilder(valid_data_ivecs,
                              args.batch_size,
                              discard_h=not args.concat_articles)
    if args.cuda:
        valid_data = CudaStream(valid_data)

    print("training...")
    lr = args.lr
    best_val_loss = None

    # One pass per epoch: the list of training splits is reshuffled and the
    # batch pipeline rebuilt each epoch so article order differs every time.
    for epoch in range(1, args.epochs + 1):
        random.shuffle(train_data_ivecs)
        train_data = BatchBuilder(train_data_ivecs,
                                  args.batch_size,
                                  discard_h=not args.concat_articles)
        if args.cuda:
            train_data = CudaStream(train_data)

        # Per-epoch progress logger (interval and learning rate for display).
        logger = InfinityLogger(epoch, args.log_interval, lr)
        # NOTE(review): this statement continues beyond the visible excerpt;
        # left untouched.
        train_data_filtered = BatchFilter(train_data, args.batch_size,