Example #1
# Imports as used in OpenNMT-py 2.x (exact module paths assumed from that codebase).
from onmt.utils.logging import init_logger
from onmt.utils.parse import ArgumentParser
from onmt.translate.translator import build_translator
from onmt.inputters.text_dataset import InferenceDataReader
from onmt.transforms import get_transforms_cls, make_transforms, TransformPipe


def translate(opt):
    ArgumentParser.validate_translate_opts(opt)
    ArgumentParser._get_all_transform_translate(opt)
    ArgumentParser._validate_transforms_opts(opt)
    ArgumentParser.validate_translate_opts_dynamic(opt)
    logger = init_logger(opt.log_file)

    translator = build_translator(opt, logger=logger, report_score=True)

    data_reader = InferenceDataReader(opt.src, opt.tgt, opt.src_feats)

    # Build transforms
    transforms_cls = get_transforms_cls(opt._all_transform)
    transforms = make_transforms(opt, transforms_cls, translator.fields)
    data_transform = [
        transforms[name] for name in opt.transforms if name in transforms
    ]
    transform = TransformPipe.build_from(data_transform)

    for i, (src_shard, tgt_shard, feats_shard) in enumerate(data_reader):
        logger.info("Translating shard %d." % i)
        translator.translate_dynamic(src=src_shard,
                                     transform=transform,
                                     src_feats=feats_shard,
                                     tgt=tgt_shard,
                                     batch_size=opt.batch_size,
                                     batch_type=opt.batch_type,
                                     attn_debug=opt.attn_debug,
                                     align_debug=opt.align_debug)
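For context, a minimal driver for `translate(opt)` might look like the sketch below. It assumes the option-registration helpers in `onmt.opts` (`config_opts`, `translate_opts`) together with the `ArgumentParser` subclass imported above; treat it as an illustrative entry point, not a verbatim copy of the project's own.

# Hedged sketch of a CLI entry point for translate(opt); the opts helpers
# are assumed from OpenNMT-py and their signatures may differ across versions.
import onmt.opts as opts


def _get_parser():
    parser = ArgumentParser(description="translate.py")
    opts.config_opts(parser)     # -config / -save_config
    opts.translate_opts(parser)  # model path, src/tgt, beam size, etc.
    return parser


def main():
    parser = _get_parser()
    opt = parser.parse_args()
    translate(opt)


if __name__ == "__main__":
    main()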
Example #2
# Imports assumed from OpenNMT-py 2.x; `ParallelCorpusIterator` is defined
# alongside this function in onmt.inputters.corpus.
from onmt.utils.logging import logger
from onmt.transforms import TransformPipe


def build_corpora_iters(corpora, transforms, corpora_info, is_train=False,
                        skip_empty_level='warning', stride=1, offset=0):
    """Return `ParallelCorpusIterator` for all corpora defined in opts."""
    corpora_iters = dict()
    for c_id, corpus in corpora.items():
        transform_names = corpora_info[c_id].get('transforms', [])
        corpus_transform = [
            transforms[name] for name in transform_names if name in transforms
        ]
        transform_pipe = TransformPipe.build_from(corpus_transform)
        logger.info(f"{c_id}'s transforms: {str(transform_pipe)}")
        corpus_iter = ParallelCorpusIterator(
            corpus, transform_pipe, infinitely=is_train,
            skip_empty_level=skip_empty_level, stride=stride, offset=offset)
        corpora_iters[c_id] = corpus_iter
    return corpora_iters
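A hedged usage sketch of this helper: assuming a `corpora` dict and `transforms` built as in the other examples, and `opt.data` carrying the per-corpus settings from the YAML config, the return value maps each corpus id to an iterator of transformed examples. The variable names here are illustrative, not taken from the source.

# Illustrative only: `corpora` and `transforms` are assumed to be built
# elsewhere (see Examples #1 and #3); opt.data holds the per-corpus config.
corpora_iters = build_corpora_iters(
    corpora, transforms, opt.data,
    is_train=True, skip_empty_level='warning')
for c_id, corpus_iter in corpora_iters.items():
    for example in corpus_iter:
        ...  # transformed examples stream from corpus c_id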
Example #3
import copy
import unittest
from argparse import Namespace

import yaml

from onmt.transforms import get_transforms_cls, TransformPipe


class TestTransform(unittest.TestCase):
    # Test-case wrapper and imports added so the snippet runs standalone;
    # module paths are assumed from OpenNMT-py 2.x.

    def test_transform_pipe(self):
        # 1. Init the first transform in the pipe
        prefix_cls = get_transforms_cls(["prefix"])["prefix"]
        corpora = yaml.safe_load("""
            trainset:
                path_src: data/src-train.txt
                path_tgt: data/tgt-train.txt
                transforms: [prefix, filtertoolong]
                weight: 1
                src_prefix: "⦅_pf_src⦆"
                tgt_prefix: "⦅_pf_tgt⦆"
        """)
        opt = Namespace(data=corpora, seed=-1)
        prefix_transform = prefix_cls(opt)
        prefix_transform.warm_up()
        # 2. Init the second transform in the pipe
        filter_cls = get_transforms_cls(["filtertoolong"])["filtertoolong"]
        opt = Namespace(src_seq_length=4, tgt_seq_length=4)
        filter_transform = filter_cls(opt)
        # 3. Sequentially combine them into a transform pipe
        transform_pipe = TransformPipe.build_from(
            [prefix_transform, filter_transform])
        ex = {
            "src": ["Hello", ",", "world", "."],
            "tgt": ["Bonjour", "le", "monde", "."],
        }
        # 4. Apply the transform pipe to the example
        ex_after = transform_pipe.apply(copy.deepcopy(ex),
                                        corpus_name="trainset")
        # 5. After the pipe, the example exceeds the length limit
        #    and is therefore filtered out
        self.assertIsNone(ex_after)
        # 6. Transform statistics were registered (here by filtertoolong)
        self.assertTrue(len(transform_pipe.statistics.observables) > 0)
        msg = transform_pipe.statistics.report()
        self.assertIsNotNone(msg)
        # 7. After reporting, the statistics are cleared for a fresh start
        self.assertTrue(len(transform_pipe.statistics.observables) == 0)
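To see the pipe keep an example rather than filter it, the same objects from Example #3 can be reused with looser length limits. This is a hedged variation, continuing with the names defined above (prefix_cls output, filter_cls, ex); the expected prefix token follows from the prefix transform prepending `src_prefix` to the source tokens.

# Illustrative variation on Example #3: with roomier limits the pipe
# returns the transformed example instead of None.
loose_filter = filter_cls(Namespace(src_seq_length=16, tgt_seq_length=16))
loose_pipe = TransformPipe.build_from([prefix_transform, loose_filter])
ex_kept = loose_pipe.apply(copy.deepcopy(ex), corpus_name="trainset")
assert ex_kept is not None
assert ex_kept["src"][0] == "⦅_pf_src⦆"  # prefix token prepended by the pipe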