Example #1
import codecs

from subword_nmt import apply_bpe


def apply_bpe_function(codes_file, train_file, apply_out, vocabulary=None):
    parser = apply_bpe.create_parser()
    args = parser.parse_args([
        "--codes",
        codes_file,
        "--input",
        train_file,
        "--output",
        apply_out,
        # "--vocabulary", vocabulary
    ])

    if vocabulary:
        # Load the subword vocabulary (with frequency threshold) so that
        # merges producing out-of-vocabulary units are reverted.
        args.vocabulary = codecs.open(vocabulary, encoding='utf-8')
        vocabulary = apply_bpe.read_vocabulary(args.vocabulary,
                                               args.vocabulary_threshold)
    else:
        vocabulary = None

    # Re-open the files with an explicit UTF-8 codec before applying BPE.
    args.codes = codecs.open(args.codes.name, encoding='utf-8')
    bpe = apply_bpe.BPE(args.codes, args.merges, args.separator, vocabulary,
                        args.glossaries)
    args.input = codecs.open(args.input.name, encoding='utf-8')
    args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
    for line in args.input:
        args.output.write(bpe.process_line(line, args.dropout))
    args.output.close()
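
A hedged usage sketch for the helper above; every file path here is a placeholder (a codes file produced by subword_nmt's learn_bpe plus a tokenized corpus), not a path from the original project.

# Illustrative call only; the paths below are assumptions.
apply_bpe_function(
    codes_file="bpe.codes",      # merge operations learned with learn_bpe
    train_file="train.tok.en",   # tokenized plain text, one sentence per line
    apply_out="train.bpe.en",    # where the BPE-segmented text is written
    vocabulary="vocab.en",       # optional frequency-filtered vocabulary file
)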
Example #2
    def __init__(self, cfg):
        if cfg.bpe_codes is None:
            raise ValueError("--bpe-codes is required for --bpe=subword_nmt")
        codes = file_utils.cached_path(cfg.bpe_codes)
        try:
            from subword_nmt import apply_bpe

            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args([
                "--codes",
                codes,
                "--separator",
                cfg.bpe_separator,
            ])
            self.bpe = apply_bpe.BPE(
                bpe_args.codes,
                bpe_args.merges,
                bpe_args.separator,
                None,
                bpe_args.glossaries,
            )
            self.bpe_symbol = bpe_args.separator + " "
        except ImportError:
            raise ImportError(
                "Please install subword_nmt with: pip install subword-nmt")
Example #3
    def __init__(self, args):
        if args.bpe_codes is None:
            raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
        codes = file_utils.cached_path(args.bpe_codes)
        try:
            from subword_nmt import apply_bpe
            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args([
                '--codes',
                codes,
                '--separator',
                args.bpe_separator,
            ])
            import codecs
            bpe_args.codes = codecs.open(codes, encoding='utf-8')
            self.bpe = apply_bpe.BPE(
                bpe_args.codes,
                bpe_args.merges,
                bpe_args.separator,
                None,
                bpe_args.glossaries,
            )
            self.bpe_symbol = bpe_args.separator + ' '
        except ImportError:
            raise ImportError(
                'Please install subword_nmt with: pip install subword-nmt')
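
Examples #2 and #3 are near-identical fairseq wrappers around the same subword_nmt calls. Stripped of the fairseq plumbing, the core usage reduces to roughly the sketch below; the codes path, separator, and sample sentence are illustrative assumptions.

import codecs

from subword_nmt import apply_bpe

# "bpe.codes" is a placeholder for a file produced by subword_nmt's learn_bpe.
with codecs.open("bpe.codes", encoding="utf-8") as codes_file:
    bpe = apply_bpe.BPE(codes_file, separator="@@")

# Segment one whitespace-tokenized line into subword units.
print(bpe.process_line("machine translation is fun\n"), end="")
# The exact splits depend on the learned merges.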
Example #4
    def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
        self.task = task
        self.models = models
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary
        self.src_bpe = src_bpe
        self.use_cuda = torch.cuda.is_available() and not args.cpu
        self.args = args

        self.args.remove_bpe = bpe_symbol

        # optimize model for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
                need_attn=args.print_alignment,
            )
            if args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        self.generator = self.task.build_generator(args)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in models]
        )

        if hasattr(args, 'source_lang'):
            self.tokenizer = MosesTokenizer(lang=args.source_lang)
        else:
            self.tokenizer = MosesTokenizer()

        if src_bpe is not None:
            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
            self.bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator, None, bpe_args.glossaries)
        else:
            self.bpe = None
Example #5
    def __init__(self, args):
        codes = file_utils.cached_path(args.bpe_codes)
        try:
            from subword_nmt import apply_bpe
            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args([
                '--codes',
                codes,
                '--separator',
                args.bpe_separator,
            ])
            self.bpe = apply_bpe.BPE(
                bpe_args.codes,
                bpe_args.merges,
                bpe_args.separator,
                None,
                bpe_args.glossaries,
            )
            self.bpe_symbol = bpe_args.separator + ' '
        except ImportError:
            raise ImportError(
                'Please install subword_nmt with: pip install subword-nmt')
Example #6
File: generator.py  Project: yf1291/nlp4
    def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
        self.task = task
        self.models = models
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary
        self.src_bpe = src_bpe
        self.use_cuda = torch.cuda.is_available() and not args.cpu
        self.args = args

        # optimize model for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
                need_attn=args.print_alignment,
            )
            if args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        self.generator = self.task.build_generator(args)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in models]
        )

        self.in_transforms = []
        self.out_transforms = []

        if getattr(args, 'moses', False):
            tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
            detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
            self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
            self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
        elif getattr(args, 'nltk', False):
            from nltk.tokenize import word_tokenize
            self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))

        if getattr(args, 'gpt2_bpe', False):
            from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
            encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
            vocab_bpe = src_bpe
            encoder = get_encoder(encoder_json, vocab_bpe)
            self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
            self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
            self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
        elif getattr(args, 'sentencepiece', False):
            import sentencepiece as spm
            sp = spm.SentencePieceProcessor()
            sp.Load(src_bpe)
            self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
        elif src_bpe is not None:
            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
            bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator, None, bpe_args.glossaries)
            self.in_transforms.append(lambda s: bpe.process_line(s))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
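
Example #6 accumulates its pre- and post-processing steps in the in_transforms and out_transforms lists. A minimal sketch of how such a transform list is typically folded over a string follows; the helper name and the stand-in transforms are illustrative, not taken from generator.py.

from functools import reduce

# Stand-ins for the tokenizer/BPE callables appended in Example #6.
in_transforms = [str.strip, str.lower, lambda s: " ".join(s.split())]

def apply_transforms(transforms, text):
    # Feed the output of each transform into the next, in order.
    return reduce(lambda acc, fn: fn(acc), transforms, text)

print(apply_transforms(in_transforms, "  Hello   BPE   World!  "))
# -> "hello bpe world!"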
Example #7
def create_subword_bpe(codes):
    bpe_parser = apply_bpe.create_parser()
    bpe_args = bpe_parser.parse_args(['--codes', str(codes)])
    bpe = apply_bpe.BPE(bpe_args.codes)
    return bpe
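
A hedged usage sketch for the factory above; the codes path is a placeholder and the resulting segmentation depends entirely on the learned merges.

# Hypothetical usage of create_subword_bpe; "bpe.codes" is a placeholder path.
bpe = create_subword_bpe("bpe.codes")

# Segment a whole line (leading/trailing whitespace is preserved by process_line).
segmented_line = bpe.process_line("the quick brown fox\n")

# Segment an already whitespace-tokenized sentence.
print(bpe.segment("the quick brown fox"))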