Example #1
    def string(self, tensor, bpe_symbol=None, escape_unk=False):
        """Helper for converting a tensor of token indices to a string.

        Can optionally remove BPE symbols or escape <unk> words.
        """
        if torch.is_tensor(tensor) and tensor.dim() == 2:
            # propagate the formatting options when recursing over a 2-D batch
            return '\n'.join(self.string(t, bpe_symbol, escape_unk) for t in tensor)

        def token_string(i):
            if i == self.unk():
                return self.unk_string(escape_unk)
            else:
                return self[i]

        sent = ' '.join(token_string(i) for i in tensor if i != self.eos())
        return data_utils.process_bpe_symbol(sent, bpe_symbol)
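For reference, process_bpe_symbol (fairseq.data.data_utils in older fairseq releases) simply strips the BPE continuation marker so subword pieces rejoin into words. A minimal standalone sketch of that behaviour, with an illustrative helper name:

def strip_bpe(sentence, bpe_symbol='@@ '):
    # Roughly what data_utils.process_bpe_symbol does for a plain marker:
    # delete every continuation marker so subword pieces rejoin into words.
    if bpe_symbol is None:
        return sentence
    return (sentence + ' ').replace(bpe_symbol, '').rstrip()

print(strip_bpe('the new@@ est mod@@ el'))  # -> 'the newest model'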
Example #2
    def tokens_to_sentence(self,
                           line,
                           line_tokenizer=tokenize_line,
                           use_unk_sym=True,
                           bpe_symbol=None):
        if bpe_symbol is not None:
            return data_utils.process_bpe_symbol(line, bpe_symbol)
        # use_unk_sym=False when we want to restore original transcripts from
        # token sequences, e.g., obtain reference to compute WER
        tokens = line_tokenizer(line)
        sent = ""
        for token in tokens:
            if token == self.space_word:
                sent += " "
            elif use_unk_sym and self.index(token) == self.unk_index:
                sent += self.unk_word
            elif token != self.pad_word and token != self.eos_word:
                sent += token
        return sent.strip()
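A self-contained sketch of the same idea for the non-BPE branch, assuming a character-level vocabulary where '<space>' marks word boundaries (all names here are illustrative, not the fairseq/espresso API):

def chars_to_sentence(line, space_word='<space>', pad_word='<pad>', eos_word='</s>'):
    # Rebuild a plain sentence from space-delimited character tokens,
    # mirroring the loop in tokens_to_sentence above (without unk handling).
    sent = ''
    for token in line.split():
        if token == space_word:
            sent += ' '
        elif token not in (pad_word, eos_word):
            sent += token
    return sent.strip()

print(chars_to_sentence('h i <space> t h e r e </s>'))  # -> 'hi there'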
Example #3
def create_ner_from_output_tokens(tokens, src_dict, ent_pad_idx=0, ent_eos_idx=21):
    """Tag each output token (tokens is bsz x seq_len) with an entity-type ID,
    by running spaCy NER over the detokenized text and mapping the labels
    back onto the wordpieces."""
    txts = []
    token_lists = []
    pad_idx = src_dict.pad()
    eos_idx = src_dict.eos()

    if tokens.shape[1] == 1:
        # initial inference step, we need to return EOS
        return torch.empty_like(tokens).fill_(ent_eos_idx)

    output_entities = torch.empty_like(tokens)

    for i in range(len(tokens)):
        token_list = [src_dict[idx] for idx in tokens[i][1:]]
        token_lists.append(token_list)
        txt = ' '.join(token_list)
        txt = data_utils.process_bpe_symbol(txt, ' ##')
        txts.append(txt)

    docs = list(nlp.pipe(txts))

    for i in range(len(token_lists)):
        doc = docs[i]
        _, alignments = align_tokens(doc, token_lists[i])
        entities = torch.zeros_like(tokens[i])
        entities[0] = ent_eos_idx

        for j in range(1, len(alignments)):
            spacy_token = doc[j]
            ent_type = pad_idx + 1
            if spacy_token.ent_type_ in ENTITY_TYPES:
                ent_type += ENTITY_TYPES[spacy_token.ent_type_]

            for wp_idx in alignments[j]:
                entities[wp_idx] = ent_type
        output_entities[i] = entities

    return output_entities
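The core step above is fanning one entity label per spaCy token out to every wordpiece it covers. A rough, self-contained illustration with hand-written alignments (no spaCy required; the values are made up):

import torch

# Hypothetical: 3 spaCy tokens covering 5 wordpieces,
# e.g. 'Angela Merkel visited' -> ['Angela', 'Mer', '##kel', 'visit', '##ed']
alignments = [[0], [1, 2], [3, 4]]
ent_ids = [2, 2, 1]  # PER, PER, no entity (1 == pad_idx + 1 in the code above)

entities = torch.zeros(5, dtype=torch.long)
for spacy_idx, wordpiece_indices in enumerate(alignments):
    for wp_idx in wordpiece_indices:
        entities[wp_idx] = ent_ids[spacy_idx]

print(entities)  # tensor([2, 2, 2, 1, 1])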
Example #4
    def string(self, tensor, bpe_symbol=None, escape_unk=False):
        """Helper for converting a tensor of token indices to a string.

        Can optionally remove BPE symbols or escape <unk> words.
        """
        raise Exception(
            "(BERT dict) string function will not work, as all indices are IDs "
            "instead of words"
        )
        if torch.is_tensor(tensor) and tensor.dim() == 2:
            return '\n'.join(
                self.string(t, bpe_symbol, escape_unk) for t in tensor)

        def token_string(i):
            if i == self.unk():
                return self.unk_string(escape_unk)
            else:
                return self[i]

        sent = ' '.join(token_string(i) for i in tensor if i != self.eos())
        return data_utils.process_bpe_symbol(sent, bpe_symbol)
Example #5
    def string(
        self,
        tensor,
        bpe_symbol=None,
        escape_unk=False,
        extra_symbols_to_ignore=None,
        unk_string=None,
    ):
        """Helper for converting a tensor of token indices to a string.

        Can optionally remove BPE symbols or escape <unk> words.
        """
        if torch.is_tensor(tensor) and tensor.dim() == 2:
            return "\n".join(
                self.string(t, bpe_symbol, escape_unk, extra_symbols_to_ignore)
                for t in tensor
            )

        extra_symbols_to_ignore = set(extra_symbols_to_ignore or [])
        #extra_symbols_to_ignore.add(self.eos())

        def token_string(i):
            if i == self.unk():
                if unk_string is not None:
                    return unk_string
                else:
                    return self.unk_string(escape_unk)
            else:
                return self[i]

        if hasattr(self, "bos_index"):
            extra_symbols_to_ignore.add(self.bos())

        sent = " ".join(
            token_string(i)
            for i in tensor
            if utils.item(i) not in extra_symbols_to_ignore
        )

        return data_utils.process_bpe_symbol(sent, bpe_symbol)
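End to end, the method filters out ignored special indices, looks up each remaining index, and then strips BPE markers. A minimal standalone sketch of that pipeline with a toy vocabulary (illustrative names and IDs, not the fairseq Dictionary API):

def indices_to_string(indices, id_to_token, ignore=(), bpe_symbol='@@ '):
    # Drop ignored special IDs, look up the rest, then remove BPE markers,
    # mirroring string() above.
    ignore = set(ignore)
    sent = ' '.join(id_to_token[i] for i in indices if i not in ignore)
    return (sent + ' ').replace(bpe_symbol, '').rstrip()

vocab = {0: '<s>', 1: '<pad>', 2: '</s>', 3: 'new@@', 4: 'est', 5: 'mod@@', 6: 'el'}
print(indices_to_string([0, 3, 4, 5, 6, 2], vocab, ignore=(0, 1, 2)))  # -> 'newest model'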
Example #6
def create_ner(tokens, src_dict):
    """ Given a 1d Tensor (with token indices), return a list of NER tags
    
    """
    entities = torch.zeros_like(tokens)
    token_list = [src_dict[idx] for idx in tokens]
    pad_idx = src_dict.pad()
    txt = ' '.join(token_list)
    txt = data_utils.process_bpe_symbol(txt, ' ##')
    doc = nlp(txt)

    _, alignments = align_tokens(doc, token_list)

    for i in range(len(alignments)):
        spacy_token = doc[i]
        ent_type = pad_idx + 1
        if spacy_token.ent_type_ in ENTITY_TYPES:
            ent_type += ENTITY_TYPES[spacy_token.ent_type_]

        for wp_idx in alignments[i]:
            entities[wp_idx] = ent_type

    return entities
Example #7
    def string(self, tensor, bpe_symbol=None, escape_unk=False):
        """Helper for converting a tensor of token indices to a string.

        Can optionally remove BPE symbols or escape <unk> words.
        """
        #print("self is",self.indices)
        if torch.is_tensor(tensor) and tensor.dim() == 2:
            return "\n".join(
                self.string(t, bpe_symbol, escape_unk) for t in tensor)

        def token_string(i):
            if i == self.unk():
                return self.unk_string(escape_unk)
            else:
                return self[i]

        if hasattr(self, "bos_index"):
            sent = " ".join(
                token_string(i) for i in tensor
                if (i != self.eos()) and (i != self.bos()))
        else:
            sent = " ".join(token_string(i) for i in tensor if i != self.eos())
        return data_utils.process_bpe_symbol(sent, bpe_symbol)
Example #8
    def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
        self.task = task
        self.models = models
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary
        self.src_bpe = src_bpe
        self.use_cuda = torch.cuda.is_available() and not args.cpu
        self.args = args

        # optimize model for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
                need_attn=args.print_alignment,
            )
            if args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        self.generator = self.task.build_generator(args)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in models]
        )

        self.in_transforms = []
        self.out_transforms = []

        if getattr(args, 'moses', False):
            tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
            detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
            self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
            self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
        elif getattr(args, 'nltk', False):
            from nltk.tokenize import word_tokenize
            self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))

        if getattr(args, 'gpt2_bpe', False):
            from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
            encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
            vocab_bpe = src_bpe
            encoder = get_encoder(encoder_json, vocab_bpe)
            self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
            self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
            self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
        elif getattr(args, 'sentencepiece', False):
            import sentencepiece as spm
            sp = spm.SentencePieceProcessor()
            sp.Load(src_bpe)
            self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
        elif src_bpe is not None:
            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
            bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator, None, bpe_args.glossaries)
            self.in_transforms.append(lambda s: bpe.process_line(s))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
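The in_transforms/out_transforms lists are presumably applied by folding a string through them in order, before encoding and after decoding. A minimal sketch of that composition (the helper name is illustrative, not part of the snippet above):

from functools import reduce

def apply_transforms(text, transforms):
    # Run the string through each registered transform in order,
    # e.g. Moses tokenization followed by BPE on the input side,
    # or BPE removal followed by detokenization on the output side.
    return reduce(lambda s, fn: fn(s), transforms, text)

# e.g. apply_transforms(raw_source, self.in_transforms) before generation,
#      apply_transforms(hypo_str, self.out_transforms) on the decoded output.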