    def test_basic_tokenizer_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["hello", "!", "how", "are", "you", "?"])
        self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
    def test_basic_tokenizer_lower_strip_accents_default(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
Example #5
def customize_tokenizer(text, do_lower_case=False):
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    temp_x = ""
    #text = convert_to_unicode(text)
    for c in text:
        if (tokenizer._is_chinese_char(ord(c)) or _is_punctuation(c)
                or _is_whitespace(c) or _is_control(c)):
            temp_x += " " + c + " "
        else:
            temp_x += c
    if do_lower_case:
        temp_x = temp_x.lower()
    return temp_x.split()
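
A minimal usage sketch for customize_tokenizer above; the call and the expected token list are illustrative, and the private helpers _is_punctuation, _is_whitespace and _is_control are assumed to be importable from transformers as used in the function:

# Hypothetical input: Chinese characters and punctuation are padded with spaces
# and come out as separate tokens, while plain ASCII words stay whole.
tokens = customize_tokenizer(u"ah\u535A\u63A8zz, OK", do_lower_case=True)
# Expected (sketch): ["ah", "\u535A", "\u63A8", "zz", ",", "ok"]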
Example #6
def build_from_p_e_m_file(p_e_m_file, dump_db_file, wiki_mention_db_file,
                          **kwargs):
    dump_db = DumpDB(dump_db_file)
    tokenizer = BasicTokenizer(do_lower_case=False)
    normalizer = BertLowercaseNormalizer()
    wiki_mention_db = MentionDB(wiki_mention_db_file)
    MentionDB.build_from_p_e_m_file(p_e_m_file, dump_db, wiki_mention_db,
                                    tokenizer, normalizer, **kwargs)
Example #7
class BertLowercaseNormalizer(object):
    def __init__(self,
                 never_lowercase=("[UNK]", "[SEP]", "[PAD]", "[CLS]",
                                  "[MASK]")):
        self._tokenizer = BasicTokenizer()
        self._never_lowercase = frozenset(never_lowercase)

    def normalize(self, token):
        if token not in self._never_lowercase:
            token = token.lower()
            token = self._tokenizer._run_strip_accents(token)
        return token
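
A small usage sketch for BertLowercaseNormalizer above; the outputs are illustrative and rely on BasicTokenizer's private _run_strip_accents helper:

normalizer = BertLowercaseNormalizer()
normalizer.normalize(u"H\u00E9llo")   # -> "hello" (lowercased, accents stripped; sketch)
normalizer.normalize("[CLS]")         # -> "[CLS]" (listed in never_lowercase, so untouched)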
Example #8
 def from_config(cls, config: Config):
     basic_tokenizer = BasicTokenizer(
         do_lower_case=config.lowercase,
         never_split=(
             "[UNK]",
             "[SEP]",
             "[PAD]",
             "[CLS]",
             "[MASK]",
         ),  # compatibility with HF v0.5
     )
     return cls(basic_tokenizer)
Example #10
class BertPreTokenizer(Tokenizer):
    """
    The ``BasicTokenizer`` from the BERT implementation.
    This is used to split a sentence into words.
    Then the ``BertTokenIndexer`` converts each word into wordpieces.
    """

    default_never_split = ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    def __init__(self,
                 do_lower_case: bool = True,
                 never_split: Optional[List[str]] = None) -> None:

        if never_split is None:
            never_split = self.default_never_split
        else:
            never_split = never_split + self.default_never_split

        self.basic_tokenizer = BertTokenizer(do_lower_case, never_split)
        self.basic_tokenizer._run_split_on_punc = self._run_split_on_punc
        self.never_split = never_split

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        return [Token(text) for text in self.basic_tokenizer.tokenize(text)]

    # HACK: Monkeypatch for huggingface's broken BasicTokenizer.
    # TODO(Mark): Remove this once https://github.com/huggingface/transformers/pull/2557
    # is merged.
    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if never_split is None:
            never_split = self.never_split
        if never_split is not None and text in never_split:
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]
class SerializableBertTokenizer(transformers.BertTokenizer,
                                SerializationMixin):
    serialization_fields = list(BASE_CLASS_FIELDS) + [
        "vocab",
        "do_basic_tokenize",
        "do_lower_case",
        "never_split",
        "tokenize_chinese_chars",
    ]

    @classmethod
    def blank(cls):
        self = cls.__new__(cls)
        for field in self.serialization_fields:
            setattr(self, field, None)
        self.ids_to_tokens = None
        self.basic_tokenizer = None
        self.wordpiece_tokenizer = None
        return self

    def prepare_for_serialization(self):
        if self.basic_tokenizer is not None:
            self.do_lower_case = self.basic_tokenizer.do_lower_case
            self.never_split = self.basic_tokenizer.never_split
            self.tokenize_chinese_chars = self.basic_tokenizer.tokenize_chinese_chars
        super().prepare_for_serialization()

    def finish_deserializing(self):
        self.ids_to_tokens = OrderedDict([(ids, tok)
                                          for tok, ids in self.vocab.items()])
        if self.do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=self.do_lower_case,
                never_split=self.never_split,
                tokenize_chinese_chars=self.tokenize_chinese_chars,
            )
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                      unk_token=self.unk_token)
        super().finish_deserializing()

    def clean_token(self, text):
        if self.do_basic_tokenize:
            text = self.basic_tokenizer._clean_text(text)
        text = text.strip()
        return clean_accents(text)

    def clean_wp_token(self, token):
        return token.replace("##", "", 1).strip()

    def add_special_tokens(self, segments):
        output = []
        for segment in segments:
            output.extend(segment)
            if segment:
                output.append(self.sep_token)
        if output:
            # If we otherwise would have an empty output, don't add cls
            output.insert(0, self.cls_token)
        return output

    def fix_alignment(self, segments):
        """Turn a nested segment alignment into an alignment for the whole input,
        by offsetting and accounting for special tokens."""
        offset = 0
        output = []
        for segment in segments:
            if segment:
                offset += 1
            seen = set()
            for idx_group in segment:
                output.append([idx + offset for idx in idx_group])
                seen.update({idx for idx in idx_group})
            offset += len(seen)
        return output
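
A worked sketch of add_special_tokens and fix_alignment above, on made-up wordpiece segments; tok stands for a hypothetical deserialized SerializableBertTokenizer with the usual [CLS]/[SEP] tokens:

segments = [["we", "##re", "here"], ["ok"]]
# tok.add_special_tokens(segments)
# -> ["[CLS]", "we", "##re", "here", "[SEP]", "ok", "[SEP]"]
alignment = [[[0, 1], [2]], [[0]]]   # per-segment wordpiece index groups
# tok.fix_alignment(alignment)
# -> [[1, 2], [3], [5]]   (indices shifted past the leading [CLS] and the first [SEP])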
    def test_basic_tokenizer_no_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=False)

        self.assertListEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["HeLLo", "!", "how", "Are", "yoU", "?"])
Example #13
    def test_chinese(self):
        tokenizer = BasicTokenizer()

        self.assertListEqual(tokenizer.tokenize(u"ah\u535A\u63A8zz"),
                             [u"ah", u"\u535A", u"\u63A8", u"zz"])
Example #15
    def test_basic_tokenizer_respects_never_split_tokens(self):
        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"),
            ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"])
Example #16
    def test_basic_tokenizer_no_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

        self.assertListEqual(tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "),
                             ["HaLLo", "!", "how", "Are", "yoU", "?"])
Example #17
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text."""
    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for idx, c in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = idx
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return ns_text, ns_to_s_map

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        return orig_text

    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
    return output_text
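
A usage sketch mirroring the Steve Smith example from the comments above; the return value shown is illustrative:

# The prediction is normalized text; the original span still carries the "'s".
final_text = get_final_text("steve smith", "Steve Smith's", do_lower_case=True)
# Expected (sketch): "Steve Smith"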
Example #18
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" %
                        (pred_text, orig_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
Example #19
    def read_sequence(self, dataset, path, is_train, max_sents):
        """
        Reads conllu-like files. It relies heavily on reader_utils.seqs2data.
        Can also read sentence classification tasks for which the labels should
        be specified in the comments.
        Note that this reader handles a variety of task_types, but the
        differences between them during data reading are kept minimal.
        """
        data = []
        word_idx = self.datasets[dataset]['word_idx']
        sent_counter = 0
        tknzr = BasicTokenizer()

        for sent, full_data in seqs2data(path, self.do_lowercase):
            task2type = {}
            sent_counter += 1
            if max_sents != 0 and sent_counter > max_sents:
                break
            sent_tasks = {}
            tokens = [token[word_idx] for token in sent]
            for tokenIdx in range(len(tokens)):
                if len(tknzr._clean_text(tokens[tokenIdx])) == 0:
                    tokens[tokenIdx] = self.tokenizer.tokenizer.unk_token
            sent_tasks['tokens'] = [Token(token) for token in tokens]

            col_idxs = {'word_idx': word_idx}
            for task in self.datasets[dataset]['tasks']:
                sent_tasks[task] = []
                task_type = self.datasets[dataset]['tasks'][task]['task_type']
                task_idx = self.datasets[dataset]['tasks'][task]['column_idx']
                task2type[task] = task_type
                col_idxs[task] = task_idx
                if task_type == 'classification' and task_idx == -1:
                    start = '# ' + task + ': '
                    for line in full_data:
                        if line[0].startswith(start):
                            sent_tasks[task] = line[0][len(start):]
                elif task_type in ['seq', 'multiseq', 'seq_bio']:
                    for word_data in sent:
                        sent_tasks[task].append(word_data[task_idx])
                elif task_type == 'string2string':
                    for word_data in sent:
                        task_label = gen_lemma_rule(word_data[word_idx],
                                                    word_data[task_idx])
                        sent_tasks[task].append(task_label)
                elif task_type == 'dependency':
                    heads = []
                    rels = []
                    for word_data in sent:
                        if not word_data[task_idx].isdigit():
                            logger.error(
                                "Your dependency file " + path +
                                " seems to contain invalid structures: sentence "
                                + str(sent_counter) +
                                " contains a non-integer head: " +
                                word_data[task_idx] +
                                "\nIf you used UD data directly, this could be due to special EUD constructions, which we do not support; you can clean your conllu file with scripts/misc/cleanconl.py"
                            )
                            exit(1)
                        heads.append(int(word_data[task_idx]))
                        rels.append(word_data[task_idx + 1])
                    sent_tasks[task] = list(zip(rels, heads))
                else:
                    logger.error('Task type ' + task_type + ' for task ' +
                                 task + ' in dataset ' + dataset +
                                 ' is unknown')
            data.append(
                self.text_to_instance(sent_tasks, full_data, col_idxs,
                                      is_train, task2type, dataset))
        return data
Example #20
import sys
from collections import OrderedDict
import json

from transformers.tokenization_bert import BasicTokenizer

tokenizer = BasicTokenizer(do_lower_case=True)

data1 = json.load(open(sys.argv[1]), object_pairs_hook=OrderedDict)
data2 = json.load(open(sys.argv[2]))

for key, value in data1.items():
    value = tokenizer.tokenize(value)
    if len(value) > 30 and data2[key] != "" and data2[key] != "empty":
        data1[key] = data2[key]

json.dump(data1, open(sys.argv[3], "w"), ensure_ascii=False, indent=4)
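
A minimal sketch of the same replacement rule with in-memory dicts instead of the sys.argv file paths; the keys and answer strings are made up:

primary = {"q1": "a very long answer " * 10, "q2": "short answer"}
fallback = {"q1": "concise answer", "q2": ""}
for key, value in primary.items():
    # Answers longer than 30 basic tokens fall back to the second file's answer,
    # unless that fallback is empty or the literal string "empty".
    if len(tokenizer.tokenize(value)) > 30 and fallback[key] not in ("", "empty"):
        primary[key] = fallback[key]
# primary["q1"] -> "concise answer"; primary["q2"] stays "short answer"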
Example #21
class WordEmbedding(nn.Module):
    def __init__(self, config):
        super(WordEmbedding, self).__init__()
        self.config = config
        self.tokenizer = BasicTokenizer(do_lower_case=True)
        # standard deviation of initialization
        init_std = config.MODEL.MMSS_HEAD.TRANSFORMER.BERT_CONFIG.initializer_range

        self.words = []
        self.word2idx = {}
        self.embeddings = []
        with open(config.MODEL.LANGUAGE_BACKBONE.EMBEDDING_PATH, 'r') as fin:
            for row in fin:
                row_tk = row.split()
                self.words.append(row_tk[0])
                self.word2idx[row_tk[0]] = len(self.words) - 1
                self.embeddings.append([float(num) for num in row_tk[1:]])

        self.embeddings = torch.tensor(
            np.asarray(self.embeddings, dtype=np.float32)).cuda()
        self.embeddings = nn.Parameter(self.embeddings)
        self.out_channels = self.embeddings.shape[-1]
        if self.config.MODEL.LANGUAGE_BACKBONE.FREEZE:
            self.embeddings.requires_grad = False

        self.words.extend(['[OOV]', '[PAD]', '[CLS]', '[SEP]', '[MASK]'])
        self.oov_idx = len(self.words) - 5
        self.pad_idx = len(self.words) - 4
        self.cls_idx = len(self.words) - 3
        self.sep_idx = len(self.words) - 2
        self.mask_idx = len(self.words) - 1
        self.special_tokens = set([
            self.oov_idx, self.pad_idx, self.cls_idx, self.sep_idx,
            self.mask_idx
        ])
        self.special_embeddings = nn.Parameter(
            torch.zeros(5, self.out_channels).cuda())
        self.special_embeddings.data.normal_(mean=0.0, std=init_std)
        self.aug_embeddings = torch.cat(
            [self.embeddings, self.special_embeddings], dim=0)
        head_config = self.config.MODEL.MMSS_HEAD.TRANSFORMER
        self.mlm = head_config.MASKED_LANGUAGE_MODELING
        self.mlm_prob = head_config.MASKED_LANGUAGE_MODELING_PROB
        self.mlm_prob_mask = head_config.MASKED_LANGUAGE_MODELING_PROB_MASK
        self.mlm_prob_noise = head_config.MASKED_LANGUAGE_MODELING_PROB_NOISE
        self.mlm_during_validation = head_config.MASKED_LANGUAGE_MODELING_VALIDATION
        self.add_position_embedding = config.MODEL.LANGUAGE_BACKBONE.ADD_POSITION_EMBEDDING
        if self.add_position_embedding:
            # maximum length of a sentence
            m = config.MODEL.MMSS_HEAD.TRANSFORMER.BERT_CONFIG.max_position_embeddings
            self.position_embedding = nn.Parameter(
                torch.zeros(m, self.out_channels))
            self.position_embedding.data.normal_(mean=0.0, std=init_std)

    def forward(self, text_list):
        tokenized_batch = {
            'input_ids': [],
            'attention_mask': [],
            'encoded_tokens': [],
            'input_embeddings': [],
            'special_tokens_mask': [],
        }
        for i in range(len(text_list)):
            tokens = self.tokenizer.tokenize(text_list[i])
            ids = [self.word2idx.get(t, self.oov_idx) for t in tokens]
            ids = [self.cls_idx] + ids + [self.sep_idx]
            tokenized_batch['input_ids'].append(ids)

        max_len = max([len(i) for i in tokenized_batch['input_ids']])
        for i in range(len(text_list)):
            ids = tokenized_batch['input_ids'][i]
            l = len(ids)
            ids.extend([self.pad_idx] * (max_len - l))

        if self.mlm:
            tokenized_batch['target_ids'] = deepcopy(
                tokenized_batch['input_ids'])
            tokenized_batch['mlm_mask'] = []
            for i, item in enumerate(tokenized_batch['input_ids']):
                mlm_mask = []
                for j in range(len(item)):
                    if (item[j] in self.special_tokens or
                            not (self.training or self.mlm_during_validation)):
                        mlm_mask.append(0)
                        continue
                    prob = np.random.rand()
                    if prob < self.mlm_prob:
                        mlm_mask.append(1)
                        prob /= self.mlm_prob
                        if prob < self.mlm_prob_mask:
                            item[j] = self.mask_idx
                        elif prob < self.mlm_prob_mask + self.mlm_prob_noise:
                            # assuming special tokens are at the end of the words list
                            item[j] = np.random.randint(
                                len(self.words) - len(self.special_tokens))
                    else:
                        mlm_mask.append(0)
                tokenized_batch['mlm_mask'].append(mlm_mask)

        for i in range(len(text_list)):
            ids = np.asarray(tokenized_batch['input_ids'][i])
            tokenized_batch['attention_mask'].append(
                (ids != self.pad_idx).astype(np.int64))
            enc = self.aug_embeddings[ids]
            tokenized_batch['input_embeddings'].append(enc)
            if self.add_position_embedding:
                enc = enc + self.position_embedding[:max_len]
            tokenized_batch['encoded_tokens'].append(enc)
            sp_mask = []
            for tk in ids:
                if tk in self.special_tokens:
                    sp_mask.append(1)
                else:
                    sp_mask.append(0)
            tokenized_batch['special_tokens_mask'].append(sp_mask)

        tokenized_batch['input_embeddings'] = torch.stack(
            tokenized_batch['input_embeddings'], dim=0)
        tokenized_batch['encoded_tokens'] = torch.stack(
            tokenized_batch['encoded_tokens'], dim=0)
        tokenized_batch['input_ids'] = torch.tensor(
            tokenized_batch['input_ids']).cuda()
        tokenized_batch['attention_mask'] = torch.tensor(
            tokenized_batch['attention_mask']).cuda()
        tokenized_batch['special_tokens_mask'] = torch.tensor(
            tokenized_batch['special_tokens_mask']).cuda()
        if self.mlm:
            tokenized_batch['mlm_mask'] = torch.tensor(
                tokenized_batch['mlm_mask']).cuda()
            tokenized_batch['target_ids'] = torch.tensor(
                tokenized_batch['target_ids']).cuda()
        return tokenized_batch
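
A standalone sketch of the masking branch inside forward above; the function name and default probabilities are illustrative, and the real values come from the MASKED_LANGUAGE_MODELING_* config entries:

import numpy as np

def mask_one_token(token_id, mask_idx, vocab_size, num_special,
                   p_mlm=0.15, p_mask=0.8, p_noise=0.1):
    """Return (possibly corrupted token id, 1 if selected for MLM else 0)."""
    prob = np.random.rand()
    if prob >= p_mlm:
        return token_id, 0                      # not selected for prediction
    prob /= p_mlm                               # renormalize within the selected mass
    if prob < p_mask:
        return mask_idx, 1                      # replace with [MASK]
    if prob < p_mask + p_noise:
        # random non-special word (special tokens sit at the end of the vocab)
        return np.random.randint(vocab_size - num_special), 1
    return token_id, 1                          # keep the original token, still predict it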
Example #22
def basic_tokenize(string):
    """Use Bert BasicTokenizer as the tokenizer."""

    return BasicTokenizer().tokenize(string)
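
A quick usage sketch; the output is illustrative (BasicTokenizer lowercases by default):

basic_tokenize("Hello, World!")   # -> ["hello", ",", "world", "!"] (sketch)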
Example #23
import logging
import json
import csv
import sys

import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
import jieba.posseg as psg

from transformers.tokenization_bert import BasicTokenizer

__author__ = "*****@*****.**"

tokenizer = BasicTokenizer(do_lower_case=True)

logging.basicConfig(
    level=logging.INFO,
    format=
    '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger()

eps = 10e-8

# ensemble_list = ["14", "17", "21", "22", "23", "25", "26", "27", "28", "29", "33", "34", "35", "36", "37"]
ensemble_list = [
    "14", "17", "21", "22", "23", "25", "26", "27", "28", "29", "33", "34",
    "35", "36", "37", "38", "39"
]
Example #24
from dataclasses import dataclass, field
import regex as re
from typing import List, Union

from relogic.structures.structure import Structure
from relogic.structures.token import Token
from relogic.structures.span import Span
from transformers.tokenization_bert import BasicTokenizer

basic_tokenizer = BasicTokenizer(do_lower_case=False)

PAT = re.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
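
A small sketch contrasting the two tokenization schemes defined above; the outputs are illustrative:

PAT.findall("I'm fine, thanks!")
# -> ["I", "'m", " fine", ",", " thanks", "!"]   (GPT-2-style: keeps leading spaces
#    and common contractions attached; sketch)
basic_tokenizer.tokenize("I'm fine, thanks!")
# -> ["I", "'", "m", "fine", ",", "thanks", "!"]   (splits on every punctuation char)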


@dataclass
class Sentence(Structure):
    idx: int = None
    text: str = None
    tokens: List[Token] = field(default_factory=list)
    text_: str = None
    pos: List = field(default_factory=list)
    spans: List[Span] = field(default_factory=list)
    predicate_text: str = None
    predicate_index: int = None
    predicates: List = field(default_factory=list)
    srl_labels: List = field(default_factory=list)

    tokenizer: str = "space"
Example #25
def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.

    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.

    # There are some unicode characters that cause errors; replace them with ' ' or ''.
    orig_text = orig_text.replace(u'\xa0', u' ')
    orig_text = orig_text.replace('', '')
    orig_text = orig_text.replace(u'\u200e', u'')

    pred_text = pred_text.replace(u'\xa0', u' ')
    pred_text = pred_text.replace('', '')
    pred_text = pred_text.replace(u'\u200e', u'')

    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
    tok_text = " ".join(tokenizer.tokenize(orig_text))

    # Also, in some cases, if pred_text contains [UNK] or '##', it causes an error and this function returns just orig_text.
    # To fix this, we pre-process pred_text based on tok_text.
    correct_text = word_correction(pred_text, tok_text)
    if correct_text == -1:
        if verbose_logging:
            logger.info("Fail to correction: '%s' using '%s'" %
                        (pred_text, tok_text))
        return orig_text

    pred_text = correct_text

    start_position = tok_text.find(pred_text)
    if start_position == -1:
        if verbose_logging:
            logger.info("Unable to find text: '%s' in '%s'" %
                        (pred_text, tok_text))
        return orig_text
    end_position = start_position + len(pred_text) - 1

    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text, tok_ns_text)
            logger.info("Lenth error: '%s' vs '%s'", orig_text, tok_text)
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
Example #26
def build_from_wikipedia(dump_db_file, **kwargs):
    dump_db = DumpDB(dump_db_file)
    tokenizer = BasicTokenizer(do_lower_case=False)
    normalizer = BertLowercaseNormalizer()
    MentionDB.build_from_wikipedia(dump_db, tokenizer, normalizer, **kwargs)