Example No. 1
def run_dataloader():
    """test dataloader"""
    parser = get_parser()

    # add model specific args
    parser = BertLabeling.add_model_specific_args(parser)

    # add all the available trainer options to argparse
    # ie: now --gpus --num_nodes ... --fast_dev_run all work in the cli
    parser = Trainer.add_argparse_args(parser)

    args = parser.parse_args()
    args.workers = 0
    args.default_root_dir = "/scratch/shravya.k/train_logs/debug"

    model = BertLabeling(args)
    from tokenizers import BertWordPieceTokenizer
    tokenizer = BertWordPieceTokenizer(
        os.path.join(args.bert_config_dir, "vocab.txt"))

    loader = model.get_dataloader("dev", limit=1000)
    for d in loader:
        input_ids = d[0][0].tolist()
        match_labels = d[-1][0]
        start_positions, end_positions = torch.where(match_labels > 0)
        start_positions = start_positions.tolist()
        end_positions = end_positions.tolist()
        if not start_positions:
            continue
        print("=" * 20)
        print(tokenizer.decode(input_ids, skip_special_tokens=False))
        for start, end in zip(start_positions, end_positions):
            print(tokenizer.decode(input_ids[start:end + 1]))
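A stand-alone sketch (not part of the example above) of what the torch.where call does: match_labels[i, j] > 0 marks a mention spanning tokens i..j, and the returned index tensors give the (start, end) pairs that are later decoded. The tensor values are made up.

import torch

# match_labels[i, j] > 0 means a mention spans tokens i..j (values are made up)
match_labels = torch.zeros(6, 6, dtype=torch.long)
match_labels[1, 2] = 1   # mention covering tokens 1..2
match_labels[4, 4] = 1   # single-token mention at position 4
starts, ends = torch.where(match_labels > 0)
print(list(zip(starts.tolist(), ends.tolist())))   # [(1, 2), (4, 4)]
# each (start, end) pair is then rendered with tokenizer.decode(input_ids[start:end + 1])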
Example No. 2
def run_zh():
    from tokenizers import BertWordPieceTokenizer

    data_path = "/data/nfsdata2/sunzijun/glyce/glyce/data/small_bin"
    bert_path = "/data/nfsdata2/sunzijun/glyce/glyce/bert_chinese_base_large_vocab"
    config_path = "/data/nfsdata2/sunzijun/glyce/glyce/config"

    tokenizer = BertWordPieceTokenizer(os.path.join(bert_path, "vocab.txt"))
    prefix = "small"
    fields = None

    dataset = DynamicGlyceMaskedLMDataset(config_path=config_path,
                                          directory=data_path,
                                          vocab_file=os.path.join(
                                              bert_path, "vocab.txt"),
                                          prefix=prefix,
                                          fields=fields,
                                          max_length=512)
    print(len(dataset))
    from tqdm import tqdm
    for d in tqdm(dataset):
        print([v.shape for v in d])
        print(tokenizer.decode(d[0].tolist(), skip_special_tokens=False))
        tgt = [
            src if label == -100 else label
            for src, label in zip(d[0].tolist(), d[2].tolist())
        ]
        print(tokenizer.decode(tgt, skip_special_tokens=False))
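For reference, a tiny stand-alone illustration of the -100 convention used in the loop above: labels are -100 wherever a position was not masked, so substituting them with the source ids reconstructs the full target sequence. All ids below are made up.

input_ids = [2, 17, 4, 9, 3]              # 4 stands in for the [MASK] id here (made up)
labels    = [-100, -100, 21, -100, -100]  # only the masked position carries a real label
target = [src if lab == -100 else lab for src, lab in zip(input_ids, labels)]
print(target)   # [2, 17, 21, 9, 3] -> the sequence that tokenizer.decode(tgt) would render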
Example No. 3
def main(args):
    print(args)
    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##"
        )

        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000
        )

        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))

    elif args['test']:
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str.ids)
        decoded_str = tokenizer.decode(encoded_str.ids)
        print(decoded_str)

        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)

        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)
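A self-contained sketch of the same train / save_model / reload cycle, using a throwaway corpus and temporary directories instead of the paths above (all file names here are hypothetical):

import os
import tempfile

from tokenizers import BertWordPieceTokenizer

# hypothetical throwaway corpus, just enough text to fit a toy vocab
work_dir = tempfile.mkdtemp()
corpus = os.path.join(work_dir, "corpus.txt")
with open(corpus, "w", encoding="utf-8") as f:
    f.write("this is a tiny corpus\njust enough text to fit a toy vocab\n")

tokenizer = BertWordPieceTokenizer(lowercase=True)
tokenizer.train(files=[corpus], vocab_size=100, min_frequency=1)

out_dir = os.path.join(work_dir, "toy_tokenizer")
os.makedirs(out_dir, exist_ok=True)
print(tokenizer.save_model(out_dir))        # ['<out_dir>/vocab.txt']

reloaded = BertWordPieceTokenizer(os.path.join(out_dir, "vocab.txt"))
encoded = reloaded.encode("tiny corpus")
print(encoded.tokens)
print(reloaded.decode(encoded.ids))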
Example No. 4
def run_dataset():
    """test dataset"""
    import os
    from datasets.collate_functions import collate_to_max_length
    from torch.utils.data import DataLoader
    # zh datasets
    # bert_path = "/mnt/mrc/chinese_L-12_H-768_A-12"
    # json_path = "/mnt/mrc/zh_msra/mrc-ner.test"
    # # json_path = "/mnt/mrc/zh_onto4/mrc-ner.train"
    # is_chinese = True

    # en datasets
    bert_path = "/mnt/mrc/bert-base-uncased"
    json_path = "/mnt/mrc/ace2004/mrc-ner.train"
    # json_path = "/mnt/mrc/genia/mrc-ner.train"
    is_chinese = False

    vocab_file = os.path.join(bert_path, "vocab.txt")
    tokenizer = BertWordPieceTokenizer(vocab_file=vocab_file)
    dataset = MRCNERDataset(json_path=json_path,
                            tokenizer=tokenizer,
                            is_chinese=is_chinese)

    dataloader = DataLoader(dataset,
                            batch_size=32,
                            collate_fn=collate_to_max_length)

    for batch in dataloader:
        for tokens, token_type_ids, start_labels, end_labels, start_label_mask, end_label_mask, match_labels, sample_idx, label_idx in zip(
                *batch):
            tokens = tokens.tolist()
            start_positions, end_positions = torch.where(match_labels > 0)
            start_positions = start_positions.tolist()
            end_positions = end_positions.tolist()
            if not start_positions:
                continue
            print("=" * 20)
            print(f"len: {len(tokens)}",
                  tokenizer.decode(tokens, skip_special_tokens=False))
            for start, end in zip(start_positions, end_positions):
                print(
                    str(sample_idx.item()),
                    str(label_idx.item()) + "\t" +
                    tokenizer.decode(tokens[start:end + 1]))
Example No. 5
    def generate_custom_vocab(self):

        try:
            tokenizer = None
            # root dir path check and generate
            if not os.path.isdir(self.vocab_root_dir):
                os.makedirs(self.vocab_root_dir, exist_ok=True)

            # generate models directory
            self.vocab_dir = '/BERT_TRAINING_VOCAB_' + self.getCurrent_time()[2] + '/'
            os.makedirs(self.vocab_root_dir + self.vocab_dir, exist_ok=True)

            user_defined_symbols = ['[BOS]', '[EOS]', '[UNK]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]', '[UNK5]',
                                    '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]']
            unused_token_num = 200
            unused_list = ['[unused{}]'.format(n) for n in range(unused_token_num)]
            user_defined_symbols = user_defined_symbols + unused_list

            if self.tokenizer_type == 'word':
                # if lowercase is False, strip_accents must also be set to False
                tokenizer = BertWordPieceTokenizer(strip_accents=False,
                                                   lowercase=True,
                                                   clean_text=True,
                                                   handle_chinese_chars=True,
                                                   wordpieces_prefix="##"
                                                   )

            # when 'base' is selected, the bert-base-uncased tokenizer is used as-is, so there is nothing to train here

            # training vocab start
            corpus_file = [self.corpus_path]
            vocab_size = 32000
            limit_alphabet = 6000
            min_frequency = 3
            tokenizer.train(files=corpus_file,
                            vocab_size=vocab_size,
                            special_tokens=user_defined_symbols,
                            min_frequency=min_frequency,  # minimum word frequency, 3
                            limit_alphabet=limit_alphabet,  # must be commented out when training a ByteLevelBPETokenizer
                            show_progress=True)

            self.setPrint('Custom Tokenizer Training is completed')

            sentence = '전화 통화가 정상적으로 안됨.'
            output = tokenizer.encode(sentence)
            self.setPrint('Tokenizer 테스트 문장: {}'.format(sentence))
            self.setPrint('Tokenizer 분석 결과\n=>idx: {}\n=>tokens: {}\n=>offset: {}\n=>decode: {}\n'.
                          format(output.ids, output.tokens, output.offsets, tokenizer.decode(output.ids)))

            # save tokenizer
            tokenizer.save_model(self.vocab_root_dir + self.vocab_dir)

        except:
            self.setPrint('Error: {}. {}, line: {}'.format(sys.exc_info()[0],
                                                           sys.exc_info()[1],
                                                           sys.exc_info()[2].tb_lineno))
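As an aside on the symbol list built above: a sketch of the same construction, under the assumption (based on how the WordPiece trainer handles special_tokens) that tokens passed via special_tokens land at the front of the learned vocabulary in the order given, which is what makes the [unusedN] placeholders useful for later domain-specific additions.

user_defined_symbols = (['[BOS]', '[EOS]', '[UNK]']
                        + ['[UNK{}]'.format(n) for n in range(1, 10)]
                        + ['[unused{}]'.format(n) for n in range(200)])
print(len(user_defined_symbols))   # 212 reserved symbols
# after training, tokenizer.token_to_id('[unused0]') should be a stable low id that
# can be repurposed for new tokens without retraining (ordering assumption noted above)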
Example No. 6
def run():
    data_path = "/data/nfsdata2/sunzijun/glyce/glyce/data/bin"
    bert_path = "/data/nfsdata2/sunzijun/glyce/glyce/bert_chinese_base_large_vocab"

    tokenizer = BertWordPieceTokenizer(os.path.join(bert_path, "vocab.txt"))
    prefix = "dev"
    dataset = StaticGlyceMaskLMDataset(data_path,
                                       vocab_file=os.path.join(
                                           bert_path, "vocab.txt"),
                                       prefix=prefix,
                                       max_length=512)
    print(len(dataset))
    from tqdm import tqdm
    for d in tqdm(dataset):
        print([v.shape for v in d])
        print(tokenizer.decode(d[0].tolist(), skip_special_tokens=False))
        tgt = [
            src if label == -100 else label
            for src, label in zip(d[0].tolist(), d[2].tolist())
        ]
        print(tokenizer.decode(tgt, skip_special_tokens=False))
Example No. 7
class CheckerDecoder:
    def __init__(self, model_dir):
        self.detector = DetectorModel(os.path.join(model_dir, 'detector'))
        self.corrector = CorrectorModel(os.path.join(model_dir, 'corrector'))
        self.tokenizer = BertWordPieceTokenizer(
            os.path.join(model_dir, 'vocab.txt'))
        mask_id = self.tokenizer.encode('[MASK]').ids[1:-1]
        assert len(mask_id) == 1
        self.mask_id = mask_id[0]

    def predict(self, text, suggest=False, k=5, max_k=200):
        tokenized = self.tokenizer.encode(text)
        if len(tokenized.tokens) > MAX_LEN:
            raise ValueError('The text is too long (>512) to process')
        token_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        mapping = rematch(tokenized.offsets)
        token_ids, segment_ids = np.array([token_ids]), np.array([segment_ids])
        probas = self.detector.predict(token_ids, segment_ids)[0][0]
        incorrect_ids = np.where(probas > 0.5)[0]
        token_ids[0, incorrect_ids] = self.mask_id

        if not suggest:
            ret = []
            for i in incorrect_ids:
                ret.append((i - 1, tokenized.tokens[i]))
            return ret

        probas = self.corrector.predict(token_ids, segment_ids)[0][0]
        sorted_probas, sort_indexs = topK(probas, max_k)
        ret = {}
        for i in incorrect_ids:
            if i == 0 or i == len(tokenized.tokens) - 1:
                continue
            current_token = text[mapping[i][0]:mapping[i][-1] + 1]
            current_pinyin = ' '.join(xmnlp.pinyin(current_token))
            cands = []
            for proba, token in zip(
                    sorted_probas[i],
                    self.tokenizer.decode(sort_indexs[i]).split()):
                pinyin = ' '.join(xmnlp.pinyin(token))
                score = 0
                if current_pinyin == pinyin:
                    score = 1
                cands.append((token, proba + score))
            cands.sort(key=lambda x: x[1], reverse=True)
            ret[(i - 1, current_token)] = cands[:k]
        return dict(ret)
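A quick check (with a hypothetical vocab.txt path) of why the ids[1:-1] slice in __init__ above isolates the [MASK] id: encode() wraps its input in [CLS]/[SEP], so the middle id is the mask, and token_to_id gives the same value directly.

from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer('vocab.txt')     # path is hypothetical; any BERT vocab with [MASK]
enc = tokenizer.encode('[MASK]')
print(enc.tokens)                                   # ['[CLS]', '[MASK]', '[SEP]']
assert enc.ids[1:-1] == [tokenizer.token_to_id('[MASK]')]
mask_id = tokenizer.token_to_id('[MASK]')           # same value, no slicing needed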
Example No. 8
def run_dataset():
    """test dataset"""
    import os

    #from datasets.collate_functions import collate_to_max_length
    from collate_functions import collate_to_max_length, collate_to_max_length_demo

    from torch.utils.data import DataLoader
    # zh datasets
    # bert_path = "/mnt/mrc/chinese_L-12_H-768_A-12"
    # json_path = "/mnt/mrc/zh_msra/mrc-ner.test"
    # # json_path = "/mnt/mrc/zh_onto4/mrc-ner.train"
    # is_chinese = True

    # en datasets
    bert_path = "../chinese_roberta_wwm_large_ext_pytorch"
    json_path = "zh_msra/mrc-ner.demo"
    # json_path = "/mnt/mrc/genia/mrc-ner.train"
    is_chinese = False

    vocab_file = os.path.join(bert_path, "vocab.txt")
    #assert os.path.exists(vocab_file)
    tokenizer = BertWordPieceTokenizer(vocab_file=vocab_file)

    dataset = MRCNERDataset_demo(json_path=json_path,
                                 tokenizer=tokenizer,
                                 is_chinese=is_chinese)

    dataloader = DataLoader(dataset,
                            batch_size=32,
                            collate_fn=collate_to_max_length_demo)
    # dataloader = DataLoader(dataset, batch_size=32)

    for batch in dataloader:
        for tokens, token_type_ids in zip(*batch):
            tokens = tokens.tolist()

            print("=" * 20)
            print(f"len: {len(tokens)}",
                  tokenizer.decode(tokens, skip_special_tokens=False))
Example No. 9
class PreTrainedTokenizer(GenericTokenizer):
    vocab_files = {
        "bert-base-uncased":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
        "bert-large-uncased":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
        "bert-base-cased":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
        "bert-large-cased":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
        "bert-base-multilingual-uncased":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
        "bert-base-multilingual-cased":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
        "bert-base-chinese":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
        "bert-base-german-cased":
        "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
        "bert-large-uncased-whole-word-masking":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
        "bert-large-cased-whole-word-masking":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
        "bert-large-uncased-whole-word-masking-finetuned-squad":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
        "bert-large-cased-whole-word-masking-finetuned-squad":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
        "bert-base-cased-finetuned-mrpc":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
        "bert-base-german-dbmdz-cased":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
        "bert-base-german-dbmdz-uncased":
        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
        "bert-base-finnish-cased-v1":
        "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
        "bert-base-finnish-uncased-v1":
        "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
        "bert-base-dutch-cased":
        "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt",
        "moses-pre-tokenized-wmt-uncased-fr":
        "https://drive.google.com/uc?export=download&id=1kYxOhJh4UshVE_SGYMANjLn_oEB6RMYC",
        "moses-pre-tokenized-wmt-uncased-en":
        "https://drive.google.com/uc?export=download&id=1hIURG9eiIXQYCm8cS4vJM3RLVl6UcW32",
        "moses-pre-tokenized-paracrawl-uncased-accented-de":
        "https://drive.google.com/uc?export=download&id=15EKdo2IXyyfZvrpOEwtx4KgeeL6Ot-Gi"
    }

    def __init__(self,
                 lang,
                 root='../.data',
                 clean_text=False,
                 handle_chinese_chars=True,
                 strip_accents=False,
                 lowercase=True):
        """
        Example instantiation: PreTrainedTokenizer("bert-base-uncased", root="../.data")
        """
        pre_trained_model_name = self.get_default_model_name(lang, lowercase)
        self._model_name_ = pre_trained_model_name
        if not os.path.exists(root):
            os.mkdir(root)
        assert pre_trained_model_name in self.vocab_files, \
            "The requested pre_trained tokenizer model {} does not exist!".format(pre_trained_model_name)
        url = self.vocab_files[pre_trained_model_name]
        f_name = root + "/" + pre_trained_model_name + ".txt"
        if not os.path.exists(f_name):
            with open(f_name, "wb") as file_:
                response = get(url)
                file_.write(response.content)
        self.moses_tkn = PyMosesTokenizer(lang, lowercase)
        self.tokenizer = BertWordPieceTokenizer(
            f_name,
            clean_text=clean_text,
            lowercase=lowercase,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents)
        self.mid_tokens = {
            ".": "&md;",
            "-": "&hp;",
            "\'": "&ma;",
            ",": "&mc;",
            " ": "&fs;"
        }
        self.reverse_mid_tokens = {v: k for k, v in self.mid_tokens.items()}
        self.lang = lang

    def get_tokenized_sub_tokens(self, token, mid_sign):
        result = []
        if mid_sign in self.mid_tokens:
            sub_tokens = token.split(mid_sign)
            assert len(sub_tokens) > 1
            if not len("".join(sub_tokens)):
                result.append(self.mid_tokens[" "])
            for sub_token in sub_tokens[:-1]:
                result.append(sub_token)
                result.append(self.mid_tokens[mid_sign])
            if len(sub_tokens[-1]) or (mid_sign == '\'' and self.lang == "fr"):
                result.append(sub_tokens[-1])
            else:  # case like "p.m." where the last token is empty
                result.append(self.mid_tokens[" "])
        else:
            result.append(token)
        return result

    def tokenize_token(self, tokens, mid_sign):
        res = []
        for token in tokens:
            if len(token) > 1 and mid_sign in token:
                for sub_token in self.get_tokenized_sub_tokens(
                        token, mid_sign):
                    res.append(sub_token)
            else:
                res.append(token)
        return res

    def tokenize(self, text):
        """
        You can recover the output of this function using " ".join(encoded_list).replace(" ##", "")
        :param text: one line of text in type of str
        :return a list of tokenized "str"s
        """
        if not len(text.strip()):
            return [""]
        tokens = []
        for token in self.moses_tkn.tokenize(text):
            if token.startswith("'") and token != "'":
                token = token.replace("'", "\'")
            if self.lang == "fr" and len(token) > 1 and token[1:] == "'":
                token = token.replace("'", "\'")
            elif self.lang == "fr" and "qu'" in token:
                token = token.replace("'", "\'")
            sub_ts = [token]
            for mid_sign in self.mid_tokens:
                sub_ts = self.tokenize_token(sub_ts, mid_sign)
            for sub_token in sub_ts:
                tokens.append(sub_token)
        # encoding = self.tokenizer.encode(n_text, add_special_tokens=False)
        encoding = self.tokenizer.encode(tokens,
                                         is_pretokenized=True,
                                         add_special_tokens=False)
        # encoding contains "ids", "tokens", and "offsets"
        return encoding.tokens

    def detokenize(self, tokenized_list):
        # TODO make it work on more test examples
        temp_result = []
        # Merging sub-tokens
        for token in tokenized_list:
            if len(temp_result) and token.startswith("##"):
                temp_result[-1] = temp_result[-1] + token[2:]
            else:
                temp_result.append(token)
        result = []
        index = 0
        t_len = len(temp_result)
        # merging & tokens for moses decoder
        while index < t_len:
            if temp_result[index] == "&" and index < t_len - 2 and temp_result[
                    index + 2] == ";":
                result.append("".join(temp_result[index:index + 3]))
                index += 3
            elif temp_result[
                    index] == "&" and index < t_len - 3 and temp_result[
                        index + 3] == ";":
                result.append("".join(temp_result[index:index + 4]))
                index += 4
            else:
                result.append(temp_result[index])
                index += 1
        del temp_result[:]
        index = 0
        t_len = len(result)
        # merging &hyphen; tokens for moses decoder
        while index < t_len:
            if result[index] in self.reverse_mid_tokens:
                if not len(temp_result):
                    temp_result.append("")
                if index + 1 < t_len and result[
                        index +
                        1] in self.reverse_mid_tokens:  # final dot in "p.m."
                    temp_result[-1] += self.reverse_mid_tokens[
                        result[index]] + self.reverse_mid_tokens[result[index +
                                                                        1]]
                    index += 2
                elif index + 1 < t_len:  # middle dot in "p.m."
                    temp_result[-1] += self.reverse_mid_tokens[
                        result[index]] + result[index + 1]
                    index += 2
                else:  # anything else
                    temp_result[-1] += self.reverse_mid_tokens[result[index]]
                    index += 1
            else:
                temp_result.append(result[index])
                index += 1
        return self.moses_tkn.detokenize(temp_result)

    def decode(self, encoded_ids_list):
        """
        :param encoded_ids_list: list of int ids
        :return a decoded str
        """
        decoded = self.tokenizer.decode(encoded_ids_list)
        return decoded

    @staticmethod
    def get_default_model_name(lang, lowercase):
        if lang == "en" and lowercase:
            return "bert-base-uncased"
        elif lang == "en" and not lowercase:
            return "bert-base-cased"
        elif lang == "zh":
            return "bert-base-chinese"
        elif lang == "de" and lowercase:
            return "bert-base-german-dbmdz-uncased"
        elif lang == "de" and not lowercase:
            return "bert-base-german-dbmdz-cased"
        elif lang == "fi" and lowercase:
            return "bert-base-finnish-uncased-v1"
        elif lang == "fi" and not lowercase:
            return "bert-base-finnish-cased-v1"
        elif lang == "fr" and lowercase:
            return "moses-pre-tokenized-wmt-uncased-fr"
        else:
            raise ValueError(
                "No pre-trained tokenizer found for language {} in {} mode".
                format(lang, "lowercased" if lowercase else "cased"))

    @property
    def model_name(self):
        return self._model_name_
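The recovery trick mentioned in the tokenize() docstring above can be checked without any vocab file; the wordpieces below are made up:

pieces = ['the', 'new', 'token', '##izer', 'works']   # invented wordpiece output
print(' '.join(pieces).replace(' ##', ''))            # 'the new tokenizer works'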
Example No. 10
class TFLiteNLU:
    """Abstraction for using TFLite NLU models

    Args:
        model_dir (str): path to the model directory containing nlu.tflite,
                         metadata.json, and vocab.txt
    """
    def __init__(self, model_dir: str) -> None:
        self._model = TFLiteModel(
            model_path=os.path.join(model_dir, "nlu.tflite"))
        self._metadata = utils.load_json(
            os.path.join(model_dir, "metadata.json"))
        self._tokenizer = BertWordPieceTokenizer(
            os.path.join(model_dir, "vocab.txt"))
        self._max_length = self._model.input_details[0]["shape"][-1]
        self._intent_decoder = {
            i: intent["name"]
            for i, intent in enumerate(self._metadata["intents"])
        }
        self._tag_decoder = {
            i: tag
            for i, tag in enumerate(self._metadata["tags"])
        }
        self._intent_meta = {
            intent.pop("name"): intent
            for intent in self._metadata["intents"]
        }
        self._slot_meta = {}
        for intent in self._intent_meta:
            for slot in self._intent_meta[intent]["slots"]:
                self._slot_meta[slot.pop("name")] = slot
        self._warm_up()

    def __call__(self, utterance: str) -> Result:
        """Classifies a string utterance into an intent and identifies any associated
            slots contained in the utterance. The slots get parsed based on type and
            then returned along with the intent and its associated confidence value.

        Args:
            utterance (str): string that needs to be understood

        Returns (Result): A class with properties for the identified intent, along with
                        raw, parsed slots and model confidence in prediction

        """
        inputs, input_ids = self._encode(utterance)
        outputs = self._model(inputs)
        intent, tags, confidence = self._decode(outputs)

        # slice off special tokens: [CLS], [SEP]
        tags = tags[:len(input_ids) - 2]
        _LOG.debug(f"{tags}")
        input_ids = input_ids[1:-1]
        _LOG.debug(f"{input_ids}")
        # retrieve slots from the tagged positions and decode slots back
        # into original values
        slots = [(token_id, tag[2:]) for token_id, tag in zip(input_ids, tags)
                 if tag != "o"]
        _LOG.debug(f"{slots}")

        slot_map: dict = {}
        for (token, tag) in slots:
            if tag in slot_map:
                slot_map[tag].append(token)
            else:
                slot_map[tag] = [token]

        for key, value in slot_map.items():
            slot_map[key] = self._tokenizer.decode(value)

        # attempt to resolve tagged tokens into slots and
        # collect the successful ones
        parsed_slots = {}
        for key in slot_map:
            parsed = self._parse_slots(self._slot_meta[key], slot_map[key])
            parsed_slots[key] = {
                "name": key,
                "parsed_value": parsed,
                "raw_value": slot_map[key],
            }
        _LOG.debug(f"parsed slots: {parsed_slots}")
        return Result(
            utterance=utterance,
            intent=intent,
            confidence=confidence,
            slots=parsed_slots,
        )

    def _warm_up(self) -> None:
        # make an array the same size as the inputs to warm the
        # model since first inference is always slower than subsequent
        warm = np.zeros((self._model.input_details[0]["shape"]),
                        dtype=np.int32)
        _ = self._model(warm)

    def _encode(self, utterance: str) -> Tuple[np.ndarray, List[int]]:
        inputs = self._tokenizer.encode(utterance)
        # get the non-padded/truncated token ids to match the
        # original utterance to the respective labels and
        # use the length to slice the results
        input_ids = inputs.ids
        # it's (max_length + 1) because the [CLS]
        # token gets appended inside the model
        # notice the slice [1:] when we convert to an array
        inputs.truncate(max_length=self._max_length + 1)
        inputs.pad(length=self._max_length + 1)
        inputs = np.array(inputs.ids[1:], np.int32)
        # add the batch dimension for the TFLite model
        inputs = np.expand_dims(inputs, 0)
        return inputs, input_ids

    def _decode(self, outputs: list) -> Tuple[str, List[str], float]:
        # to get the index of the highest probability we
        # apply argmax to the posteriors which allows the
        # labels to be decoded with an integer to string mapping
        # we derive the confidence from the highest probability
        intent_posterior, tag_posterior = outputs
        intents, confidence = self._decode_intent(intent_posterior)
        tags = self._decode_tags(tag_posterior)
        _LOG.debug(f"decoded tags: {tags}")
        _LOG.debug(f"decoded intent: {intents}")
        _LOG.debug(f"confidence: {confidence}")
        return intents, tags, confidence

    def _decode_tags(self, posterior: np.ndarray) -> List[Any]:
        posterior = np.squeeze(posterior, 0)
        tags = np.argmax(posterior, -1)
        return [self._tag_decoder.get(tag) for tag in tags]

    def _decode_intent(self, posterior: np.ndarray) -> Any:
        posterior = np.squeeze(posterior, 0)
        intent = np.argmax(posterior, -1)
        return self._intent_decoder.get(intent), posterior[intent]

    def _parse_slots(self, slot_meta: Dict[str, Any], slots: Dict[str, Any]) -> Any:
        slot_type = slot_meta["type"]
        parser = import_module(f"spokestack.nlu.parsers.{slot_type}")
        facets = json.loads(slot_meta["facets"])
        return parser.parse(facets, slots)  # type: ignore
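A dummy walk-through of the tag-grouping step in __call__ above; the ids and IOB-style tags below are invented, but the transformation is the same one the code performs before calling tokenizer.decode on each group:

input_ids = [7592, 1010, 2054, 2003, 1996, 4633]      # made-up token ids
tags = ['o', 'o', 'o', 'o', 'b_city', 'i_city']       # made-up IOB tags
slots = [(tid, tag[2:]) for tid, tag in zip(input_ids, tags) if tag != 'o']
slot_map = {}
for token, tag in slots:
    slot_map.setdefault(tag, []).append(token)
print(slot_map)   # {'city': [1996, 4633]} -> each id list is then tokenizer.decode()'d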
Example No. 11
# data_00150000_00150539.gif,Place all the blocks individually on the surface.,Disjoint the given stack of blocks.
# data_00110000_00110725.gif,"Separate the given stack to form yellow, red blocks stack.",Remove 2nd and 4th blocks from the given stack.
# data_00120000_00120478.gif,Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block

# Now, let's use it:
#input = "I can feel the magic, can you?"
#input = "Disjoint the given stacks to form a new stack with blue, red blocks."
#input = "Make a new stack with blue, red blocks."
input = "Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block"
print(input)
encoded = tokenizer.encode(input)  #, return_tensors="pt")
print(encoded)

print(encoded.ids)
print(encoded.tokens)
print(tokenizer.decode(encoded.ids))


# Unit testing ;)
def compare(filename, debug=False):
    # Iterate through all commands.
    diffs = 0
    total = 0
    with open(filename, "r") as f:
        csvreader = csv.reader(f, delimiter=';')
        for row in csvreader:
            for command in row:
                total += 1
                # "Custom" processing for comparison - remove commas and three dots.
                command = command.replace(",", "")
                command = command.replace("...", "")
Example No. 12
    with torch.no_grad():
        for batch in dataloader:          
            tokens, token_type_ids = batch

            attention_mask = (tokens != 0).long()
            start_logits, end_logits, span_logits = model(tokens, attention_mask, token_type_ids)

            ls_start=start_logits.squeeze().cpu().numpy().tolist()
            ls_end=end_logits.squeeze().cpu().numpy().tolist()

            for s, e, t in zip(ls_start, ls_end, tokens):
                ss=[i for i,v in enumerate(s) if v>0]
                ee=[i for i,v in enumerate(e) if v>0]

                t=t.tolist()
                t_d=tokenizer.decode(t, skip_special_tokens=True)
                print('\n', t_d[len(query)*2:])

                # print(ss, ee)

                if len(ss)==len(ee) and len(ss)>0:
                    for i, j in zip(ss, ee):
                        print('【Company】: ', tokenizer.decode(t[i:j+1]))
                else:
                    print('【Company】: None')
Example No. 13
class FastBERTTokenizer:
    r"""
    Constructs a WordPiece tokenizer backed by the `tokenizers
    <https://github.com/huggingface/tokenizers>`__ ``BertWordPieceTokenizer``
    (the earlier `SentencePiece <https://github.com/google/sentencepiece>`__ code
    path is left commented out in ``__init__``).
    Args:
        vocab_file (:obj:`str`):
            Path to a BERT-style ``vocab.txt`` file that contains the vocabulary
            necessary to instantiate the tokenizer.
        fb_model_kwargs (:obj:`dict`, `optional`):
            Extra keyword arguments stored on the instance; kept for compatibility
            with the previous SentencePiece-based implementation (where they were
            passed to ``SentencePieceProcessor.__init__()`` to control subword
            sampling, e.g. ``enable_sampling``, ``nbest_size`` and ``alpha``).
    """
    def __init__(self,
                 vocab_file,
                 fb_model_kwargs: Optional[Dict[str, Any]] = None):
        self.vocab_file = vocab_file
        self.fb_model_kwargs = {} if fb_model_kwargs is None else fb_model_kwargs

        assert os.path.exists(vocab_file), "no existing vocab file."

        #spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
        #spm.load(vocab_file)
        #bpe_vocab_size = spm.GetPieceSize()
        #self.spm = spm

        self.bert_tokenizer = BertWordPieceTokenizer(vocab_file,
                                                     clean_text=False,
                                                     strip_accents=False,
                                                     lowercase=False)

        self.vocab = {}
        self.ids_to_tokens = []
        self.add_from_file(open(vocab_file, 'r'))
        self.add_symbol('<mask>')
        self.vocab_size = len(self.vocab)

    def add_from_file(self, f):
        """
        Loads a pre-existing dictionary from a text file and adds its symbols
        to this instance.
        """

        lines = f.readlines()

        for line in lines:
            line = line.rstrip()
            word = line
            self.add_symbol(word, overwrite=False)

    def add_symbol(self, word, overwrite=False):
        """Adds a word to the dictionary"""
        if word in self.vocab and not overwrite:
            idx = self.vocab[word]
            return idx
        else:
            idx = len(self.ids_to_tokens)
            self.vocab[word] = idx
            self.ids_to_tokens.append(word)
            return idx

    def tokenize(self, text):
        return self.bert_tokenizer.encode(text,
                                          add_special_tokens=False).tokens

    def convert_ids_to_tokens(self, index):
        return self.ids_to_tokens[index] if index < self.vocab_size else self.unk()

    def _convert_token_to_id(self, token):
        return self.vocab[token]

    def decode(self, x: str) -> str:
        return self.bert_tokenizer.decode([int(tok) for tok in x.split()])

    def pad(self):
        return "[PAD]"

    def bos(self):
        return "[CLS]"

    def eos(self):
        return "[SEP]"

    def unk(self):
        return "[UNK]"

    def mask(self):
        return "<mask>"

    def sym(self, id):
        return self.ids_to_tokens[id]

    def id(self, sym):
        return self.vocab[sym] if sym in self.vocab else 1

    def save_pretrained(self, path: str, filename_prefix: str = None):
        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
        if filename_prefix is not None:
            filename = filename_prefix + "-" + filename
        full_path = os.path.join(path, filename)
        with open(full_path, "w") as fs:
            # plain-text vocab: one token per line (the SentencePiece export is not used here)
            for item in self.ids_to_tokens:
                fs.write(str(item) + '\n')
        return (full_path, )

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]
Example No. 14
    with open(metrics_path, 'rb') as metrics_handle:
        metrics_obj = pickle.load(metrics_handle)
        
    return (metrics_obj['token_pairs'],
            metrics_obj['decoded_pairs'],
            metrics_obj['jaccard_similarities'],
            metrics_obj['levenshtein_distances'])
        
token_pairs, decoded_pairs, jaccard_similarities, levenshtein_distances = load_metrics_obj()

if not token_pairs:
    token_pairs = [([tokenizer.id_to_token(x) for x in ocr_tokens[i]], [tokenizer.id_to_token(x) for x in gs_tokens[i]]) for i in range(len(ocr_tokens))]
    save_metrics_obj(token_pairs, decoded_pairs, jaccard_similarities, levenshtein_distances)
    
if not decoded_pairs:
    decoded_pairs = [(tokenizer.decode(ocr_tokens[i]), tokenizer.decode(gs_tokens[i])) for i in range(len(ocr_tokens))]
    save_metrics_obj(token_pairs, decoded_pairs, jaccard_similarities, levenshtein_distances)
    
all_pairs = len(token_pairs)
if not jaccard_similarities:
    jaccard_similarities = []
    for i, token_pair in enumerate(token_pairs):
        jaccard_similarities.append(calculate_jaccard_similarity(token_pair[0], token_pair[1]))
    
    save_metrics_obj(token_pairs, decoded_pairs, jaccard_similarities, levenshtein_distances)
    
if not levenshtein_distances:
    levenshtein_distances = []
    
if len(levenshtein_distances) < all_pairs:
    for i, decoded_pair in enumerate(decoded_pairs):
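calculate_jaccard_similarity is not shown in this snippet; as a hedged sketch, here is one common set-based definition over the token pairs built above (an assumption, not the author's actual helper):

def calculate_jaccard_similarity(tokens_a, tokens_b):
    # |intersection| / |union| over the unique tokens of each sequence (assumed definition)
    a, b = set(tokens_a), set(tokens_b)
    return len(a & b) / len(a | b) if (a | b) else 1.0

print(calculate_jaccard_similarity(['the', 'cat', 'sat'], ['the', 'cat', 'ran']))  # 0.5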
Example No. 15
#     tokenizer.save('./', 'token_test')
# else:

#     tokenizer = ByteLevelBPETokenizer( "./{}-vocab.json".format('token_test'), "./{}-merges.txt".format('token_test'),
#         add_prefix_space=True,
#     )

# # Now we can encode
# encoded = tokenizer.encode("will be back later.  http://plurk.com/p/rp3k7,will be back later, loooove u @mahboi #blessed")
# print(encoded.tokens)
# print(encoded.offsets)

from tokenizers import BertWordPieceTokenizer
# My arbitrary sentence
sentence = "[CLS] will be back later.  www.facebook.com ,will be back later, loooove u @mahboi #blessed"
# Bert vocabularies
# Instantiate a Bert tokenizer
tokenizer = BertWordPieceTokenizer("bert-large-uncased-vocab.txt",
                                   lowercase=True,
                                   clean_text=True)
tokenizer.add_tokens(['[LINK]'])

tokenizer.enable_padding(max_length=100)
WordPieceEncoder = tokenizer.encode(sentence)
# Print the ids, tokens and offsets
print(WordPieceEncoder.ids)
print(WordPieceEncoder.tokens)
print(WordPieceEncoder.offsets)
print(tokenizer.get_vocab()['[PAD]'])
print(tokenizer.decode(WordPieceEncoder.ids))
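A self-contained variant of the snippet above that runs without bert-large-uncased-vocab.txt: a throwaway vocab is written to a temp file (tokens and paths are made up), and it also shows that decode() skips special tokens by default.

import os
import tempfile

from tokenizers import BertWordPieceTokenizer

# made-up miniature vocab; ids follow line order, so [PAD] gets id 0
vocab = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'will', 'be', 'back', 'later', '.']
vocab_file = os.path.join(tempfile.mkdtemp(), 'vocab.txt')
with open(vocab_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(vocab))

tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=True, clean_text=True)
enc = tokenizer.encode('will be back later.')
print(enc.tokens)                       # ['[CLS]', 'will', 'be', 'back', 'later', '.', '[SEP]']
print(enc.ids)
print(tokenizer.decode(enc.ids))        # 'will be back later.' – [CLS]/[SEP] skipped by default
print(tokenizer.get_vocab()['[PAD]'])   # 0, since [PAD] is the first vocab entry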
Example No. 16
                dec_seq_len=512)
checkpoint = torch.load(
    'checkpoints/amadeus-performer-2020-11-25-00.20.57-300.pt')
model.eval(True)
# model.load_state_dict(torch.load('models/amadeus-performer-2020-11-06-12.47.52.pt'))
model.load_state_dict(checkpoint['model_state_dict'])
model.cuda()

run = True

sentences = []

while run:
    try:
        sentence = input('> ')
        if sentence in ['quit', 'exit']:
            run = False
            continue
        sentences.append(tokenizer.encode(sentence))
        if len(sentences) > 3:
            sentences = sentences[-3:]
        input_seq = torch.tensor(Encoding.merge(sentences[:]).ids).cuda()
        start_tokens = torch.tensor([tokenizer.token_to_id('[CLS]')]).cuda()
        out = model.generate(input_seq=input_seq,
                             start_tokens=start_tokens,
                             eos_token=tokenizer.token_to_id('[SEP]'))
        response = tokenizer.decode(out.tolist())
        sentences.append(tokenizer.encode(response))
        print(response)
    except KeyboardInterrupt:
        run = False