Example #1
def test_vocab_with_special_token(tokens, frequencies):
    vocab = Vocab(
        tokens,
        frequencies=frequencies,
        min_frequency=5,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
    )
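    # "c" falls below min_frequency and is therefore excluded, while the
    # special tokens remain in the vocabulary.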
    assert len(vocab) == len(tokens) - 1
    assert "c" not in vocab
    assert "##b" in vocab
    assert vocab["a"] == 2
    assert vocab.token2idx(["a", "##b", "<unk>"]) == [2, 3, 5]
    assert vocab.idx2token([2, 3, 5]) == ["a", "##b", "<unk>"]
    assert vocab.sorted_tokens == [
        "<s>",
        "</s>",
        "a",
        "##b",
        "ddd",
        "<unk>",
        "<pad>",
    ]
    assert vocab.sorted_token_lengths == [1, 1, 1, 1, 3, 1, 1]
Example #2
def test_vocab_ichar2itoken(
    char_start, char_end, token_start, token_end, input_tokens, tokens, frequencies
):
    vocab = Vocab(
        tokens,
        frequencies=frequencies,
        min_frequency=5,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<s>",
        eos_token="</s>",
    )
    start_mapping, end_mapping = vocab.create_ichar2itoken_mapping(input_tokens)
    assert start_mapping[char_start] == token_start
    assert end_mapping[char_end] == token_end
Example #3
def test_word2vec():
    vocab = Vocab(["a", "b", "c"])
    wv = Word2vec(vocab, 10)
    token_ids = np.array([1, 2, 0, 0])
    embeddings = wv(token_ids)
    assert embeddings.shape == (4, 10)
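    # Token id 0 is the <pad> index, so those positions are masked out.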
    assert embeddings._keras_mask.numpy().tolist() == [True, True, False, False]
Example #4
def test_jieba_tokenizer():
    vocab = Vocab(["你们", "我们", "好"])
    tokenizer = JiebaTokenizer(vocab)
    tokens = tokenizer.tokenize("你们与我们好")
    assert tokens[0] == "你们"
    assert tokens[1] == "与"
    assert tokens[2] == "我们"
    assert tokens[3] == "好"
Example #5
def test_vocab_serialization(tokens, frequencies):
    vocab = Vocab(
        tokens,
        frequencies,
        min_frequency=5,
        unk_token="<unk>",
        pad_token="<pad>",
        bos_token="<bos>",
        eos_token="<eos>",
    )
    new_vocab = Vocab.from_json(vocab.to_json())
    assert len(vocab) == len(new_vocab)
    assert vocab.sorted_tokens == new_vocab.sorted_tokens
    assert vocab.pad == new_vocab.pad
    assert vocab.unk == new_vocab.unk
    assert vocab.bos == new_vocab.bos
    assert vocab.eos == new_vocab.eos
    assert str(vocab) == str(new_vocab)
Example #6
def build_vocab(
    texts: Sequence[str],
    segment_func: Callable[[str], Sequence[str]],
    min_frequency=5,
) -> Vocab:
    counter = Counter(
        itertools.chain.from_iterable(segment_func(text) for text in texts)
    )
    return Vocab(counter, min_frequency=min_frequency)
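A minimal usage sketch (hypothetical data; the builtin list serves as a character-level segment_func and is not part of the original code):
# Hypothetical usage: keep every character by segmenting with list.
vocab = build_vocab(["我们好", "你们好"], segment_func=list, min_frequency=1)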
Example #7
def test_empty_vocab():
    vocab = Vocab([])
    assert vocab.pad == "<pad>"
    assert vocab.unk == "<unk>"
    assert vocab.bos == "<bos>"
    assert vocab.eos == "<eos>"
    assert vocab[vocab.pad] == 0
    assert vocab[vocab.unk] == 1
    assert vocab[vocab.bos] == 2
    assert vocab[vocab.eos] == 3
Example #8
def load(cls, directory: str, epoch: Optional[int] = None) -> "BaseNLPModel":
    with open(os.path.join(directory, "meta.json"), encoding="UTF-8") as f:
        meta = json.loads(f.read())
    with open(os.path.join(directory, "vocab.json"), encoding="UTF-8") as f:
        vocab = Vocab.from_json(f.read())
    module = cls.from_config({"vocab": vocab, **meta})
    module._model = tf.keras.models.load_model(
        os.path.join(directory, cls._get_model_filename(epoch=epoch))
    )
    module._built = True
    return module
Example #9
def test_transform_func(text_with_empty):
    vocab = Vocab(["x", "y"])
    d = ClassificationDataset(
        vocab,
        ["1", "2"],
        segmenter="char",
        is_multilabel=True,
        csv_file=text_with_empty,
        max_length=2,
    )
    data = d.py_transform(tf.constant("xz"), tf.constant("1|2"))
    np.testing.assert_array_equal(data[0], [vocab["x"], vocab[vocab.unk]])
    np.testing.assert_array_equal(data[1], [1, 1])
Example #10
def test_classification_dataset_transform():
    vocab = Vocab(["x", "y"])
    df = pd.DataFrame({"text": ["xxx", "yyyyy"], "label": ["1|2", "2"]})
    d = ClassificationDataset(vocab, ["2", "1"],
                              segmenter="char",
                              is_multilabel=True,
                              X=df.text,
                              y=df.label)
    dataset = d.batchify(2, shuffle=False)
    for text, label in dataset:
        text = text.numpy()
        assert text[0][-1] == vocab[vocab.pad]
        assert text[1][0] == vocab["y"]
        label = label.numpy()
        np.testing.assert_array_equal(label, [[1, 1], [1, 0]])
Example #11
def test_create_from_csv(text_without_empty):
    vocab = Vocab(["你", "啊", "拿", "好", "我"])
    labels = ["1", "2"]
    for in_memory in (True, False):
        d = ClassificationDataset(
            vocab,
            labels,
            is_multilabel=True,
            csv_file=text_without_empty,
            in_memory=in_memory,
        )
        dataset = d.batchify(2, shuffle=False)
        for text, label in dataset:
            text = text.numpy()
            assert text[1][-1] == vocab[vocab.pad]
            assert text[1][0] == vocab["我"]
            label = label.numpy()
            np.testing.assert_array_equal(label, [[1, 0], [1, 1]])
Example #12
def from_checkpoint_file(
    cls,
    model_type: BertFamily,
    checkpoint_directory: str,
    config_filename: Optional[str] = None,
) -> "ModelCheckpoint":
    cls.create_checkpoint_file(checkpoint_directory)
    with open(os.path.join(checkpoint_directory, "vocab.txt"),
              encoding="UTF-8") as f:
        token_list = f.read().strip("\n").split("\n")
        # Rows 0, 100, 101 and 102 hold [PAD], [UNK], [CLS] and [SEP]
        # in standard Google BERT vocab.txt files.
        vocab = Vocab(
            token_list,
            pad_token=token_list[0],
            unk_token=token_list[100],
            bos_token=token_list[101],
            eos_token=token_list[102],
        )
    if not config_filename:
        config_filename = cls.search_config_file(checkpoint_directory)
    config = BertConfig.from_json_file(
        os.path.join(checkpoint_directory, config_filename))
    return cls(model_type, checkpoint_directory, config, vocab)
Example #13
def vocab():
    return Vocab("甲乙丙丁葵", bos_token="[CLS]", eos_token="[SEP]")
Example #14
    def from_word2vec_format(cls,
                             filename: str,
                             segmenter: str = "jieba") -> "Word2vec":
        pad = "<pad>"
        unk = "<unk>"
        bos = "<s>"
        eos = "</s>"
        num_special_tokens = 0
        vocab_size = 0
        embedding_size = 0
        has_header = False
        has_unk = False
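        # First pass: detect an optional "vocab_size embedding_size" header,
        # infer the embedding size, and count how many of the four special
        # tokens already have vectors in the file.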
        with open(filename) as f:
            for line in f:
                line = line.strip("\n")
                if line == "":
                    continue
                if embedding_size == 0 and not line.startswith(" "):
                    if len(line.split()) == 2:
                        embedding_size = int(line.split(" ")[1])
                        has_header = True
                        continue
                    else:
                        embedding_size = len(line.split(" ")) - 1
                if (line.startswith(pad) or line.startswith(unk)
                        or line.startswith(bos) or line.startswith(eos)):
                    num_special_tokens += 1
                vocab_size += 1

        # Reserve rows for whichever of the four special tokens were not found
        # in the file; rows 2-3 (<s>, </s>) start with small random vectors.
        num_adding_tokens = 4 - num_special_tokens
        weights = np.zeros((vocab_size + num_adding_tokens, embedding_size))
        weights[2:4, :] = np.random.uniform(-0.1,
                                            0.1,
                                            size=(2, embedding_size))

        # Second pass: copy each vector into the weight matrix, mapping any
        # special tokens found in the file onto rows 0-3.
        tokens = [pad, unk, bos, eos]
        with open(filename) as f:
            for i, line in enumerate(f):
                line = line.strip("\n")
                if line == "":
                    continue
                if has_header:
                    i -= 1
                if i < 0:
                    continue

                cells = line.split(" ")
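                # An empty first cell means the token itself is the space character.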
                if cells[0] == "":
                    del cells[0]
                    cells[0] = " "
                token = cells[0]
                idx = i + num_adding_tokens + num_special_tokens
                if token == pad:
                    idx = 0
                    num_special_tokens -= 1
                elif token == unk:
                    idx = 1
                    has_unk = True
                    num_special_tokens -= 1
                elif token == bos:
                    idx = 2
                    num_special_tokens -= 1
                elif token == eos:
                    idx = 3
                    num_special_tokens -= 1
                else:
                    tokens.append(token)
                vec = list(map(float, cells[1:]))
                weights[idx, :] = vec
        vocab = Vocab(
            tokens,
            pad_token=pad,
            unk_token=unk,
            bos_token=bos,
            eos_token=eos,
        )
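        # If the file provided no <unk> vector, fall back to the mean of all
        # ordinary token vectors.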
        if not has_unk:
            weights[1, :] = weights[4:, :].mean(axis=0)
        word2vec = cls(vocab, embedding_size, segmenter=segmenter)
        word2vec.build()
        word2vec._model.set_weights([weights])
        return word2vec
Example #15
def test_label_binarizer(text_with_empty):
    vocab = Vocab(["x", "y"])
    d = ClassificationDataset(vocab, ["a", "c", "b"], csv_file=text_with_empty)
    np.testing.assert_array_equal(d.py_label_binarizer(["c", "a", "d"]),
                                  [1, 1, 0])
Example #16
def test_vocab_without_special_token(tokens):
    vocab = Vocab(tokens[:-4])
    assert len(vocab) == len(tokens)
    assert vocab.pad == "<pad>"