Example #1
 def test_print_stats_works(self, instances):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(instances=single_instance,
                           max_num_tokens=MAX_NUM_WORDS)
     vocab_builder.build_vocab()
     vocab_builder.print_stats()
Example #2
 def test_orig_vocab_len(self, instances):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 0
     vocab_builder = Vocab(instances=single_instance,
                           max_num_tokens=MAX_NUM_WORDS)
     vocab_builder.build_vocab()
     vocab_len = vocab_builder.get_orig_vocab_len()
     assert vocab_len == 3 + len(vocab_builder.special_vocab)
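These tests all consume an `instances` pytest fixture that is not shown in the excerpts. Judging from the assertions in the examples (three unique tokens, "i" occurring three times, "like" twice, and "nlp" once), a minimal sketch of that fixture could look like the following; the exact sentence is an assumption, not the library's own fixture:

import pytest

@pytest.fixture
def instances():
    # "i" occurs 3 times, "like" twice, "nlp" once -> 3 unique tokens
    single_instance = [["i", "like", "nlp", "i", "i", "like"]]
    return {"single_instance": single_instance}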
Example #3
 def test_idx2token_out_of_bounds(self, instances):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
     vocab_builder.build_vocab()
     print(vocab_builder.get_idx2token_mapping())
     with pytest.raises(ValueError):
         vocab_builder.get_token_from_idx(100)
Example #4
 def test_print_stats_works(self, instances, include_special_vocab):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = None
     vocab_builder = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         include_special_vocab=include_special_vocab,
     )
     vocab_builder.build_vocab()
     vocab_builder.print_stats()
Example #5
    def test_get_topn(self, instances):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        vocab_builder = Vocab(instances=single_instance,
                              max_num_tokens=MAX_NUM_WORDS)
        vocab_builder.build_vocab()
        words_freqs = vocab_builder.get_topn_frequent_words(n=1)

        assert words_freqs[0][0] == "i"
        assert words_freqs[0][1] == 3
Example #6
 def test_max_num_tokens_unset(self, instances, include_special_vocab):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = None
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         include_special_vocab=include_special_vocab,
     )
     vocab.build_vocab()
     assert vocab.max_num_tokens == 3 + len(vocab.special_vocab.keys())
Example #7
    def test_save_vocab(self, instances, tmpdir):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        vocab_builder = Vocab(instances=single_instance,
                              max_num_tokens=MAX_NUM_WORDS)

        vocab_builder.build_vocab()
        vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
        vocab_builder.save_to_file(vocab_file)

        assert os.path.isfile(vocab_file)
Example #8
    def test_load_vocab(self, instances, tmpdir):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
        vocab_builder.build_vocab()
        vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
        vocab_builder.save_to_file(vocab_file)

        vocab = Vocab.load_from_file(filename=vocab_file)

        assert vocab.get_vocab_len() == 3 + len(vocab_builder.special_vocab)
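Outside of pytest, the same save/load round trip can be sketched in a few lines; only the `save_to_file` and `load_from_file` calls already shown above are assumed, and the path is arbitrary:

vocab_builder = Vocab(instances=[["i", "like", "nlp"]], max_num_tokens=100)
vocab_builder.build_vocab()
vocab_builder.save_to_file("vocab.json")  # persist the token -> (frequency, index) mapping
restored = Vocab.load_from_file(filename="vocab.json")
assert restored.get_vocab_len() == vocab_builder.get_vocab_len()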
Example #9
def setup_lstm2seqdecoder(request):
    HIDDEN_DIM = 1024
    NUM_LAYERS = request.param[0]
    BIDIRECTIONAL = request.param[1]
    TEACHER_FORCING_RATIO = request.param[3]
    MAX_LENGTH = 5

    lines = []
    words = []
    # texts = ["First", "second", "Third"]
    texts = ["First sentence", "second sentence", "Third long sentence here"]
    for text in texts:
        line = Line(text=text)
        word = Line(text=text.split()[0])
        lines.append(line)
        words.append(word)
    # flatten the sentences into a single word-level instance for the vocabulary
    flat_texts = [[word for sentence in texts for word in sentence.split()]]
    vocab = Vocab(flat_texts)
    vocab.build_vocab()

    num_direction = 2 if BIDIRECTIONAL else 1
    h0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.1
    c0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.2

    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder_outputs = (
        torch.ones(len(texts), 5, num_direction * HIDDEN_DIM) * 0.5
        if request.param[2]
        else None
    )
    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        vocab=vocab,
        max_length=MAX_LENGTH,
        attn_module=request.param[2],
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )

    return (
        decoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "MAX_LENGTH": MAX_LENGTH,
            "TEACHER_FORCING_RATIO": TEACHER_FORCING_RATIO,
            "LINES": lines,
            "WORDS": words,
            "VOCAB_SIZE": vocab.get_vocab_len(),
            "BIDIRECTIONAL": BIDIRECTIONAL,
        },
        encoder_outputs,
        (h0, c0),
    )
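This helper reads its configuration from `request.param`, so in the test module it is presumably wired up as a parametrized pytest fixture. A hedged sketch of that wiring (the concrete parameter tuples are assumptions, ordered as num_layers, bidirectional, attn_module, teacher_forcing_ratio):

import pytest

@pytest.fixture(
    params=[
        (1, False, None, 0.0),  # single layer, unidirectional, no attention
        (2, True, None, 0.5),   # two layers, bidirectional, teacher forcing half the time
    ]
)
def lstm2seqdecoder_setup(request):
    return setup_lstm2seqdecoder(request)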
Example #10
 def test_load_embedding_has_all_words(self, instances, embedding_type):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         embedding_type=embedding_type,
     )
     vocab.build_vocab()
     embedding = vocab.load_embedding()
     assert embedding.size(0) == vocab.get_vocab_len()
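The `embedding_type` argument here comes from a fixture or parametrization that is not shown. A minimal sketch, assuming the same GloVe identifier used by `WordEmbedder` in example #9:

import pytest

@pytest.fixture(params=["glove_6B_50"])
def embedding_type(request):
    return request.param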
Example #11
    def test_vocab_length_min_freq_1_max_words_1(self, instances):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 1
        MIN_FREQ = 1

        vocab_builder = Vocab(instances=single_instance,
                              min_count=MIN_FREQ,
                              max_num_tokens=MAX_NUM_WORDS)
        vocab_builder.build_vocab()
        len_vocab = vocab_builder.get_vocab_len()
        assert len_vocab == 1 + len(vocab_builder.special_vocab)
Example #12
 def test_random_embeddinng_has_2dimensions(self, instances):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         embedding_type=None,
         embedding_dimension=300,
     )
     vocab.build_vocab()
     embeddings = vocab.load_embedding()
     assert embeddings.ndimension() == 2
Example #13
    def test_single_instance_clip_on_max_num(self, instances):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 1
        vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
        vocab_builder.build_vocab()
        vocab = vocab_builder.map_tokens_to_freq_idx()

        vocab = vocab_builder.clip_on_max_num(vocab)

        vocab_len = len(set(idx for freq, idx in vocab.values()))

        assert vocab_len == MAX_NUM_WORDS + len(vocab_builder.special_vocab)
Example #14
    def test_single_instance_min_count(self, instances):
        single_instance = instances["single_instance"]

        vocab_builder = Vocab(instances=single_instance,
                              max_num_tokens=1000,
                              min_count=2)
        vocab_builder.build_vocab()
        vocab = vocab_builder.map_tokens_to_freq_idx()
        vocab = vocab_builder.clip_on_mincount(vocab)

        # check that "nlp" (frequency 1, below min_count) is mapped to <UNK>
        nlp_freq, nlp_idx = vocab["nlp"]
        assert nlp_idx == vocab_builder.token2idx["<UNK>"]
Example #15
    def test_get_topn(self, instances, include_special_vocab):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = None
        vocab_builder = Vocab(
            instances=single_instance,
            max_num_tokens=MAX_NUM_WORDS,
            include_special_vocab=include_special_vocab,
        )
        vocab_builder.build_vocab()
        words_freqs = vocab_builder.get_topn_frequent_words(n=1)

        assert words_freqs[0][0] == "i"
        assert words_freqs[0][1] == 3
Example #16
 def test_disp_sentences_from_indices(
     self, instances, tmpdir, include_special_vocab
 ):
     instance_dict = instances
     single_instance = instance_dict["single_instance"]
     MAX_NUM_WORDS = None
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         include_special_vocab=include_special_vocab,
     )
     vocab.build_vocab()
     sent = vocab.get_disp_sentence_from_indices([0, 1, 2])
     assert type(sent) is str
Example #17
 def test_disp_sentences_from_indices(self, instances, tmpdir):
     instance_dict = instances
     single_instance = instance_dict["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         embedding_type=None,
         embedding_dimension=300,
         store_location=vocab_file,
     )
     vocab.build_vocab()
     sent = vocab.get_disp_sentence_from_indices([0, 1, 2, 3])
     assert type(sent) is str
Example #18
def get_numericalized_instances(get_preprocessed_instances):
    instances, labels = get_preprocessed_instances
    MAX_NUM_WORDS = 3000
    MAX_LENGTH = 15
    vocab = Vocab(instances=instances, max_num_tokens=MAX_NUM_WORDS)
    vocab.build_vocab()
    numericalizer = Numericalizer(vocabulary=vocab)
    numericalized_instances = numericalizer.numericalize_batch_instances(
        instances[:32])
    return {
        "numericalized_instances": numericalized_instances,
        "labels": labels,
        "max_length": MAX_LENGTH,
        "max_num_words": MAX_NUM_WORDS,
        "vocab": vocab,
    }
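A short standalone sketch of how the numericalizer built here might be used; only the `numericalize_batch_instances` call shown above is assumed:

vocab = Vocab(instances=[["i", "like", "nlp"]], max_num_tokens=3000)
vocab.build_vocab()
numericalizer = Numericalizer(vocabulary=vocab)
# each instance (a list of tokens) is mapped to a list of integer indices
indices = numericalizer.numericalize_batch_instances([["i", "like", "nlp"]])
print(indices)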
Example #19
    def test_add_token(self, instances, tmpdir, save_vocab):
        instance_dict = instances
        single_instance = instance_dict["single_instance"]
        MAX_NUM_WORDS = 100
        vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
        vocab = Vocab(
            instances=single_instance,
            max_num_tokens=MAX_NUM_WORDS,
            store_location=vocab_file,
        )
        vocab.build_vocab()
        vocab._add_token("very", save_vocab=save_vocab)

        assert "very" in vocab.vocab.keys()
        assert vocab.vocab["very"] == (1, 7)
        assert vocab.token2idx["very"] == 7
        assert vocab.idx2token[7] == "very"
Example #20
 def test_idx2token_for_unk(self, instances):
     """" Many words map to UNK in the vocab. For example say the index for UNK is 3.
     Then mapping 3 to the token should always map to UNK and not any other word
     """
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         start_token="<SOS>",
         end_token="<EOS>",
         pad_token="<PAD>",
         unk_token="<UNK>",
     )
     vocab_builder.build_vocab()
     UNK_IDX = vocab_builder.special_vocab[vocab_builder.unk_token][1]
     assert vocab_builder.get_token_from_idx(UNK_IDX) == "<UNK>"
Example #21
 def test_token2idx(self, instances, start_token, end_token, unk_token, pad_token):
     single_instance = instances["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_builder = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         start_token=start_token,
         end_token=end_token,
         pad_token=pad_token,
         unk_token=unk_token,
     )
     vocab_builder.build_vocab()
     token2idx = vocab_builder.token2idx
     len_indices = len(token2idx.keys())
     indices = token2idx.values()
     indices = sorted(indices)
     assert indices == list(range(len_indices))
Example #22
    def test_idx2token(self, instances, start_token, end_token, unk_token, pad_token):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        vocab_builder = Vocab(
            instances=single_instance,
            max_num_tokens=MAX_NUM_WORDS,
            start_token=start_token,
            end_token=end_token,
            pad_token=pad_token,
            unk_token=unk_token,
        )
        vocab_builder.build_vocab()
        idx2token = vocab_builder.idx2token
        len_idx2token = len(idx2token)
        indices = idx2token.keys()
        indices = sorted(indices)

        # tests all indices are in order
        assert indices == list(range(len_idx2token))
Example #23
 def test_add_tokens(self, instances, tmpdir):
     instance_dict = instances
     single_instance = instance_dict["single_instance"]
     MAX_NUM_WORDS = 100
     vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
     vocab = Vocab(
         instances=single_instance,
         max_num_tokens=MAX_NUM_WORDS,
         store_location=vocab_file,
     )
     vocab.build_vocab()
     vocab.add_tokens(["very", "much"])
     assert "very" in vocab.vocab.keys()
     assert "much" in vocab.vocab.keys()
     assert vocab.vocab["very"] == (1, 7)
     assert vocab.vocab["much"] == (1, 8)
     assert vocab.get_token_from_idx(7) == "very"
     assert vocab.get_token_from_idx(8) == "much"
     assert vocab.get_idx_from_token("very") == 7
     assert vocab.get_idx_from_token("much") == 8
Example #24
    def test_build_vocab_single_instance_min_freq_2(self, instances):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = 100
        MIN_FREQ = 2
        vocab_builder = Vocab(instances=single_instance,
                              max_num_tokens=MAX_NUM_WORDS,
                              min_count=MIN_FREQ)
        vocab = vocab_builder.build_vocab()

        vocab_len = len(set(idx for freq, idx in vocab.values()))

        assert vocab_len == 2 + len(vocab_builder.special_vocab)
Example #25
    def test_single_instance_build_vocab(self, instances, include_special_vocab):
        single_instance = instances["single_instance"]
        MAX_NUM_WORDS = None
        MIN_FREQ = 1
        vocab_builder = Vocab(
            instances=single_instance,
            max_num_tokens=MAX_NUM_WORDS,
            min_count=MIN_FREQ,
            include_special_vocab=include_special_vocab,
        )

        vocab = vocab_builder.build_vocab()

        assert "i" in vocab.keys()
        assert "like" in vocab.keys()
        assert "nlp" in vocab.keys()

        vocab_len = len(set(idx for freq, idx in vocab.values()))

        assert vocab_len == 3 + len(vocab_builder.special_vocab)
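The `freq, idx` unpacking above shows that `build_vocab()` returns a dictionary mapping each token to a `(frequency, index)` tuple, with the special tokens occupying the first indices. For the single-instance fixture sketched earlier the result would look roughly as follows; the ordering and the frequencies recorded for the special tokens are assumptions:

vocab = vocab_builder.build_vocab()
# token -> (frequency, index), illustrative values only
# vocab["<PAD>"] -> (1, 0)   special tokens come first
# vocab["i"]     -> (3, 4)   "i" occurs three times
# vocab["like"]  -> (2, 5)
# vocab["nlp"]   -> (1, 6)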
Example #26
class sprinkle_dataset:
    def __init__(self, vocab_pipe=None, autoset_attrs=True, get_label_stats_table=True):
        if vocab_pipe is None:
            vocab_pipe = ["word_vocab"]
        self.autoset_attrs = autoset_attrs
        self.vocab_pipe = vocab_pipe
        self.is_get_label_stats_table = get_label_stats_table
        self.wrapped_cls = None
        self.init_signature = None
        self.filename = None

        self.word_tokenization_type = None
        self.word_tokenizer = None
        self.word_instances = None
        self.word_vocab = None
        self.max_num_words = None
        self.word_vocab_store_location = None
        self.word_embedding_type = None
        self.word_embedding_dimension = None
        self.word_numericalizer = None
        self.word_unk_token = None
        self.word_pad_token = None
        self.word_start_token = None
        self.word_end_token = None

        self.char_tokenizer = None
        self.char_instances = None
        self.char_vocab = None
        self.max_num_chars = None
        self.char_vocab_store_location = None
        self.char_embedding_type = None
        self.char_embedding_dimension = None
        self.char_numericalizer = None
        self.char_unk_token = None
        self.char_pad_token = None
        self.char_start_token = None
        self.char_end_token = None

        self.word_vocab_required_attributes = [
            "max_num_words",
            "word_vocab_store_location",
            "word_embedding_type",
            "word_embedding_dimension",
        ]

    def set_word_vocab(self):
        if not all(
            [
                attribute in dir(self)
                for attribute in self.word_vocab_required_attributes
            ]
        ):
            raise ValueError(
                f"For building word vocab, "
                f"please pass these attributes in your "
                f"dataset construction {self.word_vocab_required_attributes}"
            )
        self.word_instances = self.word_tokenizer.tokenize_batch(self.lines)
        self.word_vocab = Vocab(
            instances=self.word_instances,
            max_num_tokens=self.max_num_words,
            unk_token=self.word_unk_token,
            pad_token=self.word_pad_token,
            start_token=self.word_start_token,
            end_token=self.word_end_token,
            store_location=self.word_vocab_store_location,
            embedding_type=self.word_embedding_type,
            embedding_dimension=self.word_embedding_dimension,
        )
        self.word_numericalizer = Numericalizer(self.word_vocab)
        self.word_vocab.build_vocab()
        self.word_vocab.print_stats()

    def set_char_vocab(self):
        self.char_instances = self.char_tokenizer.tokenize_batch(self.lines)

        self.char_vocab = Vocab(
            instances=self.char_instances,
            max_num_tokens=1e6,
            min_count=1,
            store_location=self.char_vocab_store_location,
            embedding_type=self.char_embedding_type,
            embedding_dimension=self.char_embedding_dimension,
            start_token=self.char_start_token,
            end_token=self.char_end_token,
            unk_token=self.char_unk_token,
            pad_token=self.char_pad_token,
        )
        self.char_vocab.build_vocab()

        # adding these to help conversion to characters later
        self.char_vocab.add_tokens(
            list(self.word_start_token)
            + list(self.word_end_token)
            + list(self.word_unk_token)
            + list(self.word_pad_token)
        )
        self.char_numericalizer = Numericalizer(vocabulary=self.char_vocab)
        self.char_vocab.print_stats()

    def _get_label_stats_table(self):
        all_labels = []
        for label in self.labels:
            all_labels.extend(label.split())

        labels_stats = dict(collections.Counter(all_labels))
        classes = list(set(labels_stats.keys()))
        classes = sorted(classes)
        header = ["label index", "label name", "count"]
        classname2idx = self.wrapped_cls.get_classname2idx()
        rows = [
            (classname2idx[class_], class_, labels_stats[class_]) for class_ in classes
        ]
        formatted = wasabi.table(data=rows, header=header, divider=True)
        return formatted

    @wrapt.decorator
    def __call__(self, wrapped, instance, args, kwargs):
        self.wrapped_cls = wrapped
        self.init_signature = inspect.signature(wrapped.__init__)
        instance = wrapped(*args, **kwargs)
        for idx, (name, param) in enumerate(self.init_signature.parameters.items()):
            if name == "self":
                continue

            # These are values that must be passed
            if name in [
                "filename",
                "dataset_type",
                "max_num_words",
                "max_instance_length",
                "word_vocab_store_location",
            ]:
                try:
                    value = args[idx]
                except IndexError:
                    try:
                        value = kwargs[name]
                    except KeyError:
                        raise ValueError(
                            f"Dataset {self.cls.__name__} should be instantiated with {name}"
                        )
                if self.autoset_attrs:
                    setattr(instance, name, value)
                setattr(self, name, value)

            # These can be passed but have default values
            else:
                try:
                    value = args[idx]
                except IndexError:
                    try:
                        value = kwargs[name]
                    except KeyError:
                        value = param.default

                if self.autoset_attrs:
                    setattr(instance, name, value)
                setattr(self, name, value)

        # set the lines and labels
        self.lines, self.labels = instance.get_lines_labels(self.filename)
        self.word_instances = None
        self.word_vocab = None

        if "word_vocab" in self.vocab_pipe:
            self.word_tokenizer = WordTokenizer(self.word_tokenization_type)
            self.set_word_vocab()
            instance.word_tokenizer = self.word_tokenizer
            instance.word_numericalizer = self.word_numericalizer
            instance.word_vocab = copy.deepcopy(self.word_vocab)
            instance.word_instances = copy.deepcopy(self.word_instances)
            instance.num_instances = len(self.word_instances)
            instance.instance_max_len = max(
                [len(instance) for instance in self.word_instances]
            )

        if "char_vocab" in self.vocab_pipe:
            self.char_tokenizer = CharacterTokenizer()
            self.set_char_vocab()
            instance.char_vocab = copy.deepcopy(self.char_vocab)
            instance.char_instances = copy.deepcopy(self.char_instances)
            instance.char_tokenizer = self.char_tokenizer
            instance.char_numericalizer = self.char_numericalizer

        if self.is_get_label_stats_table:
            label_stats_table = self._get_label_stats_table()
            instance.label_stats_table = label_stats_table

        return instance
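As a `wrapt` class decorator, `sprinkle_dataset` wraps a dataset class and attaches tokenizers, vocabularies, and numericalizers to every constructed instance. A hedged sketch of how it might be applied; the dataset class, its constructor arguments, and the two methods it must provide (`get_lines_labels` and `get_classname2idx`, both used by the decorator above) are assumptions for illustration:

@sprinkle_dataset(vocab_pipe=["word_vocab"])
class MyTextClassificationDataset:
    def __init__(self, filename, dataset_type, max_num_words, max_instance_length,
                 word_vocab_store_location, word_tokenization_type="vanilla",
                 word_embedding_type="glove_6B_50", word_embedding_dimension=50):
        ...

    def get_lines_labels(self, filename):
        # return parallel lists of text lines and label strings
        ...

    @classmethod
    def get_classname2idx(cls):
        # map each label name to an integer index
        ...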
Example #27
def setup_word_emb_loader():
    instances = [["load", "vocab"]]
    vocab = Vocab(instances=instances, max_num_tokens=1000)
    vocab.build_vocab()
    return vocab
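A brief usage note for the helper above, assuming only the `load_embedding` call already shown in examples #10 and #12: the returned vocabulary can produce an embedding matrix whose first dimension equals the vocabulary length.

vocab = setup_word_emb_loader()
embedding = vocab.load_embedding()  # tensor of shape (vocab_len, embedding_dim)
assert embedding.size(0) == vocab.get_vocab_len()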