def test_print_stats_works(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab_builder.print_stats()

def test_orig_vocab_len(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 0
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab_len = vocab_builder.get_orig_vocab_len()
    assert vocab_len == 3 + len(vocab_builder.special_vocab)

def test_idx2token_out_of_bounds(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    print(vocab_builder.get_idx2token_mapping())
    with pytest.raises(ValueError):
        vocab_builder.get_token_from_idx(100)

def test_print_stats_works(self, instances, include_special_vocab):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = None
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        include_special_vocab=include_special_vocab,
    )
    vocab_builder.build_vocab()
    vocab_builder.print_stats()

def test_get_topn(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    words_freqs = vocab_builder.get_topn_frequent_words(n=1)
    assert words_freqs[0][0] == "i"
    assert words_freqs[0][1] == 3

def test_max_num_tokens_unset(self, instances, include_special_vocab):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = None
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        include_special_vocab=include_special_vocab,
    )
    vocab.build_vocab()
    assert vocab.max_num_tokens == 3 + len(vocab.special_vocab.keys())

def test_save_vocab(self, instances, tmpdir):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
    vocab_builder.save_to_file(vocab_file)
    assert os.path.isfile(vocab_file)

def test_load_vocab(self, instances, tmpdir):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
    vocab_builder.save_to_file(vocab_file)
    vocab = Vocab.load_from_file(filename=vocab_file)
    assert vocab.get_vocab_len() == 3 + len(vocab_builder.special_vocab)

def setup_lstm2seqdecoder(request):
    HIDDEN_DIM = 1024
    NUM_LAYERS = request.param[0]
    BIDIRECTIONAL = request.param[1]
    TEACHER_FORCING_RATIO = request.param[3]
    MAX_LENGTH = 5
    lines = []
    words = []
    # texts = ["First", "second", "Third"]
    texts = ["First sentence", "second sentence", "Third long sentence here"]
    for text in texts:
        line = Line(text=text)
        word = Line(text=text.split()[0])
        lines.append(line)
        words.append(word)
    flat_texts = [[word for sentence in texts for word in sentence]]
    vocab = Vocab(flat_texts)
    vocab.build_vocab()
    num_direction = 2 if BIDIRECTIONAL else 1
    h0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.1
    c0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.2
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder_outputs = (
        torch.ones(len(texts), 5, num_direction * HIDDEN_DIM) * 0.5
        if request.param[2]
        else None
    )
    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        vocab=vocab,
        max_length=MAX_LENGTH,
        attn_module=request.param[2],
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )
    return (
        decoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "MAX_LENGTH": MAX_LENGTH,
            "TEACHER_FORCING_RATIO": TEACHER_FORCING_RATIO,
            "LINES": lines,
            "WORDS": words,
            "VOCAB_SIZE": vocab.get_vocab_len(),
            "BIDIRECTIONAL": BIDIRECTIONAL,
        },
        encoder_outputs,
        (h0, c0),
    )

def test_load_embedding_has_all_words(self, instances, embedding_type):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        embedding_type=embedding_type,
    )
    vocab.build_vocab()
    embedding = vocab.load_embedding()
    assert embedding.size(0) == vocab.get_vocab_len()

def test_vocab_length_min_freq_1_max_words_1(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 1
    MIN_FREQ = 1
    vocab_builder = Vocab(
        instances=single_instance, min_count=MIN_FREQ, max_num_tokens=MAX_NUM_WORDS
    )
    vocab_builder.build_vocab()
    len_vocab = vocab_builder.get_vocab_len()
    assert len_vocab == 1 + len(vocab_builder.special_vocab)

def test_random_embedding_has_2dimensions(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        embedding_type=None,
        embedding_dimension=300,
    )
    vocab.build_vocab()
    embeddings = vocab.load_embedding()
    assert embeddings.ndimension() == 2

def test_single_instance_clip_on_max_num(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 1
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=MAX_NUM_WORDS)
    vocab_builder.build_vocab()
    vocab = vocab_builder.map_tokens_to_freq_idx()
    vocab = vocab_builder.clip_on_max_num(vocab)
    vocab_len = len(set(idx for freq, idx in vocab.values()))
    assert vocab_len == MAX_NUM_WORDS + len(vocab_builder.special_vocab)

def test_single_instance_min_count(self, instances):
    single_instance = instances["single_instance"]
    vocab_builder = Vocab(instances=single_instance, max_num_tokens=1000, min_count=2)
    vocab_builder.build_vocab()
    vocab = vocab_builder.map_tokens_to_freq_idx()
    vocab = vocab_builder.clip_on_mincount(vocab)
    # check that "nlp", which falls below the min count, is mapped to unk
    nlp_freq, nlp_idx = vocab["nlp"]
    assert nlp_idx == vocab_builder.token2idx["<UNK>"]

def test_get_topn(self, instances, include_special_vocab):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = None
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        include_special_vocab=include_special_vocab,
    )
    vocab_builder.build_vocab()
    words_freqs = vocab_builder.get_topn_frequent_words(n=1)
    assert words_freqs[0][0] == "i"
    assert words_freqs[0][1] == 3

def test_disp_sentences_from_indices(
    self, instances, tmpdir, include_special_vocab
):
    instance_dict = instances
    single_instance = instance_dict["single_instance"]
    MAX_NUM_WORDS = None
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        include_special_vocab=include_special_vocab,
    )
    vocab.build_vocab()
    sent = vocab.get_disp_sentence_from_indices([0, 1, 2])
    assert type(sent) is str

def test_disp_sentences_from_indices(self, instances, tmpdir):
    instance_dict = instances
    single_instance = instance_dict["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        embedding_type=None,
        embedding_dimension=300,
        store_location=vocab_file,
    )
    vocab.build_vocab()
    sent = vocab.get_disp_sentence_from_indices([0, 1, 2, 3])
    assert type(sent) is str

def get_numericalized_instances(get_preprocessed_instances):
    instances, labels = get_preprocessed_instances
    MAX_NUM_WORDS = 3000
    MAX_LENGTH = 15
    vocab = Vocab(instances=instances, max_num_tokens=MAX_NUM_WORDS)
    vocab.build_vocab()
    numericalizer = Numericalizer(vocabulary=vocab)
    numericalized_instances = numericalizer.numericalize_batch_instances(
        instances[:32]
    )
    return {
        "numericalized_instances": numericalized_instances,
        "labels": labels,
        "max_length": MAX_LENGTH,
        "max_num_words": MAX_NUM_WORDS,
        "vocab": vocab,
    }

def test_add_token(self, instances, tmpdir, save_vocab):
    instance_dict = instances
    single_instance = instance_dict["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        store_location=vocab_file,
    )
    vocab.build_vocab()
    vocab._add_token("very", save_vocab=save_vocab)
    assert "very" in vocab.vocab.keys()
    assert vocab.vocab["very"] == (1, 7)
    assert vocab.token2idx["very"] == 7
    assert vocab.idx2token[7] == "very"

def test_idx2token_for_unk(self, instances):
    """Many words map to UNK in the vocab. For example, say the index for
    UNK is 3. Then mapping 3 back to a token should always yield UNK and
    never any other word.
    """
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        start_token="<SOS>",
        end_token="<EOS>",
        pad_token="<PAD>",
        unk_token="<UNK>",
    )
    vocab_builder.build_vocab()
    UNK_IDX = vocab_builder.special_vocab[vocab_builder.unk_token][1]
    assert vocab_builder.get_token_from_idx(UNK_IDX) == "<UNK>"

def test_token2idx(self, instances, start_token, end_token, unk_token, pad_token):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        start_token=start_token,
        end_token=end_token,
        pad_token=pad_token,
        unk_token=unk_token,
    )
    vocab_builder.build_vocab()
    token2idx = vocab_builder.token2idx
    len_indices = len(token2idx.keys())
    indices = token2idx.values()
    indices = sorted(indices)
    assert indices == list(range(len_indices))

def test_idx2token(self, instances, start_token, end_token, unk_token, pad_token):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        start_token=start_token,
        end_token=end_token,
        pad_token=pad_token,
        unk_token=unk_token,
    )
    vocab_builder.build_vocab()
    idx2token = vocab_builder.idx2token
    len_idx2token = len(idx2token)
    indices = idx2token.keys()
    indices = sorted(indices)
    # tests that all indices are contiguous and in order
    assert indices == list(range(len_idx2token))

def test_add_tokens(self, instances, tmpdir):
    instance_dict = instances
    single_instance = instance_dict["single_instance"]
    MAX_NUM_WORDS = 100
    vocab_file = tmpdir.mkdir("tempdir").join("vocab.json")
    vocab = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        store_location=vocab_file,
    )
    vocab.build_vocab()
    vocab.add_tokens(["very", "much"])
    assert "very" in vocab.vocab.keys()
    assert "much" in vocab.vocab.keys()
    assert vocab.vocab["very"] == (1, 7)
    assert vocab.vocab["much"] == (1, 8)
    assert vocab.get_token_from_idx(7) == "very"
    assert vocab.get_token_from_idx(8) == "much"
    assert vocab.get_idx_from_token("very") == 7
    assert vocab.get_idx_from_token("much") == 8

def test_build_vocab_single_instance_min_freq_2(self, instances):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = 100
    MIN_FREQ = 2
    vocab_builder = Vocab(
        instances=single_instance, max_num_tokens=MAX_NUM_WORDS, min_count=MIN_FREQ
    )
    vocab = vocab_builder.build_vocab()
    vocab_len = len(set(idx for freq, idx in vocab.values()))
    assert vocab_len == 2 + len(vocab_builder.special_vocab)

def test_single_instance_build_vocab(self, instances, include_special_vocab):
    single_instance = instances["single_instance"]
    MAX_NUM_WORDS = None
    MIN_FREQ = 1
    vocab_builder = Vocab(
        instances=single_instance,
        max_num_tokens=MAX_NUM_WORDS,
        min_count=MIN_FREQ,
        include_special_vocab=include_special_vocab,
    )
    vocab = vocab_builder.build_vocab()
    assert "i" in vocab.keys()
    assert "like" in vocab.keys()
    assert "nlp" in vocab.keys()
    vocab_len = len(set(idx for freq, idx in vocab.values()))
    assert vocab_len == 3 + len(vocab_builder.special_vocab)

class sprinkle_dataset:
    def __init__(self, vocab_pipe=None, autoset_attrs=True, get_label_stats_table=True):
        if vocab_pipe is None:
            vocab_pipe = ["word_vocab"]
        self.autoset_attrs = autoset_attrs
        self.vocab_pipe = vocab_pipe
        self.is_get_label_stats_table = get_label_stats_table
        self.wrapped_cls = None
        self.init_signature = None
        self.filename = None
        self.word_tokenization_type = None
        self.word_tokenizer = None
        self.word_instances = None
        self.word_vocab = None
        self.max_num_words = None
        self.word_vocab_store_location = None
        self.word_embedding_type = None
        self.word_embedding_dimension = None
        self.word_numericalizer = None
        self.word_unk_token = None
        self.word_pad_token = None
        self.word_start_token = None
        self.word_end_token = None
        self.char_tokenizer = None
        self.char_instances = None
        self.char_vocab = None
        self.max_num_chars = None
        self.char_vocab_store_location = None
        self.char_embedding_type = None
        self.char_embedding_dimension = None
        self.char_numericalizer = None
        self.char_unk_token = None
        self.char_pad_token = None
        self.char_start_token = None
        self.char_end_token = None
        self.word_vocab_required_attributes = [
            "max_num_words",
            "word_vocab_store_location",
            "word_embedding_type",
            "word_embedding_dimension",
        ]

    def set_word_vocab(self):
        if not all(
            [
                attribute in dir(self)
                for attribute in self.word_vocab_required_attributes
            ]
        ):
            raise ValueError(
                f"For building word vocab, "
                f"please pass these attributes in your "
                f"dataset construction {self.word_vocab_required_attributes}"
            )
        self.word_instances = self.word_tokenizer.tokenize_batch(self.lines)
        self.word_vocab = Vocab(
            instances=self.word_instances,
            max_num_tokens=self.max_num_words,
            unk_token=self.word_unk_token,
            pad_token=self.word_pad_token,
            start_token=self.word_start_token,
            end_token=self.word_end_token,
            store_location=self.word_vocab_store_location,
            embedding_type=self.word_embedding_type,
            embedding_dimension=self.word_embedding_dimension,
        )
        self.word_numericalizer = Numericalizer(self.word_vocab)
        self.word_vocab.build_vocab()
        self.word_vocab.print_stats()

    def set_char_vocab(self):
        self.char_instances = self.char_tokenizer.tokenize_batch(self.lines)
        self.char_vocab = Vocab(
            instances=self.char_instances,
            max_num_tokens=1e6,
            min_count=1,
            store_location=self.char_vocab_store_location,
            embedding_type=self.char_embedding_type,
            embedding_dimension=self.char_embedding_dimension,
            start_token=self.char_start_token,
            end_token=self.char_end_token,
            unk_token=self.char_unk_token,
            pad_token=self.char_pad_token,
        )
        self.char_vocab.build_vocab()
        # add the characters of the word-level special tokens so that they
        # can be converted to characters later
        self.char_vocab.add_tokens(
            list(self.word_start_token)
            + list(self.word_end_token)
            + list(self.word_unk_token)
            + list(self.word_pad_token)
        )
        self.char_numericalizer = Numericalizer(vocabulary=self.char_vocab)
        self.char_vocab.print_stats()

    def _get_label_stats_table(self):
        all_labels = []
        for label in self.labels:
            all_labels.extend(label.split())
        labels_stats = dict(collections.Counter(all_labels))
        classes = list(set(labels_stats.keys()))
        classes = sorted(classes)
        header = ["label index", "label name", "count"]
        classname2idx = self.wrapped_cls.get_classname2idx()
        rows = [
            (classname2idx[class_], class_, labels_stats[class_])
            for class_ in classes
        ]
        formatted = wasabi.table(data=rows, header=header, divider=True)
        return formatted

    @wrapt.decorator
    def __call__(self, wrapped, instance, args, kwargs):
        self.wrapped_cls = wrapped
        self.init_signature = inspect.signature(wrapped.__init__)
        instance = wrapped(*args, **kwargs)
        for idx, (name, param) in enumerate(self.init_signature.parameters.items()):
            if name == "self":
                continue
            # These are values that must be passed
            if name in [
                "filename",
                "dataset_type",
                "max_num_words",
                "max_instance_length",
                "word_vocab_store_location",
            ]:
                try:
                    value = args[idx]
                except IndexError:
                    try:
                        value = kwargs[name]
                    except KeyError:
                        raise ValueError(
                            f"Dataset {self.wrapped_cls.__name__} should be "
                            f"instantiated with {name}"
                        )
                if self.autoset_attrs:
                    setattr(instance, name, value)
                setattr(self, name, value)
            # These can be passed but have default values
            else:
                try:
                    value = args[idx]
                except IndexError:
                    try:
                        value = kwargs[name]
                    except KeyError:
                        value = param.default
                if self.autoset_attrs:
                    setattr(instance, name, value)
                setattr(self, name, value)

        # set the lines and labels
        self.lines, self.labels = instance.get_lines_labels(self.filename)
        self.word_instances = None
        self.word_vocab = None

        if "word_vocab" in self.vocab_pipe:
            self.word_tokenizer = WordTokenizer(self.word_tokenization_type)
            self.set_word_vocab()
            instance.word_tokenizer = self.word_tokenizer
            instance.word_numericalizer = self.word_numericalizer
            instance.word_vocab = copy.deepcopy(self.word_vocab)
            instance.word_instances = copy.deepcopy(self.word_instances)
            instance.num_instances = len(self.word_instances)
            instance.instance_max_len = max(
                [len(instance) for instance in self.word_instances]
            )

        if "char_vocab" in self.vocab_pipe:
            self.char_tokenizer = CharacterTokenizer()
            self.set_char_vocab()
            instance.char_vocab = copy.deepcopy(self.char_vocab)
            instance.char_instances = copy.deepcopy(self.char_instances)
            instance.char_tokenizer = self.char_tokenizer
            instance.char_numericalizer = self.char_numericalizer

        if self.is_get_label_stats_table:
            label_stats_table = self._get_label_stats_table()
            instance.label_stats_table = label_stats_table

        return instance

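# Illustrative sketch (not part of the library): one way a dataset class might be
# wrapped with sprinkle_dataset. The class name, constructor defaults, and hook
# bodies below are hypothetical. The decorator expects the wrapped class to
# provide get_lines_labels and get_classname2idx, and to accept constructor
# parameters such as filename, dataset_type, max_num_words, max_instance_length
# and word_vocab_store_location (see the checks in __call__ above). Depending on
# the pipeline, further parameters (tokenization type, special tokens) may also
# be needed; the class is only defined here, not instantiated.
@sprinkle_dataset(vocab_pipe=["word_vocab"])
class ToyClassificationDataset:
    def __init__(
        self,
        filename,
        dataset_type,
        max_num_words,
        max_instance_length,
        word_vocab_store_location,
        word_embedding_type=None,
        word_embedding_dimension=50,
    ):
        pass

    def get_lines_labels(self, filename):
        # hypothetical in-memory data instead of reading from `filename`
        lines = ["i like nlp", "i like deep learning"]
        labels = ["pos", "pos"]
        return lines, labels

    @classmethod
    def get_classname2idx(cls):
        return {"pos": 0, "neg": 1}
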
def setup_word_emb_loader():
    instances = [["load", "vocab"]]
    vocab = Vocab(instances=instances, max_num_tokens=1000)
    vocab.build_vocab()
    return vocab

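# Illustrative sketch (not part of the fixtures above): a minimal round trip
# through the Vocab API exercised in these tests -- build the vocabulary, map a
# token to its index, and map the index back. The instances used here are
# hypothetical.
def example_vocab_roundtrip():
    instances = [["i", "like", "nlp"]]
    vocab = Vocab(instances=instances, max_num_tokens=100)
    vocab.build_vocab()
    nlp_idx = vocab.get_idx_from_token("nlp")
    # a token present in the vocabulary should map back to itself
    assert vocab.get_token_from_idx(nlp_idx) == "nlp"
    return vocab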