Example #1
    def test_set_from_file_reads_padded_files(self):
        # pylint: disable=protected-access
        vocab_filename = self.TEST_DIR / 'vocab_file'
        with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
            vocab_file.write('<S>\n')
            vocab_file.write('</S>\n')
            vocab_file.write('<UNK>\n')
            vocab_file.write('a\n')
            vocab_file.write('tricky\x0bchar\n')
            vocab_file.write('word\n')
            vocab_file.write('another\n')

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=True, oov_token="<UNK>")

        assert vocab._oov_token == DEFAULT_OOV_TOKEN
        assert vocab.get_token_index("random string") == 3
        assert vocab.get_token_index("<S>") == 1
        assert vocab.get_token_index("</S>") == 2
        assert vocab.get_token_index(DEFAULT_OOV_TOKEN) == 3
        assert vocab.get_token_index("a") == 4
        assert vocab.get_token_index("tricky\x0bchar") == 5
        assert vocab.get_token_index("word") == 6
        assert vocab.get_token_index("another") == 7
        assert vocab.get_token_from_index(0) == vocab._padding_token
        assert vocab.get_token_from_index(1) == "<S>"
        assert vocab.get_token_from_index(2) == "</S>"
        assert vocab.get_token_from_index(3) == DEFAULT_OOV_TOKEN
        assert vocab.get_token_from_index(4) == "a"
        assert vocab.get_token_from_index(5) == "tricky\x0bchar"
        assert vocab.get_token_from_index(6) == "word"
        assert vocab.get_token_from_index(7) == "another"
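
A rough standalone sketch (not the AllenNLP implementation) of the padded-file convention the test above exercises: index 0 is reserved for the padding token, the token on line k of the file gets index k, and the file's own OOV marker is swapped for the library's default OOV token. The default token names below are assumptions.

DEFAULT_PADDING_TOKEN = "@@PADDING@@"  # assumed default names
DEFAULT_OOV_TOKEN = "@@UNKNOWN@@"

def read_padded_vocab(lines, oov_token="<UNK>"):
    # Line k of the file (1-based) becomes index k; index 0 stays the padding token.
    index_to_token = {0: DEFAULT_PADDING_TOKEN}
    for line_number, line in enumerate(lines, start=1):
        token = line.rstrip("\n")
        index_to_token[line_number] = DEFAULT_OOV_TOKEN if token == oov_token else token
    token_to_index = {token: index for index, token in index_to_token.items()}
    return token_to_index, index_to_token

token_to_index, index_to_token = read_padded_vocab(
    ["<S>\n", "</S>\n", "<UNK>\n", "a\n", "tricky\x0bchar\n", "word\n", "another\n"]
)
assert token_to_index["a"] == 4
assert index_to_token[3] == DEFAULT_OOV_TOKEN
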
Example #2
    def test_add_word_to_index_gives_consistent_results(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1

        # Now add it again, and make sure nothing changes.
        vocab.add_token_to_namespace("word")
        assert "word" in vocab.get_index_to_token_vocabulary().values()
        assert vocab.get_token_index("word") == word_index
        assert vocab.get_token_from_index(word_index) == "word"
        assert vocab.get_vocab_size() == initial_vocab_size + 1
Example #3
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace='1')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='1').values()
        assert vocab.get_token_index("word", namespace='1') == word_index
        assert vocab.get_token_from_index(word_index, namespace='1') == "word"
        assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # Now add it again, in a different namespace and a different word, and make sure it's like
        # new.
        word2_index = vocab.add_token_to_namespace("word2", namespace='2')
        word_index = vocab.add_token_to_namespace("word", namespace='2')
        assert "word" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert "word2" in vocab.get_index_to_token_vocabulary(namespace='2').values()
        assert vocab.get_token_index("word", namespace='2') == word_index
        assert vocab.get_token_index("word2", namespace='2') == word2_index
        assert vocab.get_token_from_index(word_index, namespace='2') == "word"
        assert vocab.get_token_from_index(word2_index, namespace='2') == "word2"
        assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
 def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
     vocab_index_mapping: List[Tuple[int, int]] = []
     for index in range(self.vocab.get_vocab_size(namespace='tokens')):
         token = self.vocab.get_token_from_index(index=index, namespace='tokens')
         archived_token_index = archived_vocab.get_token_index(token, namespace='tokens')
         # Checking if we got the UNK token index, because we don't want all new token
         # representations initialized to UNK token's representation. We do that by checking if
         # the two tokens are the same. They will not be if the token at the archived index is
         # UNK.
         if archived_vocab.get_token_from_index(archived_token_index, namespace="tokens") == token:
             vocab_index_mapping.append((index, archived_token_index))
     return vocab_index_mapping
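
A minimal sketch of how a (new_index, archived_index) mapping like this is typically consumed; the tensors and the mapping below are hypothetical placeholders, not taken from the snippet.

import torch

new_embeddings = torch.randn(10, 4)       # fresh embedding matrix for the new vocab
archived_embeddings = torch.randn(8, 4)   # embedding matrix from the archived model
vocab_index_mapping = [(0, 0), (1, 3), (5, 2)]  # placeholder mapping

# Copy over rows that exist in both vocabularies; tokens that only mapped to UNK
# in the archive keep their fresh initialization.
for new_index, archived_index in vocab_index_mapping:
    new_embeddings[new_index] = archived_embeddings[archived_index]
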
Example #5
    def test_set_from_file_reads_non_padded_files(self):
        # pylint: disable=protected-access
        vocab_filename = self.TEST_DIR / 'vocab_file'
        with codecs.open(vocab_filename, 'w', 'utf-8') as vocab_file:
            vocab_file.write('B-PERS\n')
            vocab_file.write('I-PERS\n')
            vocab_file.write('O\n')
            vocab_file.write('B-ORG\n')
            vocab_file.write('I-ORG\n')

        vocab = Vocabulary()
        vocab.set_from_file(vocab_filename, is_padded=False, namespace='tags')
        assert vocab.get_token_index("B-PERS", namespace='tags') == 0
        assert vocab.get_token_index("I-PERS", namespace='tags') == 1
        assert vocab.get_token_index("O", namespace='tags') == 2
        assert vocab.get_token_index("B-ORG", namespace='tags') == 3
        assert vocab.get_token_index("I-ORG", namespace='tags') == 4
        assert vocab.get_token_from_index(0, namespace='tags') == "B-PERS"
        assert vocab.get_token_from_index(1, namespace='tags') == "I-PERS"
        assert vocab.get_token_from_index(2, namespace='tags') == "O"
        assert vocab.get_token_from_index(3, namespace='tags') == "B-ORG"
        assert vocab.get_token_from_index(4, namespace='tags') == "I-ORG"
Example #6
    def test_namespaces(self):
        vocab = Vocabulary()
        initial_vocab_size = vocab.get_vocab_size()
        word_index = vocab.add_token_to_namespace("word", namespace='1')
        assert "word" in vocab.get_index_to_token_vocabulary(
            namespace='1').values()
        assert vocab.get_token_index("word", namespace='1') == word_index
        assert vocab.get_token_from_index(word_index, namespace='1') == "word"
        assert vocab.get_vocab_size(namespace='1') == initial_vocab_size + 1

        # Now add it again, in a different namespace and a different word, and make sure it's like
        # new.
        word2_index = vocab.add_token_to_namespace("word2", namespace='2')
        word_index = vocab.add_token_to_namespace("word", namespace='2')
        assert "word" in vocab.get_index_to_token_vocabulary(
            namespace='2').values()
        assert "word2" in vocab.get_index_to_token_vocabulary(
            namespace='2').values()
        assert vocab.get_token_index("word", namespace='2') == word_index
        assert vocab.get_token_index("word2", namespace='2') == word2_index
        assert vocab.get_token_from_index(word_index, namespace='2') == "word"
        assert vocab.get_token_from_index(word2_index,
                                          namespace='2') == "word2"
        assert vocab.get_vocab_size(namespace='2') == initial_vocab_size + 2
 def _get_vocab_index_mapping(self, archived_vocab: Vocabulary) -> List[Tuple[int, int]]:
     vocab_index_mapping: List[Tuple[int, int]] = []
     for index in range(self.vocab.get_vocab_size(namespace="tokens")):
         token = self.vocab.get_token_from_index(index=index, namespace="tokens")
         archived_token_index = archived_vocab.get_token_index(token, namespace="tokens")
         # Checking if we got the UNK token index, because we don't want all new token
         # representations initialized to UNK token's representation. We do that by checking if
         # the two tokens are the same. They will not be if the token at the archived index is
         # UNK.
         if (
             archived_vocab.get_token_from_index(archived_token_index, namespace="tokens")
             == token
         ):
             vocab_index_mapping.append((index, archived_token_index))
     return vocab_index_mapping
Example #8
    def indices_to_tokens(self, indexed_tokens: IndexedTokenList,
                          vocabulary: Vocabulary) -> List[Token]:
        self._add_encoding_to_vocabulary_if_needed(vocabulary)

        token_ids = indexed_tokens["token_ids"]
        type_ids = indexed_tokens.get("type_ids")

        return [
            Token(
                text=vocabulary.get_token_from_index(token_ids[i],
                                                     self._namespace),
                text_id=token_ids[i],
                type_id=type_ids[i] if type_ids is not None else None,
            ) for i in range(len(token_ids))
        ]
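
A hedged round-trip sketch for an indexer with this method, assuming AllenNLP 1.x or newer, the PretrainedTransformerTokenizer / PretrainedTransformerIndexer pair, and network access to download bert-base-uncased.

from allennlp.data import Vocabulary
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
indexer = PretrainedTransformerIndexer("bert-base-uncased")
vocab = Vocabulary()

tokens = tokenizer.tokenize("a gallon of milk")
indexed = indexer.tokens_to_indices(tokens, vocab)    # also fills the transformer namespace
recovered = indexer.indices_to_tokens(indexed, vocab)
print([t.text for t in recovered])                    # wordpieces, including [CLS] and [SEP]
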
Example #9
def get_question_tensors_for_clause_tensors_batched(
        batch_size: int,
        vocab: Vocabulary,
        all_slots: Dict[str, torch.LongTensor],
        all_probs: torch.LongTensor):
    clause_slots = { k[len("clause-"):] : v for k, v in all_slots.items() if k.startswith("clause-")}
    question_slot_names = ["wh", "aux", "subj", "verb", "obj", "prep", "obj2"]
    clause_slot_names = ["subj", "aux", "verb", "obj", "prep1", "prep1-obj", "prep2", "prep2-obj", "misc", "qarg"]
    stringy_clause_slots = [
        {k : vocab.get_token_from_index(
                v[i].item(),
                namespace = get_slot_label_namespace("clause-%s" % k))
            for k, v in clause_slots.items()}
        for i in range(batch_size)
    ]
    filtered_stringy_clause_slots = []
    stringy_question_slots = []
    question_probs = []
    # for clause_slots, prob in zip(stringy_clause_slots, all_probs):
    for i in range(len(stringy_clause_slots)):
        try:
            stringy_question_slots.append(get_question_for_clause(stringy_clause_slots[i], vocab))
            filtered_stringy_clause_slots.append(stringy_clause_slots[i])
            question_probs.append(all_probs[i].item())
        except ValueError as e:
            print(str(e))

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda:%s" % torch.cuda.current_device())
    filtered_clause_slots = {
        ("clause-%s" % slot_name) : torch.tensor(
            [vocab.get_token_index(slots[slot_name], namespace = get_slot_label_namespace("clause-%s" % slot_name))
             for slots in stringy_clause_slots],
            device = device
        ).long()
        for slot_name in clause_slot_names
    }
    question_slots = {
        slot_name : torch.tensor(
            [vocab.get_token_index(slots[slot_name], namespace = get_slot_label_namespace(slot_name))
             for slots in stringy_question_slots],
            device = device
        ).long()
        for slot_name in question_slot_names
    }
    question_probs_tensor = torch.tensor(question_probs, device = device)
    return filtered_clause_slots, question_slots, question_probs_tensor
    def __init__(self, experiment_path: str,
                 vocab: Vocabulary,
                 embedder: SyllableEmbedder,
                 embedding_dim: int = 300) -> None:
        summary_writer = SummaryWriter(f'{experiment_path}/log/visualization')
        data_reader = KoWikiReader()

        words = [vocab.get_token_from_index(i, namespace='words')
                 for i in range(vocab.get_vocab_size('words'))]
        embeddings = torch.zeros(vocab.get_vocab_size('words'), embedding_dim)

        for i, c in enumerate(words):
            word = data_reader.text_to_instance(source=Token(c))['source']
            word.index(vocab)
            word_tensor = word.as_tensor(word.get_padding_lengths())['syllables']
            embeddings[i] = embedder(word_tensor)

        summary_writer.add_embedding(embeddings, metadata=words, tag='syllable_embeddings')
        summary_writer.close()
Example #11
 def index(self, vocab: Vocabulary):
     if self._index_label_id is None:
         self._index_label_id = vocab.get_token_index(
             self.index_label, self._index_label_namespace)  # type: ignore
     if self._set_label_id is None:
         self._set_label_id = vocab.get_token_index(
             self.set_label, self._set_label_namespace)  # type: ignore
     if len(self.target_label_set) == 0:
         set_set = vocab.get_token_from_index(self.index_label,
                                              self._set_index_map)
         flat_set = []
         for tgt_set in set_set:
             flat_set += vocab.get_token_index(
                 tgt_set, self._set_target_map)  # type: ignore
         self.target_label_set = set(flat_set)
         target_ind_set = [
             vocab.get_token_index(lb, self._target_label_namespace)
             for lb in self.target_label_set
         ]
         self.target_ind_set = sorted(list(target_ind_set))
Example #12
def decode_tokens(batch_predicted_indices: torch.Tensor,
                  vocab: Vocabulary,
                  end_index: int,
                  start_index: int = None,
                  vocab_namespace: str = 'tokens',
                  truncate: bool = False):
    if not isinstance(batch_predicted_indices, numpy.ndarray):
        batch_predicted_indices = batch_predicted_indices.detach().cpu().numpy()

    all_predicted_tokens = []
    for predicted_indices in batch_predicted_indices:
        # Beam search gives us the top k results for each source sentence in the batch
        # but we just want the single best.

        if len(predicted_indices.shape) == 1:
            predicted_indices = numpy.expand_dims(predicted_indices, axis=0)

        instance_predicted_tokens = []
        for indices in predicted_indices:
            # We add start token to the predictions.
            # In case it is present at position 0, remove it.
            if start_index is not None and start_index == indices[0]:
                indices = indices[1:]

            indices = list(indices)
            # Collect indices till the first end_symbol
            if truncate and end_index in indices:
                indices = indices[:indices.index(end_index)]
            predicted_tokens = [
                vocab.get_token_from_index(x, namespace=vocab_namespace)
                for x in indices
            ]

            instance_predicted_tokens.append(predicted_tokens)
        all_predicted_tokens.append(instance_predicted_tokens)
    return all_predicted_tokens
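
A hedged usage sketch for decode_tokens, assuming the function above is in scope together with an AllenNLP Vocabulary; the token strings and indices are placeholders.

import torch
from allennlp.data import Vocabulary

vocab = Vocabulary()
start = vocab.add_token_to_namespace("@start@")
end = vocab.add_token_to_namespace("@end@")
hello = vocab.add_token_to_namespace("hello")
world = vocab.add_token_to_namespace("world")

# Shape (batch, beam, length): one sentence with a single beam hypothesis.
predictions = torch.tensor([[[start, hello, world, end, hello]]])
decoded = decode_tokens(predictions, vocab, end_index=end, start_index=start, truncate=True)
print(decoded)  # expected under these assumptions: [[['hello', 'world']]]
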
Example #13
#
# for article in data:
#     story = article['story']

a = "the man went to the store and bought a gallon of milk"
b = tokenizer.tokenize(a)
print(b)

bert_vocab = Vocabulary()
c = token_indexer.tokens_to_indices(b, bert_vocab, 'bert')
print(c)

input_ids = c['bert']
for input_id in input_ids:
    tokens = [
        bert_vocab.get_token_from_index(index=idx, namespace='bert')
        for idx in input_id
    ]
    print(tokens)

d = token_embedder(torch.LongTensor(c['bert']))
print(d.size())

e = token_embedder(torch.LongTensor(c['bert']),
                   torch.LongTensor(c['bert-offsets']))
print(e.size())

# d = TextField(b, {'bert': token_indexer})
# print(b)
#
# sentence1 = a
Example #14
def train_lstm(train_dataset,
               batch_size,
               num_layers,
               use_elmo=False,
               epochs=15,
               bidirectional=True,
               learning_rate=3e-4,
               hidden_size=64,
               num_classes=2,
               use_gpu=False):
    """
    Trains an LSTM and its variants (vanilla, bidirectional, stacked BiLSTM) on train_dataset. Initialises word embeddings with pre-trained GloVe, or uses a pre-trained ELMo model to compute embeddings dynamically.


    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_layers: int
        number of BiLSTM layers: 2 or higher for Stacked BiLSTMs
    use_elmo: bool
        use elmo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=15)
    bidirectional: bool
        True for a bidirectional LSTM
    learning_rate: float
        learning rate for Adam Optimizer
    hidden_size: int
        size of the hidden layer in the encoder
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'],
                                      namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
        nn.LSTM(word_embeddings.get_output_dim(),
                hidden_size,
                num_layers=num_layers,
                bidirectional=bidirectional,
                batch_first=True))

    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(),
                                                    num_classes)
    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
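
A hedged invocation sketch, assuming train_dataset is the list of AllenNLP Instances produced elsewhere in this project; the hyperparameter values are placeholders.

import torch

model, vocab, n_epochs = train_lstm(
    train_dataset,
    batch_size=32,
    num_layers=2,
    use_elmo=False,   # GloVe embeddings
    epochs=10,
    bidirectional=True,
    use_gpu=torch.cuda.is_available(),
)
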
Example #15
def train_cnn(train_dataset,
              batch_size,
              num_filters,
              filter_sizes,
              use_elmo=False,
              epochs=15,
              learning_rate=3e-4,
              num_classes=2,
              use_gpu=False):
    """
    Trains a CNN on train_dataset. Initialises word embeddings with pre-trained GloVe, or uses a pre-trained ELMo model to compute embeddings dynamically.
    The CNN has one convolution layer for each ngram filter size.

    Parameters
    ----------
    train_dataset: List[Instance]
        Instances for training set
    batch_size: int
        number of Instances to process in a batch
    num_filters: int
        output dim for each convolutional layer, which is the number of 'filters' learned by that layer
    filter_sizes: Tuple[int]
        specifies the number of convolutional layers and their sizes
    use_elmo: bool
        use ELMo embeddings (transfer learning) if True | GloVe if False
    epochs: int
        total number of epochs to train on (default=15)
    learning_rate: float
        learning rate for Adam Optimizer
    num_classes: int
        default=2 for binary classification
    use_gpu: bool
        True to use the GPU

    Returns
    -------
    Trained Model, Vocabulary, Number of actual training epochs
    """
    if use_elmo:
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(tokens=['fic', 'non'],
                                      namespace="labels")
        word_embeddings: TextFieldEmbedder = load_elmo_embeddings()
    else:
        vocab = Vocabulary.from_instances(train_dataset)
        word_embeddings: TextFieldEmbedder = load_glove_embeddings(vocab)

    iterator = BucketIterator(batch_size=batch_size,
                              sorting_keys=[("tokens", "num_tokens")])

    iterator.index_with(vocab)  # numericalize the data

    assert vocab.get_token_from_index(index=0, namespace='labels') == 'fic'
    assert vocab.get_token_from_index(index=1, namespace='labels') == 'non'
    print("\n\nThe ordering of labels is ['fic', 'non']\n\n")

    encoder: Seq2VecEncoder = CnnEncoder(
        embedding_dim=word_embeddings.get_output_dim(),
        num_filters=num_filters,
        ngram_filter_sizes=filter_sizes)

    classifier_feedforward: FeedForward = nn.Linear(encoder.get_output_dim(),
                                                    num_classes)
    model = models.Classifier(vocab=vocab,
                              word_embeddings=word_embeddings,
                              encoder=encoder,
                              classifier_feedforward=classifier_feedforward)

    if use_gpu:
        model.cuda()

    optimizer = optim.Adam(model.parameters(), learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      cuda_device=0 if use_gpu else -1,
                      num_epochs=epochs)

    metrics = trainer.train()
    print(metrics)

    return model, vocab, metrics['training_epochs']
Example #16
if torch.cuda.is_available():  # assumed guard; the original snippet begins inside this branch
    cuda_device = 0
    token_embedding = token_embedding.cuda(cuda_device)
    token_ids = token_ids.cuda(cuda_device)
else:
    cuda_device = -1

token_embedding.eval()
with torch.no_grad():
    embeddings = token_embedding(token_ids).cpu().numpy()

print(embeddings.shape)
tsne = TSNE(n_components=2, verbose=1, perplexity=10, n_iter=300)
tsne_results = tsne.fit_transform(embeddings)

plt.figure()
for i, (x, y) in enumerate(tsne_results):
    label = vocab.get_token_from_index(i + 2)
    color, marker = label_to_color_and_marker(label)
    plt.scatter(x, y, s=1, c=color, marker=marker)
    plt.text(x, y, label, fontsize=2, color=color)
plt.savefig("figures/tsne_results.pdf")

print(embeddings.shape)
pca = PCA(n_components=2)
pca_results = pca.fit_transform(embeddings)

plt.figure()
for i, (x, y) in enumerate(pca_results):
    label = vocab.get_token_from_index(i + 2)
    color, marker = label_to_color_and_marker(label)
    plt.scatter(x, y, s=1, c=color, marker=marker)
    plt.text(x, y, label, fontsize=2, color=color)
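
The i + 2 offset above presumably skips the two reserved indices of a padded AllenNLP namespace; a quick check of that assumption, reusing the vocab from the script.

# Assumption: index 0 is the padding token and index 1 is the OOV token, so the
# embedding rows being plotted start at the first real token (index 2).
for index in range(3):
    print(index, vocab.get_token_from_index(index))
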
Example #17
    def test_from_params_valid_vocab_extension_thoroughly(self):
        '''
        Tests valid vocab extension thoroughly: vocab extension is valid
        when overlapping namespaces have the same padding behaviour (padded/non-padded).
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (of the tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        '''

        vocab_dir = self.TEST_DIR / 'vocab_save'
        original_vocab = Vocabulary(non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple", namespace="tokens0") # index:2
        original_vocab.add_token_to_namespace("bat", namespace="tokens0")   # index:3
        original_vocab.add_token_to_namespace("cat", namespace="tokens0")   # index:4

        original_vocab.add_token_to_namespace("apple", namespace="tokens1") # index:0
        original_vocab.add_token_to_namespace("bat", namespace="tokens1")   # index:1
        original_vocab.add_token_to_namespace("cat", namespace="tokens1")   # index:2

        original_vocab.add_token_to_namespace("a", namespace="tokens2") # index:0
        original_vocab.add_token_to_namespace("b", namespace="tokens2") # index:1
        original_vocab.add_token_to_namespace("c", namespace="tokens2") # index:2

        original_vocab.add_token_to_namespace("p", namespace="tokens3") # index:0
        original_vocab.add_token_to_namespace("q", namespace="tokens3") # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens0": SingleIdTokenIndexer("tokens0")})
        text_field1 = TextField([Token(t) for t in ["cat", "an", "apple", "banana", "atom", "bat"]],
                                {"tokens1": SingleIdTokenIndexer("tokens1")})
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([Instance({"text0": text_field0, "text1": text_field1,
                                     "text4": text_field4, "text5": text_field5})])

        params = Params({"directory_path": vocab_dir,
                         "extend": True,
                         "non_padded_namespaces": ["tokens1", "tokens5"]})
        extended_vocab = Vocabulary.from_params(params, instances)

        # namespaces tokens0 and tokens1 are common; tokens2 and tokens3 exist only in
        # the original vocab, while tokens4 and tokens5 exist only in the instances
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that _non_padded_namespaces is consistent after extension
        assert extended_vocab._non_padded_namespaces == {"tokens1", "tokens3", "tokens5"}

        # original_vocab's "tokens1" has 3 tokens; the instances' "tokens1" namespace has 6 tokens (3 overlapping)
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size("tokens0") == 8 # 2 extra overlapping because padded

        # namespaces tokens2 and tokens3 were only in original_vocab,
        # so their token counts should be unchanged in extended_vocab
        assert extended_vocab.get_vocab_size("tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size("tokens3") == original_vocab.get_vocab_size("tokens3")

        # namespaces tokens4 and tokens5 were only in the instances,
        # so their token counts come from the instances alone
        assert extended_vocab.get_vocab_size("tokens4") == 6 # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3 # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(index, namespace)
                assert vocab_token == extended_vocab_token
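
As a standalone sanity check on the sizes asserted above, a small sketch of the arithmetic, assuming a padded namespace reserves two extra slots (padding and OOV) while a non-padded one does not.

original_tokens = {"apple", "bat", "cat"}
instance_tokens = {"cat", "an", "apple", "banana", "atom", "bat"}

# non-padded "tokens1": plain union of the two token sets
assert len(original_tokens | instance_tokens) == 6

# padded "tokens0": the same union plus the padding and OOV slots
assert len(original_tokens | instance_tokens) + 2 == 8

# padded "tokens4", built only from the instance tokens l, m, n, o
assert len({"l", "m", "n", "o"}) + 2 == 6
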
Example #18
    def test_from_params_valid_vocab_extension_thoroughly(self):
        """
        Tests valid vocab extension thoroughly: vocab extension is valid
        when overlapping namespaces have the same padding behaviour (padded/non-padded).
        Summary of namespace paddings in this test:
        original_vocab namespaces
            tokens0     padded
            tokens1     non-padded
            tokens2     padded
            tokens3     non-padded
        instances namespaces
            tokens0     padded
            tokens1     non-padded
            tokens4     padded
            tokens5     non-padded
        Typical extension example (of the tokens1 namespace):
        -> original_vocab index2token
           apple          #0->apple
           bat            #1->bat
           cat            #2->cat
        -> Tokens to extend with: cat, an, apple, banana, atom, bat
        -> extended_vocab: index2token
           apple           #0->apple
           bat             #1->bat
           cat             #2->cat
           an              #3->an
           atom            #4->atom
           banana          #5->banana
        """

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(
            non_padded_namespaces=["tokens1", "tokens3"])
        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens0")  # index:2
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens0")  # index:3
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens0")  # index:4

        original_vocab.add_token_to_namespace("apple",
                                              namespace="tokens1")  # index:0
        original_vocab.add_token_to_namespace("bat",
                                              namespace="tokens1")  # index:1
        original_vocab.add_token_to_namespace("cat",
                                              namespace="tokens1")  # index:2

        original_vocab.add_token_to_namespace("a",
                                              namespace="tokens2")  # index:0
        original_vocab.add_token_to_namespace("b",
                                              namespace="tokens2")  # index:1
        original_vocab.add_token_to_namespace("c",
                                              namespace="tokens2")  # index:2

        original_vocab.add_token_to_namespace("p",
                                              namespace="tokens3")  # index:0
        original_vocab.add_token_to_namespace("q",
                                              namespace="tokens3")  # index:1

        original_vocab.save_to_files(vocab_dir)

        text_field0 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens0": SingleIdTokenIndexer("tokens0")},
        )
        text_field1 = TextField(
            [
                Token(t)
                for t in ["cat", "an", "apple", "banana", "atom", "bat"]
            ],
            {"tokens1": SingleIdTokenIndexer("tokens1")},
        )
        text_field4 = TextField([Token(t) for t in ["l", "m", "n", "o"]],
                                {"tokens4": SingleIdTokenIndexer("tokens4")})
        text_field5 = TextField([Token(t) for t in ["x", "y", "z"]],
                                {"tokens5": SingleIdTokenIndexer("tokens5")})
        instances = Batch([
            Instance({
                "text0": text_field0,
                "text1": text_field1,
                "text4": text_field4,
                "text5": text_field5,
            })
        ])

        params = Params({
            "type": "extend",
            "directory": vocab_dir,
            "non_padded_namespaces": ["tokens1", "tokens5"],
        })
        extended_vocab = Vocabulary.from_params(params, instances=instances)

        # namespaces tokens0 and tokens1 are common; tokens2 and tokens3 exist only in
        # the original vocab, while tokens4 and tokens5 exist only in the instances
        extended_namespaces = {*extended_vocab._token_to_index}
        assert extended_namespaces == {"tokens{}".format(i) for i in range(6)}

        # Check that _non_padded_namespaces is consistent after extension
        assert extended_vocab._non_padded_namespaces == {
            "tokens1", "tokens3", "tokens5"
        }

        # original_vocab's "tokens1" has 3 tokens; the instances' "tokens1" namespace has 6 tokens (3 overlapping)
        assert extended_vocab.get_vocab_size("tokens1") == 6
        assert extended_vocab.get_vocab_size(
            "tokens0") == 8  # 2 extra overlapping because padded

        # namespaces tokens2 and tokens3 were only in original_vocab,
        # so their token counts should be unchanged in extended_vocab
        assert extended_vocab.get_vocab_size(
            "tokens2") == original_vocab.get_vocab_size("tokens2")
        assert extended_vocab.get_vocab_size(
            "tokens3") == original_vocab.get_vocab_size("tokens3")

        # namespaces tokens4 and tokens5 were only in the instances,
        # so their token counts come from the instances alone
        assert extended_vocab.get_vocab_size(
            "tokens4") == 6  # l,m,n,o + oov + padding
        assert extended_vocab.get_vocab_size("tokens5") == 3  # x,y,z

        # Word2index mapping of all words in all namespaces of original_vocab
        # should be maintained in extended_vocab
        for namespace, token2index in original_vocab._token_to_index.items():
            for token, _ in token2index.items():
                vocab_index = original_vocab.get_token_index(token, namespace)
                extended_vocab_index = extended_vocab.get_token_index(
                    token, namespace)
                assert vocab_index == extended_vocab_index
        # And same for Index2Word mapping
        for namespace, index2token in original_vocab._index_to_token.items():
            for index, _ in index2token.items():
                vocab_token = original_vocab.get_token_from_index(
                    index, namespace)
                extended_vocab_token = extended_vocab.get_token_from_index(
                    index, namespace)
                assert vocab_token == extended_vocab_token
class KNNPredictor(Predictor):
    def __init__(self, 
                 model: Model, 
                 dataset_reader: DatasetReader,
                 vocab_path: str = 'resources/vocab',
                 df_path: str = 'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/spotify_with_genius.csv',
                 annoy_index_path: str = 'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/index.tree'
                ) -> None:
        super().__init__(model.eval(), dataset_reader)
        
        self.vocab = Vocabulary.from_files(vocab_path)
        self.df = pd.read_csv(df_path).set_index("track_id")
        
        self.index = None
        if annoy_index_path:
            self.build_index(annoy_index_path)
    
    def build_index(self, path: str, tracks: List[Tuple[str, np.ndarray]] = None):
        features = self._model.classifier_feedforward.get_output_dim()
        if tracks is None:
            if not os.path.exists(path):
                path = urlretrieve(path)[0]
            self.index = AnnoyIndex(features, metric='angular')
            self.index.load(path)
            return
        
        index = AnnoyIndex(features, metric='angular')
        for track, vector in tqdm(tracks):
            i = self.vocab.get_token_to_index_vocabulary("labels")[track]
            index.add_item(i, vector)
        
        index.build(-1)
        index.save(path)
        
        self.index = index
    
    def neighbors_to_tracks(self, nns):
        tracks = [self.vocab.get_token_from_index(i, "labels") for i in nns]
        return self.df.loc[tracks].reset_index(drop=True).to_dict(orient='records')
    
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        n = inputs.pop('n', 10)
        if 'track_id' in inputs:
            if self.index is None:
                raise AttributeError("Please build an index before searching by track.")
            idx = self.vocab.get_token_to_index_vocabulary("labels")[inputs['track_id']]
            nns = self.index.get_nns_by_item(idx, n+1)[1:]
            #scores = self.index.get_item_vector(idx) 
            tracks = self.neighbors_to_tracks(nns)
            return tracks
            #return {'tracks': tracks, 'scores': scores}
            
            
        instance = self._json_to_instance(inputs)
        output_dict = self.predict_instance(instance)
        output_dict['inputs'] = inputs
        if self.index:
            logits = output_dict.get('logits')
            nns = self.index.get_nns_by_vector(logits, n)
            return self.neighbors_to_tracks(nns)
            #output_dict['tracks'] = self.neighbors_to_tracks(nns)
        return output_dict

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        return self._dataset_reader.text_to_instance(text=json_dict['query'])
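
A hedged usage sketch for KNNPredictor; model and dataset_reader are assumed to be a trained Classifier and its DatasetReader from this project, and the track id and query strings are placeholders.

predictor = KNNPredictor(model, dataset_reader)

# Nearest neighbours of an existing track, looked up through the "labels" namespace.
print(predictor.predict_json({"track_id": "<some_track_id>", "n": 5}))

# Nearest neighbours of a free-text query, scored via the model's logits.
print(predictor.predict_json({"query": "late night drive", "n": 5}))
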