Example #1
    def __init__(self, docs, embeddings, cuda, word_dropout=0, max_len=-1):
        """Build a batch from `docs`: numberize them, optionally truncate and
        apply word dropout, pad to a common length, and gather the embeddings
        of the words that actually occur in this batch.

        `cuda` is a callable that moves a tensor to the GPU (or is a no-op on
        CPU); `max_len == -1` means no truncation.
        """
        mini_vocab = Vocab.from_docs(docs,
                                     default=UNK_IDX,
                                     start=START_TOKEN_IDX,
                                     end=END_TOKEN_IDX)
        # Limit maximum document length (for efficiency reasons).
        if max_len != -1:
            docs = [doc[:max_len] for doc in docs]
        doc_lens = [len(doc) for doc in docs]
        self.doc_lens = cuda(torch.LongTensor(doc_lens))
        self.max_doc_len = max(doc_lens)
        if word_dropout:
            # For each token, with probability `word_dropout`, replace its
            # word index with UNK_IDX.
            docs = [[
                UNK_IDX if np.random.rand() < word_dropout else x for x in doc
            ] for doc in docs]
        # Pad docs so they all have the same length. We pad with UNK, whose
        # embedding is 0, so padding doesn't affect sums or averages.
        docs = [
            right_pad(mini_vocab.numberize(doc), self.max_doc_len, UNK_IDX)
            for doc in docs
        ]
        self.docs = [cuda(fixed_var(torch.LongTensor(doc))) for doc in docs]
        # Embedding matrix restricted to this batch's vocabulary, transposed
        # so each column is one word's embedding.
        local_embeddings = [embeddings[i] for i in mini_vocab.names]
        self.embeddings_matrix = cuda(
            fixed_var(FloatTensor(local_embeddings).t()))
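
The padding comment above relies on the UNK embedding being a zero vector, so padded positions contribute nothing to sums (and to averages, provided the true `doc_lens` are used as divisors). Below is a minimal, self-contained sketch of that property; the names are illustrative only and not taken from this project.

import torch

UNK_IDX = 0
# Toy embedding table: 4 words, 3 dimensions; row 0 (UNK) is all zeros.
embedding_table = torch.tensor([[0.0, 0.0, 0.0],
                                [1.0, 2.0, 3.0],
                                [4.0, 5.0, 6.0],
                                [7.0, 8.0, 9.0]])

doc = [1, 3]                          # indices of the real tokens
padded_doc = doc + [UNK_IDX] * 2      # right-padded to length 4

# The zero UNK rows contribute nothing, so the sum over the padded document
# equals the sum over the unpadded one.
assert torch.equal(embedding_table[doc].sum(dim=0),
                   embedding_table[padded_doc].sum(dim=0))

# For an average, divide by the true document length (2), not the padded length.
avg = embedding_table[padded_doc].sum(dim=0) / len(doc)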
Example #2
    def test_denumberize_numberize(self):
        """ Tests that `denumberize` is left inverse of `numberize` """
        fixture1 = [["a", "b", "c"], ["d", "e", "f"], ["a", "f", "b"],
                    ["b", "e", "d"]]
        fixture2 = [[0, 1, 2], [3, 4, 5], [0, 5, 1], [2, 4, 3]]

        for fixture in (fixture1, fixture2):
            v = Vocab.from_docs(fixture)
            for doc in fixture:
                self.assertEqual(v.denumberize(v.numberize(doc)), doc)
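
For context, the property being tested is a round trip: numberizing a document and then denumberizing the result must reproduce the original tokens. The following is only a rough sketch of a class with that behaviour, under assumed semantics; the project's actual `Vocab.from_docs`, as used in Example #1, also accepts `default`, `start`, and `end` arguments, which are omitted here.

class Vocab:
    def __init__(self, names):
        self.names = list(names)                                   # index -> token
        self.index = {tok: i for i, tok in enumerate(self.names)}  # token -> index

    @classmethod
    def from_docs(cls, docs):
        # Assign an index to each distinct token, in order of first occurrence.
        names = []
        for doc in docs:
            for token in doc:
                if token not in names:
                    names.append(token)
        return cls(names)

    def numberize(self, doc):
        return [self.index[token] for token in doc]

    def denumberize(self, doc):
        return [self.names[i] for i in doc]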