def __init__(self, docs, embeddings, cuda, word_dropout=0, max_len=-1):
    # print(docs)
    mini_vocab = Vocab.from_docs(docs, default=UNK_IDX,
                                 start=START_TOKEN_IDX, end=END_TOKEN_IDX)
    # Limit maximum document length (for efficiency reasons).
    if max_len != -1:
        docs = [doc[:max_len] for doc in docs]
    doc_lens = [len(doc) for doc in docs]
    self.doc_lens = cuda(torch.LongTensor(doc_lens))
    self.max_doc_len = max(doc_lens)
    if word_dropout:
        # For each token, with probability `word_dropout`, replace the word index with UNK_IDX.
        docs = [
            [UNK_IDX if np.random.rand() < word_dropout else x for x in doc]
            for doc in docs
        ]
    # Pad docs so they all have the same length.
    # We pad with UNK, whose embedding is 0, so it doesn't mess up sums or averages.
    docs = [
        right_pad(mini_vocab.numberize(doc), self.max_doc_len, UNK_IDX)
        for doc in docs
    ]
    self.docs = [cuda(fixed_var(torch.LongTensor(doc))) for doc in docs]
    local_embeddings = [embeddings[i] for i in mini_vocab.names]
    self.embeddings_matrix = cuda(fixed_var(FloatTensor(local_embeddings).t()))
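# Example usage (a minimal sketch, not taken from the source: the enclosing class
# name `Batch`, the layout of `embeddings`, and passing the identity function as
# `cuda` are all assumptions made for illustration):
#
#     docs = [[4, 7, 9, 2], [4, 5]]                 # documents as lists of global word indices
#     embeddings = [[0.0] * 50] + [list(np.random.rand(50)) for _ in range(9)]
#                                                   # index 0 (UNK) embeds to the zero vector
#     batch = Batch(docs, embeddings, cuda=lambda t: t, word_dropout=0.1, max_len=100)
#     # batch.docs holds equal-length index tensors into batch.embeddings_matrix,
#     # a (dim x local_vocab_size) matrix covering only the words used in this batch.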
def test_denumberize_numberize(self):
    """Tests that `denumberize` is a left inverse of `numberize`."""
    fixture1 = [["a", "b", "c"], ["d", "e", "f"], ["a", "f", "b"], ["b", "e", "d"]]
    fixture2 = [[0, 1, 2], [3, 4, 5], [0, 5, 1], [2, 4, 3]]
    for fixture in (fixture1, fixture2):
        v = Vocab.from_docs(fixture)
        for doc in fixture:
            self.assertEqual(v.denumberize(v.numberize(doc)), doc)
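# A concrete instance of the round-trip property checked above (a sketch, assuming
# the usual Vocab contract implied by this test: `numberize` maps items to local
# indices and `denumberize` maps those indices back to the original items):
#
#     v = Vocab.from_docs([["a", "b"], ["b", "c"]])
#     ids = v.numberize(["a", "c"])       # some local indices, e.g. [0, 2]
#     v.denumberize(ids)                  # ["a", "c"] -- the input is recovered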