class Corpus(object): def __init__(self, path): self.dictionary = Dictionary() self.train = self.tokenize(os.path.join(path, 'train.txt')) self.valid = self.tokenize(os.path.join(path, 'valid.txt')) self.test = self.tokenize(os.path.join(path, 'test.txt')) def tokenize(self, path): """Tokenizes a text file.""" assert os.path.exists(path) # Add words to the dictionary with open(path, 'r') as f: tokens = 0 for line in f: words = line.split() + ['<eos>'] tokens += len(words) for word in words: self.dictionary.add_word(word) # Tokenize file content with open(path, 'r') as f: ids = torch.LongTensor(tokens) token = 0 for line in f: words = line.split() + ['<eos>'] for word in words: ids[token] = self.dictionary.word2idx[word] token += 1 return ids