class ReviewCorpus(object):
    """Re-iterable stream over a file holding one JSON-encoded review per line.

    Every call to ``__iter__`` re-opens ``filename`` and reads it line by
    line, so the reviews are never all held in memory and the corpus can be
    traversed more than once.
    """

    def __init__(self, filename, dictionary):
        """Store the input path, the dictionary and a fresh tokenizer.

        Args:
            filename: Path of the file with one JSON document per line.
            dictionary: Object exposing ``doc2bow(tokens)``; its return value
                is what iteration yields for each review.
        """
        self.filename = filename
        self.dictionary = dictionary
        self.tokenizer = SimpleTokenizer()

    def __iter__(self):
        """Yield ``self.dictionary.doc2bow(tokens)`` for each review in the file.

        Each non-blank line is parsed with ``json.loads`` and the parsed
        object is handed to ``self.tokenizer.tokenize``.

        Raises:
            ValueError: If a non-blank line is not valid JSON.
        """
        # Imported here so the block is self-contained; io.open accepts
        # ``encoding`` on both Python 2 and 3 (it is the builtin open on 3).
        import io

        # Decode explicitly as UTF-8 rather than the platform default, which
        # would corrupt or reject non-ASCII review text on some systems.
        with io.open(self.filename, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank separator/trailing lines instead of letting
                # json.loads abort the whole stream on them.
                if not line.strip():
                    continue
                review = json.loads(line)
                tokens = self.tokenizer.tokenize(review)
                yield self.dictionary.doc2bow(tokens)
def load_reviews(filename, tokenizer=None):
    """Lazily yield the tokenized form of each review stored in ``filename``.

    The file is expected to hold one JSON document per line. Each non-blank
    line is parsed with ``json.loads`` and the parsed object is passed to the
    tokenizer's ``tokenize`` method; that method's return value is yielded.

    Args:
        filename: Path of the file with one JSON document per line.
        tokenizer: Optional object exposing ``tokenize(review)``. When
            omitted, a new ``SimpleTokenizer()`` is created.

    Yields:
        Whatever ``tokenizer.tokenize`` returns for each review, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
    """
    # Imported here so the block is self-contained; io.open accepts
    # ``encoding`` on both Python 2 and 3 (it is the builtin open on 3).
    import io

    if tokenizer is None:
        tokenizer = SimpleTokenizer()

    # Decode explicitly as UTF-8 rather than the platform default, which
    # would corrupt or reject non-ASCII review text on some systems.
    with io.open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank separator/trailing lines instead of letting
            # json.loads abort the whole stream on them.
            if not line.strip():
                continue
            yield tokenizer.tokenize(json.loads(line))