class SentenceStream(object):
    """Iterable that streams tokenized sentences from a case-report corpus.

    Each iteration yields one sentence as a list of lowercase tokens
    (suitable for e.g. gensim Word2Vec training, which re-iterates the
    corpus multiple times). Progress is logged once per document.
    """

    def __init__(self):
        # Corpus of case-report documents and the per-sentence tokenizer.
        # Both are project types defined elsewhere in the package.
        self.docs = CaseReportLibrary()
        self.tokenizer = SimpleTokenizer()

    def __iter__(self):
        """Yield one token list per sentence across all documents."""
        doc_count = len(self.docs)
        for count, doc in enumerate(self.docs, start=1):
            # Lowercase the whole document before sentence-splitting so
            # emitted tokens are case-normalized.
            for sentence in sent_tokenize(doc.get_text().lower()):
                yield self.tokenizer.tokenize(sentence)
            # Lazy %-style logging args: the string is only formatted if
            # the record is actually emitted at this log level.
            logging.info("%s/%s documents streamed", count, doc_count)