Пример #1
0
class SentenceStream(object):
    """Iterable that yields the tokens of each sentence in the documents.

    ``__iter__`` is a generator, so every new iteration walks ``self.docs``
    again from the start.
    """

    def __init__(self):
        """Create the document library and the tokenizer used for streaming."""
        self.docs = CaseReportLibrary()
        self.tokenizer = SimpleTokenizer()

    def __iter__(self):
        """Yield the tokenized form of one sentence at a time.

        For each document in ``self.docs`` the text returned by
        ``get_text()`` is lower-cased and split with ``sent_tokenize``; every
        resulting sentence is passed to ``self.tokenizer.tokenize`` and the
        return value is yielded unchanged. After all sentences of a document
        have been consumed, a progress message is logged at INFO level.
        """
        doc_count = len(self.docs)
        # Start counting at 1 so the log reads "1/N" after the first document.
        for count, doc in enumerate(self.docs, 1):
            # NOTE(review): the text is lower-cased *before* sentence
            # splitting; if sent_tokenize relies on capitalisation cues this
            # may affect sentence boundaries -- confirm the order is intended.
            for sentence in sent_tokenize(doc.get_text().lower()):
                yield self.tokenizer.tokenize(sentence)
            # Pass the values as arguments so the message is only formatted
            # when an INFO record is actually emitted.
            logging.info("%s/%s documents streamed", count, doc_count)
Пример #2
0
 def __init__(self):
     """Attach a new document library and a new tokenizer to the instance.

     ``self.docs`` receives a ``CaseReportLibrary`` instance and
     ``self.tokenizer`` receives a ``SimpleTokenizer`` instance, created in
     that order.
     """
     library = CaseReportLibrary()
     self.docs = library
     word_tokenizer = SimpleTokenizer()
     self.tokenizer = word_tokenizer