def test_get_sentences(self):
    # The fixture mixes plain sentence ends with periods that sit inside a
    # decimal number ("2.0") and an abbreviation ("H. pylori"); the expected
    # list below keeps both of those inside a single sentence.
    wanted_sentences = [
        "First sentence.",
        "Second sentence.",
        "Sentence with 2.0 number.",
        "Sentence with H. pylori.",
        "Good bye.",
    ]
    raw_text = "First sentence. Second sentence. Sentence with 2.0 number. Sentence with H. pylori. Good bye."

    # Compare the splitter output with the expected list, element by element.
    self.assertListEqual(wanted_sentences, get_sentences(raw_text))
Exemplo n.º 2
0
def main(article_data_sources, writers, sentence_finder, data_sources_to_skip=0, sentences_to_skip=0):
    """Run every sentence of every article through the finder and the writers.

    For each data source, starting at index ``data_sources_to_skip``, the
    articles returned by ``get_articles()`` are split with ``get_sentences``.
    Each sentence text is passed to ``sentence_finder.get_sentence``; every
    truthy result is handed to ``write`` on each writer. Progress and the
    final total are reported through the loggers in ``constants``.

    Args:
        article_data_sources: sequence (``len()`` and integer indexing are
            used) of objects exposing ``get_articles()``; every article must
            have a ``text`` attribute.
        writers: collection of objects exposing ``write(sentence)``; it is
            iterated once per written sentence, so it must be re-iterable.
        sentence_finder: object exposing ``get_sentence(sentence_text, article)``.
        data_sources_to_skip: index of the first data source to process.
        sentences_to_skip: number of leading sentences to discard in the
            first processed data source only; it is reset to 0 afterwards.

    Exceptions raised by ``sentence_finder.get_sentence`` are logged and the
    offending sentence is skipped; any other exception propagates.
    """
    data_source_names = [str(source) for source in article_data_sources]
    constants.logger.info("data sources: %s" % data_source_names)
    total_sentence_number = 0
    for i in range(data_sources_to_skip, len(article_data_sources)):
        article_data_source = article_data_sources[i]

        articles = article_data_source.get_articles()
        # todo: sort to be able to continue
        # Lazy stream of (sentence text, article) pairs, so articles are only
        # split into sentences as the loops below consume them.
        sentences_articles_tuples = ((sentence, article) for article in articles
                                     for sentence in get_sentences(article.text))

        constants.logger.info("start looping sentences with data source №%i %s" % (i + 1, article_data_source))

        # Discard the requested prefix. next() is given a default so that a
        # skip count larger than the number of available sentences ends the
        # skipping quietly instead of leaking StopIteration out of main().
        # The stream yields tuples only, so None unambiguously means "empty".
        sentence_number = 0
        for _ in range(sentences_to_skip):
            if next(sentences_articles_tuples, None) is None:
                constants.logger.info(
                    "data source № %i has only %i sentences; requested to skip %i"
                    % (i + 1, sentence_number, sentences_to_skip))
                break
            sentence_number += 1
        # Only the first processed data source is resumed mid-way.
        sentences_to_skip = 0

        # NOTE(review): sentence_number starts at the count of raw sentences
        # skipped but below grows only for sentences that were written, so the
        # logged number may not be a valid resume offset when some sentences
        # fail or are filtered out -- confirm the intended meaning.
        for sentence_text, article in sentences_articles_tuples:
            try:
                sentence = sentence_finder.get_sentence(sentence_text, article)
            except Exception:
                # Best effort: one bad sentence must not stop the whole run.
                constants.logger.info(format_exc())
                constants.logger.info("sentence with error: %s" % sentence_text)
                constants.logger.info("got error in sentence loop; continue")
                continue
            if not sentence:
                continue

            for writer in writers:
                writer.write(sentence)

            sentence_number += 1
            constants.logger.info("memory usage: %f" % memory_usage_psutil())
            constants.logger.info("sentence № %i, data source № %i\n%s" % (sentence_number, i + 1, sentence_text))
            constants.logger.info("=" * 80)

        total_sentence_number += sentence_number
        constants.logger.info("finish looping sentences with %s\n" % article_data_source)
    constants.pattern_logger.info('total number sentences: %i' % total_sentence_number)