def __next__(self) -> List[str]:
        """Produce the word list for the next sentence.

        The sentence is stripped of surrounding whitespace and split into words by a language object: the one
        matching the detected language code when detection is reported as reliable, otherwise (or when no language
        object exists for the detected code) the default language.

        Returns an empty list when the stripped sentence is empty or when splitting yields no words.

        Raises StopIteration when there is no next sentence.
        """

        raw_sentence = self.__next_sentence()
        if raw_sentence is None:
            raise StopIteration

        stripped = raw_sentence.strip()
        if len(stripped) == 0:
            return []

        # Only consult the language detector when it claims its guess would be reliable
        detected = None
        if identification_would_be_reliable(stripped):
            detected = LanguageFactory.language_for_code(language_code_for_text(stripped))

        splitter = LanguageFactory.default_language() if detected is None else detected

        tokens = splitter.split_sentence_to_words(stripped)
        return tokens if len(tokens) else []
    def __next__(self) -> List[str]:
        """Read the next line from the copy-to handle and return it split into words.

        Once get_line() returns None the handle is ended, the reference to it is dropped, and every later call
        raises StopIteration immediately. Every line read is counted, and a progress message is logged on each
        1000th line.

        Returns an empty list when the stripped line is empty or when splitting yields no words.

        Raises StopIteration when the handle is exhausted or was already released.
        """

        source = self.__copy_to
        if source is None:
            raise StopIteration

        line = source.get_line()
        if line is None:
            # Input is exhausted: finish the handle and forget it so later calls stop right away
            source.end()
            self.__copy_to = None
            raise StopIteration

        stripped = line.strip()

        # Blank lines are counted too, as the counter is bumped before the emptiness check
        self.__sentence_counter += 1
        if self.__sentence_counter % 1000 == 0:
            log.info("Feeding sentence %d..." % self.__sentence_counter)

        if len(stripped) == 0:
            return []

        # Only consult the language detector when it claims its guess would be reliable
        detected = None
        if identification_would_be_reliable(stripped):
            detected = LanguageFactory.language_for_code(language_code_for_text(stripped))

        splitter = LanguageFactory.default_language() if detected is None else detected

        tokens = splitter.split_sentence_to_words(stripped)
        return tokens if len(tokens) else []
示例#3
0
    def __next__(self) -> List[str]:
        """Return the words of the next sentence, or an empty list if there are none.

        The next sentence is stripped; if anything is left, its language is detected (only when detection is
        reported as reliable) and the matching language object splits it into words. The default language is
        used whenever no language object could be determined.

        Raises StopIteration when there is no next sentence.
        """

        upcoming = self.__next_sentence()
        if upcoming is None:
            raise StopIteration

        text = upcoming.strip()
        if len(text) == 0:
            return []

        chosen_language = None
        if identification_would_be_reliable(text):
            code = language_code_for_text(text)
            chosen_language = LanguageFactory.language_for_code(code)

        # Unreliable detection and unsupported codes both end up here
        if chosen_language is None:
            chosen_language = LanguageFactory.default_language()

        word_list = chosen_language.split_sentence_to_words(text)
        if len(word_list) == 0:
            return []
        return word_list
示例#4
0
    def test_language_is_enabled(self):
        """'en' and 'lt' are enabled; None, an empty code and 'xx' are not."""
        for enabled_code in ('en', 'lt'):
            assert LanguageFactory.language_is_enabled(enabled_code) is True

        # noinspection PyTypeChecker
        for disabled_code in (None, '', 'xx'):
            assert LanguageFactory.language_is_enabled(disabled_code) is False
示例#5
0
    def test_language_is_enabled(self):
        """language_is_enabled() must answer True for 'en' / 'lt' and False for None, '' and 'xx'."""
        # noinspection PyTypeChecker
        expectations = (
            ('en', True),
            ('lt', True),
            (None, False),
            ('', False),
            ('xx', False),
        )
        for code, should_be_enabled in expectations:
            assert LanguageFactory.language_is_enabled(code) is should_be_enabled
示例#6
0
    def test_language_for_code(self):
        """Supported codes map to their language classes; None, '' and 'xx' map to None."""
        expected_classes = (
            ('en', EnglishLanguage),
            ('lt', LithuanianLanguage),
        )
        for code, language_class in expected_classes:
            assert isinstance(LanguageFactory.language_for_code(code), language_class)

        # noinspection PyTypeChecker
        for unsupported_code in (None, '', 'xx'):
            assert LanguageFactory.language_for_code(unsupported_code) is None
def test_language_code_for_text():
    """Empty / missing text yields an empty code; each enabled language's sample sentence yields its own code."""
    # noinspection PyTypeChecker
    for blank_text in ('', None):
        assert language_code_for_text(text=blank_text) == ''

    for expected_code in LanguageFactory.enabled_languages():
        sample = LanguageFactory.language_for_code(expected_code).sample_sentence()
        assert expected_code == language_code_for_text(text=sample)
def test_identification_would_be_reliable():
    """Empty / missing text is never reliable; every enabled language's sample sentence is."""
    # noinspection PyTypeChecker
    for blank_text in ('', None):
        assert identification_would_be_reliable(text=blank_text) is False

    for code in LanguageFactory.enabled_languages():
        sample = LanguageFactory.language_for_code(code).sample_sentence()
        assert identification_would_be_reliable(text=sample)
示例#9
0
def test_identification_would_be_reliable():
    """Check the reliability predicate against blank input and against each enabled language's sample sentence."""
    # Blank input: exactly False is expected, not merely a falsy value
    # noinspection PyTypeChecker
    for unusable_text in ('', None):
        verdict = identification_would_be_reliable(text=unusable_text)
        assert verdict is False

    # Sample sentences: any truthy result is accepted
    for language_code in LanguageFactory.enabled_languages():
        language = LanguageFactory.language_for_code(language_code)
        sample_text = language.sample_sentence()
        assert identification_would_be_reliable(text=sample_text)
示例#10
0
    def test_language_for_code(self):
        """language_for_code() returns the right language class, or None for codes it doesn't support."""
        english = LanguageFactory.language_for_code('en')
        assert isinstance(english, EnglishLanguage)

        lithuanian = LanguageFactory.language_for_code('lt')
        assert isinstance(lithuanian, LithuanianLanguage)

        # noinspection PyTypeChecker
        for unsupported_code in (None, '', 'xx'):
            assert LanguageFactory.language_for_code(unsupported_code) is None
示例#11
0
def test_language_code_for_text():
    """Detection returns '' for blank input and round-trips every enabled language's sample sentence."""
    # noinspection PyTypeChecker
    for unusable_text in ('', None):
        assert language_code_for_text(text=unusable_text) == ''

    for language_code in LanguageFactory.enabled_languages():
        language = LanguageFactory.language_for_code(language_code)
        detected_code = language_code_for_text(text=language.sample_sentence())
        assert language_code == detected_code
示例#12
0
def _get_sentences_from_story_text(story_text: str, story_lang: str) -> List[str]:
    """Break a story's text into sentences using the language object for the story's language code.

    Both arguments are first passed through decode_object_from_bytes_if_needed(). When no language object is
    available for story_lang, the default language does the splitting instead.
    """
    text = decode_object_from_bytes_if_needed(story_text)
    language_code = decode_object_from_bytes_if_needed(story_lang)

    # Fall back to the default language when the code has no language object
    splitter = LanguageFactory.language_for_code(language_code) or LanguageFactory.default_language()

    return splitter.split_text_to_sentences(text)
示例#13
0
def _get_sentences_from_content(story_text: str) -> List[str]:
    """Split story text into sentences with the language identified by the module-level __AP_LANGUAGE_CODE.

    The text is passed through decode_object_from_bytes_if_needed() before splitting.

    NOTE(review): the language object is used without a None check, so this presumably relies on
    __AP_LANGUAGE_CODE always naming a supported language -- confirm.
    """
    decoded_text = decode_object_from_bytes_if_needed(story_text)
    ap_language = LanguageFactory.language_for_code(__AP_LANGUAGE_CODE)
    return ap_language.split_text_to_sentences(text=decoded_text)
示例#14
0
def _get_sentences_from_content(story_text: str) -> List[str]:
    """Return the sentences of story_text, split by the language object for __AP_LANGUAGE_CODE.

    story_text goes through decode_object_from_bytes_if_needed() before it is split.

    NOTE(review): no fallback exists if language_for_code() returns None for __AP_LANGUAGE_CODE; presumably
    that code is always supported -- confirm.
    """
    splitter = LanguageFactory.language_for_code(__AP_LANGUAGE_CODE)
    return splitter.split_text_to_sentences(
        text=decode_object_from_bytes_if_needed(story_text),
    )
示例#15
0
 def test_enabled_languages(self):
     """'lt' and 'en' are among the enabled languages; 'xx' is not."""
     for expected_code in ('lt', 'en'):
         assert expected_code in LanguageFactory.enabled_languages()

     assert 'xx' not in LanguageFactory.enabled_languages()
示例#16
0
 def test_default_language_code(self):
     """The factory's default language code is 'en'."""
     default_code = LanguageFactory.default_language_code()
     assert default_code == 'en'
示例#17
0
def add_content_to_test_story(db: DatabaseHandler, story: dict,
                              feed: dict) -> dict:
    """Attach a download, extracted text and sentences to a test story.

    The content is story['content'] when that key is present, otherwise whatever _get_test_content() generates.
    It is stored through store_content() against a newly created 'downloads' row; its HTML-stripped form is
    written to 'download_texts' and split into 'story_sentences' rows; finally the story is marked as processed.

    Returns the story dict with 'download', 'content' (the HTML-stripped text) and 'download_text' set.

    Raises McAddContentToTestStoryException if the 'download_texts' row cannot be read back.
    """

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
        # An undetermined language code falls back to English
        content_language_code = language_code_for_text(content) or 'en'
    else:
        # No detection is attempted on generated test content; it is labelled English
        content = _get_test_content()
        content_language_code = 'en'

    # Note that the story's language is only written when 'full_text_rss' was set
    if story.get('full_text_rss'):
        story['full_text_rss'] = False
        story_update = {
            'full_text_rss': False,
            'language': content_language_code,
        }
        db.update_by_id(table='stories', object_id=story['stories_id'], update_hash=story_update)

    host = get_url_host(feed['url'])

    download_fields = {
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': True,
        'stories_id': story['stories_id'],
    }
    download = db.create(table='downloads', insert_hash=download_fields)
    download = store_content(db=db, download=download, content=content)

    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    download_text_params = {
        'downloads_id': download['downloads_id'],
        'download_text': extracted_content,
    }
    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
    """, download_text_params)

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    # Sentence numbers start at 1; each sentence gets its own detected language (English if undetermined)
    for sentence_number, sentence in enumerate(lang.split_text_to_sentences(extracted_content), start=1):
        sentence_row = {
            'sentence': sentence,
            'language': language_code_for_text(sentence) or 'en',
            'sentence_number': sentence_number,
            'stories_id': story['stories_id'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        }
        db.insert(table='story_sentences', insert_hash=sentence_row)

    mark_as_processed(db=db, stories_id=story['stories_id'])

    # Read the stored row back so the caller gets it exactly as the database holds it
    download_text_row = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {
            'downloads_id': download['downloads_id']
        }).hash()
    story['download_text'] = download_text_row

    if not download_text_row:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
示例#18
0
 def test_enabled_languages(self):
     """enabled_languages() must contain 'lt' and 'en' and must not contain 'xx'."""
     # The factory is queried afresh for every membership check
     membership_expectations = (('lt', True), ('en', True), ('xx', False))
     for code, should_be_present in membership_expectations:
         assert (code in LanguageFactory.enabled_languages()) is should_be_present
示例#19
0
 def test_default_language(self):
     """The factory's default language object is an EnglishLanguage."""
     default = LanguageFactory.default_language()
     assert isinstance(default, EnglishLanguage)
示例#20
0
 def test_default_language_code(self):
     """default_language_code() must return 'en'."""
     expected_code = 'en'
     assert LanguageFactory.default_language_code() == expected_code
def test_language_code_for_text_uppercase():
    """Upper-casing a language's sample sentence must not change the detected language code."""
    for expected_code in LanguageFactory.enabled_languages():
        shouted = LanguageFactory.language_for_code(expected_code).sample_sentence().upper()
        assert language_code_for_text(text=shouted) == expected_code
示例#22
0
 def test_default_language(self):
     """default_language() must return an instance of EnglishLanguage."""
     language = LanguageFactory.default_language()
     is_english = isinstance(language, EnglishLanguage)
     assert is_english
示例#23
0
def test_language_code_for_text_uppercase():
    """Every enabled language's sample sentence is still identified correctly once upper-cased."""
    for language_code in LanguageFactory.enabled_languages():
        language = LanguageFactory.language_for_code(language_code)
        uppercase_sample = language.sample_sentence().upper()
        detected_code = language_code_for_text(text=uppercase_sample)
        assert detected_code == language_code