예제 #1
0
    def __next__(self) -> List[str]:
        """Produce the word list for the next sentence read from the line source.

        Raises:
            StopIteration: if the line source handle has already been cleared,
                or if ``get_line()`` returns ``None``. In the latter case the
                source is finished with ``end()`` and the handle is cleared
                before raising.

        Returns:
            The words of the next sentence, or an empty list when the stripped
            sentence is empty or splitting it yields no words.
        """
        source = self.__copy_to
        if source is None:
            # The source was already finished on a previous call.
            raise StopIteration

        raw_line = source.get_line()
        if raw_line is None:
            # Out of lines: finish the source and drop the handle so that
            # later calls stop immediately.
            source.end()
            self.__copy_to = None
            raise StopIteration

        text = raw_line.strip()

        # Every line read counts towards the progress counter, blank or not.
        self.__sentence_counter += 1
        if self.__sentence_counter % 1000 == 0:
            log.info("Feeding sentence %d..." % self.__sentence_counter)

        if len(text) == 0:
            return []

        # Only attempt language identification when the helper reports it
        # would be reliable; otherwise (or when no language object comes back
        # for the detected code) use the factory's default language.
        if identification_would_be_reliable(text):
            tokenizer = LanguageFactory.language_for_code(language_code_for_text(text))
        else:
            tokenizer = None
        if tokenizer is None:
            tokenizer = LanguageFactory.default_language()

        tokens = tokenizer.split_sentence_to_words(text)

        # Normalise any empty result to a plain empty list.
        return tokens if len(tokens) else []
예제 #2
0
    def __next__(self) -> List[str]:
        """Produce the word list for the next sentence.

        Raises:
            StopIteration: if ``__next_sentence()`` returns ``None``.

        Returns:
            The words of the next sentence, or an empty list when the stripped
            sentence is empty or splitting it yields no words.
        """
        raw_sentence = self.__next_sentence()
        if raw_sentence is None:
            raise StopIteration

        text = raw_sentence.strip()
        if len(text) == 0:
            return []

        # Only attempt language identification when the helper reports it
        # would be reliable; otherwise (or when no language object comes back
        # for the detected code) use the factory's default language.
        if identification_would_be_reliable(text):
            tokenizer = LanguageFactory.language_for_code(language_code_for_text(text))
        else:
            tokenizer = None
        if tokenizer is None:
            tokenizer = LanguageFactory.default_language()

        tokens = tokenizer.split_sentence_to_words(text)

        # Normalise any empty result to a plain empty list.
        return tokens if len(tokens) else []
    def __next__(self) -> List[str]:
        """Return the words of the next sentence, splitting with a suitable language.

        Raises:
            StopIteration: when ``__next_sentence()`` has nothing more to give
                (it returned ``None``).

        Returns:
            A list of words; empty if the sentence is blank after stripping or
            if the splitter produced no words.
        """
        pending = self.__next_sentence()
        if pending is None:
            raise StopIteration

        stripped = pending.strip()
        if len(stripped) == 0:
            return []

        # Start with no language chosen; identification is attempted only when
        # the reliability helper approves of the input.
        chosen_language = None
        if identification_would_be_reliable(stripped):
            detected_code = language_code_for_text(stripped)
            chosen_language = LanguageFactory.language_for_code(detected_code)

        # Fall back to the factory default when nothing was chosen above.
        if chosen_language is None:
            chosen_language = LanguageFactory.default_language()

        word_list = chosen_language.split_sentence_to_words(stripped)
        if len(word_list) == 0:
            return []
        return word_list
예제 #4
0
def _get_sentences_from_story_text(story_text: str, story_lang: str) -> List[str]:
    """Break a story's text into sentences using the story's language.

    Both arguments are first passed through
    ``decode_object_from_bytes_if_needed``.

    Args:
        story_text: Full text of the story.
        story_lang: Language code used to look up the sentence splitter.

    Returns:
        Whatever ``split_text_to_sentences`` returns for the text (annotated
        as a list of sentence strings).
    """
    text = decode_object_from_bytes_if_needed(story_text)
    lang_code = decode_object_from_bytes_if_needed(story_lang)

    # Use the language registered for the code; if the lookup yields a falsy
    # value, fall back to the factory's default language.
    splitter = LanguageFactory.language_for_code(lang_code) or LanguageFactory.default_language()

    return splitter.split_text_to_sentences(text)
예제 #5
0
 def test_default_language(self):
     """Check that the factory's default language is an EnglishLanguage instance."""
     default = LanguageFactory.default_language()
     assert isinstance(default, EnglishLanguage)
예제 #6
0
 def test_default_language(self):
     """The default language from LanguageFactory must be an EnglishLanguage."""
     fallback_language = LanguageFactory.default_language()
     assert isinstance(fallback_language, EnglishLanguage)