# Assumed imports for this snippet; Word comes from the wordstats package
# used throughout these examples.
import re
import string

import nltk
from textblob import TextBlob
from wordstats import Word


# classmethod extracted from its class; cls supplies get_constants_for_language
def word_rank_readability_score(cls, text: str,
                                language: 'model.Language'):

    # NOTE: TextBlob's detect_language()/translate() proxy the Google Translate
    # API and are deprecated in recent textblob releases
    langtb = TextBlob(text).detect_language()
    words = nltk.word_tokenize(text)

    # detect and remove proper nouns (disabled; when enabled, keeps only
    # sentence-initial tokens and tokens that do not start with a capital)
    if False:
        words = []
        sentences = nltk.sent_tokenize(text)
        for s in sentences:
            tokens = nltk.word_tokenize(s)
            words.append(tokens[0])
            for i in range(1, len(tokens)):
                if not tokens[i][0].isupper():
                    words.append(tokens[i])

    # remove punctuation
    words = [w for w in words if w not in string.punctuation]
    # remove digits (or dates)
    words = [w for w in words if re.search(r"\d", w) is None]

    # translate tokens
    words_trans = words.copy()
    for i in range(len(words)):
        try:
            translation = TextBlob(words[i]).translate(from_lang=langtb,
                                                       to='en')
            words_trans[i] = str(translation)
            print("translated: ", words[i], "to: ", words_trans[i])
        except Exception:
            print("not translated: ", words[i])

    # remove equivalent words, assume they are cognates and therefore not
    # difficult (note the call parentheses on lower(): comparing the bound
    # method itself would never equal a string, so nothing would be removed)
    words = [
        words[i] for i in range(len(words))
        if words_trans[i].lower() != words[i].lower()
    ]

    number_of_words = len(words)

    number_of_sentences = len(nltk.sent_tokenize(text))

    constants = cls.get_constants_for_language(language)

    # calculate word rank per word
    words_stat = [Word.stats(w, language.code) for w in words]
    # throw away words that do not occur in the top 50k
    difficulties = [
        w.difficulty for w in words_stat if w.frequency != 0
    ]
    number_of_words = len(difficulties)

    # guard against an empty difficulty list to avoid a ZeroDivisionError
    if number_of_words == 0:
        return 0

    # index = constants["start"] - constants["sentence"] * (number_of_words / number_of_sentences) \
    #         - constants["word"] * (sum(difficulties) / number_of_words)
    # average difficulty of text
    return sum(difficulties) / number_of_words
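
A standalone toy illustration of the cognate filter above (hypothetical word pairs, hard-coded so no translation service is needed), showing why the parentheses on lower() matter:

original_words = ["Haus", "Information", "schnell"]
translations = ["house", "information", "fast"]
# keep only words whose English translation differs, i.e. the non-cognates
non_cognates = [o for o, t in zip(original_words, translations)
                if t.lower() != o.lower()]
print(non_cognates)  # ['Haus', 'schnell'] - "Information" is dropped as a cognate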
Example #2
    def json_serializable_dict(self, with_context=True, with_title=False):
        try:
            translation_word = self.translation.word
            translation_language = self.translation.language.code
        except AttributeError as e:
            translation_word = ""
            translation_language = ""
            zeeguu.core.log(
                f"Exception caught: for some reason there was no translation for {self.id}"
            )
            print(str(e))

        word_info = Word.stats(self.origin.word, self.origin.language.code)

        learned_datetime = str(self.learned_time.date()) if self.learned else ""

        created_day = "today" if self.time.date() == datetime.now().date() else ""

        bookmark_title = ""
        if with_title:
            try:
                bookmark_title = self.text.article.title
            except Exception as e:
                from sentry_sdk import capture_exception

                capture_exception(e)
                print(
                    f"could not find article title for bookmark with id: {self.id}"
                )

        result = dict(
            id=self.id,
            to=translation_word,
            from_lang=self.origin.language.code,
            to_lang=translation_language,
            title=bookmark_title,
            url=self.text.url(),
            origin_importance=word_info.importance,
            learned_datetime=learned_datetime,
            origin_rank=word_info.rank if word_info.rank != 100000 else "",
            starred=self.starred if self.starred is not None else False,
            article_id=self.text.article_id if self.text.article_id else "",
            created_day=created_day,  # human readable stuff...
            time=datetime_to_json(self.time),
            fit_for_study=self.fit_for_study == 1,
        )

        if self.text.article:
            result["article_title"] = self.text.article.title

        result["from"] = self.origin.word
        if with_context:
            result["context"] = self.text.content
        return result
Example #3
    def importance_level(self):
        """
            Note that this code will break if wordstats throws an exception,
            which could happen in case the language does not exist…
            but this should not happen.

            Note that the underlying importance stat is a float; it is
            truncated to an int here.

        :return: number between 0 and 10, derived from the wordstats module
        """
        stats = Word.stats(self.word, self.language.code)
        return int(stats.importance)
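
Every example on this page revolves around wordstats' Word.stats(word, language_code) lookup. A minimal standalone sketch of calling it directly (assuming the wordstats package and its frequency data are installed; the attribute semantics are inferred from the surrounding examples):

from wordstats import Word

info = Word.stats("maison", "fr")
print(info.rank)        # frequency rank; 100000 serves as a "not found" sentinel above
print(info.importance)  # float, roughly 0-10, truncated to int by importance_level()
print(info.frequency)   # 0 when the word is outside the known vocabulary
print(info.difficulty)  # per-word difficulty used by the readability scores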
Example #4
def text_difficulty(text, language, known_probabilities,
                    difficulty_computer='default',
                    rank_boundary=REFERENCE_VOCABULARY_SIZE):
    """
    :param text: text to analyse
    :param language: the learned language
    :param known_probabilities: the probabilities that the user knows individual words
    :param difficulty_computer: name of the algorithm used to compute the difficulty;
        currently only 'default' is implemented
    :param rank_boundary: word-rank cutoff; defaults to REFERENCE_VOCABULARY_SIZE (10,000 words)
    :return: a dictionary with three items for every text:
      1. score_average - average difficulty of the words in the text
      2. score_median - median difficulty of the words in the text
      3. estimated_difficulty - one of "EASY", "MEDIUM", "HARD"
    """
    word_difficulties = []

    # Calculate difficulty for each word
    words = split_words_from_text(text)

    for word in words:
        difficulty = word_difficulty(known_probabilities, True, Word.stats(word, language.code), word)
        word_difficulties.append(difficulty)

    # If we can't compute the text difficulty, we estimate it as hard
    if len(word_difficulties) == 0:
        return dict(
            score_median=1,
            score_average=1,
            estimated_difficulty=1)

    # Average difficulty for text
    difficulty_average = sum(word_difficulties) / float(len(word_difficulties))

    # Median difficulty (upper median for even-length lists; the original
    # round()-based index was off by one for some odd lengths)
    word_difficulties.sort()
    center = len(word_difficulties) // 2
    difficulty_median = word_difficulties[center]

    normalized_estimate = difficulty_average

    difficulty_scores = dict(
        score_median=difficulty_median,
        score_average=difficulty_average,
        estimated_difficulty=discrete_text_difficulty(difficulty_average, difficulty_median),
        # the previous keys are kept for backwards-compatibility reasons
        # TODO: must be removed

        normalized=normalized_estimate,
        discrete=discrete_text_difficulty(difficulty_average, difficulty_median),
    )

    return difficulty_scores
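
A hedged usage sketch (the inputs are hypothetical: `german` stands in for a model.Language instance with a .code attribute, and known_probabilities maps words to the probability that the user knows them):

scores = text_difficulty(
    text="Der kleine Hund läuft schnell.",
    language=german,
    known_probabilities={"der": 0.9, "hund": 0.6},
)
print(scores["normalized"])  # average per-word difficulty
print(scores["discrete"])    # "EASY", "MEDIUM" or "HARD"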
Example #5
    def json_serializable_dict(self, with_context=True, with_title=False):
        try:
            translation_word = self.translation.word
            translation_language = self.translation.language.code
        except AttributeError as e:
            translation_word = ''
            translation_language = ''
            zeeguu_core.log(
                f"Exception caught: for some reason there was no translation for {self.id}"
            )
            print(str(e))

        word_info = Word.stats(self.origin.word, self.origin.language.code)

        learned_datetime = str(self.learned_time.date()) if self.learned else ''

        created_day = "today" if self.time.date() == datetime.now().date() else ''

        bookmark_title = ""
        if with_title:
            try:
                bookmark_title = self.text.article.title
            except Exception as e:
                from sentry_sdk import capture_exception
                capture_exception(e)
                print(
                    f"could not find article title for bookmark with id: {self.id}"
                )

        result = dict(
            id=self.id,
            to=translation_word,
            from_lang=self.origin.language.code,
            to_lang=translation_language,
            title=bookmark_title,
            url=self.text.url.as_string(),
            origin_importance=word_info.importance,
            learned_datetime=SortedExerciseLog(
                self).str_most_recent_correct_dates(),
            origin_rank=word_info.rank if word_info.rank != 100000 else '',
            starred=self.starred if self.starred is not None else False,
            article_id=self.text.article_id if self.text.article_id else '',
            created_day=created_day,  # human readable stuff...
            time=self.time.strftime(JSON_TIME_FORMAT))

        if self.text.article:
            result['article_title'] = self.text.article.title

        result["from"] = self.origin.word
        if with_context:
            result['context'] = self.text.content
        return result
Example #6
    def json_serializable_dict(self, with_context=True):
        try:
            translation_word = self.translation.word
            translation_language = self.translation.language.code
        except AttributeError as e:
            translation_word = ''
            translation_language = ''
            zeeguu.log(
                f"Exception caught: for some reason there was no translation for {self.id}"
            )
            print(str(e))

        result = dict(
            id=self.id,
            to=translation_word,
            from_lang=self.origin.language.code,
            to_lang=translation_language,
            title=self.text.url.title,
            url=self.text.url.as_string(),
            origin_importance=Word.stats(self.origin.word,
                                         self.origin.language.code).importance)
        result["from"] = self.origin.word
        if with_context:
            result['context'] = self.text.content
        return result
Example #7
def rank(b):
    return Word.stats(b.origin.word, b.origin.language.code).rank
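
Such a helper is natural as a sort key. A minimal sketch (`bookmarks` is a hypothetical list of objects with the same origin attributes as above):

# order bookmarks from the most to the least frequent origin word
bookmarks_easiest_first = sorted(bookmarks, key=rank)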
Example #8
    def importance_level(self):
        stats = Word.stats(self.word, self.language.code)
        if stats:
            return int(min(stats.importance, 10))
        else:
            return 0
Example #9
# (snippet from a larger script: the enclosing loop over the user's
# bookmarks is assumed)
for bookmark in bookmarks:

    if bookmark.origin.language.code != language:
        continue

    # if not bookmark.quality_bookmark():
    #     continue

    if len(bookmark.origin.word) < 4:
        continue

    date_key = bookmark.time.strftime("%y-%m")

    if date_key not in months_dict:
        months_dict[date_key] = SortedList(key=lambda x: x.rank)

    word_stats = Word.stats(bookmark.origin.word, language)

    if word_stats.rank == 100000:
        print("ignoring: " + bookmark.origin.word)
        print(word_stats.rank)
        continue

    # our user has a lot of het's
    # might make sense to keep a word only once

    if word_stats not in months_dict[date_key]:
        months_dict[date_key].add(word_stats)

for key in months_dict:

    len_for_month = len(months_dict[key])