def word_rank_readability_score(cls, text: str, language: 'model.Language'):
    langtb = TextBlob(text).detect_language()
    words = nltk.word_tokenize(text)

    # detect and remove proper nouns (currently disabled)
    if False:
        words = []
        sentences = nltk.sent_tokenize(text)
        for s in sentences:
            tokens = nltk.word_tokenize(s)
            words.append(tokens[0])
            for i in range(1, len(tokens)):
                if not tokens[i][0].isupper():
                    words.append(tokens[i])

    # remove punctuation
    words = [w for w in words if w not in string.punctuation]

    # remove digits (or dates)
    words = [w for w in words if re.search(r"\d", w) is None]

    # translate tokens
    words_trans = words.copy()
    for i in range(len(words)):
        try:
            translation = TextBlob(words[i]).translate(from_lang=langtb, to='en')
            words_trans[i] = translation
            print("translated: ", words[i], "to: ", words_trans[i])
        except Exception:
            print("not translated: ", words[i])

    # remove words whose translation is identical to the original;
    # assume they are cognates and therefore not difficult
    words = [
        words[i]
        for i in range(len(words))
        if str(words_trans[i]).lower() != words[i].lower()
    ]

    number_of_words = len(words)
    number_of_sentences = len(nltk.sent_tokenize(text))

    constants = cls.get_constants_for_language(language)

    # calculate word rank per word
    words_stat = [Word.stats(w, language.code) for w in words]

    # throw away words that do not occur in the top 50k
    difficulties = [w.difficulty for w in words_stat if w.frequency != 0]
    number_of_words = len(difficulties)

    # index = constants["start"] - constants["sentence"] * (number_of_words / number_of_sentences) \
    #     - constants["word"] * (sum(difficulties) / number_of_words)

    # average difficulty of the text
    return sum(difficulties) / number_of_words
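# Illustrative sketch (not part of the original module): the score returned
# above is just the arithmetic mean of the wordstats difficulties of the
# words that survive filtering. The values below are hypothetical.
def _average_difficulty_example():
    difficulties = [0.12, 0.48, 0.30]  # hypothetical per-word difficulties
    return sum(difficulties) / len(difficulties)  # -> 0.30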
def json_serializable_dict(self, with_context=True, with_title=False):
    try:
        translation_word = self.translation.word
        translation_language = self.translation.language.code
    except AttributeError as e:
        translation_word = ""
        translation_language = ""
        zeeguu.core.log(
            f"Exception caught: for some reason there was no translation for {self.id}"
        )
        print(str(e))

    word_info = Word.stats(self.origin.word, self.origin.language.code)

    learned_datetime = str(self.learned_time.date()) if self.learned else ""

    created_day = "today" if self.time.date() == datetime.now().date() else ""

    bookmark_title = ""
    if with_title:
        try:
            bookmark_title = self.text.article.title
        except Exception as e:
            from sentry_sdk import capture_exception

            capture_exception(e)
            print(f"could not find article title for bookmark with id: {self.id}")

    result = dict(
        id=self.id,
        to=translation_word,
        from_lang=self.origin.language.code,
        to_lang=translation_language,
        title=bookmark_title,
        url=self.text.url(),
        origin_importance=word_info.importance,
        learned_datetime=learned_datetime,
        origin_rank=word_info.rank if word_info.rank != 100000 else "",
        starred=self.starred if self.starred is not None else False,
        article_id=self.text.article_id if self.text.article_id else "",
        created_day=created_day,  # human-readable stuff...
        time=datetime_to_json(self.time),
        fit_for_study=self.fit_for_study == 1,
    )

    if self.text.article:
        result["article_title"] = self.text.article.title

    result["from"] = self.origin.word
    if with_context:
        result["context"] = self.text.content
    return result
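# Illustrative sketch of the returned dictionary's shape, with hypothetical
# values (the keys come from the function above):
#
#     {
#         "id": 42,
#         "from": "hund",
#         "to": "dog",
#         "from_lang": "da",
#         "to_lang": "en",
#         "origin_importance": 8,
#         "origin_rank": 1204,
#         "starred": False,
#         "created_day": "today",
#         "fit_for_study": True,
#         "context": "Min hund er glad.",
#         ...
#     }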
def importance_level(self):
    """
    Note that this code will break if wordstats throws an exception,
    which could happen if the language does not exist… but this should
    not happen.

    Note that the importance level reported by wordstats is a float;
    it is truncated to an int here.

    :return: a number between 0 and 10, as returned by the wordstats module
    """
    stats = Word.stats(self.word, self.language.code)
    return int(stats.importance)
def text_difficulty(text, language, known_probabilities,
                    difficulty_computer='default',
                    rank_boundary=REFERENCE_VOCABULARY_SIZE):
    """
    :param text: the text to analyse
    :param language: the learned language
    :param known_probabilities: the probabilities that the user knows individual words
    :param difficulty_computer: the name of the algorithm used to compute the
        difficulty; currently only 'default' is implemented
    :param personalized: (deprecated)
    :param rank_boundary: 10,000 words
    :return: a dictionary with three items for every text:
        1. score_average - average difficulty of the words in the text
        2. score_median - median difficulty of the words in the text
        3. estimated_difficulty - one of "EASY", "MEDIUM", "HARD"
    """

    # Calculate difficulty for each word
    word_difficulties = []
    words = split_words_from_text(text)
    for word in words:
        difficulty = word_difficulty(
            known_probabilities, True, Word.stats(word, language.code), word
        )
        word_difficulties.append(difficulty)

    # If we can't compute the text difficulty, we estimate hard
    if len(word_difficulties) == 0:
        return dict(score_median=1, score_average=1, estimated_difficulty=1)

    # Average difficulty for the text
    difficulty_average = sum(word_difficulties) / float(len(word_difficulties))

    # Median difficulty
    word_difficulties.sort()
    center = int(round(len(word_difficulties) / 2, 0))
    difficulty_median = word_difficulties[center]

    normalized_estimate = difficulty_average

    difficulty_scores = dict(
        score_median=difficulty_median,
        score_average=difficulty_average,
        estimated_difficulty=discrete_text_difficulty(
            difficulty_average, difficulty_median
        ),
        # the previous keys are kept for backwards-compatibility reasons
        # TODO: they must be removed
        normalized=normalized_estimate,
        discrete=discrete_text_difficulty(difficulty_average, difficulty_median),
    )

    return difficulty_scores
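# Illustrative sketch of the median computation above: with an even number
# of word difficulties, int(round(n / 2, 0)) selects the upper-middle
# element instead of averaging the two middle values.
def _median_example():
    sorted_difficulties = [0.1, 0.2, 0.6, 0.9]  # hypothetical, pre-sorted
    center = int(round(len(sorted_difficulties) / 2, 0))  # 2
    return sorted_difficulties[center]  # -> 0.6, not (0.2 + 0.6) / 2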
def json_serializable_dict(self, with_context=True, with_title=False):
    try:
        translation_word = self.translation.word
        translation_language = self.translation.language.code
    except AttributeError as e:
        translation_word = ''
        translation_language = ''
        zeeguu_core.log(
            f"Exception caught: for some reason there was no translation for {self.id}"
        )
        print(str(e))

    word_info = Word.stats(self.origin.word, self.origin.language.code)

    # note: unused here; the dict below takes learned_datetime from the exercise log
    learned_datetime = str(self.learned_time.date()) if self.learned else ''

    created_day = "today" if self.time.date() == datetime.now().date() else ''

    bookmark_title = ""
    if with_title:
        try:
            bookmark_title = self.text.article.title
        except Exception as e:
            from sentry_sdk import capture_exception

            capture_exception(e)
            print(f"could not find article title for bookmark with id: {self.id}")

    result = dict(
        id=self.id,
        to=translation_word,
        from_lang=self.origin.language.code,
        to_lang=translation_language,
        title=bookmark_title,
        url=self.text.url.as_string(),
        origin_importance=word_info.importance,
        learned_datetime=SortedExerciseLog(self).str_most_recent_correct_dates(),
        origin_rank=word_info.rank if word_info.rank != 100000 else '',
        starred=self.starred if self.starred is not None else False,
        article_id=self.text.article_id if self.text.article_id else '',
        created_day=created_day,  # human-readable stuff...
        time=self.time.strftime(JSON_TIME_FORMAT),
    )

    if self.text.article:
        result['article_title'] = self.text.article.title

    result["from"] = self.origin.word
    if with_context:
        result['context'] = self.text.content
    return result
def json_serializable_dict(self, with_context=True):
    try:
        translation_word = self.translation.word
        translation_language = self.translation.language.code
    except AttributeError as e:
        translation_word = ''
        translation_language = ''
        zeeguu.log(
            f"Exception caught: for some reason there was no translation for {self.id}"
        )
        print(str(e))

    result = dict(
        id=self.id,
        to=translation_word,
        from_lang=self.origin.language.code,
        to_lang=translation_language,
        title=self.text.url.title,
        url=self.text.url.as_string(),
        origin_importance=Word.stats(
            self.origin.word, self.origin.language.code
        ).importance,
    )

    result["from"] = self.origin.word
    if with_context:
        result['context'] = self.text.content
    return result
def rank(b):
    return Word.stats(b.origin.word, b.origin.language.code).rank
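# Illustrative usage sketch: rank() can serve as a sort key, e.g. to order a
# (hypothetical) list of bookmarks from the most to the least frequent
# origin word, since wordstats assigns lower ranks to more frequent words:
#
#     bookmarks_by_frequency = sorted(bookmarks, key=rank)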
def importance_level(self):
    stats = Word.stats(self.word, self.language.code)
    if stats:
        return int(min(stats.importance, 10))
    else:
        return 0
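# Illustrative sketch of the clamping above: the importance reported by
# wordstats is capped at 10 and then truncated to an int, so the result
# always lands in the 0..10 range:
#
#     int(min(12.7, 10))  # -> 10
#     int(min(3.9, 10))   # -> 3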
if bookmark.origin.language.code != language:
    continue

# if not bookmark.quality_bookmark():
#     continue

if len(bookmark.origin.word) < 4:
    continue

date_key = bookmark.time.strftime("%y-%m")

if date_key not in months_dict:
    months_dict[date_key] = SortedList(key=lambda x: x.rank)

word_stats = Word.stats(bookmark.origin.word, language)

if word_stats.rank == 100000:
    print("ignoring: " + bookmark.origin.word)
    print(word_stats.rank)
    continue

# our user has a lot of het's
# might make sense to keep a word only once
if word_stats not in months_dict[date_key]:
    months_dict[date_key].add(word_stats)

for key in months_dict:
    len_for_month = len(months_dict[key])
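# Illustrative sketch (hypothetical word and language code): a SortedList
# keyed on .rank, as used above, keeps a month's word stats ordered by
# frequency rank, and the membership test skips already-counted words.
def _monthly_rank_list_example():
    from sortedcontainers import SortedList

    ranked = SortedList(key=lambda x: x.rank)
    word_stats = Word.stats("huis", "nl")  # hypothetical
    if word_stats not in ranked:
        ranked.add(word_stats)
    return ranked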