def compute_mtld(sentences):
    """Compute the Measure of Textual Lexical Diversity (MTLD).

    :param sentences: iterable of sentence strings
    :returns: the MTLD score (float)
    """
    joined = ' '.join(sentences)
    # ld.mtld expects a list of tokens; passing the raw string would make it
    # iterate character-by-character. Tokenize with ld.flemmatize first, as
    # the other metric builders in this module do.
    return ld.mtld(ld.flemmatize(joined))
def build_aux_metrics(filename_series, doc_series):
    """Build per-document sentiment and lexical-diversity metrics.

    For every document, VADER sentiment scores (neg/neu/pos/compound) are
    computed per sentence and aggregated as mean and std; lexical volume,
    TTR, MTLD and HD-D (vocd) are computed over the lemmatized tokens of
    the whole document.

    :param filename_series: pandas Series of document identifiers
    :param doc_series: pandas Series of document texts, aligned with
        ``filename_series``
    :returns: pandas DataFrame with one row per document
    """
    # lexical diversity measures
    lex_vol, ttr, mtld, vocd = [], [], [], []
    # sentiment measures: per-document mean and std of sentence scores
    neg_mean, neu_mean, pos_mean, compound_mean = [], [], [], []
    neg_std, neu_std, pos_std, compound_std = [], [], [], []
    filename = []
    for i0 in range(len(doc_series)):
        filename0 = filename_series.iloc[i0]
        doc0 = doc_series.iloc[i0]
        doc0_list = nltk.sent_tokenize(doc0)
        doc0_string = " ".join(doc0_list)
        n1 = len(doc0_list)
        if n1 > 1:
            # Sentence-level polarity scores aggregated across the document.
            vs_list = [analyzer.polarity_scores(sent0) for sent0 in doc0_list]
            doc0_df = pd.DataFrame(vs_list)
            # Column order follows VADER's result dict: neg, neu, pos, compound.
            mean_list0 = list(doc0_df.mean())
            std_list0 = list(doc0_df.std())
        else:
            # Zero- or one-sentence docs: nothing to aggregate, use zeros.
            mean_list0 = [0.0] * 4
            std_list0 = [0.0] * 4
        neg_mean.append(mean_list0[0])
        neu_mean.append(mean_list0[1])
        pos_mean.append(mean_list0[2])
        compound_mean.append(mean_list0[3])
        neg_std.append(std_list0[0])
        neu_std.append(std_list0[1])
        pos_std.append(std_list0[2])
        compound_std.append(std_list0[3])
        filename.append(filename0)
        flt = ld.flemmatize(doc0_string)
        lex_vol.append(len(flt))    # lexical volume (token count)
        ttr.append(ld.ttr(flt))     # basic type-token ratio
        mtld.append(ld.mtld(flt))   # Measure of Textual Lexical Diversity
        vocd.append(ld.hdd(flt))    # HD-D / vocd (McCarthy & Jarvis 2007, 2010)
        if i0 % 5000 == 0:          # progress indicator for long runs
            print(i0)
    # assemble one row per document
    df1 = pd.DataFrame({
        'filename': filename,
        'senti_neg': neg_mean, 'senti_neu': neu_mean,
        'senti_pos': pos_mean, 'senti_compound': compound_mean,
        'senti_neg_std': neg_std, 'senti_neu_std': neu_std,
        'senti_pos_std': pos_std, 'senti_compound_std': compound_std,
        'lex_vol': lex_vol, 'ttr': ttr, 'mtld': mtld, 'vocd': vocd,
    })
    return df1
def build_aux_metrics1(filename_series, doc_series):
    """Lighter variant of ``build_aux_metrics``: only the VADER compound
    sentiment (mean/std of per-sentence scores) plus lexical volume and MTLD.

    :param filename_series: pandas Series of document identifiers
    :param doc_series: pandas Series of document texts, aligned with
        ``filename_series``
    :returns: pandas DataFrame with one row per document
    """
    lex_vol, mtld = [], []                 # lexical diversity measures
    compound_mean, compound_std = [], []   # sentiment measures
    filename = []
    for i0 in range(len(doc_series)):
        filename0 = filename_series.iloc[i0]
        doc0 = doc_series.iloc[i0]
        doc0_list = nltk.sent_tokenize(doc0)
        doc0_string = " ".join(doc0_list)
        n1 = len(doc0_list)
        if n1 > 1:
            # Sentence-level polarity scores aggregated across the document.
            vs_list = [analyzer.polarity_scores(sent0) for sent0 in doc0_list]
            doc0_df = pd.DataFrame(vs_list)
            mean_list0 = list(doc0_df.mean())
            std_list0 = list(doc0_df.std())
        else:
            # Zero- or one-sentence docs: nothing to aggregate, use zeros.
            mean_list0 = [0.0] * 4
            std_list0 = [0.0] * 4
        # Index 3 is VADER's 'compound' column.
        compound_mean.append(mean_list0[3])
        compound_std.append(std_list0[3])
        filename.append(filename0)
        flt = ld.flemmatize(str(doc0_string))
        lex_vol.append(len(flt))    # lexical volume (token count)
        mtld.append(ld.mtld(flt))   # Measure of Textual Lexical Diversity
        if i0 % 5000 == 0:          # progress indicator for long runs
            print(i0)
    # assemble one row per document
    df1 = pd.DataFrame({
        'filename': filename,
        'senti_compound': compound_mean,
        'senti_compound_std': compound_std,
        'lex_vol': lex_vol, 'mtld': mtld,
    })
    return df1
def lexdiv(self, text):
    """Compute the MTLD lexical diversity of *text* and store it on the
    instance as ``self.lexical_diversity`` (rounded to 2 decimals).

    :param text: raw text to score
    """
    # round(x, 2) replaces float('{:.2f}'.format(x)): same half-even
    # rounding, without the string round-trip. Dead trailing `pass` removed.
    self.lexical_diversity = round(ld.mtld(ld.flemmatize(text)), 2)
def _expand_placeholders(text):
    """Replace the corpus anonymization tokens with Spanish filler words."""
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")
    return text


def get_news_features(headline, text):
    """Extract complexity, readability and stylometric features from a
    Spanish news item.

    :param headline: raw headline string
    :param text: raw article body string
    :returns: single-row pandas DataFrame of features
    """
    # Load the spaCy model once and cache it on the function; reloading it
    # on every call is extremely slow.
    nlp = getattr(get_news_features, "_nlp", None)
    if nlp is None:
        nlp = get_news_features._nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    # NOTE: the original also ran re.sub(r"(?<!\n)\n(?!\n)", " ", ...) here,
    # a no-op once all newlines are already replaced — removed as dead code.
    headline = _expand_placeholders(headline)
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)
    list_tokens_h = [token.text for sentence_h in doc_h.sents
                     for token in sentence_h]
    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)
    # headline complexity features
    avg_word_size_h = round(
        sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content ##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = _expand_placeholders(text)
    # Keep alphabetic chars before lower-casing for the upper-case ratio.
    alph = list(filter(str.isalpha, text))
    text_lower = text.lower()
    doc = nlp(text_lower)
    list_entities = [entity.label_ for entity in doc.ents]
    list_tokens = []
    list_pos = []
    list_tag = []
    sents = 0
    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)
    # entities, pos/tag counts, frequencies, syllables, words and quotes
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']
    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)
    # Spanish readability tests (Fernández Huerta, Szigriszt-Pazos)
    huerta_score = round(
        206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(
        206.835 - ((62.3 * syllables) / words) - (words / sents), 2)
    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)
    # create df_features (keys and scalar/list wrapping kept as before)
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })
    return df_features
def lexical_mtld(tokens):
    """Return the MTLD lexical-diversity score for the given token list."""
    score = lex_div.mtld(tokens)
    return score
def mtld_ld(text):
    """Tokenize *text* via BasicMetrics.trueTokens and return its MTLD."""
    token_list = BasicMetrics.trueTokens(text)
    score = ld.mtld(token_list)
    return score