def LexicalDiversity(text):
    tok = ld.tokenize(text)
    basic = len(tok) / len(set(tok))  # tokens per type (inverse of the simple TTR)
    SimpleTTR = ld.ttr(tok)
    RootTTR = ld.root_ttr(tok)  # lexical_diversity(text)[2] # I think this is the best lexical diversity measure
    LogTTR = ld.log_ttr(tok)
    return basic, SimpleTTR, RootTTR, LogTTR
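# A minimal usage sketch for LexicalDiversity above; it assumes the snippet's
# `ld` alias comes from the lexical_diversity package (import not shown above).
from lexical_diversity import lex_div as ld

sample = "The quick brown fox jumps over the lazy dog while the dog sleeps."
basic, simple_ttr, root_ttr, log_ttr = LexicalDiversity(sample)
print(round(basic, 2), round(simple_ttr, 2), round(root_ttr, 2), round(log_ttr, 2))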
def build_aux_metrics(filename_series, doc_series):
    lex_vol = []; ttr = []; mtld = []; vocd = []  # lexical diversity measures
    neg_mean = []; neu_mean = []; pos_mean = []; compound_mean = []  # sentiment measures
    neg_std = []; neu_std = []; pos_std = []; compound_std = []
    filename = []
    for i0 in range(len(doc_series)):
        filename0 = filename_series.iloc[i0]
        doc0 = doc_series.iloc[i0]
        doc0_list = nltk.sent_tokenize(doc0)
        doc0_string = " ".join(doc0_list)
        n1 = len(doc0_list)
        if n1 > 1:
            # sentence-level polarity scores, then their mean and std per document
            vs_list = []
            for i1 in range(n1):
                sent0 = doc0_list[i1]
                vs0 = analyzer.polarity_scores(sent0)
                vs_list.append(vs0)
            doc0_df = pd.DataFrame(vs_list)
            mean_list0 = [x for x in doc0_df.mean()]
            std_list0 = [x for x in doc0_df.std()]
        else:
            mean_list0 = [float(0) for x in range(4)]
            std_list0 = [float(0) for x in range(4)]
        neg_mean.append(mean_list0[0]); neu_mean.append(mean_list0[1])
        pos_mean.append(mean_list0[2]); compound_mean.append(mean_list0[3])
        neg_std.append(std_list0[0]); neu_std.append(std_list0[1])
        pos_std.append(std_list0[2]); compound_std.append(std_list0[3])
        filename.append(filename0)
        flt = ld.flemmatize(doc0_string)
        lex_vol0 = len(flt)   # lexical volume measure
        ttr0 = ld.ttr(flt)    # basic type-token ratio (TTR)
        mtld0 = ld.mtld(flt)  # Measure of Textual Lexical Diversity (MTLD) for lexical variability
        vocd0 = ld.hdd(flt)   # vocd or hypergeometric distribution D (HD-D), as per McCarthy and Jarvis (2007, 2010)
        lex_vol.append(lex_vol0)
        ttr.append(ttr0)
        mtld.append(mtld0)
        vocd.append(vocd0)
        if i0 % 5000 == 0:
            print(i0)  # progress indicator
    # save as df
    df1 = pd.DataFrame({'filename': filename,
                        'senti_neg': neg_mean, 'senti_neu': neu_mean,
                        'senti_pos': pos_mean, 'senti_compound': compound_mean,
                        'senti_neg_std': neg_std, 'senti_neu_std': neu_std,
                        'senti_pos_std': pos_std, 'senti_compound_std': compound_std,
                        'lex_vol': lex_vol, 'ttr': ttr, 'mtld': mtld, 'vocd': vocd})
    return df1
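# A small driver sketch for build_aux_metrics, assuming the imports below and
# the module-level VADER `analyzer` that the function references.
import nltk
import pandas as pd
from lexical_diversity import lex_div as ld
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# nltk.download('punkt')  # needed once for nltk.sent_tokenize
analyzer = SentimentIntensityAnalyzer()

docs = pd.DataFrame({'filename': ['a.txt', 'b.txt'],
                     'text': ['Great news today. Markets rallied today and traders loved the news.',
                              'A short document with a short single sentence.']})
aux = build_aux_metrics(docs['filename'], docs['text'])
print(aux[['filename', 'senti_compound', 'lex_vol', 'ttr', 'mtld', 'vocd']])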
def extract_lexical_features(Authors):
    '''
    Extract the readability and type-token-ratio features.
    Takes a dictionary of authors as input and returns the modified version.
    '''
    # On raw text, get the average grade level of the tweets
    for author in Authors.keys():
        Authors[author].readability = 0
        for tweet in Authors[author].tweets:
            Authors[author].readability += (textstat.text_standard(tweet, float_output=True)
                                            / len(Authors[author].tweets))  # I am angry at textstat
    # On lemmatized text, get the TTR to determine the lexical diversity
    for author in Authors.keys():
        Authors[author].TTR = ld.ttr(Authors[author].clean)
    return Authors
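# A hypothetical minimal Author container, only to make extract_lexical_features
# runnable in isolation; the project's real class is defined elsewhere.
import textstat
from lexical_diversity import lex_div as ld

class Author:
    def __init__(self, tweets, clean):
        self.tweets = tweets  # raw tweet strings
        self.clean = clean    # lemmatized tokens

authors = {'user1': Author(['This is a simple tweet.', 'Another short tweet appears here.'],
                           ['this', 'be', 'a', 'simple', 'tweet', 'another', 'short', 'tweet', 'appear', 'here'])}
authors = extract_lexical_features(authors)
print(authors['user1'].readability, authors['user1'].TTR)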
# per-author aggregates: each author contributes 100 consecutive tweets (slice end is exclusive)
len_word_rng_auth = [max(len_tw_word[i*100:(i+1)*100]) - min(len_tw_word[i*100:(i+1)*100])
                     for i in range(int(len(len_tw_word)/100))]
len_char_mean_auth = [np.mean(len_tw_char[i*100:(i+1)*100]) for i in range(int(len(len_tw_char)/100))]
len_word_mean_auth = [np.mean(len_tw_word[i*100:(i+1)*100]) for i in range(int(len(len_tw_word)/100))]

##########
#
# vocab variety (TTR)
#
tweets_szerz = [" ".join(list(es_data["Tweets"])[i*100:(i+1)*100]) for i in range(int(len(len_tw_char)/100))]
ttr_szerz = [ld.ttr(ld.flemmatize(i)) for i in tweets_szerz]

##########
#
# tags
#
# RT
rt_szerz = [np.sum([k == "RT" for k in i.split(" ")]) for i in tweets_szerz]
# URL
url_szerz = [np.sum([k == "#URL#" for k in i.split(" ")]) for i in tweets_szerz]
# hashtag
hsg_szerz = [np.sum([k == "#HASHTAG#" for k in i.split(" ")]) for i in tweets_szerz]
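# The block above assumes data prepared elsewhere: es_data is a DataFrame whose
# "Tweets" column holds 100 consecutive tweets per author, and len_tw_word /
# len_tw_char hold per-tweet word and character counts. A hypothetical setup sketch:
import numpy as np
import pandas as pd
from lexical_diversity import lex_div as ld

es_data = pd.DataFrame({"Tweets": ["RT sample tweet with a #HASHTAG# and a #URL#"] * 200})  # 2 authors x 100 tweets
len_tw_word = [len(t.split(" ")) for t in es_data["Tweets"]]
len_tw_char = [len(t) for t in es_data["Tweets"]]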
def get_news_features(headline, text):
    nlp = es_core_news_md.load()

    ## headline ##
    headline = re.sub(r"http\S+", "", headline)
    headline = re.sub(r"http", "", headline)
    headline = re.sub(r"@\S+", "", headline)
    headline = re.sub("\n", " ", headline)
    headline = re.sub(r"(?<!\n)\n(?!\n)", " ", headline)
    headline = headline.replace(r"*NUMBER*", "número")
    headline = headline.replace(r"*PHONE*", "número")
    headline = headline.replace(r"*EMAIL*", "email")
    headline = headline.replace(r"*URL*", "url")
    headline_lower = headline.lower()
    doc_h = nlp(headline_lower)

    list_tokens_h = []
    for sentence_h in doc_h.sents:
        for token in sentence_h:
            list_tokens_h.append(token.text)

    fdist_h = FreqDist(list_tokens_h)
    syllables_h = get_nsyllables(headline)
    words_h = len(list_tokens_h)

    # headline complexity features
    avg_word_size_h = round(sum(len(word) for word in list_tokens_h) / words_h, 2)
    avg_syllables_word_h = round(syllables_h / words_h, 2)
    unique_words_h = round((len(fdist_h.hapaxes()) / words_h) * 100, 2)
    mltd_h = round(ld.mtld(list_tokens_h), 2)
    ttr_h = round(ld.ttr(list_tokens_h) * 100, 2)

    ## text content ##
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"http", "", text)
    text = re.sub("\n", " ", text)
    text = text.replace(r"*NUMBER*", "número")
    text = text.replace(r"*PHONE*", "número")
    text = text.replace(r"*EMAIL*", "email")
    text = text.replace(r"*URL*", "url")

    # to later calculate the upper-case letters ratio
    alph = list(filter(str.isalpha, text))

    text_lower = text.lower()
    doc = nlp(text_lower)

    list_tokens = []
    list_pos = []
    list_tag = []
    list_entities = []
    sents = 0

    for entity in doc.ents:
        list_entities.append(entity.label_)

    for sentence in doc.sents:
        sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)

    # calculate entities, POS, tags, frequencies, syllables, words and quotes
    entities = len(list_entities)
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)
    syllables = get_nsyllables(text)
    words = len(list_tokens)
    quotes = n_tag['PUNCT__PunctType=Quot']

    # complexity features
    avg_word_sentence = round(words / sents, 2)
    avg_word_size = round(sum(len(word) for word in list_tokens) / words, 2)
    avg_syllables_word = round(syllables / words, 2)
    unique_words = round((len(fdist.hapaxes()) / words) * 100, 2)
    ttr = round(ld.ttr(list_tokens) * 100, 2)

    # Spanish readability tests
    huerta_score = round(206.84 - (60 * avg_syllables_word) - (1.02 * avg_word_sentence), 2)
    szigriszt_score = round(206.835 - ((62.3 * syllables) / words) - (words / sents), 2)

    # stylometric features
    mltd = round(ld.mtld(list_tokens), 2)
    upper_case_ratio = round(sum(map(str.isupper, alph)) / len(alph) * 100, 2)
    entity_ratio = round((entities / words) * 100, 2)
    quotes_ratio = round((quotes / words) * 100, 2)
    propn_ratio = round((n_pos['PROPN'] / words) * 100, 2)
    noun_ratio = round((n_pos['NOUN'] / words) * 100, 2)
    pron_ratio = round((n_pos['PRON'] / words) * 100, 2)
    adp_ratio = round((n_pos['ADP'] / words) * 100, 2)
    det_ratio = round((n_pos['DET'] / words) * 100, 2)
    punct_ratio = round((n_pos['PUNCT'] / words) * 100, 2)
    verb_ratio = round((n_pos['VERB'] / words) * 100, 2)
    adv_ratio = round((n_pos['ADV'] / words) * 100, 2)
    sym_ratio = round((n_tag['SYM'] / words) * 100, 2)

    # create df_features
    df_features = pd.DataFrame({
        'text': text_lower,
        'headline': headline_lower,
        'words_h': words_h,
        'word_size_h': [avg_word_size_h],
        'avg_syllables_word_h': [avg_syllables_word_h],
        'unique_words_h': [unique_words_h],
        'ttr_h': ttr_h,
        'mltd_h': [mltd_h],
        'sents': sents,
        'words': words,
        'avg_words_sent': [avg_word_sentence],
        'avg_word_size': [avg_word_size],
        'avg_syllables_word': avg_syllables_word,
        'unique_words': [unique_words],
        'ttr': [ttr],
        'huerta_score': [huerta_score],
        'szigriszt_score': [szigriszt_score],
        'mltd': [mltd],
        'upper_case_ratio': [upper_case_ratio],
        'entity_ratio': [entity_ratio],
        'quotes': quotes,
        'quotes_ratio': [quotes_ratio],
        'propn_ratio': [propn_ratio],
        'noun_ratio': [noun_ratio],
        'pron_ratio': [pron_ratio],
        'adp_ratio': [adp_ratio],
        'det_ratio': [det_ratio],
        'punct_ratio': [punct_ratio],
        'verb_ratio': [verb_ratio],
        'adv_ratio': [adv_ratio],
        'sym_ratio': [sym_ratio]
    })
    return df_features
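# A hedged usage sketch for get_news_features. It assumes the es_core_news_md
# spaCy model is installed and stubs get_nsyllables (defined elsewhere in the
# project) with a rough vowel-group counter purely for illustration.
import re
import nltk
import pandas as pd
import es_core_news_md
from nltk.probability import FreqDist
from lexical_diversity import lex_div as ld

def get_nsyllables(text):
    # crude approximation: count groups of consecutive (Spanish) vowels
    return len(re.findall(r"[aeiouáéíóúü]+", text.lower()))

headline = "El gobierno anuncia nuevas medidas económicas y nuevas reformas"
body = "El gobierno presentó hoy un paquete de medidas. Los analistas esperan más detalles."
features = get_news_features(headline, body)
print(features[['ttr', 'mltd', 'huerta_score', 'szigriszt_score']])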
def lex_div(self, text):
    token = ld.tokenize(text)
    return ld.ttr(token)
def lexical_ttr(tokens):
    return lex_div.ttr(tokens)
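# Usage, assuming `lex_div` here is the lexical_diversity module itself.
from lexical_diversity import lex_div

tokens = lex_div.tokenize("to be or not to be")
print(lexical_ttr(tokens))  # 4 types / 6 tokens = 0.67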
def preprocess(df_total):
    """Preprocess article text: average sentence length, frequency of tags, POS tagging."""
    # Cleaning text
    df_total["text"] = df_total.text.apply(lambda x: x.lower())
    # table = str.maketrans('', '', string.punctuation)
    # df_total["text"] = df_total.text.apply(lambda x: x.translate(table))
    df_total["text"] = df_total.text.apply(lambda x: re.sub(r'\d+', 'num', x))

    # substituting "U.S." (the text is already lower-cased at this point)
    df_total["little_clean"] = df_total.text.apply(
        lambda x: re.sub(r"u\.s\.", "united states", x))

    # cleaning text
    table_ = str.maketrans('', '')
    df_total['cleaned_text'] = df_total.text.str.translate(table_)  # identity table: punctuation removal above is disabled

    # ******* SYNTACTIC FEATURES *******
    # splitting articles into sentences
    df_total["sentences"] = df_total.little_clean.str.split(r"\. ")
    # number of sentences in each article
    df_total["num_of_sentences"] = df_total.sentences.apply(lambda x: len(x))
    # average length of sentences (in characters)
    df_total["avg_sentence_length"] = df_total.sentences.apply(
        lambda x: round(np.mean([len(item) for item in x])))

    # POS tagging
    df_total['POS_tags'] = df_total.cleaned_text.apply(
        lambda x: nltk.pos_tag(nltk.word_tokenize(x), tagset='universal'))
    # frequency of tags
    df_total["tag_fq"] = df_total.POS_tags.apply(
        lambda x: nltk.FreqDist(tag for (word, tag) in x))
    # count of each tag in each article
    df_total['Noun'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['NOUN'])
    df_total['Verb'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['VERB'])
    df_total['Punctuation'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['.'])
    df_total['Adposition'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['ADP'])
    df_total['Determiner'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['DET'])
    df_total['Adjective'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['ADJ'])
    df_total['Particle'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['PRT'])
    df_total['Adverb'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['ADV'])
    df_total['Pronoun'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['PRON'])
    df_total['Conjunction'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['CONJ'])
    df_total['Numeral'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['NUM'])
    df_total['Other'] = df_total.tag_fq.apply(lambda x: pos(x.most_common())['X'])

    # ********* LEXICAL FEATURES **********
    # character count
    df_total['characters_count'] = df_total.text.str.len()
    # keep only reasonably long texts
    df_total = df_total.loc[df_total.characters_count >= 100]
    # average word length
    df_total['word_average'] = df_total['text'].apply(
        lambda x: np.mean([len(w) for w in x.split(' ')]))
    # lexical diversity
    df_total['lexical_diversity'] = df_total.text.apply(
        lambda x: ld.ttr([w for w in x.split(' ')]))
    # lexical richness
    df_total['lex_words'] = df_total.text.apply(lambda x: LexicalRichness(x).words)
    df_total['lex_uniquewords'] = df_total.text.apply(lambda x: LexicalRichness(x).terms)
    df_total['lex_ttr'] = df_total.text.apply(
        lambda x: LexicalRichness(x).ttr)  # type-token ratio from lexicalrichness

    # ********* PSYCHOLINGUISTIC FEATURES **********
    # sentiment score
    analyser = SentimentIntensityAnalyzer()
    df_total['sentiment_score'] = df_total.text.apply(
        lambda x: analyser.polarity_scores(x)['compound'])

    return df_total
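# A hedged driver sketch for preprocess. The `pos` helper used above is not part
# of this snippet; the defaultdict wrapper below is an assumption about what it
# does (return 0 for tags missing from an article).
import re
import nltk
import numpy as np
import pandas as pd
from collections import defaultdict
from lexical_diversity import lex_div as ld
from lexicalrichness import LexicalRichness
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger'); nltk.download('universal_tagset')

def pos(most_common):
    return defaultdict(int, dict(most_common))

df = pd.DataFrame({'text': ["The U.S. economy grew steadily in 2023. " * 5]})
df = preprocess(df)
print(df[['num_of_sentences', 'lexical_diversity', 'lex_ttr', 'sentiment_score']])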
wb = Workbook()
sheet = wb.active
wbin = load_workbook(filename="yale_tweets.xlsx")
wbinsh = wbin.active

sheet["A1"] = "Flesch Reading Ease"
sheet["B1"] = "Flesch-Kincaid Grade Level"
sheet["C1"] = "Coleman-Liau Index"
sheet["D1"] = "Gunning Fog Index"
sheet["E1"] = "SMOG Index"
sheet["F1"] = "ARI Index"
sheet["G1"] = "LIX Index"
sheet["H1"] = "Dale-Chall Score"
sheet["I1"] = "TTR Simple"

for i in range(1, 144332):
    # only score tweets longer than 15 whitespace-separated tokens (column K holds the text)
    if len(str(wbinsh["K" + str(i)].value).split(" ")) > 15:
        calc = readcalc.ReadCalc(wbinsh["K" + str(i)].value)
        tokenized = word_tokenize(str(wbinsh["K" + str(i)].value))
        sheet["A" + str(i + 1)] = calc.get_flesch_reading_ease()
        sheet["B" + str(i + 1)] = calc.get_flesch_kincaid_grade_level()
        sheet["C" + str(i + 1)] = calc.get_coleman_liau_index()
        sheet["D" + str(i + 1)] = calc.get_gunning_fog_index()
        sheet["E" + str(i + 1)] = calc.get_smog_index()
        sheet["F" + str(i + 1)] = calc.get_ari_index()
        sheet["G" + str(i + 1)] = calc.get_lix_index()
        sheet["H" + str(i + 1)] = calc.get_dale_chall_score()
        sheet["I" + str(i + 1)] = ld.ttr(tokenized)

wb.save(filename="yale_scores.xlsx")
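# The script above relies on imports along these lines; the readcalc import path
# is an assumption based on the py-readcalc (ReadabilityCalculator) package.
from openpyxl import Workbook, load_workbook
from nltk.tokenize import word_tokenize
from readcalc import readcalc
from lexical_diversity import lex_div as ld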