import math
from collections import Counter
from itertools import groupby

import pandas as pd
from lexicalrichness import LexicalRichness

# util: repo-internal helper module providing rank_dataframe_column


def compute_lexical_richness(events, by=["event_type"], extent=["discontiguous_triggers"], preproc=None):
    """
    Compute lexical richness measures of unit attributes.

    :param events: list of Event objects.
    :param by: attribute names to group the metric by, e.g., event_type or subtype.
    :param extent: extent of the text getter functions on Event.
        Default: full event trigger including discontiguous parts.
    :param preproc: list of preprocessing functions that take a string of text as input.
    :return: DataFrame of lexical richness metrics, one row per attribute group.
    """
    print(
        f"Computing lexical richness of {str(extent).upper()} grouped by {str(by).upper()} "
        f"with preprocessing: {str(preproc)}"
    )

    # Collect text by attribute. itertools.groupby only groups consecutive
    # items, so sort by the same key first; the key must be a tuple (not a
    # generator) so that equal attribute values actually compare equal.
    def group_key(event):
        return tuple(getattr(event, attrib_n) for attrib_n in by)

    all_text = {}
    for attrib_values, g in groupby(sorted(events, key=group_key), key=group_key):
        attrib_name = ".".join(str(v) for v in attrib_values)
        for event in g:
            text = event.get_extent_text(extent=extent)
            if preproc:
                for preproc_func in preproc:
                    text = preproc_func(text)
            all_text.setdefault(attrib_name, []).append(text)

    # Compute lexical diversity by attribute.
    d = []
    for attrib_name, texts in all_text.items():
        # Mean mention type-token ratio (a variant of mean segment TTR,
        # Johnson 1944, with annotation mentions instead of fixed segments)
        # was dropped here: mention-level TTR is nearly always 1.

        # Lexical entropy over mention strings.
        p, lns = Counter(texts), float(len(texts))
        entropy = -sum(count / lns * math.log(count / lns, 2) for count in p.values())

        # Metrics on all mentions joined together.
        text = " ".join(texts)
        lr = LexicalRichness(text)
        d.append({
            "annotation_type": attrib_name,
            "cttr": lr.cttr,
            "entropy": entropy,
            "dugast": lr.Dugast,
            "type_count": lr.terms,
            "token_count": lr.words,
            "herdan": lr.Herdan,
            "summer": lr.Summer,
            "maas": lr.Maas,  # low sensitivity; lower score = richer
            "ttr": lr.ttr,
            "rttr": lr.rttr,
            "mtld": lr.mtld(threshold=0.72),  # length-corrected, mid sensitivity
            "msttr": lr.msttr(segment_window=25),  # length-corrected, mid sensitivity
            "mattr": lr.mattr(window_size=25),  # length-corrected, mid sensitivity
            "hdd": lr.hdd(draws=42),  # length-corrected, high sensitivity
        })

    df_lr = pd.DataFrame(d)
    # Invert Maas for plotting (lower Maas score means more richness).
    df_lr["maas_inv"] = df_lr["maas"] * -1.0
    rec_metrics = ["maas", "hdd", "mtld"]  # metrics recommended in McCarthy & Jarvis (2010)

    # Add rank columns for easy comparison.
    df_lr = util.rank_dataframe_column(df_lr, ascending=False)
    df_lr["maas_rank"] = df_lr["maas"].rank().astype(int)  # Maas is inverted: rank ascending
    df_lr = df_lr.drop(labels=["annotation_type_rank"], axis=1)  # no ranking on the index column

    # Nicer output.
    df_lr = df_lr.sort_index(axis=1)  # sort columns alphabetically
    rank_cols = [c for c in df_lr if "_rank" in c and "_count" not in c]
    df_lr["rank_all"] = df_lr[rank_cols].sum(axis=1).rank().astype(int)  # rank of summed metric ranks
    df_lr["rank_maas_hdd_mtld"] = (
        df_lr[[m + "_rank" for m in rec_metrics]].sum(axis=1).rank().astype(int)
    )  # combine the recommended metrics
    df_lr = df_lr.set_index("annotation_type")
    df_lr = df_lr.sort_values(by="rank_maas_hdd_mtld")  # sort by combined recommended metrics
    return df_lr
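# Usage sketch for compute_lexical_richness (hypothetical, not from the repo):
# the Event class and util.rank_dataframe_column are repo internals not shown
# above, so this sketch substitutes a minimal Event stand-in and a guessed util
# stub that adds one "<column>_rank" column per column. Texts are generated
# long enough (>42 tokens per group) for hdd(draws=42) and the 25-token
# msttr/mattr windows.
import random


class Event:
    """Hypothetical stand-in exposing the two members the function uses."""

    def __init__(self, event_type, text):
        self.event_type = event_type
        self._text = text

    def get_extent_text(self, extent):
        # The real Event selects text by extent; this stub ignores it.
        return self._text


class util:  # drop this guessed stub when the repo's util module is importable
    @staticmethod
    def rank_dataframe_column(df, ascending=False):
        for c in list(df.columns):
            df[c + "_rank"] = df[c].rank(ascending=ascending).astype(int)
        return df


random.seed(0)
vocab = ["market", "shares", "stake", "merger", "deal", "price",
         "firm", "board", "offer", "unit", "profit", "sale"]
events = [Event(t, " ".join(random.choices(vocab, k=6)))
          for t in ("buy", "sell") for _ in range(20)]
df = compute_lexical_richness(events, by=["event_type"], preproc=[str.lower])
print(df[["token_count", "type_count", "mtld", "rank_maas_hdd_mtld"]])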
def calculate_lexical_richness_measure(self, text, window_size=200, threshold=0.72):
    lex = LexicalRichness(text)
    self.measures['mattr'] = lex.mattr(window_size=window_size)  # moving-average type-token ratio
    self.measures['mtld'] = lex.mtld(threshold=threshold)  # measure of textual lexical diversity
    return self.measures
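# Minimal host-class sketch for the method above (hypothetical): the class the
# method belongs to is not shown in this snippet, so LexicalMeasures below only
# supplies the self.measures dict it writes into. Note that mattr() needs at
# least window_size tokens, so short inputs need a smaller window than the
# default of 200.
from lexicalrichness import LexicalRichness


class LexicalMeasures:
    def __init__(self):
        self.measures = {}

    # reuse the module-level function above as the class's method
    calculate_lexical_richness_measure = calculate_lexical_richness_measure


m = LexicalMeasures()
print(m.calculate_lexical_richness_measure(
    "the quick brown fox jumps over the lazy dog", window_size=5))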
import re

import nltk
import numpy as np
import pandas as pd
from lexicalrichness import LexicalRichness
# VADER via NLTK (assumed import path; requires the vader_lexicon resource)
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def process_speech(transcribe_df, r_config):
    """
    Prepare speech features.

    Args:
        transcribe_df: Transcribed dataframe
        r_config: raw config file object

    Returns:
        Dataframe of speech features
    """
    err_transcribe = transcribe_df[r_config.err_reason].iloc[0]
    transcribe = transcribe_df[r_config.nlp_transcribe].iloc[0]
    total_time = transcribe_df[r_config.nlp_totalTime].iloc[0]
    master_url = transcribe_df['dbm_master_url'].iloc[0]

    # Bail out on failed transcriptions before touching the transcript text.
    # (The original passed an undefined error_txt; the error reason is used.)
    if err_transcribe != 'Pass':
        df_speech = empty_speech(r_config, master_url, err_transcribe)
        return df_speech

    # Clean transcript: drop commas, keep word tokens and sentence punctuation.
    transcribe = transcribe.replace(",", "")
    transcribe = " ".join(re.findall(r"[\w']+|[.!?]", transcribe))

    speech_dict = {}
    nltk_download()

    sentences = nltk.tokenize.sent_tokenize(transcribe)
    words_all = nltk.tokenize.word_tokenize(transcribe)
    num_sentences = len(sentences)
    speech_dict[r_config.nlp_numSentences] = num_sentences

    # Singular pronouns per answer/sentence. Count whole tokens, not
    # substrings, so 'I' in 'It' or 'me' in 'time' is not miscounted.
    sing_count = words_all.count('I') + words_all.count('me') + words_all.count('my')
    speech_dict[r_config.nlp_singPronPerAns] = sing_count if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_singPronPerSen] = divide_var(
        speech_dict[r_config.nlp_singPronPerAns], num_sentences)

    tagged = nltk.pos_tag(transcribe.split())
    tagged_df = pd.DataFrame(tagged, columns=['word', 'pos_tag'])
    all_POSs = tagged_df['pos_tag'].tolist()

    # Past tense per answer/sentence
    speech_dict[r_config.nlp_pastTensePerAns] = (
        all_POSs.count('VBD') if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_pastTensePerSen] = divide_var(
        speech_dict[r_config.nlp_pastTensePerAns], num_sentences)

    # Pronouns per answer/sentence
    pronounsPerAns = all_POSs.count('PRP') + all_POSs.count('PRP$')
    speech_dict[r_config.nlp_pronounsPerAns] = (
        pronounsPerAns if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_pronounsPerSen] = divide_var(
        speech_dict[r_config.nlp_pronounsPerAns], num_sentences)

    # Verbs per answer/sentence
    verbPerAns = (all_POSs.count('VB') + all_POSs.count('VBD') + all_POSs.count('VBG')
                  + all_POSs.count('VBN') + all_POSs.count('VBP') + all_POSs.count('VBZ'))
    speech_dict[r_config.nlp_verbsPerAns] = verbPerAns if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_verbsPerSen] = divide_var(
        speech_dict[r_config.nlp_verbsPerAns], num_sentences)

    # Adjectives per answer/sentence
    adjectivesAns = all_POSs.count('JJ') + all_POSs.count('JJR') + all_POSs.count('JJS')
    speech_dict[r_config.nlp_adjectivesPerAns] = (
        adjectivesAns if len(words_all) > 0 else np.nan)
    speech_dict[r_config.nlp_adjectivesPerSen] = divide_var(
        speech_dict[r_config.nlp_adjectivesPerAns], num_sentences)

    # Nouns per answer/sentence
    nounsAns = all_POSs.count('NN') + all_POSs.count('NNP') + all_POSs.count('NNS')
    speech_dict[r_config.nlp_nounsPerAns] = nounsAns if len(words_all) > 0 else np.nan
    speech_dict[r_config.nlp_nounsPerSen] = divide_var(
        speech_dict[r_config.nlp_nounsPerAns], num_sentences)

    # Sentiment: mean VADER compound valence over sentences
    vader = SentimentIntensityAnalyzer()
    sentence_valences = [vader.polarity_scores(s)['compound'] for s in sentences]
    speech_dict[r_config.nlp_sentiment_mean] = (
        np.mean(sentence_valences) if len(sentence_valences) > 0 else np.nan)

    # MATTR over the punctuation-stripped answer; with window_size equal to
    # the full token count this reduces to plain TTR.
    non_punc = [value for value in words_all if value not in ['.', '!', '?']]
    non_punc_as_str = " ".join(non_punc)
    lex = LexicalRichness(non_punc_as_str)
    speech_dict[r_config.nlp_mattr] = (
        lex.mattr(window_size=lex.words) if lex.words > 0 else np.nan)

    # Number of words per minute (total_time in seconds)
    speech_dict[r_config.nlp_wordsPerMin] = divide_var(len(non_punc), total_time) * 60
    speech_dict[r_config.nlp_totalTime] = total_time
    speech_dict['dbm_master_url'] = master_url

    df_speech = pd.DataFrame([speech_dict])
    return df_speech
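# Usage sketch for process_speech (hypothetical, not from the repo): the real
# r_config schema and the helpers nltk_download / divide_var / empty_speech
# live elsewhere in this codebase, so guessed stand-ins are defined here, and
# every nlp_* config attribute is simply mapped to an output column of the same
# name. empty_speech is only reached on the error path and is left undefined.
from types import SimpleNamespace


def nltk_download():
    # Stand-in helper: fetch the NLTK resources the function needs, covering
    # both the old and new resource names across NLTK versions.
    for pkg in ('punkt', 'punkt_tab', 'averaged_perceptron_tagger',
                'averaged_perceptron_tagger_eng', 'vader_lexicon'):
        nltk.download(pkg, quiet=True)


def divide_var(numerator, denominator):
    # Stand-in helper: division that degrades to NaN instead of raising.
    return numerator / denominator if denominator else np.nan


_keys = ['numSentences', 'singPronPerAns', 'singPronPerSen', 'pastTensePerAns',
         'pastTensePerSen', 'pronounsPerAns', 'pronounsPerSen', 'verbsPerAns',
         'verbsPerSen', 'adjectivesPerAns', 'adjectivesPerSen', 'nounsPerAns',
         'nounsPerSen', 'sentiment_mean', 'mattr', 'wordsPerMin', 'totalTime',
         'transcribe']
r_config = SimpleNamespace(err_reason='err_reason',
                           **{'nlp_' + k: 'nlp_' + k for k in _keys})

transcribe_df = pd.DataFrame([{
    'err_reason': 'Pass',
    'nlp_transcribe': "I went to the store. It was my first trip this week!",
    'nlp_totalTime': 12.5,  # seconds of audio
    'dbm_master_url': 'https://example.com/recordings/001',
}])
print(process_speech(transcribe_df, r_config).T)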