def _getLexicalStats(messages):
    """Return (tokensCount, vocabularyCount, lexicalRichness) for *messages*.

    lexicalRichness is the ratio of distinct words to total words, or 0
    when the message text yields no words at all.
    """
    tokens = statsUtil.getWords(messages)
    numTokens = len(tokens)
    numDistinct = len(set(tokens))
    # Guard the empty case so we never divide by zero.
    richness = numDistinct / numTokens if numTokens else 0
    return numTokens, numDistinct, richness
def _getLexicalStats(messages):
    """Compute token count, vocabulary size and lexical richness of *messages*.

    NOTE(review): this is a byte-for-byte duplicate of the _getLexicalStats
    defined immediately above, which it shadows at module level — confirm
    one of the two copies can be removed.
    """
    wordList = statsUtil.getWords(messages)
    total = len(wordList)
    distinct = len(set(wordList))
    if total == 0:
        # No words at all: richness is defined as zero.
        ratio = 0
    else:
        ratio = distinct / total
    return total, distinct, ratio
def _generateLexicalStatsBy(self, groupByColumns=None):
    """Generate per-sender lexical stats, optionally grouped by extra columns.

    For each (sender, *groupByColumns) group the concatenated message text is
    tokenized and three columns are produced: tokensCount, vocabularyCount and
    lexicalRichness (vocabulary/tokens, 0 for empty groups). A synthetic
    "total" sender row aggregates across senders.

    :param groupByColumns: optional list of extra self.df columns to group by.
        Defaults to no extra grouping (None, treated as []).
    :return: DataFrame with the stats columns; when no extra grouping columns
        are given it is indexed by sender and restricted to the three stats
        columns, otherwise sender/group columns are kept as regular columns.
    """
    # Fix: the original used a mutable default argument (groupByColumns=[]).
    groupByColumns = list(groupByColumns) if groupByColumns else []
    # Tokenize the concatenated text of each group.
    # TODO(review): a dedicated simpler tokenize helper in statsUtil would be
    # cleaner than joining and re-splitting here (carried over from original).
    res = self.df.groupby(['sender'] + groupByColumns, as_index=False).agg(
        {'text': lambda x: tuple(statsUtil.getWords(" ".join(x)))})
    res['tokensCount'] = res['text'].apply(len)
    res['vocabularyCount'] = res['text'].apply(lambda words: len(set(words)))
    res.drop('text', axis=1, inplace=True)
    if groupByColumns:
        # Per-group totals across senders. NOTE: summing vocabularyCount is a
        # sum of per-sender vocabularies, not a union vocabulary (unchanged
        # from the original behavior).
        tot = res.groupby(groupByColumns, as_index=False).sum()
        tot['sender'] = "total"
        res = pd.concat([res, tot])
        # Fix for the original "Missing tokencount = zero case" TODO: avoid
        # division by zero, matching the scalar _getLexicalStats behavior.
        res['lexicalRichness'] = (res['vocabularyCount'] / res['tokensCount']
                                  ).where(res['tokensCount'] != 0, 0)
        return res
    res.set_index(['sender'], inplace=True)
    res.loc['total'] = res.sum()
    res['lexicalRichness'] = (res['vocabularyCount'] / res['tokensCount']
                              ).where(res['tokensCount'] != 0, 0)
    return res[['tokensCount', 'vocabularyCount', 'lexicalRichness']]
def _computeWordsCount(msgs, groupByColumns):
    """Generate a dataframe with per-word counts for each group-by entry.

    NOTE(review): the original docstring said grouping is done on the passed
    columns "plus the sender one", but the code groups only on groupByColumns
    — confirm callers include 'sender' in that list.

    :param msgs: DataFrame with at least a 'text' column and groupByColumns.
    :param groupByColumns: columns to group the messages by.
    :return: DataFrame indexed by the group keys, one count column per word.
    """
    # Concatenate the text of every message within each group.
    grouped_msgs = msgs.groupby(groupByColumns).agg(
        {'text': lambda x: " ".join(x)})
    # Count-vectorize using the project tokenizer as the analyzer.
    vectorizer = CountVectorizer(analyzer=lambda x: statsUtil.getWords(x))
    X = vectorizer.fit_transform(grouped_msgs['text'].values)
    # Fix: get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; prefer get_feature_names_out(), falling back for old versions.
    try:
        featureNames = vectorizer.get_feature_names_out()
    except AttributeError:
        featureNames = vectorizer.get_feature_names()
    # One column per word, rows aligned with the grouped messages.
    countMatrix = pd.DataFrame(X.toarray(), index=grouped_msgs.index,
                               columns=featureNames)
    # Join counts back, dropping the now-redundant concatenated text column.
    return grouped_msgs.drop('text', axis=1).join(countMatrix)