def generateDataFrameSingleWordCountBy(self, mFun, word):
    """Return the agglomerated-stats frame extended with counts of ``word``.

    The conversation's messages are grouped through
    ``ConvStats._getMessagesBy(mFun, ...)``. For every group, the
    occurrences of ``word`` are looked up separately in the word counts of
    the messages sent by ``self.conversation.sender1`` and by
    ``self.conversation.sender2``.

    Args:
        mFun: grouping argument forwarded unchanged to
            ``ConvStats._getMessagesBy`` and to
            ``self._generateDataFrameAgglomeratedStatsBy``.
        word: key looked up in each group's word counts.

    Returns:
        The object returned by ``_generateDataFrameAgglomeratedStatsBy``
        with three columns added: ``<sender1>_count``, ``<sender2>_count``
        and ``totCount`` (the sum of the two).

    NOTE(review): the count columns are built in the iteration order of the
    grouped messages; this assumes the frame's rows follow the same order --
    confirm against ``_generateDataFrameAgglomeratedStatsBy``.
    """
    agglomeratedMessages = ConvStats._getMessagesBy(
        mFun, self.conversation.messages)
    df = self._generateDataFrameAgglomeratedStatsBy(
        mFun, agglomeratedMessages)

    sender1 = self.conversation.sender1
    sender2 = self.conversation.sender2

    def countsFor(sender):
        # One integer per group. A group in which `sender` never used
        # `word` must contribute the plain integer 0: any non-scalar
        # fallback would make the list ragged and break both np.array()
        # and the column sum below.
        return np.array([
            statsUtil.getWordsCount(
                [m for m in groupMessages if m.sender == sender]
            ).get(word, 0)
            for _, groupMessages in agglomeratedMessages.items()
        ])

    df[sender1 + '_count'] = countsFor(sender1)
    df[sender2 + '_count'] = countsFor(sender2)
    df['totCount'] = df[sender1 + '_count'] + df[sender2 + '_count']
    return df
def _getWordsUsedJustByStats(sender1Messages, sender2Messages):
    """Split the words of two message collections into shared and exclusive.

    The keys of ``statsUtil.getWordsCount`` are taken as the vocabulary of
    each collection.

    Args:
        sender1Messages: input passed to ``statsUtil.getWordsCount`` for the
            first sender.
        sender2Messages: input passed to ``statsUtil.getWordsCount`` for the
            second sender.

    Returns:
        A 3-tuple of sets: keys present in both counts, keys present only in
        the first, keys present only in the second.
    """
    vocabulary1 = set(statsUtil.getWordsCount(sender1Messages).keys())
    vocabulary2 = set(statsUtil.getWordsCount(sender2Messages).keys())

    shared = vocabulary1 & vocabulary2
    onlyFirst = vocabulary1 - vocabulary2
    onlySecond = vocabulary2 - vocabulary1
    return shared, onlyFirst, onlySecond
def _getWordsUsedJustByStats(sender1Messages, sender2Messages):
    """Split the words of two message collections into shared and exclusive.

    The keys of ``statsUtil.getWordsCount`` are taken as the vocabulary of
    each collection.

    Args:
        sender1Messages: input passed to ``statsUtil.getWordsCount`` for the
            first sender.
        sender2Messages: input passed to ``statsUtil.getWordsCount`` for the
            second sender.

    Returns:
        A 3-tuple of sets: keys present in both counts, keys present only in
        the first, keys present only in the second.
    """
    vocabulary1 = set(statsUtil.getWordsCount(sender1Messages).keys())
    vocabulary2 = set(statsUtil.getWordsCount(sender2Messages).keys())

    shared = vocabulary1 & vocabulary2
    onlyFirst = vocabulary1 - vocabulary2
    onlySecond = vocabulary2 - vocabulary1
    return shared, onlyFirst, onlySecond
def _getWordsCountStats(messages, limit=0):
    """Return the ``most_common`` ranking of the word counts of ``messages``.

    Args:
        messages: input forwarded to ``statsUtil.getWordsCount``.
        limit: number of entries requested from ``most_common``. The default
            ``0`` calls ``most_common()`` with no argument instead.

    Returns:
        Whatever ``most_common`` yields for the computed counts (for a
        ``collections.Counter`` that is a list of ``(word, count)`` pairs,
        most frequent first -- presumably the case here; verify against
        ``statsUtil.getWordsCount``).
    """
    ranking = statsUtil.getWordsCount(messages)
    return ranking.most_common() if limit == 0 else ranking.most_common(limit)
def _generateWordCountStatsBy(self, groupByColumns=None, word=None):
    """Delegate to ``_generateCountStatsBy`` with a word-count aggregator.

    The aggregator joins the strings it receives with single spaces, runs
    ``statsUtil.getWordsCount`` on the result and returns its items as a
    tuple of pairs sorted by the second element (the count), highest first.

    Args:
        groupByColumns: forwarded to ``self._generateCountStatsBy``.
            ``None`` (the default) is replaced by a new empty list.
        word: forwarded unchanged to ``self._generateCountStatsBy``.

    Returns:
        The result of ``self._generateCountStatsBy`` called with the
        aggregator, the label ``'wordCount'``, the count id ``'word'``,
        ``groupByColumns`` and ``word``.
    """
    # Build the list per call: a literal ``[]`` default is created once at
    # definition time and would be the same object in every call, so any
    # mutation downstream would leak into later calls.
    if groupByColumns is None:
        groupByColumns = []

    # Kept as a lambda on purpose so the callable handed downstream is the
    # same kind of object (including its ``__name__``) as before.
    fun = lambda x: tuple(sorted(
        statsUtil.getWordsCount(" ".join(x)).items(),
        key=lambda y: y[1],
        reverse=True))

    return self._generateCountStatsBy(
        fun, 'wordCount', 'word', groupByColumns, word)
def generateDataFrameSingleWordCountBy(self, mFun, word):
    """Return the agglomerated-stats frame extended with counts of ``word``.

    The conversation's messages are grouped through
    ``ConvStats._getMessagesBy(mFun, ...)``. For every group, the
    occurrences of ``word`` are looked up separately in the word counts of
    the messages sent by ``self.conversation.sender1`` and by
    ``self.conversation.sender2``.

    Args:
        mFun: grouping argument forwarded unchanged to
            ``ConvStats._getMessagesBy`` and to
            ``self._generateDataFrameAgglomeratedStatsBy``.
        word: key looked up in each group's word counts.

    Returns:
        The object returned by ``_generateDataFrameAgglomeratedStatsBy``
        with three columns added: ``<sender1>_count``, ``<sender2>_count``
        and ``totCount`` (the sum of the two).

    NOTE(review): the count columns are built in the iteration order of the
    grouped messages; this assumes the frame's rows follow the same order --
    confirm against ``_generateDataFrameAgglomeratedStatsBy``.
    """
    agglomeratedMessages = ConvStats._getMessagesBy(mFun, self.conversation.messages)
    df = self._generateDataFrameAgglomeratedStatsBy(mFun, agglomeratedMessages)

    sender1 = self.conversation.sender1
    sender2 = self.conversation.sender2

    def countsFor(sender):
        # One integer per group. A group in which `sender` never used
        # `word` must contribute the plain integer 0: any non-scalar
        # fallback would make the list ragged and break both np.array()
        # and the column sum below.
        return np.array(
            [
                statsUtil.getWordsCount(
                    [m for m in groupMessages if m.sender == sender]
                ).get(word, 0)
                for _, groupMessages in agglomeratedMessages.items()
            ]
        )

    df[sender1 + "_count"] = countsFor(sender1)
    df[sender2 + "_count"] = countsFor(sender2)
    df["totCount"] = df[sender1 + "_count"] + df[sender2 + "_count"]
    return df