def message_no_add_egde(message_graph, conversation):
    """
    Adds a weighted edge to message_graph for every qualifying conversation

    The first config.MAX_EXPECTED_DIFF_NICKS entries of conversation are
    inspected. An entry contributes an edge when it has exactly three items
    (read here as [weight, nick, nick]), its first item is at least
    config.THRESHOLD_MESSAGE_NUMBER_GRAPH, and both nick items are at least
    config.MINIMUM_NICK_LENGTH long.

    Note: nicks and nick_same_list are not parameters; they are looked up
    from the surrounding (enclosing or module) scope when an edge is added.

    Args:
        message_graph: graph object exposing add_edge(u, v, weight=...)
        conversation: indexable collection with at least
            config.MAX_EXPECTED_DIFF_NICKS entries

    Returns:
        message_graph: the same graph object that was passed in
    """
    for slot in xrange(config.MAX_EXPECTED_DIFF_NICKS):
        record = conversation[slot]

        heavy_enough = (len(record) == 3 and
                        record[0] >= config.THRESHOLD_MESSAGE_NUMBER_GRAPH)
        if not heavy_enough:
            continue

        if (len(record[1]) >= config.MINIMUM_NICK_LENGTH and
                len(record[2]) >= config.MINIMUM_NICK_LENGTH):
            # Both end points are mapped to their representative nick so
            # that aliases of one user share a single node.
            source_nick = util.get_nick_representative(
                nicks, nick_same_list, record[1])
            target_nick = util.get_nick_representative(
                nicks, nick_same_list, record[2])
            message_graph.add_edge(source_nick, target_nick, weight=record[0])

    return message_graph
def get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name, nicks,
                      nick_same_list):
    """
    Returns the receiver nick after checking one candidate token

    When rec equals nick_name and nick_name differs from nick_to_compare
    (the nick the message is being compared against), the representative
    of nick_name is returned. In every other case the nick_receiver that
    was passed in is returned unchanged.

    Args:
        nick_receiver: receiver determined so far
        rec: candidate token taken from the message line
        nick_to_compare: nick that must not be reported as receiver
        nick_name: known nick the token is compared with
        nicks: passed through to util.get_nick_representative
        nick_same_list: passed through to util.get_nick_representative

    Returns:
        the updated (or untouched) receiver nick
    """
    names_other_user = rec == nick_name and nick_to_compare != nick_name
    if not names_other_user:
        return nick_receiver
    return util.get_nick_representative(nicks, nick_same_list, nick_name)
def keywords(log_dict, nicks, nick_same_list):
    """
    Returns keywords for all users

    Every message line in the logs is attributed to its sender, tokenised
    and lemmatized, and the words are accumulated per sender. The words of
    each sender are then counted with a CountVectorizer (nicks and common
    English words are used as stop words) and normalised by the sender's
    total word count.

    Args:
        log_dict (dict): Dictionary of logs data created using reader.py;
            each value is iterated as a collection of day entries that
            carry their lines under the "log_data" key
        nicks (list): list of nicknames created using nickTracker.py
        nick_same_list (list): list of same-nick name lists created using
            nickTracker.py

    Returns:
        keywords_filtered: list of {'nick', 'keywords'} dicts holding the
            non-empty result of top_keywords_for_nick for each user
        user_keyword_freq_dict: list of {'nick', 'keywords'} dicts where
            keywords is a list of [word, frequency, normalised frequency]
        user_words_dict: list of {'sender', 'words'} dicts with all the
            lemmatized words written by each user
        nicks_for_stop_words: nicks (plus lower-cased copies) that were
            used as stop words
        the [word, frequency, normalised frequency] entries of all users,
            sorted by normalised frequency in descending order
    """
    user_words_dict = []
    user_keyword_freq_dict = []
    keywords_filtered = []
    # Incremented once per processed message line below, but never read or
    # returned by this function.
    no_messages = 0

    # Local helper (shadows any module-level function of the same name):
    # returns the representative of nick_name when rec names that nick and
    # it is not the nick being compared against, else nick_receiver as is.
    def get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name,
                          nicks, nick_same_list):
        if (rec == nick_name):
            if (nick_to_compare != nick_name):
                nick_receiver = util.get_nick_representative(
                    nicks, nick_same_list, nick_name)
        return nick_receiver

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            for line in day_log:
                flag_comma = 0
                if (util.check_if_msg_line(line)):
                    # The sender nick is the text between the first "<" and
                    # the following ">" of the line.
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_compare = util.correctLastCharCR(
                        (m.group(0)[1:-1]))
                    nick_sender = ''
                    nick_sender = util.get_nick_representative(
                        nicks, nick_same_list, nick_to_compare)
                    nick_receiver = ''
                    for nick_name in nicks:
                        # Receiver list obtained by splitting the line
                        # about ":".
                        # NOTE(review): rec_list does not depend on
                        # nick_name yet is rebuilt for every nick, and it
                        # is only bound inside this loop - with an empty
                        # nicks list the code after the loop sees an
                        # unbound (or stale) rec_list. Confirm intent.
                        rec_list = [e.strip() for e in line.split(':')
                                    ]
                        # Return value is ignored, so this presumably
                        # edits rec_list in place - TODO confirm in util.
                        util.rec_list_splice(rec_list)
                        # Index 0 will contain the time, e.g. 14:02; an
                        # empty index 1 ends the receiver search.
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for rec in rec_list:
                            nick_receiver = get_nick_receiver(
                                nick_receiver, rec, nick_to_compare,
                                nick_name, nicks, nick_same_list)
                        # Receiver list may be of the form
                        # <Dhruv> Rohan, Ram :
                        if "," in rec_list[
                                1]:
                            flag_comma = 1
                            rec_list_2 = [
                                e.strip() for e in rec_list[1].split(',')
                            ]
                            rec_list_2 = util.correct_last_char_list(
                                rec_list_2)
                            for rec in rec_list_2:
                                nick_receiver = get_nick_receiver(
                                    nick_receiver, rec, nick_to_compare,
                                    nick_name, nicks, nick_same_list)
                        # Receiver list can also be of the form
                        # <Dhruv> Rohan, Hi!
                        # flag_comma is reset once per line, not per nick,
                        # so after the first comma match this branch is
                        # skipped for the remaining nicks of the line.
                        if (flag_comma == 0
                            ):
                            rec = util.splice_find(line, ">", ", ", 1)
                            nick_receiver = get_nick_receiver(
                                nick_receiver, rec, nick_to_compare,
                                nick_name, nicks, nick_same_list)

                    # Generating the words written by the sender: drop the
                    # time field, then the receiver nick if it is present.
                    message = rec_list[1:]
                    no_messages += 1
                    correctedNickReciever = util.correct_nick_for_(
                        nick_receiver)
                    if correctedNickReciever in message:
                        message.remove(correctedNickReciever)

                    # NOTE(review): a new lemmatizer is built for every
                    # message line although it does not depend on the line.
                    lmtzr = WordNetLemmatizer()
                    # Keep runs of at least 3 word characters, then drop
                    # digits; removing digits afterwards can leave shorter
                    # or empty tokens in the list.
                    word_list_temp = re.sub(
                        r'\d+', '', " ".join(
                            re.findall(r'\w{3,}', ":".join(message).replace(
                                ",", " ")))).split(" ")
                    word_list = []
                    # Lower-case every word and remove apostrophes.
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'", ""))
                    word_list_lemmatized = []
                    # Lemmatize as a verb first, then with the default
                    # part of speech. If lemmatizing raises
                    # UnicodeDecodeError the words of this line are
                    # dropped (the list stays empty).
                    try:
                        word_list_lemmatized = map(
                            lmtzr.lemmatize,
                            map(lambda x: lmtzr.lemmatize(x, 'v'), word_list))
                    except UnicodeDecodeError:
                        pass

                    # fr stays 1 only when the sender has no entry yet.
                    fr = 1
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({
                            'sender': nick_sender,
                            'words': word_list_lemmatized
                        })

    # Stop words: every known nick alias, every sender seen above, a
    # lower-cased copy of each of those, and the common English words with
    # apostrophes removed.
    nicks_for_stop_words = []
    stop_word_without_apostrophe = []
    for l in nick_same_list:
        nicks_for_stop_words.extend(l)
    for dictonary in user_words_dict:
        nicks_for_stop_words.append(dictonary['sender'])
    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])
    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))
    stop_words_extended = extended_stop_words(nicks_for_stop_words,
                                              stop_word_without_apostrophe)
    count_vect = CountVectorizer(analyzer='word',
                                 stop_words=stop_words_extended,
                                 min_df=1)
    keywords_for_channels = []
    for dictonary in user_words_dict:
        # A ValueError skips the user entirely - presumably raised by the
        # vectorizer when no word survives the stop-word filter; confirm.
        try:
            matrix = count_vect.fit_transform(dictonary['words'])
            # One [word, total count] pair per vocabulary entry.
            freqs = [[word, matrix.getcol(idx).sum()]
                     for word, idx in count_vect.vocabulary_.items()]
            # Most frequent word first.
            keywords = sorted(freqs, key=lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]
            # Append the normalised frequency as a third element.
            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5))
            user_keyword_freq_dict.append({
                'nick': dictonary['sender'],
                'keywords': keywords
            })
            keywords_for_channels.extend(keywords)
        except ValueError:
            pass

    for data in user_keyword_freq_dict:
        keywords, normal_scores = top_keywords_for_nick(
            user_keyword_freq_dict, data['nick'], config.KEYWORDS_THRESHOLD,
            config.KEYWORDS_MIN_WORDS)
        # Debug output only when both config switches are on.
        if config.DEBUGGER and config.PRINT_WORDS:
            print "Nick:", data['nick']
            print "Keywords with normalised score > 0.01\n", keywords
            print "Their Normal scores\n", normal_scores
            print "\n"
        # Users without any top keyword are left out of the filtered list.
        if keywords:
            keywords_filtered.append({
                'nick': data['nick'],
                'keywords': keywords
            })

    # The last element ranks the keywords of all users by their normalised
    # frequency (index 2), highest first.
    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words, sorted(
        keywords_for_channels, key=lambda x: x[2], reverse=True)
def test_get_nick_representative(self, nicks, nick_same_list, nick_to_compare,
                                 expected_result):
    """
    Checks util.get_nick_representative against an expected result

    Args:
        nicks: list of nicks handed to util.get_nick_representative
        nick_same_list: list of same-nick lists handed to the same call
        nick_to_compare: nick whose representative is looked up
        expected_result: representative the call is expected to return
    """
    # See nickTracker.py: there, too, nick_same_list has a size equal to
    # the maximum expected number of different nicks.
    actual_result = util.get_nick_representative(nicks, nick_same_list,
                                                 nick_to_compare)
    self.assertEqual(actual_result, expected_result)