Example #1
def test_correct_nick_for_(self, input_nick, expected_output_nick):
    # e.g. input_nick="rohan_", expected_output_nick="rohan"
    self.assertEqual(util.correct_nick_for_(input_nick), expected_output_nick)
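The unused input_nick and expected_output_nick parameters suggest a data-driven test. A minimal sketch of how the case from the original assertion ("rohan_" → "rohan") might be wired up, assuming the third-party parameterized package; the util import and class name are hypothetical:

import unittest

from parameterized import parameterized

import util  # hypothetical import; util is a project-local module


class CorrectNickTest(unittest.TestCase):

    @parameterized.expand([
        ("rohan_", "rohan"),  # a trailing underscore should be stripped
    ])
    def test_correct_nick_for_(self, input_nick, expected_output_nick):
        self.assertEqual(util.correct_nick_for_(input_nick),
                         expected_output_nick)


if __name__ == "__main__":
    unittest.main()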
Example #2
import re

from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# util, config, common_english_words, extended_stop_words and
# top_keywords_for_nick are project-local helpers assumed to be importable here.


def keywords(log_dict, nicks, nick_same_list):
    """
    Returns keywods for all users

    Args:   
        log_dict (str): Dictionary of logs data created using reader.py
        nicks(List) : list of nickname created using nickTracker.py
        nick_same_list :List of same_nick names created using nickTracker.py

    Returns
        keywords_filtered: filtered keywords for user
        user_keyword_freq_dict: dictionary for each user having keywords and their frequency
        user_words_dict: keywods for user
        nicks_for_stop_words: stop words
    """
    user_words_dict = []
    user_keyword_freq_dict = []
    keywords_filtered = []
    no_messages = 0

    def get_nick_receiver(nick_receiver, rec, nick_to_compare, nick_name,
                          nicks, nick_same_list):
        # If this token addresses nick_name (and the sender is not talking to
        # themselves), resolve nick_name to its canonical alias.
        if rec == nick_name and nick_to_compare != nick_name:
            nick_receiver = iter_nicks(nick_receiver, nicks,
                                       nick_same_list, nick_name)
        return nick_receiver

    def iter_nicks(nick_sender_receiver, nicks, nick_same_list, nick_comp):
        # Map nick_comp to the head of its alias group in nick_same_list;
        # fall back to nick_comp itself if it belongs to no group.
        for i in range(len(nicks)):
            if nick_comp in nick_same_list[i]:
                nick_sender_receiver = nick_same_list[i][0]
                break
            else:
                nick_sender_receiver = nick_comp
        return nick_sender_receiver

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            for line in day_log:
                flag_comma = 0
                if util.check_if_msg_line(line):
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_compare = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = ''
                    nick_sender = iter_nicks(nick_sender, nicks,
                                             nick_same_list, nick_to_compare)

                    nick_receiver = ''
                    for nick_name in nicks:
                        # receiver list, split on ':' (index 0 holds the time, e.g. 14:02)
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[1]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for rec in rec_list:
                            nick_receiver = get_nick_receiver(
                                nick_receiver, rec, nick_to_compare, nick_name,
                                nicks, nick_same_list)

                        if "," in rec_list[
                                1]:  #receiver list may of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [
                                e.strip() for e in rec_list[1].split(',')
                            ]
                            rec_list_2 = util.correct_last_char_list(
                                rec_list_2)
                            for rec in rec_list_2:
                                nick_receiver = get_nick_receiver(
                                    nick_receiver, rec, nick_to_compare,
                                    nick_name, nicks, nick_same_list)

                        if flag_comma == 0:  # receiver may be a single nick, e.g. <Dhruv> Rohan, Hi!
                            rec = util.splice_find(line, ">", ", ", 1)
                            nick_receiver = get_nick_receiver(
                                nick_receiver, rec, nick_to_compare, nick_name,
                                nicks, nick_same_list)

                    # collect the words written by the sender
                    message = rec_list[1:]
                    no_messages += 1
                    corrected_nick_receiver = util.correct_nick_for_(nick_receiver)
                    if corrected_nick_receiver in message:
                        message.remove(corrected_nick_receiver)

                    lmtzr = WordNetLemmatizer()

                    # keep words of length >= 3 and drop numbers
                    word_list_temp = re.sub(
                        r'\d+', '', " ".join(
                            re.findall(r'\w{3,}', ":".join(message).replace(
                                ",", " ")))).split(" ")
                    word_list = []

                    # lowercase and strip apostrophes
                    for word in word_list_temp:
                        word = word.lower()
                        word_list.append(word.replace("'", ""))
                    word_list_lemmatized = []

                    try:
                        # lemmatize as verbs first, then with the default (noun) POS
                        word_list_lemmatized = list(map(
                            lmtzr.lemmatize,
                            map(lambda x: lmtzr.lemmatize(x, 'v'), word_list)))
                    except UnicodeDecodeError:
                        pass

                    fr = 1  # becomes 0 if this sender already has an entry
                    for dic in user_words_dict:
                        if dic['sender'] == nick_sender:
                            dic['words'].extend(word_list_lemmatized)
                            fr = 0
                    if fr:
                        user_words_dict.append({
                            'sender': nick_sender,
                            'words': word_list_lemmatized
                        })

    nicks_for_stop_words = []
    stop_word_without_apostrophe = []

    for l in nick_same_list:
        nicks_for_stop_words.extend(l)

    for dictionary in user_words_dict:
        nicks_for_stop_words.append(dictionary['sender'])

    nicks_for_stop_words.extend([x.lower() for x in nicks_for_stop_words])

    for words in common_english_words.words:
        stop_word_without_apostrophe.append(words.replace("'", ""))

    stop_words_extended = extended_stop_words(nicks_for_stop_words,
                                              stop_word_without_apostrophe)

    count_vect = CountVectorizer(analyzer='word',
                                 stop_words=stop_words_extended,
                                 min_df=1)
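    # fit_transform() builds a sparse document-term matrix over this user's word
    # list (each word acts as a one-word document); count_vect.vocabulary_ maps
    # every retained word to its column index, so summing a column below gives
    # that word's total count for the user.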

    for dictionary in user_words_dict:
        try:
            matrix = count_vect.fit_transform(dictionary['words'])
            freqs = [[word, matrix.getcol(idx).sum()]
                     for word, idx in count_vect.vocabulary_.items()]
            keywords = sorted(freqs, key=lambda x: -x[1])
            total_freq = 0.0
            for freq_tuple in keywords:
                total_freq += freq_tuple[1]

            # append each keyword's normalised score (frequency / total frequency)
            for freq_tuple in keywords:
                freq_tuple.append(round(freq_tuple[1] / float(total_freq), 5))
            user_keyword_freq_dict.append({
                'nick': dictionary['sender'],
                'keywords': keywords
            })
        except ValueError:
            pass
    for data in user_keyword_freq_dict:
        keywords, normal_scores = top_keywords_for_nick(
            user_keyword_freq_dict, data['nick'], config.KEYWORDS_THRESHOLD,
            config.KEYWORDS_MIN_WORDS)
        if config.DEBUGGER:
            print "Nick:", data['nick']
            print "Keywords with normalised score > 0.01\n", keywords
            print "Their Normal scores\n", normal_scores
            print "\n"
        if keywords:
            keywords_filtered.append({
                'nick': data['nick'],
                'keywords': keywords
            })

    return keywords_filtered, user_keyword_freq_dict, user_words_dict, nicks_for_stop_words
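A minimal usage sketch for keywords(); the literal inputs below are illustrative assumptions only, since the real log_dict, nicks and nick_same_list are produced by reader.py and nickTracker.py:

# Hypothetical example inputs; real ones come from reader.py and nickTracker.py.
log_dict = {
    "#dev": [
        {
            "log_data": [
                "14:02 <Dhruv> Rohan, did you push the parser changes?",
                "14:03 <Rohan> Dhruv: yes, the lemmatizer tests pass now",
            ]
        }
    ]
}
nicks = ["Dhruv", "Rohan"]
nick_same_list = [["Dhruv"], ["Rohan", "rohan_"]]

keywords_filtered, user_keyword_freq_dict, user_words_dict, stop_words = keywords(
    log_dict, nicks, nick_same_list)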