def parse_log_lines_for_conv(log_dict, nicks, conn_comp_list, conversations): dateadd = -1 #Variable used for response time calculation. Varies from 0-365. for day_content_all_channels in log_dict.values(): for day_content in day_content_all_channels: day_log = day_content["log_data"] dateadd = dateadd + 1 send_time = [] #list of all the times a user sends a message to another user #code for making relation map between clients for line in day_log: flag_comma = 0 if(util.check_if_msg_line (line)): nick_sender = "" nick_receiver = "" m = re.search(r"\<(.*?)\>", line) nick_to_search = util.correctLastCharCR(m.group(0)[1:-1]) nick_sender = util.get_nick_sen_rec(len(nicks), nick_to_search, conn_comp_list, nick_sender) for nick in nicks: rec_list = [e.strip() for e in line.split(':')] util.rec_list_splice(rec_list) if not rec_list[2]: break rec_list = util.correct_last_char_list(rec_list) conversations, nick_receiver, send_time = \ build_conversation(rec_list, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list, line) if "," in rec_list[2]: flag_comma = 1 rec_list_2 = [e.strip() for e in rec_list[2].split(',')] rec_list_2 = util.correct_last_char_list(rec_list_2) conversations, nick_receiver, send_time = \ build_conversation(rec_list_2, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list, line) if(flag_comma == 0): rec = util.splice_find(line, ">", ", ", 1) conversations, nick_receiver, send_time = \ conv_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, dateadd, conversations, conn_comp_list, line) return conversations, nick_receiver, send_time
def message_time_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False):
    """
    creates a directed graph where each edge denotes a message sent from a
    user to another user with the stamp denoting the time at which the
    message was sent

    Args:
        log_dict (dictionary): Dictionary of logs data created using reader.py
        nicks(List) : List of nickname created using nickTracker.py
        nick_same_list(List) :List of same_nick names created using nickTracker.py
        DAY_BY_DAY_ANALYSIS (bool): if True return one graph per day,
            otherwise a single aggregate graph.

    Returns:
        msg_time_graph_list(List): List of message time graphs for different days
        msg_time_aggr_graph: aggregate message time graph where edges are
            date + time when sender sends a message to receiver
    """
    msg_time_graph_list = []
    msg_time_aggr_graph = nx.MultiDiGraph()
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))

    def compare_spliced_nick(nick_to_compare, spliced_nick, nick_name, line):
        # Adds an edge when the candidate receiver matches nick_name and the
        # sender is a different user.
        # NOTE(review): this closure reads nick_sender, year, month, day and
        # graph_conversation from the enclosing loop — they are assigned
        # later in the function body and resolved at call time.
        if(nick_to_compare == nick_name):
            if(spliced_nick != nick_name):
                nick_receiver = nick_receiver_from_conn_comp(nick_name, conn_comp_list)
                util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph)

    conn_comp_list = util.create_connected_nick_list(conn_comp_list)
    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            year, month, day = util.get_year_month_day(day_content)
            graph_conversation = nx.MultiDiGraph()  # graph with multiple directed edges between clients used
            for line in day_log:
                flag_comma = 0  # set when receivers were comma-separated
                if(util.check_if_msg_line (line)):
                    # sender nick is the text between < and >
                    m = re.search(r"\<(.*?)\>", line)
                    spliced_nick = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = ""
                    nick_sender = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, spliced_nick, conn_comp_list, nick_sender)
                    for nick_name in nicks:
                        rec_list = [e.strip() for e in line.split(':')]  # receiver list splited about :
                        rec_list = util.rec_list_splice(rec_list)
                        if not rec_list[2]:  # index 0 will contain time 14:02
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for nick_to_search in rec_list:
                            if(nick_to_search == nick_name):
                                if(spliced_nick != nick_name):
                                    nick_receiver = ""
                                    nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick_name, conn_comp_list, nick_receiver)
                                    util.build_graphs(nick_sender, nick_receiver, line[1:6], year, month, day, graph_conversation, msg_time_aggr_graph)
                        if "," in rec_list[2]:  # receiver list may of the form <Dhruv> Rohan, Ram :
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[2].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for nick_to_search in rec_list_2:
                                compare_spliced_nick(nick_to_search, spliced_nick, nick_name, line)
                        if(flag_comma == 0):  # receiver list can be <Dhruv> Rohan, Hi!
                            rec = line[line.find(">") + 1:line.find(", ")]
                            rec = util.correctLastCharCR(rec[1:])
                            compare_spliced_nick(rec, spliced_nick, nick_name, line)
            msg_time_graph_list.append(graph_conversation)
    if DAY_BY_DAY_ANALYSIS:
        return msg_time_graph_list
    else:
        return msg_time_aggr_graph
def response_time(log_dict, nicks, nick_same_list, cutoff_percentile):
    """
    finds the response time of a message i.e. the best guess for the time
    at which one can expect a reply for his/her message.

    Args:
        log_dict (str): Dictionary of logs data created using reader.py
        nicks(List) : List of nickname created using nickTracker.py
        nick_same_list :List of same_nick names created using nickTracker.py
        cutoff_percentile (int): Cutoff percentile indicating statistical significance

    Returns:
        rows_RT(zip List): Response Time (This refers to the response time of
        a message i.e. the best guess for the time at which one can expect
        a reply for his/her message)
    """
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    util.create_connected_nick_list(conn_comp_list)
    graph_cumulative = []  # all inter-message gaps (minutes), across all days
    graph_x_axis = []
    graph_y_axis = []

    def build_mean_list(conversations, index, mean_list):
        # Appends every timing entry (slots 2..end) of one conversation
        # record onto mean_list and returns it.
        for j in range(2, len(conversations[index])):
            mean_list.append(conversations[index][j])
        return mean_list

    def resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list):
        # Records the timestamp of `line` (read from the enclosing loop)
        # against the sender/receiver pair's conversation slot.
        # Each slot has the shape [sender, receiver, t1, t2, ...].
        if(rec == nick):
            send_time.append(line[1:6])
            if(nick_to_search != nick):
                nick_receiver = util.get_nick_sen_rec(len(nicks), nick, conn_comp_list, nick_receiver)
                for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                    if (nick_sender in conversations[i] and nick_receiver in conversations[i]):
                        conversations[i].append(line[1:6])
                        break
                    if(len(conversations[i]) == 0):
                        # first empty slot: start a new conversation record
                        conversations[i].append(nick_sender)
                        conversations[i].append(nick_receiver)
                        conversations[i].append(line[1:6])
                        break
        return conversations, nick_receiver, send_time

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            send_time = []  # list of all the times a user sends a message to another user
            meanstd_list = []
            totalmeanstd_list = []
            x_axis = []
            y_axis = []
            real_y_axis = []
            conversations = [[] for i in range(config.MAX_RESPONSE_CONVERSATIONS)]
            # code for making relation map between clients
            for line in day_log:
                flag_comma = 0
                if(util.check_if_msg_line (line)):
                    nick_sender = ""
                    nick_receiver = ""
                    # sender nick is the text between < and >
                    m = re.search(r"\<(.*?)\>", line)
                    nick_to_search = util.correctLastCharCR(m.group(0)[1:-1])
                    nick_sender = util.get_nick_sen_rec(len(nicks), nick_to_search, conn_comp_list, nick_sender)
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        util.rec_list_splice(rec_list)
                        if not rec_list[2]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        for name in rec_list:
                            conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)
                        if "," in rec_list[2]:
                            # comma-separated receiver list
                            flag_comma = 1
                            rec_list_2 = [e.strip() for e in rec_list[2].split(',')]
                            rec_list_2 = util.correct_last_char_list(rec_list_2)
                            for name in rec_list_2:
                                conversations, nick_receiver, send_time = resp_helper(name, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)
                        if(flag_comma == 0):
                            rec = util.splice_find(line, ">", ", ",1)
                            conversations, nick_receiver, send_time = resp_helper(rec, nick, send_time, nick_to_search, nick_receiver, nick_sender, conversations, conn_comp_list)
            # Convert stored "HH:MM" stamps into successive gaps in minutes.
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    for j in range(2, len(conversations[i]) - 1):
                        conversations[i][j]=(int(conversations[i][j+1][0:2])*config.MINS_PER_HOUR+int(conversations[i][j+1][3:5])) - (int(conversations[i][j][0:2])*config.MINS_PER_HOUR+int(conversations[i][j][3:5]))
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    if(len(conversations[i]) == 3):
                        # single-message conversation: keep absolute minutes
                        conversations[i][2] = int(conversations[i][2][0:2])*config.MINS_PER_HOUR+int(conversations[i][2][3:5])
                    else:
                        # drop the trailing raw timestamp left by the gap pass
                        del conversations[i][-1]
            # Explanation provided in parser-CL+CRT.py
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    totalmeanstd_list = build_mean_list(conversations, i, totalmeanstd_list)
            if(len(totalmeanstd_list) != 0):
                for i in range(max(totalmeanstd_list) + 1):
                    x_axis.append(i)
                for i in x_axis:
                    y_axis.append(float(totalmeanstd_list.count(i)) / float(len(totalmeanstd_list)))  # finding the probability of each RT to occur=No. of occurence/total occurences.
                real_y_axis.append(y_axis[0])
                for i in range(len(y_axis)):
                    real_y_axis.append(float(real_y_axis[i-1]) + float(y_axis[i]))  # to find cumulative just go on adding the current value to previously cumulated value till sum becomes 1 for last entry.
                for i in range(len(totalmeanstd_list)):
                    graph_cumulative.append(totalmeanstd_list[i])
            if len(totalmeanstd_list) > 0:
                # append mean and mean+2*std as summary entries
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list))
                totalmeanstd_list.append(numpy.mean(totalmeanstd_list)+2*numpy.std(totalmeanstd_list))
            for i in range(config.MAX_RESPONSE_CONVERSATIONS):
                if(len(conversations[i]) != 0):
                    meanstd_list = build_mean_list(conversations, i, meanstd_list)
                    conversations[i].append(numpy.mean(meanstd_list))
                    conversations[i].append(numpy.mean(meanstd_list)+(2*numpy.std(meanstd_list)))
                    meanstd_list[:] = []  # reuse the same list object, emptied in place
    graph_cumulative.sort()
    truncated_rt = None
    rt_cutoff_time = None
    if graph_cumulative:
        for i in range(graph_cumulative[len(graph_cumulative)-1] + 1):
            graph_y_axis.append(graph_cumulative.count(i))  # problem when ti=0 count is unexpectedly large
            graph_x_axis.append(i)
        # Finally storing the RT values along with their frequencies in a csv file; no need to invoke build_stat_dist() function
        rows_rt = zip(graph_x_axis, graph_y_axis)
        truncated_rt, rt_cutoff_time = truncate_table(rows_rt, cutoff_percentile)
        if config.CUTOFF_TIME_STRATEGY == "TWO_SIGMA":
            resp_time, resp_frequency_tuple = zip(*truncated_rt)
            resp_frequency = list(resp_frequency_tuple)
            rt_cutoff_time_frac = numpy.mean(resp_frequency) + 2*numpy.std(resp_frequency)
            rt_cutoff_time = int(numpy.ceil(rt_cutoff_time_frac))
        elif config.CUTOFF_TIME_STRATEGY == "PERCENTILE":
            # nothing further to do; truncate_table() already gives rt_cutoff_time
            # based on percentile
            pass
    return truncated_rt, rt_cutoff_time
def message_number_graph(log_dict, nicks, nick_same_list, DAY_BY_DAY_ANALYSIS=False):
    """
    Creates a directed graph with each node representing an IRC user and
    each directed edge has a weight which mentions the number messages sent
    and recieved by that user in the selected time frame.

    Args:
        log_dict (dict): with key as dateTime.date object and value as
            {"data":datalist,"channel_name":channels name}
        nicks(list): list of all the nicks
        nick_same_list(list): list of lists mentioning nicks which belong to same users
        DAY_BY_DAY_ANALYSIS (bool): if True return one graph per day,
            otherwise a single aggregate graph.

    Returns:
        message_number_graph (nx graph object)
    """
    message_number_day_list = []
    conversations=[[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)]
    aggregate_message_number_graph = nx.DiGraph()  # graph with multiple directed edges between clients used
    G = util.to_graph(nick_same_list)
    conn_comp_list = list(connected_components(G))
    conn_comp_list = util.create_connected_nick_list(conn_comp_list)

    def msg_no_analysis_helper(rec_list, nick_sender, nick, conn_comp_list,conversations,today_conversation):
        # Counts a message from nick_sender to each matching receiver,
        # writing into today_conversation (day-by-day) or conversations
        # (aggregate) via util.extend_conversation_list.
        for receiver in rec_list:
            if(receiver == nick):
                if(nick_sender != nick):
                    nick_receiver = ''
                    nick_receiver = util.get_nick_sen_rec(config.MAX_EXPECTED_DIFF_NICKS, nick, conn_comp_list, nick_receiver)
                    if DAY_BY_DAY_ANALYSIS:
                        today_conversation = util.extend_conversation_list(nick_sender, nick_receiver, today_conversation)
                    else:
                        conversations = util.extend_conversation_list(nick_sender, nick_receiver, conversations)

    def message_no_add_egde(message_graph, conversation):
        # Turns conversation records [count, sender, receiver] into weighted
        # edges, filtering by the configured weight and nick-length thresholds.
        for index in xrange(config.MAX_EXPECTED_DIFF_NICKS):
            if(len(conversation[index]) == 3 and conversation[index][0] >= config.THRESHOLD_MESSAGE_NUMBER_GRAPH):
                if len(conversation[index][1]) >= config.MINIMUM_NICK_LENGTH and len(conversation[index][2]) >= config.MINIMUM_NICK_LENGTH:
                    message_graph.add_edge(util.get_nick_representative(nicks, nick_same_list, conversation[index][1]), util.get_nick_representative(nicks, nick_same_list, conversation[index][2]), weight=conversation[index][0])
        return message_graph

    for day_content_all_channels in log_dict.values():
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            today_conversation = [[0] for i in range(config.MAX_EXPECTED_DIFF_NICKS)]
            for line in day_log:
                flag_comma = 0  # set when receivers were comma-separated
                if(util.check_if_msg_line (line)):
                    # sender nick is the text between < and >
                    parsed_nick = re.search(r"\<(.*?)\>", line)
                    nick_sender = util.correctLastCharCR(parsed_nick.group(0)[1:-1])
                    nick_receiver = ""
                    for nick in nicks:
                        rec_list = [e.strip() for e in line.split(':')]
                        rec_list = util.rec_list_splice(rec_list)
                        if not rec_list[2]:
                            break
                        rec_list = util.correct_last_char_list(rec_list)
                        msg_no_analysis_helper(rec_list, nick_sender, nick, conn_comp_list, conversations,today_conversation)
                        # NOTE(review): comma check uses rec_list[1] here while the
                        # sibling functions use rec_list[2] — confirm intentional.
                        if "," in rec_list[1]:
                            flag_comma = 1
                            rec_list_2=[e.strip() for e in rec_list[1].split(',')]
                            for i in xrange(0,len(rec_list_2)):
                                if(rec_list_2[i]):
                                    rec_list_2[i] = util.correctLastCharCR(rec_list_2[i])
                            msg_no_analysis_helper(rec_list_2, nick_sender, nick, conn_comp_list, conversations, today_conversation)
                        if(flag_comma == 0):
                            rec = line[line.find(">")+1:line.find(", ")]
                            rec = rec[1:]
                            rec = util.correctLastCharCR(rec)
                            if(rec == nick):
                                if(nick_sender != nick):
                                    # NOTE(review): nick_receiver is computed but no
                                    # conversation entry is recorded in this branch —
                                    # looks like a dropped extend_conversation_list call.
                                    nick_receiver = nick_receiver_from_conn_comp(nick, conn_comp_list)
            if DAY_BY_DAY_ANALYSIS:
                today_message_number_graph = nx.DiGraph()
                today_message_number_graph = message_no_add_egde(today_message_number_graph, today_conversation)
                year, month, day = util.get_year_month_day(day_content)
                message_number_day_list.append([today_message_number_graph, year+'-'+month+'-'+day])
    print "\nBuilding graph object with EDGE WEIGHT THRESHOLD:", config.THRESHOLD_MESSAGE_NUMBER_GRAPH
    if not DAY_BY_DAY_ANALYSIS:
        aggregate_message_number_graph = message_no_add_egde(aggregate_message_number_graph, conversations)
    if config.DEBUGGER:
        print "========> 30 on " + str(len(conversations)) + " conversations"
        print conversations[:30]
    if DAY_BY_DAY_ANALYSIS:
        return message_number_day_list
    else:
        return aggregate_message_number_graph
def nick_tracker(log_dict, track_users_on_channels = False):
    """
    Tracks all nicks and the identifies nicks which point to same user

    Args:
        log_dict(dictionary): with key as dateTime.date object and value as
            {"data":datalist,"channel_name":channels name}
        track_users_on_channels (bool): when True also build per-channel
            user presence structures.

    Returns:
        nicks(list): all nicks
        nick_same_list(list): list of lists with each list corresponding to
            nicks of same user
        (plus channels_for_user, nick_channel_dict, nicks_hash, channels_hash
        when track_users_on_channels is True)
    """
    nicks = []  # list of all the nicknames
    nick_same_list = [[] for i in range(config.MAX_EXPECTED_DIFF_NICKS)]
    nick_channel_dict = []
    channels_for_user = []
    nicks_hash = []
    channels_hash = []

    # Getting all the nicknames in a list
    def nick_append(nick, nicks, nicks_today_on_this_channel, track_users_on_channels):
        if track_users_on_channels and (nick not in nicks_today_on_this_channel):
            nicks_today_on_this_channel.append(nick)  # not nicks as there are same nicks spread across multiple channels
            nicks.append(nick)
        elif nick not in nicks:
            nicks.append(nick)
        return nicks, nicks_today_on_this_channel

    for day_content_all_channels in log_dict.values():
        # traverse over data of different channels for that day
        channels_for_user_day = {}  # empty for next day usage
        for day_content in day_content_all_channels:
            day_log = day_content["log_data"]
            channel_name = day_content["auxiliary_data"]["channel"]
            nicks_today_on_this_channel = []
            for i in day_log:
                # use regex to get the string between <> and appended it to the nicks list
                if(util.check_if_msg_line (i)):
                    m = re.search(r"\<(.*?)\>", i)
                    nick = util.correctLastCharCR(m.group(0)[1:-1])
                    nicks, nicks_today_on_this_channel = nick_append(nick, nicks, nicks_today_on_this_channel, track_users_on_channels)
            '''
                Forming list of lists for avoiding nickname duplicacy
            '''
            for line in day_log:
                if("Nick change:" in line):
                    old_nick = line.split()[3]
                    new_nick = line.split()[5]
                    nicks, nicks_today_on_this_channel = nick_append(old_nick, nicks, nicks_today_on_this_channel, track_users_on_channels)
                    nicks, nicks_today_on_this_channel = nick_append(new_nick, nicks, nicks_today_on_this_channel, track_users_on_channels)
                    # nicks.append(new_nick)
                    # Place both nicks in the first group that already holds
                    # either of them, otherwise in the first empty group.
                    for i in range(config.MAX_EXPECTED_DIFF_NICKS):
                        if old_nick in nick_same_list[i] or new_nick in nick_same_list[i]:
                            if old_nick not in nick_same_list[i]:
                                nick_same_list[i].append(old_nick)
                            if new_nick not in nick_same_list[i]:
                                nick_same_list[i].append(new_nick)
                            break
                        if not nick_same_list[i]:
                            if old_nick not in nick_same_list[i]:
                                nick_same_list[i].append(old_nick)
                            if new_nick not in nick_same_list[i]:
                                nick_same_list[i].append(new_nick)
                            break
            if track_users_on_channels:
                '''
                    Creating list of dictionaries nick_channel_dict of the format :
                    [{'nickname':'rohan', 'channels':['[#abc', 0],['#bcd', 0]]},{}]
                '''
                considered_nicks = []
                if config.DEBUGGER:
                    print "Analysis on", (str(day_content["auxiliary_data"]["day"]) + "-" + str(day_content["auxiliary_data"]["month"])), channel_name
                for user in nicks_today_on_this_channel:
                    # Resolve the user's canonical nick (first entry of the
                    # same-nick group, or the nick itself if ungrouped).
                    f = 1
                    for nick_tuple in nick_same_list:
                        if user in nick_tuple:
                            user_nick = nick_tuple[0]
                            f = 0
                            break
                    if f:
                        user_nick = user
                    '''for channels of user on a day'''
                    # NOTE(review): when the channel is already recorded the else
                    # branch resets the list to [channel_name] — confirm intended.
                    if channels_for_user_day.has_key(user_nick) and channel_name not in channels_for_user_day[user_nick]:
                        channels_for_user_day[user_nick].append(channel_name)
                    else:
                        channels_for_user_day[user_nick] = [channel_name]
                    flag = 1
                    for dictionary in nick_channel_dict:
                        if dictionary['nickname'] == user_nick and user_nick not in considered_nicks:
                            index = searchChannel(channel_name, dictionary['channels'])
                            if index == -1:
                                dictionary['channels'].append([channel_name,1])
                            else:
                                dictionary['channels'][index][1]+=1
                            flag = 0
                            considered_nicks.append(user_nick)
                            break
                    if flag:
                        nick_channel_dict.append({'nickname':user_nick, 'channels': [[channel_name, 1]]})
                        considered_nicks.append(user_nick)
        channels_for_user.append(channels_for_user_day)
    # Any nick not yet grouped gets its own singleton group.
    for nick in nicks:
        for index in range(config.MAX_EXPECTED_DIFF_NICKS):
            if nick in nick_same_list[index]:
                break
            if not nick_same_list[index]:
                nick_same_list[index].append(nick)
                break
    if config.DEBUGGER:
        print "========> 30 on " + str(len(nicks)) + " nicks"
        print nicks[:30]
        print "========> 30 on " + str(len(nick_same_list)) + " nick_same_list"
        print nick_same_list[:30]
    if not track_users_on_channels:
        return [nicks, nick_same_list]
    else:
        # Flatten per-user channel data into deduplicated hash lists.
        for dicts in nick_channel_dict:
            nick = dicts['nickname']
            if nick not in nicks_hash:
                nicks_hash.append(nick)
            for channel in dicts['channels']:
                if channel[0] not in channels_hash:
                    channels_hash.append(channel[0])
        return [nicks, nick_same_list, channels_for_user, nick_channel_dict, nicks_hash, channels_hash]
def test_check_if_msg_line(self, line, line2):
    """check_if_msg_line accepts `line` and rejects `line2`."""
    is_message = util.check_if_msg_line(line)
    is_not_message = util.check_if_msg_line(line2)
    self.assertTrue(is_message)
    self.assertFalse(is_not_message)
def test_slack_check_if_msg_line(self):
    """Slack-style timestamped lines are recognized as message lines."""
    samples = [
        '[01:02:33] <shonudo> thanks ;',
        '[01:00:49] <Skywise> is it new years yet?',
    ]
    for sample in samples:
        self.assertTrue(util.check_if_msg_line(sample))
def nick_tracker(log_dict):
    """
    Tracks all nicks and the identifies nicks which point to same user

    Args:
        log_dict(dictionary): with key as dateTime.date object and value as
            {"data":datalist,"channel_name":channels name}

    Returns:
        nicks(list): all nicks
        nick_same_list(list): list of lists with each list corresponding to
            nicks of same user
    """
    nicks = []  # list of all the nicknames
    nick_same_list = [[] for i in xrange(config.MAX_EXPECTED_DIFF_NICKS)]

    # Getting all the nicknames in a list
    def nick_append(nick, nicks):
        # Appends nick only if it was not seen before.
        if nick not in nicks:
            nicks.append(nick)
        return nicks

    for day_content_all_channels in log_dict.values():
        # traverse over data of different channels for that day
        for day_content in day_content_all_channels:
            day_logs = day_content["log_data"]
            for day_log in day_logs:
                # use regex to get the string between <> and appended it to the nicks list
                if (util.check_if_msg_line(day_log)):
                    m = re.search(r"\<(.*?)\>", day_log)
                    nick = util.correctLastCharCR(m.group(0)[1:-1])
                    nicks = nick_append(nick, nicks)
            '''
                Forming list of lists for avoiding nickname duplicacy
            '''
            for line in day_logs:
                if ("Nick change:" in line):
                    old_nick = line.split()[3]
                    new_nick = line.split()[5]
                    nicks = nick_append(old_nick, nicks)
                    nicks = nick_append(new_nick, nicks)
                    # Place both nicks in the first group that already holds
                    # either of them, otherwise in the first empty group.
                    for i in xrange(config.MAX_EXPECTED_DIFF_NICKS):
                        if old_nick in nick_same_list[
                                i] or new_nick in nick_same_list[i]:
                            if old_nick not in nick_same_list[i]:
                                nick_same_list[i].append(old_nick)
                            if new_nick not in nick_same_list[i]:
                                nick_same_list[i].append(new_nick)
                            break
                        if not nick_same_list[i]:
                            nick_same_list[i].append(old_nick)
                            nick_same_list[i].append(new_nick)
                            break
    # Any nick not yet grouped gets its own singleton group.
    for nick in nicks:
        for index in xrange(config.MAX_EXPECTED_DIFF_NICKS):
            if nick in nick_same_list[index]:
                break
            if not nick_same_list[index]:
                nick_same_list[index].append(nick)
                break
    if config.DEBUGGER:
        print "========> 30 on {} nicks".format(len(nicks))
        print nicks[:30]
        print "========> 30 on {} nick_same_list".format(len(nick_same_list))
        print nick_same_list[:30]
    return [nicks, nick_same_list]