Example #1
def generate_edge_list():
    """
    This function generates the lists of nodes and edges in the graph from the JSON file and saves them as CSV files.
    Edges are derived from both the References and the In-Reply-To attributes of each mail header.
    """
    # The "nodes" set stores each mail's UID, sender, and time as a semicolon-separated string.
    nodes = set()
    edges = set()
    with open('clean_data.json', 'r') as fil:
        for chunk in lines_per_n(fil, 9):
            jfile = json.loads(chunk)
            msg_id = jfile['Message-ID']
            msg_time = jfile['Time']
            msg_from = "".join(jfile['From'].split())
            nodes.add(str(msg_id) + ";" + msg_from + ";" + msg_time)
            if jfile['References']:
                ref_list = str(jfile['References']).split(',')
                # The Message-ID of the parent mail is the last element in the list of references.
                parent_id = int(ref_list[-1])
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
            if jfile['In-Reply-To']:
                parent_id = int(jfile['In-Reply-To'])
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
    with open('graph_nodes.csv', 'w') as node_file:
        for node_str in nodes:
            node_file.write(node_str + "\n")
    with open('graph_edges.csv', 'w') as edge_file:
        for parent_id, msg_id in edges:
            edge_file.write(str(parent_id) + ';' + str(msg_id) + "\n")
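Every example in this collection reads the headers file through lines_per_n from util.read_utils, which is not shown here. A minimal sketch of what it presumably does, assuming each header object is dumped with indent=1 and therefore occupies exactly nine lines (seven attributes plus the enclosing braces):

from itertools import chain, islice

def lines_per_n(f, n):
    # Yield the contents of the file object f in chunks of n lines each.
    for line in f:
        yield ''.join(chain([line], islice(f, n - 1)))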
Example #2
def generate_edge_list(nodelist_filename='graph_nodes.csv', edgelist_filename='graph_edges.csv', json_filename='clean_data.json'):
    """
    This function generates a list of nodes and edges in the graphs from the JSON file and saves it as a CSV file.
    :param ref_toggle: If True, References attribute is used to make edges and if False, In-Reply-To is used.
    """
    # The following set stores all the mail UIDs and the corresponding time as a semi-colon separated string
    nodes = set()
    edges = set()
    with open(json_filename, 'r') as fil:
        for chunk in lines_per_n(fil, 9):
            jfile = json.loads(chunk)
            msg_id = jfile['Message-ID']
            msg_time = jfile['Time']
            msg_from = "".join(jfile['From'].split())
            nodes.add(str(msg_id) + ";" + msg_from + ";" + msg_time)
            if jfile['References']:
                ref_list = str(jfile['References']).split(',')
                # Message Id of the parent mail is appended to the end of the list of references.
                parent_id = int(ref_list[-1])
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
            if jfile['In-Reply-To']:
                parent_id = int(jfile['In-Reply-To'])
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
    with open(nodelist_filename, 'w') as node_file:
        for node_str in nodes:
            node_file.write(node_str + "\n")
    with open(edgelist_filename, 'w') as edge_file:
        for parent_id, msg_id in edges:
            edge_file.write(str(parent_id) + ';' + str(msg_id) + "\n")
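For illustration, each row of graph_nodes.csv is a Message-ID;From;Time triple and each row of graph_edges.csv is a Parent-ID;Message-ID pair. Hypothetical rows (all values made up):

graph_nodes.csv: 5141;alice@example.com;Fri, 07 Aug 2015 09:12:31 +0530
graph_edges.csv: 5141;5162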
Example #3
def write_author_uid_map():
    """Generate and write to a JSON file a mapping of each author's email address to a unique integer UID."""
    index = 0
    author_set = set()
    author_uid_map = dict()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')

    with open('clean_data.json', 'r') as json_file:
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
            from_addr = email_re.search(json_obj['From'])
            author_set.add(from_addr.group(0) if from_addr is not None else json_obj['From'])
            author_set |= set(email_re.findall(json_obj['To']))
            if json_obj['Cc'] is not None:
                author_set |= set(email_re.findall(json_obj['Cc']))
            # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
    print("JSON data loaded.")

    for address in author_set:
        author_uid_map[address] = index
        index += 1

    with open("author_uid_map.json", 'w') as map_file:
        json.dump(author_uid_map, map_file, indent=1)
    print("UID map written to author_uid_map.json.")
Example #4
def write_author_uid_map():
    """

    This function is used to generate and write to a JSON file the mapping of authors to a unique integer identifier.
    Authors are identified through a regular expression search for their email addresses. The integer identifiers
    generated are used in other modules like the generation and statistical analysis of hyperedges.
    """
    index = 0
    author_set = set()
    author_uid_map = dict()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')

    with open('clean_data.json', 'r') as json_file:
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
            from_addr = email_re.search(json_obj['From'])
            author_set.add(from_addr.group(0) if from_addr is not None else json_obj['From'])
            author_set |= set(email_re.findall(json_obj['To']))
            if json_obj['Cc'] is not None:
                author_set |= set(email_re.findall(json_obj['Cc']))
            # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
    print("JSON data loaded.")

    for address in author_set:
        author_uid_map[address] = index
        index += 1

    with open("author_uid_map.json", 'w') as map_file:
        json.dump(author_uid_map, map_file, indent=1)
    print("UID map written to author_uid_map.json.")
Example #5
def remove_duplicate_headers(to_remove=duplicate_uid):
    """
    This function removes duplicate entries from the JSON file, keeping only the first occurrence of each Message-ID.
    The to_remove parameter only gates whether deduplication runs at all; its contents are not otherwise used.
    :param to_remove: A list of UIDs of duplicate mails. Deduplication is skipped if this list is empty.
    """
    # The "read_uid" set is used to keep track of all the UIDs that have been read from the JSON file.
    # In case a duplicate exists, it would be read twice and hence would fail the set membership test.
    read_uid = set([])

    if len(to_remove) > 0:
        print("Removing duplicate headers...")
        # This list contains a list of JSON objects that need to be written to file
        write_to_file = []

        with open('headers.json', 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                if json_obj['Message-ID'] not in read_uid:
                    write_to_file.append(json_obj)
                read_uid.add(json_obj['Message-ID'])

        with open('headers.json', 'w') as json_file:
            for json_obj in write_to_file:
                json.dump(json_obj, json_file, indent=1)
                json_file.write("\n")
Example #6
def remove_unwanted_headers(to_remove=unwanted_uid):
    """
    This function removes all the UIDs specified in the to_remove parameter. By default, it removes all the unwanted
    entries in the JSON file, i.e. the list of UIDs of mails that are not forwarded from LKML subscription.
    :param to_remove: A list of UIDs that need to be removed. Default value is the list of unwanted mails' UIDs
    """
    if len(to_remove) > 0:
        print("Removing unwanted headers...")
        # This list contains a list of JSON objects that need to be written to file
        write_to_file = []

        with open('headers.json', 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                if json_obj['Message-ID'] not in to_remove:
                    write_to_file.append(json_obj)

        with open('headers.json', 'w') as json_file:
            for json_obj in write_to_file:
                json.dump(json_obj, json_file, indent=1)
                json_file.write("\n")
Example #7
def replace_invalid_headers(to_replace=invalid_uid):
    """
    This function removes the mail headers that have insufficient attributes and fetches those headers again.
    If an attribute is missing in the original mail header or if the mail has been deleted, this function ignores that UID.
    :param to_replace: A list of UIDs that need to be replaced. Default value is the list of invalid mails' UIDs.
    """
    if len(to_replace) > 0:
        print("Replacing invalid headers...")
        # This list contains a list of JSON objects that need to be written to file
        write_to_file = []
        with open('headers.json', 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                if json_obj['Message-ID'] not in to_replace:
                    write_to_file.append(json_obj)

        with open('headers.json', 'w') as json_file:
            for json_obj in write_to_file:
                json.dump(json_obj, json_file, indent=1)
                json_file.write("\n")

        add_missing_headers(to_replace)
Example #8
def remove_invalid_references(ref_toggle=False):
    """
    This function writes to clean_data.json only those mails whose references are fully available, dropping mails
    whose reference list contains a '0' (a reference to a mail we do not have) or that reference such mails.
    :param ref_toggle: If True, the References attribute is used to build the reference list; if False, In-Reply-To is used.
    """
    # The "unspecified_ref" list is used to keep track of all those mails that have '0' in their reference list.
    # If any mail has any of the elements of this list in its own list of references, we can eliminate it as well.
    unspecified_ref = ['0']

    print("Removing headers associated with invalid references...")

    with open('headers.json', 'r') as fil:
        with open("clean_data.json", mode='w', encoding='utf-8') as fin_file :

            for chunk in lines_per_n(fil, 9):
                # The "jfile" is used to store the json object read from the file.
                jfile = json.loads(chunk)

                """
                Mails that have references that are of type None indicate that they maybe the start of threads.
                Anything else could be mail in a thread or something else.
                """
                if jfile['References'] is not None:
                    # Checking if the references is an empty string
                    if not jfile['References'] == "":
                        # The references are stored as a comma separated string. We have to split it at the ',' to get a list.
                        if ref_toggle:
                            ref_list = jfile['References'].split(',')
                        else:
                            if jfile['In-Reply-To'] is not None:
                                ref_list = [str(jfile['In-Reply-To'])]
                            else:
                                ref_list = None
                        # A '0' in the list indicates that the mail contains references to some other mail which is not available to us
                        if ref_list is None or '0' not in ref_list:
                            data = {}
                            data['Message-ID'] = jfile['Message-ID']
                            data['From'] = jfile['From']
                            data['To'] = jfile['To']
                            data['Cc'] = jfile['Cc']
                            data['In-Reply-To'] = jfile['In-Reply-To']
                            data['References'] = jfile['References']
                            data['Time'] = jfile['Time']
                            contain_unspec_ref = False

                            # This is done to eliminate all those mails whose reference list contains mails that have '0' in their reference list
                            for ref in (ref_list or []):
                                if ref in unspecified_ref:
                                    contain_unspec_ref = True
                            if not contain_unspec_ref:
                                json.dump(data, fin_file, indent=1)
                                fin_file.write('\n')
                        else:
                            unspecified_ref.append(str(jfile['Message-ID']))

                # Writing all those mails that have None as their References
                else:
                    data = {}
                    data['Message-ID'] = jfile['Message-ID']
                    data['From'] = jfile['From']
                    data['To'] = jfile['To']
                    data['Cc'] = jfile['Cc']
                    data['In-Reply-To'] = jfile['In-Reply-To']
                    data['References'] = jfile['References']
                    data['Time'] = str(jfile['Time'])
                    json.dump(data, fin_file, indent=1)
                    fin_file.write('\n')

Example #9
def generate_hyperedges():
    """
    This function builds the discussion graph from the node and edge lists and writes a CSV table of author
    interactions ('F' = sender, 'T' = To-recipient, 'C' = Cc-recipient) for the thread rooted at message 5141.
    """
    discussion_graph = nx.DiGraph()
    json_data = dict()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')

    with open("graph_nodes.csv", "r") as node_file:
        for pair in node_file:
            node = pair.split(';', 2)
            discussion_graph.add_node(node[0],
                                      time=node[2].strip(),
                                      sender=node[1].strip())
    print("Nodes added.")

    with open("graph_edges.csv", "r") as edge_file:
        for pair in edge_file:
            edge = pair.split(';')
            edge[1] = edge[1].strip()
            try:
                discussion_graph.node[edge[0]]['sender']
                discussion_graph.node[edge[1]]['sender']
                discussion_graph.add_edge(*edge)
            except KeyError:
                pass
    print("Edges added.")

    with open('clean_data.json', 'r') as json_file:
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
            from_addr = email_re.search(json_obj['From'])
            json_obj['From'] = from_addr.group(
                0) if from_addr is not None else json_obj['From']
            json_obj['To'] = set(email_re.findall(json_obj['To']))
            json_obj['Cc'] = set(email_re.findall(
                json_obj['Cc'])) if json_obj['Cc'] is not None else None
            # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
            json_data[json_obj['Message-ID']] = json_obj

    with open('author_uid_map.json', 'r') as uid_file:
        author_uid = json.load(uid_file)
    print("JSON data loaded.")

    for conn_subgraph in nx.weakly_connected_component_subgraphs(
            discussion_graph):
        origin_node = min(int(x) for x in conn_subgraph.nodes())
        # Only the thread rooted at message 5141 is processed in this example.
        if origin_node != 5141:
            continue
        thread_nodes = list()
        thread_authors = set()
        add_thread_nodes(thread_authors, [origin_node], None, 0, json_data,
                         thread_nodes, conn_subgraph)
        thread_authors = list(thread_authors)
        thread_nodes.sort()

        index = 1
        author_interaction_matrix = [[' ' for x in range(len(thread_authors))]
                                     for y in range(1 + len(thread_nodes))]
        for message_node in thread_nodes:
            # print(len(thread_authors), len(thread_nodes), thread_authors.index(message_node.from_addr), index)
            for to_addr in message_node.to_addr:
                author_interaction_matrix[index][thread_authors.index(
                    to_addr)] = 'T'
            for cc_addr in message_node.cc_addr:
                author_interaction_matrix[index][thread_authors.index(
                    cc_addr)] = 'C'
            author_interaction_matrix[index][thread_authors.index(
                message_node.from_addr)] = 'F'
            index += 1

        index = 0
        # author_enumeration = dict()
        for author in thread_authors:
            author_interaction_matrix[0][index] = "author-" + str(
                author_uid[author])
            index += 1
            # author_enumeration[author] = "author-" + str(author_uid[author])

        indegree = [0 for x in range(len(thread_authors))]
        outdegree = [0 for x in range(len(thread_authors))]
        for i in range(1, len(thread_nodes) + 1):
            for j in range(len(thread_authors)):
                if author_interaction_matrix[i][j] in ('T', 'C'):
                    indegree[j] += 1
                elif author_interaction_matrix[i][j] == 'F':
                    outdegree[j] += 1

        thread_authors = [
            x for (y, x) in sorted(zip(outdegree, thread_authors),
                                   key=lambda pair: pair[0],
                                   reverse=True)
        ]
        indegree = [
            x for (y, x) in sorted(zip(outdegree, indegree),
                                   key=lambda pair: pair[0],
                                   reverse=True)
        ]
        author_interaction_matrix = map(list, zip(*author_interaction_matrix))
        author_interaction_matrix = [
            x for (y, x) in sorted(zip(outdegree, author_interaction_matrix),
                                   key=lambda pair: pair[0],
                                   reverse=True)
        ]
        author_interaction_matrix = list(
            map(list, zip(*author_interaction_matrix)))
        outdegree.sort(reverse=True)

        index = 1
        prev_height = -1
        total_cc = row_cc = 0
        total_to = row_to = 0
        with open("hyperedge/" + str(origin_node) + ".csv",
                  'w') as hyperedge_file:
            tablewriter = csv.writer(hyperedge_file)
            tablewriter.writerow(
                ["Height", "Message-ID", "Parent-ID", "Time"] +
                author_interaction_matrix[0] + ["No. of CCs", "No. of TOs"])
            for message_node in thread_nodes:
                curr_height = " " if message_node.height == prev_height else message_node.height
                parent_id = message_node.parent_id if message_node.parent_id else "None"
                row_cc = author_interaction_matrix[index].count('C')
                row_to = author_interaction_matrix[index].count('T')
                total_cc += row_cc
                total_to += row_to
                tablewriter.writerow([
                    curr_height, message_node.msg_id, parent_id,
                    message_node.time
                ] + author_interaction_matrix[index] + [row_cc, row_to])
                prev_height = message_node.height
                index += 1
            tablewriter.writerow([" ", " ", " ", "Outdegree"] + outdegree +
                                 ["Total CCs", "Total TOs"])
            tablewriter.writerow([" ", " ", " ", "Indegree"] + indegree +
                                 [total_cc, total_to])
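In the CSV written above, each row describes one message: 'F' marks the sender's column, 'T' the To-recipients, and 'C' the Cc-recipients, with author columns sorted by descending outdegree. Assuming a hyperedge/ directory already exists (the function does not create it), a hypothetical two-message thread would serialize roughly as:

Height,Message-ID,Parent-ID,Time,author-7,author-3,No. of CCs,No. of TOs
0,5141,None,<time>,F,T,0,1
1,5162,5141,<time>,T,F,0,1
 , , ,Outdegree,1,1,Total CCs,Total TOs
 , , ,Indegree,1,1,0,2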
Example #10
def generate_hyperedge_distribution(nodelist_filename,
                                    edgelist_filename,
                                    clean_headers_filename,
                                    foldername,
                                    time_limit=None,
                                    ignore_lat=False):
    """

    :param ignore_lat: If true, then messages that belong to threads that have only a single author are ignored.
    :param time_limit: Time limit can be specified here in the form of a timestamp in one of the identifiable formats
            and all messages that have arrived after this timestamp will be ignored.
    """
    if time_limit is None:
        time_limit = time.strftime("%a, %d %b %Y %H:%M:%S %z")
    msgs_before_time = set()
    time_limit = get_datetime_object(time_limit)
    print("All messages before", time_limit, "are being considered.")

    discussion_graph = nx.DiGraph()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    json_data = dict()
    # Author participation denotes the number of threads an author is active in. This is a dictionary keyed
    # by the author's email id with the value equalling the number of threads in which the author has sent a mail.
    author_participation = dict()

    # Add nodes into NetworkX graph by reading from CSV file
    if not ignore_lat:
        with open(nodelist_filename, "r") as node_file:
            for pair in node_file:
                node = pair.split(';')
                if get_datetime_object(node[2].strip()) < time_limit:
                    node[0] = int(node[0])
                    msgs_before_time.add(node[0])
                    from_addr = email_re.search(node[1].strip())
                    from_addr = from_addr.group(
                        0) if from_addr is not None else node[1].strip()
                    discussion_graph.add_node(node[0],
                                              time=node[2].strip(),
                                              color="#ffffff",
                                              style='bold',
                                              sender=from_addr)
        print("Nodes added.")

        # Add edges into NetworkX graph by reading from CSV file
        with open(edgelist_filename, "r") as edge_file:
            for pair in edge_file:
                edge = pair.split(';')
                edge[0] = int(edge[0])
                edge[1] = int(edge[1])
                if edge[0] in msgs_before_time and edge[1] in msgs_before_time:
                    discussion_graph.add_edge(*edge)
        print("Edges added.")

    else:
        lone_author_threads = get_lone_author_threads(
            save_file=None,
            nodelist_filename=nodelist_filename,
            edgelist_filename=edgelist_filename)
        # Add nodes into NetworkX graph only if they are not a part of a thread that has only a single author
        with open(nodelist_filename, "r") as node_file:
            for pair in node_file:
                node = pair.split(';')
                node[0] = int(node[0])
                if get_datetime_object(node[2].strip()) < time_limit \
                        and node[0] not in lone_author_threads:
                    msgs_before_time.add(node[0])
                    from_addr = email_re.search(node[1].strip())
                    from_addr = from_addr.group(
                        0) if from_addr is not None else node[1].strip()
                    discussion_graph.add_node(node[0],
                                              time=node[2].strip(),
                                              color="#ffffff",
                                              style='bold',
                                              sender=from_addr)
        print("Nodes added.")

        # Add edges into NetworkX graph only if they are not a part of a thread that has only a single author
        with open(edgelist_filename, "r") as edge_file:
            for pair in edge_file:
                edge = pair.split(';')
                edge[0] = int(edge[0])
                edge[1] = int(edge[1])
                if edge[0] not in lone_author_threads and edge[1] not in lone_author_threads:
                    if edge[0] in msgs_before_time and edge[1] in msgs_before_time:
                        discussion_graph.add_edge(*edge)
        print("Edges added.")

    with open(clean_headers_filename, 'r') as json_file:
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
            from_addr = email_re.search(json_obj['From'])
            json_obj['From'] = from_addr.group(
                0) if from_addr is not None else json_obj['From']
            author_participation[json_obj['From']] = 0
            json_obj['To'] = set(email_re.findall(json_obj['To']))
            json_obj['Cc'] = set(email_re.findall(
                json_obj['Cc'])) if json_obj['Cc'] is not None else None
            for email_id in json_obj['To']:
                author_participation[email_id] = 0
            if json_obj['Cc'] is not None:
                for email_id in json_obj['Cc']:
                    author_participation[email_id] = 0
            json_data[json_obj['Message-ID']] = json_obj
    print("JSON data loaded.")

    # The index of the hyperedge_dist list contains the number of vertices receiving the hyperedge and the value stored
    # at the index corresponds to the frequency or the number of observations.
    hyperedge_dist = [0 for x in range(1000)]
    max_len = -1
    for conn_subgraph in nx.weakly_connected_component_subgraphs(
            discussion_graph):
        authors_active = set()
        for msg_id in conn_subgraph.nodes():
            msg_attr = json_data[msg_id]
            if msg_attr['From'] not in authors_active:
                author_participation[msg_attr['From']] += 1
                authors_active.add(msg_attr['From'])
            if msg_attr['Cc'] is not None:
                curr_len = len(msg_attr['Cc']) + len(msg_attr['To'])
            else:
                curr_len = len(msg_attr['To'])
            hyperedge_dist[curr_len] += 1
            if curr_len > max_len:
                max_len = curr_len

    with open(foldername + "/tables/hyperedge_distribution.csv",
              'w') as hyperedge_dist_file:
        hyperedge_dist_file.write(
            "No. of Vertices Receiving Hyperedge,Frequency\n")
        for index in range(1, 1000):
            hyperedge_dist_file.write(
                str(index) + "," + str(hyperedge_dist[index]) + "\n")
            if index == max_len:
                break
    print("Hyperedge distribution statistic written to file.")

    plt.clf()
    plt.plot(range(1, max_len + 1), hyperedge_dist[1:max_len + 1])
    plt.savefig(foldername + "/plots/hyperedge_distribution.png")

    with open(foldername + "/tables/author_thread_participation.csv",
              'w') as author_participation_file:
        author_participation_file.write(
            "Author Email ID,Number of Active Threads\n")
        for author_id, num_threads in author_participation.items():
            author_participation_file.write(author_id + "," +
                                            str(num_threads) + "\n")
    print("Author-Thread Participation statistic written to file.")

    plt.clf()
    data = [
        num_threads for author_id, num_threads in author_participation.items()
    ]
    plt.hist(data, bins=50)
    plt.savefig(foldername + "/plots/author_thread_participation.png")
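A plausible invocation, assuming the output folder already contains tables/ and plots/ subdirectories (the function writes into both but does not create them):

generate_hyperedge_distribution('graph_nodes.csv', 'graph_edges.csv', 'clean_data.json', 'output', time_limit=None, ignore_lat=False)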
Example #11
def check_validity(check_unavailable_uid=False):
    """
    This function checks for and prints duplicate, missing, and invalid objects in the "headers.json" file.
    It can be run first to generate the lists of duplicate, missing, and invalid objects' UIDs, which can then
    be used to add or remove their entries from the JSON file.
    :param check_unavailable_uid: If True, the UIDs returned by get_unavailable_uid are also computed and printed.
    :return: Last UID that was checked by the function.
    """
    previous_uid = 0

    # The "read_uid" set is used to keep track of all the UIDs that have been read from the JSON file.
    # In case a duplicate exists, it would be read twice and hence would fail the set membership test.
    read_uid = set([])

    # This variable contains the last UID that was checked. This variable is returned by the function.
    last_valid_uid = 0

    header_attrib = {'Message-ID', 'From', 'To', 'Cc', 'In-Reply-To', 'Time'}

    # UIDs of mails that are not forwarded from the LKML subscription are stored in a text file.

    with open('headers.json', 'r') as json_file:

        for chunk in lines_per_n(json_file, 9):
            try:
                json_obj = json.loads(chunk)
            except ValueError:
                print("Unreadable JSON object after UID: " + str(previous_uid))
                break

            # Checking for duplicate objects
            if json_obj['Message-ID'] not in read_uid:
                read_uid.add(json_obj['Message-ID'])
            else:
                duplicate_uid.add(json_obj['Message-ID'])

            # Check if the JSON object has sufficient attributes by checking if "header_attrib" is a subset of its keys
            if not set(header_attrib) <= json_obj.keys() or json_obj['Time'] is None:
                invalid_uid.add(json_obj['Message-ID'])

            # Check if it is a mail that is sent directly to "*****@*****.**", in which case it has not been
            # forwarded from the LKML subscription.
            if json_obj['To'] == "*****@*****.**":
                unwanted_uid.add(json_obj['Message-ID'])

            previous_uid = json_obj['Message-ID']

    # Calculate the missing UIDs by performing a set difference on all the UIDs possible till the highest UID read
    # from the actual UIDs that have been read.
    if previous_uid != 0:
        global last_uid_read
        last_uid_read = max(read_uid)
        global missing_uid
        missing_uid = set(range(min(read_uid), last_uid_read + 1)) - read_uid

    if check_unavailable_uid:
        global unavailable_uid
        unavailable_uid = get_unavailable_uid()
        print("Unavailable UIDs: ", unavailable_uid if len(unavailable_uid) > 0 else "None")
        with open("unwanted_uid.txt", 'a') as unw_file:
            for uid in unwanted_uid:
                unw_file.write(str(uid) + '\n')
        print("Unwanted UIDs: ", unwanted_uid if len(unwanted_uid) > 0 else "None")

    print("Duplicate UIDs: ", duplicate_uid if len(duplicate_uid) > 0 else "None")
    print("Missing UIDs: ", missing_uid if len(missing_uid) > 0 else "None")
    print("Invalid UIDs: ", invalid_uid if len(invalid_uid) > 0 else "None")
    return last_uid_read
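A plausible cleaning pipeline built from the helpers in these examples, assuming the module-level sets (duplicate_uid, unwanted_uid, invalid_uid, missing_uid) start out empty:

last_uid = check_validity()
remove_duplicate_headers()
remove_unwanted_headers()
replace_invalid_headers()
remove_invalid_references(ref_toggle=True)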
Example #12
import json
from util.read_utils import lines_per_n
import community
import networkx as nx

author_graph = nx.DiGraph()
with open('clean_data.json', 'r') as jfile:
    for chunk in lines_per_n(jfile, 9):
        hdr_data = json.loads(chunk)
        for to_addr in str(hdr_data['To']).split(","):
            if '@' in to_addr:
                author_graph.add_edge(str(hdr_data['From']), to_addr.strip(), style='solid', label=hdr_data['Time'])
        for cc_addr in str(hdr_data['Cc']).split(","):
            if '@' in to_addr:
                author_graph.add_edge(str(hdr_data['From']), cc_addr.strip(), style='dashed', label=hdr_data['Time'])

print("No. of Weakly Connected Components:", nx.number_weakly_connected_components(author_graph))
print("No. of Strongly Connected Components:", nx.number_strongly_connected_components(author_graph))
print("Nodes:", nx.number_of_nodes(author_graph))
print("Edges:", nx.number_of_edges(author_graph))

# The following lines of code generate a dendrogram for the above graph.
dendo = community.generate_dendogram(author_graph.to_undirected())
for level in range(len(dendo)):
    print("Partition at level", level, "is", community.partition_at_level(dendo, level))
    print("-"*10)
Example #13
leaf_msgs = []  # List of message ids that are currently leaf nodes
msg_ref_map = {}  # Maps the message id of each mail to its references list


# Function to eliminate the non-leaf message-ids from the list of leaf message ids.
def get_current_leaf_nodes(list1, list2):
    s = set(list2)
    list3 = [msg_id for msg_id in list1 if str(msg_id) not in s]
    return list3

"""
For each json object read, we add the message id into the list leaf_msgs and create an entry of the particular id in the map
msg_ref_map. We then check if any non-leaf message ids are present in the list leaf_msgs by calling the function get_current_leaf_nodes.
After going through the entire file, we then print the leaf message-ids and their references. This is stored in the file thread_paths.txt
"""
with open('clean_data.json', 'r') as fil:
    for chunk in lines_per_n(fil, 9):

        jfile = json.loads(chunk)

        leaf_msgs.append(jfile['Message-ID'])
        msg_ref_map[jfile['Message-ID']] = str(jfile['References'])

        if jfile['References'] is not None:
            leaf_msgs = get_current_leaf_nodes(leaf_msgs, jfile['References'].split(','))


with open('graph_leaf_nodes.csv', 'w') as csv_file:
    for msg_id in leaf_msgs:
        csv_file.write("{0};{1}\n".format(msg_id, msg_ref_map[msg_id]))
with open("graph_edges.csv", "r") as edge_file:
    for pair in edge_file:
        edge = pair.split(';')
        edge[1] = edge[1].strip()
        try:
            discussion_graph.node[edge[0]]['sender']
            discussion_graph.node[edge[1]]['sender']
            discussion_graph.add_edge(*edge)
        except KeyError:
            pass
print("Edges added.")

with open('clean_data.json', 'r') as json_file:
    for chunk in lines_per_n(json_file, 9):
        json_obj = json.loads(chunk)
        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
        from_addr = email_re.search(json_obj['From'])
        json_obj['From'] = from_addr.group(
            0) if from_addr is not None else json_obj['From']
        json_obj['To'] = set(email_re.findall(json_obj['To']))
        json_obj['Cc'] = set(email_re.findall(
            json_obj['Cc'])) if json_obj['Cc'] is not None else None
        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
        json_data[json_obj['Message-ID']] = json_obj

with open('author_uid_map.json', 'r') as uid_file:
    author_uid = json.load(uid_file)
print("JSON data loaded.")
Example #15
discussion_graph = nx.DiGraph()
json_data = dict()
email_re = re.compile(r'[\w\.-]+@[\w\.-]+')

with open("graph_nodes.csv", "r") as node_file:
    for pair in node_file:
        node = pair.split(';', 2)
        discussion_graph.add_node(node[0], time=node[2].strip(), sender=node[1].strip())
print("Nodes added.")

with open("graph_edges.csv", "r") as edge_file:
    for pair in edge_file:
        edge = pair.split(';')
        edge[1] = edge[1].strip()
        try:
            discussion_graph.node[edge[0]]['sender']
            discussion_graph.node[edge[1]]['sender']
            discussion_graph.add_edge(*edge)
        except KeyError:
            pass
print("Edges added.")

with open('headers.json', 'r') as json_file:
    for chunk in lines_per_n(json_file, 9):
        json_obj = json.loads(chunk)
        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
        from_addr = email_re.search(json_obj['From'])
        json_obj['From'] = from_addr.group(0) if from_addr is not None else json_obj['From']
        json_obj['To'] = set(email_re.findall(json_obj['To']))
        json_obj['Cc'] = set(email_re.findall(json_obj['Cc'])) if json_obj['Cc'] is not None else None
        # print("\nFrom", json_obj['From'], "\nTo", json_obj['To'], "\nCc", json_obj['Cc'])
        json_data[json_obj['Message-ID']] = json_obj
print("JSON data loaded.")
# author_interaction_weighted_graph and author_interaction_multigraph are presumably defined elsewhere in the codebase.
author_interaction_weighted_graph(discussion_graph, json_data, limit=20)
author_interaction_multigraph(discussion_graph, json_data, limit=20)

Example #16

leaf_msgs = []  # List of message ids that are currently leaf nodes
msg_ref_map = {}  # Maps the message id of each mail to its references list


# Function to eliminate the non-leaf message-ids from the list of leaf message ids.
def get_current_leaf_nodes(list1, list2):
    s = set(list2)
    list3 = [msg_id for msg_id in list1 if str(msg_id) not in s]
    return list3


"""
For each json object read, we add the message id into the list leaf_msgs and create an entry of the particular id in the map
msg_ref_map. We then check if any non-leaf message ids are present in the list leaf_msgs by calling the function get_current_leaf_nodes.
After going through the entire file, we then print the leaf message-ids and their references. This is stored in the file thread_paths.txt
"""
with open('clean_data.json', 'r') as fil:
    for chunk in lines_per_n(fil, 9):

        jfile = json.loads(chunk)

        leaf_msgs.append(jfile['Message-ID'])
        msg_ref_map[jfile['Message-ID']] = str(jfile['References'])

        if jfile['References'] is not None:
            leaf_msgs = get_current_leaf_nodes(leaf_msgs,
                                               jfile['References'].split(','))


with open('graph_leaf_nodes.csv', 'w') as csv_file:
    for msg_id in leaf_msgs:
        csv_file.write("{0};{1}\n".format(msg_id, msg_ref_map[msg_id]))
Example #17
def remove_invalid_references(input_json_filename,
                              output_json_filename,
                              ref_toggle=False):
    """
    This function writes to the output JSON file only those mails whose references are fully available, dropping
    mails whose reference list contains a '0' (a reference to a mail we do not have) or that reference such mails.
    :param input_json_filename: JSON file from which the mail headers are read.
    :param output_json_filename: JSON file to which the filtered headers are written.
    :param ref_toggle: If True, the References attribute is used to build the reference list; if False, In-Reply-To is used.
    """
    # The "unspecified_ref" list is used to keep track of all those mails that have '0' in their reference list.
    # If any mail has any of the element in this list in its list of references, we can eliminate them as well
    unspecified_ref = ['0']

    print("Removing headers associated with invalid references...")

    with open(input_json_filename, 'r') as fil:
        with open(output_json_filename, mode='w',
                  encoding='utf-8') as fin_file:

            for chunk in lines_per_n(fil, 9):
                # The "jfile" is used to store the json object read from the file.
                jfile = json.loads(chunk)
                """
                Mails that have references that are of type None indicate that they maybe the start of threads.
                Anything else could be mail in a thread or something else.
                """
                if jfile['References'] is not None:
                    # Checking if the references is an empty string
                    if not jfile['References'] == "":
                        # The references are stored as a comma separated string. We have to split it at the ',' to get a list.
                        if ref_toggle:
                            ref_list = jfile['References'].split(',')
                        else:
                            if jfile['In-Reply-To'] is not None:
                                ref_list = [str(jfile['In-Reply-To'])]
                            else:
                                ref_list = None
                        # A '0' in the list indicates that the mail contains references to some other mail which is not available to us
                        if ref_list is None or '0' not in ref_list:
                            data = {}
                            data['Message-ID'] = jfile['Message-ID']
                            data['From'] = jfile['From']
                            data['To'] = jfile['To']
                            data['Cc'] = jfile['Cc']
                            data['In-Reply-To'] = jfile['In-Reply-To']
                            data['References'] = jfile['References']
                            data['Time'] = jfile['Time']
                            contain_unspec_ref = False

                            # This is done to eliminate all those mails whose reference list contains mails that have '0' in their reference list
                            for ref in (ref_list or []):
                                if ref in unspecified_ref:
                                    contain_unspec_ref = True
                            if not contain_unspec_ref:
                                json.dump(data, fin_file, indent=1)
                                fin_file.write('\n')
                        else:
                            unspecified_ref.append(str(jfile['Message-ID']))

                # Writing all those mails that have None as their References
                else:
                    data = {}
                    data['Message-ID'] = jfile['Message-ID']
                    data['From'] = jfile['From']
                    data['To'] = jfile['To']
                    data['Cc'] = jfile['Cc']
                    data['In-Reply-To'] = jfile['In-Reply-To']
                    data['References'] = jfile['References']
                    data['Time'] = str(jfile['Time'])
                    json.dump(data, fin_file, indent=1)
                    fin_file.write('\n')

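A plausible invocation of the parameterized version above, matching the filenames used elsewhere in these examples:

remove_invalid_references('headers.json', 'clean_data.json', ref_toggle=True)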