Пример #1
0
def generate_node_labels(nodelist_filename='graph_nodes.txt', edgelist_filename='graph_edges.txt', json_filename='clean_data.json'):
    """
    Generate the node and edge lists of the mail-thread graph from the JSON
    file and save them as TXT files.

    :param nodelist_filename: txt file to store the graph nodes.
    :param edgelist_filename: txt file to store the graph edges.
    :param json_filename: The JSON file containing the cleaned headers.
    """
    nodes = set()
    edges = set()
    with open(json_filename, 'r') as fil:
        # Each header is stored as a 9-line JSON object.
        for chunk in lines_per_n(fil, 9):
            jfile = json.loads(chunk)
            msg_id = jfile['Message-ID']
            nodes.add(str(msg_id) + ",")
            if jfile['References']:
                ref_list = str(jfile['References']).split(',')
                # Message-ID of the parent mail is appended to the end of the list of references.
                parent_id = int(ref_list[-1])
                # A reply has a higher Message-ID than its parent; this also drops self-loops.
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
            if jfile['In-Reply-To']:
                # BUG FIX: convert to int so the comparison with msg_id is numeric,
                # consistent with the References branch above (a str < int
                # comparison raises TypeError in Python 3).
                parent_id = int(jfile['In-Reply-To'])
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
    with open(nodelist_filename, 'w') as node_file:
        for node_str in nodes:
            node_file.write(node_str + "\n")
    with open(edgelist_filename, 'w') as edge_file:
        for parent_id, msg_id in edges:
            edge_file.write(str(parent_id) + '\t' + str(msg_id) + "\n")
Пример #2
0
def remove_duplicate_headers(to_remove=duplicate_uid,
                             json_header_filename='headers.json'):
    """
    Remove duplicate header entries from the JSON file, keeping the first
    occurrence of each Message-ID.

    NOTE(review): ``to_remove`` is only used to decide whether any cleanup is
    needed at all (``len(to_remove) > 0``); the de-duplication itself keeps
    the first occurrence of *every* Message-ID, not just the UIDs listed in
    ``to_remove``. Confirm this is intentional.

    :param to_remove: A list of UIDs that need to be removed. Default value is the list of duplicate mails' UIDs.
    :param json_header_filename: The header file from which duplicate entries are removed.
    """
    # The "read_uid" set is used to keep track of all the UIDs that have been read from the JSON file.
    # In case a duplicate exists, it would be read twice and hence would fail the set membership test.
    read_uid = set([])

    if len(to_remove) > 0:
        print("Removing duplicate headers...")
        # This list contains a list of JSON objects that need to be written to file
        write_to_file = []

        with open(json_header_filename, 'r') as json_file:
            # Each header occupies 9 lines in the JSON file.
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                # Keep only the first occurrence of each Message-ID.
                if not json_obj['Message-ID'] in read_uid:
                    write_to_file.append(json_obj)
                read_uid.add(json_obj['Message-ID'])

        # Rewrite the header file in place with the de-duplicated objects.
        with open(json_header_filename, 'w') as json_file:
            for json_obj in write_to_file:
                json.dump(json_obj, json_file, indent=1)
                json_file.write("\n")
Пример #3
0
def get_uid_map(input_file='clean_data.json',output_file="author_uid_map.json"):
    """
    Generate and write to a JSON file the mapping of authors to a unique
    integer identifier. Authors are identified through a regular expression
    search for their email addresses. The integer identifiers generated are
    used in other modules like the generation and statistical analysis of
    hyperedges.

    :param input_file: Path of the JSON file containing the data under analysis (default= 'clean_data.json')
    :param output_file: Path of the JSON file to which the generated map is written (default= "author_uid_map.json")
    :return: A dict mapping each author's email address to its integer UID.
    """
    author_set = set()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')

    with open(input_file, 'r') as json_file:
        # Each header occupies 9 lines in the JSON file.
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            # Fall back to the raw 'From' field if no email address is found in it.
            from_addr = email_re.search(json_obj['From'])
            author_set.add(from_addr.group(0) if from_addr is not None else json_obj['From'])
            author_set |= set(email_re.findall(json_obj['To']))
            if json_obj['Cc'] is not None:
                author_set |= set(email_re.findall(json_obj['Cc']))
    print("JSON data loaded.")

    # Assign consecutive integer identifiers to the collected addresses.
    author_uid_map = {address: index for index, address in enumerate(author_set)}

    # The 'with' block closes the file; the original's explicit close() inside
    # the block was redundant and has been dropped.
    with open(output_file, 'w') as map_file:
        json.dump(author_uid_map, map_file, indent=1)
    print("UID map written to "+output_file+".")
    return author_uid_map
	def remove_unwanted_headers(self, to_remove=unwanted_uid, json_headers="headers.json", output_file="headers.json"):
		"""
		Remove unwanted header entries (mails not forwarded from the LKML
		subscription) from the JSON header file.

		NOTE(review): the membership test below filters on ``self.unwanted_uid``,
		not on the ``to_remove`` parameter; ``to_remove`` only gates whether the
		cleanup runs at all. Confirm this is intentional.

		:param to_remove: A list of UIDs that need to be removed. Default value is the list of unwanted mails' UIDs.
		:param json_headers: The header file from which unwanted entries are to be removed.
		:param output_file: The updated json file with the unwanted headers removed.
		"""
		
		if len(to_remove) > 0:
			print("Removing unwanted headers...")
			# This list contains a list of JSON objects that need to be written to file
			write_to_file = []

			with open(json_headers, 'r') as json_file:
				# Each header occupies 9 lines in the JSON file.
				for chunk in lines_per_n(json_file, 9):
					json_obj = json.loads(chunk)
					# Keep only headers whose Message-ID is not flagged as unwanted.
					if not json_obj['Message-ID'] in self.unwanted_uid:
						write_to_file.append(json_obj)

			with open(output_file, 'w') as json_file:
				for json_obj in write_to_file:
					json.dump(json_obj, json_file, indent=1)
					json_file.write("\n")
Пример #5
0
def get_leaf_nodes(src_file, dest_file):
    """
    Compute the message-ids of leaf nodes in the thread graph.

    :param src_file: Source file containing message-ids.
    :param dest_file: Destination file(csv) to which message-ids of leaf nodes be stored.
    :return: List of message-ids of leaf nodes
    """
    leaf_msgs = []  # Keeps track of all those message ids that are leaf nodes
    msg_ref_map = {}  # Map between message id of each mail to its references list

    with open(src_file, 'r') as fil:
        # Each header occupies 9 lines in the JSON file; the 'with' block
        # closes the file (the original's explicit close() calls were redundant).
        for chunk in lines_per_n(fil, 9):

            jfile = json.loads(chunk)

            # Every mail starts as a leaf; referenced mails are pruned below.
            leaf_msgs.append(jfile['Message-ID'])
            msg_ref_map[jfile['Message-ID']] = str(jfile['References'])

            # Any mail listed in this mail's references is, by definition, not a leaf.
            if jfile['References'] is not None:
                leaf_msgs = get_current_leaf_nodes(leaf_msgs, jfile['References'].split(','))

    with open(dest_file, 'w') as csv_file:
        for msg_id in leaf_msgs:
            csv_file.write("{0};{1}\n".format(msg_id, msg_ref_map[msg_id]))
    return leaf_msgs
Пример #6
0
def replace_invalid_headers(to_replace=invalid_uid,
                            json_header_filename="headers.json"):
    """
    Remove the mail headers that have insufficient attributes and fetch those
    headers again. If an attribute is missing in the original mail header or
    if the mail has been deleted, that UID is ignored.

    :param to_replace: A list of UIDs that need to be replaced. Default value is the list of invalid mails' UIDs.
    :param json_header_filename: The json file containing the headers.
    """
    if len(to_replace) > 0:
        print("Replacing invalid headers...")
        # This list contains the JSON objects that need to be written back to file.
        write_to_file = []
        with open(json_header_filename, 'r') as json_file:
            # Each header occupies 9 lines in the JSON file.
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                # BUG FIX: filter on the 'to_replace' parameter (as documented)
                # rather than the module-level 'invalid_uid'; behaviour is
                # unchanged when called with the default argument.
                if not json_obj['Message-ID'] in to_replace:
                    write_to_file.append(json_obj)

        with open(json_header_filename, 'w') as json_file:
            for json_obj in write_to_file:
                json.dump(json_obj, json_file, indent=1)
                json_file.write("\n")

        # Re-fetch the headers that were dropped above.
        add_missing_headers(to_replace)
Пример #7
0
def get_leaf_nodes(src_file, dest_file):
    """
    Compute the message-ids of leaf nodes in the thread graph.

    :param src_file: Source file containing message-ids.
    :param dest_file: Destination file(csv) to which message-ids of leaf nodes be stored.
    :return: List of message-ids of leaf nodes
    """
    leaf_msgs = []  # Keeps track of all those message ids that are leaf nodes
    msg_ref_map = {}  # Map between message id of each mail to its references list

    with open(src_file, 'r') as fil:
        # Each header occupies 9 lines in the JSON file; the 'with' block
        # closes the file (the original's explicit close() calls were redundant).
        for chunk in lines_per_n(fil, 9):

            jfile = json.loads(chunk)

            # Every mail starts as a leaf; referenced mails are pruned below.
            leaf_msgs.append(jfile['Message-ID'])
            msg_ref_map[jfile['Message-ID']] = str(jfile['References'])

            # Any mail listed in this mail's references is, by definition, not a leaf.
            if jfile['References'] is not None:
                leaf_msgs = get_current_leaf_nodes(
                    leaf_msgs, jfile['References'].split(','))

    with open(dest_file, 'w') as csv_file:
        for msg_id in leaf_msgs:
            csv_file.write("{0};{1}\n".format(msg_id, msg_ref_map[msg_id]))
    return leaf_msgs
Пример #8
0
def get_leaf_nodes(write_to_file=True):
    """
    Compute the message-ids of leaf nodes in the thread graph.

    :param write_to_file: If true, writes a list of leaf nodes to graph_leaf_nodes.csv (default = True)
    :return: List of message-ids of leaf nodes
    """
    leaf_msgs = []  # Keeps track of all those message ids that are leaf nodes
    msg_ref_map = {}  # Map between message id of each mail to its references list

    with open('clean_data.json', 'r') as fil:
        # Each header occupies 9 lines in the JSON file; the 'with' block
        # closes the file (the original's explicit close() calls were redundant).
        for chunk in lines_per_n(fil, 9):

            jfile = json.loads(chunk)

            # Every mail starts as a leaf; referenced mails are pruned below.
            leaf_msgs.append(jfile['Message-ID'])
            msg_ref_map[jfile['Message-ID']] = str(jfile['References'])

            # Any mail listed in this mail's references is, by definition, not a leaf.
            if jfile['References'] is not None:
                leaf_msgs = get_current_leaf_nodes(
                    leaf_msgs, jfile['References'].split(','))

    # BUG FIX: the original ignored the write_to_file flag and always wrote
    # the CSV; honour the flag as documented.
    if write_to_file:
        with open('graph_leaf_nodes.csv', 'w') as csv_file:
            for msg_id in leaf_msgs:
                csv_file.write("{0};{1}\n".format(msg_id, msg_ref_map[msg_id]))
    return leaf_msgs
Пример #9
0
def remove_unwanted_headers(to_remove=unwanted_uid,
                            json_header_filename='headers.json'):
    """
    Remove all the UIDs specified in the to_remove parameter. By default, it
    removes all the unwanted entries in the JSON file, i.e. the list of UIDs
    of mails that are not forwarded from LKML subscription.

    :param to_remove: A list of UIDs that need to be removed. Default value is the list of unwanted mails' UIDs.
    :param json_header_filename: The header file from which unwanted entries are removed.
    """
    if len(to_remove) > 0:
        print("Removing unwanted headers...")
        # This list contains the JSON objects that need to be written back to file.
        write_to_file = []

        with open(json_header_filename, 'r') as json_file:
            # Each header occupies 9 lines in the JSON file.
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
                # BUG FIX: filter on the 'to_remove' parameter (as documented)
                # rather than the module-level 'unwanted_uid'; behaviour is
                # unchanged when called with the default argument.
                if not json_obj['Message-ID'] in to_remove:
                    write_to_file.append(json_obj)

        with open(json_header_filename, 'w') as json_file:
            for json_obj in write_to_file:
                json.dump(json_obj, json_file, indent=1)
                json_file.write("\n")
Пример #10
0
	def replace_invalid_headers(self,to_replace=invalid_uid, json_headers="headers.json", output_file="headers.json", unwanted_uid_filename="unwanted_uid.txt", uid_map_filename="thread_uid_map.json"):
		"""
		Remove the mail headers that have insufficient attributes and fetch
		those headers again. If an attribute is missing in the original mail
		header or if the mail has been deleted, that UID is ignored.

		NOTE(review): the membership test below filters on ``self.invalid_uid``,
		not on the ``to_replace`` parameter; ``to_replace`` only gates whether
		the cleanup runs and is forwarded to add_missing_headers. Confirm this
		is intentional.

		:param to_replace: A list of UIDs that need to be replaced. Default value is the list of invalid mails' UIDs.
		:param json_headers: The json file containing the headers.
		:param output_file: The updated json file with the invalid headers replaced.
		:param unwanted_uid_filename: The file containing unwanted uids.
		:param uid_map_filename: The JSON file where the Message_ID-UID mapping is stored.
		"""
		if len(to_replace) > 0:
			print("Replacing invalid headers...")
			# This list contains a list of JSON objects that need to be written to file
			write_to_file = []
			with open(json_headers, 'r') as json_file:
				# Each header occupies 9 lines in the JSON file.
				for chunk in lines_per_n(json_file, 9):
					json_obj = json.loads(chunk)
					# Keep only headers that were not flagged invalid.
					if not json_obj['Message-ID'] in self.invalid_uid:
						write_to_file.append(json_obj)

			with open(output_file, 'w') as json_file:
				for json_obj in write_to_file:
					json.dump(json_obj, json_file, indent=1)
					json_file.write("\n")

			# Re-fetch the headers that were dropped above.
			self.add_missing_headers(to_replace, unwanted_uid_filename, uid_map_filename)
	def remove_duplicate_headers(self,to_remove=duplicate_uid, json_headers="headers.json", output_file="headers.json"):
		"""
		Remove duplicate header entries from the JSON file, keeping the first
		occurrence of each Message-ID.

		NOTE(review): ``to_remove`` is only used to decide whether any cleanup
		is needed (``len(to_remove) > 0``); the de-duplication itself keeps the
		first occurrence of *every* Message-ID, not just the UIDs listed in
		``to_remove``. Confirm this is intentional.

		:param to_remove: A list of UIDs that need to be removed. Default value is the list of duplicate mails' UIDs.
		:param json_headers: The header file from which duplicate entries are to be removed.
		:param output_file: The updated json file with the duplicate headers removed.
		"""
		# The "read_uid" set is used to keep track of all the UIDs that have been read from the JSON file.
		# In case a duplicate exists, it would be read twice and hence would fail the set membership test.
		read_uid = set([])

		if len(to_remove) > 0:
			print("Removing duplicate headers...")
			# This list contains a list of JSON objects that need to be written to file
			write_to_file = []

			with open(json_headers, 'r') as json_file:
				# Each header occupies 9 lines in the JSON file.
				for chunk in lines_per_n(json_file, 9):
					json_obj = json.loads(chunk)
					# Keep only the first occurrence of each Message-ID.
					if not json_obj['Message-ID'] in read_uid:
						write_to_file.append(json_obj)
					read_uid.add(json_obj['Message-ID'])

			with open(output_file, 'w') as json_file:
				for json_obj in write_to_file:
					json.dump(json_obj, json_file, indent=1)
					json_file.write("\n")
	def test_remove_unwanted_headers(self):
		"""Headers flagged as unwanted (Message-ID 3) must be absent from the output."""
		check_headers = CheckHeaders()
		check_headers.check_validity(False, self.headers_file)
		check_headers.remove_unwanted_headers(check_headers.unwanted_uid, self.headers_file, self.output_file)
		with open(self.output_file, 'r') as json_file:
			for chunk in lines_per_n(json_file, 9):
				assert json.loads(chunk)['Message-ID'] != 3
	def test_add_missing_headers(self,mock_function):
		"""add_missing_headers should trigger a fetch call when missing UIDs exist."""
		# mock_function presumably patches the IMAP connection; its uid() call
		# returns a canned (status, data) pair — TODO confirm against the patcher.
		mock_function.return_value.uid.return_value=(1,['5'])
		checkHeaders=CheckHeaders()		
		# NOTE(review): this loop only parses the output file; json_obj is never
		# used afterwards — presumably a smoke-check that the file is readable.
		with open(self.output_file, 'r') as json_file:
			for chunk in lines_per_n(json_file, 9):
				json_obj = json.loads(chunk)		
		checkHeaders.check_validity(False, self.headers_file)
		checkHeaders.add_missing_headers(checkHeaders.missing_uid,self.unwanted_uid_file,self.uid_map_file)	
		# Fetching is only expected when there actually are missing UIDs.
		if(checkHeaders.missing_uid):
			mock_function.assert_any_call()
Пример #14
0
    def test_remove_unwanted_headers(self):
        """Headers flagged as unwanted (Message-ID 3) must be absent from the output."""
        check_headers = CheckHeaders()
        check_headers.check_validity(False, self.headers_file)
        check_headers.remove_unwanted_headers(check_headers.unwanted_uid,
                                              self.headers_file,
                                              self.output_file)
        with open(self.output_file, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                assert json.loads(chunk)['Message-ID'] != 3
	def test_remove_duplicate_headers(self):
		"""After de-duplication, Message-ID 2 must occur exactly once."""
		check_headers = CheckHeaders()
		check_headers.check_validity(False, self.headers_file)
		check_headers.remove_duplicate_headers(check_headers.duplicate_uid, self.headers_file, self.output_file)
		occurrences = 0
		with open(self.output_file, 'r') as json_file:
			for chunk in lines_per_n(json_file, 9):
				if json.loads(chunk)['Message-ID'] == 2:
					occurrences += 1
		assert occurrences == 1
Пример #16
0
def test_remove_invalid_references():
    """Only the headers with Message-ID 500 or 510 survive the cleanup."""
    input_json_filename = './test/integration_test/data/headers_for_cleanup.json'
    output_json_filename = './.tmp/integration_test/lib/input/data_cleanup/clean_headers.json'

    remove_invalid_references(input_json_filename,
                              output_json_filename,
                              ref_toggle=True)

    with open(output_json_filename, 'r') as clean_headers:
        for chunk in lines_per_n(clean_headers, 9):
            jfile = json.loads(chunk)
            # BUG FIX: the original assertion was `x == 500 or 510`, which is
            # always truthy (`or 510` short-circuits to a non-zero constant);
            # test set membership instead.
            assert jfile['Message-ID'] in (500, 510)
Пример #17
0
    def test_add_missing_headers(self, mock_function):
        """add_missing_headers should trigger a fetch call when missing UIDs exist."""
        mock_function.return_value.uid.return_value = (1, ['5'])
        check_headers = CheckHeaders()
        # Parse the output file (smoke-check that it is readable JSON);
        # the parsed objects are not used afterwards, matching the original.
        with open(self.output_file, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                json_obj = json.loads(chunk)
        check_headers.check_validity(False, self.headers_file)
        check_headers.add_missing_headers(check_headers.missing_uid,
                                          self.unwanted_uid_file,
                                          self.uid_map_file)
        # Fetching is only expected when there actually are missing UIDs.
        if check_headers.missing_uid:
            mock_function.assert_any_call()
Пример #18
0
    def test_remove_duplicate_headers(self):
        """After de-duplication, Message-ID 2 must occur exactly once."""
        check_headers = CheckHeaders()
        check_headers.check_validity(False, self.headers_file)
        check_headers.remove_duplicate_headers(check_headers.duplicate_uid,
                                               self.headers_file,
                                               self.output_file)
        occurrences = 0
        with open(self.output_file, 'r') as json_file:
            for chunk in lines_per_n(json_file, 9):
                if json.loads(chunk)['Message-ID'] == 2:
                    occurrences += 1
        assert occurrences == 1
Пример #19
0
def get_uid_map(input_file='clean_data.json',
                output_file="author_uid_map.json"):
    """
    Generate and write to a JSON file the mapping of authors to a unique
    integer identifier. Authors are identified through a regular expression
    search for their email addresses. The integer identifiers generated are
    used in other modules like the generation and statistical analysis of
    hyperedges.

    :param input_file: Path of the JSON file containing the data under analysis (default= 'clean_data.json')
    :param output_file: Path of the JSON file to which the generated map is written (default= "author_uid_map.json")
    :return: A dict mapping each author's email address to its integer UID.
    """
    author_set = set()
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')

    with open(input_file, 'r') as json_file:
        # Each header occupies 9 lines in the JSON file.
        for chunk in lines_per_n(json_file, 9):
            json_obj = json.loads(chunk)
            # Fall back to the raw 'From' field if no email address is found in it.
            from_addr = email_re.search(json_obj['From'])
            author_set.add(
                from_addr.group(0) if from_addr is not None else json_obj['From'])
            author_set |= set(email_re.findall(json_obj['To']))
            if json_obj['Cc'] is not None:
                author_set |= set(email_re.findall(json_obj['Cc']))
    print("JSON data loaded.")

    # Assign consecutive integer identifiers to the collected addresses.
    author_uid_map = {address: index for index, address in enumerate(author_set)}

    # The 'with' block closes the file; the original's explicit close() inside
    # the block was redundant and has been dropped.
    with open(output_file, 'w') as map_file:
        json.dump(author_uid_map, map_file, indent=1)
    print("UID map written to " + output_file + ".")
    return author_uid_map
Пример #20
0
def generate_node_labels(nodelist_filename='graph_nodes.txt',
                         edgelist_filename='graph_edges.txt',
                         json_filename='clean_data.json'):
    """
    Generate the node and edge lists of the mail-thread graph from the JSON
    file and save them as TXT files.

    :param nodelist_filename: txt file to store the graph nodes.
    :param edgelist_filename: txt file to store the graph edges.
    :param json_filename: The JSON file containing the cleaned headers.
    """
    nodes = set()
    edges = set()
    with open(json_filename, 'r') as fil:
        # Each header is stored as a 9-line JSON object.
        for chunk in lines_per_n(fil, 9):
            jfile = json.loads(chunk)
            msg_id = jfile['Message-ID']
            nodes.add(str(msg_id) + ",")
            if jfile['References']:
                ref_list = str(jfile['References']).split(',')
                # Message-ID of the parent mail is appended to the end of the list of references.
                parent_id = int(ref_list[-1])
                # A reply has a higher Message-ID than its parent; this also drops self-loops.
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
            if jfile['In-Reply-To']:
                # BUG FIX: convert to int so the comparison with msg_id is numeric,
                # consistent with the References branch above (a str < int
                # comparison raises TypeError in Python 3).
                parent_id = int(jfile['In-Reply-To'])
                if parent_id and parent_id < msg_id:
                    edges.add((parent_id, msg_id))
    with open(nodelist_filename, 'w') as node_file:
        for node_str in nodes:
            node_file.write(node_str + "\n")
    with open(edgelist_filename, 'w') as edge_file:
        for parent_id, msg_id in edges:
            edge_file.write(str(parent_id) + '\t' + str(msg_id) + "\n")
	def check_validity(self, check_unavailable_uid=False, json_headers='headers.json'):
		"""
		Check for and print duplicate, missing, and invalid objects in the
		"headers.json" file. Run this first to generate the lists of duplicate,
		missing, or invalid UIDs which can then be used to add or remove their
		entries from the JSON file.

		:param check_unavailable_uid: If true, prints the unavailable and unwanted uids
		:param json_headers: The header file to be parsed
		:return: Last UID that was checked by the function.
		"""
		# BUG FIX: the default used to be the *string* 'False', which is truthy
		# and silently enabled the unavailable-UID check on every default call.
		previous_uid = 0

		# The "read_uid" set keeps track of all the UIDs read from the JSON file.
		# A duplicate would be read twice and hence fail the set membership test.
		read_uid = set([])

		# Attributes every valid header object must carry.
		header_attrib = {'Message-ID', 'From', 'To', 'Cc', 'In-Reply-To', 'Time'}

		with open(json_headers, 'r') as json_file:

			# Each header occupies 9 lines in the JSON file.
			for chunk in lines_per_n(json_file, 9):
				try:
					json_obj = json.loads(chunk)
				except ValueError:
					# Malformed chunk (json.loads raises ValueError/JSONDecodeError):
					# report where parsing stopped and bail out.
					print("Unreadable JSON object after UID: " + str(previous_uid))
					break

				# Checking for duplicate objects
				if not json_obj['Message-ID'] in read_uid:
					read_uid.add(json_obj['Message-ID'])
				else:
					self.duplicate_uid.add(json_obj['Message-ID'])

				# The object is invalid if "header_attrib" is not a subset of its
				# keys or its timestamp is missing.
				if not header_attrib <= json_obj.keys() or json_obj['Time'] is None:
					self.invalid_uid.add(json_obj['Message-ID'])

				# Mail sent directly to "*****@*****.**" has not been forwarded
				# from the LKML subscription and is therefore unwanted.
				if json_obj['To'] == "*****@*****.**":
					self.unwanted_uid.add(json_obj['Message-ID'])

				previous_uid = json_obj['Message-ID']

		# Calculate the missing UIDs by performing a set difference of all UIDs
		# possible up to the highest UID read against the UIDs actually read.
		if previous_uid != 0:
			self.last_uid_read = max(read_uid)
			self.missing_uid = set(range(min(read_uid), self.last_uid_read+1)) - read_uid

		if check_unavailable_uid:
			self.unavailable_uid = self.get_unavailable_uid()
			print("Unavailable UIDs: ", self.unavailable_uid if len(self.unavailable_uid) > 0 else "None")
			with open("unwanted_uid.txt", 'a') as unw_file:
				for uid in self.unwanted_uid:
					unw_file.write(str(uid) + '\n')
			print("Unwanted UIDs: ", self.unwanted_uid if len(self.unwanted_uid) > 0 else "None")

		print("Duplicate UIDs: ", self.duplicate_uid if len(self.duplicate_uid) > 0 else "None")
		print("Missing UIDs: ", self.missing_uid if len(self.missing_uid) > 0 else "None")
		print("Invalid UIDs: ", self.invalid_uid if len(self.invalid_uid) > 0 else "None")
		return self.last_uid_read
Пример #22
0
def check_validity(check_unavailable_uid=False,
                   json_header_filename='headers.json'):
    """
    Check for and print duplicate, missing, and invalid objects in the
    "headers.json" file. Run this first to generate the lists of duplicate,
    missing, or invalid UIDs which can then be used to add or remove their
    entries from the JSON file.

    :param check_unavailable_uid: If true, prints the unavailable and unwanted uids
    :param json_header_filename: The header file to be parsed
    :return: Last UID that was checked by the function.
    """
    # BUG FIX: the default used to be the *string* 'False', which is truthy and
    # silently enabled the unavailable-UID check on every default call.
    previous_uid = 0

    # The "read_uid" set keeps track of all the UIDs read from the JSON file.
    # A duplicate would be read twice and hence fail the set membership test.
    read_uid = set([])

    # Attributes every valid header object must carry.
    header_attrib = {'Message-ID', 'From', 'To', 'Cc', 'In-Reply-To', 'Time'}

    with open(json_header_filename, 'r') as json_file:

        # Each header occupies 9 lines in the JSON file.
        for chunk in lines_per_n(json_file, 9):
            try:
                json_obj = json.loads(chunk)
            except ValueError:
                # Malformed chunk (json.loads raises ValueError/JSONDecodeError):
                # report where parsing stopped and bail out.
                print("Unreadable JSON object after UID: " + str(previous_uid))
                break

            # Checking for duplicate objects
            if not json_obj['Message-ID'] in read_uid:
                read_uid.add(json_obj['Message-ID'])
            else:
                duplicate_uid.add(json_obj['Message-ID'])

            # The object is invalid if "header_attrib" is not a subset of its
            # keys or its timestamp is missing.
            if not header_attrib <= json_obj.keys() or json_obj['Time'] is None:
                invalid_uid.add(json_obj['Message-ID'])

            # Mail sent directly to "*****@*****.**" has not been forwarded
            # from the LKML subscription and is therefore unwanted.
            if json_obj['To'] == "*****@*****.**":
                unwanted_uid.add(json_obj['Message-ID'])

            previous_uid = json_obj['Message-ID']

    # Calculate the missing UIDs by performing a set difference of all UIDs
    # possible up to the highest UID read against the UIDs actually read.
    if previous_uid != 0:
        global last_uid_read
        last_uid_read = max(read_uid)
        global missing_uid
        missing_uid = set(range(min(read_uid), last_uid_read + 1)) - read_uid
        global unavailable_uid

    if check_unavailable_uid:
        unavailable_uid = get_unavailable_uid()
        print("Unavailable UIDs: ",
              unavailable_uid if len(unavailable_uid) > 0 else "None")
        with open("unwanted_uid.txt", 'a') as unw_file:
            for uid in unwanted_uid:
                unw_file.write(str(uid) + '\n')
        print("Unwanted UIDs: ",
              unwanted_uid if len(unwanted_uid) > 0 else "None")

    print("Duplicate UIDs: ",
          duplicate_uid if len(duplicate_uid) > 0 else "None")
    print("Missing UIDs: ", missing_uid if len(missing_uid) > 0 else "None")
    print("Invalid UIDs: ", invalid_uid if len(invalid_uid) > 0 else "None")
    return last_uid_read
Пример #23
0
def remove_invalid_references(input_json_filename, output_json_filename, ref_toggle=False):
    """
    Remove headers associated with invalid references.

    :param input_json_filename: The json file containing all the references.
    :param output_json_filename: The output json without invalid references.
    :param ref_toggle: If true, gets the reference list from the 'References'
        attribute; otherwise from 'In-Reply-To'.
    """

    # The "unspecified_ref" list keeps track of all those mails that have '0' in
    # their reference list. If any mail references an element of this list, it
    # can be eliminated as well.
    unspecified_ref = ['0']

    print("Removing headers associated with invalid references...")

    # The 'with' blocks close both files; the original's trailing explicit
    # close() calls were redundant and have been dropped.
    with open(input_json_filename, 'r') as fil:
        with open(output_json_filename, mode='w', encoding='utf-8') as fin_file:

            for chunk in lines_per_n(fil, 9):
                # The "jfile" is used to store the json object read from the file.
                jfile = json.loads(chunk)

                # Mails whose References is None may be the start of threads;
                # anything else could be a mail in a thread or something else.
                if jfile['References'] is not None:
                    # Checking if the references is an empty string
                    if not jfile['References'] == "":
                        # The references are stored as a comma separated string;
                        # split at ',' to get a list.
                        if ref_toggle:
                            ref_list = jfile['References'].split(',')
                        elif jfile['In-Reply-To'] is not None:
                            ref_list = [str(jfile['In-Reply-To'])]
                        else:
                            ref_list = None

                        # A '0' in the list indicates the mail references some
                        # other mail which is not available to us.
                        # BUG FIX: check `ref_list is None` *before* the `in`
                        # test — the original evaluated `'0' not in ref_list`
                        # first and raised TypeError whenever ref_list was None.
                        if ref_list is None or '0' not in ref_list:
                            data = {}
                            data['Message-ID'] = jfile['Message-ID']
                            data['From'] = jfile['From']
                            data['To'] = jfile['To']
                            data['Cc'] = jfile['Cc']
                            data['In-Reply-To'] = jfile['In-Reply-To']
                            data['References'] = jfile['References']
                            data['Time'] = jfile['Time']

                            # Eliminate all mails whose reference list contains
                            # a mail that itself had '0' in its reference list.
                            # (BUG FIX: guard against ref_list being None here too.)
                            contain_unspec_ref = False
                            for ref in (ref_list or []):
                                if ref in unspecified_ref:
                                    contain_unspec_ref = True
                            if not contain_unspec_ref:
                                json.dump(data, fin_file, indent=1)
                                fin_file.write('\n')
                        else:
                            unspecified_ref.append(str(jfile['Message-ID']))

                # Writing all those mails that have None as their References
                else:
                    data = {}
                    data['Message-ID'] = jfile['Message-ID']
                    data['From'] = jfile['From']
                    data['To'] = jfile['To']
                    data['Cc'] = jfile['Cc']
                    data['In-Reply-To'] = jfile['In-Reply-To']
                    data['References'] = jfile['References']
                    # NOTE(review): Time is stringified only in this branch in
                    # the original; preserved as-is pending confirmation.
                    data['Time'] = str(jfile['Time'])
                    json.dump(data, fin_file, indent=1)
                    fin_file.write('\n')