def get_references(current_file):
    """Collect the threading headers of the e-mail stored in *current_file*.

    Args:
        current_file: path of the message file; it is opened in binary mode
            and parsed with ``pyzmail.message_from_file``.

    Returns:
        A dict that may contain:
          * ``'message-id'``  -- the value of the Message-Id header, if any;
          * ``'references'``  -- the References header split into a list of
            whitespace-separated tokens, if the header is present.
        A missing Message-Id is logged as a warning and the key is omitted.
    """
    result = {}
    with open(current_file, 'rb') as fp:
        message = pyzmail.message_from_file(fp)

    # pyzmail messages build on email.message.Message, whose header lookup is
    # case-insensitive; a single get() therefore matches 'Message-Id',
    # 'Message-ID' and every other capitalization, which a membership test
    # against message.keys() does not.
    message_id = message.get('Message-Id')
    if message_id is not None:
        result['message-id'] = message_id
    else:
        logging.warning('no message id in file %s', current_file)

    references = message.get('References')
    if references is not None:
        # split() breaks on any run of whitespace, so ids separated by the
        # newline/tab of a folded header (or by several spaces) come out as
        # clean tokens without empty strings.
        result['references'] = references.split()
    return result
def ExtractSubPayload(filename):
    """Return the HTML body of the .eml file *filename* as a string.

    Despite the function name, only the payload is extracted; the subject is
    not read.

    Args:
        filename: path of the .eml file to parse.

    Returns:
        ``str()`` of the HTML part's payload, or ``""`` when the message has
        no HTML part.

    Raises:
        SystemExit: with exit status 1 (after printing an error line) when
            *filename* does not exist.
    """
    # Imported here so the block does not rely on a file-level ``import sys``.
    import sys

    if not os.path.exists(filename):
        # input path does not exist
        print("ERROR: input file does not exist:" + filename)
        # The os module has no exit(); sys.exit() is the call that terminates
        # the interpreter with the given status.
        sys.exit(1)

    # The context manager closes the file once the message has been parsed.
    with open(filename) as fp:
        msg = pyzmail.message_from_file(fp)

    if msg.html_part is None:
        return ""
    return str(msg.html_part.get_payload())
def ExtractSubPayload(filename):
    """Return the plain-text body of the .eml file *filename* as a string.

    Despite the function name, only the payload is extracted; the subject is
    not read.

    Args:
        filename: path of the .eml file to parse.

    Returns:
        ``str()`` of the text part's payload, or ``""`` when the message has
        no text part.

    Raises:
        SystemExit: with exit status 1 (after printing an error line) when
            *filename* does not exist.
    """
    # Imported here so the block does not rely on a file-level ``import sys``.
    import sys

    if not os.path.exists(filename):
        # input path does not exist
        print("ERROR: input file does not exist:" + filename)
        # The os module has no exit(); sys.exit() is the call that terminates
        # the interpreter with the given status.
        sys.exit(1)

    # The context manager closes the file once the message has been parsed.
    with open(filename) as fp:
        msg = pyzmail.message_from_file(fp)

    if msg.text_part is None:
        return ""
    return str(msg.text_part.get_payload())
def get_references(current_file):
    """Collect the threading headers of the e-mail stored in *current_file*.

    Args:
        current_file: path of the message file; it is opened in binary mode
            and parsed with ``pyzmail.message_from_file``.

    Returns:
        A ``(result, message)`` tuple. ``message`` is the parsed message
        object. ``result`` is a dict that may contain:
          * ``'message-id'``  -- the value of the Message-Id header;
          * ``'references'``  -- the References header split into a list of
            whitespace-separated tokens;
          * ``'in-reply-to'`` -- the value of the In-Reply-To header.
        Each key is present only when the corresponding header is. A missing
        Message-Id is logged as a warning, followed by an info record listing
        the header names that were found.
    """
    result = {}
    with open(current_file, 'rb') as fp:
        message = pyzmail.message_from_file(fp)

    # pyzmail messages build on email.message.Message, whose header lookup is
    # case-insensitive; one get() replaces the hand-enumerated 'Message-Id' /
    # 'Message-ID' / 'Message-id' checks and also matches any other
    # capitalization.
    message_id = message.get('Message-Id')
    if message_id is not None:
        result['message-id'] = message_id
    else:
        logging.warning('no message id in file %s', current_file)
        logging.info(list(message.keys()))

    references = message.get('References')
    if references is not None:
        # split() breaks on any run of whitespace, so ids separated by the
        # newline/tab of a folded header (or by several spaces) come out as
        # clean tokens without empty strings.
        result['references'] = references.split()

    in_reply_to = message.get('In-Reply-To')
    if in_reply_to is not None:
        result['in-reply-to'] = in_reply_to

    return result, message
def filter_file_by_content(arg_file_name, arg_senders):
    """Return the ASCII-only text body of a message unless its sender is excluded.

    Args:
        arg_file_name: path of the message file; opened in binary mode and
            parsed with ``pyzmail.message_from_file``.
        arg_senders: set of cleaned sender addresses to reject. It is
            intersected with a set, so it must itself be a set/frozenset.

    Returns:
        ``None`` when any cleaned 'from' address is in *arg_senders* or when
        the message has no text part. Otherwise the text part, decoded with
        its declared charset (or a fallback), re-encoded to the module-level
        ``target_encoding`` and finally reduced to ASCII with non-ASCII
        characters dropped.
    """
    with open(arg_file_name, 'rb') as fp:
        message = pyzmail.message_from_file(fp)

    # todo clean up internal whitespace
    senders = message.get_addresses('from')
    clean_senders = set(clean_address(item[1]) for item in senders)
    if clean_senders & arg_senders:
        return None

    text_part = message.text_part
    if text_part is None:
        return None

    charset = text_part.charset
    payload = text_part.get_payload()
    if charset is None:
        body = payload.decode('utf-8', 'ignore').encode(target_encoding)
    else:
        try:
            body = payload.decode(charset, 'ignore').encode(target_encoding)
        except LookupError as lookup_error:
            # The declared charset name is not a codec Python knows.
            # 'iso-8859-8-i' is retried as 'iso-8859-8'; anything else falls
            # back to utf-8.
            fallback = 'iso-8859-8' if charset == 'iso-8859-8-i' else 'utf-8'
            body = payload.decode(fallback, 'ignore').encode(target_encoding)
            logging.warning('lookup error %s', lookup_error)

    # Round-trip through unicode so that everything outside ASCII is dropped.
    return body.decode('utf-8', 'ignore').encode('ascii', 'ignore')
os.makedirs(output_folder) if not str(output_folder).endswith('/'): output_folder += '/' target_encoding = 'utf-8' for item in os.listdir(input_folder): logging.debug(item) if path.isdir(input_folder + item): output_subfolder = output_folder + item + '/' logging.debug(output_subfolder) if not os.path.exists(output_subfolder): os.makedirs(output_subfolder) for current_file in os.listdir(input_folder + item): logging.debug(current_file) with open(input_folder + item + '/' + current_file, 'rb') as fp: message = pyzmail.message_from_file(fp) text_part = message.text_part if text_part is not None: charset = text_part.charset payload = text_part.get_payload() if charset is not None: try: result = payload.decode( charset, 'ignore').encode(target_encoding) except LookupError as lookupError: if text_part.charset == 'iso-8859-8-i': result = payload.decode( 'iso-8859-8', 'ignore').encode(target_encoding) else: result = payload.decode(
def get_json(self, current_file, arg_process_text_part, arg_process_html_part, arg_process_both_empty,
             arg_kmeans_cluster_dictionary):
    """Parse the e-mail in *current_file* into a dict of fields plus an MD5 digest.

    Args:
        current_file: path of the message file; stored under 'original_file'.
        arg_process_text_part: when truthy and the message has a text part,
            that part is decoded into 'body'.
        arg_process_html_part: when truthy, the text branch was not taken and
            the message has an HTML part, the HTML is reduced to text with
            BeautifulSoup and stored as 'body'.
        arg_process_both_empty: when neither branch above applies, selects
            which warning is logged ('both text_part and html_part are None'
            when truthy, 'not processing' otherwise).
        arg_kmeans_cluster_dictionary: mapping from the file's base name to a
            cluster value; consulted only in the text-part branch.

    Returns:
        A ``(result, digest)`` tuple: ``result`` is the dict of extracted
        fields (sender/recipient variants, 'party', 'subject', optional
        'date', 'body', 'kmeans_cluster', 'empty_body', 'message-id',
        'in-reply-to', 'references'); ``digest`` is the hex MD5 of the raw
        file contents.

    Raises:
        KeyError: if the text-part branch is taken and the file's base name
            is not a key of *arg_kmeans_cluster_dictionary*.
    """
    result = {'original_file': current_file}
    with open(current_file, 'rb') as fp:
        message = pyzmail.message_from_file(fp)

    # todo clean up internal whitespace
    senders = message.get_addresses('from')
    # Flattened as [first fields of every pair..., second fields of every pair...].
    result['sender'] = [item[i] for i in [0, 1] for item in senders]
    result['short_sender'] = [item.split('@')[0] for item in result['sender']]
    result['clean_sender'] = [self.clean_address(item[1]) for item in senders]

    # todo clean up internal whitespace
    recipients = message.get_addresses('to') + message.get_addresses('cc') + message.get_addresses('bcc')
    result['recipient'] = [item[i] for i in [0, 1] for item in recipients]
    result['party'] = ['{name} = {address}'.format(name=item[0], address=item[1])
                       for item in senders + recipients]
    result['clean_recipient'] = [self.clean_address(item[1]) for item in recipients]
    result['short_recipient'] = [item.split('@')[0] for item in result['clean_recipient']]

    subject = message.get('subject')
    # NOTE(review): .decode() on the header value assumes it is a byte string
    # -- confirm against the runtime this file targets.
    result['subject'] = '' if subject is None else subject.decode('iso-8859-1').encode(self.target_encoding)

    raw_date = message.get('date')
    if raw_date is not None:
        try:
            result['date'] = dateutil.parser.parse(raw_date)
        except ValueError as value_error:
            # todo find a way to deal with these special cases?
            # we occasionally get a string the parser won't parse e.g.
            # Wed, 17 Dec 2008 12:35:42 -0700 (GMT-07:00)
            # and we need to drop off the trailing time zone and try to parse again
            logging.warning('%s %s %s', raw_date, value_error, current_file)
            pieces = str(raw_date).split('(')
            result['date'] = dateutil.parser.parse(pieces[0])
    else:
        # todo add special code to handle these?
        logging.warning('no date: %s ', message)

    text_part = message.text_part
    html_part = message.html_part
    if text_part is not None and arg_process_text_part:
        charset = text_part.charset
        payload = text_part.get_payload()
        if charset is None:
            body = payload.decode('utf-8', 'ignore').encode(self.target_encoding)
        else:
            try:
                body = payload.decode(charset, 'ignore').encode(self.target_encoding)
            except LookupError as lookup_error:
                # The declared charset name is not a codec Python knows.
                # 'iso-8859-8-i' is retried as 'iso-8859-8'; anything else
                # falls back to utf-8.
                fallback = 'iso-8859-8' if charset == 'iso-8859-8-i' else 'utf-8'
                body = payload.decode(fallback, 'ignore').encode(self.target_encoding)
                logging.warning('lookup error %s', lookup_error)
        result['body'] = body
        short_file_name = os.path.basename(current_file)
        result['kmeans_cluster'] = arg_kmeans_cluster_dictionary[short_file_name]
    elif html_part is not None and arg_process_html_part:
        payload = html_part.part.get_payload()
        payload_text = bs4.BeautifulSoup(payload, 'lxml').get_text().strip()
        charset = html_part.charset if html_part.charset is not None else 'utf-8'
        # NOTE(review): get_text() output is decoded again here; presumably
        # this relies on byte-string semantics -- confirm.
        result['body'] = payload_text.decode(charset, 'ignore').encode(self.target_encoding)
    elif arg_process_both_empty:
        logging.warning('both text_part and html_part are None: %s', current_file)
    else:
        logging.warning('not processing %s', current_file)

    if 'body' in result and not result['body']:
        result['empty_body'] = True

    # Header lookup through get() is case-insensitive, so 'Message-ID',
    # 'Message-id', ... are found as well, in line with what get_references
    # accepts.
    message_id = message.get('Message-Id')
    if message_id is not None:
        result['message-id'] = message_id
    in_reply_to = message.get('In-Reply-To')
    if in_reply_to is not None:
        result['in-reply-to'] = in_reply_to
    references = message.get('References')
    if references is not None:
        # split() copes with folded headers and repeated spaces.
        result['references'] = references.split()

    # Digest of the raw bytes on disk, independent of the parsed fields.
    md5 = hashlib.md5()
    with open(current_file, 'rb') as fp:
        md5.update(fp.read())
    return result, md5.hexdigest()