示例#1
0
 def get_references(current_file):
     """Collect the threading headers of one e-mail file.

     Args:
         current_file: Path of the message file; it is opened in binary mode
             and parsed with ``pyzmail.message_from_file``.

     Returns:
         A dict that may contain:
           * ``'message-id'``  - value of the Message-Id header, if present;
           * ``'references'``  - list of the whitespace-separated tokens of
             the References header, if present.
         A warning is logged when the message has no Message-Id header.
     """
     with open(current_file, 'rb') as fp:
         message = pyzmail.message_from_file(fp)

     result = {}

     # Header names are matched case-insensitively by ``get`` (standard
     # ``email.message.Message`` behaviour), so 'Message-Id', 'Message-ID',
     # 'Message-id', ... are all found without enumerating spellings.
     message_id = message.get('Message-Id')
     if message_id is None:
         logging.warning('no message id in file %s', current_file)
     else:
         result['message-id'] = message_id

     references = message.get('References')
     if references is not None:
         # split() with no argument copes with folded headers (newline/tab
         # between ids) and runs of spaces, which split(' ') did not.
         result['references'] = references.split()
     return result
def ExtractSubPayload(filename):
    """Return the HTML body of an .eml file as a string.

    Args:
        filename: Path of the .eml file to read.

    Returns:
        ``str()`` of the payload of the message's HTML part, or an empty
        string when the message has no HTML part.

    Raises:
        SystemExit: With status 1, after printing an error, when
            ``filename`` does not exist.
    """
    import sys  # local import: the file's import block is not visible here

    if not os.path.exists(filename):
        print("ERROR: input file does not exist:" + filename)
        # The original called os.exit(1), which does not exist and raised
        # AttributeError instead of terminating with status 1.
        sys.exit(1)

    # Context manager so the handle is closed even if parsing fails.
    with open(filename) as fp:
        msg = pyzmail.message_from_file(fp)

    html_part = msg.html_part
    if html_part is None:
        return ""
    return str(html_part.get_payload())
def ExtractSubPayload(filename):
    """Return the plain-text body of an .eml file as a string.

    Args:
        filename: Path of the .eml file to read.

    Returns:
        ``str()`` of the payload of the message's text part, or an empty
        string when the message has no text part.

    Raises:
        SystemExit: With status 1, after printing an error, when
            ``filename`` does not exist.
    """
    import sys  # local import: the file's import block is not visible here

    if not os.path.exists(filename):
        print("ERROR: input file does not exist:" + filename)
        # The original called os.exit(1), which does not exist and raised
        # AttributeError instead of terminating with status 1.
        sys.exit(1)

    # Context manager so the handle is closed even if parsing fails.
    with open(filename) as fp:
        msg = pyzmail.message_from_file(fp)

    text_part = msg.text_part
    if text_part is None:
        return ""
    return str(text_part.get_payload())
示例#4
0
def get_references(current_file):
    """Collect the threading headers of one e-mail file.

    Args:
        current_file: Path of the message file; it is opened in binary mode
            and parsed with ``pyzmail.message_from_file``.

    Returns:
        A ``(result, message)`` tuple. ``message`` is the parsed message
        object; ``result`` is a dict that may contain:
          * ``'message-id'``  - value of the Message-Id header;
          * ``'references'``  - list of the whitespace-separated tokens of
            the References header;
          * ``'in-reply-to'`` - value of the In-Reply-To header.
        When there is no Message-Id header a warning is logged, followed by
        an info record listing the header names that are present.
    """
    with open(current_file, 'rb') as fp:
        message = pyzmail.message_from_file(fp)

    result = {}

    # ``get`` matches header names case-insensitively (standard
    # ``email.message.Message`` behaviour), which replaces the hand-written
    # list of 'Message-Id' / 'Message-ID' / 'Message-id' spellings.
    message_id = message.get('Message-Id')
    if message_id is None:
        logging.warning('no message id in file %s', current_file)
        logging.info(list(message.keys()))
    else:
        result['message-id'] = message_id

    references = message.get('References')
    if references is not None:
        # split() with no argument copes with folded headers (newline/tab
        # between ids) and runs of spaces, which split(' ') did not.
        result['references'] = references.split()

    in_reply_to = message.get('In-Reply-To')
    if in_reply_to is not None:
        result['in-reply-to'] = in_reply_to

    return result, message
示例#5
0
def filter_file_by_content(arg_file_name, arg_senders):
    """Return the ASCII text body of a message unless its sender is excluded.

    Args:
        arg_file_name: Path of the message file; opened in binary mode and
            parsed with ``pyzmail.message_from_file``.
        arg_senders: Set of sender addresses to reject. It is intersected
            with the set of ``clean_address``-normalised From addresses.

    Returns:
        ``None`` when any From address is in ``arg_senders`` or when the
        message has no text part. Otherwise the text part's payload,
        re-encoded to the module-level ``target_encoding`` and then reduced
        to ASCII with unencodable characters dropped.
    """
    with open(arg_file_name, 'rb') as fp:
        message = pyzmail.message_from_file(fp)

    # todo clean up internal whitespace
    senders = message.get_addresses('from')
    clean_senders = [clean_address(item[1]) for item in senders]
    if set(clean_senders) & arg_senders:
        return None

    text_part = message.text_part
    if text_part is None:
        return None

    charset = text_part.charset
    payload = text_part.get_payload()
    if charset is None:
        text = payload.decode('utf-8', 'ignore')
    else:
        try:
            text = payload.decode(charset, 'ignore')
        except LookupError as lookup_error:
            # The declared charset is unknown to Python's codec registry.
            if charset == 'iso-8859-8-i':
                # Special-cased to the iso-8859-8 codec, silently.
                text = payload.decode('iso-8859-8', 'ignore')
            else:
                text = payload.decode('utf-8', 'ignore')
                logging.warning('lookup error %s', lookup_error)

    body = text.encode(target_encoding)
    # Kept as in the original: the re-encoded body is read back as UTF-8
    # before being stripped down to ASCII.
    return body.decode('utf-8', 'ignore').encode('ascii', 'ignore')

target_encoding = 'utf-8'
for item in os.listdir(input_folder):
    logging.debug(item)
    if path.isdir(input_folder + item):
        output_subfolder = output_folder + item + '/'
        logging.debug(output_subfolder)
        if not os.path.exists(output_subfolder):
            os.makedirs(output_subfolder)
        for current_file in os.listdir(input_folder + item):
            logging.debug(current_file)
            with open(input_folder + item + '/' + current_file, 'rb') as fp:
                message = pyzmail.message_from_file(fp)
                text_part = message.text_part
                if text_part is not None:
                    charset = text_part.charset
                    payload = text_part.get_payload()
                    if charset is not None:
                        try:
                            result = payload.decode(
                                charset, 'ignore').encode(target_encoding)
                        except LookupError as lookupError:
                            if text_part.charset == 'iso-8859-8-i':
                                result = payload.decode(
                                    'iso-8859-8',
                                    'ignore').encode(target_encoding)
                            else:
                                result = payload.decode(
示例#7
0
    def get_json(self, current_file, arg_process_text_part, arg_process_html_part, arg_process_both_empty,
                 arg_kmeans_cluster_dictionary):
        """Build a dictionary describing one e-mail file, plus the file's MD5.

        Args:
            current_file: Path of the message file.
            arg_process_text_part: When true and the message has a text part,
                that part becomes ``result['body']``.
            arg_process_html_part: When true (and the text branch was not
                taken) and the message has an HTML part, its extracted text
                becomes ``result['body']``.
            arg_process_both_empty: When true and neither branch above was
                taken, a "both parts are None" warning is logged instead of
                the generic "not processing" warning.
            arg_kmeans_cluster_dictionary: Mapping indexed by the file's base
                name; consulted only in the text-part branch.

        Returns:
            ``(result, md5_hex)`` where ``result`` is the dict built below and
            ``md5_hex`` is the hex MD5 digest of the raw file bytes.
        """
        result = {'original_file': current_file}
        with open(current_file, 'rb') as fp:
            message = pyzmail.message_from_file(fp)
            # todo clean up internal whitespace
            senders = message.get_addresses('from')
            # Flattened as: every item[0] first, then every item[1].
            result['sender'] = [item[i] for i in [0, 1] for item in senders]
            result['short_sender'] = [item.split('@')[0] for item in result['sender']]
            result['clean_sender'] = [self.clean_address(item[1]) for item in senders]

            # todo clean up internal whitespace
            recipients = message.get_addresses('to') + message.get_addresses('cc') + message.get_addresses('bcc')
            result['party'] = ['{name} = {address}'.format(name=item[0], address=item[1])
                               for item in senders + recipients]
            result['clean_recipient'] = [self.clean_address(item[1]) for item in recipients]
            # Same flattening as 'sender'. (The original first stored the raw
            # pair list under this key and then overwrote it; that dead
            # assignment is dropped.)
            result['recipient'] = [item[i] for i in [0, 1] for item in recipients]
            result['short_recipient'] = [item.split('@')[0] for item in result['clean_recipient']]

            subject = message.get('subject')
            result['subject'] = '' if subject is None else subject.decode('iso-8859-1').encode(self.target_encoding)

            raw_date = message.get('date')
            if raw_date is not None:
                try:
                    result['date'] = dateutil.parser.parse(raw_date)
                except ValueError as valueError:
                    # todo find a way to deal with these special cases?
                    # we occasionally get a string the parser won't parse e.g.
                    # Wed, 17 Dec 2008 12:35:42 -0700 (GMT-07:00)
                    # and we need to drop off the trailing time zone and try to parse again
                    logging.warning('%s %s %s', raw_date, valueError, current_file)
                    pieces = str(raw_date).split('(')
                    result['date'] = dateutil.parser.parse(pieces[0])
            else:
                # todo add special code to handle these?
                logging.warning('no date: %s ', message)

            text_part = message.text_part
            html_part = message.html_part
            if text_part is not None and arg_process_text_part:
                charset = text_part.charset
                payload = text_part.get_payload()
                if charset is None:
                    body = payload.decode('utf-8', 'ignore').encode(self.target_encoding)
                else:
                    try:
                        body = payload.decode(charset, 'ignore').encode(self.target_encoding)
                    except LookupError as lookupError:
                        # Declared charset is not a codec Python knows.
                        if charset == 'iso-8859-8-i':
                            body = payload.decode('iso-8859-8', 'ignore').encode(self.target_encoding)
                        else:
                            body = payload.decode('utf-8', 'ignore').encode(self.target_encoding)
                            logging.warning('lookup error %s', lookupError)
                result['body'] = body

                short_file_name = os.path.basename(current_file)
                result['kmeans_cluster'] = arg_kmeans_cluster_dictionary[short_file_name]

            elif html_part is not None and arg_process_html_part:
                # NOTE(review): this reads the payload via ``.part`` whereas the
                # text branch calls get_payload() on the part object itself, and
                # it calls .decode() on BeautifulSoup's get_text() result -
                # confirm both are intended.
                payload = html_part.part.get_payload()
                payload_text = bs4.BeautifulSoup(payload, 'lxml').get_text().strip()
                charset = html_part.charset if html_part.charset is not None else 'utf-8'
                result['body'] = payload_text.decode(charset, 'ignore').encode(self.target_encoding)
            elif arg_process_both_empty:
                logging.warning('both text_part and html_part are None: %s', current_file)
            else:
                logging.warning('not processing %s', current_file)

            if 'body' in result and not result['body']:
                result['empty_body'] = True

            # ``get`` matches header names case-insensitively (standard
            # ``email.message.Message`` behaviour); the original exact-match
            # test against keys() missed spellings such as 'Message-ID'.
            message_id = message.get('Message-Id')
            if message_id is not None:
                result['message-id'] = message_id
            in_reply_to = message.get('In-Reply-To')
            if in_reply_to is not None:
                result['in-reply-to'] = in_reply_to
            references = message.get('References')
            if references is not None:
                # split() copes with folded headers and repeated spaces.
                result['references'] = references.split()

        md5 = hashlib.md5()
        with open(current_file, 'rb') as fp:
            md5.update(fp.read())

        return result, md5.hexdigest()