Exemplo n.º 1
0
def extract_pieces(msg_lines, msg_offset, mbx, inner_mesg=False):
    """Takes four parameters.  The first is a list of line strings
	containing the headers and body of a message from a Eudora MBX
	file.  The second is the offset of the first character in the
	first line in the msg_lines list within the MBX file we're
	processing.  The third is the name of the MBX file we're
	reading.  The fourth, inner_mesg, is a boolean controlling
	whether or not the pieces were are extracting are a top-level
	message in the MBX file.  If inner_mesg is true, we will carry
	out our processing under the assumption that we are handling
	an attached message carried in an message/rfc822 segment.
	
	Returns a tuple (header, body, attachments, embeddeds, mbx)
	containing a Header object, a body String containing the body
	of the message, a list of attachment definition tuples, a list
	of embedded definition tuples, and the name of the MBX file
	we're processing."""

    global toc_info, replies
    global target

    headers = Header()
    body = []
    attachments = []
    embeddeds = []

    in_headers = True
    found_rfc822_inner_mesg = False
    is_html = False

    if not inner_mesg:
        headers.add('From ', msg_lines[0][5:].strip())

    for line in msg_lines:
        if in_headers:
            if re_initial_whitespace.match(line):
                # Header "folding" (RFC 2822 3.2.3)
                headers.appendToLast(line)
            elif len(line.strip()) != 0:
                # Message header
                headers.add_line(line)

                attachment_matcher = re_x_attachment.match(line)

                if attachment_matcher:
                    files = attachment_matcher.group(1)
                    attach_list = re.split(';\s*', files)

                    for attachment in attach_list:
                        attachments.append((attachment, target))
            else:
                # End of message headers.

                # scrub the header lines we've scanned

                if not inner_mesg:
                    headers.clean(toc_info, msg_offset, replies)

                in_headers = False

                content_type = headers.getValue('Content-Type:')

                if content_type and content_type.lower() == 'message/rfc822':
                    found_rfc822_inner_mesg = True
                    print "+",
        elif found_rfc822_inner_mesg:
            # We're processing a message/rfc822 message,
            # and so we don't want to process attachments
            # at this level.  Instead, we want to properly
            # extract all body lines for later processing

            body.append(strip_linesep(line) + "\n")
        else:
            # We're in the body of the text and we need to
            # handle attachments

            if not is_html and re_xhtml.search(line) or re_normal_html.search(
                    line):
                is_html = True

            if attachments_dirs and re_attachment.search(line):
                # remove the newline that
                # Eudora inserts before the
                # 'Attachment Converted' line.

                if len(body) > 0 and (body[-1] == '\n' or body[-1] == '\r\n'):
                    body.pop()

                #EudoraLog.log.warn("Adding attachment with contenttype = " + contenttype)
                attachments.append((line, target))
            else:
                embedded_matcher = re_embedded.match(line)

                if embedded_matcher:
                    filename = embedded_matcher.group(1)
                    embeddeds.append(filename)
                else:
                    orig_line = line

                    if scrub_xflowed:
                        line = re.sub(re_xflowed, '', line)
                        line = re.sub(re_xhtml, '', line)
                        line = re.sub(re_pete_stuff, '', line)

                    if orig_line == line or line != '':
                        body.append(strip_linesep(line) + "\n")

    return (headers, body, attachments, embeddeds, mbx, is_html)
Exemplo n.º 2
0
def extract_pieces( msg_lines, msg_offset, mbx, inner_mesg=False ):
	"""Takes four parameters.  The first is a list of line strings
	containing the headers and body of a message from a Eudora MBX
	file.  The second is the offset of the first character in the
	first line in the msg_lines list within the MBX file we're
	processing.  The third is the name of the MBX file we're
	reading.  The fourth, inner_mesg, is a boolean controlling
	whether or not the pieces were are extracting are a top-level
	message in the MBX file.  If inner_mesg is true, we will carry
	out our processing under the assumption that we are handling
	an attached message carried in an message/rfc822 segment.
	
	Returns a tuple (header, body, attachments, embeddeds, mbx)
	containing a Header object, a body String containing the body
	of the message, a list of attachment definition tuples, a list
	of embedded definition tuples, and the name of the MBX file
	we're processing."""

	global toc_info, replies
	global target

	headers = Header()
	body = []
	attachments = []
	embeddeds = []

	in_headers = True
	found_rfc822_inner_mesg = False
	is_html = False

	if not inner_mesg:
		headers.add( 'From ', msg_lines[0][5:].strip() )

	for line in msg_lines:
		if in_headers:
			if re_initial_whitespace.match( line ):
				# Header "folding" (RFC 2822 3.2.3)
				headers.appendToLast( line )
			elif len( line.strip() ) != 0:
				# Message header
				headers.add_line(line)

				attachment_matcher = re_x_attachment.match( line )

				if attachment_matcher:
					files = attachment_matcher.group(1)
					attach_list = re.split(';\s*', files)

					for attachment in attach_list:
						attachments.append( (attachment, target) )
			else:
				# End of message headers.

				# scrub the header lines we've scanned

				if not inner_mesg:
					headers.clean(toc_info, msg_offset, replies)

				in_headers = False

				content_type = headers.getValue('Content-Type:')

				if content_type and content_type.lower() == 'message/rfc822':
					found_rfc822_inner_mesg = True
					print "+",
		elif found_rfc822_inner_mesg:
			# We're processing a message/rfc822 message,
			# and so we don't want to process attachments
			# at this level.  Instead, we want to properly
			# extract all body lines for later processing

			body.append(strip_linesep(line) + "\n")
		else:
			# We're in the body of the text and we need to
			# handle attachments

			if not is_html and re_xhtml.search( line ) or re_normal_html.search( line ):
				is_html = True

			if attachments_dirs and re_attachment.search( line ):
				# remove the newline that
				# Eudora inserts before the
				# 'Attachment Converted' line.

				if len(body) > 0 and (body[-1] == '\n' or body[-1] == '\r\n'):
					body.pop()

				#EudoraLog.log.warn("Adding attachment with contenttype = " + contenttype)
				attachments.append( (line, target) )
			else:
				embedded_matcher = re_embedded.match ( line )
					
				if embedded_matcher:
					filename = embedded_matcher.group(1)
					embeddeds.append( filename )
				else:
					orig_line = line

					if scrub_xflowed:
						line = re.sub(re_xflowed, '', line)
						line = re.sub(re_xhtml, '', line)
						line = re.sub(re_pete_stuff, '', line)

					if orig_line == line or line != '':
						body.append(strip_linesep(line) + "\n")

	return ( headers, body, attachments, embeddeds, mbx, is_html )