def setup(token):
    """
    Decide whether a pre-scanned token is genuine and normalise boundaries.

    Content-type tokens are always accepted. For a boundary token the first
    two characters of its value are dropped (presumably the leading "--";
    confirm against the tokenizer) and the remainder is looked up in
    `boundaries`:

    * exact match: the token is an opening boundary (`final = False`);
    * match after `strip_endings`: the token is a closing boundary
      (`final = True`);
    * no match: the token is a false boundary.

    `boundaries` and `strip_endings` are not defined in this function; they
    are expected to come from the enclosing/module scope.

    Returns True for a genuine token, False for a false boundary.
    Raises DecodingError for any token that is neither a content type nor a
    boundary.
    """
    if token.is_content_type():
        return True

    if not token.is_boundary():
        raise DecodingError("Unknown token")

    value = token.value[2:]
    if value in boundaries:
        token.value = value
        token.final = False
        return True

    # Only compute the stripped form once; it is used both for the lookup
    # and as the normalised token value.
    stripped = strip_endings(value)
    if stripped in boundaries:
        token.value = stripped
        token.final = True
        return True

    # false boundary
    return False
def scan(string):
    """
    Scanner that uses 1 pass to scan the entire message and build a
    message tree.

    `string` must be a `str` instance. When tokenizing yields nothing, a
    single default content-type token is used so that traversal still
    produces a part.

    Raises DecodingError when the input has the wrong type, when traversal
    itself reports a decoding problem, or when traversal fails with any
    other exception (reported as a malformed message).
    """
    if not isinstance(string, str):
        raise DecodingError("Scanner works with byte strings only")

    tokens = tokenize(string)
    if not tokens:
        tokens = [default_content_type()]

    try:
        return traverse(Start(), TokensIterator(tokens, string))
    except DecodingError:
        # Already the right exception type: let it through untouched.
        raise
    except Exception:
        # Any unexpected failure while walking the tokens means the message
        # structure could not be understood.
        raise DecodingError("Malformed MIME message")
def parse_header(header):
    """
    Take a raw header (name, colon and newlines included) and return a
    ``(name, parsed_value)`` pair.

    The value is unfolded with `encodedword.unfold` before being handed to
    `parse_header_value`.

    Raises DecodingError when the header name is not pure ASCII.
    """
    header_name, raw_value = split2(header)
    if is_pure_ascii(header_name):
        unfolded = encodedword.unfold(raw_value)
        return header_name, parse_header_value(header_name, unfolded)
    raise DecodingError("Non-ascii header name")
def check(self):
    """
    Deadloop guard for the scanner.

    Every call bumps `self.opcount`; once the counter exceeds `_MAX_OPS`
    a DecodingError is raised so a runaway scan is aborted instead of
    spinning forever.
    """
    self.opcount += 1
    if self.opcount <= _MAX_OPS:
        return

    message = "Too many parts: {0}, max is {1}".format(
        self.opcount, _MAX_OPS)
    raise DecodingError(message)
def decode_body(content_type, content_encoding, body):
    """
    Decode a part body in two stages: first undo the transfer encoding,
    then decode the charset according to `content_type`.

    Raises DecodingError when the transfer-encoding stage fails for any
    reason; errors from the charset stage are not intercepted here.
    """
    # stage 1: transfer encoding
    try:
        transfer_decoded = decode_transfer_encoding(content_encoding, body)
    except Exception:
        raise DecodingError("Failed to decode body")

    # stage 2: charset
    return decode_charset(content_type, transfer_decoded)
def scan(string):
    """
    Scanner that uses 1 pass to scan the entire message and build a
    message tree.

    On Python 2 the input must be binary; on Python 3 binary input is
    decoded as UTF-8 before tokenizing. When tokenizing yields nothing, a
    single default content-type token is used instead.

    Raises DecodingError for non-binary input on Python 2, and for any
    failure during traversal (non-DecodingError failures are chained as the
    cause of a "Malformed MIME message" error).
    """
    is_binary = isinstance(string, six.binary_type)
    if six.PY2:
        if not is_binary:
            raise DecodingError('Scanner works with binary only')
    elif is_binary:
        string = string.decode('utf-8')

    tokens = tokenize(string) or [default_content_type()]

    try:
        root = traverse(Start(), TokensIterator(tokens, string))
    except DecodingError:
        raise
    except Exception as cause:
        raise six.raise_from(DecodingError("Malformed MIME message"), cause)
    return root
def parse_header_value(name, val):
    """
    Parse the value of the header called `name`.

    Non-ASCII values are first converted with `to_unicode`. Headers that
    `parametrized.is_parametrized` does not recognise are returned as that
    (possibly converted) value. Parametrized headers are decoded into a
    value plus parameters and returned as a `ContentType` (for
    'Content-Type') or a `WithParams` (for everything else).

    Raises DecodingError when the decoded main value of a parametrized
    header is not pure ASCII.
    """
    if not is_pure_ascii(val):
        val = to_unicode(val)

    # Plain header: nothing more to do.
    if not parametrized.is_parametrized(name, val):
        return val

    value, params = parametrized.decode(val)
    if value is not None and not is_pure_ascii(value):
        raise DecodingError('Non-ascii content header value')

    if name == 'Content-Type':
        main_type, sub_type = parametrized.fix_content_type(value)
        return ContentType(main_type, sub_type, params)

    return WithParams(value, params)
def parse_header_value(name, val):
    """
    Parse the value of the header called `name`.

    * Non-ASCII, non-parametrized value: returned via `to_unicode`.
    * Non-ASCII, parametrized value: rejected with DecodingError.
    * ASCII, non-parametrized value: returned unchanged.
    * ASCII, parametrized value: decoded into a `ContentType` (for
      'Content-Type') or a `WithParams` (for everything else).
    """
    pure_ascii = is_pure_ascii(val)
    has_params = parametrized.is_parametrized(name, val)

    if not pure_ascii:
        if has_params:
            raise DecodingError("Unsupported value in content- header")
        return to_unicode(val)

    if not has_params:
        return val

    value, params = parametrized.decode(val)
    if name == 'Content-Type':
        main_type, sub_type = parametrized.fix_content_type(value)
        return ContentType(main_type, sub_type, params)
    return WithParams(value, params)
def scan(string):
    """
    Scanner that uses 1 pass to scan the entire message and build a
    message tree.

    On Python 2 the input must be binary or an `mmap`; on Python 3 binary
    input is decoded as UTF-8 before tokenizing. When tokenizing yields
    nothing, a single default content-type token is used instead.

    Raises DecodingError for unsupported input on Python 2. Errors raised
    during traversal propagate to the caller unchanged.
    """
    if six.PY2:
        if not isinstance(string, (six.binary_type, mmap)):
            raise DecodingError('Scanner works with binary only')
    elif isinstance(string, six.binary_type):
        string = string.decode('utf-8')

    tokens = tokenize(string) or [default_content_type()]
    return traverse(Start(), TokensIterator(tokens, string))
def _read_header_lines(fp): """Read lines with headers until the start of body""" lines = deque() for line in fp: if len(line) > _MAX_LINE_LENGTH: raise DecodingError('Line is too long: %d' % len(line)) if is_empty(line): break # tricky case if it's not a header and not an empty line # ususally means that user forgot to separate the body and newlines # so "unread" this line here, what means to treat it like a body if not _RE_HEADER.match(line): fp.seek(fp.tell() - len(line)) break lines.append(line) return lines
def _filter_false_tokens(tokens):
    """
    Walk a list of pre-scanned tokens and drop false content-type and
    boundary tokens, returning the surviving tokens as a new list.

    A content-type header is false unless it is the first content-type
    header in a message/part headers section. A boundary token is false if
    it has not been mentioned in a preceding content-type header. Empty-line
    tokens only drive the section tracking and are never kept.

    Genuine boundary tokens are normalised in place: their `value` loses
    its first two characters (and line endings for closing boundaries) and
    their `final` flag is set.

    Raises DecodingError on a token that is neither a content type, a
    boundary nor the empty-line marker.
    """
    section = _SECTION_HEADERS
    content_type = None
    known_boundaries = []
    kept = []

    for token in tokens:
        if isinstance(token, ContentType):
            # Only the first content-type header in a headers section is
            # valid.
            if content_type or section != _SECTION_HEADERS:
                continue
            content_type = token
            known_boundaries.append(token.get_boundary())
            kept.append(token)
            continue

        if isinstance(token, Boundary):
            value = token.value[2:]
            if value in known_boundaries:
                # Opening boundary: a new part's headers follow.
                token.value = value
                token.final = False
                section = _SECTION_HEADERS
                content_type = None
                kept.append(token)
                continue

            stripped = _strip_endings(value)
            if stripped in known_boundaries:
                # Closing boundary: whatever follows is epilogue.
                token.value = stripped
                token.final = True
                section = _SECTION_MULTIPART_EPILOGUE
                kept.append(token)
            # Otherwise a false boundary was detected and is dropped.
            continue

        if not (token == _EMPTY_LINE):
            raise DecodingError("Unknown token")

        # Empty line tokens are cast away: they were pre-scanned only to
        # identify where a header section completes and a body section
        # starts.
        if section != _SECTION_HEADERS:
            continue

        if not content_type:
            content_type = _DEFAULT_CONTENT_TYPE

        if content_type.is_singlepart():
            section = _SECTION_BODY
        elif content_type.is_multipart():
            section = _SECTION_MULTIPART_PREAMBLE
        else:
            # Start of an enclosed message or just its headers.
            section = _SECTION_HEADERS
            content_type = None

    return kept
def _next_non_content_type(iterator):
    """
    Advance `iterator` until it yields a token that is not a content type
    and return that token. `iterator.check()` is called before every step.
    """
    while True:
        iterator.check()
        candidate = iterator.next()
        if not candidate.is_content_type():
            return candidate


def traverse(pointer, iterator, parent=None, allow_bad_mime=False):
    """
    Recursive-descent parser that builds a part from the token stream.

    `pointer` marks where the part starts, `iterator` supplies the tokens,
    `parent` is the content type of the enclosing part (if any) and
    `allow_bad_mime` lets a multipart without a starting boundary be
    tolerated (None is returned) when it sits inside a message container.

    Returns the part built by `make_part`, or None when bad MIME was
    tolerated or the token matches none of the known kinds.
    Raises DecodingError on structurally broken multipart or
    delivery-status content.
    """
    iterator.check()
    token = iterator.next()

    # No content type was set for this part (it may even have no headers
    # at all), so fall back to the RFC default (text/plain).
    if token.is_end() or token.is_boundary():
        return make_part(content_type=default_content_type(),
                         start=pointer,
                         end=token,
                         iterator=iterator,
                         parent=parent)

    # Singlepart: every further content-type header is ignored until a
    # boundary or the end of the message shows up.
    if token.is_singlepart():
        end = _next_non_content_type(iterator)
        return make_part(content_type=token,
                         start=pointer,
                         end=end,
                         iterator=iterator,
                         parent=parent)

    # Good old multipart: this is where the real recursion happens, part
    # by part until the end.
    if token.is_multipart():
        content_type = token

        # Without a boundary there is no way to split the parts.
        boundary = content_type.get_boundary()
        if not boundary:
            raise DecodingError("Multipart message without boundary")

        children = deque()
        token = iterator.next()

        # The first token after the headers has to be the opening boundary.
        if not token.is_boundary() or token != boundary:
            if allow_bad_mime and parent and parent.is_message_container():
                return None
            raise DecodingError("Multipart message without starting boundary")

        while True:
            token = iterator.current()
            if token.is_end():
                break
            if token == boundary and token.is_final():
                iterator.next()
                break
            children.append(traverse(token, iterator, content_type))

        return make_part(content_type=content_type,
                         start=pointer,
                         end=token,
                         iterator=iterator,
                         parts=children,
                         parent=parent)

    # Delivery status is a weird part: it can hold several header blocks
    # separated by newlines, and is only accepted inside a multipart.
    if token.is_delivery_status():
        if not (parent and parent.is_multipart()):
            raise DecodingError("Malformed delivery status message")
        end = _next_non_content_type(iterator)
        return make_part(content_type=token,
                         start=pointer,
                         end=end,
                         iterator=iterator,
                         parent=parent)

    # Message container: holds a whole message, separated from the parent
    # headers by a newline.
    if token.is_message_container():
        # Delivery notification body can contain all sorts of bad MIME.
        allow_bad_mime = parent and parent.is_delivery_report()
        enclosed = traverse(pointer, iterator, token, allow_bad_mime)
        return make_part(
            content_type=token if enclosed else default_content_type(),
            start=pointer,
            end=iterator.current(),
            iterator=iterator,
            enclosed=enclosed,
            parent=parent)

    # Headers container: grab the newline-separated headers and enclose
    # them in one part.
    if token.is_headers_container():
        enclosed = grab_headers(pointer, iterator, token)
        return make_part(content_type=token,
                         start=pointer,
                         end=iterator.current(),
                         iterator=iterator,
                         enclosed=enclosed,
                         parent=parent)

    return None