def comment(include_parens=False):
    """Build the ``comment`` rule, named and cited to RFC 7230.

    The rule is recursive: between the parentheses it accepts ``ctext``,
    quoted pairs (with ``(``, ``)`` and the backslash as the sensible
    characters to escape) and nested comments.

    :param include_parens:
        when false (the default), the result is passed through a function
        that drops its first and last elements; when true, it is returned
        as parsed.
    """
    nested = recursive() > named(u'comment', RFC(7230))
    body = string(ctext | quoted_pair(sensible_for=u'()\\') | nested)
    nested.rec = '(' + body + ')'
    if include_parens:
        result = nested
    else:
        # Slice off the first and last elements of the parsed value.
        result = (lambda s: s[1:-1]) << nested
    return result > named(u'comment', RFC(7230))
def test_parser_edge_cases():
    """Exercise parser branches that the regular tests do not reach."""
    # Our parser implementation is general enough that
    # some of its branches are not being exercised by our regular tests,
    # so I had to come up with these contrived examples to test them.

    # Two alternatives sharing a prefix; only the second one
    # also takes (and skips) a trailing newline.
    tchars = many(rfc7230.tchar) > named(u'p')
    one_then = '1' * tchars > named(u'p1')
    eleven_then = '11' * tchars * skip('\n') > named(u'p2')
    assert parse(one_then | eleven_then,
                 b'11abc') == (u'1', [u'1', u'a', u'b', u'c'])
    assert parse(one_then | eleven_then,
                 b'11abc\n') == (u'11', [u'a', u'b', u'c'])

    # A recursive rule whose other alternative is ``empty``.
    chain = recursive() > named(u'p')
    chain.rec = (rfc7230.tchar * chain | subst(None) << empty)
    assert parse(chain, b'abc') == (u'a', (u'b', (u'c', None)))

    # A symbol that may be empty, used directly and inside ``string``.
    ab = literal('ab') > named(u'p')
    maybe_ab = subst(u'') << empty | ab > named(u'p0')
    direct = 'xab' * maybe_ab > named(u'p1')
    repeated = 'x' * string(maybe_ab) * '!' > named(u'p2')
    assert parse(direct | repeated, b'xabab') == (u'xab', u'ab')
    assert parse(direct | repeated, b'xabab!') == (u'x', u'abab', u'!')

    # ``empty`` as the first alternative at the very start of the input.
    optional_a = empty | literal('a') > named(u'p')
    then_x = optional_a * 'x' > named(u'x')
    assert parse(then_x, b'x') == u'x'
# In RFC 7230, ``<quoted-pair>`` is a single rule, # but we parametrize it to report no. 1017 depending on the context. @can_complain def check_sensible(complain, c): if c not in sensible_for: complain(1017, char=c) return c return (check_sensible << skip('\\') * (HTAB | SP | VCHAR | obs_text) > named(u'quoted-pair', RFC(7230))) qdtext = (HTAB | SP | octet(0x21) | octet_range(0x23, 0x5B) | octet_range(0x5D, 0x7E) | obs_text) > auto quoted_string = (skip(DQUOTE) * string(qdtext | quoted_pair(sensible_for=u'"\\')) * skip(DQUOTE)) > auto ctext = (HTAB | SP | octet_range(0x21, 0x27) | octet_range(0x2A, 0x5B) | octet_range(0x5D, 0x7E) | obs_text) > auto def comment(include_parens=False): inner = recursive() > named(u'comment', RFC(7230)) inner.rec = '(' + string(ctext | quoted_pair(sensible_for=u'()\\') | inner) + ')' if not include_parens: inner = (lambda s: s[1:-1]) << inner return inner > named(u'comment', RFC(7230))
# -*- coding: utf-8; -*- from httpolice.citation import RFC from httpolice.parse import (auto, can_complain, fill_names, mark, maybe, pivot, skip, string, string1) from httpolice.structure import (AuthScheme, CaseInsensitive, MultiDict, Parametrized) from httpolice.syntax.common import ALPHA, DIGIT, SP from httpolice.syntax.rfc7230 import (BWS, comma_list, comma_list1, quoted_string, token) auth_scheme = AuthScheme << token > pivot token68 = (string1(ALPHA | DIGIT | '-' | '.' | '_' | '~' | '+' | '/') + string('=')) > pivot @can_complain def _check_realm(complain, k, v): (symbol, v) = v if k == u'realm' and symbol is not quoted_string: complain(1196) return (k, v) auth_param = _check_realm << ((CaseInsensitive << token) * skip(BWS * '=' * BWS) * (mark(token) | mark(quoted_string))) > pivot challenge = Parametrized << ( auth_scheme * maybe(skip(string1(SP)) * (token68 | MultiDict << comma_list(auth_param)),
def quoted_pair(sensible_for):
    """Build a ``quoted-pair`` rule, named and cited to RFC 7230.

    In RFC 7230, ``<quoted-pair>`` is a single rule,
    but we parametrize it to report no. 1017 depending on the context.

    :param sensible_for:
        container of characters; an escaped character that is not in it
        is reported with notice 1017.
    """
    @can_complain
    def check_sensible(complain, c):
        worth_escaping = c in sensible_for
        if not worth_escaping:
            complain(1017, char=c)
        return c

    # The backslash itself is skipped; only the escaped character remains.
    escaped = skip('\\') * (HTAB | SP | VCHAR | obs_text)
    return (check_sensible << escaped) > named(u'quoted-pair', RFC(7230))


# Unescaped quoted-string characters (neither 0x22 nor 0x5C).
qdtext = (
    HTAB | SP | octet(0x21) | octet_range(0x23, 0x5B) |
    octet_range(0x5D, 0x7E) | obs_text
) > auto

# ``qdtext`` and quoted pairs between two skipped ``DQUOTE``s.
quoted_string = (
    skip(DQUOTE) *
    string(qdtext | quoted_pair(sensible_for=u'"\\')) *
    skip(DQUOTE)
) > auto

# Unescaped comment characters (neither 0x28, 0x29 nor 0x5C).
ctext = (
    HTAB | SP | octet_range(0x21, 0x27) | octet_range(0x2A, 0x5B) |
    octet_range(0x5D, 0x7E) | obs_text
) > auto


def comment(include_parens=False):
    """Build the recursive ``comment`` rule, named and cited to RFC 7230.

    :param include_parens:
        when false (the default), the result is passed through
        a function that drops its first and last elements.
    """
    nested = recursive() > named(u'comment', RFC(7230))
    body = string(ctext | quoted_pair(sensible_for=u'()\\') | nested)
    nested.rec = '(' + body + ')'
    if include_parens:
        result = nested
    else:
        result = (lambda s: s[1:-1]) << nested
    return result > named(u'comment', RFC(7230))


OWS = string(SP | HTAB) > auto
from urllib.parse import unquote_to_bytes as pct_decode from httpolice.citation import RFC from httpolice.parse import (auto, can_complain, fill_names, maybe, pivot, skip, string, string1) from httpolice.structure import CaseInsensitive, ExtValue from httpolice.syntax.common import ALPHA, DIGIT, HEXDIG from httpolice.syntax.rfc5646 import Language_Tag as language from httpolice.util.text import force_bytes attr_char = (ALPHA | DIGIT | '!' | '#' | '$' | '&' | '+' | '-' | '.' | '^' | '_' | '`' | '|' | '~') > auto parmname = string(attr_char) > pivot # We don't need to special-case "UTF-8", simplify. mime_charsetc = (ALPHA | DIGIT | '!' | '#' | '$' | '%' | '&' | '+' | '-' | '^' | '_' | '`' | '{' | '}' | '~') > auto mime_charset = string1(mime_charsetc) > auto charset = CaseInsensitive << mime_charset > pivot pct_encoded = '%' + HEXDIG + HEXDIG > auto value_chars = pct_decode << ( force_bytes << string(pct_encoded | attr_char)) > auto @can_complain def _check_ext_value(complain, val): if val.charset == u'UTF-8':
# A run of digits converted with ``int``.
complete_length = int << string1(DIGIT) > auto

# ``byte_range "/" (complete_length | "*")``; the asterisk yields ``None``.
byte_range_resp = (
    byte_range * skip('/') *
    (complete_length | subst(None) << literal('*'))
) > pivot

# ``"*/" complete_length``; the ``*/`` prefix yields ``None``.
unsatisfied_range = (
    (subst(None) << literal('*/')) * complete_length
) > pivot


@can_complain
def _well_formed2(complain, r):
    """Report notice 1148 when the byte range in `r` is inconsistent.

    ``r.range`` is unpacked as ``(bounds, complete)``; `bounds` is either
    ``None`` or ``(first, last)``. The notice is reported when
    ``last < first``, or when `complete` is known and ``complete <= last``.

    :return: `r`, unchanged.
    """
    span, total = r.range
    if span is None:
        # No bounds to check.
        return r
    start, end = span
    if end < start or (total is not None and total <= end):
        complain(1148)
    return r


byte_content_range = _well_formed2 << (
    ContentRange << (
        bytes_unit * skip(SP) * (byte_range_resp | unsatisfied_range)
    )
) > pivot

other_range_resp = string(CHAR) > pivot
other_content_range = ContentRange << (
    other_range_unit * skip(SP) * other_range_resp
) > pivot

Content_Range = byte_content_range | other_content_range > pivot

If_Range = entity_tag | HTTP_date > pivot

fill_names(globals(), RFC(7233))
from httpolice.syntax.rfc2616 import LOALPHA
from httpolice.syntax.rfc3986 import URI, URI_reference as URI_Reference
from httpolice.syntax.rfc5646 import Language_Tag
from httpolice.syntax.rfc5987 import ext_value, parmname__excluding
from httpolice.syntax.rfc6838 import subtype_name, type_name
from httpolice.syntax.rfc7230 import OWS, comma_list, quoted_string


# RFC 5988 refers to HTML 4.01 for the ``MediaDesc`` rule,
# but HTML 4.01 doesn't actually define a grammar for that;
# it only gives a vague idea of what it is supposed to be.
# So we use a fairly permissive form.
# Also, from RFC 5988 Section 5.4:
# "its value MUST be quoted if it contains a semicolon (';') or comma (',')".

# VCHAR, HTAB or SP, with the double quote subtracted.
_MediaDesc = string((VCHAR | HTAB | SP) - literal('"'))
# The same, with the semicolon and the comma subtracted as well.
_MediaDesc_no_delim = string((VCHAR | HTAB | SP) - literal('"') -
                             literal(';') - literal(','))

# This has been slightly adapted to the rules of RFC 7230.
# The ``OWS`` are derived from the "implied ``*LWS``" requirement.

# The first alternative is wrapped in ``literal``, presumably so that the
# ``|`` chain starts from a parser object rather than a plain ``str``
# -- confirm against the parser module.
ptokenchar = (literal('!') | '#' | '$' | '%' | '&' | "'" | '(' | ')' |
              '*' | '+' | '-' | '.' | '/' | DIGIT | ':' | '<' | '=' | '>' |
              '?' | '@' | ALPHA | '[' | ']' | '^' | '_' | '`' | '{' | '|' |
              '}' | '~') > auto
ptoken = string1(ptokenchar) > auto

# ``+`` binds tighter than ``<<``, so ``MediaType`` receives the whole
# ``type "/" subtype`` expression.
media_type = MediaType << type_name + '/' + subtype_name > pivot
# -*- coding: utf-8; -*- from httpolice.citation import RFC from httpolice.parse import auto, fill_names, pivot, string, string_times from httpolice.structure import CaseInsensitive from httpolice.syntax.common import ALPHA, DIGIT alphanum = ALPHA | DIGIT > auto language_range = CaseInsensitive << (string_times( 1, 8, ALPHA) + string('-' + string_times(1, 8, alphanum)) | '*') > pivot fill_names(globals(), RFC(4647))
# NOTE(review): the next line is the tail of a definition whose header
# lies outside this view; it is kept verbatim and not documented here.
if name == u'rev': complain(1226) if u'rel' not in seen: complain(1309) return MultiDict(r)

# A case-insensitive ``token`` name, optionally followed by
# ``"=" BWS value``, where the value is a ``token`` or ``quoted_string``.
link_param = ((CaseInsensitive << token) * skip(BWS) * maybe(
    skip(literal('=') * BWS) *
    (mark(token) | mark(quoted_string)))) > pivot

# ``"<" URI-Reference ">"`` followed by ``OWS ";" OWS``-separated
# parameters, which are post-processed by ``_process_params``.
link_value = Parametrized << (
    skip('<') * URI_Reference * skip('>') *
    (_process_params << many(skip(OWS * ';' * OWS) * link_param))) > pivot

Link = comma_list(link_value) > pivot

anchor = URI_Reference > auto

reg_rel_type = CaseInsensitive << (LOALPHA +
                                   string(LOALPHA | DIGIT | '.' | '-')) > auto
ext_rel_type = URI > auto
relation_type = reg_rel_type | ext_rel_type > pivot

# NOTE(review): the meaning of ``%`` is defined by the parser module,
# not visible here; ``rel`` and ``rev`` are bound to the same object.
rel = rev = relation_type % many(skip(string1(SP)) * relation_type) > auto

hreflang = Language_Tag > auto

type_ = check_media_type << (
    MediaType << type_name + '/' + subtype_name) > auto

# Called with this module's globals and the RFC 8288 citation.
fill_names(globals(), RFC(8288))
# -*- coding: utf-8; -*- from httpolice.citation import RFC from httpolice.parse import (auto, empty, fill_names, literal, maybe_str, octet_range, pivot, string, string1, string_times, subst) from httpolice.syntax.common import ALPHA, DIGIT, HEXDIG pct_encoded = '%' + HEXDIG + HEXDIG > auto sub_delims = (literal('!') | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=') > auto unreserved = ALPHA | DIGIT | '-' | '.' | '_' | '~' > auto pchar = unreserved | sub_delims | ':' | '@' | pct_encoded > auto segment = string(pchar) > auto segment_nz = string1(pchar) > auto segment_nz_nc = string1(unreserved | sub_delims | '@' | pct_encoded) > auto scheme = ALPHA + string(ALPHA | DIGIT | '+' | '-' | '.') > pivot userinfo = string(unreserved | sub_delims | ':' | pct_encoded) > pivot dec_octet = (DIGIT | octet_range(0x31, 0x39) + DIGIT | '1' + DIGIT + DIGIT | '2' + octet_range(0x30, 0x34) + DIGIT | '25' + octet_range(0x30, 0x35)) > auto IPv4address = (dec_octet + '.' + dec_octet + '.' + dec_octet + '.' + dec_octet) > pivot h16 = string_times(1, 4, HEXDIG) > auto ls32 = (h16 + ':' + h16) | IPv4address > auto IPv6address = (
# -*- coding: utf-8; -*- from httpolice.citation import RFC from httpolice.parse import auto, fill_names, pivot, string, string_times from httpolice.structure import CaseInsensitive from httpolice.syntax.common import ALPHA, DIGIT alphanum = ALPHA | DIGIT > auto language_range = CaseInsensitive << ( string_times(1, 8, ALPHA) + string('-' + string_times(1, 8, alphanum)) | '*') > pivot fill_names(globals(), RFC(4647))
# NOTE(review): the next line completes an expression that starts
# outside this view; it is kept verbatim.
'zh-min-nan' | 'zh-xiang') > auto

grandfathered = irregular | regular > pivot

# NOTE(review): ``string_times(a, b, x)`` presumably means "between a and
# b repetitions of x", and ``maybe_str`` an optional part -- confirm in
# the parser module.

# "x" followed by one or more ``-alphanum`` groups.
privateuse = 'x' + string1('-' + string_times(1, 8, alphanum)) > pivot

extlang = (string_times(3, 3, ALPHA) +
           string_times(0, 2, '-' + string_times(3, 3, ALPHA))) > pivot

# ``+`` binds tighter than ``|``: the optional ``-extlang`` belongs only
# to the first (2-3 letter) alternative.
language = (string_times(2, 3, ALPHA) + maybe_str('-' + extlang) |
            string_times(4, 4, ALPHA) |
            string_times(5, 8, ALPHA)) > pivot

script = string_times(4, 4, ALPHA) > pivot

region = string_times(2, 2, ALPHA) | string_times(3, 3, DIGIT) > pivot

variant = (string_times(5, 8, alphanum) |
           (DIGIT + string_times(3, 3, alphanum))) > pivot

extension = (singleton +
             string1('-' + string_times(2, 8, alphanum))) > pivot

# language, then optional script and region, any number of variants and
# extensions, then an optional private-use part, each introduced by "-".
langtag = (language + maybe_str('-' + script) + maybe_str('-' + region) +
           string('-' + variant) + string('-' + extension) +
           maybe_str('-' + privateuse)) > pivot

# Each alternative is wrapped into ``LanguageTag`` separately.
Language_Tag = (LanguageTag << langtag |
                LanguageTag << privateuse |
                LanguageTag << grandfathered) > pivot

# Called with this module's globals and the RFC 5646 citation.
fill_names(globals(), RFC(5646))
from urllib.parse import unquote_to_bytes as pct_decode from httpolice.citation import RFC from httpolice.parse import (auto, can_complain, fill_names, maybe, pivot, skip, string, string1) from httpolice.structure import CaseInsensitive, ExtValue from httpolice.syntax.common import ALPHA, DIGIT, HEXDIG from httpolice.syntax.rfc5646 import Language_Tag as language from httpolice.util.text import force_bytes attr_char = (ALPHA | DIGIT | '!' | '#' | '$' | '&' | '+' | '-' | '.' | '^' | '_' | '`' | '|' | '~') > auto parmname = string(attr_char) > pivot # We don't need to special-case "UTF-8", simplify. mime_charsetc = (ALPHA | DIGIT | '!' | '#' | '$' | '%' | '&' | '+' | '-' | '^' | '_' | '`' | '{' | '}' | '~') > auto mime_charset = string1(mime_charsetc) > auto charset = CaseInsensitive << mime_charset > pivot pct_encoded = '%' + HEXDIG + HEXDIG > auto value_chars = pct_decode << ( force_bytes << string(pct_encoded | attr_char)) > auto @can_complain def _check_ext_value(complain, val): if val.charset == u'UTF-8': try: val.value_bytes.decode(val.charset) except UnicodeError as e:
# NOTE(review): the next lines complete an expression that starts
# outside this view; they are kept as they are.
| 'i-enochian' | 'i-hak' | 'i-klingon' | 'i-lux' | 'i-mingo' | 'i-navajo' | 'i-pwn' | 'i-tao' | 'i-tay' | 'i-tsu' | 'sgn-BE-FR' | 'sgn-BE-NL' | 'sgn-CH-DE') > auto

# The first alternative is wrapped in ``literal``, presumably so that the
# ``|`` chain starts from a parser object rather than a plain ``str``.
regular = (literal('art-lojban') | 'cel-gaulish' | 'no-bok' | 'no-nyn' |
           'zh-guoyu' | 'zh-hakka' | 'zh-min' | 'zh-min-nan' |
           'zh-xiang') > auto

grandfathered = irregular | regular > pivot

# NOTE(review): ``string_times(a, b, x)`` presumably means "between a and
# b repetitions of x", and ``maybe_str`` an optional part -- confirm in
# the parser module.

# "x" followed by one or more ``-alphanum`` groups.
privateuse = 'x' + string1('-' + string_times(1, 8, alphanum)) > pivot

extlang = (string_times(3, 3, ALPHA) +
           string_times(0, 2, '-' + string_times(3, 3, ALPHA))) > pivot

# ``+`` binds tighter than ``|``: the optional ``-extlang`` belongs only
# to the first (2-3 letter) alternative.
language = (string_times(2, 3, ALPHA) + maybe_str('-' + extlang) |
            string_times(4, 4, ALPHA) |
            string_times(5, 8, ALPHA)) > pivot

script = string_times(4, 4, ALPHA) > pivot

region = string_times(2, 2, ALPHA) | string_times(3, 3, DIGIT) > pivot

variant = (string_times(5, 8, alphanum) |
           (DIGIT + string_times(3, 3, alphanum))) > pivot

extension = (singleton +
             string1('-' + string_times(2, 8, alphanum))) > pivot

# language, then optional script and region, any number of variants and
# extensions, then an optional private-use part, each introduced by "-".
langtag = (language + maybe_str('-' + script) + maybe_str('-' + region) +
           string('-' + variant) + string('-' + extension) +
           maybe_str('-' + privateuse)) > pivot

# Each alternative is wrapped into ``LanguageTag`` separately.
Language_Tag = (LanguageTag << langtag |
                LanguageTag << privateuse |
                LanguageTag << grandfathered) > pivot

# Called with this module's globals and the RFC 5646 citation.
fill_names(globals(), RFC(5646))
from httpolice.parse import (auto, can_complain, fill_names, maybe, octet,
                             octet_range, pivot, string, subst)
from httpolice.structure import EntityTag
from httpolice.syntax.common import DQUOTE
from httpolice.syntax.rfc7230 import comma_list1, obs_text
from httpolice.syntax.rfc7231 import HTTP_date


# Octets 0x57 0x2F (ASCII ``W/``), replaced with ``True`` by ``subst``.
weak = subst(True) << octet(0x57) * octet(0x2F) > auto
# 0x21, 0x23-0x7E or ``obs_text`` (so not 0x22, the double quote).
etagc = octet(0x21) | octet_range(0x23, 0x7E) | obs_text > auto


@can_complain
def _no_backslashes(complain, s):
    """Report notice 1119 if `s` contains a backslash; return `s` as is."""
    if u'\\' in s:
        complain(1119)
    return s


# ``+`` binds tighter than ``<<``, so the check receives the whole
# ``DQUOTE *etagc DQUOTE`` expression.
opaque_tag = _no_backslashes << DQUOTE + string(etagc) + DQUOTE > auto
# ``False`` is passed to ``maybe`` as a second argument, presumably the
# value used when ``weak`` is absent -- confirm in the parser module.
entity_tag = EntityTag << maybe(weak, False) * opaque_tag > pivot

ETag = entity_tag > pivot
Last_Modified = HTTP_date > pivot
If_Match = '*' | comma_list1(entity_tag) > pivot
If_None_Match = '*' | comma_list1(entity_tag) > pivot
If_Modified_Since = HTTP_date > pivot
If_Unmodified_Since = HTTP_date > pivot

# Called with this module's globals and the RFC 7232 citation.
fill_names(globals(), RFC(7232))
byte_range = first_byte_pos * skip('-') * last_byte_pos > auto

# A run of digits converted with ``int``.
complete_length = int << string1(DIGIT) > auto

# ``byte_range "/" (complete_length | "*")``; the asterisk yields ``None``.
byte_range_resp = (
    byte_range * skip('/') *
    (complete_length | subst(None) << literal('*'))
) > pivot

# ``"*/" complete_length``; the ``*/`` prefix yields ``None``.
unsatisfied_range = (
    (subst(None) << literal('*/')) * complete_length
) > pivot


@can_complain
def _well_formed2(complain, r):
    """Report notice 1148 when the byte range in `r` is inconsistent.

    ``r.range`` is unpacked as ``(bounds, complete)``; `bounds` is either
    ``None`` or ``(first, last)``. The notice is reported when
    ``last < first``, or when `complete` is known and ``complete <= last``.

    :return: `r`, unchanged.
    """
    span, total = r.range
    if span is None:
        # No bounds to check.
        return r
    start, end = span
    if end < start or (total is not None and total <= end):
        complain(1148)
    return r


byte_content_range = _well_formed2 << (
    ContentRange << (
        bytes_unit * skip(SP) * (byte_range_resp | unsatisfied_range)
    )
) > pivot

other_range_resp = string(CHAR) > pivot
other_content_range = ContentRange << (
    other_range_unit * skip(SP) * other_range_resp
) > pivot

Content_Range = byte_content_range | other_content_range > pivot

If_Range = entity_tag | HTTP_date > pivot

fill_names(globals(), RFC(7233))
# NOTE(review): the next two lines complete an import statement that
# starts outside this view; they are kept as they are.
    string,
    subst, )
from httpolice.structure import EntityTag
from httpolice.syntax.common import DQUOTE
from httpolice.syntax.rfc7230 import comma_list1, obs_text
from httpolice.syntax.rfc7231 import HTTP_date


# Octets 0x57 0x2F (ASCII ``W/``), replaced with ``True`` by ``subst``.
weak = subst(True) << octet(0x57) * octet(0x2F) > auto
# 0x21, 0x23-0x7E or ``obs_text`` (so not 0x22, the double quote).
etagc = octet(0x21) | octet_range(0x23, 0x7E) | obs_text > auto


@can_complain
def _no_backslashes(complain, s):
    """Report notice 1119 if `s` contains a backslash; return `s` as is."""
    if u'\\' in s:
        complain(1119)
    return s


# ``+`` binds tighter than ``<<``, so the check receives the whole
# ``DQUOTE *etagc DQUOTE`` expression.
opaque_tag = _no_backslashes << DQUOTE + string(etagc) + DQUOTE > auto
# ``False`` is passed to ``maybe`` as a second argument, presumably the
# value used when ``weak`` is absent -- confirm in the parser module.
entity_tag = EntityTag << maybe(weak, False) * opaque_tag > pivot

ETag = entity_tag > pivot
Last_Modified = HTTP_date > pivot
If_Match = '*' | comma_list1(entity_tag) > pivot
If_None_Match = '*' | comma_list1(entity_tag) > pivot
If_Modified_Since = HTTP_date > pivot
If_Unmodified_Since = HTTP_date > pivot

# Called with this module's globals and the RFC 7232 citation.
fill_names(globals(), RFC(7232))
def parmname__excluding(exclude):
    """Build a ``parmname`` rule (RFC 5987) with some strings excluded.

    :param exclude:
        list of strings to exclude; the empty string is always
        prepended to it before the call to ``string_excluding``.
    """
    return (string_excluding(attr_char, [''] + exclude) > named(
        u'parmname', RFC(5987), is_pivot=True))


# The plain rule: nothing excluded beyond the empty string.
parmname = parmname__excluding([])

# We don't need to special-case "UTF-8" and "ISO-8859-1", simplify.
mime_charsetc = (ALPHA | DIGIT | '!' | '#' | '$' | '%' | '&' | '+' | '-' |
                 '^' | '_' | '`' | '{' | '}' | '~') > auto
mime_charset = string1(mime_charsetc) > auto
charset = CaseInsensitive << mime_charset > pivot

pct_encoded = '%' + HEXDIG + HEXDIG > auto

# The matched text is passed to ``force_bytes`` and then percent-decoded.
value_chars = pct_decode << (
    force_bytes << string(pct_encoded | attr_char)) > auto


@can_complain
def _check_ext_value(complain, val):
    """Check the charset of `val` and that its bytes decode with it.

    For ``UTF-8`` and ``ISO-8859-1``, ``val.value_bytes`` is decoded and
    notice 1254 is reported on ``UnicodeError``. Any other charset is
    reported with notice 1253.

    :return: `val`, unchanged.
    """
    if val.charset in [u'UTF-8', u'ISO-8859-1']:
        try:
            # Decoded only to see whether it fails; the result is unused.
            val.value_bytes.decode(val.charset)
        except UnicodeError as e:
            complain(1254, charset=val.charset, error=e)
    else:
        complain(1253, charset=val.charset)
    return val


# NOTE(review): this definition continues past the visible text.
ext_value = _check_ext_value << (
# -*- coding: utf-8; -*- from httpolice.citation import RFC from httpolice.parse import (auto, empty, fill_names, literal, maybe_str, octet_range, pivot, string, string1, string_times, subst) from httpolice.syntax.common import ALPHA, DIGIT, HEXDIG pct_encoded = '%' + HEXDIG + HEXDIG > auto sub_delims = (literal('!') | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=') > auto unreserved = ALPHA | DIGIT | '-' | '.' | '_' | '~' > auto pchar = unreserved | sub_delims | ':' | '@' | pct_encoded > auto segment = string(pchar) > auto segment_nz = string1(pchar) > auto segment_nz_nc = string1(unreserved | sub_delims | '@' | pct_encoded) > auto scheme = ALPHA + string(ALPHA | DIGIT | '+' | '-' | '.') > pivot userinfo = string(unreserved | sub_delims | ':' | pct_encoded) > pivot dec_octet = (DIGIT | octet_range(0x31, 0x39) + DIGIT | '1' + DIGIT + DIGIT | '2' + octet_range(0x30, 0x34) + DIGIT | '25' + octet_range(0x30, 0x35)) > auto IPv4address = (dec_octet + '.' + dec_octet + '.' + dec_octet + '.' + dec_octet) > pivot h16 = string_times(1, 4, HEXDIG) > auto ls32 = (h16 + ':' + h16) | IPv4address > auto IPv6address = ( string_times(6, 6, h16 + ':') + ls32 | '::' + string_times(5, 5, h16 + ':') + ls32 | maybe_str(h16) + '::' + string_times(4, 4, h16 + ':') + ls32
def parmname__excluding(exclude):
    """Build a ``parmname`` rule (RFC 5987) with some strings excluded.

    :param exclude:
        list of strings to exclude; the empty string is always
        prepended to it before the call to ``string_excluding``.
    """
    return (string_excluding(attr_char, [''] + exclude)
            > named(u'parmname', RFC(5987), is_pivot=True))


# The plain rule: nothing excluded beyond the empty string.
parmname = parmname__excluding([])

# We don't need to special-case "UTF-8" and "ISO-8859-1", simplify.
mime_charsetc = (ALPHA | DIGIT | '!' | '#' | '$' | '%' | '&' | '+' | '-' |
                 '^' | '_' | '`' | '{' | '}' | '~') > auto
mime_charset = string1(mime_charsetc) > auto
charset = CaseInsensitive << mime_charset > pivot

pct_encoded = '%' + HEXDIG + HEXDIG > auto

# The matched text is passed to ``force_bytes`` and then percent-decoded.
value_chars = pct_decode << (
    force_bytes << string(pct_encoded | attr_char)) > auto


@can_complain
def _check_ext_value(complain, val):
    """Check the charset of `val` and that its bytes decode with it.

    For ``UTF-8`` and ``ISO-8859-1``, ``val.value_bytes`` is decoded and
    notice 1254 is reported on ``UnicodeError``. Any other charset is
    reported with notice 1253.

    :return: `val`, unchanged.
    """
    if val.charset in [u'UTF-8', u'ISO-8859-1']:
        try:
            # Decoded only to see whether it fails; the result is unused.
            val.value_bytes.decode(val.charset)
        except UnicodeError as e:
            complain(1254, charset=val.charset, error=e)
    else:
        complain(1253, charset=val.charset)
    return val


# NOTE(review): this definition continues past the visible text.
ext_value = _check_ext_value << (
    ExtValue << (charset * skip("'") * maybe(language) * skip("'") *
# NOTE(review): the next line is the tail of a definition whose header
# lies outside this view; it is kept verbatim and not documented here.
complain(1309) return MultiDict(r)

# A case-insensitive ``token`` name, optionally followed by
# ``"=" BWS value``, where the value is a ``token`` or ``quoted_string``.
link_param = (
    (CaseInsensitive << token) * skip(BWS) *
    maybe(skip(literal('=') * BWS) * (mark(token) | mark(quoted_string)))) > pivot

# ``"<" URI-Reference ">"`` followed by ``OWS ";" OWS``-separated
# parameters, which are post-processed by ``_process_params``.
link_value = Parametrized << (
    skip('<') * URI_Reference * skip('>') *
    (_process_params << many(skip(OWS * ';' * OWS) * link_param))) > pivot

Link = comma_list(link_value) > pivot

anchor = URI_Reference > auto

reg_rel_type = CaseInsensitive << (
    LOALPHA + string(LOALPHA | DIGIT | '.' | '-')) > auto
ext_rel_type = URI > auto
relation_type = reg_rel_type | ext_rel_type > pivot

# NOTE(review): the meaning of ``%`` is defined by the parser module,
# not visible here; ``rel`` and ``rev`` are bound to the same object.
rel = rev = relation_type % many(skip(string1(SP)) * relation_type) > auto

hreflang = Language_Tag > auto

type_ = check_media_type << (
    MediaType << type_name + '/' + subtype_name) > auto

# Called with this module's globals and the RFC 8288 citation.
fill_names(globals(), RFC(8288))