def urls_from_urlset_or_sitemapindex(response):
    """ Yields URLs from ``<urlset>`` or ``<sitemapindex>`` elements as per
        `sitemaps.org <http://www.sitemaps.org/protocol.html>`_.

        Yields ``("url", URL)`` tuples.  Sitemap-index entries get
        ``sitemap=True`` set in their URL fragment so downstream
        processing can recognise them.
    """
    # was this response itself fetched because it was flagged as a sitemap?
    sitemap = URL(response.url).fragment_dict.get('sitemap')
    content_subtypes = response.headers.get_content_subtype().split('+')
    # Only parse when the URL was flagged as a sitemap or the content is XML.
    # (idiomatic ``'xml' not in`` rather than ``not 'xml' in``)
    if not sitemap and 'xml' not in content_subtypes:
        return

    root = None
    for _, elem in iterparse(decode(response)):
        if root is None:
            root = elem.getroottree().getroot()
            if not (root.tag.endswith('}sitemapindex') or
                    root.tag.endswith('}urlset')):
                # root element has wrong tag - give up
                break
        if elem.tag.endswith('}loc') and elem.text is not None:
            text = elem.text.strip()
            if text:
                # http://www.sitemaps.org/protocol.html#locdef
                url = URL(urljoin(response.url, text))
                if elem.getparent().tag.endswith('}sitemap'):
                    # set sitemap=True to help downstream processing
                    url = url.update_fragment_dict(sitemap=True)
                yield "url", url
        if elem.getparent() is root:
            # release memory for previous elements
            while elem.getprevious() is not None:
                del root[0]
def urls_from_robots_txt(response):
    """ Yields sitemap URLs declared in a "/robots.txt" response.

        Each result is a ``("url", URL)`` tuple with ``sitemap=True`` set
        in the URL fragment for downstream processing.
    """
    url = URL(response.request_url or response.url or '')
    if url.parsed.path != '/robots.txt':
        # not a robots.txt response - nothing to yield
        return

    charset = response.headers.get_content_charset()
    # robots.txt defaults to Latin-1 when no charset is declared
    reader = getreader(charset or 'ISO-8859-1')
    for line in reader(response):
        directive, _, _comment = line.partition('#')
        name, _, value = directive.partition(':')
        if name.strip().lower() == 'sitemap':
            # we shouldn't need to urljoin but we do just in case
            sitemap_url = URL(urljoin(response.url, value.strip()))
            # set sitemap=True in fragment to help downstream processing
            yield "url", sitemap_url.update_fragment_dict(sitemap=True)
def urls_from_urlset_or_sitemapindex(response):
    """ Yields URLs from ``<urlset>`` or ``<sitemapindex>`` elements as per
        `sitemaps.org <http://www.sitemaps.org/protocol.html>`_.

        Malformed XML is logged and swallowed rather than raised.
    """
    flagged_as_sitemap = URL(response.url).fragment_dict.get('sitemap')
    subtypes = response.headers.get_content_subtype().split('+')
    # skip non-XML content unless the URL was explicitly flagged as a sitemap
    if not flagged_as_sitemap and 'xml' not in subtypes:
        return

    root = None
    try:
        for _, elem in iterparse(decode(response)):
            if root is None:
                root = elem.getroottree().getroot()
                root_tag = root.tag
                if not (root_tag.endswith('}sitemapindex') or
                        root_tag.endswith('}urlset')):
                    # root element has wrong tag - give up
                    break
            if elem.tag.endswith('}loc') and elem.text is not None:
                loc = elem.text.strip()
                if loc:
                    # http://www.sitemaps.org/protocol.html#locdef
                    url = URL(urljoin(response.url, loc))
                    if elem.getparent().tag.endswith('}sitemap'):
                        # set sitemap=True to help downstream processing
                        url = url.update_fragment_dict(sitemap=True)
                    yield "url", url
            if elem.getparent() is root:
                # release memory for previous elements
                while elem.getprevious() is not None:
                    del root[0]
    except XMLSyntaxError:
        log.debug("XMLSyntaxError in '%s' (%d)", response.url, response.code)
def run(**kw):
    """ Submits the httpbin.org demo form via a ``form`` method spec and
        checks the echoed POST data matches what was submitted.
    """
    url = URL('http://httpbin.org/forms/post')
    custname = 'Giles'
    toppings = ('bacon', 'onion')
    comments = 'Using CSS selector'
    form_fields = [
        ('custname', custname),
        ('topping', toppings),
        ('textarea', comments),
    ]
    url = url.update_fragment_dict(method={'form': {'form': form_fields}})
    responses = [Response.from_readable(readable) for readable in url.get(**kw)]
    # we should have GET and then POST
    assert len(responses) == 2
    data = json.loads(responses[1].read().decode('utf-8'))
    assert set(data['form']) == {'comments', 'custname', 'topping'}
    assert data['form']['custname'] == custname
    assert data['form']['topping'] == list(toppings)
    assert data['form']['comments'] == comments
from subprocess import check_output, CalledProcessError

from six.moves import map
import pytest

from wex.response import Response
from wex.etree import parse
from wex.url import URL

from httpproxy import HttpProxy, skipif_travis_ci

# Fetch http://httpbin.org/html through phantomjs, injecting js/bcr.js.
url = URL("http://httpbin.org/html")
method = {"phantomjs": {"requires": [["wex", "js/bcr.js"]]}}
url = url.update_fragment_dict(method=method)

try:
    version = check_output(["phantomjs", "--version"])
except (CalledProcessError, OSError):
    # OSError (e.g. FileNotFoundError) means the phantomjs binary is not
    # installed at all; previously only CalledProcessError was caught, so a
    # missing binary crashed test collection instead of skipping the test.
    version_info = (0, 0, 0)
else:
    # e.g. b"2.1.1\n" -> (2, 1, 1); int() tolerates the trailing newline
    version_info = tuple(map(int, version.split(b".")))

old_phantomjs_version = pytest.mark.skipif(
    version_info < (2, 0, 0),
    reason="phantomjs version too old",  # fixed typo: "to old"
)


@old_phantomjs_version
def test_phantomjs():
    elements = []
    context = {"foo": "bar"}
    for response in map(Response.from_readable, url.get(context=context)):
        tree = parse(response)
        elements.extend(tree.xpath("//h1"))
        # context values are echoed back as X-wex-context-* headers
        assert response.headers.get("X-wex-context-foo") == "bar"
    assert len(elements) == 1