Example #1
def urls_from_urlset_or_sitemapindex(response):
    """ Yields URLs from ``<urlset>`` or ``<sitemapindex>`` elements as per 
        `sitemaps.org <http://www.sitemaps.org/protocol.html>`_.
    """

    sitemap = URL(response.url).fragment_dict.get('sitemap')
    content_subtypes = response.headers.get_content_subtype().split('+')
    if not sitemap and 'xml' not in content_subtypes:
        return

    root = None
    for _, elem in iterparse(decode(response)):

        if root is None:
            root = elem.getroottree().getroot()
            if not (root.tag.endswith('}sitemapindex') or
                    root.tag.endswith('}urlset')):
                # root element has wrong tag - give up
                break

        if elem.tag.endswith('}loc') and elem.text is not None:
            text = elem.text.strip()
            if text:
                # http://www.sitemaps.org/protocol.html#locdef
                url = URL(urljoin(response.url, text))
                if elem.getparent().tag.endswith('}sitemap'):
                    # set sitemap=True to help downstream processing
                    url = url.update_fragment_dict(sitemap=True)
                yield "url", url

        if elem.getparent() is root:
            # release memory for previous elements
            while elem.getprevious() is not None:
                del root[0]
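
The `del root[0]` idiom above keeps memory flat while streaming large sitemaps: once an element's end event has fired, its already-processed siblings are released. Here is a self-contained sketch of the same pattern with plain lxml, using a small inline sitemap made up for illustration (no wex required):

from io import BytesIO
from lxml.etree import iterparse

SITEMAP = b'''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>http://example.com/a</loc></url>
  <url><loc>http://example.com/b</loc></url>
</urlset>'''

root = None
for _, elem in iterparse(BytesIO(SITEMAP)):
    if root is None:
        root = elem.getroottree().getroot()
    if elem.tag.endswith('}loc') and elem.text:
        print(elem.text.strip())
    if elem.getparent() is root:
        # drop siblings that have already been processed
        while elem.getprevious() is not None:
            del root[0]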
Example #2
File: sitemaps.py Project: eBay/wextracto
def urls_from_robots_txt(response):
    """ Yields sitemap URLs from "/robots.txt" """

    url = URL(response.request_url or response.url or '')
    if url.parsed.path != '/robots.txt':
        return

    charset = response.headers.get_content_charset()
    lines = getreader(charset or 'ISO-8859-1')(response)

    for line in lines:

        content, _, comment = line.partition('#')
        field, _, value = content.partition(':')
        if field.strip().lower() != 'sitemap':
            continue

        # we shouldn't need to urljoin but we do just in case
        joined = URL(urljoin(response.url, value.strip()))
        # set sitemap=True in fragment to help downstream processing
        yield "url", joined.update_fragment_dict(sitemap=True)
Example #3
def urls_from_urlset_or_sitemapindex(response):
    """ Yields URLs from ``<urlset>`` or ``<sitemapindex>`` elements as per 
        `sitemaps.org <http://www.sitemaps.org/protocol.html>`_.
    """

    sitemap = URL(response.url).fragment_dict.get('sitemap')
    content_subtypes = response.headers.get_content_subtype().split('+')
    if not sitemap and 'xml' not in content_subtypes:
        return

    root = None
    try:
        for _, elem in iterparse(decode(response)):

            if root is None:
                root = elem.getroottree().getroot()
                if not (root.tag.endswith('}sitemapindex')
                        or root.tag.endswith('}urlset')):
                    # root element has wrong tag - give up
                    break

            if elem.tag.endswith('}loc') and elem.text is not None:
                text = elem.text.strip()
                if text:
                    # http://www.sitemaps.org/protocol.html#locdef
                    url = URL(urljoin(response.url, text))
                    if elem.getparent().tag.endswith('}sitemap'):
                        # set sitemap=True to help downstream processing
                        url = url.update_fragment_dict(sitemap=True)
                    yield "url", url

            if elem.getparent() is root:
                # release memory for previous elements
                while elem.getprevious() is not None:
                    del root[0]

    except XMLSyntaxError:
        log.debug("XMLSyntaxError in '%s' (%d)", response.url, response.code)
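
This variant improves on Example #1 by wrapping iteration in `try`/`except`: since the function is a generator, a syntax error anywhere in the stream ends the yield sequence quietly instead of propagating to the caller. A minimal demonstration of the failure mode it guards against (plain lxml; the truncated XML is made up for the example):

from io import BytesIO
from lxml.etree import iterparse, XMLSyntaxError

try:
    for _, elem in iterparse(BytesIO(b"<urlset><loc>never closed")):
        pass
except XMLSyntaxError as exc:
    print("gave up on malformed sitemap:", exc)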
Example #4
def run(**kw):
    url = URL('http://httpbin.org/forms/post')
    custname = 'Giles'
    toppings = ('bacon', 'onion')
    comments = 'Using CSS selector'
    method = {
        'form': {
            'form': [
                ('custname', custname),
                ('topping', toppings),
                ('textarea', comments),
            ]
        }
    }
    url = url.update_fragment_dict(method=method)
    responses = list(map(Response.from_readable, url.get(**kw)))
    # we should have GET and then POST
    assert len(responses) == 2
    data = json.loads(responses[1].read().decode('utf-8'))
    assert (set(data['form'].keys()) ==
            set(['comments', 'custname', 'topping']))
    assert data['form']['custname'] == custname
    assert data['form']['topping'] == list(toppings)
    assert data['form']['comments'] == comments
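
The nested `method` dict travels inside the URL fragment, which is how `url.get()` later knows to fetch the form and submit it. A minimal round-trip sketch using the same `URL` API seen throughout these examples (assumes wex is installed; the exact fragment encoding is wex's own):

from wex.url import URL

url = URL('http://httpbin.org/forms/post')
url = url.update_fragment_dict(method={'form': {'form': [('custname', 'Giles')]}})
print(url)                # the fragment now carries the method spec
print(url.fragment_dict)  # reads the dict back for downstream code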
Example #5
from subprocess import check_output, CalledProcessError
from six.moves import map
import pytest
from wex.response import Response
from wex.etree import parse
from wex.url import URL
from httpproxy import HttpProxy, skipif_travis_ci

url = URL("http://httpbin.org/html")
method = {"phantomjs": {"requires": [["wex", "js/bcr.js"]]}}
url = url.update_fragment_dict(method=method)

try:
    version = check_output(["phantomjs", "--version"])
except (OSError, CalledProcessError):
    # phantomjs missing from PATH or exiting non-zero both mean "no usable version"
    version_info = (0, 0, 0)
else:
    version_info = tuple(map(int, version.split(b".")))

old_phantomjs_version = pytest.mark.skipif(version_info < (2, 0, 0),
                                           reason="phantomjs version too old")


@old_phantomjs_version
def test_phantomjs():
    elements = []
    context = {"foo": "bar"}
    for response in map(Response.from_readable, url.get(context=context)):
        tree = parse(response)
        elements.extend(tree.xpath("//h1"))
        assert response.headers.get("X-wex-context-foo") == "bar"
    assert len(elements) == 1
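
The `skipif` marker built above is reusable across tests, and the same gating pattern works for any external dependency. A minimal self-contained sketch (the names here are illustrative, not part of wex):

import pytest

have_tool = False  # in practice, probe for the binary as in the example above
needs_tool = pytest.mark.skipif(not have_tool, reason="tool not available")

@needs_tool
def test_requires_external_tool():
    assert True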