Example #1
def __call__(self, values):
    # Coerce each value to an int, decoding HTML entities in strings first.
    new_values = []
    for v in arg_to_iter(values):
        if isinstance(v, (str, unicode)):
            v = remove_entities(v).strip()
        new_values.append(int(v))
    return new_values
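A minimal usage sketch (hypothetical: assume the method above belongs to a processor class named IntProcessor, and that arg_to_iter/remove_entities come from the old Scrapy utility modules):

proc = IntProcessor()
proc(u'&#52;2')     # entity decoded to u'42' first -> [42]
proc(['3', 4.0])    # mixed input -> [3, 4]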
Example #2
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace,
    
    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'
    
    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'
    
    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'
    
    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'
    
    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'
    """
    chunks = _process_markup(
        region,
        lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
        lambda tag: u' ')
    text = u''.join(chunks)
    return _WS.sub(u' ', text).strip()
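The module-level _WS used in the last line is not shown in this snippet; a plausible definition, consistent with how it is used to collapse whitespace runs (an assumption, not the verbatim scrapely source):

import re
_WS = re.compile(u'\s+')  # runs of whitespace collapse to a single space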
Example #3
def mklink(url, anchortext=None, nofollow=False):
    url = url.strip()
    fullurl = urljoin(base_href,
                      remove_entities(url, encoding=htmlpage.encoding))
    return Link(fullurl.encode(htmlpage.encoding),
                text=anchortext,
                nofollow=nofollow)
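mklink is a closure over base_href and htmlpage defined in the enclosing iterlinks() (shown in a later example). A standalone sketch of what it computes, with hypothetical values for those closure variables:

from urlparse import urljoin  # Python 2

base_href = u'http://example.com/section/'
encoding = 'utf-8'
fullurl = urljoin(base_href, u' page.html '.strip())
print fullurl.encode(encoding)  # -> http://example.com/section/page.html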
Example #4
def __call__(self, values):
    # Map each value to a boolean; strings count as True only if they
    # spell "true" (case-insensitive) after entity decoding.
    new_values = []
    for v in arg_to_iter(values):
        if isinstance(v, (str, unicode)):
            v = remove_entities(v).strip()
            v = v.lower() == "true"
        else:
            v = bool(v)
        new_values.append(v)
    return new_values
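A quick usage sketch (hypothetical BoolProcessor wrapper class for the method above):

proc = BoolProcessor()
proc(u'True')            # -> [True]
proc([u'false', 1, 0])   # -> [False, True, False]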
Example #5
File: regex.py  Project: serkanh/scrapy
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = self.base_url if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
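The module-level linkre is not shown; a hypothetical stand-in consistent with the unpacking above (it must yield 3-tuples so `for url, _, text in links_text` works; the real pattern in scrapy's regex.py may differ):

import re
linkre = re.compile(
    r"<a\s+[^>]*?href=([\"'][^\"']*[\"']|[^\s>]+)([^>]*)>(.*?)</a>",
    re.IGNORECASE | re.DOTALL)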
Example #6
def image_url(txt):
    """convert text to a url
    
    this is quite conservative, since relative urls are supported
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
Example #7
File: misc.py  Project: kenzouyeh/scrapy
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group, the entire regex match is returned
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except (AttributeError, IndexError):  # no match, or no 'extract' group
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, unicode):
        return [remove_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [remove_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
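A minimal usage sketch (Python 2, where unicode/basestring exist; flatten comes from scrapy's utilities):

extract_regex(r'Price: (?P<extract>[\d.]+)', u'Price: 9.99 &amp; up')
# -> [u'9.99']                (named group wins)
extract_regex(r'(\w+)=(\w+)', u'a=1 b=2')
# -> [u'a', u'1', u'b', u'2'] (numbered groups, flattened)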
示例#12
0
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text))
                        for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Example #8
def get_meta_refresh(response):
    """Parse the http-equiv parameter of the HTML meta element from the given
    response and return a tuple (interval, url) where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, (None, None) is returned.
    """
    if response not in _metaref_cache:
        body_chunk = remove_comments(remove_entities(response.body_as_unicode()[0:4096]))
        match = META_REFRESH_RE.search(body_chunk)
        if match:
            interval = float(match.group('int'))
            url = safe_url_string(match.group('url').strip(' "\''))
            url = urljoin_rfc(response.url, url)
            _metaref_cache[response] = (interval, url)
        else:
            _metaref_cache[response] = (None, None)
    return _metaref_cache[response]
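META_REFRESH_RE is defined elsewhere in the module; a plausible shape, consistent with the 'int' and 'url' groups used above (an assumption, not the verbatim scrapy pattern):

import re
META_REFRESH_RE = re.compile(
    r'<meta[^>]*?http-equiv\s*=\s*["\']?refresh["\']?[^>]*?'
    r'content\s*=\s*["\']?(?P<int>\d+(?:\.\d+)?)\s*;\s*url=(?P<url>[^"\'>]+)',
    re.IGNORECASE | re.DOTALL)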
Example #9
def get_meta_refresh(response):
    """Parse the http-equiv parameter of the HTML meta element from the given
    response and return a tuple (interval, url) where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, (None, None) is returned.
    """
    if response not in _metaref_cache:
        body_chunk = remove_comments(remove_entities(response.body_as_unicode()[0:4096]))
        for match1 in META_TAG_RE.finditer(body_chunk):
            params = {}
            for match2 in META_TAG_ATTRS_RE.finditer(match1.group(1)):
                params[match2.group("key")] = match2.group("value")
            if params.get("http-equiv") == "refresh":
                match = META_CONTENT_RE.search(params.get("content", ""))
                if match:
                    interval = float(match.group("int"))
                    url = urljoin_rfc(response.url, safe_url_string((match.group("url") or "").strip(' "\'')))
                    _metaref_cache[response] = (interval, url)
                    return (interval, url)
        _metaref_cache[response] = (None, None)
    return _metaref_cache[response]
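This variant relies on three module-level patterns; plausible shapes consistent with how they are used above (assumptions, not the verbatim definitions):

import re
META_TAG_RE = re.compile(r'<meta\s(.*?)>', re.DOTALL | re.IGNORECASE)
META_TAG_ATTRS_RE = re.compile(
    r'(?P<key>[\w-]+)\s*=\s*["\']?(?P<value>[^"\'\s>]*)', re.IGNORECASE)
META_CONTENT_RE = re.compile(
    r'(?P<int>\d+(?:\.\d+)?)\s*;\s*url=(?P<url>.*)', re.IGNORECASE)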
Example #10
def __call__(self, values):
    # Decode HTML entities in every value.
    return [remove_entities(v) for v in arg_to_iter(values)]
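Usage sketch (hypothetical RemoveEntities wrapper class for the one-liner above):

proc = RemoveEntities()
proc(u'only &pound;42')     # -> [u'only \xa342']
proc([u'&amp;', u'plain'])  # -> [u'&', u'plain']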
Example #11
def iterlinks(htmlpage):
    """Iterate through the links in the HtmlPage passed

    For example:
    >>> from scrapely.htmlpage import HtmlPage
    >>> p = HtmlPage(body=u"Please visit <a href='http://scrapinghub.com/'>Scrapinghub</a>")
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/', text=u'Scrapinghub', fragment='', nofollow=False)
    >>> p = HtmlPage(body=u"Go <a href='home.html'>Home</a>")
    >>> iterlinks(p).next()
    Link(url='home.html', text=u'Home', fragment='', nofollow=False)
    
    When a url is specified, absolute urls are made:
    >>> p.url = 'http://scrapinghub.com/'
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/home.html', text=u'Home', fragment='', nofollow=False)

    Base href attributes in the page are respected
    >>> p.body = u"<html><head><base href='myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/myproject/index.html', text=u'project', fragment='', nofollow=False)
    >>> p.body = u"<html><head><base href='http://scrape.io/myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrape.io/myproject/index.html', text=u'project', fragment='', nofollow=False)

    Frameset and iframe urls are extracted
    >>> p = HtmlPage(body=u"<html><frameset><frame src=frame1.html><frame src=frame2.html></frameset><iframe src='iframe.html'/></html>")
    >>> [l.url for l in iterlinks(p)]
    ['frame1.html', 'frame2.html', 'iframe.html']
    
    As are meta refresh tags:
    >>> p = HtmlPage(body=u"<html><head><meta http-equiv='refresh' content='5;url=http://example.com/' />")
    >>> iterlinks(p).next().url
    'http://example.com/'
    
    nofollow is set to True if the link has a rel='nofollow' attribute:
    >>> p = HtmlPage(body=u"<a href='somewhere.html' rel='nofollow'>somewhere</a>")
    >>> list(iterlinks(p))
    [Link(url='somewhere.html', text=u'somewhere', fragment='', nofollow=True)]
    
    It does not require well-formed HTML and behaves similarly to many browsers
    >>> p = HtmlPage(body=u"<a href='foo'>foo <a href=bar>bar</a><a href='baz'/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo', text=u'foo ', fragment='', nofollow=False), Link(url='bar', text=u'bar', fragment='', nofollow=False), Link(url='baz', text=u'baz', fragment='', nofollow=False)]

    Leading and trailing whitespace should be removed, including in base href
    >>> p = HtmlPage(body=u"<head><base href=' foo/ '/></head><a href='bar '/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo/bar', text=u'baz', fragment='', nofollow=False)]

    Test standard onclick links
    >>> p = HtmlPage(url="http://www.example.com", body=u"<html><td onclick=window.open('page.html?productid=23','win2') >")
    >>> list(iterlinks(p))
    [Link(url='http://www.example.com/page.html?productid=23', text=None, fragment='', nofollow=False)]

    >>> p = HtmlPage("http://www.example.com", body=u"<html><a href='#' onclick=window.open('page.html?productid=24','win2') >")
    >>> list(iterlinks(p))
    [Link(url='http://www.example.com/page.html?productid=24', text=None, fragment='', nofollow=False)]

    >>> p = HtmlPage(body=u"<html><div onclick=window.location.href='http://www.jungleberry.co.uk/Fair-Trade-Earrings/Aguas-Earrings.htm'>")
    >>> list(iterlinks(p))
    [Link(url='http://www.jungleberry.co.uk/Fair-Trade-Earrings/Aguas-Earrings.htm', text=None, fragment='', nofollow=False)]

    Onclick with no href
    >>> p = HtmlPage("http://www.example.com", body=u"<html><a onclick=window.open('page.html?productid=24','win2') >")
    >>> list(iterlinks(p))
    [Link(url='http://www.example.com/page.html?productid=24', text=None, fragment='', nofollow=False)]

    Don't generate a link when the target is an anchor
    >>> p = HtmlPage("http://www.example.com", body=u"<html><a href='#section1' >")
    >>> list(iterlinks(p))
    []

    Extract links from <link> tags in page header
    >>> p = HtmlPage("http://example.blogspot.com/", body=u"<html><head><link rel='me' href='http://www.blogger.com/profile/987372' /></head><body>This is my body!</body></html>")
    >>> list(iterlinks(p))
    [Link(url='http://www.blogger.com/profile/987372', text=None, fragment='', nofollow=False)]
    """
    base_href = remove_entities(htmlpage.url, encoding=htmlpage.encoding)

    def mklink(url, anchortext=None, nofollow=False):
        url = url.strip()
        fullurl = urljoin(base_href,
                          remove_entities(url, encoding=htmlpage.encoding))
        return Link(fullurl.encode(htmlpage.encoding),
                    text=anchortext,
                    nofollow=nofollow)

    # iter to quickly scan only tags
    tag_iter = (t for t in htmlpage.parsed_body if isinstance(t, HtmlTag))

    # parse body
    astart = ahref = None
    nofollow = False
    for nexttag in tag_iter:
        tagname = nexttag.tag
        attributes = nexttag.attributes
        if tagname == 'a' and (nexttag.tag_type == HtmlTagType.CLOSE_TAG or attributes.get('href') \
                    and not attributes.get('href', '').startswith('#')):
            if astart:
                yield mklink(ahref, htmlpage.body[astart:nexttag.start],
                             nofollow)
                astart = ahref = None
                nofollow = False
            href = attributes.get('href')
            if href:
                ahref = href
                astart = nexttag.end
                nofollow = attributes.get('rel') == u'nofollow'
        elif tagname == 'head':
            # scan ahead until end of head section
            for nexttag in tag_iter:
                tagname = nexttag.tag
                if (tagname == 'head' and \
                        nexttag.tag_type == HtmlTagType.CLOSE_TAG) or \
                        tagname == 'body':
                    break
                if tagname == 'base':
                    href = nexttag.attributes.get('href')
                    if href:
                        joined_base = urljoin(htmlpage.url, href.strip(),
                                              htmlpage.encoding)
                        base_href = remove_entities(joined_base,
                                                    encoding=htmlpage.encoding)
                elif tagname == 'meta':
                    attrs = nexttag.attributes
                    if attrs.get('http-equiv') == 'refresh':
                        m = _META_REFRESH_CONTENT_RE.search(
                            attrs.get('content', ''))
                        if m:
                            target = m.group('url')
                            if target:
                                yield mklink(target)
                elif tagname == 'link':
                    href = nexttag.attributes.get('href')
                    if href:
                        yield mklink(href)
        elif tagname == 'area':
            href = attributes.get('href')
            if href:
                nofollow = attributes.get('rel') == u'nofollow'
                yield mklink(href, attributes.get('alt', ''), nofollow)
        elif tagname in ('frame', 'iframe'):
            target = attributes.get('src')
            if target:
                yield mklink(target)
        elif 'onclick' in attributes:
            match = _ONCLICK_LINK_RE.search(attributes["onclick"] or "")
            if not match:
                continue
            target = match.group("url")
            nofollow = attributes.get('rel') == u'nofollow'
            yield mklink(target, nofollow=nofollow)

    if astart:
        yield mklink(ahref, htmlpage.body[astart:])
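iterlinks() references two module-level patterns; plausible shapes consistent with the doctests above (assumptions, not the verbatim scrapely source):

import re
_META_REFRESH_CONTENT_RE = re.compile(
    r'\d+(?:\.\d+)?\s*;\s*url=(?P<url>.+)', re.IGNORECASE)
_ONCLICK_LINK_RE = re.compile(
    r"(?:window\.open|window\.location\.href\s*=)\s*\(?\s*['\"](?P<url>[^'\"]+)['\"]",
    re.IGNORECASE)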
Example #12
    def test_remove_entities(self):
        # make sure it always returns unicode
        assert isinstance(remove_entities('no entities'), unicode)
        assert isinstance(remove_entities('Price: &pound;100!'), unicode)

        # regular conversions
        self.assertEqual(remove_entities(u'As low as &#163;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('As low as &pound;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
        # keep some entities
        self.assertEqual(remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

        # illegal entities
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                         u'a < b &illegal; c &#12345678; six')
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                         u'a < b  c  six')
        self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')

        # encoding
        self.assertEqual(remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'), \
                         u'x\u2122\u2122\u2122y')
Example #13
def image_url(txt):
    """convert text to a url
    
    this is quite conservative, since relative urls are supported
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    txt = url(txt)
    imgurl = None
    if txt:
        # check if the text is style content
        m = _CSS_IMAGERE.search(txt)
        txt = m.groups()[0] if m else txt
        parsed = urlparse.urlparse(txt)
        path = None
        m = _IMAGE_PATH_RE.search(parsed.path)
        if m:
            path = m.group()
        elif parsed.query:
            m = _GENERIC_PATH_RE.search(parsed.path)
            if m:
                path = m.group()
        if path is not None:
            parsed = list(parsed)
            parsed[2] = path
            imgurl = urlparse.urlunparse(parsed)
        if not imgurl:
            imgurl = txt
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
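The three module-level patterns used in the body are not shown; plausible shapes consistent with the doctests (assumptions; the original definitions may differ):

import re
_CSS_IMAGERE = re.compile(
    r'''background(?:-image)?\s*:\s*url\(\s*["']?(.*?)["']?\s*\)''',
    re.IGNORECASE)
_IMAGE_PATH_RE = re.compile(r'[^\s]*\.(?:jpe?g|gif|png)', re.IGNORECASE)
_GENERIC_PATH_RE = re.compile(r'/?(?:[^/\s]+/)*[^/\s]+')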
Example #14
def iterlinks(htmlpage):
    """Iterate through the links in the HtmlPage passed

    For example:
    >>> from scrapely.htmlpage import HtmlPage
    >>> p = HtmlPage(body=u"Please visit <a href='http://scrapinghub.com/'>Scrapinghub</a>")
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/', text=u'Scrapinghub', fragment='', nofollow=False)
    >>> p = HtmlPage(body=u"Go <a href='home.html'>Home</a>")
    >>> iterlinks(p).next()
    Link(url='home.html', text=u'Home', fragment='', nofollow=False)
    
    When a url is specified, absolute urls are made:
    >>> p.url = 'http://scrapinghub.com/'
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/home.html', text=u'Home', fragment='', nofollow=False)

    Base href attributes in the page are respected
    >>> p.body = u"<html><head><base href='myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrapinghub.com/myproject/index.html', text=u'project', fragment='', nofollow=False)
    >>> p.body = u"<html><head><base href='http://scrape.io/myproject/'/></head><body>see my <a href='index.html'>project</a></body>"
    >>> iterlinks(p).next()
    Link(url='http://scrape.io/myproject/index.html', text=u'project', fragment='', nofollow=False)

    Frameset and iframe urls are extracted
    >>> p = HtmlPage(body=u"<html><frameset><frame src=frame1.html><frame src=frame2.html></frameset><iframe src='iframe.html'/></html>")
    >>> [l.url for l in iterlinks(p)]
    ['frame1.html', 'frame2.html', 'iframe.html']
    
    As are meta refresh tags:
    >>> p = HtmlPage(body=u"<html><head><meta http-equiv='refresh' content='5;url=http://example.com/' />")
    >>> iterlinks(p).next().url
    'http://example.com/'
    
    nofollow is set to True if the link has a rel='nofollow' attribute:
    >>> p = HtmlPage(body=u"<a href='somewhere.html' rel='nofollow'>somewhere</a>")
    >>> list(iterlinks(p))
    [Link(url='somewhere.html', text=u'somewhere', fragment='', nofollow=True)]
    
    It does not require well-formed HTML and behaves similarly to many browsers
    >>> p = HtmlPage(body=u"<a href='foo'>foo <a href=bar>bar</a><a href='baz'/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo', text=u'foo ', fragment='', nofollow=False), Link(url='bar', text=u'bar', fragment='', nofollow=False), Link(url='baz', text=u'baz', fragment='', nofollow=False)]

    Leading and trailing whitespace should be removed, including in base href
    >>> p = HtmlPage(body=u"<head><base href=' foo/ '/></head><a href='bar '/>baz")
    >>> list(iterlinks(p))
    [Link(url='foo/bar', text=u'baz', fragment='', nofollow=False)]
    """
    base_href = remove_entities(htmlpage.url, encoding=htmlpage.encoding)

    def mklink(url, anchortext=None, nofollow=False):
        url = url.strip()
        fullurl = urljoin_rfc(base_href, remove_entities(url, encoding=htmlpage.encoding), htmlpage.encoding)
        return Link(fullurl, text=anchortext, nofollow=nofollow)

    # iter to quickly scan only tags
    tag_iter = (t for t in htmlpage.parsed_body if isinstance(t, HtmlTag))

    # parse body
    astart = ahref = None
    nofollow = False
    for nexttag in tag_iter:
        tagname = nexttag.tag
        attributes = nexttag.attributes
        if tagname == "a":
            if astart:
                yield mklink(ahref, htmlpage.body[astart : nexttag.start], nofollow)
                astart = ahref = None
                nofollow = False
            if nexttag.tag_type != HtmlTagType.CLOSE_TAG:
                href = attributes.get("href")
                if href and not href.startswith("#"):
                    ahref = href
                    astart = nexttag.end
                    nofollow = attributes.get("rel") == u"nofollow"
        elif tagname == "head":
            # scan ahead until end of head section
            for nexttag in tag_iter:
                tagname = nexttag.tag
                if (tagname == "head" and nexttag.tag_type == HtmlTagType.CLOSE_TAG) or tagname == "body":
                    break
                if tagname == "base":
                    href = nexttag.attributes.get("href")
                    if href:
                        joined_base = urljoin_rfc(htmlpage.url, href.strip(), htmlpage.encoding)
                        base_href = remove_entities(joined_base, encoding=htmlpage.encoding)
                elif tagname == "meta":
                    attrs = nexttag.attributes
                    if attrs.get("http-equiv") == "refresh":
                        m = _META_REFRESH_CONTENT_RE.search(attrs.get("content", ""))
                        if m:
                            target = m.group("url")
                            if target:
                                yield mklink(target)
        elif tagname == "area":
            href = attributes.get("href")
            if href:
                nofollow = attributes.get("rel") == u"nofollow"
                yield mklink(href, attributes.get("alt", ""), nofollow)
        elif tagname in ("frame", "iframe"):
            target = attributes.get("src")
            if target:
                yield mklink(target)
        elif "onclick" in attributes:
            # FIXME: extract URLs in onclick and add doctests
            pass

    if astart:
        yield mklink(ahref, htmlpage.body[astart:])
Example #15
    def parse_product(self, response):
        soup = BeautifulSoup(response.body)
        if not soup.find('div', attrs={'class': 'product'}):
            retry_request = _retry_page(response)
            if retry_request:
                yield retry_request
            else:
                self.log(
                    "Error parsing page, couldn't extract product name: %s" %
                    response.url)
            return
        main_name = soup.find('div', attrs={'class': 'product'}).h1.text
        main_name = remove_entities(main_name)
        brand_el = soup.find(
            lambda tag: tag.name == 'td' and 'brand' in tag.text.lower())
        brand = brand_el.findNextSibling('td').text.strip() if brand_el else ''
        breadcrumb = soup.find('div', attrs={'class': 'breadcrumbtrail'})
        cat_names = [span.a.text
                     for span in breadcrumb.span.findAll('span')
                     if span.a][2:]
        image_url = soup.find('img', {'itemprop': 'image'})
        image_url = image_url['src'] if image_url else None

        table = soup.find('table', id='responsive-table')
        options = soup.findAll('div', attrs={'class': 'option'})
        if table:
            for row in table.findAll('tr'):
                # Skip head row
                if not row.td:
                    continue

                name = row.find('span', attrs={'class': 'name'}).text
                name = remove_entities(name)
                if not _main_name_in_opt_name(main_name, name):
                    name = main_name + ' ' + name
                identifier = row.find('span', attrs={'class': 'codenumber'})
                if not identifier:
                    self.errors.append(
                        "Identifier not found for products on page: %s" %
                        response.url)
                    continue
                identifier = identifier.text

                price = row.find(_is_price_tag).text
                real_price = extract_price(price)
                if real_price < 15:
                    shipping_cost = 3
                elif real_price < 40:
                    shipping_cost = 4
                elif real_price < 130:
                    shipping_cost = 7
                else:
                    shipping_cost = None

                loader = ProductLoaderWithNameStrip(Product(),
                                                    response=response)
                loader.add_value('name', name)
                loader.add_value('url', response.url)
                loader.add_value('brand', brand)
                loader.add_value('identifier', identifier)
                loader.add_value('sku', identifier)
                loader.add_value('price', price)
                for cat_name in cat_names:
                    loader.add_value('category', cat_name)
                loader.add_value('shipping_cost', shipping_cost)
                loader.add_value('image_url', image_url)

                yield loader.load_item()
        elif options:
            main_id = response.url.split('.')[-2].split('p-')[-1]
            price = soup.find('span', attrs={'class': 'inctax'}).span.text
            real_price = extract_price(price)
            if real_price < 15:
                shipping_cost = 3
            elif real_price < 40:
                shipping_cost = 4
            elif real_price < 130:
                shipping_cost = 7
            else:
                shipping_cost = None

            results = {}
            for opt in options:
                opt_name = opt.label.span.text
                results[opt_name] = []
                for subopt in opt.select.findAll('option'):
                    subopt_name = subopt.text
                    subopt_value = _soup_el_get_attr(subopt, 'value')
                    if subopt_value == '0':
                        continue
                    results[opt_name].append({
                        'id': remove_entities(subopt_name).replace('"', ''),
                        'name': opt_name + ': ' + subopt_name,
                    })
            for opt_tuple in product(*results.values()):
                name = _build_opt_name(main_name, opt_tuple)
                identifier = _build_opt_id(main_id, opt_tuple)
                loader = ProductLoaderWithNameStrip(Product(),
                                                    response=response)
                loader.add_value('name', name)
                loader.add_value('url', response.url)
                loader.add_value('brand', brand)
                loader.add_value('identifier', identifier)
                loader.add_value('sku', identifier)
                loader.add_value('price', price)
                for cat_name in cat_names:
                    loader.add_value('category', cat_name)
                loader.add_value('shipping_cost', shipping_cost)
                loader.add_value('image_url', image_url)

                yield loader.load_item()
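Note that two similar names collide visually in this method: Product() is the item class being loaded, while product(*results.values()) is itertools.product combining the option lists. A minimal sketch of that combination step (hypothetical option data):

from itertools import product

results = {'Size': [{'id': 's'}, {'id': 'm'}], 'Colour': [{'id': 'red'}]}
for opt_tuple in product(*results.values()):
    print [o['id'] for o in opt_tuple]   # two combinations, one per Size option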