Example #1
 def test_returns_unicode(self):
     # make sure it always returns unicode
     assert isinstance(remove_entities(b'no entities'), six.text_type)
     assert isinstance(remove_entities(b'Price: &pound;100!'),
                       six.text_type)
     assert isinstance(remove_entities(u'no entities'), six.text_type)
     assert isinstance(remove_entities(u'Price: &pound;100!'),
                       six.text_type)
Example #2
 def test_regular(self):
     # regular conversions
     self.assertEqual(remove_entities(u'As low as &#163;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(remove_entities(b'As low as &pound;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                      u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
Example #3
 def test_illegal_entities(self):
     self.assertEqual(
         remove_entities('a &lt; b &illegal; c &#12345678; six',
                         remove_illegal=False),
         u'a < b &illegal; c &#12345678; six')
     self.assertEqual(
         remove_entities('a &lt; b &illegal; c &#12345678; six',
                         remove_illegal=True), u'a < b  c  six')
     self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')
Example #4
 def test_keep_entities(self):
     # keep some entities
     self.assertEqual(
         remove_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>',
                         keep=['lt', 'amp']),
         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
     self.assertEqual(
         remove_entities(u'<b>Low &lt; High &amp; Medium &pound; six</b>',
                         keep=[u'lt', u'amp']),
         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
Example #5
 def test_regular(self):
     # regular conversions
     self.assertEqual(remove_entities(u'As low as &#163;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(remove_entities(b'As low as &pound;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(
         remove_entities(
             'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'
         ),
         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant'
     )
Example #6
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.
    
    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'
    
    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'
    
    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'
    
    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'
    
    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'
    """
    chunks = _process_markup(
        region,
        lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
        lambda tag: u' ')
    text = u''.join(chunks)
    return _WS.sub(u' ', text).strip()
Example #7
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.
    
    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'
    
    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'
    
    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'
    
    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'
    
    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    text = remove_entities(region.text_content, encoding=region.htmlpage.encoding)
    return _WS.sub(u' ', text).strip()
Example #8
def join_url(base_url, url, encoding):
    """
        Decode url with the given encoding, remove HTML entities,
        and join it with base_url.
    """
    url = url.decode(encoding)
    return urljoin(base_url, remove_entities(url))
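A quick usage sketch for join_url, assuming the function above is in scope and a pre-1.13 w3lib (remove_entities was later renamed replace_entities); example.com is a placeholder:

# &amp; in the raw bytes becomes a literal & before joining
result = join_url(u'http://example.com/a/', b'b.html?x=1&amp;y=2', 'utf-8')
assert result == u'http://example.com/a/b.html?x=1&y=2'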
Example #9
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.
    
    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'
    
    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'
    
    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'
    
    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'
    
    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'
    """
    chunks = _process_markup(region, 
        lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
        lambda tag: u' '
    )
    text = u''.join(chunks)
    return _WS.sub(u' ', text).strip()
Example #10
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.

    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    text = remove_entities(region.text_content,
                           encoding=region.htmlpage.encoding)
    return _WS.sub(u' ', text).strip()
Example #11
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
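A minimal sketch of what the two cleaning lambdas do to one regex match; the raw values are made up, the strip set comes from the clean_link helper shown in later examples, and an old w3lib that still exports remove_entities is assumed:

from w3lib.html import remove_entities, remove_tags, replace_escape_chars

raw_url, raw_text = b" 'page.html?a=1&amp;b=2' ", b'<b>Next\n</b>'
# clean_url: strip quotes/whitespace, decode entities (then urljoin)
assert remove_entities(raw_url.decode('utf-8').strip("\t\r\n '\"")) == \
    u'page.html?a=1&b=2'
# clean_text: drop tags, remove escape chars, trim
assert replace_escape_chars(remove_tags(raw_text.decode('utf-8'))).strip() == \
    u'Next'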
Example #12
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text]
Example #13
def image_url(txt):
    """convert text to a url
    
    this is quite conservative, since relative urls are supported
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
Example #14
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract", that group is returned
    * if the regex contains multiple numbered groups, all of them are returned (flattened)
    * if the regex doesn't contain any group, the entire match is returned
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except:
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, unicode):
        return [remove_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [remove_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
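A usage sketch for extract_regex, assuming the Python 2 helpers it relies on (basestring, unicode and a flatten() utility) are importable as in the module above:

# a named "extract" group takes priority over everything else
assert extract_regex(r'Price: (?P<extract>\d+)', u'Price: 42 EUR') == [u'42']
# with no groups at all, every full match is returned
assert extract_regex(r'\d+', u'10 and 20') == [u'10', u'20']
# entities are decoded, but &lt; and &amp; are kept
assert extract_regex(r'.+', u'a &lt; b &pound; c') == [u'a &lt; b \xa3 c']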
Example #15
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text))
                        for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Example #16
    def clean_content(self, text):
        """
        Return the text cleaned of tags, entities, escape chars,
        CDATA markup and redundant whitespace.
        """

        temp = remove_tags_with_content(text,
                                        which_ones=('style', 'script',
                                                    'figcaption'))
        temp = remove_tags(temp)
        temp = remove_entities(temp)
        temp = replace_escape_chars(temp)
        temp = unquote_markup(temp)
        temp = " ".join(temp.split())
        return temp
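The same pipeline written out on a sample input, as a rough sketch (an old w3lib that still exports remove_entities is assumed):

from w3lib.html import (remove_tags_with_content, remove_tags,
                        remove_entities, replace_escape_chars,
                        unquote_markup)

html = u'<p>Low &amp; wide,\n <script>x()</script>"quoted"</p>'
temp = remove_tags_with_content(html, which_ones=('style', 'script',
                                                  'figcaption'))
temp = remove_tags(temp)           # u'Low &amp; wide,\n "quoted"'
temp = remove_entities(temp)       # u'Low & wide,\n "quoted"'
temp = replace_escape_chars(temp)  # the \n is dropped
temp = unquote_markup(temp)        # no CDATA here, unchanged
assert " ".join(temp.split()) == u'Low & wide, "quoted"'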
Example #17
    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        if base_url is None:
            base_url = urljoin(
                response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))

        links_text = linkre.findall(response_text)
        urlstext = set([clean_url(url).encode('utf-8') for url in links_text])

        return [Link(url, "") for url in urlstext]
Example #18
    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        if base_url is None:
            base_url = urljoin(
                response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([clean_url(url) for url in links_text])

        return [Link(url, "") for url in urlstext]
Example #19
def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment"  content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment"  content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """

    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = _script_re.sub(u'', text)
    text = _noscript_re.sub(u'', text)
    text = html.remove_comments(html.remove_entities(text))
    return _ajax_crawlable_re.search(text) is not None
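The helper depends on three module-level patterns that the snippet does not show; these are plausible reconstructions (assumptions, not the verbatim originals) that satisfy the doctests above:

import re

_script_re = re.compile(r'<script.*?</script>', re.I | re.S)
_noscript_re = re.compile(r'<noscript.*?</noscript>', re.I | re.S)
# AJAX-crawlable pages declare <meta name="fragment" content="!">
_ajax_crawlable_re = re.compile(
    r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']', re.I)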
Example #20
 def test_illegal_entities(self):
     self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                      u'a < b &illegal; c &#12345678; six')
     self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                      u'a < b  c  six')
     self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')
Example #21
 def test_browser_hack(self):
     # check browser hack for numeric character references in the 80-9F range
     self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')
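The hack in one line: &#153; is formally illegal (code points 128-159 are control characters), but browsers traditionally read such references through Windows-1252, where byte 0x99 is the trade mark sign:

assert b'\x99'.decode('cp1252') == u'\u2122'  # TRADE MARK SIGN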
Example #22
 def test_encoding(self):
     self.assertEqual(remove_entities(b'x\x99&#153;&#8482;y', encoding='cp1252'), \
                      u'x\u2122\u2122\u2122y')
Example #23
 def test_returns_unicode(self):
     # make sure it always returns unicode
     assert isinstance(remove_entities(b'no entities'), six.text_type)
     assert isinstance(remove_entities(b'Price: &pound;100!'),  six.text_type)
     assert isinstance(remove_entities(u'no entities'), six.text_type)
     assert isinstance(remove_entities(u'Price: &pound;100!'),  six.text_type)
Example #24
    def test_remove_entities(self):
        # make sure it always returns unicode
        assert isinstance(remove_entities('no entities'), unicode)
        assert isinstance(remove_entities('Price: &pound;100!'), unicode)

        # regular conversions
        self.assertEqual(remove_entities(u'As low as &#163;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('As low as &pound;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(
            remove_entities(
                'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'
            ),
            u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant'
        )
        # keep some entities
        self.assertEqual(
            remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>',
                            keep=['lt', 'amp']),
            u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

        # illegal entities
        self.assertEqual(
            remove_entities('a &lt; b &illegal; c &#12345678; six',
                            remove_illegal=False),
            u'a < b &illegal; c &#12345678; six')
        self.assertEqual(
            remove_entities('a &lt; b &illegal; c &#12345678; six',
                            remove_illegal=True), u'a < b  c  six')
        self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'),
                         u'x\u2122y')

        # encoding
        self.assertEqual(remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'), \
                         u'x\u2122\u2122\u2122y')
Example #25
from w3lib.html import remove_entities
from urlparse import urljoin


def clean_link(link_text):
    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, remove_entities(clean_link(u.decode(response_encoding)))
)
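A usage sketch for the lambda above (example.com is a placeholder):

assert clean_url(u'http://example.com/shop/',
                 b" 'item.html?a=1&amp;b=2' ",
                 'utf-8') == u'http://example.com/shop/item.html?a=1&b=2'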
Example #26
 def test_browser_hack(self):
     # check browser hack for numeric character references in the 80-9F range
     self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'),
                      u'x\u2122y')
Example #27
 def test_keep_entities(self):
     # keep some entities
     self.assertEqual(remove_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                      u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
     self.assertEqual(remove_entities(u'<b>Low &lt; High &amp; Medium &pound; six</b>', keep=[u'lt', u'amp']),
                      u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
Example #28
def deduplication(arg):
    """
        Deduplicate arg.

        @param:
            arg: the variable to deduplicate

        If arg is a list, return a new deduplicated list.
        If arg is a tuple, return a new deduplicated tuple.
    """
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))

    return arg

def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")

clean_url = lambda base_url,u,response_encoding: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
"""
    Remove leading and trailing whitespace, punctuation and entities
    from the given text, then join base_url with the extracted link.
"""
Example #29
#-*-coding:utf-8-*-

from urllib import quote
from w3lib.html import remove_entities
from w3lib.url import _safe_chars
from urlparse import urljoin

list_first_item = lambda x:x[0] if x else None

def clean_link(link_text):
    return link_text.strip("\t\r\n '\"")

clean_url = lambda base_url,u,response_encoding: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))

def parse_query_string(query):
    params = query.split("&")
    keyvals = []
    for param in params:
        kv = param.split("=") + [None]
        keyvals.append((kv[0], kv[1]))
    return keyvals


def filter_query(query, remove_re=None, keep_re=None):
    keyvals = parse_query_string(query)
    qargs = []
    for k, v in keyvals:
        if remove_re is not None and remove_re.search(k):
            continue
        if keep_re is None or keep_re.search(k):
            qarg = quote(k, _safe_chars)
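The filter_query body is truncated in the source; parse_query_string itself is complete, and a quick check shows how it pads missing values with None:

assert parse_query_string('a=1&b=&c') == [('a', '1'), ('b', ''), ('c', None)]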
Example #30
import re
from urlparse import urljoin

from w3lib.html import remove_tags, remove_entities, replace_escape_chars
from w3lib.url import safe_url_string

linkre = re.compile(
        "<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>", 
        re.DOTALL | re.IGNORECASE)

def clean_link(link_text):
    """Remove leading and trailing whitespace and punctuation"""
    return link_text.strip("\t\r\n '\"")

base_url="http://www.singeat.com"
clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode('utf-8'))))
url = "/shop/34566#index=9#pageid=14"
print clean_url(url)
print safe_url_string(clean_url(url), 'utf-8')
Example #31
    def test_remove_entities(self):
        # make sure it always returns unicode
        assert isinstance(remove_entities('no entities'), unicode)
        assert isinstance(remove_entities('Price: &pound;100!'),  unicode)

        # regular conversions
        self.assertEqual(remove_entities(u'As low as &#163;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('As low as &pound;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
        # keep some entities
        self.assertEqual(remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

        # illegal entities
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                         u'a < b &illegal; c &#12345678; six')
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                         u'a < b  c  six')
        self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')

        # encoding
        self.assertEqual(remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'), \
                         u'x\u2122\u2122\u2122y')
Example #32
 def cleanHtml(html):
     return remove_comments(remove_tags(remove_entities(html))).encode('ascii','ignore')
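A usage sketch (old w3lib assumed); the final encode('ascii', 'ignore') silently drops any non-ASCII that survives entity removal:

assert cleanHtml(u'<p>caf\xe9 &amp; bar <!-- ad --></p>') == 'caf & bar '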
Example #33
def good_to_int(arg):
    ret = []
    for i in arg:
        # drop the leading non-breaking space ('\xa0') before parsing the number
        ret.append(int(i[len('\xa0'):]))
    return ret


def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u: urljoin(base_url, remove_entities(clean_link(u)
                                                                  ))
"""
    Remove leading and trailing whitespace, punctuation and entities
    from the given text, then join base_url with the extracted link.
"""

prefix = 'http://so.gushiwen.org/authors/authorsw_'
posfix = '.aspx'


def get_author_page(url):
    v = url[len(prefix):-len(posfix)]
    a_pos = v.find('A')
    author = int(v[:a_pos])
    page = int(v[a_pos + 1:])
    return author, page
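A round-trip check for the URL layout the function assumes:

assert get_author_page('http://so.gushiwen.org/authors/authorsw_12A3.aspx') == (12, 3)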
Example #34
def deduplication(arg):
    """
        Deduplicate arg.

        @param:
            arg: the variable to deduplicate

        If arg is a list, return a new deduplicated list.
        If arg is a tuple, return a new deduplicated tuple.
    """
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))

    return arg


def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, remove_entities(clean_link(u.decode(response_encoding))))
"""
    Remove leading and trailing whitespace, punctuation and entities
    from the given text, then join base_url with the extracted link.
"""
Example #35
    def test_remove_entities(self):
        # make sure it always returns unicode
        assert isinstance(remove_entities("no entities"), unicode)
        assert isinstance(remove_entities("Price: &pound;100!"), unicode)

        # regular conversions
        self.assertEqual(remove_entities(u"As low as &#163;100!"), u"As low as \xa3100!")
        self.assertEqual(remove_entities("As low as &pound;100!"), u"As low as \xa3100!")
        self.assertEqual(
            remove_entities(
                "redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant"
            ),
            u"redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant",
        )
        # keep some entities
        self.assertEqual(
            remove_entities("<b>Low &lt; High &amp; Medium &pound; six</b>", keep=["lt", "amp"]),
            u"<b>Low &lt; High &amp; Medium \xa3 six</b>",
        )

        # illegal entities
        self.assertEqual(
            remove_entities("a &lt; b &illegal; c &#12345678; six", remove_illegal=False),
            u"a < b &illegal; c &#12345678; six",
        )
        self.assertEqual(remove_entities("a &lt; b &illegal; c &#12345678; six", remove_illegal=True), u"a < b  c  six")
        self.assertEqual(remove_entities("x&#x2264;y"), u"x\u2264y")

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities("x&#153;y", encoding="cp1252"), u"x\u2122y")

        # encoding
        self.assertEqual(remove_entities("x\x99&#153;&#8482;y", encoding="cp1252"), u"x\u2122\u2122\u2122y")