Example #1
File: test_html.py Project: azizur77/w3lib
 def test_returns_unicode(self):
     # make sure it always returns unicode
     assert isinstance(remove_entities(b'no entities'), six.text_type)
     assert isinstance(remove_entities(b'Price: &pound;100!'),
                       six.text_type)
     assert isinstance(remove_entities(u'no entities'), six.text_type)
     assert isinstance(remove_entities(u'Price: &pound;100!'),
                       six.text_type)
Example #2
File: test_html.py Project: azizur77/w3lib
 def test_regular(self):
     # regular conversions
     self.assertEqual(remove_entities(u'As low as &#163;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(remove_entities(b'As low as &pound;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                      u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
Example #3
File: test_html.py Project: azizur77/w3lib
 def test_illegal_entities(self):
     self.assertEqual(
         remove_entities('a &lt; b &illegal; c &#12345678; six',
                         remove_illegal=False),
         u'a < b &illegal; c &#12345678; six')
     self.assertEqual(
         remove_entities('a &lt; b &illegal; c &#12345678; six',
                         remove_illegal=True), u'a < b  c  six')
     self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')
Example #4
File: test_html.py Project: azizur77/w3lib
 def test_keep_entities(self):
     # keep some entities
     self.assertEqual(
         remove_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>',
                         keep=['lt', 'amp']),
         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
     self.assertEqual(
         remove_entities(u'<b>Low &lt; High &amp; Medium &pound; six</b>',
                         keep=[u'lt', u'amp']),
         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
Example #5
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.
    
    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'
    
    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'
    
    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'
    
    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'
    
    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    text = remove_entities(region.text_content, encoding=region.htmlpage.encoding)
    return _WS.sub(u' ', text).strip()
Example #6
def join_url(base_url, url, encoding):
    """
        Remove leading and trailing whitespace and punctuation
        join base url and url
    """
    url = url.decode(encoding)
    return urljoin(base_url, remove_entities(url))
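
A quick illustrative check (URL values invented here): remove_entities turns the escaped &amp; back into a plain & before the join, so entity-escaped hrefs come out as usable URLs.

>>> join_url('http://example.com/shop/', b'item.html?id=1&amp;ref=2', 'utf-8')
u'http://example.com/shop/item.html?id=1&ref=2'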
Example #7
File: extractors.py Project: 4iji/scrapely
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace.
    
    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'
    
    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'
    
    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'
    
    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'
    
    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'
    """
    chunks = _process_markup(region, 
        lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
        lambda tag: u' '
    )
    text = u''.join(chunks)
    return _WS.sub(u' ', text).strip()
Example #8
File: regex.py Project: bihicheng/scrapy
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Example #9
File: regex.py Project: BillWangCS/scrapy
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text]
Example #10
File: extractors.py Project: 4iji/scrapely
def image_url(txt):
    """convert text to a url
    
    this is quite conservative, since relative urls are supported
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
Example #11
File: misc.py Project: tskylee/scrapy
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except (AttributeError, IndexError):
        # no match, or the regex has no 'extract' group
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, unicode):
        return [remove_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [remove_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
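
For illustration (patterns invented here), the first two policies look like this: a named "extract" group wins; otherwise all numbered groups are returned, flattened.

>>> extract_regex(r'Price: (?P<extract>\d+)', u'Price: 100 USD')
[u'100']
>>> extract_regex(r'(\d+)\D+(\d+)', u'10 to 20')
[u'10', u'20']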
Example #12
File: regex.py Project: ilustreous/scrapy
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text))
                        for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Example #13
    def clean_content(self, text):
        """
        Return a string of text cleaned up by tags, entities,
        escape chars, quotes and spaces
        """

        temp = remove_tags_with_content(text,
                                        which_ones=('style', 'script',
                                                    'figcaption'))
        temp = remove_tags(temp)
        temp = remove_entities(temp)
        temp = replace_escape_chars(temp)
        temp = unquote_markup(temp)
        temp = " ".join(temp.split())
        return temp
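
For illustration, assuming cleaner is an instance of the enclosing class, the pipeline strips markup and normalizes entities and whitespace in one pass:

>>> cleaner.clean_content(u'<div><style>p {color: red}</style><p>Price:&nbsp;&pound;100</p></div>')
u'Price: \xa3100'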
Example #14
    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        if base_url is None:
            base_url = urljoin(
                response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))

        links_text = linkre.findall(response_text)
        urlstext = set([clean_url(url).encode('utf-8') for url in links_text])

        return [Link(url, "") for url in urlstext]
Example #15
    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        if base_url is None:
            base_url = urljoin(
                response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([clean_url(url) for url in links_text])

        return [Link(url, "") for url in urlstext]
Example #16
def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment"  content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment"  content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """

    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = _script_re.sub(u'', text)
    text = _noscript_re.sub(u'', text)
    text = html.remove_comments(html.remove_entities(text))
    return _ajax_crawlable_re.search(text) is not None
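
The helper relies on three module-level patterns the snippet does not show. A minimal sketch, modeled on scrapy's ajaxcrawl middleware (the exact patterns in any given version may differ):

import re
import six

# strip <script>/<noscript> tags before looking for the meta fragment tag
_script_re = re.compile(r'</?script.*?>', re.IGNORECASE | re.DOTALL)
_noscript_re = re.compile(r'</?noscript.*?>', re.IGNORECASE | re.DOTALL)
# <meta name="fragment" content="!"> with either quoting style
_ajax_crawlable_re = re.compile(six.u(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'))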
Example #17
File: test_html.py Project: azizur77/w3lib
 def test_browser_hack(self):
     # check browser hack for numeric character references in the 80-9F range
     self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')
Example #18
File: test_html.py Project: azizur77/w3lib
 def test_encoding(self):
     self.assertEqual(remove_entities(b'x\x99&#153;&#8482;y', encoding='cp1252'), \
                      u'x\u2122\u2122\u2122y')
Example #19
from w3lib.html import remove_entities
from urlparse import urljoin


def clean_link(link_text):
    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, remove_entities(clean_link(u.decode(response_encoding)))
)
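
Illustrative usage (values invented here): stray quotes and whitespace are stripped and &amp; is decoded before the join.

>>> clean_url('http://example.com/a/', b' "b.html?x=1&amp;y=2" ', 'utf-8')
u'http://example.com/a/b.html?x=1&y=2'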
Example #20
def deduplication(arg):
    """
        deduplication the arg.

        @param:
            arg:the variable to deduplication

        if arg is list,then deduplication it and then the new list.
        if arg is tuple,then deduplication it and then the new tuple.
    """
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))

    return arg

def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")

clean_url = lambda base_url,u,response_encoding: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
"""
    remove leading and trailing whitespace and punctuation and entities from the given text.
    then join the base_url and the link that extract
"""
Example #21
#-*-coding:utf-8-*-

from urllib import quote
from w3lib.html import remove_entities
from w3lib.url import _safe_chars
from urlparse import urljoin

list_first_item = lambda x:x[0] if x else None

def clean_link(link_text):
    return link_text.strip("\t\r\n '\"")

clean_url = lambda base_url,u,response_encoding: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))

def parse_query_string(query):
    params = query.split("&")
    keyvals = []
    for param in params:
        kv = param.split("=") + [None]
        keyvals.append((kv[0], kv[1]))
    return keyvals


def filter_query(query, remove_re=None, keep_re=None):
    keyvals = parse_query_string(query)
    qargs = []
    for k, v in keyvals:
        if remove_re is not None and remove_re.search(k):
            continue
        if keep_re is None or keep_re.search(k):
            qarg = quote(k, _safe_chars)
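
A quick check of parse_query_string's output shape (illustrative input); note that parameters without a value map to None:

>>> parse_query_string('a=1&b=2&c')
[('a', '1'), ('b', '2'), ('c', None)]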
Example #22
File: link.py Project: lambokini/se-python
import re
from urlparse import urljoin

from w3lib.html import remove_tags, remove_entities, replace_escape_chars
from w3lib.url import safe_url_string

linkre = re.compile(
        "<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>", 
        re.DOTALL | re.IGNORECASE)

def clean_link(link_text):
    """Remove leading and trailing whitespace and punctuation"""
    return link_text.strip("\t\r\n '\"")

base_url="http://www.singeat.com"
clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode('utf-8'))))
url = "/shop/34566#index=9#pageid=14"
print clean_url(url)
print safe_url_string(clean_url(url), 'utf-8')
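
Both prints should emit the same joined URL here, since '#' is among the characters safe_url_string leaves untouched:

http://www.singeat.com/shop/34566#index=9#pageid=14
http://www.singeat.com/shop/34566#index=9#pageid=14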
Example #23
File: test_html.py Project: Dior222/w3lib
    def test_remove_entities(self):
        # make sure it always returns unicode
        assert isinstance(remove_entities('no entities'), unicode)
        assert isinstance(remove_entities('Price: &pound;100!'),  unicode)

        # regular conversions
        self.assertEqual(remove_entities(u'As low as &#163;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('As low as &pound;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
        # keep some entities
        self.assertEqual(remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

        # illegal entities
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                         u'a < b &illegal; c &#12345678; six')
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                         u'a < b  c  six')
        self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')

        # encoding
        self.assertEqual(remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'), \
                         u'x\u2122\u2122\u2122y')
Example #24
 def cleanHtml(html):
     return remove_comments(remove_tags(remove_entities(html))).encode('ascii','ignore')
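
A one-line sanity check (input invented): entities are decoded first, then tags and comments are stripped, and the final ASCII encode drops any remaining non-ASCII characters.

>>> cleanHtml(u'<b>Low &amp; high</b>')
'Low & high'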
Example #25
def good_to_int(arg):
    ret = []
    for i in arg:
        ret.append(int(i[len('\xa0'):]))
    return ret


def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u: urljoin(base_url, remove_entities(clean_link(u)
                                                                  ))
"""
    remove leading and trailing whitespace and punctuation and entities from the given text.
    then join the base_url and the link that extract
"""

prefix = 'http://so.gushiwen.org/authors/authorsw_'
posfix = '.aspx'


def get_author_page(url):
    v = url[len(prefix):-len(posfix)]
    a_pos = v.find('A')
    author = int(v[:a_pos])
    page = int(v[a_pos + 1:])
    return author, page
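
A round-trip check of the URL scheme these helpers assume (author id, then page number after the 'A'):

>>> get_author_page('http://so.gushiwen.org/authors/authorsw_123A4.aspx')
(123, 4)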
Example #26
File: select_result.py Project: beforeWQ/-
def deduplication(arg):
    """
        Deduplicate the argument.

        @param:
            arg: the variable to deduplicate

        If arg is a list, deduplicate it and return a new list.
        If arg is a tuple, deduplicate it and return a new tuple.
    """
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))

    return arg


def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, remove_entities(clean_link(u.decode(response_encoding))))
"""
    remove leading and trailing whitespace and punctuation and entities from the given text.
    then join the base_url and the link that extract
"""
Example #27
File: test_html.py Project: kmike/w3lib
    def test_remove_entities(self):
        # make sure it always returns unicode
        assert isinstance(remove_entities("no entities"), unicode)
        assert isinstance(remove_entities("Price: &pound;100!"), unicode)

        # regular conversions
        self.assertEqual(remove_entities(u"As low as &#163;100!"), u"As low as \xa3100!")
        self.assertEqual(remove_entities("As low as &pound;100!"), u"As low as \xa3100!")
        self.assertEqual(
            remove_entities(
                "redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant"
            ),
            u"redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant",
        )
        # keep some entities
        self.assertEqual(
            remove_entities("<b>Low &lt; High &amp; Medium &pound; six</b>", keep=["lt", "amp"]),
            u"<b>Low &lt; High &amp; Medium \xa3 six</b>",
        )

        # illegal entities
        self.assertEqual(
            remove_entities("a &lt; b &illegal; c &#12345678; six", remove_illegal=False),
            u"a < b &illegal; c &#12345678; six",
        )
        self.assertEqual(remove_entities("a &lt; b &illegal; c &#12345678; six", remove_illegal=True), u"a < b  c  six")
        self.assertEqual(remove_entities("x&#x2264;y"), u"x\u2264y")

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities("x&#153;y", encoding="cp1252"), u"x\u2122y")

        # encoding
        self.assertEqual(remove_entities("x\x99&#153;&#8482;y", encoding="cp1252"), u"x\u2122\u2122\u2122y")