Python remove_entities примеры, w3lib.html.remove_entities Python примеры использования

Пример #1

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_returns_unicode(self):
     # make sure it always return uncode
     assert isinstance(remove_entities(b'no entities'), six.text_type)
     assert isinstance(remove_entities(b'Price: &pound;100!'),
                       six.text_type)
     assert isinstance(remove_entities(u'no entities'), six.text_type)
     assert isinstance(remove_entities(u'Price: &pound;100!'),
                       six.text_type)

Пример #2

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_regular(self):
     # regular conversions
     self.assertEqual(remove_entities(u'As low as &#163;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(remove_entities(b'As low as &pound;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                      u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')

Пример #3

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_illegal_entities(self):
     self.assertEqual(
         remove_entities('a &lt; b &illegal; c &#12345678; six',
                         remove_illegal=False),
         u'a < b &illegal; c &#12345678; six')
     self.assertEqual(
         remove_entities('a &lt; b &illegal; c &#12345678; six',
                         remove_illegal=True), u'a < b  c  six')
     self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

Пример #4

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_keep_entities(self):
     # keep some entities
     self.assertEqual(
         remove_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>',
                         keep=['lt', 'amp']),
         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
     self.assertEqual(
         remove_entities(u'<b>Low &lt; High &amp; Medium &pound; six</b>',
                         keep=[u'lt', u'amp']),
         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

Пример #5

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_regular(self):
     # regular conversions
     self.assertEqual(remove_entities(u'As low as &#163;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(remove_entities(b'As low as &pound;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(
         remove_entities(
             'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'
         ),
         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant'
     )

Пример #6

0

Показать файл

Файл: extractors.py Проект: wmelton/scrapely

def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace,
    
    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'
    
    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'
    
    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'
    
    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'
    
    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'
    """
    chunks = _process_markup(
        region,
        lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
        lambda tag: u' ')
    text = u''.join(chunks)
    return _WS.sub(u' ', text).strip()

Пример #7

0

Показать файл

Файл: extractors.py Проект: 249550148/scrapely

def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace,
    
    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'
    
    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'
    
    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'
    
    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'
    
    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    text = remove_entities(region.text_content, encoding=region.htmlpage.encoding)
    return _WS.sub(u' ', text).strip()

Пример #8

0

Показать файл

Файл: common.py Проект: bingyangli/webcrawler

def join_url(base_url, url, encoding):
    """
        Remove leading and trailing whitespace and punctuation
        join base url and url
    """
    url = url.decode(encoding)
    return urljoin(base_url, remove_entities(url))

Пример #9

0

Показать файл

Файл: extractors.py Проект: 4iji/scrapely

def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace,
    
    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'
    
    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'
    
    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'
    
    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'
    
    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'
    """
    chunks = _process_markup(region, 
        lambda text: remove_entities(text, encoding=region.htmlpage.encoding),
        lambda tag: u' '
    )
    text = u''.join(chunks)
    return _WS.sub(u' ', text).strip()

Пример #10

0

Показать файл

Файл: extractors.py Проект: bopopescu/vinalo

def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace,

    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    text = remove_entities(region.text_content,
                           encoding=region.htmlpage.encoding)
    return _WS.sub(u' ', text).strip()

Пример #11

0

Показать файл

Файл: regex.py Проект: bihicheng/scrapy

    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]

Пример #12

0

Показать файл

Файл: regex.py Проект: BillWangCS/scrapy

    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text]

Пример #13

0

Показать файл

Файл: extractors.py Проект: 4iji/scrapely

def image_url(txt):
    """convert text to a url
    
    this is quite conservative, since relative urls are supported
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None

Пример #14

0

Показать файл

Файл: extractors.py Проект: bopopescu/vinalo

def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None

Пример #15

0

Показать файл

Файл: misc.py Проект: tskylee/scrapy

def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except:
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, unicode):
        return [remove_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [remove_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]

Пример #16

0

Показать файл

Файл: regex.py Проект: ilustreous/scrapy

    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text))
                        for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]

Пример #17

0

Показать файл

    def clean_content(self, text):
        """
        Return a string of text cleaned up by tags, entities,
        escape chars, quotes and spaces
        """

        temp = remove_tags_with_content(text,
                                        which_ones=('style', 'script',
                                                    'figcaption'))
        temp = remove_tags(temp)
        temp = remove_entities(temp)
        temp = replace_escape_chars(temp)
        temp = unquote_markup(temp)
        temp = " ".join(temp.split())
        return temp

Пример #18

0

Показать файл

Файл: linkextractor.py Проект: uelramon/hyphe

    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        if base_url is None:
            base_url = urljoin(
                response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))

        links_text = linkre.findall(response_text)
        urlstext = set([clean_url(url).encode('utf-8') for url in links_text])

        return [Link(url, "") for url in urlstext]

Пример #19

0

Показать файл

Файл: linkextractor.py Проект: noscripter/hyphe

    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        if base_url is None:
            base_url = urljoin(
                response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([clean_url(url) for url in links_text])

        return [Link(url, "") for url in urlstext]

Пример #20

0

Показать файл

Файл: ajaxcrawl.py Проект: xtmhm2000/scrapy-0.22

def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment"  content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment"  content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """

    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = _script_re.sub(u'', text)
    text = _noscript_re.sub(u'', text)
    text = html.remove_comments(html.remove_entities(text))
    return _ajax_crawlable_re.search(text) is not None

Пример #21

0

Показать файл

Файл: ajaxcrawl.py Проект: pyarnold/scrapy

def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment"  content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment"  content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """

    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = _script_re.sub(u'', text)
    text = _noscript_re.sub(u'', text)
    text = html.remove_comments(html.remove_entities(text))
    return _ajax_crawlable_re.search(text) is not None

Пример #22

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_illegal_entities(self):
     self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                      u'a < b &illegal; c &#12345678; six')
     self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                      u'a < b  c  six')
     self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

Пример #23

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_browser_hack(self):
     # check browser hack for numeric character references in the 80-9F range
     self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')

Пример #24

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_encoding(self):
     self.assertEqual(remove_entities(b'x\x99&#153;&#8482;y', encoding='cp1252'), \
                      u'x\u2122\u2122\u2122y')

Пример #25

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_returns_unicode(self):
     # make sure it always return uncode
     assert isinstance(remove_entities(b'no entities'), six.text_type)
     assert isinstance(remove_entities(b'Price: &pound;100!'),  six.text_type)
     assert isinstance(remove_entities(u'no entities'), six.text_type)
     assert isinstance(remove_entities(u'Price: &pound;100!'),  six.text_type)

Пример #26

0

Показать файл

Файл: test_html.py Проект: nasirsphi/w3lib

    def test_remove_entities(self):
        # make sure it always return uncode
        assert isinstance(remove_entities('no entities'), unicode)
        assert isinstance(remove_entities('Price: &pound;100!'), unicode)

        # regular conversions
        self.assertEqual(remove_entities(u'As low as &#163;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('As low as &pound;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(
            remove_entities(
                'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'
            ),
            u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant'
        )
        # keep some entities
        self.assertEqual(
            remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>',
                            keep=['lt', 'amp']),
            u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

        # illegal entities
        self.assertEqual(
            remove_entities('a &lt; b &illegal; c &#12345678; six',
                            remove_illegal=False),
            u'a < b &illegal; c &#12345678; six')
        self.assertEqual(
            remove_entities('a &lt; b &illegal; c &#12345678; six',
                            remove_illegal=True), u'a < b  c  six')
        self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'),
                         u'x\u2122y')

        # encoding
        self.assertEqual(remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'), \
                         u'x\u2122\u2122\u2122y')

Пример #27

0

Показать файл

Файл: select_result.py Проект: youtaya/Line2u

from w3lib.html import remove_entities
from urlparse import urljoin


def clean_link(link_text):
    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, remove_entities(clean_link(u.decode(response_encoding)))
)

Пример #28

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_encoding(self):
     self.assertEqual(remove_entities(b'x\x99&#153;&#8482;y', encoding='cp1252'), \
                      u'x\u2122\u2122\u2122y')

Пример #29

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_browser_hack(self):
     # check browser hack for numeric character references in the 80-9F range
     self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'),
                      u'x\u2122y')

Пример #30

0

Показать файл

Файл: test_html.py Проект: azizur77/w3lib

 def test_keep_entities(self):
     # keep some entities
     self.assertEqual(remove_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                      u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
     self.assertEqual(remove_entities(u'<b>Low &lt; High &amp; Medium &pound; six</b>', keep=[u'lt', u'amp']),
                      u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

Пример #31

0

Показать файл

Файл: select_result.py Проект: LaveyD/spider

def deduplication(arg):
    """
        deduplication the arg.

        @param:
            arg:the variable to deduplication

        if arg is list,then deduplication it and then the new list.
        if arg is tuple,then deduplication it and then the new tuple.
    """
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))

    return arg

def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")

clean_url = lambda base_url,u,response_encoding: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))
"""
    remove leading and trailing whitespace and punctuation and entities from the given text.
    then join the base_url and the link that extract
"""

Пример #32

0

Показать файл

Файл: utils.py Проект: mezhou887/scrapysystem

#-*-coding:utf-8-*-

from urllib import quote
from w3lib.html import remove_entities
from w3lib.url import _safe_chars
from urlparse import urljoin

list_first_item = lambda x:x[0] if x else None

def clean_link(link_text):
    return link_text.strip("\t\r\n '\"")

clean_url = lambda base_url,u,response_encoding: urljoin(base_url, remove_entities(clean_link(u.decode(response_encoding))))

def parse_query_string(query):
    params = query.split("&")
    keyvals = []
    for param in params:
        kv = param.split("=") + [None]
        keyvals.append((kv[0], kv[1]))
    return keyvals


def filter_query(query, remove_re=None, keep_re=None):
    keyvals = parse_query_string(query)
    qargs = []
    for k, v in keyvals:
        if remove_re is not None and remove_re.search(k):
            continue
        if keep_re is None or keep_re.search(k):
            qarg = quote(k, _safe_chars)

Пример #33

0

Показать файл

Файл: link.py Проект: lambokini/se-python

import re
from urlparse import urljoin

from w3lib.html import remove_tags, remove_entities, replace_escape_chars
from w3lib.url import safe_url_string

linkre = re.compile(
        "<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>", 
        re.DOTALL | re.IGNORECASE)

def clean_link(link_text):
    """Remove leading and trailing whitespace and punctuation"""
    return link_text.strip("\t\r\n '\"")

base_url="http://www.singeat.com"
clean_url = lambda u: urljoin(base_url, remove_entities(clean_link(u.decode('utf-8'))))
url = "/shop/34566#index=9#pageid=14"
print clean_url(url)
print safe_url_string(clean_url(url), 'utf-8')

Пример #34

0

Показать файл

Файл: test_html.py Проект: Dior222/w3lib

    def test_remove_entities(self):
        # make sure it always return uncode
        assert isinstance(remove_entities('no entities'), unicode)
        assert isinstance(remove_entities('Price: &pound;100!'),  unicode)

        # regular conversions
        self.assertEqual(remove_entities(u'As low as &#163;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('As low as &pound;100!'),
                         u'As low as \xa3100!')
        self.assertEqual(remove_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
        # keep some entities
        self.assertEqual(remove_entities('<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                         u'<b>Low &lt; High &amp; Medium \xa3 six</b>')

        # illegal entities
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                         u'a < b &illegal; c &#12345678; six')
        self.assertEqual(remove_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                         u'a < b  c  six')
        self.assertEqual(remove_entities('x&#x2264;y'), u'x\u2264y')

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')

        # encoding
        self.assertEqual(remove_entities('x\x99&#153;&#8482;y', encoding='cp1252'), \
                         u'x\u2122\u2122\u2122y')

Пример #35

0

Показать файл

Файл: pipelines.py Проект: Kasperek/clobot

 def cleanHtml(html):
     return remove_comments(remove_tags(remove_entities(html))).encode('ascii','ignore')

Пример #36

0

Показать файл

Файл: result_parse.py Проект: sw5cc/gushiwenorg-spider

def good_to_int(arg):
    ret = []
    for i in arg:
        ret.append(int(i[len('\xa0'):]))
    return ret


def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u: urljoin(base_url, remove_entities(clean_link(u)
                                                                  ))
"""
    remove leading and trailing whitespace and punctuation and entities from the given text.
    then join the base_url and the link that extract
"""

prefix = 'http://so.gushiwen.org/authors/authorsw_'
posfix = '.aspx'


def get_author_page(url):
    v = url[len(prefix):-len(posfix)]
    a_pos = v.find('A')
    author = int(v[:a_pos])
    page = int(v[a_pos + 1:])
    return author, page

Пример #37

0

Показать файл

Файл: select_result.py Проект: beforeWQ/-

        deduplication the arg.

        @param:
            arg:the variable to deduplication

        if arg is list,then deduplication it and then the new list.
        if arg is tuple,then deduplication it and then the new tuple.
    """
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))

    return arg


def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, remove_entities(clean_link(u.decode(response_encoding))))
"""
    remove leading and trailing whitespace and punctuation and entities from the given text.
    then join the base_url and the link that extract
"""

Пример #38

0

Показать файл

Файл: test_html.py Проект: kmike/w3lib

    def test_remove_entities(self):
        # make sure it always return uncode
        assert isinstance(remove_entities("no entities"), unicode)
        assert isinstance(remove_entities("Price: &pound;100!"), unicode)

        # regular conversions
        self.assertEqual(remove_entities(u"As low as &#163;100!"), u"As low as \xa3100!")
        self.assertEqual(remove_entities("As low as &pound;100!"), u"As low as \xa3100!")
        self.assertEqual(
            remove_entities(
                "redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant"
            ),
            u"redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant",
        )
        # keep some entities
        self.assertEqual(
            remove_entities("<b>Low &lt; High &amp; Medium &pound; six</b>", keep=["lt", "amp"]),
            u"<b>Low &lt; High &amp; Medium \xa3 six</b>",
        )

        # illegal entities
        self.assertEqual(
            remove_entities("a &lt; b &illegal; c &#12345678; six", remove_illegal=False),
            u"a < b &illegal; c &#12345678; six",
        )
        self.assertEqual(remove_entities("a &lt; b &illegal; c &#12345678; six", remove_illegal=True), u"a < b  c  six")
        self.assertEqual(remove_entities("x&#x2264;y"), u"x\u2264y")

        # check browser hack for numeric character references in the 80-9F range
        self.assertEqual(remove_entities("x&#153;y", encoding="cp1252"), u"x\u2122y")

        # encoding
        self.assertEqual(remove_entities("x\x99&#153;&#8482;y", encoding="cp1252"), u"x\u2122\u2122\u2122y")

Python remove_entities примеры использования