示例#1
0
 def _get_title(self, url):
     """Return a title string for ``url``, or ``''`` when none is found.

     The extension taken from ``os.path.splitext(url)`` selects the strategy:

     * extension in ``image_extensions``: wait three seconds, call
       ``google_image.search`` and concatenate every entry of its
       ``'best_keywords'`` result that is not purely digits;
     * extension in ``ignore_extensions``: nothing is fetched and ``''``
       is returned;
     * anything else: fetch the resource with ``web.open_url``, parse it
       with ``BeautifulSoup`` and return the ``<title>`` text after
       ``normalize.htmlentity2unicode`` and ``self._shorten_title``.
     """
     _, suffix = os.path.splitext(url)

     if suffix in image_extensions:
         # Pause first so Google does not treat the lookups as spam.
         time.sleep(3)
         logger.info('Search by google: %s' % url)
         found = google_image.search(url, best_kwds_max_length=18)
         return ''.join(
             word for word in found['best_keywords'] if not word.isdigit())

     if suffix in ignore_extensions:
         return ''

     logger.info('Retrieve web resource: %s' % url)
     page = BeautifulSoup(web.open_url(url))
     # A missing <title> tag or one without a string yields no title.
     if not (page.title and page.title.string):
         return ''

     decoded = normalize.htmlentity2unicode(page.title.string)
     return self._shorten_title(decoded)
示例#2
0
def test_htmlentity2unicode():
    """Check that named HTML entities are converted to Unicode characters."""
    # The inputs must be entity references: passing the already-decoded
    # character would not exercise the conversion at all.
    got = normalize.htmlentity2unicode('&nbsp;')
    assert got == u'\xa0'  # U+00A0 NO-BREAK SPACE
    got = normalize.htmlentity2unicode('&hearts;')
    # U+2665 BLACK HEART SUIT, written as an escape so the source stays ASCII.
    assert got == u'\u2665'
示例#3
0
def test_htmlentity2unicode():
    """Check that named HTML entities are converted to Unicode characters."""
    # The inputs must be entity references: passing the already-decoded
    # character would not exercise the conversion at all.
    got = normalize.htmlentity2unicode('&nbsp;')
    assert_equals(got, u'\xa0')  # U+00A0 NO-BREAK SPACE
    got = normalize.htmlentity2unicode('&hearts;')
    # U+2665 BLACK HEART SUIT, written as an escape so the source stays ASCII.
    assert_equals(got, u'\u2665')