def _get_title(self, url):
    """Return a title for ``url``, or ``''`` when none can be determined.

    The branch taken depends on the extension ``os.path.splitext`` reports
    for ``url``:

    * extension in ``image_extensions``: the URL is passed to
      ``google_image.search`` and the entries of ``results['best_keywords']``
      that are not purely digits are concatenated into the title;
    * extension not in ``ignore_extensions``: the resource is fetched with
      ``web.open_url``, parsed with ``BeautifulSoup``, and the text of its
      ``<title>`` element (if any) is run through
      ``normalize.htmlentity2unicode``;
    * otherwise the title stays empty.

    The title is finally passed through ``self._shorten_title``.
    """
    title = ''
    ext = os.path.splitext(url)[1]
    if ext in image_extensions:
        # Throttle so Google does not treat the requests as spam.
        time.sleep(3)
        logger.info('Search by google: %s' % url)
        results = google_image.search(url, best_kwds_max_length=18)
        # Drop keywords that consist only of digits before joining.
        title = ''.join(
            keyword
            for keyword in results['best_keywords']
            if not keyword.isdigit()
        )
    elif ext not in ignore_extensions:
        logger.info('Retrieve web resource: %s' % url)
        html = web.open_url(url)
        soup = BeautifulSoup(html)
        if soup.title and soup.title.string:
            title = normalize.htmlentity2unicode(soup.title.string)
    # NOTE(review): the original indentation was lost; shortening is assumed
    # to apply to every branch -- confirm against the upstream file.
    title = self._shorten_title(title)
    return title
def test_htmlentity2unicode():
    """Check that ``normalize.htmlentity2unicode`` decodes named entities."""
    # Inputs must be entity references: passing already-decoded characters
    # would not exercise the decoder at all.
    got = normalize.htmlentity2unicode('&nbsp;')
    assert got == u'\xa0'  # U+00A0 NO-BREAK SPACE

    got = normalize.htmlentity2unicode('&hearts;')
    assert got == u'\u2665'  # U+2665 BLACK HEART SUIT
def test_htmlentity2unicode():
    """Check that ``normalize.htmlentity2unicode`` decodes named entities."""
    # Inputs must be entity references: passing already-decoded characters
    # would not exercise the decoder at all.
    got = normalize.htmlentity2unicode('&nbsp;')
    assert_equals(got, u'\xa0')  # U+00A0 NO-BREAK SPACE

    got = normalize.htmlentity2unicode('&hearts;')
    assert_equals(got, u'\u2665')  # U+2665 BLACK HEART SUIT