def __init__(self, url):
    """Fetch *url*, parse it, and collect its rel="me" links.

    On an InvalidURLError the constructor returns early, leaving the
    instance without html/soup/me_links populated.
    """
    self.url = url
    try:
        response = urlfetch.fetch(url)
    except urlfetch_errors.InvalidURLError:
        # Bad URL: leave the instance in its default (empty) state.
        return
    self.html = response.content
    self.soup = BeautifulSoup(self.html)
    self.populate_me_links()
def extractLogo(self, html):
    """Extract a site logo from an HTML page.

    Looks first for an ``<img id="logo">``; failing that, for a
    ``<div id="logo">`` whose inline CSS holds a ``url(...)`` value.

    Args:
        html: raw HTML of the page.

    Returns:
        ``[pic, description]`` where ``pic`` is the logo URL and
        ``description`` is the img's title attribute ('' when absent),
        or ``False`` when no logo element / URL could be found.
    """
    soup = BeautifulSoup(html)
    img = soup.find('img', id='logo')
    div = soup.find('div', id='logo')
    description = ''
    if img:
        pic = img['src']
        # BUG FIX: `'title' in img` tested the tag's *contents*, not its
        # attributes, so the title was never picked up. Read the
        # attribute via .get() instead.
        if img.get('title'):
            description = img['title']
    elif div:
        # Pull the URL out of a CSS declaration like: background: url(...)
        match = re.search(r'(?<=url\()\S+(?=\))', str(div))
        if match is None:
            # BUG FIX: previously .group(0) was called unconditionally,
            # raising AttributeError when the DIV had no url(...) value.
            msg = 'Cronjob: Logos: could not find IMG or DIV tag with logo id!'
            logging.error(msg)
            print(msg)
            return False
        pic = match.group(0)
    else:
        msg = 'Cronjob: Logos: could not find IMG or DIV tag with logo id!'
        logging.error(msg)
        print(msg)  # print() form is valid under both Python 2 and 3
        return False
    return [pic, description]
class Page:
    """A fetched web page: raw HTML, parsed soup, and its rel="me" links."""

    # Class-level defaults, kept for backward compatibility with any code
    # that reads them off the class. Real values are (re)bound per
    # instance in __init__ so instances never share mutable state.
    url = ""
    html = ""
    soup = None
    hcards = []
    me_links = []

    def __init__(self, url):
        """Fetch *url* and populate html, soup and me_links.

        On an InvalidURLError the constructor returns early; the
        instance then has html == "", soup is None and empty link lists.
        """
        self.url = url
        # BUG FIX: bind fresh per-instance lists. The class-level []
        # defaults would otherwise be shared (and mutated) across every
        # Page instance.
        self.hcards = []
        self.me_links = []
        try:
            self.html = urlfetch.fetch(url).content
        except urlfetch_errors.InvalidURLError:
            return
        self.soup = BeautifulSoup(self.html)
        self.populate_me_links()

    def populate_me_links(self):
        """Collect absolute URLs from every tag carrying rel="me"."""
        self.me_links = [
            urlparse.urljoin(self.url, tag["href"])
            for tag in self.soup.findAll(attrs={"rel": "me"})
            if tag.get("href")
        ]