def __extract_url(self, url): """Extracts the links in the input URL """ import urllib2 from urllister import URLLister from sgmllib import SGMLParseError req = urllib2.Request(url, headers={'User-Agent' : self.useragent}) try: usock = urllib2.urlopen(req) parser = URLLister(url) try: parser.feed(usock.read()) parser.close() except Exception as exception: if (self.debug > 0): print "sgmllib: Unable to parse web page.\n sgmllib: Raised exception %s"%(type(exception).__name__) fd = open(self.outdir+"%s.err"%type(exception).__name__, "a") fd.write( "%s\n"%(url)) fd.close() pass usock.close() return parser.urls except (KeyboardInterrupt, SystemExit): raise except Exception as exception: if (self.debug > 0): print "urllib2: Page does not exist or Malformed web address.\n sgmllib: Raised exception %s"%(type(exception).__name__) fd = open(self.outdir+"%s.err"%type(exception).__name__, "a") fd.write( "%s\n"%(url)) fd.close() return []
def postPingBacks(newbody, post_url):
    """Make the pingback call for every link found in *newbody*.

    Parses *newbody* with URLLister, sends a pingback from each
    discovered link to *post_url*, and returns a list of
    ``(url, result)`` tuples in discovery order.
    """
    parser = URLLister()
    parser.feed(newbody)
    parser.close()
    # Coerce each discovered link to str, then ping it and pair the
    # target with the outcome of the call.
    return [(target, sendPingback(target, post_url))
            for target in map(str, parser.urls)]
def hasLinkToTarget(self, sourceURI, targetURI):
    """Return 1 if the page at *sourceURI* contains a link to *targetURI*,
    else 0.

    Downloads *sourceURI*, extracts its links with URLLister, and tests
    membership.  Integer 1/0 return values are kept for backward
    compatibility with existing callers.
    """
    sock = urllib.urlopen(sourceURI)
    try:
        html = sock.read()
    finally:
        # Close the socket even if the read fails (original leaked it here).
        sock.close()
    # use Mark Pilgrim's URLLister from dive into python, chapter 8
    parser = URLLister()
    parser.feed(html)
    parser.close()
    if targetURI in parser.urls:
        return 1
    return 0