class DOCParser(HTMLParser): def __init__(self, *args, **kwargs): HTMLParser.__init__(self, *args, **kwargs) self._logger = logging.getLogger(self.__class__.__name__) self.msg = {} # local (per-document) href cache self.hrefs = [] self.protocols_r = r"^http://" self.mimetype_r = r"^text\/html" self.queue = QueueManager( mqhost="localhost", mqport=61613, userid="", passwd="", qin="to_parse", qout="to_persist", recv_callback=self.worker, encode_parms=["data"], encoding="base64", ) def start(self): self.queue.subscribe() def handle_starttag(self, tag, attrs): attrs_h = dict(attrs) if ( tag == "a" and attrs_h.has_key("href") and attrs_h["href"] is not self.msg["url"] and attrs_h["href"] not in self.hrefs and re.search(self.protocols_r, attrs_h["href"], re.IGNORECASE) ): self._logger.info("Found %s. Enqueuing..." % attrs_h["href"]) try: self.queue.enqueue(msg={"parent": self.msg["url"], "url": attrs_h["href"]}) except: pass else: self.hrefs.append(attrs_h["href"]) def worker(self, msg={}): self.reset() self.msg = msg self.hrefs = [] if re.search(self.mimetype_r, self.msg["headers"]["content-type"], re.IGNORECASE): self._logger.info("Received %s" % msg["url"]) self.feed(msg["data"])