def add_url(self, url):
    """Put ``url`` on ``self.fetch_queue`` when its scheme is accepted.

    A URL for which ``isValidScheme`` returns a falsy value is not queued;
    a warning is logged instead. Always returns ``None``.
    """
    if isValidScheme(url):
        logger.debug("get url: %s" % url)
        self.fetch_queue.put(url)
    else:
        logger.warning("not vaild_scheme")
def do_work(self, url):
    """Fetch ``url``, parse the response and forward the URLs it yields.

    This is a generator-style coroutine: it yields the result of
    ``self.fetch(url)`` and expects to be resumed with the response.
    (Presumably it is driven by Tornado's coroutine machinery -- the
    decorator is not visible from here; confirm against the class.)

    On a successful fetch, ``self.parse(response)`` is called, ``url`` is
    appended to ``self.fetch_finished`` and every parsed URL is passed to
    ``add_url`` on a freshly created ``Sender``.

    Fetch errors are logged and not re-raised. ``self.fetching`` is
    decremented exactly once after a fetch has been attempted, whatever
    the outcome. A URL rejected by ``isValidScheme`` returns before the
    fetch and leaves ``self.fetching`` untouched.
    """
    if not isValidScheme(url):
        logger.warning("not vaild_scheme")
        # NOTE(review): the counter is deliberately not touched on this
        # path, matching the original behaviour -- confirm that callers do
        # not increment ``self.fetching`` before the scheme check.
        return

    try:
        response = yield self.fetch(url)
    except tornado.httpclient.HTTPError as e:
        # NOTE: an earlier attempt to also append these errors to
        # 'httperror.txt' was disabled because it ran into "too many open
        # files"; the error is only logged.
        logger.error("Url: %s HTTPError: %s ", url, e.code)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt,
        # SystemExit and GeneratorExit still propagate. ``logger.exception``
        # records the traceback together with the message.
        logger.exception("Unknow error with url: %s", url)
    else:
        url_gen = self.parse(response)
        self.fetch_finished.append(url)
        sender = Sender()
        for u in url_gen:
            sender.add_url(u)
        logger.info("fetched %s", url)
    finally:
        # Runs even if parsing or forwarding raises, so the in-flight
        # counter cannot leak.
        self.fetching -= 1