class MiningServer(object):
    """Download pages through an ``HttpReactor`` and store the cleaned HTML on disk.

    Each task is identified by ``obj_id``; the cleaned page body is written
    gzip-compressed to ``../data/mining_task/<obj_id>``.
    """

    def __init__(self):
        """Create the reactor, the HTML cleaner/parser and the logger."""
        self.reactor = HttpReactor()
        # Cleaner configured with style/scripts removal enabled, while the
        # page-structure and safe-attribute filters are switched off.
        self.cleaner = lxml.html.clean.Cleaner(
            style=True,
            scripts=True,
            page_structure=False,
            safe_attrs_only=False,
        )
        self.html_parser = lxml.html.HTMLParser(encoding="utf-8")
        self.logger = logging.getLogger("root")

    def process_body(self, body, url, obj_id):
        """Clean a downloaded page body and write it gzip-compressed to disk.

        Args:
            body: Raw page content; converted with ``to_unicode`` first.
            url: URL the body was fetched from (only printed for progress).
            obj_id: Task identifier; ``str(obj_id)`` is used as the file name
                under ``../data/mining_task/``.
        """
        body = to_unicode(body)
        # Strings are immutable: ``replace`` returns a new string, so the
        # result must be re-bound or the XML declaration is never removed.
        # NOTE(review): presumably the declaration is stripped because lxml
        # rejects unicode input carrying an encoding declaration - confirm.
        body = body.replace('<?xml version="1.0" encoding="utf-8"?>', "")
        body = self.cleaner.clean_html(body)
        with open("../data/mining_task/" + str(obj_id), "wb") as fout:
            g = gzip.GzipFile(mode="wb", fileobj=fout)
            try:
                g.write(body.encode("utf-8"))
            finally:
                # Closing the GzipFile flushes the gzip trailer into ``fout``
                # before the ``with`` block closes the underlying file.
                g.close()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(url)

    def process_error(self, failure, url, obj_id):
        """Report a failed download on stdout and in the log.

        Args:
            failure: Failure object exposing ``getErrorMessage()`` and
                ``getTraceback()``.
            url: URL whose download failed.
            obj_id: Task identifier (unused here; part of the callback
                signature built in ``process_task``).
        """
        print(failure.getErrorMessage())
        # Lazy %-style arguments: formatting is left to the logging module.
        self.logger.error(
            "download error, url:%s, msg:%s", url, failure.getTraceback()
        )

    def process_task(self, url, obj_id):
        """Schedule the download of ``url`` and register the result callbacks.

        Args:
            url: Address to fetch; encoded to UTF-8 before being handed to
                the reactor.
            obj_id: Task identifier forwarded to the body/error callbacks.
        """
        url = url.encode("utf-8")
        # Each callback spec is a (callable, extra_args, extra_kwargs) triple.
        # The request callback is a deliberate no-op.
        request_process = (lambda x: None, (), {})
        body_process = (self.process_body, (url, obj_id), {})
        error_process = (self.process_error, (url, obj_id), {})
        self.reactor.download_and_process(
            url, None, request_process, body_process, error_process,
            redirect=True
        )

    def run(self):
        """Start the underlying reactor."""
        self.reactor.run()
def main():
    """Fetch one hard-coded news page through an ``HttpReactor`` and run it.

    The module-level ``process_request``, ``process_body`` and
    ``process_error`` callables are registered as callbacks, each receiving
    the page URL as its single extra positional argument.
    """
    http_reactor = HttpReactor()
    target = 'http://3g.163.com/news/16/0101/00/BC70TOEK00014AED.html'
    # Build the three (callable, extra_args, extra_kwargs) callback specs;
    # every spec gets its own args tuple and its own empty kwargs dict.
    on_request, on_body, on_error = [
        (handler, (target,), {})
        for handler in (process_request, process_body, process_error)
    ]
    http_reactor.download_and_process(
        target, None, on_request, on_body, on_error, redirect=True
    )
    http_reactor.run()