Пример #1
0
class MiningServer(object):
    """Download pages through an ``HttpReactor`` and archive their cleaned HTML.

    Each successfully downloaded body is sanitized with an lxml ``Cleaner``
    and written, gzip-compressed and UTF-8 encoded, to
    ``../data/mining_task/<obj_id>``.
    """

    # XML declaration stripped from bodies before they are handed to lxml.
    # lxml rejects unicode input that still carries an encoding declaration
    # (it raises ValueError), so it has to go before ``clean_html`` runs.
    _XML_DECLARATION = '<?xml version="1.0" encoding="utf-8"?>'

    def __init__(self):
        self.reactor = HttpReactor()
        # Drops <style> and <script> content; keeps page structure tags and
        # does not restrict attributes to the "safe" whitelist.
        self.cleaner = lxml.html.clean.Cleaner(style=True, scripts=True, page_structure=False, safe_attrs_only=False)
        self.html_parser = lxml.html.HTMLParser(encoding="utf-8")
        self.logger = logging.getLogger("root")

    @staticmethod
    def _strip_xml_declaration(body):
        """Return *body* with every UTF-8 XML declaration removed."""
        return body.replace(MiningServer._XML_DECLARATION, "")

    def process_body(self, body, url, obj_id):
        """Clean a downloaded page and store it gzipped under *obj_id*.

        Args:
            body: Raw response body as delivered by the reactor.
            url: Address the body was fetched from (echoed to stdout).
            obj_id: Identifier used as the output file name.
        """
        body = to_unicode(body)
        # str.replace returns a new string; the result must be re-bound,
        # otherwise the declaration silently stays in the document.
        body = self._strip_xml_declaration(body)
        body = self.cleaner.clean_html(body)
        with open("../data/mining_task/" + str(obj_id), "wb") as fout:
            # The gzip stream is closed before the underlying file so the
            # trailer is flushed into it.
            with gzip.GzipFile(mode="wb", fileobj=fout) as gz:
                gz.write(body.encode("utf-8"))
        print(url)

    def process_error(self, failure, url, obj_id):
        """Report a failed download on stdout and in the log.

        Args:
            failure: Error object exposing ``getErrorMessage()`` and
                ``getTraceback()`` (presumably a Twisted ``Failure``).
            url: Address whose download failed.
            obj_id: Identifier of the task; unused here, kept so the
                signature matches the other callbacks.
        """
        print(failure.getErrorMessage())
        # Lazy %-args: formatting is left to the logging framework.
        self.logger.error("download error, url:%s, msg:%s", url, failure.getTraceback())

    def process_task(self, url, obj_id):
        """Schedule the download of *url* and register the callbacks for it.

        Args:
            url: Address to fetch; it is UTF-8 encoded before being passed on.
            obj_id: Identifier forwarded to the body and error callbacks.
        """
        url = url.encode("utf-8")
        # Every hook is a (callable, extra positional args, extra keyword
        # args) triple; the request hook is a no-op.
        request_hook = (lambda x: None, (), {})
        body_hook = (self.process_body, (url, obj_id), {})
        error_hook = (self.process_error, (url, obj_id), {})

        self.reactor.download_and_process(url, None, request_hook, body_hook, error_hook, redirect=True)

    def run(self):
        """Start the underlying reactor."""
        self.reactor.run()
Пример #2
0
def main():
    """Fetch one hard-coded news page through an ``HttpReactor`` and run it.

    The module-level ``process_request``, ``process_body`` and
    ``process_error`` callables are registered as hooks, each receiving the
    page URL as its extra positional argument.
    """
    http_reactor = HttpReactor()

    target = 'http://3g.163.com/news/16/0101/00/BC70TOEK00014AED.html'
    # Build the three (callable, positional args, keyword args) hook triples
    # in one pass; each gets its own args tuple and kwargs dict.
    on_request, on_body, on_error = [
        (handler, (target,), {})
        for handler in (process_request, process_body, process_error)
    ]

    http_reactor.download_and_process(
        target,
        None,
        on_request,
        on_body,
        on_error,
        redirect=True,
    )
    http_reactor.run()