Пример #1
0
class MiningServer(object):
    """Download pages through an ``HttpReactor`` and archive a cleaned copy.

    Each successfully downloaded body is run through an lxml ``Cleaner`` and
    written, gzip-compressed and UTF-8 encoded, to
    ``../data/mining_task/<obj_id>``.
    """

    def __init__(self):
        """Create the reactor, the HTML cleaner/parser and the logger."""
        self.reactor = HttpReactor()
        # Cleaner options per the lxml docs: drop <style>/<script> content,
        # keep the page skeleton (html/head/title) and keep all attributes.
        self.cleaner = lxml.html.clean.Cleaner(style=True, scripts=True, page_structure=False, safe_attrs_only=False)
        # NOTE(review): html_parser is not referenced by any method of this
        # class as shown here.
        self.html_parser = lxml.html.HTMLParser(encoding="utf-8")
        self.logger = logging.getLogger("root")

    def process_body(self, body, url, obj_id):
        """Clean a downloaded body and store it gzip-compressed on disk.

        Args:
            body: Raw page body; converted with the project helper
                ``to_unicode`` before any processing.
            url: Source URL, only echoed to stdout as a progress marker.
            obj_id: Identifier used (via ``str()``) as the output file name
                under ``../data/mining_task/``.
        """
        body = to_unicode(body)
        # str.replace returns a new string; the original code discarded it, so
        # the XML declaration was never actually removed.  Presumably it must
        # go because lxml rejects unicode input that carries an encoding
        # declaration -- confirm against the lxml version in use.
        body = body.replace('<?xml version="1.0" encoding="utf-8"?>', "")
        body = self.cleaner.clean_html(body)
        with open("../data/mining_task/" + str(obj_id), "wb") as fout:
            g = gzip.GzipFile(mode="wb", fileobj=fout)
            try:
                g.write(body.encode("utf-8"))
            finally:
                # Close the gzip stream first so its trailer is flushed before
                # the underlying file is closed by the ``with`` block.
                g.close()
        # Single-argument parenthesised form prints identically on Python 2.
        print(url)
        # print body[:100].encode('utf-8')

    def process_error(self, failure, url, obj_id):
        """Report a failed download on stdout and in the log.

        Args:
            failure: Error object exposing ``getErrorMessage()`` and
                ``getTraceback()`` (looks like a Twisted ``Failure`` --
                TODO confirm).
            url: URL whose download failed.
            obj_id: Task identifier; unused here, kept for the callback
                signature.
        """
        print(failure.getErrorMessage())
        # Lazy %-args: same message text, but a formatting problem is handled
        # by the logging machinery instead of raising inside the error handler.
        self.logger.error("download error, url:%s, msg:%s", url, failure.getTraceback())

    def process_task(self, url, obj_id):
        """Schedule the download of ``url`` and wire up the result handlers.

        Args:
            url: Page URL as text; it is UTF-8 encoded before being handed to
                the reactor.
            obj_id: Identifier forwarded to ``process_body``/``process_error``.
        """
        url = url.encode("utf-8")
        # Each handler is a (callable, extra positional args, extra kwargs)
        # triple.  Presumably the reactor calls the callable with its own
        # result first, followed by the extra args -- verify in HttpReactor.
        requestProcess = (lambda x: None, (), {})
        bodyProcess = (self.process_body, (url, obj_id), {})
        errorProcess = (self.process_error, (url, obj_id), {})

        # print "process_task:", url
        self.reactor.download_and_process(url, None, requestProcess, bodyProcess, errorProcess, redirect=True)

    def run(self):
        """Run the underlying reactor."""
        self.reactor.run()
Пример #2
0
def main():
    """Start one ``MinerServer`` thread per active seed URL, then run the reactor.

    Logging is configured from ``../conf/log_mining_crawler.conf`` and the
    crawler settings are read from ``../conf/mining_crawler.conf``.
    """
    # Signal handling is currently disabled:
    #signal.signal(signal.SIGINT, lambda : sys.exit(0))
    #signal.signal(signal.SIGTERM, lambda : sys.exit(0))

    logging.config.fileConfig("../conf/log_mining_crawler.conf")
    http_reactor = HttpReactor()
    pool = HttpThreadpool(40, 200)
    crawler_conf = ConfigParser.ConfigParser()
    crawler_conf.read('../conf/mining_crawler.conf')

    # Full list of news portals; it is overridden right below and therefore
    # not crawled as the code stands.
    seed_urls = [
        'http://news.qq.com/',
        'http://news.163.com/',
        'http://news.sina.com.cn/',
        'http://news.ifeng.com/',
        'http://news.sohu.com/',
        'http://www.xinhuanet.com/',
    ]
    # Only this single seed is actually used.
    seed_urls = ['http://news.qq.com/']

    for seed in seed_urls:
        server = MinerServer(http_reactor, pool, [seed], crawler_conf, False)
        worker = threading.Thread(target=server.start, args=(False,))
        # Daemon threads do not keep the process alive on their own.
        worker.daemon = True
        worker.start()

    # Only referenced by the disabled manual-bootstrap code below.
    manual_url = 'http://sports.163.com/'
    #first_task = miner_server.db_helper.init_mining_job(url)
    #miner_server.process_task(first_task)

    http_reactor.run()
Пример #3
0
def main():
    """Start the seed scheduler and hub server, enqueue a seed, run the reactor.

    Logging is configured from ``../conf/seed_log.conf``; the hub server reads
    its settings from ``../conf/url_dedup.conf``.
    """
    # Signal handling is currently disabled:
    #signal.signal(signal.SIGINT, lambda : sys.exit(0))
    #signal.signal(signal.SIGTERM, lambda : sys.exit(0))

    logging.config.fileConfig("../conf/seed_log.conf")

    # Connection settings handed to the scheduler.
    scheduler_conf = {
        "address": "localhost",
        "port": 10010,
        "db_name": "news_crawler",
    }
    queue_service = BlockingQueueService(100)
    seed_handler = SeedHandler(queue_service)
    seed_scheduler = SeedScheduler('background', seed_handler, scheduler_conf)
    seed_scheduler.start()

    http_reactor = HttpReactor()
    dedup_conf = ConfigParser.ConfigParser()
    dedup_conf.read('../conf/url_dedup.conf')
    hub = HubServer(http_reactor, queue_service, dedup_conf)
    hub_thread = threading.Thread(target=hub.start)
    hub_thread.daemon = True
    hub_thread.start()

    # Alternative seed; assigned but not used as the code stands.
    sina_roll_url = "http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page=1"
    seed_url = "http://www.163.com/"

    # The same seed is queued twice, each time as a fresh SeedTask.
    for _unused in xrange(2):
        queue_service.put(SeedTask(seed_url), 1)
    hub.process_task(seed_url)
    http_reactor.run()
Пример #4
0
def main():
    """Download a single hard-coded article through ``HttpReactor``.

    The module-level ``process_request``, ``process_body`` and
    ``process_error`` callbacks are registered, each with the URL as its one
    extra positional argument.
    """
    http_reactor = HttpReactor()

    target = 'http://3g.163.com/news/16/0101/00/BC70TOEK00014AED.html'
    # One (callable, extra positional args, extra kwargs) triple per stage.
    on_request, on_body, on_error = [
        (callback, (target,), {})
        for callback in (process_request, process_body, process_error)
    ]

    http_reactor.download_and_process(
        target, None, on_request, on_body, on_error, redirect=True
    )
    http_reactor.run()
Пример #5
0
 def __init__(self):
     """Create the reactor, the HTML cleaner/parser and the logger."""
     self.reactor = HttpReactor()
     # Cleaner options per the lxml docs: drop <style>/<script> content,
     # keep the page skeleton and keep all attributes.
     cleaner_options = dict(
         style=True,
         scripts=True,
         page_structure=False,
         safe_attrs_only=False,
     )
     self.cleaner = lxml.html.clean.Cleaner(**cleaner_options)
     self.html_parser = lxml.html.HTMLParser(encoding="utf-8")
     self.logger = logging.getLogger("root")