def del_bloomd_filter(cfg, crawler_name):
    # Dropping a filter needs a handle, so create_filter() is used to
    # attach to the (possibly existing) filter before dropping it.
    client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    f = client.create_filter(crawler_name)
    try:
        f.drop()
        print "[info] drop bloomd %s success" % crawler_name
    except Exception:
        print "[error] drop bloomd %s fail" % crawler_name
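
# Illustrative usage (not in the original source): assumes Config().get()
# returns the parsed config dict with a "BloomdNodes" entry, as it does
# elsewhere in these tests.
if __name__ == "__main__":
    cfg = Config().get()
    del_bloomd_filter(cfg, "test_crawler_1")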
def setUp(self):
    self.crawler_name = 'test'
    self.req_d = {
        'crawler_name': self.crawler_name,
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'proxy_name': 'http_china',
        'method': 'GET',
        'headers': {},
        'files': None,
        'data': None,
        'params': {},
        'auth': None,
        'cookies': {},
        'hooks': None,
        'json': None,
        'timeout': 10,
    }
    test_html_file = os.path.join(os.path.dirname(__file__), "test.html")
    with open(test_html_file, 'r') as f:
        html = f.read()
    self.resp_d = {
        'crawler_name': self.crawler_name,
        'http_request': json.dumps(self.req_d),
        'error_code': 0,
        'error_msg': '',
        'status_code': 200,
        'reason': 'OK',
        'html': html,
        'cookies': {},
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'headers': {},
        'encoding': None,
        'elapsed': None,
        'http_proxy': '127.0.0.1:8000',
    }
    cfg = Config().get()
    self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    self.publish_channel = conn.channel()
    self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    self.filter_q = FilterQueue(
        crawler_name=self.crawler_name,
        bloomd_client=self.bloomd_client
    )
    self.req_q = RequestQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
        filter_q=self.filter_q
    )
    # Markers used to detect soft-404 pages; u"页面不存在" means
    # "page does not exist".
    html_404_strings = [['Page', 'Not', 'Found'], [u"页面不存在"]]
    # Build a minimal stand-in worker exposing only the attributes the
    # plugin under test reads.
    fake_worker = type("Worker", (object,), {})
    fake_worker.crawler_name = self.crawler_name
    fake_worker.req_q = self.req_q
    fake_worker.publish_channel = self.publish_channel
    fake_worker.html_404_strings = html_404_strings
    self.error_handler = Plugin(fake_worker)
def test():
    # Benchmark: push 50k URLs through the bloomd-backed filter queue
    # and report the sustained push rate.
    bloomd_client = get_client()
    crawler_name = "test_crawler_1"
    queue = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name)
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in xrange(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))
        queue.push(url)
    end_time = time.time()
    print "start time: ", start_time
    print "end time: ", end_time
    print "speed: %f times/second" % (page_cnt / (end_time - start_time))
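
# Illustrative only: a minimal dedup-queue sketch built on the pybloomd
# client API (create_filter / add), showing the contract the benchmark
# above exercises. The real FilterQueue in this repo may differ; the name
# _FilterQueueSketch is hypothetical.
class _FilterQueueSketch(object):
    def __init__(self, bloomd_client, crawler_name):
        # One bloomd filter per crawler, shared across workers.
        self.filter = bloomd_client.create_filter(crawler_name)

    def push(self, url):
        # add() returns True only the first time a key is seen, so a
        # single round trip both records and deduplicates the URL.
        return self.filter.add(url)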
def test():
    crawler_name = "weibo"
    cfg = Config().get()
    ssdb_clients, ring = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    filter_q = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name)
    queue_name = "http_request:%s:test" % crawler_name
    queue = RequestQueue(crawler_name, ssdb_clients=ssdb_clients,
                         filter_q=filter_q, queue_name=queue_name)

    # Declare the exchange/queue pair on a throwaway channel, then publish
    # on a separate channel with delivery confirms enabled.
    ch = conn.channel()
    ch.exchange_declare(exchange=crawler_name, exchange_type="topic", durable=True)
    ch.queue_declare(queue=queue_name, durable=True)
    ch.queue_bind(exchange=crawler_name, queue=queue_name, routing_key=queue_name)
    ch.close()

    publish_channel = conn.channel()
    publish_channel.confirm_delivery()

    # Benchmark: push 50k requests and report the sustained push rate.
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in xrange(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))
        r = Request(url=url, timeout=15, headers={}, crawler_name=crawler_name)
        queue.push(r, publish_channel)
    end_time = time.time()

    print "start time: ", start_time
    print "end time: ", end_time
    print "speed: %f times/second" % (page_cnt / (end_time - start_time))
def setUp(self):
    cfg = Config().get()
    self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    self.crawler_name = 'test_crawler'
    self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    self.filter_q = FilterQueue(
        crawler_name=self.crawler_name,
        bloomd_client=self.bloomd_client
    )
    self.req_q = RequestQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
        filter_q=self.filter_q
    )
    self.resp_q = ResponseQueue(
        self.crawler_name,
        ssdb_clients=self.ssdb_clients,
    )
    self.publish_channel = conn.channel()
    self.req_d = {
        'crawler_name': self.crawler_name,
        'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
        'proxy_name': 'http_china',
        'method': 'GET',
        'headers': {},
        'files': None,
        'data': None,
        'params': {},
        'auth': None,
        'cookies': {},
        'hooks': None,
        'json': None,
        'timeout': 10,
    }
    self.req = Request(**self.req_d)
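
# The original snippet has no matching tearDown; a minimal cleanup sketch,
# reusing the create_filter()/drop() pattern from del_bloomd_filter() above
# so repeated runs start from an empty bloomd filter:
def tearDown(self):
    try:
        # Sketch only: assumes the same pybloomd-style client API as above.
        self.bloomd_client.create_filter(self.crawler_name).drop()
    except Exception:
        pass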