Example #1
    def __init__(self, cfg):
        # Source queue and RabbitMQ connection settings come from the config dict.
        self.from_queue = cfg["from_queue"]
        self.from_host = cfg["from_host"]
        self.from_port = cfg["from_port"]
        self.crawler = cfg["crawler"]
        self.credentials = pika.PlainCredentials(cfg["user"], cfg["password"])
        # SSDB clients are built from the node list in the global config.
        nodes = Config().get()["SSDBNodes"]
        self.ssdb_clients = get_clients(nodes=nodes)
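A minimal sketch of the cfg dict this constructor expects; the key names are taken from the lookups above, while all of the values are hypothetical:

# Hypothetical values; only the key names come from the __init__ above.
cfg = {
    "from_queue": "http_request:weibo:test",
    "from_host": "127.0.0.1",
    "from_port": 5672,        # default RabbitMQ port
    "crawler": "weibo",
    "user": "guest",
    "password": "guest",
}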
Example #2
    def setUp(self):
        self.crawler_name = 'test'
        self.req_d = {
            'crawler_name': self.crawler_name,
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'proxy_name': 'http_china',
            'method': 'GET',
            'headers': {},
            'files': None,
            'data': None,
            'params': {},
            'auth': None,
            'cookies': {},
            'hooks': None,
            'json': None,
            'timeout': 10,
        }
        test_html_file = os.path.join(os.path.dirname(__file__), "test.html")
        with open(test_html_file, 'r') as f:
            html = f.read()

        self.resp_d = {
            'crawler_name': self.crawler_name,
            'http_request': json.dumps(self.req_d),
            'error_code': 0,
            'error_msg': '',
            'status_code': 200,
            'reason': 'OK',
            'html': html,
            'cookies': {},
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'headers': {},
            'encoding': None,
            'elapsed': None,
            'http_proxy': '127.0.0.1:8000'
        }
        cfg = Config().get()
        self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
        conn = create_conn(cfg)
        self.publish_channel = conn.channel()
        self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
        self.filter_q = FilterQueue(
            crawler_name=self.crawler_name,
            bloomd_client=self.bloomd_client
        )
        self.req_q = RequestQueue(
            self.crawler_name,
            ssdb_clients=self.ssdb_clients,
            filter_q=self.filter_q
        )
        # Markers that identify a 404 page, in English and Chinese ("页面不存在" = "page does not exist").
        html_404_strings = [['Page', 'Not', 'Found'], [u"页面不存在"]]
        # Minimal stand-in for the worker: a bare class carrying only the
        # attributes the error-handler plugin reads.
        fake_worker = type("Worker", (object,), {})
        setattr(fake_worker, "crawler_name", self.crawler_name)
        setattr(fake_worker, "req_q", self.req_q)
        setattr(fake_worker, "publish_channel", self.publish_channel)
        setattr(fake_worker, "html_404_strings", html_404_strings)
        self.error_handler = Plugin(fake_worker)
Example #3
def get(args):
    # Read one key from the SSDB cluster and report which node served it.
    k = args.key
    if k == "":
        print "key cannot be empty"
        return
    nodes = Config().get()["SSDBNodes"]
    clients = get_clients(nodes=nodes)
    # Select the client that owns this key.
    client = get_client(clients, k)
    r = redis.Redis(connection_pool=client["connection_pool"])
    resp = r.get(k)
    print resp
    print "client: %s:%s" % (client["node"]["Host"], client["node"]["Port"])
Example #4
def status(args):
    # Count http_response keys for one crawler across every SSDB node.
    crawler_name = args.crawler_name
    if crawler_name == "":
        print "crawler_name cannot be empty"
        return
    print "show %s crawler http_response status..." % crawler_name
    # Key range for SSDB's `keys start end limit` range scan.
    start = "http_response:%s:" % crawler_name
    end = "http_response:%s:z" % crawler_name
    nodes = Config().get()["SSDBNodes"]
    clients = get_clients(nodes=nodes)
    total = 0
    for client in clients:
        print "%s:%s" % (client["node"]["Host"], client["node"]["Port"])
        r = redis.Redis(connection_pool=client["connection_pool"])
        keys = r.execute_command("keys", start, end, -1)
        total += len(keys)
        print "length: ", len(keys)
    print "total: ", total
Example #5
def test():
    # Benchmark: push 50,000 requests through RequestQueue and measure throughput.
    crawler_name = "weibo"
    cfg = Config().get()
    ssdb_clients, ring = get_clients(nodes=cfg["SSDBNodes"])
    conn = create_conn(cfg)
    bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
    filter_q = FilterQueue(bloomd_client=bloomd_client, crawler_name=crawler_name)
    queue_name = "http_request:%s:test" % crawler_name
    queue = RequestQueue(crawler_name, ssdb_clients=ssdb_clients, filter_q=filter_q, queue_name=queue_name)
    # Declare a durable topic exchange and queue, bound by the queue name.
    ch = conn.channel()
    ch.exchange_declare(
        exchange=crawler_name,
        exchange_type="topic",
        durable=True
    )
    ch.queue_declare(
        queue=queue_name,
        durable=True
    )
    ch.queue_bind(
        exchange=crawler_name,
        queue=queue_name,
        routing_key=queue_name
    )
    ch.close()
    # Publish on a separate channel with publisher confirms enabled.
    publish_channel = conn.channel()
    publish_channel.confirm_delivery()
    page_cnt = 50000
    cnt = 0
    logger = get_logger("test_queue")
    start_time = time.time()
    for i in xrange(1, page_cnt + 1):
        url = "http://stackoverflow.com/users?page=%d&tab=reputation&filter=week" % i
        cnt += 1
        if cnt % 1000 == 0:
            logger.info(str(cnt))
        r = Request(url=url, timeout=15, headers={}, crawler_name=crawler_name)
        queue.push(r, publish_channel)
    end_time = time.time()
    print "start time: ", start_time
    print "end time: ", end_time
    print "speed: %f times/second" % (page_cnt / (end_time - start_time))
Example #6
    def setUp(self):
        cfg = Config().get()
        self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
        conn = create_conn(cfg)
        self.crawler_name = 'test_crawler'
        self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
        self.filter_q = FilterQueue(
            crawler_name=self.crawler_name,
            bloomd_client=self.bloomd_client
        )
        self.req_q = RequestQueue(
            self.crawler_name,
            ssdb_clients=self.ssdb_clients,
            filter_q=self.filter_q
        )
        self.resp_q = ResponseQueue(
            self.crawler_name,
            ssdb_clients=self.ssdb_clients,
        )
        self.publish_channel = conn.channel()
        self.req_d = {
            'crawler_name': self.crawler_name,
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'proxy_name': 'http_china',
            'method': 'GET',
            'headers': {},
            'files': None,
            'data': None,
            'params': {},
            'auth': None,
            'cookies': {},
            'hooks': None,
            'json': None,
            'timeout': 10,
        }
        self.req = Request(**self.req_d)