# -*- coding: utf-8 -*-
import os
import json
import unittest

import chardet

# Project-specific imports: the exact module paths for Config, get_clients,
# create_conn, bloomd, FilterQueue, RequestQueue, Plugin, and Response depend
# on the crawler package under test and are assumed to be importable here.


class TestErrorHandler(unittest.TestCase):
    def setUp(self):
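        # A representative crawl request; it is serialized into the
        # http_request field of every response fixture below.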
        self.crawler_name = 'test'
        self.req_d = {
            'crawler_name': self.crawler_name,
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'proxy_name': 'http_china',
            'method': 'GET',
            'headers': {},
            'files': None,
            'data': None,
            'params': {},
            'auth': None,
            'cookies': {},
            'hooks': None,
            'json': None,
            'timeout': 10,
        }
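        # Load a known-good page body for the 200 OK fixture.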
        test_html_file = os.path.join(os.path.dirname(__file__), "test.html")
        with open(test_html_file, 'r') as f:
            html = f.read()

        # A successful (200 OK) response fixture; reused as the negative
        # case in handle_error_file below.
        self.resp_d = {
            'crawler_name': self.crawler_name,
            'http_request': json.dumps(self.req_d),
            'error_code': 0,
            'error_msg': '',
            'status_code': 200,
            'reason': 'OK',
            'html': html,
            'cookies': {},
            'url': 'http://stackoverflow.com/users/1144035/gordon-linoff',
            'headers': {},
            'encoding': None,
            'elapsed': None,
            'http_proxy': '127.0.0.1:8000'
        }
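        # Wire up the backing services the plugin depends on: SSDB-backed
        # request queue, a Bloomd-backed dedup filter, and a publish channel
        # opened on the connection from create_conn(cfg).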
        cfg = Config().get()
        self.ssdb_clients = get_clients(nodes=cfg["SSDBNodes"])
        conn = create_conn(cfg)
        self.publish_channel = conn.channel()
        self.bloomd_client = bloomd.get_client(nodes=cfg["BloomdNodes"])
        self.filter_q = FilterQueue(
            crawler_name=self.crawler_name,
            bloomd_client=self.bloomd_client
        )
        self.req_q = RequestQueue(
            self.crawler_name,
            ssdb_clients=self.ssdb_clients,
            filter_q=self.filter_q
        )
        # Word groups that mark a soft-404 page (English and Chinese).
        html_404_strings = [['Page', 'Not', 'Found'], [u"页面不存在"]]
        # Minimal stand-in exposing only the worker attributes the plugin reads.
        fake_worker = type("Worker", (object,), {
            "crawler_name": self.crawler_name,
            "req_q": self.req_q,
            "publish_channel": self.publish_channel,
            "html_404_strings": html_404_strings,
        })
        self.error_handler = Plugin(fake_worker)

    def test_404_err(self):
        err_html_file = os.path.join(os.path.dirname(__file__), '404_err.html')
        self.handle_error_file(err_html_file)

    def test_zh_404_err(self):
        err_html_file = os.path.join(os.path.dirname(__file__), "zh_404_err.html")
        self.handle_error_file(err_html_file)

    def handle_error_file(self, filename):
        # Read raw bytes so chardet can sniff the encoding, then decode to
        # text; chardet.detect() expects bytes, so text mode would break it.
        with open(filename, "rb") as f:
            raw = f.read()
        encoding = chardet.detect(raw)["encoding"]
        html = raw.decode(encoding)

        # A genuine-404 fixture: 404 status plus a real error-page body.
        self.err_resp_d = {
            'crawler_name': self.crawler_name,
            'http_request': json.dumps(self.req_d),
            'error_code': 0,
            'error_msg': '',
            'status_code': 404,
            'reason': 'Not Found',
            'html': html,
            'cookies': {},
            'url': 'http://stackoverflow.com/test',
            'headers': {},
            'encoding': None,
            'elapsed': None,
            'http_proxy': '127.0.0.1:8000'
        }

        resp = Response()
        resp.from_json(json.dumps(self.err_resp_d))
        self.assertTrue(self.error_handler.handle_err(resp))

        # Mismatched case: a 404 status code on a body that is not a real
        # 404 page, so the handler should not treat it as one.
        wrong_err_resp = dict(self.resp_d)
        wrong_err_resp['status_code'] = 404
        resp.from_json(json.dumps(wrong_err_resp))
        self.assertFalse(self.error_handler.handle_err(resp))
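
# A minimal runner sketch; note that setUp connects to live SSDB, Bloomd,
# and broker endpoints taken from Config, so those services are assumed
# to be reachable when the tests are run this way.
if __name__ == "__main__":
    unittest.main()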