예제 #1
0
 def setUpClass(self):
     """Create the queues, Fetcher, RPC proxy and runner handles for the tests.

     The Fetcher is wired to two ``Queue(10)`` instances.  Its ``xmlrpc_run``
     and ``run`` callables are handed to ``utils.run_in_thread`` (presumably
     background threads -- confirm against ``utils``), and an XML-RPC client
     proxy is pointed at the same local port the server is asked to use.
     """
     rpc_port = 24444  # single source for both the client URL and the server

     inq, outq = Queue(10), Queue(10)
     self.inqueue = inq
     self.outqueue = outq

     engine = Fetcher(inq, outq)
     self.fetcher = engine

     self.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % rpc_port)
     self.xmlrpc_thread = utils.run_in_thread(engine.xmlrpc_run, port=rpc_port)
     self.thread = utils.run_in_thread(engine.run)
예제 #2
0
def run_fetcher(g=g):
    """Build a Fetcher from the shared ``g`` settings and run it.

    The fetcher reads from ``g.scheduler2fetcher`` and writes to
    ``g.fetcher2processor``.  Its ``xmlrpc_run`` is handed to
    ``run_in_thread`` with the configured port and bind address, after which
    ``run()`` is called directly in the calling thread.

    :param g: settings object; defaults to the module-level ``g``.
    """
    from fetcher.tornado_fetcher import Fetcher

    worker = Fetcher(
        inqueue=g.scheduler2fetcher,
        outqueue=g.fetcher2processor,
    )
    worker.phantomjs_proxy = g.phantomjs_proxy

    # RPC endpoint first, then the fetch loop in the current thread.
    run_in_thread(
        worker.xmlrpc_run,
        port=g.fetcher_xmlrpc_port,
        bind=g.webui_host,
    )
    worker.run()
예제 #3
0
class TestTaskDB(unittest.TestCase):
    """Drive ``Fetcher.sync_fetch`` with an HTTP task and a ``data:`` URL task.

    NOTE(review): despite the class name, every test here exercises a
    ``Fetcher`` instance.
    """

    # Task fixture shared by all tests (class attribute).
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://httpbin.org/get',
        'fetch': {
            'method': 'GET',
            'headers': {'Cookie': 'a=b', 'a': 'b'},
            'timeout': 60,
            'save': 'abc',
        },
        'process': {'callback': 'callback', 'save': [1, 2, 3]},
    }

    def setUp(self):
        """Create a Fetcher without queues and start its ``run`` callable."""
        engine = Fetcher(None, None)
        self.fetcher = engine
        self.thread = utils.run_in_thread(engine.run)

    def tearDown(self):
        """Ask the fetcher to quit, then join the handle created in setUp."""
        self.fetcher.quit()
        self.thread.join()

    def test_http_get(self):
        """GET the fixture URL and check status, url, save and echoed headers."""
        task = self.sample_task_http
        response = self.fetcher.sync_fetch(task)

        self.assertEqual(response['status_code'], 200)
        self.assertEqual(response['orig_url'], task['url'])
        self.assertEqual(response['save'], task['fetch']['save'])
        self.assertIn('content', response)

        # The body is parsed as JSON and must carry the request headers;
        # the lower-case 'a' header is looked up as 'A'.
        echoed = json.loads(response['content'])
        self.assertIn('headers', echoed)
        self.assertIn('A', echoed['headers'])
        self.assertIn('Cookie', echoed['headers'])
        self.assertEqual(echoed['headers']['Cookie'], 'a=b')

    def test_dataurl_get(self):
        """A ``data:,hello`` URL yields status 200 and content ``hello``."""
        # Top-level copy with only 'url' replaced; the fixture is untouched.
        task = dict(self.sample_task_http, url='data:,hello')
        response = self.fetcher.sync_fetch(task)

        self.assertEqual(response['status_code'], 200)
        self.assertIn('content', response)
        self.assertEqual(response['content'], 'hello')
예제 #4
0
 def setUpClass(self):
     """Set up the shared Fetcher, its two queues and its XML-RPC access.

     Stores on ``self``: ``inqueue``/``outqueue`` (both ``Queue(10)``), the
     ``fetcher`` built on them, an ``rpc`` client proxy for the local port,
     and the two handles returned by ``utils.run_in_thread`` for the
     fetcher's ``xmlrpc_run`` and ``run`` callables.
     """
     port = 24444  # used for both the proxy URL and the xmlrpc_run call

     incoming, outgoing = Queue(10), Queue(10)
     self.inqueue = incoming
     self.outqueue = outgoing

     fetcher = Fetcher(incoming, outgoing)
     self.fetcher = fetcher

     self.rpc = xmlrpclib.ServerProxy("http://localhost:%d" % port)
     self.xmlrpc_thread = utils.run_in_thread(fetcher.xmlrpc_run, port=port)
     self.thread = utils.run_in_thread(fetcher.run)
예제 #5
0
class TestTaskDB(unittest.TestCase):
    """Tests for ``Fetcher.sync_fetch`` over HTTP and over a ``data:`` URL.

    NOTE(review): despite the class name, every test here exercises a
    ``Fetcher`` instance.
    """

    # Task fixture shared by all tests (class attribute); tests that need a
    # variation work on a copy.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://httpbin.org/get',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    def setUp(self):
        """Create a Fetcher without queues and start its ``run`` callable."""
        self.fetcher = Fetcher(None, None)
        self.thread = utils.run_in_thread(self.fetcher.run)

    def tearDown(self):
        """Ask the fetcher to quit, then join the handle created in setUp."""
        self.fetcher.quit()
        self.thread.join()

    def test_http_get(self):
        """GET the fixture URL and check status, url, save and echoed headers."""
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'],
                         self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        # The body is parsed as JSON and must carry the request headers;
        # the lower-case 'a' header is looked up as 'A'.
        content = json.loads(result['content'])
        self.assertIn('headers', content)
        self.assertIn('A', content['headers'])
        self.assertIn('Cookie', content['headers'])
        self.assertEqual(content['headers']['Cookie'], 'a=b')

    def test_dataurl_get(self):
        """A ``data:,hello`` URL yields status 200 and content ``hello``."""
        # Shallow copy is enough: only the top-level 'url' key is replaced.
        data = dict(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')
예제 #6
0
class TestTaskDB(unittest.TestCase):
    """Check ``Fetcher.sync_fetch`` for a plain HTTP GET task.

    NOTE(review): despite the class name, the test drives a ``Fetcher``.
    """

    # Task fixture (class attribute).  The 'fetch' section carries a 'data'
    # payload alongside method 'GET'.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://httpbin.org/get',
        'fetch': {
            'method': 'GET',
            'headers': {'Cookie': 'a=b', 'a': 'b'},
            'data': 'a=b&c=d',
            'timeout': 60,
        },
        'process': {'callback': 'callback', 'save': [1, 2, 3]},
    }

    def setUp(self):
        """Create a Fetcher without queues and run it in a daemon thread."""
        self.fetcher = Fetcher(None, None)
        runner = threading.Thread(target=self.fetcher.run)
        runner.daemon = True
        self.thread = runner
        runner.start()

    def tearDown(self):
        """Ask the fetcher to quit and wait for the runner thread to end."""
        self.fetcher.quit()
        self.thread.join()

    def test_http_get(self):
        """GET the fixture URL and verify status, url and echoed headers."""
        task = self.sample_task_http
        response = self.fetcher.sync_fetch(task)

        self.assertEqual(response['status_code'], 200)
        self.assertEqual(response['orig_url'], task['url'])
        self.assertIn('content', response)

        # The body is parsed as JSON and must carry the request headers;
        # the lower-case 'a' header is looked up as 'A'.
        echoed = json.loads(response['content'])
        self.assertIn('headers', echoed)
        self.assertIn('A', echoed['headers'])
        self.assertIn('Cookie', echoed['headers'])
        self.assertEqual(echoed['headers']['Cookie'], 'a=b')
예제 #7
0
class TestTaskDB(unittest.TestCase):
    """Single HTTP GET check against ``Fetcher.sync_fetch``.

    NOTE(review): despite the class name, the test drives a ``Fetcher``.
    """

    # Task fixture (class attribute).  The 'fetch' section carries a 'data'
    # payload alongside method 'GET'.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://httpbin.org/get',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b',
            },
            'data': 'a=b&c=d',
            'timeout': 60,
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    def setUp(self):
        """Build a Fetcher with no queues and start it on a daemon thread."""
        self.fetcher = Fetcher(None, None)
        background = threading.Thread(target=self.fetcher.run)
        background.daemon = True
        self.thread = background
        background.start()

    def tearDown(self):
        """Stop the fetcher, then block until its thread has finished."""
        self.fetcher.quit()
        self.thread.join()

    def test_http_get(self):
        """Fetch the fixture task and inspect status, url and echoed headers."""
        fetched = self.fetcher.sync_fetch(self.sample_task_http)

        self.assertEqual(fetched['status_code'], 200)
        self.assertEqual(fetched['orig_url'], self.sample_task_http['url'])
        self.assertIn('content', fetched)

        # Body is JSON; the request headers are expected under 'headers',
        # with the lower-case 'a' header appearing as 'A'.
        body = json.loads(fetched['content'])
        self.assertIn('headers', body)
        self.assertIn('A', body['headers'])
        self.assertIn('Cookie', body['headers'])
        self.assertEqual(body['headers']['Cookie'], 'a=b')
예제 #8
0
def run_webui(g=g):
    import cPickle as pickle

    from fetcher.tornado_fetcher import Fetcher
    fetcher = Fetcher(inqueue=None, outqueue=None, async=False)
    fetcher.phantomjs_proxy = g.phantomjs_proxy

    from webui.app import app
    app.config['taskdb'] = g.taskdb
    app.config['projectdb'] = g.projectdb
    app.config['resultdb'] = g.resultdb
    app.config['fetch'] = lambda x: fetcher.fetch(x)[1]
    app.config['scheduler_rpc'] = g.scheduler_rpc
    #app.config['cdn'] = '//cdnjs.cloudflare.com/ajax/libs/'
    if g.demo_mode:
        app.config['max_rate'] = 0.2
        app.config['max_burst'] = 3.0
    if 'WEBUI_USERNAME' in os.environ:
        app.config['webui_username'] = os.environ['WEBUI_USERNAME']
        app.config['webui_password'] = os.environ.get('WEBUI_PASSWORD', '')
    if not getattr(g, 'all_in_one', False):
        app.debug = g.debug
    app.run(host=g.webui_host, port=g.webui_port)
예제 #9
0
파일: run.py 프로젝트: 7uk0n/pyspider
def run_fetcher():
    """Create the fetcher on the module-level queues and run it.

    ``xmlrpc_run`` is handed to ``run_in_thread`` with the configured port;
    ``run()`` is then called directly in the calling thread.
    """
    from fetcher.tornado_fetcher import Fetcher

    worker = Fetcher(
        inqueue=scheduler2fetcher,
        outqueue=fetcher2processor,
    )

    # RPC endpoint first, then the fetch loop in the current thread.
    run_in_thread(worker.xmlrpc_run, port=fetcher_xmlrpc_port)
    worker.run()
예제 #10
0
 def setUp(self):
     """Create a Fetcher with no queues and hand its ``run`` to run_in_thread.

     The returned handle is kept on ``self.thread``.
     """
     engine = Fetcher(None, None)
     self.fetcher = engine
     self.thread = utils.run_in_thread(engine.run)
예제 #11
0
class TestFetcher(unittest.TestCase):
    """Fetcher tests: direct ``sync_fetch`` calls, the queue path and XML-RPC.

    A single Fetcher is created once for the whole class in ``setUpClass``.
    Test names carry numeric prefixes; unittest runs test methods in
    alphabetical order of their names.
    """

    # Task fixture shared by every test (class attribute).  Tests must work
    # on copies and never mutate it, including the nested "fetch" mapping.
    sample_task_http = {
        "taskid": "taskid",
        "project": "project",
        "url": "http://echo.opera.com/",
        "fetch": {"method": "GET", "headers": {"Cookie": "a=b", "a": "b"}, "timeout": 60, "save": "abc"},
        "process": {"callback": "callback", "save": [1, 2, 3]},
    }

    @classmethod
    def setUpClass(cls):
        """Create the queues, the Fetcher, the RPC proxy and the runner handles."""
        cls.inqueue = Queue(10)
        cls.outqueue = Queue(10)
        cls.fetcher = Fetcher(cls.inqueue, cls.outqueue)
        cls.rpc = xmlrpclib.ServerProxy("http://localhost:%d" % 24444)
        cls.xmlrpc_thread = utils.run_in_thread(cls.fetcher.xmlrpc_run, port=24444)
        cls.thread = utils.run_in_thread(cls.fetcher.run)

    @classmethod
    def tearDownClass(cls):
        """Call ``_quit`` through the RPC proxy and join the fetch-loop handle.

        ``xmlrpc_thread`` is not joined here.
        """
        cls.rpc._quit()
        cls.thread.join()

    def test_10_http_get(self):
        """GET the fixture URL; the body must show the sent headers."""
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result["status_code"], 200)
        self.assertEqual(result["orig_url"], self.sample_task_http["url"])
        self.assertEqual(result["save"], self.sample_task_http["fetch"]["save"])
        self.assertIn("content", result)

        content = result["content"]
        self.assertIn("..A:", content)
        self.assertIn("..Cookie:", content)
        self.assertIn("a=b", content)

    def test_10_http_post(self):
        """POST with a body and a ``cookies`` mapping; check what the body shows."""
        request = dict(self.sample_task_http)
        # dict() copies only the top level.  Copy the nested "fetch" mapping
        # too, so the POST settings do not leak into the shared class fixture
        # and from there into every later test.
        request["fetch"] = dict(request["fetch"])
        request["fetch"]["method"] = "POST"
        request["fetch"]["data"] = "binux"
        request["fetch"]["cookies"] = {"c": "d"}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result["status_code"], 200)
        self.assertEqual(result["orig_url"], self.sample_task_http["url"])
        self.assertEqual(result["save"], self.sample_task_http["fetch"]["save"])
        self.assertIn("content", result)

        content = result["content"]
        self.assertIn("<h2>POST", content)
        self.assertIn("..A:", content)
        self.assertIn("..Cookie:", content)
        # FIXME: cookies in headers not supported
        self.assertNotIn("a=b", content)
        self.assertIn("c=d", content)
        self.assertIn("binux", content)

    def test_20_dataurl_get(self):
        """A ``data:,hello`` URL yields status 200 and content ``hello``."""
        # Shallow copy is enough: only the top-level "url" key is replaced.
        data = dict(self.sample_task_http)
        data["url"] = "data:,hello"
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_30_with_queue(self):
        """Put a task on ``inqueue`` and read a (task, result) pair from ``outqueue``."""
        data = dict(self.sample_task_http)
        data["url"] = "data:,hello"
        self.inqueue.put(data)
        # No timeout is passed to get().
        task, result = self.outqueue.get()
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")

    def test_40_with_rpc(self):
        """Fetch through the XML-RPC proxy; the reply's ``.data`` is unpickled."""
        data = dict(self.sample_task_http)
        data["url"] = "data:,hello"
        # The pickled payload comes from the proxy created in setUpClass,
        # which points at localhost; do not reuse this on untrusted input.
        result = pickle.loads(self.rpc.fetch(data).data)
        self.assertEqual(result["status_code"], 200)
        self.assertIn("content", result)
        self.assertEqual(result["content"], "hello")
예제 #12
0
 def setUp(self):
     """Create a Fetcher with no queues and run it on a daemon thread."""
     self.fetcher = Fetcher(None, None)
     runner = threading.Thread(target=self.fetcher.run)
     runner.daemon = True
     self.thread = runner
     runner.start()
예제 #13
0
 def setUp(self):
     """Start a queue-less Fetcher's ``run`` loop on a daemon thread.

     Both the fetcher and the thread are stored on ``self``.
     """
     self.fetcher = Fetcher(None, None)
     background = threading.Thread(target=self.fetcher.run)
     background.daemon = True
     self.thread = background
     background.start()
예제 #14
0
파일: run.py 프로젝트: rmaC7/pyspider
def run_fetcher():
    """Run a Fetcher between the module-level scheduler and processor queues.

    The fetcher's ``xmlrpc_run`` is passed to ``run_in_thread`` together with
    ``fetcher_xmlrpc_port``; afterwards ``run()`` is invoked in the caller's
    thread.
    """
    from fetcher.tornado_fetcher import Fetcher

    engine = Fetcher(
        inqueue=scheduler2fetcher,
        outqueue=fetcher2processor,
    )

    run_in_thread(engine.xmlrpc_run, port=fetcher_xmlrpc_port)
    engine.run()
예제 #15
0
 def setUp(self):
     """Build a queue-less Fetcher and start its ``run`` via run_in_thread.

     Stores the fetcher on ``self.fetcher`` and the returned handle on
     ``self.thread``.
     """
     fetcher = Fetcher(None, None)
     self.fetcher = fetcher
     self.thread = utils.run_in_thread(fetcher.run)
예제 #16
0
class TestFetcher(unittest.TestCase):
    """Fetcher tests: direct ``sync_fetch`` calls, the queue path and XML-RPC.

    A single Fetcher is created once for the whole class in ``setUpClass``.
    Test names carry numeric prefixes; unittest runs test methods in
    alphabetical order of their names.
    """

    # Task fixture shared by every test (class attribute).  Tests must work
    # on copies and never mutate it, including the nested 'fetch' mapping.
    sample_task_http = {
        'taskid': 'taskid',
        'project': 'project',
        'url': 'http://echo.opera.com/',
        'fetch': {
            'method': 'GET',
            'headers': {
                'Cookie': 'a=b',
                'a': 'b',
            },
            'timeout': 60,
            'save': 'abc',
        },
        'process': {
            'callback': 'callback',
            'save': [1, 2, 3],
        },
    }

    @classmethod
    def setUpClass(cls):
        """Create the queues, the Fetcher, the RPC proxy and the runner handles."""
        cls.inqueue = Queue(10)
        cls.outqueue = Queue(10)
        cls.fetcher = Fetcher(cls.inqueue, cls.outqueue)
        cls.rpc = xmlrpclib.ServerProxy('http://localhost:%d' % 24444)
        cls.xmlrpc_thread = utils.run_in_thread(cls.fetcher.xmlrpc_run, port=24444)
        cls.thread = utils.run_in_thread(cls.fetcher.run)

    @classmethod
    def tearDownClass(cls):
        """Call ``_quit`` through the RPC proxy and join the fetch-loop handle.

        ``xmlrpc_thread`` is not joined here.
        """
        cls.rpc._quit()
        cls.thread.join()

    def test_10_http_get(self):
        """GET the fixture URL; the body must show the sent headers."""
        result = self.fetcher.sync_fetch(self.sample_task_http)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        self.assertIn('a=b', content)

    def test_10_http_post(self):
        """POST with a body and a ``cookies`` mapping; check what the body shows."""
        request = dict(self.sample_task_http)
        # dict() copies only the top level.  Copy the nested 'fetch' mapping
        # too, so the POST settings do not leak into the shared class fixture
        # and from there into every later test.
        request['fetch'] = dict(request['fetch'])
        request['fetch']['method'] = 'POST'
        request['fetch']['data'] = 'binux'
        request['fetch']['cookies'] = {'c': 'd'}
        result = self.fetcher.sync_fetch(request)
        self.assertEqual(result['status_code'], 200)
        self.assertEqual(result['orig_url'], self.sample_task_http['url'])
        self.assertEqual(result['save'], self.sample_task_http['fetch']['save'])
        self.assertIn('content', result)

        content = result['content']
        self.assertIn('<h2>POST', content)
        self.assertIn('..A:', content)
        self.assertIn('..Cookie:', content)
        # FIXME: cookies in headers not supported
        self.assertNotIn('a=b', content)
        self.assertIn('c=d', content)
        self.assertIn('binux', content)

    def test_20_dataurl_get(self):
        """A ``data:,hello`` URL yields status 200 and content ``hello``."""
        # Shallow copy is enough: only the top-level 'url' key is replaced.
        data = dict(self.sample_task_http)
        data['url'] = 'data:,hello'
        result = self.fetcher.sync_fetch(data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_30_with_queue(self):
        """Put a task on ``inqueue`` and read a (task, result) pair from ``outqueue``."""
        data = dict(self.sample_task_http)
        data['url'] = 'data:,hello'
        self.inqueue.put(data)
        # No timeout is passed to get().
        task, result = self.outqueue.get()
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')

    def test_40_with_rpc(self):
        """Fetch through the XML-RPC proxy; the reply's ``.data`` is unpickled."""
        data = dict(self.sample_task_http)
        data['url'] = 'data:,hello'
        # The pickled payload comes from the proxy created in setUpClass,
        # which points at localhost; do not reuse this on untrusted input.
        result = pickle.loads(self.rpc.fetch(data).data)
        self.assertEqual(result['status_code'], 200)
        self.assertIn('content', result)
        self.assertEqual(result['content'], 'hello')