def last_request_status(request):
    """Return the most recently created requests with their current status.

    The optional ``size`` query parameter caps the number of items
    returned; it defaults to 10.
    """
    settings = request.registry.settings

    raw_size = request.params.get('size', 10)
    try:
        limit = int(raw_size)
    except ValueError:
        # NOTE(review): a malformed client parameter would normally be a
        # 400 (HTTPBadRequest) rather than a 502 (HTTPBadGateway); kept
        # as-is to preserve the existing observable behavior.
        raise exc.HTTPBadGateway(detail="Size parameter has incorrect value")

    # Fetch the most recent requests recorded in the local database.
    dbinterf = web_runner.db.DbInterface(
        settings['db_filename'], recreate=False)
    recent = dbinterf.get_last_requests(limit)
    dbinterf.close()

    # Ask Scrapyd for the status of every job it knows about.
    scrapyd_interf = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])
    job_statuses = scrapyd_interf.get_jobs()

    # Derive each request's overall status from the statuses of all the
    # job ids associated with it.
    for entry in recent:
        entry['status'] = get_request_status(entry, job_statuses)

    return recent
class ScrapydTest(unittest.TestCase):
    """Unit tests for the Scrapyd client wrapper.

    All HTTP traffic is mocked with ``unittest.mock``; no real Scrapyd
    instance is contacted. Several tests rely on the exact order of the
    mocked responses (``side_effect`` lists), mirroring the order in
    which the subject issues its HTTP requests.
    """

    # Show full diffs on assertion failures.
    maxDiff = None

    # Base URL of the fake Scrapyd instance and the endpoints the
    # subject is expected to hit.
    URL = 'http://example.com'
    EXPECTED_LIST_JOBS_URL = URL + '/listjobs.json?project=test'
    EXPECTED_LIST_PROJECTS_URL = URL + '/listprojects.json'
    EXPECTED_LIST_SPIDERS_URL = URL + '/listspiders.json?project=test'
    # Queue summary expected when Scrapyd reports no jobs at all.
    EMPTY_QUEUE = {'running': 0, 'finished': 0, 'pending': 0}

    def setUp(self):
        # Always clear the cache so that tests are independent.
        Scrapyd._CACHE.clear()
        self.subject = Scrapyd(ScrapydTest.URL)

    def test_when_status_is_not_ok_then_it_should_report_an_error(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {"status": "ERROR", "message": "Test"}

            self.assertRaises(
                exc.HTTPBadGateway, self.subject.get_queues, ['test'])

            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_queues_are_empty_then_it_should_return_empty_queues(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [],
                "running": [],
                "finished": [],
            }

            queues, summary = self.subject.get_queues(['test'])

            self.assertEqual({'test': self.EMPTY_QUEUE}, queues)
            self.assertEqual(self.EMPTY_QUEUE, summary)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_queues_have_jobs_then_it_should_return_their_state(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [{
                    "id": "78391cc0fcaf11e1b0090800272a6d06",
                    "project_name": "spider1",
                }],
                "running": [],
                "finished": [{
                    "id": "2f16646cfcaf11e1b0090800272a6d06",
                    "spider": "spider3",
                    "start_time": "2012-09-12 10:14:03.594664",
                    "end_time": "2012-09-12 10:24:03.594664"
                }],
            }

            queues, summary = self.subject.get_queues(['test'])

            expected_queue = {'running': 0, 'finished': 1, 'pending': 1}
            self.assertEqual({'test': expected_queue}, queues)
            self.assertEqual(expected_queue, summary)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_a_request_is_repeated_then_it_should_query_just_once(self):
        # The second, identical query must be served from the cache.
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [],
                "running": [],
                "finished": [],
            }

            queues, summary = self.subject.get_queues(['test'])
            self.assertEqual({'test': self.EMPTY_QUEUE}, queues)
            self.assertEqual(self.EMPTY_QUEUE, summary)

            queues, summary = self.subject.get_queues(['test'])
            self.assertEqual({'test': self.EMPTY_QUEUE}, queues)
            self.assertEqual(self.EMPTY_QUEUE, summary)

            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_there_are_no_project_then_it_should_get_an_empty_list(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {"status": "ok", "projects": []}

            projects = self.subject.get_projects()

            self.assertEqual([], projects)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_PROJECTS_URL)

    def test_when_there_are_projects_then_it_should_get_a_list(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "projects": [
                    "proj1",
                    "proj2",
                ],
            }

            projects = self.subject.get_projects()

            self.assertEqual(["proj1", "proj2"], projects)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_PROJECTS_URL)

    def test_when_there_are_no_jobs_then_it_should_get_an_empty_dict(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [],
                "running": [],
                "finished": [],
            }

            jobs = self.subject.get_jobs(['test'])

            self.assertEqual({}, jobs)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_there_are_jobs_then_it_should_return_them(self):
        # Had to remove dates from jobs to make tests reliable.
        # The time conversion that's performed adds a configuration dependent
        # offset and a small, millisecond, error.
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [{
                    "id": "78391cc0fcaf11e1b0090800272a6d06",
                    "project_name": "spider1",
                }],
                "running": [],
                "finished": [{
                    "id": "2f16646cfcaf11e1b0090800272a6d06",
                    "spider": "spider3",
                }],
            }

            jobs = self.subject.get_jobs(['test'])

            # Each job is keyed by its id and tagged with the queue it
            # was found in as its 'status'.
            expected = {
                '2f16646cfcaf11e1b0090800272a6d06': {
                    'id': '2f16646cfcaf11e1b0090800272a6d06',
                    'spider': 'spider3',
                    'status': 'finished',
                },
                '78391cc0fcaf11e1b0090800272a6d06': {
                    'id': '78391cc0fcaf11e1b0090800272a6d06',
                    'project_name': 'spider1',
                    'status': 'pending',
                },
            }
            self.assertEqual(expected, jobs)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_there_are_no_spiders_then_it_should_get_an_empty_list(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {"status": "ok", "spiders": []}

            jobs = self.subject.get_spiders('test')

            self.assertEqual([], jobs)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_SPIDERS_URL)

    def test_when_there_are_spiders_then_it_should_return_them(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "spiders": ["spider1", "spider2", "spider3"],
            }

            jobs = self.subject.get_spiders('test')

            self.assertEqual(["spider1", "spider2", "spider3"], jobs)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_SPIDERS_URL)

    def test_when_scrapyd_is_down_then_it_should_make_no_further_reqs(self):
        with mock.patch(
                'web_runner.scrapyd.requests.get') as mock_requests_get:
            # A non-200 liveness probe must short-circuit all other checks.
            response = mock_requests_get.return_value
            response.status_code = 500

            status = self.subject.get_operational_status()

            self.assertEqual(
                {
                    'scrapyd_alive': False,
                    'scrapyd_operational': False,
                    'scrapyd_projects': None,
                    'spiders': None,
                    'queues': None,
                    'summarized_queue': None,
                },
                status,
            )
            mock_requests_get.assert_called_once_with(self.URL)

    def test_when_scrapyd_fails_then_it_should_not_be_operational(self):
        with mock.patch(
                'web_runner.scrapyd.requests.get') as mock_requests_get:
            # First call (liveness) succeeds; second call (projects) raises.
            alive_response = mock.MagicMock()
            alive_response.status_code = 200
            mock_requests_get.side_effect = [
                alive_response,
                exc.HTTPBadGateway(detail="Test"),
            ]

            status = self.subject.get_operational_status()

            self.assertEqual(
                {
                    'scrapyd_alive': True,
                    'scrapyd_operational': False,
                    'scrapyd_projects': None,
                    'spiders': None,
                    'queues': None,
                    'summarized_queue': None,
                },
                status,
            )
            mock_requests_get.assert_any_call(self.URL)
            mock_requests_get.assert_called_with(
                self.EXPECTED_LIST_PROJECTS_URL)

    def test_when_scrapyd_responds_then_it_should_provide_an_ok_status(self):
        with mock.patch(
                'web_runner.scrapyd.requests.get') as mock_requests_get:
            # Responses are consumed in the order the subject issues its
            # requests: liveness, projects, spiders per project, jobs per
            # project.
            alive_resp = mock.MagicMock()
            alive_resp.status_code = 200

            projects_resp = mock.MagicMock()
            projects_resp.status_code = 200
            projects_resp.json.return_value = {
                'status': 'ok',
                'projects': ['test', 'p2'],
            }

            spiders1_resp = mock.MagicMock()
            spiders1_resp.status_code = 200
            spiders1_resp.json.return_value = {
                'status': 'ok',
                'spiders': ['p1_sp1', 'p1_sp2'],
            }

            spiders2_resp = mock.MagicMock()
            spiders2_resp.status_code = 200
            spiders2_resp.json.return_value = {
                'status': 'ok',
                'spiders': ['p2_sp1', 'p2_sp2'],
            }

            jobs1_resp = mock.MagicMock()
            jobs1_resp.status_code = 200
            jobs1_resp.json.return_value = {
                'status': 'ok',
                "pending": [{
                    "id": "78391cc0fcaf11e1b0090800272a6d06",
                    "project_name": "spider1",
                }],
                "running": [],
                "finished": [{
                    "id": "2f16646cfcaf11e1b0090800272a6d06",
                    "spider": "spider3",
                }],
            }

            jobs2_resp = mock.MagicMock()
            jobs2_resp.status_code = 200
            jobs2_resp.json.return_value = {
                'status': 'ok',
                "pending": [{
                    "id": "XXXX1cc0fcaf11e1b0090800272a6d06",
                    "project_name": "spider10",
                }],
                "finished": [],
                "running": [{
                    "id": "XXXX646cfcaf11e1b0090800272a6d06",
                    "spider": "spider30",
                }],
            }

            mock_requests_get.side_effect = [
                alive_resp,
                projects_resp,
                spiders1_resp,
                spiders2_resp,
                jobs1_resp,
                jobs2_resp,
            ]

            status = self.subject.get_operational_status()

            self.assertEqual(
                {
                    'scrapyd_alive': True,
                    'scrapyd_operational': True,
                    'scrapyd_projects': ['test', 'p2'],
                    'spiders': {
                        'test': ['p1_sp1', 'p1_sp2'],
                        'p2': ['p2_sp1', 'p2_sp2'],
                    },
                    'queues': {
                        'test': {
                            'finished': 1,
                            'pending': 1,
                            'running': 0
                        },
                        'p2': {
                            'finished': 0,
                            'pending': 1,
                            'running': 1
                        },
                    },
                    'summarized_queue': {
                        'finished': 1,
                        'pending': 2,
                        'running': 1,
                    },
                },
                status,
            )
            # More requests than these are actually performed.
            mock_requests_get.assert_any_call(self.URL)
            mock_requests_get.assert_any_call(self.EXPECTED_LIST_SPIDERS_URL)
            mock_requests_get.assert_any_call(self.EXPECTED_LIST_JOBS_URL)
            mock_requests_get.assert_any_call(self.EXPECTED_LIST_PROJECTS_URL)

    def test_when_a_job_is_started_ok_then_we_return_its_id(self):
        with mock.patch('web_runner.scrapyd.requests.post') as mock_post:
            response = mock_post.return_value
            response.json.return_value = {"status": "ok", "jobid": "XXX"}

            job_id = self.subject.schedule_job('project', 'spider', {})

            self.assertEqual('XXX', job_id)
def request_history(request):
    """Return the history of a request.

    The view expects a ``requestid`` in the route match dictionary and
    returns a dictionary with the following keys:

    * request: dictionary with the main request information stored in
      the DB.
    * jobids_info: dictionary keyed by every jobid related to the
      request; each value is a dictionary with that jobid's information.
    * history: list with the history content.
    * status: string with the request's status.

    Example of request:
        {'creation': u'2014-07-30 19:38:53.659982',
         'params': u'{"searchterms_str": "laundry detergent",
            "group_name": "Gabo test1", "site": "walmart",
            "quantity": "100"}',
         'requestid': 252,
         'jobids': (u'236c257c182111e4906150465d4bc079',),
         'remote_ip': u'127.0.0.1',
         'group_name': u'Gabo test1',
         'type': u'command',
         'site': u'walmart',
         'name': u'cat1'}

    Example of jobids_info:
        {u'17ae4f1c182111e4906150465d4bc079': {
            'spider': u'walmart_products',
            'status': 'finished',
            'start_time': u'2014-07-30 16:38:34.218200',
            'end_time': u'2014-07-30 16:40:50.766396',
            'id': u'17ae4f1c182111e4906150465d4bc079'},
         u'236c257c182111e4906150465d4bc079': {
            'spider': u'walmart_products',
            'status': 'finished',
            'start_time': '2014-07-30 16:38:54.116999',
            'end_time': u'2014-07-30 16:41:06.851201',
            'id': u'236c257c182111e4906150465d4bc079'}}

    Example of history:
        [["2014-07-30 21:13:02.829964", "1 hour",
          "Request arrived from 127.0.0.1."],
         ["2014-07-30 21:16:02.829964", "1 hour", "Request Finished"]]
    """
    settings = request.registry.settings

    try:
        requestid = int(request.matchdict['requestid'])
    except ValueError:
        # NOTE(review): arguably an HTTPBadRequest; kept to preserve
        # existing behavior.
        raise exc.HTTPBadGateway(detail="Request id is not valid")

    # Pull the stored request and its operations from the local DB.
    dbinterf = web_runner.db.DbInterface(
        settings['db_filename'], recreate=False)
    request_info = dbinterf.get_request(requestid)
    operations_info = dbinterf.get_req_operations(requestid)
    dbinterf.close()

    if not request_info:
        # The requestid is not recognized.
        raise exc.HTTPBadGateway(detail="No info from Request id")

    # Map every job known to Scrapyd to its status.
    scrapyd_interf = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])
    jobids_status = scrapyd_interf.get_jobs()

    # Keep only the jobs belonging to this request. If any of them is
    # unknown to Scrapyd, treat the whole set as unavailable.
    jobids_info = {}
    try:
        for jobid in request_info['jobids']:
            jobids_info[jobid] = jobids_status[jobid]
    except KeyError:
        jobids_info = None

    if jobids_info:
        history = _get_history(
            requestid, request_info, jobids_info, operations_info)
        status = get_request_status(request_info, jobids_status)
    else:
        history = None
        status = UNAVAILABLE

    return {
        'request': request_info,
        'jobids_info': jobids_info,
        'history': history,
        'status': status,
    }