def last_request_status(request):
    """Return the status of the most recent requests.

    The view accepts an optional parameter ``size``, the maximum number
    of items returned.
    """
    settings = request.registry.settings

    default_size = 10
    size_str = request.params.get('size', default_size)
    try:
        size = int(size_str)
    except ValueError:
        raise exc.HTTPBadRequest(detail="The size parameter has an"
                                        " incorrect value.")

    # Get the last requests from the internal DB.
    dbinterf = web_runner.db.DbInterface(
        settings['db_filename'], recreate=False)
    reqs = dbinterf.get_last_requests(size)
    dbinterf.close()

    # Get the job ID status dictionary.
    scrapyd_baseurl = settings[SCRAPYD_BASE_URL_KEY]
    scrapyd_interf = Scrapyd(scrapyd_baseurl)
    jobids_status = scrapyd_interf.get_jobs()

    # For each request, determine its status by gathering the
    # information from all job IDs related to it.
    for req in reqs:
        req['status'] = get_request_status(req, jobids_status)

    return reqs
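# `get_request_status` is defined elsewhere in this package. As a rough,
# hypothetical sketch of the aggregation this view relies on (not the
# actual implementation), a request's status could be derived from the
# statuses of its jobs like this:
#
#     def get_request_status(request_info, jobids_status):
#         statuses = [jobids_status.get(jobid, {}).get('status')
#                     for jobid in request_info['jobids']]
#         if any(s is None for s in statuses):
#             return UNAVAILABLE
#         if all(s == 'finished' for s in statuses):
#             return 'finished'
#         return 'running' if 'running' in statuses else 'pending'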
def status(request):
    """Check the Web Runner and Scrapyd status."""
    settings = request.registry.settings
    scrapyd_baseurl = settings[SCRAPYD_BASE_URL_KEY]
    scrapyd_interf = Scrapyd(scrapyd_baseurl)

    output = scrapyd_interf.get_operational_status()

    if request.params:
        items = [x.split(':', 1) for x in request.params.getall('return')]
        output = dict_filter(output, items)

    if 'application/json' in request.accept:
        pass  # JSON is the default renderer.
    elif 'text/plain' in request.accept:
        request.override_renderer = 'string'
        if len(output) != 1:
            # Only a single scalar value can be rendered as plain text.
            raise exc.exception_response(406)
        output = list(output.values())[0]
        if not isinstance(output, (numbers.Number, str)):
            raise exc.exception_response(406)
    else:
        raise exc.exception_response(406)

    return output
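# `dict_filter` is a helper defined elsewhere in this package. Assuming
# each `return` parameter names a top-level key, optionally followed by
# a colon-separated subkey (e.g. `?return=summarized_queue:pending`), a
# minimal sketch of the filtering could look like this (hypothetical,
# not the actual implementation):
#
#     def dict_filter(data, items):
#         result = {}
#         for item in items:
#             if len(item) == 1:
#                 result[item[0]] = data[item[0]]
#             else:
#                 result[':'.join(item)] = data[item[0]][item[1]]
#         return result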
def spider_start_view(request):
    """Start a job in Scrapyd and redirect to the "spider pending jobs"
    view."""
    settings = request.registry.settings

    cfg_template = find_spider_config_from_path(settings, request.path)
    cfg = render_spider_config(cfg_template, request.params)

    scrapyd = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])
    try:
        jobid = ScrapydJobHelper(settings, cfg, scrapyd).start_job(
            request.params)
        location = request.route_path(
            "spider pending jobs",
            project=cfg.project_name,
            spider=cfg.spider_name,
            jobid=jobid,
        )

        # Store the request in the internal DB.
        dbinterf = web_runner.db.DbInterface(
            settings['db_filename'], recreate=False)
        dbinterf.new_spider(cfg.spider_name, dict(request.params), jobid,
                            request.remote_addr, location)
        dbinterf.close()

        raise exc.HTTPFound(location=location,
                            detail="Job '%s' started." % jobid)
    except ScrapydJobStartError as e:
        raise exc.HTTPBadGateway(
            "Scrapyd error when starting job. Status '{}': {}".format(
                e.status, e.message))
    except ScrapydJobException as e:
        raise exc.HTTPBadGateway(
            "When contacting Scrapyd there was an unexpected error:"
            " {}".format(e.message))
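# A typical client interaction with the spider views (the URLs shown are
# illustrative; the actual paths come from the route configuration):
#
#     POST <spider start URL>    -> 302 Found, Location: pending view
#     GET  <pending view URL>    -> 202 Accepted while the job runs,
#                                   then 302 Found to the results view
#     GET  <results view URL>    -> the job's crawled data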
def command_result(request):
    """Report the result of a command job."""
    name = request.matchdict['name']
    encoded_job_ids = request.matchdict['jobid']
    try:
        job_ids = decode_ids(encoded_job_ids)
    except TypeError:
        # Malformed job ID.
        raise exc.HTTPBadRequest("The job ID is invalid.")

    settings = request.registry.settings
    cfg_template = find_command_config_from_name(settings, name)

    spider_cfgs = starmap(render_spider_config, zip(
        cfg_template.spider_configs,
        cfg_template.spider_params,
        repeat(request.params),
    ))

    # Store the request in the internal DB.
    dbinterf = web_runner.db.DbInterface(
        settings['db_filename'], recreate=False)
    dbinterf.new_request_event(web_runner.db.COMMAND_RESULT, job_ids,
                               request.remote_addr)
    dbinterf.close()

    scrapyd = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])

    # Make each job's output file available to the command template as
    # a 'spider N' parameter.
    args = dict(request.params)
    for i, (job_id, spider_cfg) in enumerate(zip(job_ids, spider_cfgs)):
        fn = ScrapydJobHelper(
            settings, spider_cfg, scrapyd).retrieve_job_data_fn(job_id)
        args['spider %d' % i] = fn

    cmd_line = cfg_template.cmd.format(**args)
    LOG.info("Starting command: %s", cmd_line)
    process = subprocess.Popen(
        cmd_line,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,
    )
    LOG.info("Waiting until connection timeout for the command to finish...")
    stdout, stderr = process.communicate()
    LOG.info("Process finished.")

    if process.returncode != 0:
        msg = "The command terminated with a return value of %s." \
              " Process' standard error: %s" % (process.returncode, stderr)
        LOG.warning(msg)
        raise exc.HTTPBadGateway(detail=msg)

    LOG.info("Command generated %s bytes.", len(stdout))

    request.response.content_type = cfg_template.content_type
    request.response.body = stdout
    return request.response
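# `encode_ids`/`decode_ids` (defined elsewhere in this package) pack the
# several Scrapyd job IDs of a command into the single `jobid` URL
# segment. The exact encoding is not shown here; a hypothetical sketch
# that matches the TypeError contract used above could be:
#
#     def encode_ids(job_ids):
#         return base64.urlsafe_b64encode(json.dumps(list(job_ids)))
#
#     def decode_ids(encoded):
#         try:
#             return json.loads(base64.urlsafe_b64decode(str(encoded)))
#         except (TypeError, ValueError):
#             raise TypeError("The job ID string is malformed.")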
def command_pending(request):
    """Report on the status of a running command job."""
    name = request.matchdict['name']
    encoded_job_ids = request.matchdict['jobid']
    try:
        job_ids = decode_ids(encoded_job_ids)
    except TypeError:
        # Malformed job ID.
        raise exc.HTTPBadRequest("The job ID is invalid.")

    settings = request.registry.settings
    cfg_template = find_command_config_from_name(settings, name)

    spider_cfgs = starmap(render_spider_config, zip(
        cfg_template.spider_configs,
        cfg_template.spider_params,
        repeat(request.params),
    ))

    scrapyd = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])

    running = 0
    for job_id, spider_cfg in zip(job_ids, spider_cfgs):
        scrapyd_helper = ScrapydJobHelper(settings, spider_cfg, scrapyd)
        status = scrapyd_helper.report_on_job(job_id)

        if status is ScrapydJobHelper.JobStatus.unknown:
            msg = "Job for spider '{}' with id '{}' has an unknown status." \
                  " Aborting command run.".format(
                      spider_cfg.spider_name, job_id)
            LOG.error(msg)
            raise exc.HTTPNotFound(msg)

        if status is not ScrapydJobHelper.JobStatus.finished:
            running += 1

    # Store the request in the internal DB.
    dbinterf = web_runner.db.DbInterface(
        settings['db_filename'], recreate=False)
    dbinterf.new_request_event(web_runner.db.COMMAND_STATUS, job_ids,
                               request.remote_addr)
    dbinterf.close()

    if running:
        raise exc.HTTPAccepted(detail="Crawlers still running: %d" % running)

    raise exc.HTTPFound(
        location=request.route_path(
            "command job results",
            name=name,
            jobid=encoded_job_ids,
            _query=request.params,
        ),
        detail="Crawlers finished.")
def spider_pending_view(request):
    """Report on the status of a spider job, redirecting to the results
    view when the job is finished."""
    project_name = request.matchdict['project']
    spider_name = request.matchdict['spider']
    job_id = request.matchdict['jobid']

    settings = request.registry.settings
    scrapyd = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])
    status = ScrapydJobHelper(
        settings, SpiderConfig(spider_name, project_name),
        scrapyd).report_on_job(job_id)

    # Store the request in the internal DB.
    dbinterf = web_runner.db.DbInterface(
        settings['db_filename'], recreate=False)
    dbinterf.new_request_event(web_runner.db.SPIDER_STATUS, (job_id,),
                               request.remote_addr)
    dbinterf.close()

    if status is ScrapydJobHelper.JobStatus.finished:
        raise exc.HTTPFound(
            location=request.route_path(
                "spider job results",
                project=project_name,
                spider=spider_name,
                jobid=job_id,
            ),
            detail="Job finished.")

    if status is ScrapydJobHelper.JobStatus.unknown:
        msg = "Job for spider '{}/{}' with id '{}' has an unknown" \
              " status. Aborting.".format(project_name, spider_name, job_id)
        LOG.error(msg)
        raise exc.HTTPNotFound(msg)

    state = "Job state unknown."
    if status is ScrapydJobHelper.JobStatus.pending:
        state = "Job still waiting to run."
    elif status is ScrapydJobHelper.JobStatus.running:
        state = "Job running."
    raise exc.HTTPAccepted(detail=state)
def spider_results_view(request):
    """Return the data produced by a spider job."""
    settings = request.registry.settings
    project_name = request.matchdict['project']
    spider_name = request.matchdict['spider']
    job_id = request.matchdict['jobid']

    # Store the request in the internal DB.
    dbinterf = web_runner.db.DbInterface(
        settings['db_filename'], recreate=False)
    dbinterf.new_request_event(web_runner.db.SPIDER_RESULT, (job_id,),
                               request.remote_addr)
    dbinterf.close()

    scrapyd = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])
    try:
        data_stream = ScrapydJobHelper(
            settings, SpiderConfig(spider_name, project_name),
            scrapyd).retrieve_job_data(job_id)
        request.response.body_file = data_stream
        return request.response
    except ScrapydJobException as e:
        raise exc.HTTPBadGateway(
            detail="The content could not be retrieved: %s" % e)
class ScrapydTest(unittest.TestCase):

    maxDiff = None

    URL = 'http://example.com'
    EXPECTED_LIST_JOBS_URL = URL + '/listjobs.json?project=test'
    EXPECTED_LIST_PROJECTS_URL = URL + '/listprojects.json'
    EXPECTED_LIST_SPIDERS_URL = URL + '/listspiders.json?project=test'

    EMPTY_QUEUE = {'running': 0, 'finished': 0, 'pending': 0}

    def setUp(self):
        # Always clear the cache so that tests are independent.
        Scrapyd._CACHE.clear()
        self.subject = Scrapyd(ScrapydTest.URL)

    def test_when_status_is_not_ok_then_it_should_report_an_error(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {"status": "ERROR", "message": "Test"}

            self.assertRaises(
                exc.HTTPBadGateway, self.subject.get_queues, ['test'])

            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_queues_are_empty_then_it_should_return_empty_queues(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [],
                "running": [],
                "finished": [],
            }

            queues, summary = self.subject.get_queues(['test'])

            self.assertEqual({'test': self.EMPTY_QUEUE}, queues)
            self.assertEqual(self.EMPTY_QUEUE, summary)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_queues_have_jobs_then_it_should_return_their_state(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [{
                    "id": "78391cc0fcaf11e1b0090800272a6d06",
                    "project_name": "spider1",
                }],
                "running": [],
                "finished": [{
                    "id": "2f16646cfcaf11e1b0090800272a6d06",
                    "spider": "spider3",
                    "start_time": "2012-09-12 10:14:03.594664",
                    "end_time": "2012-09-12 10:24:03.594664",
                }],
            }

            queues, summary = self.subject.get_queues(['test'])

            expected_queue = {'running': 0, 'finished': 1, 'pending': 1}
            self.assertEqual({'test': expected_queue}, queues)
            self.assertEqual(expected_queue, summary)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_a_request_is_repeated_then_it_should_query_just_once(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [],
                "running": [],
                "finished": [],
            }

            queues, summary = self.subject.get_queues(['test'])
            self.assertEqual({'test': self.EMPTY_QUEUE}, queues)
            self.assertEqual(self.EMPTY_QUEUE, summary)

            # The second, identical request should be served from the cache.
            queues, summary = self.subject.get_queues(['test'])
            self.assertEqual({'test': self.EMPTY_QUEUE}, queues)
            self.assertEqual(self.EMPTY_QUEUE, summary)

            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_there_are_no_projects_then_it_should_get_an_empty_list(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {"status": "ok", "projects": []}

            projects = self.subject.get_projects()

            self.assertEqual([], projects)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_PROJECTS_URL)

    def test_when_there_are_projects_then_it_should_get_a_list(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "projects": ["proj1", "proj2"],
            }

            projects = self.subject.get_projects()

            self.assertEqual(["proj1", "proj2"], projects)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_PROJECTS_URL)

    def test_when_there_are_no_jobs_then_it_should_get_an_empty_dict(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [],
                "running": [],
                "finished": [],
            }

            jobs = self.subject.get_jobs(['test'])

            self.assertEqual({}, jobs)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_there_are_jobs_then_it_should_return_them(self):
        # Dates had to be removed from the jobs to make the test
        # reliable: the time conversion that is performed adds a
        # configuration-dependent offset and a small, millisecond-level
        # error.
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "pending": [{
                    "id": "78391cc0fcaf11e1b0090800272a6d06",
                    "project_name": "spider1",
                }],
                "running": [],
                "finished": [{
                    "id": "2f16646cfcaf11e1b0090800272a6d06",
                    "spider": "spider3",
                }],
            }

            jobs = self.subject.get_jobs(['test'])

            expected = {
                '2f16646cfcaf11e1b0090800272a6d06': {
                    'id': '2f16646cfcaf11e1b0090800272a6d06',
                    'spider': 'spider3',
                    'status': 'finished',
                },
                '78391cc0fcaf11e1b0090800272a6d06': {
                    'id': '78391cc0fcaf11e1b0090800272a6d06',
                    'project_name': 'spider1',
                    'status': 'pending',
                },
            }
            self.assertEqual(expected, jobs)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_JOBS_URL)

    def test_when_there_are_no_spiders_then_it_should_get_an_empty_list(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {"status": "ok", "spiders": []}

            spiders = self.subject.get_spiders('test')

            self.assertEqual([], spiders)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_SPIDERS_URL)

    def test_when_there_are_spiders_then_it_should_return_them(self):
        with mock.patch('web_runner.scrapyd.requests') as mock_requests:
            response = mock_requests.get.return_value
            response.json.return_value = {
                "status": "ok",
                "spiders": ["spider1", "spider2", "spider3"],
            }

            spiders = self.subject.get_spiders('test')

            self.assertEqual(["spider1", "spider2", "spider3"], spiders)
            mock_requests.get.assert_called_once_with(
                self.EXPECTED_LIST_SPIDERS_URL)

    def test_when_scrapyd_is_down_then_it_should_make_no_further_reqs(self):
        with mock.patch(
                'web_runner.scrapyd.requests.get') as mock_requests_get:
            response = mock_requests_get.return_value
            response.status_code = 500

            status = self.subject.get_operational_status()

            self.assertEqual(
                {
                    'scrapyd_alive': False,
                    'scrapyd_operational': False,
                    'scrapyd_projects': None,
                    'spiders': None,
                    'queues': None,
                    'summarized_queue': None,
                },
                status,
            )
            mock_requests_get.assert_called_once_with(self.URL)

    def test_when_scrapyd_fails_then_it_should_not_be_operational(self):
        with mock.patch(
                'web_runner.scrapyd.requests.get') as mock_requests_get:
            alive_response = mock.MagicMock()
            alive_response.status_code = 200
            mock_requests_get.side_effect = [
                alive_response,
                exc.HTTPBadGateway(detail="Test"),
            ]

            status = self.subject.get_operational_status()

            self.assertEqual(
                {
                    'scrapyd_alive': True,
                    'scrapyd_operational': False,
                    'scrapyd_projects': None,
                    'spiders': None,
                    'queues': None,
                    'summarized_queue': None,
                },
                status,
            )
            mock_requests_get.assert_any_call(self.URL)
            mock_requests_get.assert_called_with(
                self.EXPECTED_LIST_PROJECTS_URL)

    def test_when_scrapyd_responds_then_it_should_provide_an_ok_status(self):
        with mock.patch(
                'web_runner.scrapyd.requests.get') as mock_requests_get:
            alive_resp = mock.MagicMock()
            alive_resp.status_code = 200

            projects_resp = mock.MagicMock()
            projects_resp.status_code = 200
            projects_resp.json.return_value = {
                'status': 'ok',
                'projects': ['test', 'p2'],
            }

            spiders1_resp = mock.MagicMock()
            spiders1_resp.status_code = 200
            spiders1_resp.json.return_value = {
                'status': 'ok',
                'spiders': ['p1_sp1', 'p1_sp2'],
            }

            spiders2_resp = mock.MagicMock()
            spiders2_resp.status_code = 200
            spiders2_resp.json.return_value = {
                'status': 'ok',
                'spiders': ['p2_sp1', 'p2_sp2'],
            }

            jobs1_resp = mock.MagicMock()
            jobs1_resp.status_code = 200
            jobs1_resp.json.return_value = {
                'status': 'ok',
                "pending": [{
                    "id": "78391cc0fcaf11e1b0090800272a6d06",
                    "project_name": "spider1",
                }],
                "running": [],
                "finished": [{
                    "id": "2f16646cfcaf11e1b0090800272a6d06",
                    "spider": "spider3",
                }],
            }

            jobs2_resp = mock.MagicMock()
            jobs2_resp.status_code = 200
            jobs2_resp.json.return_value = {
                'status': 'ok',
                "pending": [{
                    "id": "XXXX1cc0fcaf11e1b0090800272a6d06",
                    "project_name": "spider10",
                }],
                "finished": [],
                "running": [{
                    "id": "XXXX646cfcaf11e1b0090800272a6d06",
                    "spider": "spider30",
                }],
            }

            mock_requests_get.side_effect = [
                alive_resp,
                projects_resp,
                spiders1_resp,
                spiders2_resp,
                jobs1_resp,
                jobs2_resp,
            ]

            status = self.subject.get_operational_status()

            self.assertEqual(
                {
                    'scrapyd_alive': True,
                    'scrapyd_operational': True,
                    'scrapyd_projects': ['test', 'p2'],
                    'spiders': {
                        'test': ['p1_sp1', 'p1_sp2'],
                        'p2': ['p2_sp1', 'p2_sp2'],
                    },
                    'queues': {
                        'test': {'finished': 1, 'pending': 1, 'running': 0},
                        'p2': {'finished': 0, 'pending': 1, 'running': 1},
                    },
                    'summarized_queue': {
                        'finished': 1,
                        'pending': 2,
                        'running': 1,
                    },
                },
                status,
            )
            # More requests than these are actually performed.
            mock_requests_get.assert_any_call(self.URL)
            mock_requests_get.assert_any_call(self.EXPECTED_LIST_SPIDERS_URL)
            mock_requests_get.assert_any_call(self.EXPECTED_LIST_JOBS_URL)
            mock_requests_get.assert_any_call(
                self.EXPECTED_LIST_PROJECTS_URL)

    def test_when_a_job_is_started_ok_then_we_return_its_id(self):
        with mock.patch('web_runner.scrapyd.requests.post') as mock_post:
            response = mock_post.return_value
            response.json.return_value = {"status": "ok", "jobid": "XXX"}

            job_id = self.subject.schedule_job('project', 'spider', {})

            self.assertEqual('XXX', job_id)
def request_history(request):
    """Return the history of a request.

    The view expects to receive a requestid. It returns a dictionary
    with the following keys:
    * request: dictionary with the main request information stored in
      the DB.
    * jobids_info: dictionary whose keys are all the job IDs related to
      the requestid. Each value is a dictionary with that job's
      information.
    * history: list with the history content.
    * status: string with the requestid status.

    Example of request:
      {'creation': u'2014-07-30 19:38:53.659982',
       'params': u'{"searchterms_str": "laundry detergent",
                    "group_name": "Gabo test1", "site": "walmart",
                    "quantity": "100"}',
       'requestid': 252,
       'jobids': (u'236c257c182111e4906150465d4bc079',),
       'remote_ip': u'127.0.0.1',
       'group_name': u'Gabo test1',
       'type': u'command',
       'site': u'walmart',
       'name': u'cat1'}

    Example of jobids_info:
      {u'17ae4f1c182111e4906150465d4bc079': {
          'spider': u'walmart_products',
          'status': 'finished',
          'start_time': u'2014-07-30 16:38:34.218200',
          'end_time': u'2014-07-30 16:40:50.766396',
          'id': u'17ae4f1c182111e4906150465d4bc079'},
       u'236c257c182111e4906150465d4bc079': {
          'spider': u'walmart_products',
          'status': 'finished',
          'start_time': u'2014-07-30 16:38:54.116999',
          'end_time': u'2014-07-30 16:41:06.851201',
          'id': u'236c257c182111e4906150465d4bc079'}}

    Example of history:
      [["2014-07-30 21:13:02.829964", "1 hour",
        "Request arrived from 127.0.0.1."],
       ["2014-07-30 21:16:02.829964", "1 hour", "Request Finished"]]
    """
    settings = request.registry.settings

    try:
        requestid = int(request.matchdict['requestid'])
    except ValueError:
        raise exc.HTTPBadRequest(detail="The request ID is not valid.")

    # Get the request info from the internal DB.
    dbinterf = web_runner.db.DbInterface(
        settings['db_filename'], recreate=False)
    request_info = dbinterf.get_request(requestid)
    operations_info = dbinterf.get_req_operations(requestid)
    dbinterf.close()

    if not request_info:
        # The requestid is not recognized.
        raise exc.HTTPNotFound(detail="There is no info for that request ID.")

    # Get the job ID status dictionary.
    scrapyd_baseurl = settings[SCRAPYD_BASE_URL_KEY]
    scrapyd_interf = Scrapyd(scrapyd_baseurl)
    jobids_status = scrapyd_interf.get_jobs()

    try:
        # Keep only the job IDs belonging to the current request.
        jobids_info = {
            jobid: jobids_status[jobid]
            for jobid in request_info['jobids']
        }
    except KeyError:
        jobids_info = None

    if jobids_info:
        history = _get_history(
            requestid, request_info, jobids_info, operations_info)
        status = get_request_status(request_info, jobids_status)
    else:
        history = None
        status = UNAVAILABLE

    return {
        'request': request_info,
        'jobids_info': jobids_info,
        'history': history,
        'status': status,
    }
def command_start_view(request):
    """Schedule running a command plus its spiders."""
    settings = request.registry.settings

    cfg_template = find_command_config_from_path(settings, request.path)

    spider_cfgs = starmap(render_spider_config, zip(
        cfg_template.spider_configs,
        cfg_template.spider_params,
        repeat(request.params),
    ))

    scrapyd = Scrapyd(settings[SCRAPYD_BASE_URL_KEY])

    spider_job_ids = []
    try:
        for spider_cfg, spider_params in zip(
                spider_cfgs, cfg_template.spider_params):
            # The request's parameters override the configured defaults.
            all_params = dict(spider_params)
            all_params.update(request.params)

            jobid = ScrapydJobHelper(
                settings, spider_cfg, scrapyd).start_job(all_params)
            spider_job_ids.append(jobid)
            LOG.info("For command at '%s', started crawl job with id '%s'.",
                     cfg_template.name, jobid)
    except ScrapydJobStartError as e:
        raise exc.HTTPBadGateway(
            "Failed to start a required crawl for command '{}'."
            " Scrapyd was not OK, it was '{}': {}".format(
                cfg_template.name, e.status, e.message))
    except ScrapydJobException as e:
        raise exc.HTTPBadGateway(
            "For command {}, unexpected error when contacting Scrapyd:"
            " {}".format(cfg_template.name, e.message))

    command_name = request.path.strip('/')
    location = request.route_path(
        "command pending jobs",
        name=cfg_template.name,
        jobid=encode_ids(spider_job_ids),
        _query=request.params,
    )

    # Store the request in the internal DB.
    dbinterf = web_runner.db.DbInterface(
        settings['db_filename'], recreate=False)
    dbinterf.new_command(command_name, dict(request.params), spider_job_ids,
                         request.remote_addr, id=location)
    dbinterf.close()

    raise exc.HTTPFound(
        location=location,
        detail="Command '{}' started with {} crawls.".format(
            cfg_template.name, len(spider_job_ids)))