Example #1
    def test_delete_requests_are_retried(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }
        callback_getpost, attempts_count_getpost = self.make_request_callback(
            0, job_metadata)
        callback_delete, attempts_count_delete = self.make_request_callback(
            2, job_metadata)

        self.mock_api(method=GET, callback=callback_getpost)
        self.mock_api(method=POST, callback=callback_getpost)
        self.mock_api(method=DELETE, callback=callback_delete)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
        job.metadata['foo'] = 'bar'
        del job.metadata['foo']
        job.metadata.save()

        # Assert
        self.assertEqual(attempts_count_delete[0], 3)
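
The retry tests in this listing rely on two helpers, mock_api and make_request_callback, that are not part of the snippets. Below is a minimal sketch of what they might look like, assuming the responses HTTP-mocking library (the (status, headers, body) return tuples and the GET/POST/DELETE constants match its callback API); the class name, the test endpoint value, the default 504 error status, and the attempts_count list are inferred from how the tests use them, not taken from the original source.

import json
import re

import responses
from responses import GET  # POST and DELETE are used the same way


class RetryTestHelpers(object):
    """Hypothetical mixin sketching the helpers used by the retry tests."""

    endpoint = 'http://localhost:8003'  # assumed test endpoint

    def mock_api(self, method=GET, callback=None, url_match='/.*'):
        # Register the callback for every URL under the test endpoint.
        responses.add_callback(
            method,
            re.compile(self.endpoint + url_match),
            callback=callback,
            content_type='application/json',
        )

    def make_request_callback(self, timeout_count, body_on_success,
                              http_error_status=504):
        # A list gives the closure a mutable attempt counter that the
        # test can inspect afterwards via attempts_count[0].
        attempts_count = [0]

        def request_callback(request):
            attempts_count[0] += 1
            if attempts_count[0] <= timeout_count:
                # Fail the first `timeout_count` attempts with an error
                # status the client is expected to retry (504 by default).
                return (http_error_status, {}, u'')
            return (200, {}, json.dumps(dict(body_on_success)))

        return request_callback, attempts_count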
Example #2
    def test_metadata_save_does_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3,
                                  max_retry_time=1)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }
        callback_get, attempts_count_get = self.make_request_callback(
            0, job_metadata)
        callback_post, attempts_count_post = self.make_request_callback(
            2, job_metadata)

        self.mock_api(method=GET, callback=callback_get)
        self.mock_api(method=POST, callback=callback_post)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
        job.metadata['foo'] = 'bar'
        job.metadata.save()

        # Assert
        self.assertEqual(attempts_count_post[0], 3)
Example #3
import numpy as np
import pandas as pd
from hubstorage import HubstorageClient


def get_scraped_data(dir, items_job, key, spider):
    # establish a connection with Scrapinghub and get an items generator
    hc = HubstorageClient(auth=key)

    empty, totalItems, keptItems = 0, 0, 0
    for job in hc.get_project(items_job).jobq.list(spider=spider):
        for item in hc.get_job(job['key']).items.list():

            totalItems += 1
            item = pd.Series(item)
            if item['title'] != '' and item['article'] != '' and \
                    item['title'] != ' ' and item['article'] != ' ':
                item['spider'] = spider
                item = item.drop('category')
                item = item.replace(["page1", "page2", "page3", "scrape_time", "", "basic"],
                                    [np.nan, np.nan, np.nan, np.nan, np.nan, "reutersbasic"])
                # strip HTML tags and bracketed/parenthesised fragments
                item = item.replace({r'<.*?>': '', r'\[.*?\]': '', r'\(.*?\)': ''}, regex=True)

                # add the article hash code as the id of the article
                item['id'] = hash(item['article'])

                # write the item (as records) to a JSON file
                filepath = dir + 'raw/' + str(item['id']) + '.json'
                item.to_json(filepath)

                keptItems += 1

            else:
                empty += 1

    print '#' * 50
    print 'Fetched: ', totalItems, ' from spider: ', spider
    print keptItems, ' were written to the folder, ', empty, ' were empty'
    print '-' * 50, '\n\n'
Example #4
    def test_retrier_does_not_catch_unwanted_exception(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=2,
                                  max_retry_time=1)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }
        callback, attempts_count = self.make_request_callback(
            3, job_metadata, http_error_status=403)

        self.mock_api(callback=callback)

        # Act
        job, metadata, err = None, None, None
        try:
            job = client.get_job('%s/%s/%s' %
                                 (self.projectid, self.spiderid, 42))
            metadata = dict(job.metadata)
        except HTTPError as e:
            err = e

        # Assert
        self.assertIsNone(metadata)
        self.assertIsNotNone(err)
        self.assertEqual(err.response.status_code, 403)
        self.assertEqual(attempts_count[0], 1)
Example #5
def get_job(job):
    jobid, apikey = get_job_specs(job)
    hsc = HubstorageClient(auth=apikey)
    job = hsc.get_job(jobid)
    if not job.metadata:
        raise NotFoundException('Job {} does not exist'.format(jobid))
    return job
Example #6
    def test_retrier_catches_badstatusline_and_429(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}

        attempts_count = [0]  # use a list for nonlocal mutability used in request_callback

        def request_callback(request):
            attempts_count[0] += 1

            if attempts_count[0] <= 2:
                raise ConnectionError("Connection aborted.", BadStatusLine("''"))
            if attempts_count[0] == 3:
                return (429, {}, {})
            else:
                resp_body = dict(job_metadata)
                return (200, {}, json.dumps(resp_body))

        self.mock_api(callback=request_callback)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        # Assert
        self.assertEqual(dict(job_metadata), dict(job.metadata))
        self.assertEqual(attempts_count[0], 4)
Example #7
def get_job(job):
    jobid, apikey = get_job_specs(job)
    hsc = HubstorageClient(auth=apikey)
    job = hsc.get_job(jobid)
    if not job.metadata:
        raise NotFoundException("Job {} does not exist".format(jobid))
    return job
Example #8
    def test_get_job_does_fails_if_no_retries(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=0)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }
        callback, attempts_count = self.make_request_callback(2, job_metadata)

        self.mock_api(callback=callback)

        # Act
        job, metadata, err = None, None, None
        try:
            job = client.get_job('%s/%s/%s' %
                                 (self.projectid, self.spiderid, 42))
            metadata = dict(job.metadata)
        except HTTPError as e:
            err = e

        # Assert
        self.assertIsNone(metadata)
        self.assertIsNotNone(err)
        self.assertEqual(err.response.status_code, 504)
        self.assertEqual(attempts_count[0], 1)
Example #9
    def test_api_delete_can_be_set_to_non_idempotent(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3,
                                  max_retry_time=1)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }
        callback_delete, attempts_count_delete = self.make_request_callback(
            2, job_metadata)

        self.mock_api(method=DELETE, callback=callback_delete)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        err = None
        try:
            job.metadata.apidelete('/my/non/idempotent/delete/',
                                   is_idempotent=False)
        except HTTPError as e:
            err = e

        # Assert
        self.assertEqual(attempts_count_delete[0], 1)
        self.assertIsNotNone(err)
Example #10
    def test_retrier_catches_badstatusline_and_429(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }

        attempts_count = [
            0
        ]  # use a list for nonlocal mutability used in request_callback

        def request_callback(request):
            attempts_count[0] += 1

            if attempts_count[0] <= 2:
                raise ConnectionError("Connection aborted.",
                                      BadStatusLine("''"))
            if attempts_count[0] == 3:
                return (429, {}, {})
            else:
                resp_body = dict(job_metadata)
                return (200, {}, json.dumps(resp_body))

        self.mock_api(callback=request_callback)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        # Assert
        self.assertEqual(dict(job_metadata), dict(job.metadata))
        self.assertEqual(attempts_count[0], 4)
Example #11
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    monkeypatch.setattr("hubstorage.resourcetype.MSGPACK_AVAILABLE", msgpack_available)
    hsclient = HubstorageClient()
    job = hsclient.get_job("2222000/1/1")
    for resource in [job.items, job.logs, job.samples]:
        assert resource._allows_mpack(path) is (msgpack_available and expected_result)
    assert job.requests._allows_mpack(path) is False
    assert job.metadata._allows_mpack(path) is False
    assert job.jobq._allows_mpack(path) is False
Example #12
    def test_auth(self):
        # client without global auth set
        hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
        self.assertEqual(hsc.auth, None)

        # check no-auth access
        try:
            hsc.push_job(self.projectid, self.spidername)
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).push_job(self.spidername)
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_job((self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).get_job(
                (self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        # create project with auth
        auth = self.hsclient.auth
        project = hsc.get_project(self.projectid, auth)
        self.assertEqual(project.auth, auth)
        job = project.push_job(self.spidername)
        samejob = project.get_job(job.key)
        self.assertEqual(samejob.key, job.key)
Example #13
    def test_auth(self):
        # client without global auth set
        hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
        self.assertEqual(hsc.auth, None)

        # check no-auth access
        try:
            hsc.push_job(self.projectid, self.spidername)
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).push_job(self.spidername)
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_job((self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).get_job((self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        # create project with auth
        auth = self.hsclient.auth
        project = hsc.get_project(self.projectid, auth)
        self.assertEqual(project.auth, auth)
        job = project.push_job(self.spidername)
        samejob = project.get_job(job.key)
        self.assertEqual(samejob.key, job.key)
Example #14
def fetch_and_save_items():
    hc = HubstorageClient(auth=API_KEY)
    project = hc.get_project(SH_PROJECT)
    for spider in SPIDERS:
        print("\nworking on spider {}".format(spider['spider_name']))
        spider_id = project.ids.spider(spider['spider_name'])
        summary = project.spiders.lastjobsummary(spiderid=spider_id)
        for element in summary:
            print(element['key'])
            job = hc.get_job(element['key'])
            items = job.items.iter_values()
            save_items(items, spider['institution_name'])
Example #15
    def test_get_job_does_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback, attempts_count = self.make_request_callback(2, job_metadata)

        self.mock_api(callback=callback)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        # Assert
        self.assertEqual(dict(job_metadata), dict(job.metadata))
        self.assertEqual(attempts_count[0], 3)
Example #16
    def test_metadata_save_does_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback_get, attempts_count_get = self.make_request_callback(0, job_metadata)
        callback_post, attempts_count_post = self.make_request_callback(2, job_metadata)

        self.mock_api(method=GET, callback=callback_get)
        self.mock_api(method=POST, callback=callback_post)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
        job.metadata['foo'] = 'bar'
        job.metadata.save()

        # Assert
        self.assertEqual(attempts_count_post[0], 3)
Example #17
    def test_get_job_does_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }
        callback, attempts_count = self.make_request_callback(2, job_metadata)

        self.mock_api(callback=callback)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        # Assert
        self.assertEqual(dict(job_metadata), dict(job.metadata))
        self.assertEqual(attempts_count[0], 3)
Example #18
    def test_api_delete_can_be_set_to_non_idempotent(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback_delete, attempts_count_delete = self.make_request_callback(2, job_metadata)

        self.mock_api(method=DELETE, callback=callback_delete)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        err = None
        try:
            job.metadata.apidelete('/my/non/idempotent/delete/', is_idempotent=False)
        except HTTPError as e:
            err = e

        # Assert
        self.assertEqual(attempts_count_delete[0], 1)
        self.assertIsNotNone(err)
Example #19
    def test_retrier_does_not_catch_unwanted_exception(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=2)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback, attempts_count = self.make_request_callback(3, job_metadata, http_error_status=403)

        self.mock_api(callback=callback)

        # Act
        job, metadata, err = None, None, None
        try:
            job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
            metadata = dict(job.metadata)
        except HTTPError as e:
            err = e

        # Assert
        self.assertIsNone(metadata)
        self.assertIsNotNone(err)
        self.assertEqual(err.response.status_code, 403)
        self.assertEqual(attempts_count[0], 1)
Example #20
    def test_get_job_does_fails_on_too_many_retries(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=2, max_retry_time=1)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback, attempts_count = self.make_request_callback(3, job_metadata)

        self.mock_api(callback=callback)

        # Act
        job, metadata, err = None, None, None
        try:
            job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
            metadata = dict(job.metadata)
        except HTTPError as e:
            err = e

        # Assert
        self.assertIsNone(metadata)
        self.assertIsNotNone(err)
        self.assertEqual(err.response.status_code, 504)
        self.assertEqual(attempts_count[0], 3)
Example #21
import getopt
import json
import sys

from hubstorage import HubstorageClient


def main(argv):
    apikey = ''
    spider = ''

    try:
        # -k/--apikey and -s/--spider both take a value; -h prints usage
        opts, args = getopt.getopt(argv, "hk:s:", ["apikey=", "spider="])
    except getopt.GetoptError:
        print 'alljobs.py -k <API Key> -s <ProjectID/Spider>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'alljobs.py -k <API Key> -s <ProjectID/Spider>'
            sys.exit()
        elif opt in ("-k", "--apikey"):
            apikey = arg
        elif opt in ("-s", "--spider"):
            spider = arg

    hc = HubstorageClient(auth=apikey)
    itemslist = hc.get_job(spider).items.list()

    for item in itemslist:
        print json.dumps(item)
Example #22
    def __init__(self, project: str, spider: str):
        hc = HubstorageClient(auth=shub_cfg.get('apikey'))
        key = next(hc.get_project(project).jobq.list(spider=spider)).get('key')
        self.job = hc.get_job(key)
Example #23
class SystemTest(HSTestCase):

    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_succeed_without_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'], self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'], self.MAGICN)

    def test_scraper_failure(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()

        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
            )
            self.assertEqual(iid, idx)
            self.assertEqual(sid, idx)
            self.assertEqual(rid, idx)

        if isinstance(close_reason, Exception):
            self.scraperclient.close()
            raise close_reason

        if close_reason:
            job.metadata['close_reason'] = close_reason

        job.metadata.save()
        self.scraperclient.close()
        del self.scraperclient
Example #24
class SystemTest(HSTestCase):

    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid,
                                                         auth=self.auth)
        # Runner - client uses global auth to poll jobq
        self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
        # Scraper - uses job level auth, no global or project auth available
        self.scraperclient = HubstorageClient(endpoint=endpoint)

    def test_succeed_with_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason='all-good')
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'all-good')
        self.assertEqual(job.items.stats()['totals']['input_values'],
                         self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'],
                         self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'],
                         self.MAGICN)

    def test_succeed_without_close_reason(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=None)
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'no_reason')
        self.assertEqual(job.items.stats()['totals']['input_values'],
                         self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'],
                         self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'],
                         self.MAGICN)

    def test_scraper_failure(self):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed,
                         close_reason=IOError('no more resources, ha!'))
        # query again from panel
        job = p.get_jobs(self.spiderid).next()
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'), 'failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _run_runner(self, pushed, close_reason):
        job = self.runnerclient.start_job(self.projectid)
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.failed(message=str(exc))
            # logging from runner must append and never remove messages logged
            # by scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.finished()

        self.runnerclient.close()

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        job = self.scraperclient.get_job(jobkey, auth=jobauth)
        for idx in xrange(self.MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=millitime() + random.randint(100, 100000),
            )
            self.assertEqual(iid, idx)
            self.assertEqual(sid, idx)
            self.assertEqual(rid, idx)

        if isinstance(close_reason, Exception):
            self.scraperclient.close()
            raise close_reason

        if close_reason:
            job.metadata['close_reason'] = close_reason

        job.metadata.save()
        self.scraperclient.close()
        del self.scraperclient
Example #25
# coding=utf-8
from hubstorage import HubstorageClient

hc = HubstorageClient(auth='bc2aa25cc40f4ed4b03988e8e0b9e89e')
project = hc.get_project('53883')
itemslist = hc.get_job('53883/1/5').items.list()
for element in itemslist:
	# drop internal bookkeeping fields added by the scraper
	del element['_type']
	del element['_cached_page_id']
	del element['_template']
	# print each remaining field name followed by its value
	for field, value in element.iteritems():
		print field
		if isinstance(value, basestring):
			print value
		else:
			print value.pop()