Example #1
def get_job(job):
    jobid, apikey = get_job_specs(job)
    hsc = HubstorageClient(auth=apikey)
    job = hsc.get_job(jobid)
    if not job.metadata:
        raise NotFoundException("Job {} does not exist".format(jobid))
    return job
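A minimal usage sketch for the helper above; the job key and API key are placeholders, and the exact argument format expected by get_job_specs() is not shown in the snippet, so this only illustrates the underlying HubstorageClient calls:

from hubstorage import HubstorageClient

hc = HubstorageClient(auth='YOUR_API_KEY')   # placeholder API key
job = hc.get_job('123/1/7')                  # placeholder "project/spider/job" key
print(job.metadata.get('state'))             # empty metadata would mean the job does not exist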
Example #2
    def test_retrier_catches_badstatusline_and_429(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}

        attempts_count = [0]  # use a list for nonlocal mutability used in request_callback

        def request_callback(request):
            attempts_count[0] += 1

            if attempts_count[0] <= 2:
                raise ConnectionError("Connection aborted.", BadStatusLine("''"))
            if attempts_count[0] == 3:
                return (429, {}, {})
            else:
                resp_body = dict(job_metadata)
                return (200, {}, json.dumps(resp_body))

        self.mock_api(callback=request_callback)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        # Assert
        self.assertEqual(dict(job_metadata), dict(job.metadata))
        self.assertEqual(attempts_count[0], 4)
Example #3
 def test_connect_retry(self):
     c = HubstorageClient(auth=self.auth,
         endpoint=self.endpoint, max_retries=2)
     job = c.push_job(self.projectid, self.spidername,
                      state='running')
     m = job.metadata
     self.assertEqual(m.get('state'), u'running', c.auth)
     m.expire()
     self.assertEqual(c.session.adapters['http://'].max_retries, 2)
Example #4
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    monkeypatch.setattr("hubstorage.resourcetype.MSGPACK_AVAILABLE", msgpack_available)
    hsclient = HubstorageClient()
    job = hsclient.get_job("2222000/1/1")
    for resource in [job.items, job.logs, job.samples]:
        assert resource._allows_mpack(path) is (msgpack_available and expected_result)
    assert job.requests._allows_mpack(path) is False
    assert job.metadata._allows_mpack(path) is False
    assert job.jobq._allows_mpack(path) is False
Example #5
 def setUp(self):
     super(HSTestCase, self).setUp()
     endpoint = self.hsclient.endpoint
     # Panel - no client auth, only project auth using user auth token
     self.panelclient = HubstorageClient(endpoint=endpoint)
     self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
     # Runner - client uses global auth to poll jobq
     self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
     # Scraper - uses job level auth, no global or project auth available
     self.scraperclient = HubstorageClient(endpoint=endpoint)
Example #6
def fetch_and_save_items():
    hc = HubstorageClient(auth=API_KEY)
    project = hc.get_project(SH_PROJECT)
    for spider in SPIDERS:
        print("\nworking on spider {}".format(spider['spider_name']))
        spider_id = project.ids.spider(spider['spider_name'])
        summary = project.spiders.lastjobsummary(spiderid=spider_id)
        for element in summary:
            print(element['key'])
            job = hc.get_job(element['key'])
            items = job.items.iter_values()
            save_items(items, spider['institution_name'])
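The save_items() helper used above is not part of the snippet; a hypothetical stand-in that simply writes each item as one JSON line to a per-institution file could look like this (the file naming is an assumption):

import json

def save_items(items, institution_name):
    # hypothetical replacement for the save_items() referenced above:
    # write each item dict as one JSON line into a per-institution file
    with open('%s.jl' % institution_name, 'w') as fp:
        for item in items:
            fp.write(json.dumps(item) + '\n')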
Example #7
    def test_get_job_does_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback, attempts_count = self.make_request_callback(2, job_metadata)

        self.mock_api(callback=callback)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        # Assert
        self.assertEqual(dict(job_metadata), dict(job.metadata))
        self.assertEqual(attempts_count[0], 3)
Example #8
    def __init__(self, crawler):

        self.crawler = crawler
        self.hs_endpoint = crawler.settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(crawler, "HS_CONSUME_FROM_SLOT")
        try:
            self.hs_number_of_slots = int(crawler.settings.get("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS))
        except ValueError:
            self.hs_number_of_slots = DEFAULT_HS_NUMBER_OF_SLOTS
        try:
            self.hs_max_links = int(crawler.settings.get("HS_MAX_LINKS", DEFAULT_MAX_LINKS))
        except ValueError:
            self.hs_max_links = DEFAULT_MAX_LINKS
        self.hs_start_job_enabled = crawler.settings.get("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", ['finished'])
        self.hs_start_job_new_panel = crawler.settings.get("HS_START_JOB_NEW_PANEL", False)

        if not self.hs_start_job_new_panel:
            conn = Connection(self.hs_auth)
            self.oldpanel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()
Example #9
    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID'))
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])

        conn = Connection(self.hs_auth)
        self.panel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()
Example #10
    def test_metadata_save_does_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback_get, attempts_count_get = self.make_request_callback(0, job_metadata)
        callback_post, attempts_count_post = self.make_request_callback(2, job_metadata)

        self.mock_api(method=GET, callback=callback_get)
        self.mock_api(method=POST, callback=callback_post)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
        job.metadata['foo'] = 'bar'
        job.metadata.save()

        # Assert
        self.assertEqual(attempts_count_post[0], 3)
Example #11
 def test_debug_queries(self):
     self.hsclient = HubstorageClient(auth=self.auth, endpoint=self.endpoint, debug=True)
     self.assertEqual(self.hsclient.queries, [])
     self.project = self.hsclient.get_project(self.projectid)
     list(self.project.get_jobs(self.spiderid))
     self.assertEqual(len(self.hsclient.queries), 1)
     q = self.hsclient.queries[0]
     self.assertEqual(q['method'], 'GET')
     self.assert_(q['time'] > 0)
     self.assert_('url' in q)
Example #12
    def test_push_job_does_not_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
        callback, attempts_count = self.make_request_callback(2, {'key': '1/2/3'})

        self.mock_api(POST, callback=callback)

        # Act
        job, err = None, None
        try:
            job = client.push_job(self.projectid, self.spidername)
        except HTTPError as e:
            err = e

        # Assert
        self.assertIsNone(job)
        self.assertIsNotNone(err)
        self.assertEqual(err.response.status_code, 504)
        self.assertEqual(attempts_count[0], 1)
Example #13
 def __init__(self, auth, project_id, frontier, batch_size=0, flush_interval=30):
     self._hs_client = HubstorageClient(auth=auth)
     self._hcf = self._hs_client.get_project(project_id).frontier
     self._hcf.batch_size = batch_size
     self._hcf.batch_interval = flush_interval
     self._frontier = frontier
     self._links_count = defaultdict(int)
     self._links_to_flush_count = defaultdict(int)
     self._hcf_retries = 10
     self.logger = logging.getLogger("hubstorage-wrapper")
Example #14
    def test_collection_store_and_delete_are_retried(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)

        callback_post, attempts_count_post = self.make_request_callback(2, [])
        callback_delete, attempts_count_delete = self.make_request_callback(2, [])

        self.mock_api(method=POST, callback=callback_delete, url_match='/.*/deleted')
        self.mock_api(method=POST, callback=callback_post)  # /!\ default regexp matches all paths, has to be added last

        # Act
        project = client.get_project(self.projectid)
        store = project.collections.new_store('foo')
        store.set({'_key': 'bar', 'content': 'value'})
        store.delete('baz')

        # Assert
        self.assertEqual(attempts_count_post[0], 3)
        self.assertEqual(attempts_count_delete[0], 3)
Example #15
    def test_api_delete_can_be_set_to_non_idempotent(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=3)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback_delete, attempts_count_delete = self.make_request_callback(2, job_metadata)

        self.mock_api(method=DELETE, callback=callback_delete)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        err = None
        try:
            job.metadata.apidelete('/my/non/idempotent/delete/', is_idempotent=False)
        except HTTPError as e:
            err = e

        # Assert
        self.assertEqual(attempts_count_delete[0], 1)
        self.assertIsNotNone(err)
Example #16
    def __init__(self, auth, project_id, colname, cache_size_limit, cleanup_on_start):
        super(HCFStates, self).__init__(cache_size_limit)
        self._hs_client = HubstorageClient(auth=auth)
        self.projectid = project_id
        project = self._hs_client.get_project(self.projectid)
        self._collections = project.collections
        self._colname = colname + "_states"
        self.logger = logging.getLogger("hcf.states")

        if cleanup_on_start:
            self._cleanup()
Example #17
    def test_auth(self):
        # client without global auth set
        hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
        self.assertEqual(hsc.auth, None)

        # check no-auth access
        try:
            hsc.push_job(self.projectid, self.spidername)
        except HTTPError as exc:
            self.assertTrue(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).push_job(self.spidername)
        except HTTPError as exc:
            self.assertTrue(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_job((self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertTrue(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).get_job((self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertTrue(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        # create project with auth
        auth = self.hsclient.auth
        project = hsc.get_project(self.projectid, auth)
        self.assertEqual(project.auth, auth)
        job = project.push_job(self.spidername)
        samejob = project.get_job(job.key)
        self.assertEqual(samejob.key, job.key)
Example #18
    def test_retrier_does_not_catch_unwanted_exception(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=2)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback, attempts_count = self.make_request_callback(3, job_metadata, http_error_status=403)

        self.mock_api(callback=callback)

        # Act
        job, metadata, err = None, None, None
        try:
            job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
            metadata = dict(job.metadata)
        except HTTPError as e:
            err = e

        # Assert
        self.assertIsNone(metadata)
        self.assertIsNotNone(err)
        self.assertEqual(err.response.status_code, 403)
        self.assertEqual(attempts_count[0], 1)
Example #19
    def test_get_job_does_fails_on_too_many_retries(self):
        # Prepare
        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=2, max_retry_time=1)
        job_metadata = {'project': self.projectid, 'spider': self.spidername, 'state': 'pending'}
        callback, attempts_count = self.make_request_callback(3, job_metadata)

        self.mock_api(callback=callback)

        # Act
        job, metadata, err = None, None, None
        try:
            job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
            metadata = dict(job.metadata)
        except HTTPError as e:
            err = e

        # Assert
        self.assertIsNone(metadata)
        self.assertIsNotNone(err)
        self.assertEqual(err.response.status_code, 504)
        self.assertEqual(attempts_count[0], 3)
Example #20
    def test_push_job_does_not_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3)
        callback, attempts_count = self.make_request_callback(
            2, {'key': '1/2/3'})

        self.mock_api(POST, callback=callback)

        # Act
        job, err = None, None
        try:
            job = client.push_job(self.projectid, self.spidername)
        except HTTPError as e:
            err = e

        # Assert
        self.assertIsNone(job)
        self.assertIsNotNone(err)
        self.assertEqual(err.response.status_code, 504)
        self.assertEqual(attempts_count[0], 1)
Example #21
class BaseSpider(Spider):
    def __init__(self, *args, **kwargs):
        super(BaseSpider, self).__init__(*args, **kwargs)
        if 'crawl_days' in kwargs:
            self.crawl_type = 'full'
            self.crawl_days = int(self.crawl_days)
            assert self.crawl_days
        elif 'crawl_days' not in kwargs and 'crawl_type' not in kwargs:
            self.crawl_type = 'full'
            self.crawl_days = 14

    def start_requests(self):
        yield Request(self.input_url, callback=self.parse)

    def initialize_hubstorage_collection(self):
        self.hs_client = HubstorageClient(self.settings.get('HS_AUTH'))
        self.hs_projectid = os.environ.get('SCRAPY_PROJECT_ID')
        if self.hs_projectid is None:
            self.hs_projectid = self.settings.get('HS_PROJECTID')
        collections = self.hs_client.get_project(self.hs_projectid).collections
        self.hs_collection = collections.new_store(self.name)

    def set_min_post_date(self):
        if getattr(self, 'crawl_days', None):
            self.min_post_date = datetime.now() - timedelta(
                days=self.crawl_days)
        else:
            self.min_post_date = datetime.strptime(
                self.settings['AVC_MIN_POST_DATE'],
                self.settings['AVC_DATE_FORMAT'],
            )
        self.logger.info('Setting min_post_date as %s' % self.min_post_date)

    def get_latest_scraped_date(self, username):
        try:
            entry = self.hs_collection.get(username)
            return datetime.strptime(entry['value'],
                                     self.settings['AVC_DATE_TIME_FORMAT'])
        except:
            return None

    def set_latest_scraped_date(self, username, latest_scraped_date):
        entry = {
            '_key': username,
            'value': latest_scraped_date.strftime(
                self.settings['AVC_DATE_TIME_FORMAT']),
        }
        self.hs_collection.set(entry)
Example #22
    def test_delete_on_hubstorage_api_does_not_404(self):
        # NOTE: The current Hubstorage API does not raise 404 errors when deleting resources that do not exist,
        #       so the retry policy does not have to catch 404 errors when retrying deletes (this simplifies the
        #       implementation a lot). This test checks that this assumption holds.

        client = HubstorageClient(auth=self.auth, endpoint=self.endpoint, max_retries=0)
        project = client.get_project(projectid=self.projectid)

        # Check frontier delete
        project.frontier.delete_slot('frontier_non_existing', 'slot_non_existing')

        # Check metadata delete
        job = client.push_job(self.projectid, self.spidername)
        job.metadata['foo'] = 'bar'  # Add then delete key, this will trigger an api delete for item foo
        del job.metadata['foo']
        job.metadata.save()

        # Check collections delete
        store = project.collections.new_store('foo')
        store.set({'_key': 'foo'})
        store.delete('bar')

        self.assertTrue(True, "No error have been triggered by calling a delete on resources that do not exist")
Example #23
def main(argv):
    apikey = ''
    project = ''

    try:
        opts, args = getopt.getopt(argv, "hk:p:", ["apikey=", "project="])
    except getopt.GetoptError:
        print 'alljobs.py -k <API Key> -p <ProjectID>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'alljobs.py -k <API Key> -p <ProjectID>'
            sys.exit()
        elif opt in ("-k", "--apikey"):
            apikey = arg
        elif opt in ("-p", "--project"):
            project = arg

    hc = HubstorageClient(auth=apikey)
    project = hc.get_project(project)
    jobs_metadata = project.jobq.list()
    jobids = [j['key'] for j in jobs_metadata]
    jobidsUtf = [x.encode('UTF8') for x in jobids]
    print jobidsUtf
Example #24
def main(argv):
    apikey = ''
    spider = ''

    try:
        opts, args = getopt.getopt(argv, "hk:s:", ["apikey=", "spider="])
    except getopt.GetoptError:
        print 'alljobs.py -k <API Key> -s <ProjectID/Spider>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'alljobs.py -k <API Key> -s <ProjectID/Spider>'
            sys.exit()
        elif opt in ("-k", "--apikey"):
            apikey = arg
        elif opt in ("-s", "--spider"):
            spider = arg

    hc = HubstorageClient(auth=apikey)
    itemslist = hc.get_job(spider).items.list()

    for items in itemslist:
        print json.dumps(items)
Example #25
    def test_retrier_catches_badstatusline_and_429(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3,
                                  max_retry_time=1)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }

        attempts_count = [0]  # use a list for nonlocal mutability used in request_callback

        def request_callback(request):
            attempts_count[0] += 1

            if attempts_count[0] <= 2:
                raise ConnectionError("Connection aborted.",
                                      BadStatusLine("''"))
            if attempts_count[0] == 3:
                return (429, {}, u'')
            else:
                resp_body = dict(job_metadata)
                return (200, {}, json.dumps(resp_body))

        self.mock_api(callback=request_callback)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        # Assert
        self.assertEqual(dict(job_metadata), dict(job.metadata))
        self.assertEqual(attempts_count[0], 4)
Example #26
 def setUp(self):
     super(HSTestCase, self).setUp()
     endpoint = self.hsclient.endpoint
     # Panel - no client auth, only project auth using user auth token
     self.panelclient = HubstorageClient(endpoint=endpoint)
     self.panelproject = self.panelclient.get_project(self.projectid,
                                                      auth=self.auth)
     # Runner - client uses global auth to poll jobq
     self.runnerclient = HubstorageClient(endpoint=endpoint, auth=self.auth)
     # Scraper - uses job level auth, no global or project auth available
     self.scraperclient = HubstorageClient(endpoint=endpoint)
Example #27
 def _run_runner(self, pushed, close_reason):
     client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
     with closing(client) as runnerclient:
         job = self.start_job()
         self.assertFalse(job.metadata.get('stop_requested'))
         job.metadata.update(host='localhost', slot=1)
         self.assertEqual(job.metadata.get('state'), 'running')
         # run scraper
         try:
             self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
         except Exception as exc:
             job.logs.error(message=str(exc), appendmode=True)
             job.close_writers()
             job.jobq.finish(job, close_reason='failed')
             # logging from runner must append and never remove messages logged
             # by scraper
             self.assertTrue(job.logs.batch_append)
         else:
             job.jobq.finish(job, close_reason=close_reason or 'no_reason')
Example #28
class ClientTest(HSTestCase):

    def test_push_job(self):
        c = self.hsclient
        job = c.push_job(self.projectid, self.spidername,
                        state='running',
                        priority=self.project.jobq.PRIO_LOW,
                        foo='baz')
        m = job.metadata
        self.assertEqual(m.get('state'), u'running', c.auth)
        self.assertEqual(m.get('foo'), u'baz')
        self.project.jobq.delete(job)
        m.expire()
        self.assertEqual(m.get('state'), u'deleted')
        self.assertEqual(m.get('foo'), u'baz')

    def test_botgroup(self):
        self.project.settings.update(botgroups=['foo'], created=millitime())
        self.project.settings.save()
        c = self.hsclient
        q1 = c.push_job(self.project.projectid, self.spidername)
        j1 = c.start_job()
        self.assertEqual(j1, None, 'got %s, pushed job was %s' % (j1, q1))
        j2 = c.start_job(botgroup='bar')
        self.assertEqual(j2, None, 'got %s, pushed job was %s' % (j2, q1))
        j3 = c.start_job(botgroup='foo')
        self.assertEqual(j3.key, q1.key)

    def test_debug_queries(self):
        self.hsclient = HubstorageClient(auth=self.auth, endpoint=self.endpoint, debug=True)
        self.assertEqual(self.hsclient.queries, [])
        self.project = self.hsclient.get_project(self.projectid)
        list(self.project.get_jobs(self.spiderid))
        self.assertEqual(len(self.hsclient.queries), 1)
        q = self.hsclient.queries[0]
        self.assertEqual(q['method'], 'GET')
        self.assert_(q['time'] > 0)
        self.assert_('url' in q)
Example #29
    def __init__(self, crawler):

        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_baches = int(crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_baches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", [])

        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(list)
        self.batch_ids = []

        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)
Example #30
    def test_auth(self):
        # client without global auth set
        hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
        self.assertEqual(hsc.auth, None)

        # check no-auth access
        try:
            hsc.push_job(self.projectid, self.spidername)
        except HTTPError as exc:
            self.assertTrue(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).push_job(self.spidername)
        except HTTPError as exc:
            self.assertTrue(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_job((self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertTrue(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).get_job(
                (self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertTrue(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        # create project with auth
        auth = self.hsclient.auth
        project = hsc.get_project(self.projectid, auth)
        self.assertEqual(project.auth, auth)
        job = project.push_job(self.spidername)
        samejob = project.get_job(job.key)
        self.assertEqual(samejob.key, job.key)
Example #31
class HCFStates(MemoryStates):
    def __init__(self, auth, project_id, colname, cache_size_limit,
                 cleanup_on_start):
        super(HCFStates, self).__init__(cache_size_limit)
        self._hs_client = HubstorageClient(auth=auth)
        self.projectid = project_id
        project = self._hs_client.get_project(self.projectid)
        self._collections = project.collections
        self._colname = colname + "_states"
        self.logger = logging.getLogger("hcf.states")

        if cleanup_on_start:
            self._cleanup()

    def _cleanup(self):
        while True:
            nextstart = None
            params = {
                'method':
                'DELETE',
                'url':
                'https://storage.scrapinghub.com/collections/%d/s/%s' %
                (self.projectid, self._colname),
                'auth':
                self._hs_client.auth
            }
            if nextstart:
                params['prefix'] = nextstart
            response = self._hs_client.session.request(**params)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code,
                                  response.content)
                self.logger.info(params)
            try:
                r = loads(response.content.decode('utf-8'))
                self.logger.debug("Removed %d, scanned %d", r["deleted"],
                                  r["scanned"])
                nextstart = r.get('nextstart')
            except ValueError as ve:
                self.logger.debug(ve)
                self.logger.debug("content: %s (%d)" %
                                  (response.content, len(response.content)))
            if not nextstart:
                break

    def frontier_start(self):
        self._store = self._collections.new_store(self._colname)

    def frontier_stop(self):
        self.logger.debug("Got frontier stop.")
        self.flush()
        self._hs_client.close()

    def _hcf_fetch(self, to_fetch):
        finished = False
        i = iter(to_fetch)
        while True:
            prepared_keys = []
            while True:
                try:
                    prepared_keys.append("key=%s" % next(i))
                    if len(prepared_keys) >= 32:
                        break
                except StopIteration:
                    finished = True
                    break

            if not prepared_keys:
                break

            prepared_keys.append("meta=_key")
            params = {
                'method':
                'GET',
                'url':
                'https://storage.scrapinghub.com/collections/%d/s/%s' %
                (self.projectid, self._colname),
                'params':
                str('&').join(prepared_keys),
                'auth':
                self._hs_client.auth
            }
            start = time()
            response = self._hs_client.session.request(**params)
            self.logger.debug("Fetch request time %f ms",
                              (time() - start) * 1000)
            if response.status_code != 200:
                self.logger.error("%d %s", response.status_code,
                                  response.content)
                self.logger.info(params)
            for line in response.content.decode('utf-8').split('\n'):
                if not line:
                    continue
                try:
                    yield loads(line)
                except ValueError as ve:
                    self.logger.debug(ve)
                    self.logger.debug("content: %s (%d)" % (line, len(line)))
            if finished:
                break

    def fetch(self, fingerprints):
        to_fetch = [f for f in fingerprints if f not in self._cache]
        self.logger.debug("cache size %s" % len(self._cache))
        self.logger.debug("to fetch %d from %d" %
                          (len(to_fetch), len(fingerprints)))
        if not to_fetch:
            return
        count = 0
        for o in self._hcf_fetch(to_fetch):
            self._cache[o['_key']] = o['value']
            count += 1
        self.logger.debug("Fetched %d items" % count)

    def flush(self, force_clear=False):
        buffer = []
        count = 0
        start = time()
        try:
            for fprint, state_val in six.iteritems(self._cache):
                buffer.append({'_key': fprint, 'value': state_val})
                if len(buffer) > 1024:
                    count += len(buffer)
                    self._store.set(buffer)
                    buffer = []
        finally:
            count += len(buffer)
            self._store.set(buffer)
        self.logger.debug("Send time %f ms", (time() - start) * 1000)
        self.logger.debug("State cache has been flushed: %d items" % count)
        super(HCFStates, self).flush(force_clear)
Example #32
 def __init__(self, product_name, apikey, project_id, hours):
     self.product_name = product_name
     project = HubstorageClient(apikey).get_project(project_id)
     self.item_store = project.collections.new_store(product_name)
     self.load_items_from_last_n_hours(hours)
Example #33
 def test_custom_ua(self):
     client = HubstorageClient(auth=HSTestCase.auth,
                               endpoint=HSTestCase.endpoint,
                               user_agent='testUA')
     self.assertEqual(client.user_agent, 'testUA')
Example #34
 def setUpClass(cls):
     cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
     cls.project = cls.hsclient.get_project(cls.projectid)
     cls.spiderid = str(cls.project.ids.spider(cls.spidername, create=1))
     cls._set_testbotgroup()
Example #35
 def setUpClass(cls):
     cls.endpoint = HS_ENDPOINT
     cls.auth = HS_AUTH
     cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
     cls.project = cls.hsclient.get_project(cls.projectid)
     cls.fclient = cls.project.frontier
Example #36
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    monkeypatch.setattr(
        'hubstorage.collectionsrt.MSGPACK_AVAILABLE', msgpack_available)
    hsclient = HubstorageClient()
    collections = hsclient.get_project(2222000).collections
    assert collections._allows_mpack(path) is (msgpack_available and expected_result)
Example #37
 def setUp(self):
     super(HSTestCase, self).setUp()
     self.endpoint = self.hsclient.endpoint
     # Panel - no client auth, only project auth using user auth token
     self.panelclient = HubstorageClient(endpoint=self.endpoint)
     self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
Example #38
# coding=UTF-8
from hubstorage import HubstorageClient

hc = HubstorageClient(auth='bc2aa25cc40f4ed4b03988e8e0b9e89e')
project = hc.get_project('53883')
itemslist = hc.get_job('53883/1/5').items.list()
itemslist_size = len(itemslist)
for element in itemslist:
    # drop internal annotation fields before printing
    del element['_type']
    del element['_cached_page_id']
    del element['_template']
    for field in element.iteritems():
        # each field is a (name, value) pair; print both parts
        for value in field:
            if isinstance(value, basestring):
                print value
            else:
                print value.pop()
Example #39
class HcfMiddleware(object):

    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID'))
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])

        conn = Connection(self.hs_auth)
        self.panel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()

    def _get_config(self, settings, key, default=None):
        value = settings.get(key, default)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=log.INFO):
        log.msg('(HCF) %s' % msg, level)

    def start_job(self, spider):
        self._msg("Starting new job for: %s" % spider.name)
        jobid = self.panel_project.schedule(
            spider.name,
            hs_consume_from_slot=self.hs_consume_from_slot,
            dummy=datetime.now()
        )
        self._msg("New job started: %s" % jobid)
        return jobid

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):

        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot', self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)

        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        # unless this is not the first job.
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        if request.url not in self.new_links[slot]:
                            hcf_params = request.meta.get('hcf_params')
                            fp = {'fp': request.url}
                            if hcf_params:
                                fp.update(hcf_params)
                            # Save the new links as soon as possible using
                            # the batch uploader
                            self.fclient.add(self.hs_frontier, slot, [fp])
                            self.new_links[slot].add(request.url)
                    else:
                        self._msg("'use_hcf' meta key is not supported for non GET requests (%s)" % request.url,
                                  log.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally; if it
        # didn't finish properly there is no way to know whether all the url batches
        # were processed, and it is better not to delete them from the frontier
        # (so they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links_count()
            self._delete_processed_ids()

        # Close the frontier client in order to make sure that all the new links
        # are stored.
        self.fclient.close()
        self.hsclient.close()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished.
        if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:

            # Start the new job if this job had requests from the HCF or it
            # was the first job.
            if self.has_new_requests or not getattr(spider, 'dummy', None):
                self.start_job(spider)

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1):
            for fingerprint, data in batch['requests']:
                num_links += 1
                yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}})
            self.batch_ids.append(batch['id'])
            if num_links >= self.hs_max_links:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_consume_from_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_consume_from_slot))

    def _save_new_links_count(self):
        """ Save the new extracted links into the HCF."""
        for slot, new_links in self.new_links.items():
            self._msg('Stored %d new links in slot(%s)' % (len(new_links), slot))
        self.new_links = defaultdict(set)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids),
                                                                self.hs_consume_from_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        md5 = hashlib.md5()
        md5.update(request.url)
        digest = md5.hexdigest()
        return str(int(digest, 16) % self.hs_number_of_slots)
Example #40
 def setUp(self):
     super(HSTestCase, self).setUp()
     self.endpoint = self.hsclient.endpoint
     # Panel - no client auth, only project auth using user auth token
     self.panelclient = HubstorageClient(endpoint=self.endpoint)
     self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
Example #41
 def __init__(self, project: str, spider: str):
     hc = HubstorageClient(auth=shub_cfg.get('apikey'))
     key = next(hc.get_project(project).jobq.list(spider=spider)).get('key')
     self.job = hc.get_job(key)
Example #42
class HcfMiddleware(object):

    def __init__(self, crawler):

        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_baches = int(crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_baches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get("HS_START_JOB_ON_REASON", [])

        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(list)
        self.batch_ids = []

        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):

        self.hs_frontier = getattr(spider, 'frontier', self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)

        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if (request.method == 'GET' and  # XXX: Only GET support for now.
                    request.meta.get('use_hcf', False)):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally; if it
        # didn't finish properly there is no way to know whether all the url batches
        # were processed, and it is better not to delete them from the frontier
        # (so they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            self._msg("Starting new job" + spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid,
                                          spider=spider.name)
            self._msg("New job started: %s" % job)
        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                yield Request(r[0])
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_baches:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids),
                                                                self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        return '0'
Example #43
class HCFClientWrapper(object):
    def __init__(self,
                 auth,
                 project_id,
                 frontier,
                 batch_size=0,
                 flush_interval=30):
        self._hs_client = HubstorageClient(auth=auth)
        self._hcf = self._hs_client.get_project(project_id).frontier
        self._hcf.batch_size = batch_size
        self._hcf.batch_interval = flush_interval
        self._frontier = frontier
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._hcf_retries = 10
        self.logger = logging.getLogger("hubstorage-wrapper")

    def add_request(self, slot, request):
        self._hcf.add(self._frontier, slot, [request])
        self._links_count[slot] += 1
        self._links_to_flush_count[slot] += 1
        return 0

    def flush(self, slot=None):
        n_links_to_flush = self.get_number_of_links_to_flush(slot)
        if n_links_to_flush:
            if slot is None:
                self._hcf.flush()
                for slot in self._links_to_flush_count.keys():
                    self._links_to_flush_count[slot] = 0
            else:
                writer = self._hcf._get_writer(self._frontier, slot)
                writer.flush()
                self._links_to_flush_count[slot] = 0
        return n_links_to_flush

    def read(self, slot, mincount=None):
        for i in range(self._hcf_retries):
            try:
                return self._hcf.read(self._frontier, slot, mincount)
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error(
                    "Could not read from {0}/{1} try {2}/{3}".format(
                        self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error(
                    "Connection error while reading from {0}/{1} try {2}/{3}".
                    format(self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error(
                    "Error while reading from {0}/{1} try {2}/{3}".format(
                        self._frontier, slot, i + 1, self._hcf_retries))
            sleep(60 * (i + 1))
        return []

    def delete(self, slot, ids):
        for i in range(self._hcf_retries):
            try:
                self._hcf.delete(self._frontier, slot, ids)
                break
            except requests_lib.exceptions.ReadTimeout:
                self.logger.error(
                    "Could not delete ids from {0}/{1} try {2}/{3}".format(
                        self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.ConnectionError:
                self.logger.error(
                    "Connection error while deleting ids from {0}/{1} try {2}/{3}"
                    .format(self._frontier, slot, i + 1, self._hcf_retries))
            except requests_lib.exceptions.RequestException:
                self.logger.error(
                    "Error deleting ids from {0}/{1} try {2}/{3}".format(
                        self._frontier, slot, i + 1, self._hcf_retries))
            sleep(60 * (i + 1))

    def delete_slot(self, slot):
        self._hcf.delete_slot(self._frontier, slot)

    def close(self):
        self._hcf.close()
        self._hs_client.close()

    def get_number_of_links(self, slot=None):
        if slot is None:
            return sum(self._links_count.values())
        else:
            return self._links_count[slot]

    def get_number_of_links_to_flush(self, slot=None):
        if slot is None:
            return sum(self._links_to_flush_count.values())
        else:
            return self._links_to_flush_count[slot]
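A short, hypothetical usage sketch of the wrapper above; the API key, project id, frontier name, slot and URL are all placeholders:

wrapper = HCFClientWrapper(auth='YOUR_API_KEY', project_id='123',
                           frontier='test', batch_size=100)
wrapper.add_request('0', {'fp': 'http://example.com/'})  # slot '0', fingerprint dict
wrapper.flush()                                          # force-send buffered links
print(wrapper.get_number_of_links())                     # -> 1
wrapper.close()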
Example #44
 def open_spider(self, spider):
     client = HubstorageClient(auth='021fda8005214eac944950e2e96ffc92')  # API key
     project = client.get_project('301640')  # project id
     self.data_stores = {}
     for product_name in get_product_names():
         self.data_stores[product_name] = project.collections.new_store(product_name)
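For reference, a collection store created this way is read and written with set()/get(), as in the other examples on this page; the key and value below are placeholders:

store = project.collections.new_store('my_product')
store.set({'_key': 'some-key', 'value': 'some-value'})
print(store.get('some-key'))  # returns a dict containing the stored 'value' field (compare Example #21)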
Example #45
class SystemTest(HSTestCase):

    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        self.endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=self.endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid,
                                                         auth=self.auth)

    def tearDown(self):
        super(HSTestCase, self).tearDown()
        self.panelclient.close()

    def test_succeed_with_close_reason(self):
        self._do_test_success('all-good', 'all-good')

    def test_succeed_without_close_reason(self):
        self._do_test_success(None, 'no_reason')

    def test_scraper_failure(self):
        job = self._do_test_job(IOError('no more resources, ha!'), 'failed')
        # MAGICN per log level messages plus one of last failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats['totals']['input_values'], self.MAGICN * 4 + 1)

    def _do_test_success(self, job_close_reason, expected_close_reason):
        job = self._do_test_job(job_close_reason, expected_close_reason)
        self.assertEqual(job.items.stats()['totals']['input_values'],
                         self.MAGICN)
        self.assertEqual(job.logs.stats()['totals']['input_values'],
                         self.MAGICN * 4)
        self.assertEqual(job.requests.stats()['totals']['input_values'],
                         self.MAGICN)

    def _do_test_job(self, job_close_reason, expected_close_reason):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_job(pushed['key'])
        self.assertEqual(job.metadata.get('state'), 'pending')
        # consume msg from runner
        self._run_runner(pushed, close_reason=job_close_reason)
        # query again from panel
        job = p.get_job(pushed['key'])
        self.assertEqual(job.metadata.get('state'), 'finished')
        self.assertEqual(job.metadata.get('close_reason'),
                         expected_close_reason)
        return job

    def _run_runner(self, pushed, close_reason):
        client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        with closing(client) as runnerclient:
            job = self.start_job()
            self.assertFalse(job.metadata.get('stop_requested'))
            job.metadata.update(host='localhost', slot=1)
            self.assertEqual(job.metadata.get('state'), 'running')
            # run scraper
            try:
                self._run_scraper(job.key,
                                  job.jobauth,
                                  close_reason=close_reason)
            except Exception as exc:
                job.logs.error(message=str(exc), appendmode=True)
                job.close_writers()
                job.jobq.finish(job, close_reason='failed')
                # logging from runner must append and never remove messages logged
                # by scraper
                self.assertTrue(job.logs.batch_append)
            else:
                job.jobq.finish(job, close_reason=close_reason or 'no_reason')

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
        # Scraper - uses job level auth, no global or project auth available
        client = HubstorageClient(endpoint=self.endpoint)
        with closing(client) as scraperclient:
            job = scraperclient.get_job(jobkey, auth=jobauth)
            for idx in xrange(self.MAGICN):
                iid = job.items.write({'uuid': idx})
                job.logs.debug('log debug %s' % idx, idx=idx)
                job.logs.info('log info %s' % idx, idx=idx)
                job.logs.warn('log warn %s' % idx, idx=idx)
                job.logs.error('log error %s' % idx, idx=idx)
                sid = job.samples.write([idx, idx, idx])
                rid = job.requests.add(
                    url='http://test.com/%d' % idx,
                    status=random.randint(100, 1000),
                    method=random.choice(httpmethods),
                    rs=random.randint(0, 100000),
                    duration=random.randint(0, 1000),
                    parent=random.randrange(0, idx + 1) if idx > 10 else None,
                    ts=millitime() + random.randint(100, 100000),
                )
                self.assertEqual(iid, idx)
                self.assertEqual(sid, idx)
                self.assertEqual(rid, idx)

            if isinstance(close_reason, Exception):
                raise close_reason

            if close_reason:
                job.metadata['close_reason'] = close_reason

            job.metadata.save()
Exemplo n.º 46
0
class SystemTest(HSTestCase):

    MAGICN = 1211

    def setUp(self):
        super(HSTestCase, self).setUp()
        self.endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=self.endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)

    def tearDown(self):
        super(HSTestCase, self).tearDown()
        self.panelclient.close()

    def test_succeed_with_close_reason(self):
        self._do_test_success("all-good", "all-good")

    def test_succeed_without_close_reason(self):
        self._do_test_success(None, "no_reason")

    def test_scraper_failure(self):
        job = self._do_test_job(IOError("no more resources, ha!"), "failed")
        # MAGICN messages per log level plus one for the final failure
        stats = job.logs.stats()
        self.assertTrue(stats)
        self.assertEqual(stats["totals"]["input_values"], self.MAGICN * 4 + 1)

    def _do_test_success(self, job_close_reason, expected_close_reason):
        job = self._do_test_job(job_close_reason, expected_close_reason)
        self.assertEqual(job.items.stats()["totals"]["input_values"], self.MAGICN)
        self.assertEqual(job.logs.stats()["totals"]["input_values"], self.MAGICN * 4)
        self.assertEqual(job.requests.stats()["totals"]["input_values"], self.MAGICN)

    def _do_test_job(self, job_close_reason, expected_close_reason):
        p = self.panelproject
        pushed = p.jobq.push(self.spidername)
        # check pending state
        job = p.get_job(pushed["key"])
        self.assertEqual(job.metadata.get("state"), "pending")
        # consume msg from runner
        self._run_runner(pushed, close_reason=job_close_reason)
        # query again from panel
        job = p.get_job(pushed["key"])
        self.assertEqual(job.metadata.get("state"), "finished")
        self.assertEqual(job.metadata.get("close_reason"), expected_close_reason)
        return job

    def _run_runner(self, pushed, close_reason):
        client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        with closing(client) as runnerclient:
            job = self.start_job()
            self.assertFalse(job.metadata.get("stop_requested"))
            job.metadata.update(host="localhost", slot=1)
            self.assertEqual(job.metadata.get("state"), "running")
            # run scraper
            try:
                self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
            except Exception as exc:
                job.logs.error(message=str(exc), appendmode=True)
                job.close_writers()
                job.jobq.finish(job, close_reason="failed")
                # logging from runner must append and never remove messages logged
                # by scraper
                self.assertTrue(job.logs.batch_append)
            else:
                job.jobq.finish(job, close_reason=close_reason or "no_reason")

    def _run_scraper(self, jobkey, jobauth, close_reason=None):
        httpmethods = "GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT".split()
        # Scraper - uses job level auth, no global or project auth available
        client = HubstorageClient(endpoint=self.endpoint)
        with closing(client) as scraperclient:
            job = scraperclient.get_job(jobkey, auth=jobauth)
            for idx in xrange(self.MAGICN):
                iid = job.items.write({"uuid": idx})
                job.logs.debug("log debug %s" % idx, idx=idx)
                job.logs.info("log info %s" % idx, idx=idx)
                job.logs.warn("log warn %s" % idx, idx=idx)
                job.logs.error("log error %s" % idx, idx=idx)
                sid = job.samples.write([idx, idx, idx])
                rid = job.requests.add(
                    url="http://test.com/%d" % idx,
                    status=random.randint(100, 1000),
                    method=random.choice(httpmethods),
                    rs=random.randint(0, 100000),
                    duration=random.randint(0, 1000),
                    parent=random.randrange(0, idx + 1) if idx > 10 else None,
                    ts=millitime() + random.randint(100, 100000),
                )
                self.assertEqual(iid, idx)
                self.assertEqual(sid, idx)
                self.assertEqual(rid, idx)

            if isinstance(close_reason, Exception):
                raise close_reason

            if close_reason:
                job.metadata["close_reason"] = close_reason

            job.metadata.save()
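
Outside the test harness, the three auth levels exercised by SystemTest look roughly like the sketch below; the endpoint, token, project id and spider name are placeholders, and it assumes client.start_job() pops the next pending job from the jobq, as the runner helper does in the test:

from hubstorage import HubstorageClient

ENDPOINT = "http://localhost:8003"   # placeholder endpoint
USER_AUTH = "user-apikey"            # placeholder user/project token
PROJECTID = "1111111"                # placeholder project id

# Panel: client carries no auth, the project is opened with the user token.
panel = HubstorageClient(endpoint=ENDPOINT)
project = panel.get_project(PROJECTID, auth=USER_AUTH)
pushed = project.jobq.push("myspider")            # job is now 'pending'

# Runner: client uses global auth to poll the jobq and start the job.
runner = HubstorageClient(endpoint=ENDPOINT, auth=USER_AUTH)
job = runner.start_job(projectid=PROJECTID)       # assumed to return the pending job
job.metadata.update(host="localhost", slot=1)

# Scraper: no global or project auth, only the job-level auth pair.
scraper = HubstorageClient(endpoint=ENDPOINT)
scraper_job = scraper.get_job(job.key, auth=job.jobauth)
scraper_job.items.write({"uuid": 0})
scraper_job.metadata.save()

# Runner closes the job once the scraper is done.
job.jobq.finish(job, close_reason="all-good")
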
Exemplo n.º 47
0
class HcfMiddleware(object):
    def __init__(self, crawler):

        self.crawler = crawler
        hs_endpoint = self._get_config(crawler, "HS_ENDPOINT")
        hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_project_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_project_slot = self._get_config(crawler, "HS_SLOT")
        # Max number of batches to read from the HCF within a single run.
        try:
            self.hs_max_batches = int(
                crawler.settings.get("HS_MAX_BATCHES", DEFAULT_MAX_BATCHES))
        except ValueError:
            self.hs_max_batches = DEFAULT_MAX_BATCHES
        self.hs_start_job_on_reason = crawler.settings.get(
            "HS_START_JOB_ON_REASON", [])

        self.hsclient = HubstorageClient(auth=hs_auth, endpoint=hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(list)
        self.batch_ids = []

        crawler.signals.connect(self.idle_spider, signals.spider_idle)
        crawler.signals.connect(self.close_spider, signals.spider_closed)

    def _get_config(self, crawler, key):
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg):
        log.msg('(HCF) %s' % msg)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):

        self.hs_frontier = getattr(spider, 'frontier',
                                   self.hs_project_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_slot = getattr(spider, 'slot', self.hs_project_slot)
        self._msg('Using HS_SLOT=%s' % self.hs_slot)

        has_new_requests = False
        for req in self._get_new_requests():
            has_new_requests = True
            yield req

        # if there are no links in the hcf, use the start_requests
        if not has_new_requests:
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                # XXX: Only GET support for now.
                if request.method == 'GET' and request.meta.get('use_hcf', False):
                    slot = slot_callback(request)
                    hcf_params = request.meta.get('hcf_params')
                    fp = {'fp': request.url}
                    if hcf_params:
                        fp.update(hcf_params)
                    self.new_links[slot].append(fp)
                else:
                    yield item
            else:
                yield item

    def idle_spider(self, spider):
        self._save_new_links()
        self.fclient.flush()
        self._delete_processed_ids()
        has_new_requests = False
        for request in self._get_new_requests():
            self.crawler.engine.schedule(request, spider)
            has_new_requests = True
        if has_new_requests:
            raise DontCloseSpider

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally. If it
        # didn't finish properly there is no way to know whether all the url
        # batches were processed, so it is better not to delete them from the
        # frontier (they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links()
            self._delete_processed_ids()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished. The idea is to limit
        # every spider's runtime (either via itemcount, pagecount or timeout) and
        # then have the old spider start a new one to take its place in the slot.
        if reason in self.hs_start_job_on_reason:
            self._msg("Starting new job: %s" % spider.name)
            job = self.hsclient.start_job(projectid=self.hs_projectid,
                                          spider=spider.name)
            self._msg("New job started: %s" % job)
        self.fclient.close()
        self.hsclient.close()

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(
                self.fclient.read(self.hs_frontier, self.hs_slot), 1):
            for r in batch['requests']:
                num_links += 1
                yield Request(r[0])
            self.batch_ids.append(batch['id'])
            if num_batches >= self.hs_max_batches:
                break
        self._msg('Read %d new batches from slot(%s)' %
                  (num_batches, self.hs_slot))
        self._msg('Read %d new links from slot(%s)' %
                  (num_links, self.hs_slot))

    def _save_new_links(self):
        """ Save the new extracted links into the HCF."""
        for slot, fps in self.new_links.items():
            self.fclient.add(self.hs_frontier, slot, fps)
            self._msg('Stored %d new links in slot(%s)' % (len(fps), slot))
        self.new_links = defaultdict(list)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' %
                  (len(self.batch_ids), self.hs_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine to which slot should be saved the request."""
        return '0'
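
A hypothetical Scrapy configuration sketch for the middleware above; the middleware module path and the concrete values are assumptions, while the setting names, request meta keys and defaults come from the constructor and process_spider_output():

# settings.py (sketch)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.HcfMiddleware': 543,   # assumed module path and order
}
HS_ENDPOINT = 'http://storage.scrapinghub.com'    # placeholder endpoint
HS_AUTH = '<apikey>'                              # placeholder auth token
HS_PROJECTID = '12345'                            # placeholder project id
HS_FRONTIER = 'test'
HS_SLOT = '0'
HS_MAX_BATCHES = 10                               # optional, falls back to DEFAULT_MAX_BATCHES
HS_START_JOB_ON_REASON = ['closespider_itemcount', 'closespider_timeout']

# In a spider callback, a link is routed to the frontier instead of being
# scheduled locally by setting meta['use_hcf']; any 'hcf_params' dict is merged
# as-is into the fingerprint that process_spider_output() stores:
#     yield Request(url, meta={'use_hcf': True})
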
Exemplo n.º 48
0
from hubstorage import HubstorageClient
import requests

apikey = 'ab51ccfb248b4783bc710c25ae09f8db'

hc = HubstorageClient(auth=apikey)


def listJobs():
    """List the job summaries in the project's jobq."""
    jobs = hc.get_project('48869').jobq.list()
    return jobs


def getItems(job):
    """Return the items stored for the given job key."""
    items = hc.get_job(job).items.list()
    return items


def deleteJob(job):
    """Purge (delete) the given job."""
    job = hc.get_job(job)
    job.purged()


def getItemCsv(job):
    """Download the job's items as CSV straight from the storage API."""
    fields = 'cod_prom,data_prom,name,nm_prom,url_img,url_prom,valor'
    itemsCsv = requests.get('https://storage.scrapinghub.com/items/' + job +
                            '?apikey=' + apikey + '&format=csv&fields=' +
                            fields)
    return itemsCsv
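
A short usage sketch tying the helpers above together, assuming the project has at least one job and that jobq.list() yields job summaries with a 'key' field, as in the other examples:

if __name__ == '__main__':
    for summary in listJobs():
        key = summary['key']
        items = getItems(key)
        print('%s: %d items' % (key, len(items)))
        # Save the CSV export next to the script, one file per job.
        csv_resp = getItemCsv(key)
        with open(key.replace('/', '_') + '.csv', 'wb') as f:
            f.write(csv_resp.content)
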