def initialize_hubstorage_collection(self):
    self.hs_client = HubstorageClient(self.settings.get('HS_AUTH'))
    self.hs_projectid = os.environ.get('SCRAPY_PROJECT_ID')
    if self.hs_projectid is None:
        self.hs_projectid = self.settings.get('HS_PROJECTID')
    collections = self.hs_client.get_project(self.hs_projectid).collections
    self.hs_collection = collections.new_store(self.name)
def test_metadata_save_does_retry(self):
    # Prepare
    client = HubstorageClient(auth=self.auth,
                              endpoint=self.endpoint,
                              max_retries=3)
    job_metadata = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending'
    }
    callback_get, attempts_count_get = self.make_request_callback(
        0, job_metadata)
    callback_post, attempts_count_post = self.make_request_callback(
        2, job_metadata)

    self.mock_api(method=GET, callback=callback_get)
    self.mock_api(method=POST, callback=callback_post)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
    job.metadata['foo'] = 'bar'
    job.metadata.save()

    # Assert
    self.assertEqual(attempts_count_post[0], 3)
def test_collection_store_and_delete_are_retried(self):
    # Prepare
    client = HubstorageClient(auth=self.auth,
                              endpoint=self.endpoint,
                              max_retries=3)
    callback_post, attempts_count_post = self.make_request_callback(2, [])
    callback_delete, attempts_count_delete = self.make_request_callback(2, [])

    self.mock_api(method=POST, callback=callback_delete,
                  url_match='/.*/deleted')
    # /!\ the default regexp matches all paths, so it has to be added last
    self.mock_api(method=POST, callback=callback_post)

    # Act
    project = client.get_project(self.projectid)
    store = project.collections.new_store('foo')
    store.set({'_key': 'bar', 'content': 'value'})
    store.delete('baz')

    # Assert
    self.assertEqual(attempts_count_post[0], 3)
    self.assertEqual(attempts_count_delete[0], 3)
def test_delete_on_hubstorage_api_does_not_404(self):
    # NOTE: The current Hubstorage API does not raise 404 errors when deleting
    # resources that do not exist, so the retry policy does not need to catch
    # 404 errors when retrying deletes (which simplifies the implementation a lot).
    # This test checks that this assumption holds.
    client = HubstorageClient(auth=self.auth,
                              endpoint=self.endpoint,
                              max_retries=0)
    project = client.get_project(projectid=self.projectid)

    # Check frontier delete
    project.frontier.delete_slot('frontier_non_existing', 'slot_non_existing')

    # Check metadata delete
    job = client.push_job(self.projectid, self.spidername)
    # Add then delete a key; this triggers an API delete for item 'foo'
    job.metadata['foo'] = 'bar'
    del job.metadata['foo']
    job.metadata.save()

    # Check collections delete
    store = project.collections.new_store('foo')
    store.set({'_key': 'foo'})
    store.delete('bar')

    self.assertTrue(
        True,
        "No error has been triggered by calling delete on resources that do not exist"
    )
def get_scraped_data(dir, items_job, key, spider):
    # Establish a connection with Scrapinghub and get an items generator
    hc = HubstorageClient(auth=key)
    empty, totalItems, keptItems = 0, 0, 0
    for job in hc.get_project(items_job).jobq.list(spider=spider):
        for item in hc.get_job(job['key']).items.list():
            totalItems += 1
            item = pd.Series(item)
            if item['title'] != '' and item['article'] != '' and \
                    item['title'] != ' ' and item['article'] != ' ':
                item['spider'] = spider
                item = item.drop('category')
                item = item.replace(
                    ["page1", "page2", "page3", "scrape_time", "", "basic"],
                    [np.nan, np.nan, np.nan, np.nan, np.nan, "reutersbasic"])
                item = item.replace({r'<.*?>': '', r'\[.*?\]': '', r'\(.*?\)': ''},
                                    regex=True)
                # Add the article hash code as the id of the article
                item['id'] = hash(item['article'])
                # Write the item (as records) to a json file
                file = dir + 'raw/' + str(item['id']) + '.json'
                item.to_json(file)
                keptItems += 1
            else:
                empty += 1
    print '#' * 50
    print 'Fetched: ', totalItems, ' from spider: ', spider
    print keptItems, ' were written to the folder'
    print '-' * 50, '\n\n'
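# Example call (illustrative values for the output directory, project id,
# API key, and spider name; none of these come from the original script):
# get_scraped_data('data/', '12345', 'your-apikey', 'reutersbasic')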
def test_api_delete_can_be_set_to_non_idempotent(self):
    # Prepare
    client = HubstorageClient(auth=self.auth,
                              endpoint=self.endpoint,
                              max_retries=3)
    job_metadata = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending'
    }
    callback_delete, attempts_count_delete = self.make_request_callback(
        2, job_metadata)

    self.mock_api(method=DELETE, callback=callback_delete)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

    err = None
    try:
        job.metadata.apidelete('/my/non/idempotent/delete/',
                               is_idempotent=False)
    except HTTPError as e:
        err = e

    # Assert
    self.assertEqual(attempts_count_delete[0], 1)
    self.assertIsNotNone(err)
def __init__(self, auth, project_id, colname, cache_size_limit,
             cleanup_on_start):
    super(HCFStates, self).__init__(cache_size_limit)
    self._hs_client = HubstorageClient(auth=auth)
    self.projectid = project_id
    project = self._hs_client.get_project(self.projectid)
    self._collections = project.collections
    self._colname = colname + "_states"
    self.logger = logging.getLogger("hcf.states")
    if cleanup_on_start:
        self._cleanup()
def __init__(self, auth, project_id, frontier, batch_size=0,
             flush_interval=30):
    self._hs_client = HubstorageClient(auth=auth)
    self._hcf = self._hs_client.get_project(project_id).frontier
    self._hcf.batch_size = batch_size
    self._hcf.batch_interval = flush_interval
    self._frontier = frontier
    self._links_count = defaultdict(int)
    self._links_to_flush_count = defaultdict(int)
    self._hcf_retries = 10
    self.logger = logging.getLogger("hubstorage-wrapper")
def run_job(project, timeout, auth, **kwargs):
    hc = HubstorageClient(auth=auth)
    project = hc.get_project(project)
    key = project.push_job('py:run_pipeline.py', **kwargs).key
    running = True
    stop_at = datetime.now() + timedelta(seconds=timeout)
    while running:
        running = project.get_job(key).metadata['state'] in ('pending', 'running')
        print('Still running')
        if datetime.now() > stop_at:
            print('Timeout exceeded')
            running = False
    print('Finished')
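# Example call (illustrative project id, API key, and job field; any extra
# keyword arguments are forwarded to project.push_job() as job fields):
# run_job('12345', timeout=3600, auth='your-apikey', units=1)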
def __init__(self, crawler):
    self.crawler = crawler
    self.hs_endpoint = crawler.settings.get("HS_ENDPOINT")
    self.hs_auth = self._get_config(crawler, "HS_AUTH")
    self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
    self.hs_frontier = self._get_config(crawler, "HS_FRONTIER")
    self.hs_consume_from_slot = self._get_config(crawler, "HS_CONSUME_FROM_SLOT")
    try:
        self.hs_number_of_slots = int(
            crawler.settings.get("HS_NUMBER_OF_SLOTS",
                                 DEFAULT_HS_NUMBER_OF_SLOTS))
    except ValueError:
        self.hs_number_of_slots = DEFAULT_HS_NUMBER_OF_SLOTS
    try:
        self.hs_max_links = int(
            crawler.settings.get("HS_MAX_LINKS", DEFAULT_MAX_LINKS))
    except ValueError:
        self.hs_max_links = DEFAULT_MAX_LINKS
    self.hs_start_job_enabled = crawler.settings.get(
        "HS_START_JOB_ENABLED", False)
    self.hs_start_job_on_reason = crawler.settings.get(
        "HS_START_JOB_ON_REASON", ['finished'])
    self.hs_start_job_new_panel = crawler.settings.get(
        "HS_START_JOB_NEW_PANEL", False)

    if not self.hs_start_job_new_panel:
        conn = Connection(self.hs_auth)
        self.oldpanel_project = conn[self.hs_projectid]

    self.hsclient = HubstorageClient(auth=self.hs_auth,
                                     endpoint=self.hs_endpoint)
    self.project = self.hsclient.get_project(self.hs_projectid)
    self.fclient = self.project.frontier

    self.new_links_count = defaultdict(int)
    self.batch_ids = []

    crawler.signals.connect(self.close_spider, signals.spider_closed)

    # Make sure the logger for hubstorage.batchuploader is configured
    logging.basicConfig()
def _run_runner(self, pushed, close_reason):
    client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
    with closing(client) as runnerclient:
        job = self.start_job()
        self.assertFalse(job.metadata.get('stop_requested'))
        job.metadata.update(host='localhost', slot=1)
        self.assertEqual(job.metadata.get('state'), 'running')
        # run scraper
        try:
            self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.logs.error(message=str(exc), appendmode=True)
            job.close_writers()
            job.jobq.finish(job, close_reason='failed')
            # logging from the runner must append and never remove messages
            # logged by the scraper
            self.assertTrue(job.logs.batch_append)
        else:
            job.jobq.finish(job, close_reason=close_reason or 'no_reason')
def test_auth(self):
    # client without global auth set
    hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
    self.assertEqual(hsc.auth, None)

    # check no-auth access
    try:
        hsc.push_job(self.projectid, self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).push_job(self.spidername)
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_job((self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    try:
        hsc.get_project(self.projectid).get_job(
            (self.projectid, 1, 1)).items.list()
    except HTTPError as exc:
        self.assertEqual(exc.response.status_code, 401)
    else:
        self.assertTrue(False, '401 not raised')

    # create project with auth
    auth = self.hsclient.auth
    project = hsc.get_project(self.projectid, auth)
    self.assertEqual(project.auth, auth)
    job = project.push_job(self.spidername)
    samejob = project.get_job(job.key)
    self.assertEqual(samejob.key, job.key)
def test_get_job_does_retry(self):
    # Prepare
    client = HubstorageClient(auth=self.auth,
                              endpoint=self.endpoint,
                              max_retries=3)
    job_metadata = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending'
    }
    callback, attempts_count = self.make_request_callback(2, job_metadata)

    self.mock_api(callback=callback)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

    # Assert
    self.assertEqual(dict(job_metadata), dict(job.metadata))
    self.assertEqual(attempts_count[0], 3)
def test_push_job_does_not_retry(self):
    # Prepare
    client = HubstorageClient(auth=self.auth,
                              endpoint=self.endpoint,
                              max_retries=3)
    callback, attempts_count = self.make_request_callback(2, {'key': '1/2/3'})

    self.mock_api(POST, callback=callback)

    # Act
    job, err = None, None
    try:
        job = client.push_job(self.projectid, self.spidername)
    except HTTPError as e:
        err = e

    # Assert
    self.assertIsNone(job)
    self.assertIsNotNone(err)
    self.assertEqual(err.response.status_code, 504)
    self.assertEqual(attempts_count[0], 1)
def test_retrier_catches_badstatusline_and_429(self):
    # Prepare
    client = HubstorageClient(auth=self.auth,
                              endpoint=self.endpoint,
                              max_retries=3,
                              max_retry_time=1)
    job_metadata = {
        'project': self.projectid,
        'spider': self.spidername,
        'state': 'pending'
    }
    # use a list for nonlocal mutability inside request_callback
    attempts_count = [0]

    def request_callback(request):
        attempts_count[0] += 1

        if attempts_count[0] <= 2:
            raise ConnectionError("Connection aborted.", BadStatusLine("''"))
        if attempts_count[0] == 3:
            return (429, {}, u'')
        else:
            resp_body = dict(job_metadata)
            return (200, {}, json.dumps(resp_body))

    self.mock_api(callback=request_callback)

    # Act
    job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

    # Assert
    self.assertEqual(dict(job_metadata), dict(job.metadata))
    self.assertEqual(attempts_count[0], 4)
def __init__(self, product_name, apikey, project_id, hours):
    self.product_name = product_name
    project = HubstorageClient(apikey).get_project(project_id)
    self.item_store = project.collections.new_store(product_name)
    self.load_items_from_last_n_hours(hours)
def test_custom_ua(self):
    client = HubstorageClient(auth=HSTestCase.auth,
                              endpoint=HSTestCase.endpoint,
                              user_agent='testUA')
    self.assertEqual(client.user_agent, 'testUA')
def setUpClass(cls):
    cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
    cls.project = cls.hsclient.get_project(cls.projectid)
    cls.spiderid = str(cls.project.ids.spider(cls.spidername, create=1))
    cls._set_testbotgroup()
def __init__(self, project: str, spider: str):
    hc = HubstorageClient(auth=shub_cfg.get('apikey'))
    key = next(hc.get_project(project).jobq.list(spider=spider)).get('key')
    self.job = hc.get_job(key)
def open_spider(self, spider):
    client = HubstorageClient(auth='021fda8005214eac944950e2e96ffc92')
    project = client.get_project('301640')
    self.data_stores = {}
    for product_name in get_product_names():
        self.data_stores[product_name] = project.collections.new_store(product_name)
def setUp(self):
    super(HSTestCase, self).setUp()
    self.endpoint = self.hsclient.endpoint
    # Panel - no client auth, only project auth using the user auth token
    self.panelclient = HubstorageClient(endpoint=self.endpoint)
    self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
def setUpClass(cls):
    cls.endpoint = HS_ENDPOINT
    cls.auth = HS_AUTH
    cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
    cls.project = cls.hsclient.get_project(cls.projectid)
    cls.fclient = cls.project.frontier
from hubstorage import HubstorageClient
import requests

apikey = 'ab51ccfb248b4783bc710c25ae09f8db'
hc = HubstorageClient(auth=apikey)


def listJobs():
    jobs = hc.get_project('48869').jobq.list()
    return jobs


def getItems(job):
    items = hc.get_job(job).items.list()
    return items


def deleteJob(job):
    job = hc.get_job(job)
    job.purged()


def getItemCsv(job):
    fields = 'cod_prom,data_prom,name,nm_prom,url_img,url_prom,valor'
    itemsCsv = requests.get('https://storage.scrapinghub.com/items/' + job +
                            '?apikey=' + apikey + '&format=csv&fields=' + fields)
    return itemsCsv
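# Example usage sketch (assumes jobq.list() yields dicts with a 'key' field,
# as in the Hubstorage API; the output filenames below are illustrative and
# not part of the original script): dump each job's items as a CSV file.
if __name__ == '__main__':
    for j in listJobs():
        csv_response = getItemCsv(j['key'])
        with open(j['key'].replace('/', '_') + '.csv', 'wb') as f:
            f.write(csv_response.content)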