Example #1
    def initialize_hubstorage_collection(self):
        self.hs_client = HubstorageClient(self.settings.get('HS_AUTH'))
        self.hs_projectid = os.environ.get('SCRAPY_PROJECT_ID')
        if self.hs_projectid is None:
            self.hs_projectid = self.settings.get('HS_PROJECTID')
        collections = self.hs_client.get_project(self.hs_projectid).collections
        self.hs_collection = collections.new_store(self.name)
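
Once created, the store supports key-based writes and reads. A minimal usage sketch (the record fields are illustrative, following the collections API shown in Example #3):

        # Hypothetical usage of the store created above, from within the same class.
        self.hs_collection.set({'_key': 'item-1', 'value': 'example'})
        record = self.hs_collection.get('item-1')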
Example #2
    def test_metadata_save_does_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }
        callback_get, attempts_count_get = self.make_request_callback(
            0, job_metadata)
        callback_post, attempts_count_post = self.make_request_callback(
            2, job_metadata)

        self.mock_api(method=GET, callback=callback_get)
        self.mock_api(method=POST, callback=callback_post)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))
        job.metadata['foo'] = 'bar'
        job.metadata.save()

        # Assert
        self.assertEqual(attempts_count_post[0], 3)
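
The `mock_api` and `make_request_callback` helpers used by these retry tests are not shown. Judging from their call sites (the inline callback in Example #15 and the 504 assertion in Example #14), `make_request_callback(n, body)` appears to return a `responses`-style callback that fails the first n attempts, plus a one-element attempt counter. A plausible sketch, assuming the `responses` mocking library and a `json` import:

    def make_request_callback(self, timeout_count, body_on_success):
        # A one-element list gives the closure a mutable counter,
        # the same trick as attempts_count in Example #15.
        attempts_count = [0]

        def request_callback(request):
            attempts_count[0] += 1
            if attempts_count[0] <= timeout_count:
                return (504, {}, u'')  # simulated gateway timeout
            return (200, {}, json.dumps(body_on_success))

        return request_callback, attempts_count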
Example #3
    def test_collection_store_and_delete_are_retried(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3)

        callback_post, attempts_count_post = self.make_request_callback(2, [])
        callback_delete, attempts_count_delete = self.make_request_callback(
            2, [])

        self.mock_api(method=POST,
                      callback=callback_delete,
                      url_match='/.*/deleted')
        self.mock_api(
            method=POST, callback=callback_post
        )  # /!\ default regexp matches all paths, has to be added last

        # Act
        project = client.get_project(self.projectid)
        store = project.collections.new_store('foo')
        store.set({'_key': 'bar', 'content': 'value'})
        store.delete('baz')

        # Assert
        self.assertEqual(attempts_count_post[0], 3)
        self.assertEqual(attempts_count_delete[0], 3)
Example #4
    def test_delete_on_hubstorage_api_does_not_404(self):
        # NOTE: The current Hubstorage API does not raise 404 errors when deleting
        #       resources that do not exist, so the retry policy does not have to
        #       catch 404 errors when retrying deletes (which simplifies the
        #       implementation a lot). This test checks that this assumption holds.

        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=0)
        project = client.get_project(projectid=self.projectid)

        # Check frontier delete
        project.frontier.delete_slot('frontier_non_existing',
                                     'slot_non_existing')

        # Check metadata delete
        job = client.push_job(self.projectid, self.spidername)
        # Add then delete a key; this triggers an API delete for item 'foo'
        job.metadata['foo'] = 'bar'
        del job.metadata['foo']
        job.metadata.save()

        # Check collections delete
        store = project.collections.new_store('foo')
        store.set({'_key': 'foo'})
        store.delete('bar')

        self.assertTrue(
            True,
            "No error has been raised by calling delete on resources that do not exist"
        )
Example #5
def get_scraped_data(out_dir, items_job, key, spider):
    # establish a connection with Scrapinghub and iterate over the
    # finished jobs of the given spider
    hc = HubstorageClient(auth=key)

    empty, totalItems, keptItems = 0, 0, 0
    for job in hc.get_project(items_job).jobq.list(spider=spider):
        for item in hc.get_job(job['key']).items.list():

            totalItems += 1
            item = pd.Series(item)
            # keep only items with a non-blank title and article
            if item['title'].strip() != '' and item['article'].strip() != '':
                item['spider'] = spider
                item = item.drop('category')
                item = item.replace(["page1", "page2", "page3", "scrape_time", "", "basic"],
                                    [np.nan, np.nan, np.nan, np.nan, np.nan, "reutersbasic"])
                # strip HTML tags and bracketed/parenthesised fragments
                item = item.replace({r'<.*?>': '', r'\[.*?\]': '', r'\(.*?\)': ''}, regex=True)

                # add the article hash code as the id of the article
                item['id'] = hash(item['article'])

                # write the item (as records) to a json file
                file = out_dir + 'raw/' + str(item['id']) + '.json'
                item.to_json(file)

                keptItems += 1

            else:
                empty += 1

    print('#' * 50)
    print('Fetched:', totalItems, 'from spider:', spider)
    print(keptItems, 'were written to the folder')
    print('-' * 50, '\n\n')
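
A hypothetical invocation, with placeholder directory, project id, API key and spider name; out_dir must already contain a raw/ subdirectory:

# Hypothetical call to the helper above.
get_scraped_data('data/', '12345', 'YOUR_API_KEY', 'reuters')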
Example #6
    def test_api_delete_can_be_set_to_non_idempotent(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }
        callback_delete, attempts_count_delete = self.make_request_callback(
            2, job_metadata)

        self.mock_api(method=DELETE, callback=callback_delete)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        err = None
        try:
            job.metadata.apidelete('/my/non/idempotent/delete/',
                                   is_idempotent=False)
        except HTTPError as e:
            err = e

        # Assert
        self.assertEqual(attempts_count_delete[0], 1)
        self.assertIsNotNone(err)
Example #7
    def __init__(self, auth, project_id, colname, cache_size_limit,
                 cleanup_on_start):
        super(HCFStates, self).__init__(cache_size_limit)
        self._hs_client = HubstorageClient(auth=auth)
        self.projectid = project_id
        project = self._hs_client.get_project(self.projectid)
        self._collections = project.collections
        self._colname = colname + "_states"
        self.logger = logging.getLogger("hcf.states")

        if cleanup_on_start:
            self._cleanup()
Example #8
    def __init__(self,
                 auth,
                 project_id,
                 frontier,
                 batch_size=0,
                 flush_interval=30):
        self._hs_client = HubstorageClient(auth=auth)
        self._hcf = self._hs_client.get_project(project_id).frontier
        self._hcf.batch_size = batch_size
        self._hcf.batch_interval = flush_interval
        self._frontier = frontier
        self._links_count = defaultdict(int)
        self._links_to_flush_count = defaultdict(int)
        self._hcf_retries = 10
        self.logger = logging.getLogger("hubstorage-wrapper")
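
The constructor above only wires up the frontier client; a method that actually schedules links would presumably go through Frontier.add and track the per-slot counts. A minimal sketch, assuming a hypothetical add_links method (the slot name and fingerprint format are illustrative):

    def add_links(self, slot, fingerprints):
        # Hypothetical helper: schedule fingerprints into one slot of the
        # wrapped frontier and record how many are waiting to be flushed.
        fps = [{'fp': fp} for fp in fingerprints]
        self._hcf.add(self._frontier, slot, fps)
        self._links_to_flush_count[slot] += len(fps)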
Example #9
def run_job(project, timeout, auth, **kwargs):
    hc = HubstorageClient(auth=auth)
    project = hc.get_project(project)
    key = project.push_job('py:run_pipeline.py', **kwargs).key

    running = True
    stop_at = datetime.now() + timedelta(seconds=timeout)
    while running:
        running = project.get_job(key).metadata['state'] in ('pending',
                                                             'running')
        print('Still running')
        time.sleep(5)  # poll interval (assumed); avoids a busy-wait against the API

        if datetime.now() > stop_at:
            print('Timeout exceeded')
            running = False
    print('Finished')
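
A hypothetical invocation, with placeholder project id and API key:

# Hypothetical call: wait up to ten minutes for the pushed job to finish.
run_job('12345', timeout=600, auth='YOUR_API_KEY')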
Example #10
    def __init__(self, crawler):

        self.crawler = crawler
        self.hs_endpoint = crawler.settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(crawler, "HS_AUTH")
        self.hs_projectid = self._get_config(crawler, "HS_PROJECTID")
        self.hs_frontier = self._get_config(crawler, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(crawler,
                                                     "HS_CONSUME_FROM_SLOT")
        try:
            self.hs_number_of_slots = int(
                crawler.settings.get("HS_NUMBER_OF_SLOTS",
                                     DEFAULT_HS_NUMBER_OF_SLOTS))
        except ValueError:
            self.hs_number_of_slots = DEFAULT_HS_NUMBER_OF_SLOTS
        try:
            self.hs_max_links = int(
                crawler.settings.get("HS_MAX_LINKS", DEFAULT_MAX_LINKS))
        except ValueError:
            self.hs_max_links = DEFAULT_MAX_LINKS
        self.hs_start_job_enabled = crawler.settings.get(
            "HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = crawler.settings.get(
            "HS_START_JOB_ON_REASON", ['finished'])
        self.hs_start_job_new_panel = crawler.settings.get(
            "HS_START_JOB_NEW_PANEL", False)

        if not self.hs_start_job_new_panel:
            conn = Connection(self.hs_auth)
            self.oldpanel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth,
                                         endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links_count = defaultdict(int)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()
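
The `_get_config` helper used above is not shown; given that it fetches required settings, it presumably fails fast when a value is missing. A minimal sketch of what it could look like, assuming Scrapy's NotConfigured exception:

    def _get_config(self, crawler, key):
        # Hypothetical helper: fetch a required setting, raising
        # NotConfigured (from scrapy.exceptions) when it is absent.
        value = crawler.settings.get(key)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value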
Example #11
    def _run_runner(self, pushed, close_reason):
        client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        with closing(client) as runnerclient:
            job = self.start_job()
            self.assertFalse(job.metadata.get('stop_requested'))
            job.metadata.update(host='localhost', slot=1)
            self.assertEqual(job.metadata.get('state'), 'running')
            # run scraper
            try:
                self._run_scraper(job.key, job.jobauth, close_reason=close_reason)
            except Exception as exc:
                job.logs.error(message=str(exc), appendmode=True)
                job.close_writers()
                job.jobq.finish(job, close_reason='failed')
                # logging from runner must append and never remove messages logged
                # by scraper
                self.assertTrue(job.logs.batch_append)
            else:
                job.jobq.finish(job, close_reason=close_reason or 'no_reason')
Example #12
    def test_auth(self):
        # client without global auth set
        hsc = HubstorageClient(endpoint=self.hsclient.endpoint)
        self.assertEqual(hsc.auth, None)

        # check no-auth access
        try:
            hsc.push_job(self.projectid, self.spidername)
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).push_job(self.spidername)
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_job((self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        try:
            hsc.get_project(self.projectid).get_job(
                (self.projectid, 1, 1)).items.list()
        except HTTPError as exc:
            self.assertEqual(exc.response.status_code, 401)
        else:
            self.assertTrue(False, '401 not raised')

        # create project with auth
        auth = self.hsclient.auth
        project = hsc.get_project(self.projectid, auth)
        self.assertEqual(project.auth, auth)
        job = project.push_job(self.spidername)
        samejob = project.get_job(job.key)
        self.assertEqual(samejob.key, job.key)
Example #13
    def test_get_job_does_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }
        callback, attempts_count = self.make_request_callback(2, job_metadata)

        self.mock_api(callback=callback)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        # Assert
        self.assertEqual(dict(job_metadata), dict(job.metadata))
        self.assertEqual(attempts_count[0], 3)
Example #14
    def test_push_job_does_not_retry(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3)
        callback, attempts_count = self.make_request_callback(
            2, {'key': '1/2/3'})

        self.mock_api(POST, callback=callback)

        # Act
        job, err = None, None
        try:
            job = client.push_job(self.projectid, self.spidername)
        except HTTPError as e:
            err = e

        # Assert
        self.assertIsNone(job)
        self.assertIsNotNone(err)
        self.assertEqual(err.response.status_code, 504)
        self.assertEqual(attempts_count[0], 1)
Example #15
    def test_retrier_catches_badstatusline_and_429(self):
        # Prepare
        client = HubstorageClient(auth=self.auth,
                                  endpoint=self.endpoint,
                                  max_retries=3,
                                  max_retry_time=1)
        job_metadata = {
            'project': self.projectid,
            'spider': self.spidername,
            'state': 'pending'
        }

        attempts_count = [0]  # use a list for nonlocal mutability in request_callback

        def request_callback(request):
            attempts_count[0] += 1

            if attempts_count[0] <= 2:
                raise ConnectionError("Connection aborted.",
                                      BadStatusLine("''"))
            if attempts_count[0] == 3:
                return (429, {}, u'')
            else:
                resp_body = dict(job_metadata)
                return (200, {}, json.dumps(resp_body))

        self.mock_api(callback=request_callback)

        # Act
        job = client.get_job('%s/%s/%s' % (self.projectid, self.spiderid, 42))

        # Assert
        self.assertEqual(dict(job_metadata), dict(job.metadata))
        self.assertEqual(attempts_count[0], 4)
Example #16
    def __init__(self, product_name, apikey, project_id, hours):
        self.product_name = product_name
        project = HubstorageClient(apikey).get_project(project_id)
        self.item_store = project.collections.new_store(product_name)
        self.load_items_from_last_n_hours(hours)
Example #17
    def test_custom_ua(self):
        client = HubstorageClient(auth=HSTestCase.auth,
                                  endpoint=HSTestCase.endpoint,
                                  user_agent='testUA')
        self.assertEqual(client.user_agent, 'testUA')
Example #18
    @classmethod
    def setUpClass(cls):
        cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
        cls.project = cls.hsclient.get_project(cls.projectid)
        cls.spiderid = str(cls.project.ids.spider(cls.spidername, create=1))
        cls._set_testbotgroup()
Example #19
File: hub.py Project: ranisalt/cagr-api
    def __init__(self, project: str, spider: str):
        hc = HubstorageClient(auth=shub_cfg.get('apikey'))
        key = next(hc.get_project(project).jobq.list(spider=spider)).get('key')
        self.job = hc.get_job(key)
Example #20
    def open_spider(self, spider):
        # NOTE: the auth token and project id were inlined in the original
        # source; in practice they would come from the project settings.
        client = HubstorageClient(auth='021fda8005214eac944950e2e96ffc92')
        project = client.get_project('301640')
        self.data_stores = {}
        for product_name in get_product_names():
            self.data_stores[product_name] = project.collections.new_store(product_name)
Example #21
    def setUp(self):
        super(HSTestCase, self).setUp()
        self.endpoint = self.hsclient.endpoint
        # Panel - no client auth, only project auth using user auth token
        self.panelclient = HubstorageClient(endpoint=self.endpoint)
        self.panelproject = self.panelclient.get_project(self.projectid, auth=self.auth)
Example #22
    @classmethod
    def setUpClass(cls):
        cls.endpoint = HS_ENDPOINT
        cls.auth = HS_AUTH
        cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
        cls.project = cls.hsclient.get_project(cls.projectid)
        cls.fclient = cls.project.frontier
Example #23
from hubstorage import HubstorageClient
import requests

apikey = 'ab51ccfb248b4783bc710c25ae09f8db'

hc = HubstorageClient(auth=apikey)


def listJobs():
    jobs = hc.get_project('48869').jobq.list()
    return jobs


def getItems(job):
    items = hc.get_job(job).items.list()
    return items


def deleteJob(job):
    job = hc.get_job(job)
    job.purged()


def getItemCsv(job):
    fields = 'cod_prom,data_prom,name,nm_prom,url_img,url_prom,valor'
    itemsCsv = requests.get('https://storage.scrapinghub.com/items/' + job,
                            params={'apikey': apikey,
                                    'format': 'csv',
                                    'fields': fields})
    return itemsCsv
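
A hypothetical way to chain these helpers, printing each job's key and item count:

# Hypothetical usage of the helpers above.
for job in listJobs():
    print(job['key'], len(getItems(job['key'])))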