Example #1
def get_job(job):
    jobid, apikey = get_job_specs(job)
    hsc = HubstorageClient(auth=apikey)
    job = hsc.get_job(jobid)
    if not job.metadata:
        raise NotFoundException('Job {} does not exist'.format(jobid))
    return job
Example #2
class _HubstorageRef(object):
    def __init__(self):
        self.enabled = 'SHUB_JOBKEY' in os.environ
        self._client = None
        self._project = None
        self._job = None
        if self.enabled:
            self.jobkey = os.environ['SHUB_JOBKEY']
            job_id = [int(id) for id in self.jobkey.split('/')]
            self._projectid, self._spiderid, self._jobcounter = job_id
        else:
            self._projectid = None
            self._spiderid = None
            self._jobcounter = None

    @property
    def auth(self):
        return to_native_str(decode(os.environ['SHUB_JOBAUTH'], 'hex_codec'))

    @property
    def endpoint(self):
        return os.environ.get('SHUB_STORAGE')

    @property
    def projectid(self):
        return self._projectid

    @property
    def spiderid(self):
        return self._spiderid

    @property
    def jobid(self):
        return self._jobcounter

    @property
    def client(self):
        from scrapinghub import HubstorageClient
        if self._client is None:
            user_agent = os.environ.get('SHUB_HS_USER_AGENT')
            self._client = HubstorageClient(endpoint=self.endpoint,
                                            auth=self.auth,
                                            user_agent=user_agent)
        return self._client

    @property
    def project(self):
        if self._project is None:
            self._project = self.client.get_project(str(self.projectid))
        return self._project

    @property
    def job(self):
        if self._job is None:
            self._job = self.project.get_job((self.spiderid, self.jobid))
        return self._job

    def close(self):
        if self._client is not None:
            self._client.close()
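A minimal usage sketch for the class above, with placeholder values. On Scrapy Cloud, SHUB_JOBKEY (in 'projectid/spiderid/jobid' form) and SHUB_JOBAUTH (hex-encoded credentials) are injected into the job environment; here they are set by hand only to illustrate the parsing.

import os

# Placeholder values; in production the platform sets these.
os.environ['SHUB_JOBKEY'] = '123/45/67'
os.environ['SHUB_JOBAUTH'] = '3132333a616263'  # hex for '123:abc'

ref = _HubstorageRef()
if ref.enabled:
    print(ref.projectid, ref.spiderid, ref.jobid)  # -> 123 45 67
    # Accessing ref.job would lazily build a HubstorageClient and fetch
    # the job over the network, so it is left out of this sketch.
ref.close()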
Example #3
def get_job(job):
    jobid, apikey = get_job_specs(job)
    hsc = HubstorageClient(auth=apikey)
    job = hsc.get_job(jobid)
    if not job.metadata:
        raise NotFoundException('Job {} does not exist'.format(jobid))
    return job
Example #4
class _Hubstorage(object):
    def __init__(self):
        self.available = "SHUB_JOBKEY" in os.environ and HubstorageClient is not None
        self._client = None
        self._project = None
        self._job = None
        if self.available:
            self.job_key = os.environ["SHUB_JOBKEY"]
            self._project_id, self._spider_id, self._job_id = map(
                int, self.job_key.split("/")
            )
        else:
            self._project_id = None
            self._spider_id = None
            self._job_id = None

    @property
    def auth(self):
        if six.PY2:
            return os.environ["SHUB_JOBAUTH"].decode("hex")
        else:
            return decode(os.environ["SHUB_JOBAUTH"], "hex_codec").decode("utf-8")

    @property
    def endpoint(self):
        return os.environ.get("SHUB_STORAGE")

    @property
    def project_id(self):
        return self._project_id

    @property
    def spider_id(self):
        return self._spider_id

    @property
    def job_id(self):
        return self._job_id

    @property
    def client(self):
        if self._client is None:
            self._client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
        return self._client

    @property
    def project(self):
        if self._project is None:
            self._project = self.client.get_project(str(self.project_id))
        return self._project

    @property
    def job(self):
        if self._job is None:
            self._job = self.project.get_job((self.spider_id, self.job_id))
        return self._job

    def close(self):
        if self._client is not None:
            self._client.close()
Example #5
def client(self):
    from scrapinghub import HubstorageClient
    if self._client is None:
        user_agent = os.environ.get('SHUB_HS_USER_AGENT')
        self._client = HubstorageClient(endpoint=self.endpoint,
                                        auth=self.auth,
                                        user_agent=user_agent)
    return self._client
Example #6
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    monkeypatch.setattr(
        'scrapinghub.hubstorage.resourcetype.MSGPACK_AVAILABLE', msgpack_available)
    hsclient = HubstorageClient()
    job = hsclient.get_job('2222000/1/1')
    for resource in [job.items, job.logs, job.samples]:
        assert resource._allows_mpack(path) is (msgpack_available and expected_result)
    assert job.requests._allows_mpack(path) is False
    assert job.metadata._allows_mpack(path) is False
    assert job.jobq._allows_mpack(path) is False
Example #7
def _run_scraper(jobkey, jobauth, close_reason=None):
    httpmethods = 'GET PUT POST DELETE HEAD OPTIONS TRACE CONNECT'.split()
    # Scraper - uses job level auth, no global or project auth available
    client = HubstorageClient(endpoint=TEST_ENDPOINT)
    # use some fixed timestamp to represent current time
    now_ts = 1476803148638
    with closing(client) as scraperclient:
        job = scraperclient.get_job(jobkey, auth=jobauth)
        for idx in range(MAGICN):
            iid = job.items.write({'uuid': idx})
            job.logs.debug('log debug %s' % idx, idx=idx)
            job.logs.info('log info %s' % idx, idx=idx)
            job.logs.warn('log warn %s' % idx, idx=idx)
            job.logs.error('log error %s' % idx, idx=idx)
            sid = job.samples.write([idx, idx, idx])
            rid = job.requests.add(
                url='http://test.com/%d' % idx,
                status=random.randint(100, 1000),
                method=random.choice(httpmethods),
                rs=random.randint(0, 100000),
                duration=random.randint(0, 1000),
                parent=random.randrange(0, idx + 1) if idx > 10 else None,
                ts=now_ts + 100 + idx,
            )
            assert iid == idx
            assert sid == idx
            assert rid == idx

        if isinstance(close_reason, Exception):
            raise close_reason

        if close_reason:
            job.metadata['close_reason'] = close_reason

        job.metadata.save()
Example #8
def hsclient_with_retries(max_retries=3, max_retry_time=1):
    return HubstorageClient(
        auth=TEST_AUTH,
        endpoint=TEST_ENDPOINT,
        max_retries=max_retries,
        max_retry_time=max_retry_time,
    )
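A quick usage sketch for the helper above; the job key is a placeholder. max_retries bounds how many times the client retries transient HTTP failures, and max_retry_time caps the retry window (kept very low here so tests fail fast).

from contextlib import closing

# Placeholder job key; TEST_AUTH and TEST_ENDPOINT come from the test setup.
with closing(hsclient_with_retries()) as client:
    job = client.get_job('2222000/1/1')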
Example #9
def client(self):
    from scrapinghub import HubstorageClient
    if self._client is None:
        user_agent = os.environ.get('SHUB_HS_USER_AGENT')
        self._client = HubstorageClient(endpoint=self.endpoint,
                                        auth=self.auth,
                                        user_agent=user_agent)
    return self._client
Example #10
def test_auth(hsclient, json_and_msgpack):
    # client without global auth set
    hsc = HubstorageClient(endpoint=hsclient.endpoint,
                           use_msgpack=hsclient.use_msgpack)
    assert hsc.auth is None

    # check no-auth access
    try:
        hsc.push_job(TEST_PROJECT_ID, TEST_SPIDER_NAME)
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_project(TEST_PROJECT_ID).push_job(TEST_SPIDER_NAME)
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_job((TEST_PROJECT_ID, 1, 1)).items.list()
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_project(TEST_PROJECT_ID).get_job(
            (TEST_PROJECT_ID, 1, 1)).items.list()
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    # create project with auth
    auth = hsclient.auth
    project = hsc.get_project(TEST_PROJECT_ID, auth)
    assert project.auth == auth
    job = project.push_job(TEST_SPIDER_NAME)
    samejob = project.get_job(job.key)
    assert samejob.key == job.key
Example #11
class DiscoveryProcessorMixin(object):

    def get_previous_job(self, attr):
        if not hasattr(self, attr):
            raise AttributeError(
                'You should specify a {attr} argument to the job'.format(
                    attr=attr
                )
            )

        job_id = getattr(self, attr)
        auth = self.crawler.settings.get('SCRAPINGHUB_APIKEY')
        hc = HubstorageClient(auth=auth)
        return hc.get_job(job_id)
Example #12
def _run_runner(hsproject, pushed, close_reason):
    client = HubstorageClient(endpoint=TEST_ENDPOINT, auth=TEST_AUTH)
    with closing(client) as runnerclient:
        job = start_job(hsproject)
        assert not job.metadata.get('stop_requested')
        job.metadata.update(host='localhost', slot=1)
        assert job.metadata.get('state') == 'running'
        # run scraper
        try:
            _run_scraper(job.key, job.jobauth, close_reason=close_reason)
        except Exception as exc:
            job.logs.error(message=str(exc), appendmode=True)
            job.close_writers()
            job.jobq.finish(job, close_reason='failed')
            # logging from runner must append and never remove messages logged
            # by scraper
            assert job.logs.batch_append
        else:
            job.jobq.finish(job, close_reason=close_reason or 'no_reason')
Example #13
def client(self):
    if self._client is None:
        self._client = HubstorageClient(endpoint=self.endpoint,
                                        auth=self.auth)
    return self._client
Example #14
@classmethod
def setUpClass(cls):
    cls.endpoint = HS_ENDPOINT
    cls.auth = HS_AUTH
    cls.hsclient = HubstorageClient(auth=cls.auth, endpoint=cls.endpoint)
    cls.project = cls.hsclient.get_project(cls.projectid)
    cls.fclient = cls.project.frontier
Example #15
def client(self):
    from scrapinghub import HubstorageClient

    if self._client is None:
        self._client = HubstorageClient(endpoint=self.endpoint, auth=self.auth)
    return self._client
Example #16
def hsclient():
    return HubstorageClient(auth=TEST_AUTH, endpoint=TEST_ENDPOINT)
Example #17
class HcfMiddleware(object):

    def __init__(self, crawler):
        settings = crawler.settings
        self.hs_endpoint = settings.get("HS_ENDPOINT")
        self.hs_auth = self._get_config(settings, "HS_AUTH")
        self.hs_projectid = self._get_config(settings, "HS_PROJECTID", os.environ.get('SCRAPY_PROJECT_ID'))
        self.hs_frontier = self._get_config(settings, "HS_FRONTIER")
        self.hs_consume_from_slot = self._get_config(settings, "HS_CONSUME_FROM_SLOT")
        self.hs_number_of_slots = settings.getint("HS_NUMBER_OF_SLOTS", DEFAULT_HS_NUMBER_OF_SLOTS)
        self.hs_max_links = settings.getint("HS_MAX_LINKS", DEFAULT_MAX_LINKS)
        self.hs_start_job_enabled = settings.getbool("HS_START_JOB_ENABLED", False)
        self.hs_start_job_on_reason = settings.getlist("HS_START_JOB_ON_REASON", ['finished'])

        conn = Connection(self.hs_auth)
        self.panel_project = conn[self.hs_projectid]

        self.hsclient = HubstorageClient(auth=self.hs_auth, endpoint=self.hs_endpoint)
        self.project = self.hsclient.get_project(self.hs_projectid)
        self.fclient = self.project.frontier

        self.new_links = defaultdict(set)
        self.batch_ids = []

        crawler.signals.connect(self.close_spider, signals.spider_closed)

        # Make sure the logger for hubstorage.batchuploader is configured
        logging.basicConfig()
        self.logger = logging.getLogger("HCF")

    def _get_config(self, settings, key, default=None):
        value = settings.get(key, default)
        if not value:
            raise NotConfigured('%s not found' % key)
        return value

    def _msg(self, msg, level=logging.INFO):
        self.logger.log(level, msg)

    def start_job(self, spider):
        self._msg("Starting new job for: %s" % spider.name)
        jobid = self.panel_project.schedule(
            spider.name,
            hs_consume_from_slot=self.hs_consume_from_slot,
            dummy=datetime.now()
        )
        self._msg("New job started: %s" % jobid)
        return jobid

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_start_requests(self, start_requests, spider):

        self.hs_frontier = getattr(spider, 'hs_frontier', self.hs_frontier)
        self._msg('Using HS_FRONTIER=%s' % self.hs_frontier)

        self.hs_consume_from_slot = getattr(spider, 'hs_consume_from_slot', self.hs_consume_from_slot)
        self._msg('Using HS_CONSUME_FROM_SLOT=%s' % self.hs_consume_from_slot)

        self.has_new_requests = False
        for req in self._get_new_requests():
            self.has_new_requests = True
            yield req

        # If there are no links in the HCF, use the start_requests,
        # unless this is not the first job (jobs scheduled by this
        # middleware carry the 'dummy' spider argument).
        if not self.has_new_requests and not getattr(spider, 'dummy', None):
            self._msg('Using start_requests')
            for r in start_requests:
                yield r

    def process_spider_output(self, response, result, spider):
        slot_callback = getattr(spider, 'slot_callback', self._get_slot)
        for item in result:
            if isinstance(item, Request):
                request = item
                if request.meta.get('use_hcf', False):
                    if request.method == 'GET':  # XXX: Only GET support for now.
                        slot = slot_callback(request)
                        if request.url not in self.new_links[slot]:
                            hcf_params = request.meta.get('hcf_params')
                            fp = {'fp': request.url}
                            if hcf_params:
                                fp.update(hcf_params)
                            # Save the new links as soon as possible using
                            # the batch uploader
                            self.fclient.add(self.hs_frontier, slot, [fp])
                            self.new_links[slot].add(request.url)
                    else:
                        self._msg("'use_hcf' meta key is not supported for non GET requests (%s)" % request.url,
                                  log.ERROR)
                        yield request
                else:
                    yield request
            else:
                yield item

    def close_spider(self, spider, reason):
        # Only store the results if the spider finished normally. If it
        # didn't finish properly, there is no way to know whether all the URL
        # batches were processed, and it is better not to delete them from the
        # frontier (so they will be picked up by another process).
        if reason == 'finished':
            self._save_new_links_count()
            self._delete_processed_ids()

        # Close the frontier client in order to make sure that all the new links
        # are stored.
        self.fclient.close()
        self.hsclient.close()

        # If the reason is defined in the hs_start_job_on_reason list then start
        # a new job right after this spider is finished.
        if self.hs_start_job_enabled and reason in self.hs_start_job_on_reason:

            # Start the new job if this job had requests from the HCF or it
            # was the first job.
            if self.has_new_requests or not getattr(spider, 'dummy', None):
                self.start_job(spider)

    def _get_new_requests(self):
        """ Get a new batch of links from the HCF."""
        num_batches = 0
        num_links = 0
        for num_batches, batch in enumerate(self.fclient.read(self.hs_frontier, self.hs_consume_from_slot), 1):
            for fingerprint, data in batch['requests']:
                num_links += 1
                yield Request(url=fingerprint, meta={'hcf_params': {'qdata': data}})
            self.batch_ids.append(batch['id'])
            if num_links >= self.hs_max_links:
                break
        self._msg('Read %d new batches from slot(%s)' % (num_batches, self.hs_consume_from_slot))
        self._msg('Read %d new links from slot(%s)' % (num_links, self.hs_consume_from_slot))

    def _save_new_links_count(self):
        """ Save the new extracted links into the HCF."""
        for slot, new_links in self.new_links.items():
            self._msg('Stored %d new links in slot(%s)' % (len(new_links), slot))
        self.new_links = defaultdict(set)

    def _delete_processed_ids(self):
        """ Delete in the HCF the ids of the processed batches."""
        self.fclient.delete(self.hs_frontier, self.hs_consume_from_slot, self.batch_ids)
        self._msg('Deleted %d processed batches in slot(%s)' % (len(self.batch_ids),
                                                                self.hs_consume_from_slot))
        self.batch_ids = []

    def _get_slot(self, request):
        """ Determine which slot the request should be saved to."""
        md5 = hashlib.md5()
        # Hash the URL bytes (encode for Python 3 compatibility).
        md5.update(request.url.encode('utf-8'))
        digest = md5.hexdigest()
        return str(int(digest, 16) % self.hs_number_of_slots)
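A hedged configuration sketch for enabling the middleware above in a Scrapy project. The setting names are the ones the middleware reads in __init__; the middleware import path and priority are assumptions.

# settings.py (sketch; the middleware import path is hypothetical)
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.HcfMiddleware': 543,
}
HS_ENDPOINT = 'http://localhost:8003'  # placeholder Hubstorage endpoint
HS_AUTH = 'YOUR_API_KEY'               # placeholder auth token
HS_PROJECTID = '123'
HS_FRONTIER = 'test-frontier'
HS_CONSUME_FROM_SLOT = '0'
# Optional settings, read with defaults in __init__:
# HS_NUMBER_OF_SLOTS, HS_MAX_LINKS, HS_START_JOB_ENABLED, HS_START_JOB_ON_REASON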
Example #18
def test_allows_msgpack(monkeypatch, msgpack_available, path, expected_result):
    monkeypatch.setattr(
        'scrapinghub.hubstorage.collectionsrt.MSGPACK_AVAILABLE', msgpack_available)
    hsclient = HubstorageClient()
    collections = hsclient.get_project(2222000).collections
    assert collections._allows_mpack(path) is (msgpack_available and expected_result)
Example #19
def test_auth(hsclient, json_and_msgpack):
    # client without global auth set
    hsc = HubstorageClient(endpoint=hsclient.endpoint,
                           use_msgpack=hsclient.use_msgpack)
    assert hsc.auth is None

    # check no-auth access
    try:
        hsc.push_job(TEST_PROJECT_ID, TEST_SPIDER_NAME)
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_project(TEST_PROJECT_ID).push_job(TEST_SPIDER_NAME)
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_job((TEST_PROJECT_ID, 1, 1)).items.list()
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    try:
        hsc.get_project(TEST_PROJECT_ID).get_job(
            (TEST_PROJECT_ID, 1, 1)).items.list()
    except HTTPError as exc:
        assert exc.response.status_code == 401
    else:
        raise AssertionError('401 not raised')

    # create project with auth
    auth = hsclient.auth
    project = hsc.get_project(TEST_PROJECT_ID, auth)
    assert project.auth == auth
    job = project.push_job(TEST_SPIDER_NAME)
    samejob = project.get_job(job.key)
    assert samejob.key == job.key
Example #20
class _HubstorageRef(object):

    def __init__(self):
        self.enabled = 'SHUB_JOBKEY' in os.environ
        self._client = None
        self._project = None
        self._job = None
        if self.enabled:
            self.jobkey = os.environ['SHUB_JOBKEY']
            job_id = [int(id) for id in self.jobkey.split('/')]
            self._projectid, self._spiderid, self._jobcounter = job_id
        else:
            self._projectid = None
            self._spiderid = None
            self._jobcounter = None

    @property
    def auth(self):
        return to_native_str(decode(os.environ['SHUB_JOBAUTH'], 'hex_codec'))

    @property
    def endpoint(self):
        return os.environ.get('SHUB_STORAGE')

    @property
    def projectid(self):
        return self._projectid

    @property
    def spiderid(self):
        return self._spiderid

    @property
    def jobid(self):
        return self._jobcounter

    @property
    def client(self):
        from scrapinghub import HubstorageClient
        if self._client is None:
            user_agent = os.environ.get('SHUB_HS_USER_AGENT')
            self._client = HubstorageClient(endpoint=self.endpoint,
                                            auth=self.auth,
                                            user_agent=user_agent)
        return self._client

    @property
    def project(self):
        if self._project is None:
            self._project = self.client.get_project(str(self.projectid))
        return self._project

    @property
    def job(self):
        if self._job is None:
            self._job = self.project.get_job((self.spiderid, self.jobid))
        return self._job

    def close(self):
        if self._client is not None:
            self._client.close()
Example #21
def test_custom_ua():
    client = HubstorageClient(auth=TEST_AUTH,
                              endpoint=TEST_ENDPOINT,
                              user_agent='testUA')
    assert client.user_agent == 'testUA'
Example #22
def panelclient():
    # Panel - no client auth, only project auth using user auth token
    return HubstorageClient(endpoint=TEST_ENDPOINT)
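A short usage sketch for the helper above: the client carries no credentials of its own, so the user's auth token is supplied when opening a project (the same pattern as in the test_auth examples earlier). The project id and token are placeholders.

client = panelclient()
# Authenticate at project level with the user's auth token.
project = client.get_project('2222000', auth='USER_AUTH_TOKEN')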