Example #1
0
class PttCrawlerJob():
    """Wrap a single PTT crawl job scheduled on Scrapinghub.

    Holds the Scrapinghub client/project handles and tracks one job's
    lifecycle (run -> update_meta -> item) for a single target URL.
    """

    def __init__(self, crawl_url):
        """Initialize and build a connection with Scrapinghub via its API.

        Args:
            crawl_url: URL the 'ptt' spider will crawl (passed as a job arg).
        """
        self._client = ScrapinghubClient(settings.SCRAPINGHUB_APIKEY)
        # TODO: need to be revised -- this blindly picks the first project
        # the API key can see; the project id should be configurable.
        self._project_id = self._client.projects.list()[0]
        self._project = self._client.get_project(self._project_id)
        self._target = crawl_url
        self._job = None      # scrapinghub Job handle once run() is called
        self._meta = None     # cached metadata dict from update_meta()
        self._state = 'initialized'

    def run(self):
        """Schedule the 'ptt' spider once.

        Returns:
            The new job key on the first call, or None if a job has
            already been scheduled on this instance.
        """
        if self._job:
            return None
        self._job = self._project.jobs.run(
            'ptt', job_args={'test_url': self._target})
        return self._job.key

    def update_meta(self):
        """Refresh the cached metadata and state from the remote job."""
        if self._job:
            self._meta = dict(self._job.metadata.iter())
            self._state = self._meta['state']

    def cancel(self):
        """Cancel the job, if one has been scheduled.

        A no-op when run() has not been called yet (the original code
        raised AttributeError on ``None`` in that case).
        """
        if self._job:
            self._job.cancel()

    # Backward-compatible alias for the original (misspelled) method name.
    cancle = cancel

    @property
    def meta(self):
        """Return the cached metadata dict, or None if not fetched yet."""
        # Falsy (None or empty) metadata maps to None, as before.
        return self._meta or None

    @property
    def state(self):
        """Return the job's last known state string."""
        return self._state

    @property
    def item(self):
        """Return the scraped items as a list of dicts, or None.

        Only available once the job state is 'finished'.  items.iter()
        returns an iterable that does not support indexing, so it is
        materialized into a list; each element is a dict.
        """
        if self._state != 'finished':
            return None
        return list(self._job.items.iter())
Example #2
0
    def __init__(self, input_uri, settings):
        """Open the collection store referenced by *input_uri*.

        The collection lives in the project this job itself runs in,
        resolved from the SHUB_JOBKEY environment variable.
        """
        super().__init__(settings)
        hub = ScrapinghubClient()

        own_key = parse_job_key(os.environ['SHUB_JOBKEY'])
        own_project = hub.get_project(own_key.project_id)

        # Strip the URI scheme to obtain the bare collection name.
        store_name = input_uri.replace('collections://', '')
        self._store = own_project.collections.get_store(store_name)
Example #3
0
def jobRuning00(apikey='40f9881d52794d7bb09b9f5ee6d12a3e',
                project_id=410647, spider_name='quotes'):
    """Schedule one run of a spider on ScrapingHub.

    Args:
        apikey: ScrapingHub API key.  SECURITY: the default is a
            credential hard-coded in source; it should be moved to an
            environment variable or config file and the key rotated.
        project_id: numeric ScrapingHub project id.
        spider_name: name of the spider to run.

    Returns:
        The job handle returned by ``spider.jobs.run()``.
    """
    client = ScrapinghubClient(apikey)
    project = client.get_project(project_id)
    spider = project.spiders.get(spider_name)
    # Return the job so callers can track it (previously discarded).
    return spider.jobs.run()
Example #4
0
def schedule_spider(project, endpoint, apikey, spider, arguments=(), settings=(),
                    priority=DEFAULT_PRIORITY, units=None, tag=(), environment=()):
    """Schedule *spider* on Scrapy Cloud and return the new job key.

    Args:
        project: project id, as accepted by ``client.get_project``.
        endpoint: dash API endpoint URL.
        apikey: Scrapinghub API key.
        spider: spider name to schedule.
        arguments: iterable of ``key=value`` job-argument strings.  The
            special keys ``cmd_args`` and ``meta`` are extracted from it;
            ``meta`` is parsed as JSON.
        settings: iterable of ``key=value`` job-setting strings.
        priority: job priority.
        units: number of units, or None for the default.
        tag: tag(s) to add to the job.
        environment: iterable of ``key=value`` environment strings.

    Returns:
        The key of the scheduled job.

    Raises:
        RemoteErrorException: wrapping any ScrapinghubAPIError.
    """
    client = ScrapinghubClient(apikey, dash_endpoint=endpoint)
    try:
        # Renamed local so the 'project' parameter is not shadowed.
        target = client.get_project(project)
        args = dict(x.split('=', 1) for x in arguments)
        cmd_args = args.pop('cmd_args', None)
        meta = args.pop('meta', None)
        job = target.jobs.run(
            spider=spider,
            meta=json.loads(meta) if meta else {},
            cmd_args=cmd_args,
            job_args=args,
            job_settings=dict(x.split('=', 1) for x in settings),
            priority=priority,
            units=units,
            add_tag=tag,
            environment=dict(x.split('=', 1) for x in environment),
        )
        return job.key
    except ScrapinghubAPIError as e:
        # Chain the cause so the original traceback is preserved.
        raise RemoteErrorException(str(e)) from e
Example #5
0
class SHConnection():
    """Context-manager wrapper around the scrapinghub client.

    Bundles the client, the project and a couple of common API calls so
    callers do not have to wire them up themselves.
    """

    def __init__(self, api_key, default_project_key=None):
        self.api_key = api_key
        self.project_key = resolve_project_key(
            default_project_key=default_project_key
        )

    def __enter__(self):
        # The client is only built on entry, so constructing an
        # SHConnection stays cheap until the context is opened.
        client = ScrapinghubClient(self.api_key)
        self.client = client
        self.project = client.get_project(self.project_key)
        return self

    def __exit__(self, *args):
        self.client.close()

    def jobs_iter(self, **kwargs):
        """Iterate the project's jobs, forwarding every filter as-is."""
        return self.project.jobs.iter(**kwargs)

    def get_job(self, job_id):
        """Fetch a single job by its full job id."""
        return self.client.get_job(job_id)