Example #1
def schedule_spider(project,
                    endpoint,
                    apikey,
                    spider,
                    arguments=(),
                    settings=(),
                    priority=DEFAULT_PRIORITY,
                    units=None,
                    tag=(),
                    environment=()):
    client = ScrapinghubClient(apikey, dash_endpoint=endpoint)
    try:
        project = client.get_project(project)
        args = dict(x.split('=', 1) for x in arguments)
        cmd_args = args.pop('cmd_args', None)
        meta = args.pop('meta', None)
        job = project.jobs.run(
            spider=spider,
            meta=json.loads(meta) if meta else {},
            cmd_args=cmd_args,
            job_args=args,
            job_settings=dict(x.split('=', 1) for x in settings),
            priority=priority,
            units=units,
            add_tag=tag,
            environment=dict(x.split('=', 1) for x in environment),
        )
        return job.key
    except ScrapinghubAPIError as e:
        raise RemoteErrorException(str(e))
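A minimal usage sketch for the helper above; the project ID, API key, and arguments are placeholders, endpoint=None falls back to python-scrapinghub's default dash endpoint, and DEFAULT_PRIORITY is assumed to be defined alongside the function:

# Hypothetical call to schedule_spider() as defined above.
job_key = schedule_spider(
    project=12345,                  # placeholder project ID
    endpoint=None,                  # use the library's default dash endpoint
    apikey='<your-api-key>',        # placeholder API key
    spider='quotes',
    arguments=('max_pages=10', 'meta={"source": "cron"}'),
    settings=('DOWNLOAD_DELAY=0.5',),
    tag=('nightly',),
)
print(job_key)                      # e.g. '12345/1/7'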
Example #2
    def get_last_job_ids(self):
        project_id = os.environ.get("SCRAPY_PROJECT_ID")
        api_key = self.spider.settings.get("SCRAPINGHUB_API_KEY")

        if not project_id or not api_key:
            return []

        client = ScrapinghubClient(api_key)
        project = client.get_project(project_id)
        jobs = project.jobs.list()

        if not jobs:
            return []

        # Find the most recent job run with the same searchterm; the same
        # spider can be invoked with different searchterms.
        last_matching_job = None

        for each in jobs:
            key = each["key"]
            job = client.get_job(key)

            metadata = dict(job.metadata.list())
            searchterm = metadata.get("spider_args", {}).get("searchterm", "")

            if self.spider.searchterm == searchterm:
                last_matching_job = job
                break

        if not last_matching_job:
            return []

        return [item["id"] for item in last_matching_job.items.iter()]
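The scan above walks every job in the project; a hedged variation that relies on python-scrapinghub's jobs.iter() spider/state filters, assuming the same spider_args metadata layout:

    def get_last_job_ids_filtered(self):
        # Hypothetical variant: filter jobs by spider name and state on the
        # API side instead of walking every job in the project.
        project_id = os.environ.get("SCRAPY_PROJECT_ID")
        api_key = self.spider.settings.get("SCRAPINGHUB_API_KEY")
        if not project_id or not api_key:
            return []

        client = ScrapinghubClient(api_key)
        project = client.get_project(project_id)
        for summary in project.jobs.iter(spider=self.spider.name, state="finished"):
            job = client.get_job(summary["key"])
            spider_args = dict(job.metadata.list()).get("spider_args", {})
            if spider_args.get("searchterm", "") == self.spider.searchterm:
                return [item["id"] for item in job.items.iter()]
        return []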
Example #3
    def __init__(self,
                 collection_name,
                 project_id=None,
                 apikey=None,
                 autodetect_partitions=True):
        """
        collection_name - target collection
        project_id - target project id
        apikey - hubstorage apikey with access to given project. If None, delegate to scrapinghub lib.
        autodetect_partitions - If provided, autodetect partitioned collection. By default is True. If you want instead to force to read a non-partitioned
                collection when partitioned version also exists under the same name, use False.
        """
        self.hsc = ScrapinghubClient(apikey)._hsclient
        project_id = project_id or get_project_id()
        self.hsp = self.hsc.get_project(project_id)

        num_partitions = None
        if autodetect_partitions:
            num_partitions = get_num_partitions(self.hsp, collection_name)
            if num_partitions:
                log.info(
                    "Partitioned collection detected: %d total partitions.",
                    num_partitions)

        self.collections = []

        if num_partitions:
            for p in range(num_partitions):
                self.collections.append(
                    self.hsp.collections.new_store("{}_{}".format(
                        collection_name, p)))
        else:
            self.collections.append(
                self.hsp.collections.new_store(collection_name))
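A possible companion read method for the class above, a sketch that simply chains records across the detected partitions (or the single store) and forwards any filter parameters accepted by collection iteration:

    def iter_records(self, **params):
        # Sketch: stream records from every partition in order, passing any
        # filter parameters straight to collection.iter().
        for store in self.collections:
            for record in store.iter(**params):
                yield record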
Example #4
def getDataXoso():
    # Enter ScrapingHub
    apikey = '40f9881d52794d7bb09b9f5ee6d12a3e'  # your API key as a string
    client = ScrapinghubClient(apikey)
    projectID = 410647
    project = client.get_project(projectID)

    # get spider
    spiderID = 'quotes'
    spider = project.spiders.get(spiderID)

    jobs_summary = spider.jobs.iter()
    job_keys = [j['key'] for j in jobs_summary]

    print(job_keys)
    result = []
    for job_key in job_keys:
        job = project.jobs.get(job_key)

        # Return the items of the first finished job encountered
        if job.metadata.get('close_reason') == 'finished':
            for item in job.items.iter():
                result.append(item)

            return result
Example #5
def create_json_schema(source_key: str,
                       item_numbers: List[int] = None) -> dict:
    client = ScrapinghubClient()
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = client.get_job(source_key)
        items_count = api.get_items_count(job)
        store = job.items
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return

    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            logger.error(item_n_err.format(item_numbers[-1], items_count - 1))
            return
    else:
        item_numbers = set_item_no(items_count)

    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1)
        samples.append(items[0])

    return infer_schema(samples)
Example #6
 def __init__(self):
     self.workflow_loop_enabled = False
     self.args = self.parse_args()
     self.client = ScrapinghubClient(self.args.apikey)
     self.project_id = resolve_project_id(self.args.project_id
                                          or self.project_id)
     if not self.project_id:
         self.argparser.error('Project id not provided.')
Example #7
 def ready(self):
     global test
     apikey = '88133cc793ab4296b56db8a87eaae1ec'
     client = ScrapinghubClient(apikey)
     test = client.get_job('223795/1/3')
     test = sorted(test.items.list(),
                   key=lambda k: k['score'],
                   reverse=True)
Example #8
 def __init__(self, auth, project_id, frontier, batch_size=0):
     self._client = ScrapinghubClient(auth=auth)
     self._hcf = self._client.get_project(project_id).frontiers
     self._frontier = self._hcf.get(frontier)
     self._links_count = defaultdict(int)
     self._links_to_flush_count = defaultdict(int)
     self._batch_size = batch_size
     self._hcf_retries = 10
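A hedged sketch of the matching write path for the frontier handler above, assuming the standard python-scrapinghub slot API (slot.q.add() plus slot.flush()) and the batching counters initialised in __init__:

 def add_request(self, slot, fingerprint, qdata=None):
     # Sketch: buffer one fingerprint into the slot queue and flush once the
     # configured batch size is reached.
     frontier_slot = self._frontier.get(slot)
     frontier_slot.q.add([{'fp': fingerprint, 'qdata': qdata or {}}])
     self._links_to_flush_count[slot] += 1
     if self._batch_size and self._links_to_flush_count[slot] >= self._batch_size:
         frontier_slot.flush()
         self._links_count[slot] += self._links_to_flush_count[slot]
         self._links_to_flush_count[slot] = 0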
Example #9
 def __init__(self):
     self.apikey = ''  # your API key as a string
     self.client = ScrapinghubClient(self.apikey)
     self.project_num = 0
     self.project = self.client.get_project(self.project_num)
     self.neighborhood_spider = self.get_neighborhood_spider()
     self.listing_spider = self.get_listing_spider()
     self.airdna_spider = self.get_airdna_spider()
Example #10
    def __init__(self, input_uri, settings):
        super().__init__(settings)
        client = ScrapinghubClient()

        jobkey = parse_job_key(os.environ['SHUB_JOBKEY'])
        project = client.get_project(jobkey.project_id)

        collection_name = input_uri.replace('collections://', '')
        self._store = project.collections.get_store(collection_name)
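A short read sketch for the store opened above; collection records come back as dicts with '_key' and 'value' fields:

    def iter_values(self):
        # Sketch: yield the key and stored payload of every record.
        for record in self._store.iter():
            yield record['_key'], record['value']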
Example #11
 def __init__(self, crawl_url):
     """ Initialize and build a connection with Scrapinghub via its api
     """
     self._client = ScrapinghubClient(settings.SCRAPINGHUB_APIKEY)
     # TODO: need to be revised
     self._project_id = self._client.projects.list()[0]
     self._project = self._client.get_project(self._project_id)
     self._target = crawl_url
     self._job = None
     self._meta = None
     self._state = 'initialized'
Example #12
def jobRuning00():
    # Enter ScrapingHub
    apikey = '40f9881d52794d7bb09b9f5ee6d12a3e'  # your API key as a string
    client = ScrapinghubClient(apikey)
    projectID = 410647
    project = client.get_project(projectID)

    # get spider
    spiderID = 'quotes'
    spider = project.spiders.get(spiderID)
    spider.jobs.run()
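spider.jobs.run() returns a Job handle, so the call above can also capture the new job and watch it; a hedged sketch (the polling interval is arbitrary and `import time` is assumed):

def jobRunAndWait():
    # Hypothetical variant of jobRuning00(): keep the Job handle and poll the
    # job state until Scrapy Cloud reports it as finished.
    client = ScrapinghubClient('40f9881d52794d7bb09b9f5ee6d12a3e')
    spider = client.get_project(410647).spiders.get('quotes')
    job = spider.jobs.run()
    print(job.key)
    while client.get_job(job.key).metadata.get('state') != 'finished':
        time.sleep(30)
    return list(job.items.iter())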
Example #13
def showBooks(request):
    global job
    job = test
    if job is None:
        print("adgaegae")
        apikey = '88133cc793ab4296b56db8a87eaae1ec'
        client = ScrapinghubClient(apikey)
        job = client.get_job('223795/1/3')
        job = sorted(job.items.list(), key=lambda k: k['score'], reverse=True)
        return render(request, 'user_page.html', {'spider_books': job, 'user_fullname':request.user.get_full_name,'myuser_id':request.user.myuser.id})
    else:
        '''job = sorted(job.items.list(), key=lambda k: k['score'], reverse=True)'''
        return render(request, 'user_page.html',{'spider_books': job, 'user_fullname': request.user.get_full_name,'myuser_id': request.user.myuser.id})
Example #14
 def __init__(self, crawler):
     settings = crawler.settings
     coll_name = settings.get('TARGET_COLLECTION_NAME')
     coll_type = settings.get('TARGET_COLLECTION_TYPE', 's')
     if not coll_name or not coll_type:
         raise NotConfigured('Please set target collection settings.')
     current_project_id = os.environ.get('SCRAPY_PROJECT_ID')
     project_id = settings.get('HCF_PROJECT_ID', current_project_id)
     self.logger = logging.getLogger(__name__)
     # if auth is not set explicitly, fallback to SH job-level token
     self.client = ScrapinghubClient(settings.get('HCF_AUTH'))
     self.project = self.client.get_project(project_id)
     self.collection = self.project.collections.get(coll_type, coll_name)
Example #15
def main():
    args = parse_args()
    apikey = os.environ.get('SH_APIKEY') or args.apikey
    if not apikey:
        print('Please set API key')
        exit(1)

    client = ScrapinghubClient(apikey)
    job = client.get_job(args.job)
    events = args.func(job)
    if args.command == 'errors':
        report_errors = create_errors_report(
            events, max_urls_for_output=min(args.max, 30))
        print(report_errors)
Example #16
def menu():
	client = ScrapinghubClient(config['scrapinghub']['api_key'])
	project = client.get_project(config['scrapinghub']['project_id'])
	job = project.jobs.list(spider=config['scrapinghub']['spider_name'], state='finished', count=1)[0]
	job = client.get_job(job['key'])

	menu = {}
	menu['aktualnosc'] = job.metadata.get('finished_time')
	menu['restauracja'] = {
		"nazwa": "CamelPizza",
		"logo": "https://www.camelpizza.pl/system/logos/27323/menu_size/1549450693.png",
		"url": "http://camelpizza.pl"
	}
	menu['grupy'] = []

	def get_grupa(item):
		for grupa in menu['grupy']:
			if grupa['nazwa'] == item['grupa']:
				return grupa
		grupa = { 'nazwa': item['grupa'], 'pozycje': [] }
		menu['grupy'].append(grupa)
		return grupa

	def get_pozycja(item):
		grupa = get_grupa(item)
		for pozycja in grupa['pozycje']:
			if pozycja['nazwa'] == item['pozycja']:
				return pozycja
		pozycja = { 'nazwa': item['pozycja'], 'opis': item['opis'], 'warianty': [] }
		grupa['pozycje'].append(pozycja)
		return pozycja

	def get_cena(item):
		kwota, waluta = item['cena'].replace(u'zł', u' zł').split()
		kwota = float(kwota.replace(',', '.'))
		waluta = waluta.replace(u'zł', 'PLN')
		return { 'kwota': kwota, 'waluta': waluta }

	items = job.items.list()
	for item in items:
		try:
			pozycja = get_pozycja(item)
			wariant = { 'opis': item['wariant'], 'ceny': [ get_cena(item) ]}
			pozycja['warianty'].append(wariant)
		except (KeyError, ValueError):
			print("Invalid item")

	return jsonify(menu)
Example #17
class PttCrawlerJob():
    def __init__(self, crawl_url):
        """ Initialize and build a connection with Scrapinghub via its api
        """
        self._client = ScrapinghubClient(settings.SCRAPINGHUB_APIKEY)
        # TODO: need to be revised
        self._project_id = self._client.projects.list()[0]
        self._project = self._client.get_project(self._project_id)
        self._target = crawl_url
        self._job = None
        self._meta = None
        self._state = 'initialized'

    def run(self):
        """ Run the crawler (spider)
        """
        if not self._job:
            self._job = self._project.jobs.run(
                'ptt', job_args={'test_url': self._target})
            return self._job.key
        else:
            return None

    def update_meta(self):
        """ Update job's meta data
        """
        if self._job:
            self._meta = dict(self._job.metadata.iter())
            self._state = self._meta['state']

    def cancel(self):
        """ Cancel the job, if one has been started
        """
        if self._job:
            self._job.cancel()

    @property
    def meta(self):
        """ Get job's meta data
        """
        if self._meta:
            return self._meta
        else:
            return None

    @property
    def state(self):
        """ Get job's current state
        """
        return self._state

    @property
    def item(self):
        """ Get scraped items
        """
        if self._state == 'finished':
            # items.iter() returns an iterable that is not a list and does not
            # support indexing, so it is materialized into a list of dicts.
            return list(self._job.items.iter())
        else:
            return None
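A minimal driver sketch for the class above; the URL is a placeholder and it assumes the job is given time to finish before the items are read:

# Hypothetical usage of PttCrawlerJob defined above.
crawler = PttCrawlerJob('https://www.ptt.cc/bbs/Gossiping/index.html')
print(crawler.run())  # job key, e.g. '12345/2/7'
# ... later, once the job has had time to complete:
crawler.update_meta()
if crawler.state == 'finished':
    for item in crawler.item:
        print(item)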
Example #18
 def job(self) -> Job:
     if not self._job:
         job = ScrapinghubClient().get_job(self.key)
         if job.metadata.get("state") == "deleted":
             raise ValueError(f"{self.key} has 'deleted' state")
         self._job = job
     return self._job
Example #19
class ScrapyCloudClient:
    def __init__(self):
        self.apikey = ''  # your API key as a string
        self.client = ScrapinghubClient(self.apikey)
        self.project_num = 0
        self.project = self.client.get_project(self.project_num)
        self.neighborhood_spider = self.get_neighborhood_spider()
        self.listing_spider = self.get_listing_spider()
        self.airdna_spider = self.get_airdna_spider()

    def get_neighborhood_spider(self):
        return ScrapyCloudNeighborhoodSearchSpider(
            self.project.spiders.get('neighborhood_search'))

    def get_listing_spider(self):
        return ScrapyCloudSpider(self.project.spiders.get('listing'))

    def get_airdna_spider(self):
        return ScrapyCloudSpider(self.project.spiders.get('airdna'))

    def listing_ids(self):
        all_ids = self.neighborhood_spider.get_listing_ids()
        print(len(all_ids))
        return ",".join(str(i) for i in all_ids)
Example #20
def shub_conn():
    # don't use the default `.get()` fallback here because it would still
    # evaluate `settings.SH_API_KEY`, which you might not have set up locally
    api_key = os.environ.get('SH_API_KEY') or Settings().get('SH_API_KEY')

    # NOTE not really safe when `name` doesn't exist
    return ScrapinghubClient(api_key)
Example #21
class HcfCrawlerPipeline(object):
    def __init__(self, crawler):
        settings = crawler.settings
        coll_name = settings.get('TARGET_COLLECTION_NAME')
        coll_type = settings.get('TARGET_COLLECTION_TYPE', 's')
        if not coll_name or not coll_type:
            raise NotConfigured('Please set target collection settings.')
        current_project_id = os.environ.get('SCRAPY_PROJECT_ID')
        project_id = settings.get('HCF_PROJECT_ID', current_project_id)
        self.logger = logging.getLogger(__name__)
        # if auth is not set explicitly, fallback to SH job-level token
        self.client = ScrapinghubClient(settings.get('HCF_AUTH'))
        self.project = self.client.get_project(project_id)
        self.collection = self.project.collections.get(coll_type, coll_name)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_item(self, item, spider):
        item_to_export = dict(item)
        if '_key' not in item_to_export:
            fp = hashlib.sha1()
            fp.update(canonicalize_url(item['url']).encode('utf8'))
            item_to_export['_key'] = fp.hexdigest()
        self.collection.set(item_to_export)
        return item
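The pipeline reads its configuration from the Scrapy settings it names; a hedged settings.py sketch (the module path under ITEM_PIPELINES and all values are placeholders):

# settings.py sketch for the pipeline above.
ITEM_PIPELINES = {
    'myproject.pipelines.HcfCrawlerPipeline': 300,  # hypothetical module path
}
TARGET_COLLECTION_NAME = 'crawled_items'  # placeholder store name
TARGET_COLLECTION_TYPE = 's'              # 's' = regular collection store
HCF_PROJECT_ID = '12345'                  # placeholder; falls back to SCRAPY_PROJECT_ID
# HCF_AUTH = '<api-key>'                  # optional; falls back to the job-level token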
Example #22
 def start_requests(self):
     #self.c.execute('DROP TABLE IF EXISTS iteminfos')
     #self.c.execute('CREATE TABLE IF NOT EXISTS iteminfos (item_id, item_main_type, item_mid_type, item_sub_type, item_price, area_id)')
     #self.conn.commit()
     #temp_conn = sql.connect('dataset/area.db')
     #temp_c = temp_conn.cursor()
     #temp_c.execute('DELETE FROM areainfos WHERE rowid NOT IN (SELECT min(rowid) FROM areainfos GROUP BY item_id,item_url,area_id)')
     #item_list = [row for row in temp_c.execute('SELECT * FROM areainfos ORDER BY area_id')]
     #temp_conn.commit()
     client = ScrapinghubClient('ec16b94bcf024d0bb502684368658d59')
     myproject = client.projects.get('254951')
     mystore = myproject.collections.get_store('area_info')
     value_num = mystore.count()
     #for item in item_list:
     for item in range(value_num):
         #url = item[1].encode()
         log_item = mystore.get(str(item))
         area_id = log_item['value']['area_id'][0]
         item_ids = log_item['value']['item_id']
         item_urls = log_item['value']['item_url']
         for i in range(len(item_ids)):
             if i % 10 == 0:
                 sleep(0.8)
             #request = scrapy.Request(url=url,callback=self.parse)
             #request.meta['item_id'] = item[0]
             #request.meta['area_id'] = item[2]
             request = scrapy.Request(url=item_urls[i],
                                      callback=self.parse,
                                      errback=self.error_handler)
             request.meta['item_id'] = item_ids[i]
             request.meta['area_id'] = area_id
             yield request
Example #23
def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None) -> dict:
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")

    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            raise ValueError(
                item_n_err.format(item_numbers[-1], items_count - 1))
    else:
        item_numbers = set_item_no(items_count)

    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1, p_bar=None)
        samples.append(items[0])

    return infer_schema(samples)
Example #24
    def __init__(self, crawler):
        self.crawler = crawler
        settings = crawler.settings
        current_project_id = os.environ.get('SCRAPY_PROJECT_ID')
        project_id = settings.get('HCF_PROJECT_ID', current_project_id)
        frontier_name = settings.get('HCF_FRONTIER')
        frontier_slot = settings.get('HCF_FRONTIER_SLOT')
        if not project_id or not frontier_name or not frontier_slot:
            raise NotConfigured('Please set HCF settings for the middleware.')
        self.batch_size = settings.getint('HCF_BATCH_SIZE', DEFAULT_BATCH_SIZE)

        self.logger = logging.getLogger(__name__)
        # if auth is not set explicitly, fallback to SH job-level token
        self.client = ScrapinghubClient(settings.get('HCF_AUTH'))
        self.project = self.client.get_project(project_id)
        self.frontier = self.project.frontiers.get(frontier_name)
        self.frontier_slot = self.frontier.get(frontier_slot)
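A hedged sketch of how the slot opened above could be drained, following the python-scrapinghub frontier API in which each batch carries an 'id' and a 'requests' list of [fingerprint, qdata] pairs:

    def read_batches(self):
        # Sketch: yield queued fingerprints from the slot and acknowledge each
        # batch once it has been handed over.
        for batch in self.frontier_slot.q.iter():
            for fingerprint, qdata in batch['requests']:
                yield fingerprint, qdata
            self.frontier_slot.q.delete([batch['id']])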
Example #25
def test_projects_list(client):
    projects = client.projects.list()
    assert projects == []

    # use user apikey to list test projects
    client = ScrapinghubClient(TEST_USER_AUTH, TEST_DASH_ENDPOINT)
    projects = client.projects.list()
    assert isinstance(projects, list)
    assert int(TEST_PROJECT_ID) in projects
Example #26
def index():
    apikey = os.environ.get("APIKEY")
    job_id = os.environ.get("JOB_ID")

    client = ScrapinghubClient(apikey)
    job = client.get_job(job_id)

    data = []

    for item in job.items.iter():
        entry = {
            'title': item['title'][0],
            'director': item['director'][0],
            'summary': item['summary'][0]
        }
        data.append(entry)

    return render_template('index.html', data=data)
Example #27
    def __init__(self):
        parser = argparse.ArgumentParser(description=__doc__)
        parser.add_argument('pid', help='Target project id')
        parser.add_argument('spider', help='Spider name')
        parser.add_argument('frontier', help='Frontier name')
        parser.add_argument('prefix', help='Slot prefix')
        parser.add_argument('--max-jobs', help='Max number of jobs for the given spider allowed to run in parallel.\
                            Default is %(default)s.', type=int, default=1)
        parser.add_argument('--apikey',
                            help='API key to use for HCF access. Uses SH_APIKEY environment variable if not given')
        parser.add_argument('--spider-args', help='Spider arguments dict in json format', default='{}')
        parser.add_argument('--loop-mode', help='If provided, manager will run in loop mode, with a cycle each given\
                            number of seconds.', type=int, metavar='SECONDS')

        self.args = parser.parse_args()

        client = ScrapinghubClient(self.args.apikey)
        self.project = client.get_project(self.args.pid)
        self.hcfpal = HCFPal(client._hsclient.get_project(self.args.pid))
Example #28
def obtainLatestJobIDofSpider(apikey, project_id, spider_id):
    client = ScrapinghubClient(apikey)
    myproject = client.projects.get(project_id)
    job_ids = [
        int(key.split('/')[2])
        for key in (job['key'] for job in myproject.jobs.iter())
        if key.split('/')[0] == myproject.key
        and key.split('/')[1] == str(spider_id)
    ]
    return max(job_ids) if job_ids else 0
Example #29
def has_project_access(project, endpoint, apikey):
    """Check whether an API key has access to a given project. May raise
    InvalidAuthException if the API key is invalid (but not if it is valid but
    lacks access to the project)"""
    client = ScrapinghubClient(apikey, dash_endpoint=endpoint)
    try:
        return project in client.projects.list()
    except ScrapinghubAPIError as e:
        if 'Authentication failed' in str(e):
            raise InvalidAuthException
        else:
            raise RemoteErrorException(str(e))
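A quick usage sketch; the project ID and API key are placeholders and endpoint=None falls back to the library default:

# Hypothetical call to has_project_access() defined above.
if has_project_access(12345, None, '<your-api-key>'):
    print('API key has access to project 12345')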
Example #30
def restore(spider_id, job_id=0, store_name='', *keys):
    APIKEY = 'ec16b94bcf024d0bb502684368658d59'
    PROJECTID = '254951'
    SPIDERID = spider_id
    client = ScrapinghubClient(APIKEY)
    myproject = client.get_project(PROJECTID)

    if job_id == 0:
        myjob_id = obtainLatestJobIDofSpider(APIKEY, PROJECTID, SPIDERID)
        #job_keys = [_['key'] for _ in myproject.jobs.iter()]
        #job_ids = [int(_.split('/')[2]) if (_.split('/')[0]==myproject.key and _.split('/')[1]==SPIDERID) else '' for _ in job_keys]
        #myjob_id = sorted(job_ids)[-1]
    else:
        myjob_id = job_id
    myjob = myproject.jobs.get('%s/%s/%d' % (PROJECTID, SPIDERID, myjob_id))
    myitem = [_ for _ in myjob.items.iter()]
    item_num = len(myitem)
    item_container = dict()
    for key_i in keys:
        item_container[key_i] = [_[key_i] for _ in myitem]

#area_ids = [_['area_id'] for _ in myitem]
#item_ids = [_['item_id'] for _ in myitem]
#item_urls = [_['item_url'] for _ in myitem]
    store_names = [_['name'] for _ in myproject.collections.iter()]
    if store_name in store_names:
        mycollection = myproject.collections.get_store(store_name)
        if mycollection.count() > 0:
            for _ in mycollection.iter():
                mycollection.delete(_['_key'])
        for item_i in range(item_num):
            area_info_item = dict()
            for key_i in keys:
                area_info_item[key_i] = item_container[key_i][item_i]
            #area_info_item['area_id'] = area_ids[item_i]
            #area_info_item['item_id'] = item_ids[item_i]
            #area_info_item['item_url'] = item_urls[item_i]
            mycollection.set({'_key': str(item_i), 'value': area_info_item})
    else:
        print "the collection %s you want to access is not exist." % store_name
Example #31
def schedule_spider(project, endpoint, apikey, spider, arguments=(), settings=(),
                    priority=DEFAULT_PRIORITY, units=None, tag=(), environment=()):
    client = ScrapinghubClient(apikey, dash_endpoint=endpoint)
    try:
        project = client.get_project(project)
        args = dict(x.split('=', 1) for x in arguments)
        cmd_args = args.pop('cmd_args', None)
        meta = args.pop('meta', None)
        job = project.jobs.run(
            spider=spider,
            meta=json.loads(meta) if meta else {},
            cmd_args=cmd_args,
            job_args=args,
            job_settings=dict(x.split('=', 1) for x in settings),
            priority=priority,
            units=units,
            add_tag=tag,
            environment=dict(x.split('=', 1) for x in environment),
        )
        return job.key
    except ScrapinghubAPIError as e:
        raise RemoteErrorException(str(e))