def get_last_job_ids(self):
    project_id = os.environ.get("SCRAPY_PROJECT_ID")
    api_key = self.spider.settings.get("SCRAPINGHUB_API_KEY")
    if not project_id or not api_key:
        return []
    client = ScrapinghubClient(api_key)
    project = client.get_project(project_id)
    jobs = project.jobs.list()
    if not jobs:
        return []
    # find the last job for this spider's searchterm; the same spider
    # can be invoked with different searchterms
    last_matching_job = None
    for each in jobs:
        key = each["key"]
        job = client.get_job(key)
        metadata = dict(job.metadata.list())
        searchterm = metadata.get("spider_args", {}).get("searchterm", "")
        if self.spider.searchterm == searchterm:
            last_matching_job = job
            break
    if not last_matching_job:
        return []
    return [item["id"] for item in last_matching_job.items.iter()]
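# A minimal sketch of how get_last_job_ids() could feed incremental
# crawling, assuming it is defined as a method of this hypothetical
# pipeline class and that items carry an "id" field:
from scrapy.exceptions import DropItem

class DedupePipeline:
    def open_spider(self, spider):
        self.spider = spider
        # collect the ids already scraped by the last matching job
        self.seen_ids = set(self.get_last_job_ids())

    def process_item(self, item, spider):
        if item["id"] in self.seen_ids:
            raise DropItem("already scraped: %s" % item["id"])
        return item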
def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None) -> Optional[dict]:
    client = ScrapinghubClient()
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = client.get_job(source_key)
        items_count = api.get_items_count(job)
        store = job.items
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return None
    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return None

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            logger.error(item_n_err.format(item_numbers[-1], items_count - 1))
            return None
    else:
        item_numbers = set_item_no(items_count)

    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1)
        samples.append(items[0])
    return infer_schema(samples)
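# A hypothetical call: infer a JSON schema from the first three items of
# a made-up job key; with item_numbers omitted, set_item_no() picks a
# sample instead:
schema = create_json_schema("123456/1/7", item_numbers=[0, 1, 2])
if schema is not None:
    print(schema)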
def ready(self):
    global test
    apikey = '<API_KEY>'  # redacted; never hard-code a real API key
    client = ScrapinghubClient(apikey)
    test = client.get_job('223795/1/3')
    # cache the job items, sorted by score (highest first)
    test = sorted(test.items.list(), key=lambda k: k['score'], reverse=True)
def showBooks(request):
    global job
    job = test
    if job is None:
        # cached job list is missing; fetch and sort it again
        apikey = '<API_KEY>'  # redacted; never hard-code a real API key
        client = ScrapinghubClient(apikey)
        job = client.get_job('223795/1/3')
        job = sorted(job.items.list(), key=lambda k: k['score'], reverse=True)
    # job = sorted(job.items.list(), key=lambda k: k['score'], reverse=True)
    return render(request, 'user_page.html', {
        'spider_books': job,
        'user_fullname': request.user.get_full_name,
        'myuser_id': request.user.myuser.id,
    })
def main():
    args = parse_args()
    apikey = os.environ.get('SH_APIKEY') or args.apikey
    if not apikey:
        print('Please set API key')
        exit(1)
    client = ScrapinghubClient(apikey)
    job = client.get_job(args.job)
    events = args.func(job)
    if args.command == 'errors':
        report_errors = create_errors_report(events,
                                             max_urls_for_output=min(args.max, 30))
        print(report_errors)
def menu():
    client = ScrapinghubClient(config['scrapinghub']['api_key'])
    project = client.get_project(config['scrapinghub']['project_id'])
    job = project.jobs.list(spider=config['scrapinghub']['spider_name'],
                            state='finished', count=1)[0]
    job = client.get_job(job['key'])

    # Polish keys: 'aktualnosc' = freshness, 'restauracja' = restaurant,
    # 'grupy' = groups, 'pozycje' = entries, 'warianty' = variants,
    # 'cena' = price
    menu = {}
    menu['aktualnosc'] = job.metadata.get('finished_time')
    menu['restauracja'] = {
        "nazwa": "CamelPizza",
        "logo": "https://www.camelpizza.pl/system/logos/27323/menu_size/1549450693.png",
        "url": "http://camelpizza.pl"
    }
    menu['grupy'] = []

    def get_grupa(item):
        # return the existing group for this item, creating it on first use
        for grupa in menu['grupy']:
            if grupa['nazwa'] == item['grupa']:
                return grupa
        grupa = {'nazwa': item['grupa'], 'pozycje': []}
        menu['grupy'].append(grupa)
        return grupa

    def get_pozycja(item):
        # return the existing menu entry for this item, creating it on first use
        grupa = get_grupa(item)
        for pozycja in grupa['pozycje']:
            if pozycja['nazwa'] == item['pozycja']:
                return pozycja
        pozycja = {'nazwa': item['pozycja'], 'opis': item['opis'], 'warianty': []}
        grupa['pozycje'].append(pozycja)
        return pozycja

    def get_cena(item):
        # parse a price string such as "12,50zł" into amount and currency
        kwota, waluta = item['cena'].replace(u'zł', u' zł').split()
        kwota = float(kwota.replace(',', '.'))
        waluta = waluta.replace(u'zł', 'PLN')
        return {'kwota': kwota, 'waluta': waluta}

    items = job.items.list()
    for item in items:
        try:
            pozycja = get_pozycja(item)
            wariant = {'opis': item['wariant'], 'ceny': [get_cena(item)]}
            pozycja['warianty'].append(wariant)
        except (KeyError, ValueError):  # was a bare `except:`; catch only expected errors
            print("Invalid item")
    return jsonify(menu)
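# A standalone check of the price parsing used in get_cena() above,
# assuming scraped prices look like "12,50zł" (made-up sample value):
def parse_cena(cena):
    kwota, waluta = cena.replace(u'zł', u' zł').split()
    return {'kwota': float(kwota.replace(',', '.')),
            'waluta': waluta.replace(u'zł', 'PLN')}

assert parse_cena(u'12,50zł') == {'kwota': 12.5, 'waluta': 'PLN'}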
def index():
    apikey = os.environ.get("APIKEY")
    job_id = os.environ.get("JOB_ID")
    client = ScrapinghubClient(apikey)
    job = client.get_job(job_id)
    data = []
    for item in job.items.iter():
        # each field is scraped as a single-element list; unwrap it
        # (renamed from `dict`, which shadowed the builtin)
        movie = {
            'title': item['title'][0],
            'director': item['director'][0],
            'summary': item['summary'][0],
        }
        data.append(movie)
    return render_template('index.html', data=data)
def get_data():
    client = ScrapinghubClient('<KEY>')
    project = client.get_project(441598)
    spider = project.spiders.get('state')  # spider handle (unused below)
    # take the job key from the second most recent activity event
    job_id = list(project.activity.iter(count=2))
    job_id = job_id[1]['job']
    job = client.get_job(job_id)

    state_name = []
    death = []
    cured = []
    confirmed_cases = []
    # item fields come back as bytes, so decode each one
    for item in job.items.iter():
        state_name.append(item[b'state'].decode("utf-8"))
        death.append(item[b'death'].decode("utf-8"))
        cured.append(item[b'cured'].decode("utf-8"))
        confirmed_cases.append(item[b'confirmed_cases'].decode("utf-8"))

    data = {'state': state_name, 'death': death, 'cured': cured,
            'confirmed_cases': confirmed_cases}
    data = pd.DataFrame(data)
    data = data[:-1]  # drop the last row
    return data
def main():
    requestsMade = 0
    while requestsMade < 3:
        # run the job
        client = ScrapinghubClient(APIKEY)
        project = client.get_project(projectId)
        job = project.jobs.run(spider)
        if job.metadata.get('state') in ('running', 'pending', 'finished'):
            requestsMade = 10  # sentinel value that exits the retry loop
            # fetch the result from the last finished job
            lastFinishedJob = project.jobs.iter(spider=spider, state='finished', count=1)
            for job in lastFinishedJob:
                lastJobId = job['key']
                jobData = client.get_job(lastJobId)
                saveToMongo(jobData.items)
        else:
            requestsMade += 1
            time.sleep(5)
class SHConnection:
    '''
    Wrapper for the scrapinghub client, project and API calls to simplify use.
    '''

    def __init__(self, api_key, default_project_key=None):
        self.api_key = api_key
        self.project_key = resolve_project_key(
            default_project_key=default_project_key
        )

    def __enter__(self):
        self.client = ScrapinghubClient(self.api_key)
        self.project = self.client.get_project(self.project_key)
        return self

    def __exit__(self, *args):
        self.client.close()

    def jobs_iter(self, **kwargs):
        return self.project.jobs.iter(**kwargs)

    def get_job(self, job_id):
        return self.client.get_job(job_id)
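# A minimal usage sketch of the context manager above; the API key and
# project key are placeholders:
with SHConnection('<API_KEY>', default_project_key='123456') as conn:
    for summary in conn.jobs_iter(state='finished', count=5):
        job = conn.get_job(summary['key'])
        print(summary['key'], job.metadata.get('spider'))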
def get_spider_name(job_key):
    client = ScrapinghubClient()  # reads the API key from SH_APIKEY if not passed
    job = client.get_job(job_key)
    return job.metadata.get("spider")
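# Job keys have the form "<project_id>/<spider_id>/<job_id>"; the key
# below is a made-up example:
print(get_spider_name('123456/1/7'))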
client.projects.summary()
project = client.get_project(list_projects[0])

### Invoking a job
spider = project.spiders.get(project.spiders.list()[0]['id'])
spider.jobs.summary()
last_key = list(spider.jobs.iter_last())[0]['key']

## Accessing job output data
### Project ID/Spider ID/Job ID
job = client.get_job(last_key)

# =============================================================================
# SQLAlchemy connection to the database
# =============================================================================
import sqlalchemy as sqal
from sqlalchemy import MetaData, create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy.orm import sessionmaker

## Establishes a DBAPI connection.
db_connect_str = URL(**postgres_key)
engine = create_engine(db_connect_str)
# reflects the schema, and produces mapping
if job['state'] != 'finished':
    # "Problem: job state not finished"
    mail_subject = "Problema, stato job non finito"
    mail_body = "Problema, stato job non finito " + job['key']
    send_email(mail_from, mail_to_error, mail_username, mail_password,
               mail_server, mail_port, mail_subject, mail_body)
    exit()

if 'items' not in job:
    # "Problem: job contains no items"
    mail_subject = "Problema, job non contiene elementi"
    mail_body = "Problema, job non contiene elementi " + job['key']
    send_email(mail_from, mail_to_error, mail_username, mail_password,
               mail_server, mail_port, mail_subject, mail_body)
    exit()

if job['key'] in str(storico_jobs):
    # job already present in the history (storico); skip it
    # mail_subject = "Problema, job nuovo non presente"  # "Problem: new job not present"
    # mail_body = "Problema, job nuovo non presente"
    # send_email(mail_from, mail_to, mail_username, mail_password,
    #            mail_server, mail_port, mail_subject, mail_body)
    continue

items = hc.get_job(job['key']).items.list()
job_key = [job['key']]
for item in items:
    lista.append((item['isin'], item['isin_titolo'], item['scadenza'],
                  item['strike'], item['tipo_opzione'],
                  item['volume_contratti'], item['volatilita_implicita']))

run_time = job['running_time'] / 1000
data = datetime.datetime.fromtimestamp(run_time).strftime("%Y%m%d")

# note: `dir` (the base path) shadows the builtin of the same name
if server == 'remoto':
    directory = dir + lista[0][1] + "/opzioni/"
    csv_filename = directory + data + '.csv'
elif server == 'local':
    directory = dir + lista[0][1] + "\\opzioni\\"
    csv_filename = directory + data + '.csv'

if not os.path.exists(directory):
    os.makedirs(directory)

with open(csv_filename, 'w', newline="") as f: