def start_requests(self):
    urls = ['https://api.github.com/users/{}/starred'.format(name)
            for name in usernames]
    for url in urls:
        req = JsonRequest(url=url, callback=self.parse_stars)
        # API call, don't need to check robots
        req.meta['dont_obey_robotstxt'] = True
        yield req
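# NOTE: a minimal sketch of the parse_stars spider method assumed above.
# The GitHub starred endpoint returns a JSON array of repository objects;
# fields other than 'full_name' are illustrative.
import json

def parse_stars(self, response):
    for repo in json.loads(response.text):
        yield {'repo': repo.get('full_name')}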
def test_request_class(self):
    r1 = FormRequest("http://www.example.com")
    self._assert_serializes_ok(r1, spider=self.spider)
    r2 = CustomRequest("http://www.example.com")
    self._assert_serializes_ok(r2, spider=self.spider)
    r3 = JsonRequest("http://www.example.com", dumps_kwargs={"indent": 4})
    self._assert_serializes_ok(r3, spider=self.spider)
def start_requests(self):
    yield FormRequest(self.start_url, callback=self.parse_response,
                      formdata=self.data)
    yield JsonRequest(self.start_url, callback=self.parse_response,
                      data=self.data)
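# NOTE: a standalone illustration of the difference between the two request
# types above. FormRequest urlencodes formdata into the body, while
# JsonRequest serializes data with json.dumps and sets a JSON Content-Type
# header; both default to POST when given a payload. The URL is a placeholder.
from scrapy import FormRequest
from scrapy.http import JsonRequest

form_req = FormRequest('https://example.com', formdata={'q': 'test'})
json_req = JsonRequest('https://example.com', data={'q': 'test'})
assert form_req.body == b'q=test'
assert json_req.body == b'{"q": "test"}'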
def get_token(self):
    headers = {
        'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
    }
    url = 'https://api.twitter.com/1.1/guest/activate.json'
    yield JsonRequest(url=url, method='POST', headers=headers,
                      callback=self.parse_token)
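# NOTE: a sketch of the parse_token spider method assumed above. The guest
# activation endpoint is expected to return a JSON object with a
# 'guest_token' field; that shape, and what is done with the token, are
# assumptions here.
import json

def parse_token(self, response):
    guest_token = json.loads(response.text).get('guest_token')
    self.logger.info('Got guest token: %s', guest_token)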
def start_requests(self):
    print('\n--- Starting crawl of stock: {} ({}) ---\n'.format(
        self.current_stock_info['name'],
        self.current_stock_info['symbol'].upper()))
    yield JsonRequest(url=self.start_url, data=self.start_query_params,
                      dont_filter=True)
def start_requests(self):
    return [
        JsonRequest(url=self.link, data=formdata(), headers=headers,
                    callback=self.parse)
    ]
def parse_product_list(self, response: Response):
    json_data = response.text
    data = json.loads(json_data)[0]['data']['valentino']
    products = data['products']
    edges = products['edges']
    codes = [edge['node']['code'] for edge in edges]
    for code in codes:
        req_body = [{
            "operationName": "fetchProductDetail",
            "variables": {
                "code": code,
                "breadcrumbFlg": "NO",
                "platform": "valentino"
            },
            "query": "query fetchProductDetail($code: ID, $breadcrumbFlg: breadcrumbFlg, $platform: Platform) { shop { productDetail(code: $code, breadcrumbFlg: $breadcrumbFlg, platform: $platform) { userErrors { code message } product { code title description images { url } styleProducts { code title images { url } salePrice { amount currencyCode } skus { code salePrice { amount currencyCode } options { code frontName values { frontName code images { url } } } } options { code frontName values { code frontName images { url } } } } salePrice { amount currencyCode } skus { code salePrice { amount currencyCode } options { code frontName values { frontName code images { url } } } } options { code frontName values { code frontName images { url } } code } } } }}"
        }]
        json_body = json.dumps(req_body)
        # A pre-serialized body is passed here; JsonRequest still sets the
        # JSON Content-Type header. Passing data=req_body instead would let
        # JsonRequest do the json.dumps itself.
        yield JsonRequest(self.ql_url, method='POST', body=json_body,
                          callback=self.parse_product)
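# NOTE: a sketch of the parse_product callback assumed above. The envelope
# ([0]['data']) mirrors how parse_product_list reads the list response, and
# the inner shape follows the GraphQL query's root fields (shop ->
# productDetail -> product); the fields yielded are illustrative.
import json

def parse_product(self, response):
    detail = json.loads(response.text)[0]['data']['shop']['productDetail']
    product = detail['product']
    yield {
        'code': product['code'],
        'title': product['title'],
        'images': [img['url'] for img in product.get('images', [])],
        'price': (product.get('salePrice') or {}).get('amount'),
    }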
def start_requests(self):
    busca_url = "https://www.rentfaster.ca/api/map.json"
    dados = {
        "e": "zoom_changed",
        "l": "12,51.0687,-114.0899",
        "beds": ",bachelor",
        "baths": "1,1.5,2,2.5,1,1.5,2,2.5,3+",
        "type": "Apartment,Condo,Loft,Condo,Loft",
        "price_range_adv[from]": "0",
        "price_range_adv[to]": "1000",
        "furnishing": "Unfurnished",
        "area": "51.16417413931004,-113.96716211547852,50.97302850876522,-114.21263788452148"
    }
    yield JsonRequest(busca_url, data=dados, callback=self.parse)
def start_requests(self):
    size = 50
    max_page = 33
    for page in range(1, max_page + 1):
        req_body = [{
            "operationName": "fetchProductList",
            "variables": {
                "input": {
                    "size": size,
                    "page": page,
                    "filters": {
                        'keyword': ''
                    },
                    "breadcrumbFlg": "YES"
                }
            },
            "query": "query fetchProductList($input: ProductFilters) { valentino { products(input: $input) { pageInfo { totalCount size page hasNextPage } edges { node { code } } } }}"
        }]
        json_body = json.dumps(req_body)
        yield JsonRequest(self.ql_url, method='POST', body=json_body,
                          callback=self.parse_product_list)
def post_item(self, item):
    if '?' not in item.post_link:
        post_link = f'{item.post_link}?json=1'
    else:
        # a query string already exists, so join with '&' rather than
        # concatenating 'json=1' directly onto it
        post_link = f'{item.post_link}&json=1'
    return JsonRequest(post_link, method='POST', data=item, dont_filter=True,
                       callback=self.post_success, errback=self.parse_fail)
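# NOTE: a minimal standalone check of how JsonRequest handles `data`: it is
# serialized eagerly with json.dumps, so whatever is passed (like `item`
# above) must be JSON-serializable - a plain dict works, while a scrapy.Item
# would first need dict(item). The URL and payload below are illustrative.
from scrapy.http import JsonRequest

req = JsonRequest('https://example.com/post?json=1', method='POST',
                  data={'title': 'demo'})
assert req.body == b'{"title": "demo"}'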
def start_requests(self):
    # get cat data
    if not hasattr(self, 'bidDatas') or not self.bidDatas:
        self.page += 1
        url = f'http://crawler.wemarry.vn/api/get-detail-multi?id={self.params}&page={self.page}'
        yield JsonRequest(url, callback=self.parse_data, dont_filter=True)
    # create requests
    if getattr(self, 'bidDatas', None):
        data = self.bidDatas.pop()
        item = obj(
            id_web=data.get('ID_WEB'),
            id=data.get('ID'),
            link=data.get('LINK'),
            post_link=data.get('POST_LINK'),
            arr_law=data.get('ARR_LAW'),
            # hotel_city_id=None,        # Hanoi / Da Nang / Ho Chi Minh City
            # hotel_search_image='',
            # # hotel detail
            # hotel_source=None,         # Agoda, ...
            # hotel_type=None,           # str: guesthouse / hotel, ...
            # hotel_name=None,           # str: hotel name
            # hotel_star=0,              # int: hotel rating
            # hotel_address=None,        # str: hotel address
            # hotel_description=None,    # str: hotel description
            # hotel_image=None,          # list of hotel images
            # hotel_attribute=None,      # list of hotel attributes
            # hotel_latlng=None,         # str: 'lat,lon'
            # hotel_price=None,          # list of room info: { name, price, guest, attribute }
            # # review
            # hotel_review=None,         # list of reviews: { name, image, rating, title, content }
        )
        for request in self.create_request(item):
            yield request
def parse(self, response):
    company_name = ""
    for gpu in response.xpath('//*[@class="processors"]//tr'):
        current_company_name = gpu.xpath(
            '*[@class="mfgr"]//text()').extract_first()
        # The manufacturer cell spans several rows, so carry the last
        # seen name forward for rows that lack it.
        if current_company_name in ("AMD", "Intel", "NVIDIA"):
            company_name = current_company_name
        data = {
            'company': company_name,
            'product_name': gpu.xpath('td[1]//a//text()').extract_first(),
            'gpu_chip': gpu.xpath('td[2]//a//text()').extract_first(),
            'release_date': gpu.xpath('td[3]//text()').extract_first(),
            'bus': gpu.xpath('td[4]//text()').extract_first(),
            'memory': gpu.xpath('td[5]//text()').extract_first(),
            'gpu_clock': gpu.xpath('td[6]//text()').extract_first(),
            'memory_clock': gpu.xpath('td[7]//text()').extract_first(),
        }
        yield JsonRequest(url='http://localhost/api/gpu/add',
                          headers={'X-AUTH-TOKEN': apikey}, data=data)
def parse(self, response):
    company_name = ""
    for cpu in response.xpath('//*[@class="processors"]//tr'):
        current_company_name = cpu.xpath(
            '*[@class="mfgr"]//text()').extract_first()
        # As with the GPU table, carry the last seen manufacturer forward
        # across rows that share a rowspan cell.
        if current_company_name in ("AMD", "Intel"):
            company_name = current_company_name
        data = {
            'company': company_name,
            'product_name': cpu.xpath('td[1]//a//text()').extract_first(),
            'code_name': cpu.xpath('td[2]//text()').extract_first(),
            'cores': cpu.xpath('td[3]//text()').extract_first(),
            'clock': cpu.xpath('td[4]//text()').extract_first(),
            'socket': cpu.xpath('td[5]//text()').extract_first(),
            'process': cpu.xpath('td[6]//text()').extract_first(),
            'l3_cache': cpu.xpath('td[7]//text()').extract_first(),
            'tdp': cpu.xpath('td[8]//text()').extract_first(),
            'released': cpu.xpath('td[9]//text()').extract_first(),
        }
        yield JsonRequest(url='http://localhost/api/cpu/add',
                          headers={'X-AUTH-TOKEN': apikey}, data=data)
def parse(self, response):
    for debate in response.xpath(
            "/html/body/div[2]/div[4]/div/div/div/div[4]/div[1]/ul/li"):
        # "yes" side
        title_1 = debate.xpath('.//@href').extract()
        list_A.append(title_1)
        desc_yes = debate.xpath('p/text()').extract()
        list_B.append(desc_yes)
        print(len(list_A), len(list_B))
        print("---------------")

    # for debate1 in response.xpath("/html/body/div[2]/div[4]/div/div/div/div[4]/div[2]/ul/li"):
    #     # "no" side
    #     title2 = debate1.xpath('.//@href').extract()
    #     list_C.append(title2)
    #     desc_no = debate1.xpath('p/text()').extract()
    #     list_D.append(desc_no)
    #     title = debate1.xpath('a/text()').extract()
    #     list_E.append(title)
    #     print(len(title2), len(desc_no), len(title))
    #     print(len(list_C), len(list_D), len(list_E))
    #     print("---------------*****")

    # for debate1 in zip(title_1, desc_yes, desc_no):
    for i in range(2):
        scraped_info = {
            'title1': list_A[i],
            'desc_yes': list_B[i],
            # 'desc_no': list_D[i],
        }
        yield scraped_info

    # next_page = response.xpath("/html/body/div[2]/div[4]/div/div/div/div[5]/a/@href").extract_first()
    # if next_page is not None:
    #     next_page_link = response.urljoin(next_page)
    #     yield scrapy.Request(url=next_page_link, callback=self.parse)

    # next_page = response.xpath('//a[@rel="next"]/@href').get()
    # if next_page:
    #     yield response.follow(url=next_page, callback=self.parse)

    data = {
        # 'debateId': 'DF5F0C8D-BDA6-4C05-9C50-07FCD527D8BE',
        'page': '5',
    }
    yield JsonRequest(
        url='https://www.debate.org/opinions/do-you-agree-with-the-black-lives-matter-movement-1',
        data=data)
def test_POST_small_json_x10(self):
    request = JsonRequest(url=self.get_url('/post-data-json-small'),
                          method='POST', data=Data.JSON_SMALL)
    return self._check_POST_json_x10(
        request, Data.JSON_SMALL, Data.EXTRA_SMALL, 200
    )
def test_POST_large_json_x10(self):
    request = JsonRequest(url=self.get_url('/post-data-json-large'),
                          method='POST', data=Data.JSON_LARGE)
    return self._check_POST_json_x10(
        request, Data.JSON_LARGE, Data.EXTRA_LARGE, 200
    )
def start_requests(self):
    params = self.post_params.copy()
    yield JsonRequest(url=self.url, data=params,
                      callback=self.parse_results_list,
                      meta={
                          'dont_obey_robotstxt': True,
                          'from': 0
                      })
def start_requests(self):
    for page in range(1000):
        self.data['pageNo'] = page
        # JsonRequest serializes `data` with json.dumps at construction
        # time, so mutating self.data on the next iteration does not
        # affect requests that have already been built.
        temp = JsonRequest(url=self.api_url, headers=self.headers,
                           data=self.data, callback=self.parse,
                           dont_filter=True)
        yield temp
def parse(self, response):
    for hospital in response.css('table#dataList tbody tr'):
        data = {
            'contact-text': self.removeWhiteSpace(
                self.trySafeIndexAccess(
                    hospital.xpath('./td[2]/text()').extract(), 0)),
            'contact-website': self.trySafeIndexAccess(
                hospital.xpath('./td[2]//a/@href').extract(), 0),
            'region-abbreviation': self.translateBundesland(
                self.removeWhiteSpace(
                    hospital.xpath('./td[3]/text()').extract()[0])),
            'icu-low-care': self.translateStatus(
                hospital.xpath('./td[4]//span/@class').extract()[0]),
            'icu-high-care': self.translateStatus(
                hospital.xpath('./td[5]//span/@class').extract()[0]),
            'ecmo': self.translateStatus(
                hospital.xpath('./td[6]//span/@class').extract()[0]),
            'date-of-information': self.removeWhiteSpace(
                hospital.xpath('./td[7]/text()').extract()[0]),
            'time-of-information': self.removeWhiteSpace(
                hospital.xpath('./td[7]/text()').extract()[1]),
        }
        data['hospital-name'] = self.removeWhiteSpace(
            hospital.xpath('./td[1]/text()').extract()[0])
        additional_hospital_data = hospital.xpath(
            './td[1]/small/text()').extract()
        if len(additional_hospital_data) == 2:
            data['hospital-street'] = self.removeWhiteSpace(
                additional_hospital_data[0])
            split_data = additional_hospital_data[1].split()
            data['hospital-postalcode'] = split_data.pop(0)
            data['hospital-city'] = " ".join(split_data)
        if len(additional_hospital_data) == 3:
            data['hospital-department'] = additional_hospital_data[0]
            data['hospital-street'] = self.removeWhiteSpace(
                additional_hospital_data[1])
            split_data = additional_hospital_data[2].split()
            data['hospital-postalcode'] = split_data.pop(0)
            data['hospital-city'] = " ".join(split_data)
        self.query = (f'{self.basic_query}&inputtype=textquery'
                      f'&input={data["hospital-street"]} {data["hospital-postalcode"]} {data["hospital-city"]}')
        yield JsonRequest(
            url=f'https://maps.googleapis.com/maps/api/place/findplacefromtext/json?{self.query}',
            cb_kwargs=dict(hospital=data),
            callback=self.parseGoogleResponse)
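# NOTE: illustrative only - the hand-built query string above is not
# URL-encoded, so street names with spaces or umlauts can break the Places
# request. urllib.parse.urlencode from the stdlib would make it robust; the
# address below is a made-up example.
from urllib.parse import urlencode

params = urlencode({
    'inputtype': 'textquery',
    'input': 'Musterstrasse 1 12345 Berlin',
})
assert params == 'inputtype=textquery&input=Musterstrasse+1+12345+Berlin'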
def search(self, offset=0):
    return JsonRequest(
        url=self.buildUrl(offset),
        data={"criterias": [{
            "property": "ngsearchword",
            "values": [""]
        }]},
        callback=self.parse,
    )
def parse(self, response):
    id_api = int(response.css('div::attr(data-good-link)').get())
    data = {
        "imtId": id_api,
        "take": self.count_comment,
        "order": "dateDesc"
    }
    yield JsonRequest(
        url='https://public-feedbacks.wildberries.ru/api/v1/feedbacks/site',
        data=data,
        callback=self.parse_my_url)
def login_request(username, password):
    """Build login request."""
    return JsonRequest(
        'https://api.makeupalley.com/api/v1/users/auth/login',
        headers={'Referer': 'https://www.makeupalley.com/'},
        data={
            'userName': username,
            'password': password,
            'rememberMe': True,
            'fromSavedCookie': False
        },
        meta={'handle_httpstatus_list': [401]})
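# NOTE: a sketch of a callback for the login request above; because the
# request whitelists HTTP 401 via handle_httpstatus_list, Scrapy delivers
# failed-login responses to the callback instead of dropping them. The
# handling shown here is an assumption, not the site's real schema.
import json

def parse_login(self, response):
    if response.status == 401:
        self.logger.error('Login failed: %s', response.text)
        return
    payload = json.loads(response.text)
    self.logger.info('Logged in; response keys: %s', list(payload))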
def parse(self, response):
    print('\n>>> Get html from URL: %s' % response.url)
    # print(response.text)
    self.token = response.xpath('//input[@id="tokenvalue"]/@value').get()
    print('token: %s' % self.token)
    return [JsonRequest(
        url='http://www.ttsucha.com/api/ttscapi/noTosearch',
        method='POST',
        headers=self.custom_headers,
        data={
            'trackingNo': self.num,
            'token': self.token
        },
        callback=self.parse_trackinginfo
    )]
def get_num_papers(self, response):
    meta = response.meta
    data = json.loads(response.body)
    num_papers = data['hits']['total']
    # round down to the largest multiple of 10 (at most 10 papers per page)
    num_iterations = num_papers - (num_papers % 10)
    for iteration in range(0, num_iterations + 1, 10):
        self.post_params['from'] = iteration
        yield JsonRequest(url=self.url, data=self.post_params,
                          callback=self.parse_query_result,
                          meta=meta, dont_filter=True)
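# NOTE: a quick standalone check of the offset arithmetic above: with 95
# papers at 10 per page, the generated 'from' offsets are 0, 10, ..., 90,
# which covers every result.
num_papers = 95
num_iterations = num_papers - (num_papers % 10)  # 90
offsets = list(range(0, num_iterations + 1, 10))
assert offsets == [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]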
def parse_results_list(self, response):
    data = json.loads(response.body)
    last_time = datetime.now()
    has_new_paper = False
    for item in data['hits']['hits']:
        item = item['_source']
        item.update({
            'date': parse_date(item['date']),
            'date_created': parse_date(item['date_created']),
            'date_modified': parse_date(item['date_modified']),
            'date_published': parse_date(item['date_published']),
            'date_updated': parse_date(item['date_updated']),
        })
        last_time = min(last_time, item['date_updated'])
        doi = None
        for x in item['identifiers']:
            m = re.match(r'^https?://(?:dx\.)?doi\.org/(.*)$', x)
            if m:
                doi = m.group(1)
                break
        if not doi:
            # stop at the first result without a DOI
            break
        item['doi'] = doi
        item['osf_id'] = item['id']
        del item['id']
        if self.has_duplicate(where='Scraper_osf_org', query={'doi': doi}):
            continue
        has_new_paper = True
        self.save_article(item, to='Scraper_osf_org',
                          push_lowercase_to_meta=False)
    if has_new_paper and last_time > datetime(year=2020, month=1, day=1):
        params = self.post_params.copy()
        params['from'] = response.meta['from'] + len(data['hits']['hits'])
        yield JsonRequest(url=self.url, data=params,
                          callback=self.parse_results_list,
                          meta={
                              'dont_obey_robotstxt': True,
                              'from': params['from']
                          })
def parse(self, response):
    item = response.meta.get('item')
    self.zoneid = response.css('#hdZoneId::attr(value)').extract_first()
    self.excluid = response.css('#hdExcluId::attr(value)').extract_first()
    url = self.url.format(page=self.page, zoneid=self.zoneid,
                          excluid=self.excluid)
    yield JsonRequest(url, headers=self.headers, callback=self.parse_api,
                      errback=self.fail, dont_filter=True,
                      meta={'item': item})
def parse(self, response):
    endFlag = '0'
    body = json.loads(response.text)
    for each in body['data']['records']:
        item = GovinvestHunanItem()
        investDict = {}
        if 'approvalDate' not in each:
            continue
        createTime = each['createTime']
        recordDate = datetime.strptime(createTime, "%Y-%m-%d")
        print(recordDate)
        currDate = datetime.strptime(
            datetime.now().strftime("%Y-%m-%d"), "%Y-%m-%d")
        # print(currDate)
        yesterday = datetime.strptime(
            (datetime.today() + timedelta(-1)).strftime("%Y-%m-%d"),
            "%Y-%m-%d")
        # print(yesterday)
        if currDate == recordDate:
            print('currDate == recordDate')
            continue
        if yesterday > recordDate:
            print('yesterday > recordDate')
            endFlag = '1'
            continue
        pid = each['id']
        projectName = each['prjName']                    # project name
        projectCode = each['projectCode']                # project code
        approvalNum = each['approvalNum']                # approval document number
        fileGuid = each['fileGuid']                      # file id
        approvalDepartName = each['approvalDepartName']  # approval authority
        approvalDate = each['approvalDate']              # approval date
        investDict[u'发布日期'] = createTime              # publication date
        investDict[u'批复时间'] = approvalDate            # approval date
        investDict[u'项目名称'] = projectName             # project name
        investDict[u'项目代码'] = projectCode             # project code
        investDict[u'批复文号'] = approvalNum             # approval document number
        investDict[u'审批单位'] = approvalDepartName      # approval authority
        investDict[u'id'] = pid                          # project id
        investDict[u'附件地址'] = self.downloadLink.format(fileGuid=fileGuid)  # attachment URL
        item['dic'] = investDict
        yield item
    self.count += 1
    if self.count < 100 and endFlag == '0':
        print('go next page ------------------------------' + str(self.count))
        self.packet['pageIndex'] = self.count
        yield JsonRequest(self.start_urls[0], data=self.packet,
                          callback=self.parse)
def parse_item(self, response, domain, job_id, crawl_variations,
               lat='43.769037', lng='-79.371951'):
    self._referer_for_jsonrequest = response.request.url
    self._domain = domain
    self._job_id = job_id
    self._sku = utils.extract_sku_from_url(response.url, self._domain)
    if not self._sku:
        self.logger.exception(
            "[{}][null] Request ignored - no SKU".format(self._domain))
        raise IgnoreRequest
    if response.status != 200:
        # broken link or inactive item
        yield self.build_listing_item(response)
    else:
        _data = self.__get_preloaded_data_components(response)
        _meta_data = self.__extract_meta_data(response)
        self._parent_sku = _data.get('SkuSelectors', {}).get(
            'pCode', '{}P'.format(self._sku))
        if crawl_variations:
            _skus = list(
                _data.get('SkuSelectors', {}).get('skuListProperties', {}).keys())
        else:
            _skus = [self._sku]
        yield self.build_listing_item(response, data=_data,
                                      meta_data=_meta_data)
        yield JsonRequest(
            settings.CANADIANTIRE_CA_API_STORES_LINK_FORMAT.format(
                lat=lat, lng=lng, pid=self._parent_sku),
            callback=self.parse_near_stores,
            errback=parsers.resp_error_handler,
            meta={
                # avoid error - Crawled (503) <GET https://api-triangle.canadiantire.ca/robots.txt>
                'dont_obey_robotstxt': True,
            },
            headers={
                'Referer': self._referer_for_jsonrequest,
            },
            cb_kwargs={
                'skus': _skus,
            })
def start_requests(self):
    print('>>> post request.')
    return [
        JsonRequest(url="https://t.17track.net/restapi/track",
                    method='POST',
                    headers=self.custom_headers,
                    data={
                        'data': [{
                            'num': self.num,
                            'fc': 0,
                            'sc': 0
                        }],
                        'guid': '',
                        'timeZoneOffset': -480
                    },
                    callback=self.after_post)
    ]
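# NOTE: a minimal sketch of the after_post spider method assumed above; the
# 17track endpoint returns JSON, but its exact schema is not shown in the
# source, so this only surfaces the raw payload.
import json

def after_post(self, response):
    payload = json.loads(response.text)
    self.logger.info('Track response: %s', payload)
    yield {'raw': payload}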
def parse(self, response):
    json_response = json.loads(response.text)
    self.reqs_number += 1
    # HeadlinesResponse is a placeholder for lists
    for json_list in json_response['HeadlinesResponse']:
        # the Summary object indicates whether there are any more
        # responses beyond a given date
        if 'Summary' in json_list:
            oldest_date_obj = min(
                json_list['Summary'],
                key=lambda x: x['CreateTimestamp']['Value'])
            doc_id = oldest_date_obj['DocumentIdUri'].split('/')[-1]
            oldest_res_date = oldest_date_obj['CreateTimestamp']['Value']
            yield {
                'HeadlinesResponse': json_response['HeadlinesResponse'],
            }
            # stop once max_requests or the configured end time is reached
            can_stop = (self.max_requests
                        and self.reqs_number >= self.max_requests) or (
                            self.end_time
                            and oldest_res_date <= self.end_time)
            if not can_stop:
                # update params for the next request
                query_params = self.start_query_params.copy()
                query_params['datetime'] = oldest_res_date
                query_params['direction'] = 'older'
                # This param seems to be useless, but the official website sends it.
                query_params['docid'] = doc_id
                yield JsonRequest(url=response.url, data=query_params,
                                  dont_filter=True, callback=self.parse)
            else:
                # start a new stock
                yield from self.start_new_stock(response)
        else:
            yield from self.start_new_stock(response)