def test_url_query_parameter_2(self):
    """
    This problem was seen several times in the feeds. Sometimes affiliate URLs contain
    a nested encoded affiliate URL with the direct URL as a parameter. For example:
    aff_url1 = 'http://www.tkqlhce.com/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EChildren%26%2339%3Bs+garden+furniture%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357023%2526langId%253D-1'
    the typical code to extract the needed URL from it is:
    aff_url2 = url_query_parameter(aff_url1, 'url')
    after this, aff_url2 is:
    'http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children's garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1'
    the direct URL extraction is:
    url = url_query_parameter(aff_url2, 'referredURL')
    but this will not work, because aff_url2 contains ' (an apostrophe, encoded as
    &#39; in the feed) and the URL extraction will fail. The current workaround was
    made in the spider: just replace ' with %27.
    """
    return  # FIXME: this test should pass but currently doesn't

    # correct case
    aff_url1 = "http://www.anrdoezrs.net/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EGarden+table+and+chair+sets%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357199%2526langId%253D-1"
    aff_url2 = url_query_parameter(aff_url1, 'url')
    self.assertEqual(aff_url2, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Garden table and chair sets&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357199%26langId%3D-1")
    prod_url = url_query_parameter(aff_url2, 'referredURL')
    self.assertEqual(prod_url, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay?storeId=10001&catalogId=1500001501&productId=1500357199&langId=-1")

    # weird case
    aff_url1 = "http://www.tkqlhce.com/click-2590032-10294381?url=http%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FArgosCreateReferral%3FstoreId%3D10001%26langId%3D-1%26referrer%3DCOJUN%26params%3Dadref%253DGarden+and+DIY-%3EGarden+furniture-%3EChildren%26%2339%3Bs+garden+furniture%26referredURL%3Dhttp%3A%2F%2Fwww.argos.co.uk%2Fwebapp%2Fwcs%2Fstores%2Fservlet%2FProductDisplay%253FstoreId%253D10001%2526catalogId%253D1500001501%2526productId%253D1500357023%2526langId%253D-1"
    aff_url2 = url_query_parameter(aff_url1, 'url')
    self.assertEqual(aff_url2, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ArgosCreateReferral?storeId=10001&langId=-1&referrer=COJUN&params=adref%3DGarden and DIY->Garden furniture->Children's garden furniture&referredURL=http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay%3FstoreId%3D10001%26catalogId%3D1500001501%26productId%3D1500357023%26langId%3D-1")
    prod_url = url_query_parameter(aff_url2, 'referredURL')
    # fails, prod_url is None now
    self.assertEqual(prod_url, "http://www.argos.co.uk/webapp/wcs/stores/servlet/ProductDisplay?storeId=10001&catalogId=1500001501&productId=1500357023&langId=-1")
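# A minimal sketch of the workaround mentioned in the docstring above: re-encode the
# apostrophe (decoded from &#39; in the feed) as %27 before extracting the nested
# parameter. The helper name `extract_referred_url` is hypothetical, not part of w3lib.
from w3lib.url import url_query_parameter

def extract_referred_url(aff_url):
    # An unescaped apostrophe in the query string breaks the extraction,
    # so quote it back before looking up 'referredURL'.
    return url_query_parameter(aff_url.replace("'", "%27"), 'referredURL')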
def parse(self, response): self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO) # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO) if response.status / 100 != 2: return res_data = json.loads(response.body) if res_data["retcode"] != "0" and res_data["retcode"] != 0: self.log("Crawled %s %s" % (response.url, res_data["retcode"]), level=scrapy.log.CRITICAL) return bid = url_query_parameter(response.url, 'bid') start = url_query_parameter(response.url, 'start') min_time = time.time() for item in res_data["result"]["posts"]: url = "http://buluo.qq.com/p/detail.html?bid=%s&pid=%s" % (bid, item["pid"]) yield self.baidu_rpc_request({"url": url, "src_id": 22}) if int(item["time"]) < min_time: min_time = int(item["time"]) start = int(start) if "total" in res_data["result"] and int( res_data["result"]["total"]) > start + 20 and time.time() - min_time < 3600 * 24 * 2: next_url = 'http://buluo.qq.com/cgi-bin/bar/post/get_post_by_page?bid=%s&num=20&start=%s&bkn' % ( bid, start + 20) self.log("SendCrawl %s Total:%d" % (next_url, int(res_data["result"]["total"])), level=scrapy.log.INFO) yield scrapy.Request(url=next_url, headers={"Referer": "http://buluo.qq.com/p/barindex.html?bid=%s" % bid})
def test_url_query_parameter(self):
    self.assertEqual(url_query_parameter("product.html?id=200&foo=bar", "id"), '200')
    self.assertEqual(url_query_parameter("product.html?id=200&foo=bar", "notthere", "mydefault"), 'mydefault')
    self.assertEqual(url_query_parameter("product.html?id=", "id"), None)
    self.assertEqual(url_query_parameter("product.html?id=", "id", keep_blank_values=1), '')
def get_date_from_url(self, url):
    year = url_query_parameter(url, "year")
    month = url_query_parameter(url, "mon")
    day = url_query_parameter(url, "day")
    # url_date = datetime(year=int(year), month=int(month), day=int(day), tzinfo=timezone(self.tz))
    url_date = datetime(year=int(year), month=int(month), day=int(day))
    # print(url_date)
    return url_date
def parse_product_list(self, response):
    hxs = HtmlXPathSelector(response)
    categories = hxs.select('//li[@class="PANEL ALL"]//a/@href').extract()
    categories += hxs.select('//li[@class="PANEL BY-SIZE"]//a/@href').extract()
    categories += hxs.select('//li[@class="PANEL BY-TYPE"]//a/@href').extract()
    for url in categories:
        url = url_query_cleaner(response.urljoin(url))
        yield Request(url, callback=self.parse_product_list)

    products = hxs.select('//div[@id="pdList"]//a/@href').extract()
    products += hxs.select('//div[@class="product-tile"]//a/@href').extract()
    for url in products:
        pid = url.split('_')[-1]
        if pid not in self.parsed_products:
            self.parsed_products.append(pid)
            url = url_query_cleaner(response.urljoin(url))
            yield Request(url, callback=self.parse_product)

    product_variants = hxs.select('//div[@class="productVariantTypeOptions"]/a/@href').extract()
    for url in product_variants:
        self.log('productVariantTypeOptions! {}'.format(url))
        pid = url.split('_')[-1]
        if pid not in self.parsed_products:
            self.parsed_products.append(pid)
            url = url_query_cleaner(response.urljoin(url))
            yield Request(url, callback=self.parse_product)

    next_page = None
    cur_page = url_query_parameter(response.url, 'pi', None)
    if cur_page:
        # The spider is already crawling the pages; we just assign the current URL
        # so we can increment the 'pi' argument.
        next_page = response.url
    else:
        # First page of the product list; we extract the pagination URL with a regex.
        next_page = re.findall('.get\( "(.*)pi=', response.body)
        if next_page:
            next_page = response.urljoin(next_page[0])
    if (next_page and products != response.meta.get('products', [])) or (
            next_page and product_variants != response.meta.get('product_variants', [])):
        cur_page = url_query_parameter(next_page, 'pi', '1')
        url = add_or_replace_parameter(next_page, 'pi', str(int(cur_page) + 1))
        self.log('Goes to next page: ' + url)
        yield Request(url, callback=self.parse_product_list, meta={
            'products': products,
            'product_variants': product_variants,
        })
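# The pagination step above boils down to a small, reusable pattern: read the current
# page index from the URL and re-request with the index incremented. A minimal sketch,
# assuming w3lib's url_query_parameter and add_or_replace_parameter; `next_page_url`
# is a hypothetical helper, not part of the spider above.
from w3lib.url import url_query_parameter, add_or_replace_parameter

def next_page_url(url, param='pi', first_page='1'):
    # Default to page 1 when the parameter is absent, then bump it by one,
    # e.g. next_page_url('http://example.com/list?pi=3') -> 'http://example.com/list?pi=4'
    cur_page = url_query_parameter(url, param, first_page)
    return add_or_replace_parameter(url, param, str(int(cur_page) + 1))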
def get_date_from_url(url):
    day = url_query_parameter(url, u"fecha_dia")
    month = url_query_parameter(url, u"fecha_mes")
    year = url_query_parameter(url, u"fecha_anio")
    page_date = {
        u"day": int(day),
        u"month": int(month),
        u"year": int(year),
    }
    return page_date
def parse_comments(self, response):
    items = []
    # the first item is not a comment
    for athing_sel in response.xpath('//tr[@class="athing"]')[1:]:
        comment_item = CommentItem()
        comment_item['hacker_news_item'] = url.url_query_parameter(response.url, "id")
        comment_item['nesting_level'] = int(
            athing_sel.xpath(".//td[@class='ind']/img/@width").extract_first())
        comment_item['text'] = "\n".join(
            athing_sel.xpath(
                ".//td[@class='default']//span[@class='comment']//text()").extract()[1:-6])
        comment_item['user_name'] = athing_sel.xpath(
            ".//td[@class='default']//span[@class='comhead']/"
            "a[starts-with(@href, 'user')]/text()").extract_first()
        comment_item['id_'] = athing_sel.xpath(
            ".//td[@class='default']//a[starts-with(@href, 'item')]/@href").extract_first()
        comment_item['hacker_news_item'] = response.url
        items.append(comment_item)
    self.fill_parents(items)
    for item in items:
        yield item
def getPrice(self, url):
    # check if it's a Headphone Zone link
    if self.isHeadphonezoneLink(url):
        # remove references
        # this is done so that we save only the product url in database
        # url = self.cleanURL(url)
        price = {}
        res = requests.get(url, headers=self.headers)
        # print(res.content)
        soup = BeautifulSoup(res.content, 'html5lib')
        scripts = None
        try:
            scripts = soup.findAll('script', type='application/ld+json')
            for script in scripts:
                script = json.loads(script.text)
                if 'offers' in script:
                    scripts = script
            if not isinstance(scripts, dict):
                scripts = None
        except AttributeError:
            return None
        finally:
            price['currency'], price['regular'], price['title'] = self.filterPrice(
                scripts, url_query_parameter(url, 'variant'))
            return price
def run_crawl_all(self, response):
    print(' --- run_crawl_all --- ')
    t = datetime.datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
    next_offset = int(url_query_parameter(response.url, 'offset')) + 10
    list_parse_res = list_parse(eval(response.body.decode()))
    list_db_data = list_into_dbdata(list_parse_res, self.task['task_biz_enname'],
                                    self.task['task_biz_chname'], self.task['_id'])
    # reached the end, or an error occurred
    if not list_db_data:
        self.task['task_status'] = 'end_success'
        print('about to exit')
    else:
        res = mongo_instance.loads.insert_many(list_db_data)
        if self.crawled_times == 1:
            print(' the first inserted id is: %s' % res.inserted_ids[0])
            self.task['task_start_loadid'] = res.inserted_ids[0]
        self.crawled_times += 1
        print('there are still requests pending, not exiting yet')
    self.task['task_updatetime'] = t
    self.task['task_endtime'] = t
    mongo_instance.tasks.find_one_and_update(
        filter={'_id': self.task['_id']},
        update={'$set': self.task})
    if 'running' not in self.task['task_status']:
        return
    else:
        yield scrapy.Request(url=add_or_replace_parameter(response.url, 'offset', next_offset),
                             headers=FakeLoadParams.headers,
                             method='GET')
def getAccessToken(link, user_id, password, birthyear, userprofile):
    code = None
    # try:
    r = requests.get(link, allow_redirects=True)
    # print(r.text.encode('utf-8'))
    # open the login page in a stateful browser session
    browser = mechanicalsoup.StatefulBrowser()
    browser.open(link)
    # fill in the login form
    form = browser.select_form()
    browser["apiKey"] = userprofile.api_key
    browser["username"] = user_id
    browser["password"] = password
    browser["password2fa"] = birthyear
    resp = browser.submit_selected()
    print(resp)
    if resp.status_code == 200:
        # print '200'
        page = browser.get_current_page()
        # print page
        res = page.find("p", class_="error-msg")
        if res is not None:
            raise RuntimeError(res.getText())
        form1 = browser.select_form()
        browser.get_current_form().choose_submit_by_value('Accept')
        resp = browser.submit_selected()
        print(resp)
        code = url_query_parameter(resp.url, 'code')
        # print(code)
    return code
def parse(self, response): print("procesing:" + response.url) # extract links featured_urls = response.css( 'section[class="featured-articles__wrapper___1dxwZ"] a::attr(href)' ).extract() trending_urls = response.css( 'section[class="trending-stories__wrapper___1KLqW"] a::attr(href)' ).extract() promoted_urls = response.css( 'section[class="promoted-stories__section___zvI1g"] a::attr(href)' ).extract() topic_urls = response.css( 'section[class="topics-river__wrapper___2CozB"] a::attr(href)' ).extract() all_urls = featured_urls + trending_urls + promoted_urls + topic_urls all_urls = [i for i in all_urls if '/article/' in i] all_urls = list(set(all_urls)) for url in all_urls: yield scrapy.Request( url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def parse_listing(self, response):
    """ Extract product list.

    @url https://www.walgreens.com/store/c/eyes/ID=360457-tier3
    @returns requests 1
    """
    blob = response.css('script').re_first(r'__APP_INITIAL_STATE__ = (\{.+\});')
    if not blob:
        return
    data = json.loads(blob)
    if not data['searchResult'].get('productList'):
        return
    for each in data['searchResult']['productList']:
        yield response.follow(each['productInfo']['productURL'],
                              callback=self.parse_product)
    limit = response.meta.get('limit', 24)
    offset = int(url_query_parameter(response.url, 'No', 0)) + limit
    # yield (not return) so the pagination request is actually scheduled
    yield response.follow(
        add_or_replace_parameter(response.url, 'No', offset),
        callback=self.parse_listing,
        meta={'offset': offset, 'limit': limit})
def color_ids(self, response, pid):
    colour_urls = response.css(self.css.colour_link_css).extract() + [response.url]
    return [
        url_query_parameter(color_url, 'dwvar_{}_color'.format(pid))
        for color_url in colour_urls
    ]
def parse_category(self, response):
    category = response.css('li.last::text').extract()
    products = response.xpath('//div[@typeof="Product"]')
    for product in products:
        loader = ProductLoader(Product(), selector=product)
        loader.add_xpath('identifier', './/*[@property="url"]/@sku')
        url = product.xpath('.//*[@property="url"]/@href').extract_first()
        loader.add_value('url', response.urljoin(url))
        loader.add_xpath('name', './/*[@property="url"]/text()')
        loader.add_xpath('price', './/*[@property="price"]/text()')
        loader.add_xpath('sku', './/*[@property="url"]/@sku')
        loader.add_xpath('category', '//li[@typeof="v:Breadcrumb"]/a/text()')
        loader.add_value('category', category)
        loader.add_xpath('image_url', './/*[@property="image"]/@content')
        if loader.get_output_value('price') < 50:
            loader.add_value('shipping_cost', '9.95')
        if product.xpath('.//button[starts-with(@id, "outOfStock")]'):
            loader.add_value('stock', 0)
        yield loader.load_item()

    if url_query_parameter(response.url, 'pn') or re.search('/cat_.+/.', response.url):
        return
    filters = response.css('ul.filters input::attr(id)').re('^\S{5}$')
    for filt in filters:
        url = response.url + '/' + filt
        yield Request(url, self.parse_category)
def parse(self, response): print("procesing:" + response.url) # extract links top_links = response.css('.mainStory::attr(href)').extract() moretop_links = response.css( 'div[class="assetBody riverPost"] > a::attr(href)').extract() subtop_links = response.css( 'a[class="related content_article"]::attr(href)').extract() righttop_links = response.css( 'div[class="assetBody dekRight riverPost"] > a::attr(href)' ).extract() latest_links = response.css('h3 > .assetHed::attr(href)').extract() all_links = top_links + moretop_links + subtop_links + righttop_links + latest_links all_links = [i for i in all_links if '/news' in i] all_links = list(set(all_links)) for link in all_links: url = self.base_url + link yield scrapy.Request( url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def parse(self, response): print("procesing:" + response.url) # extract links col1_links = response.css( 'div[class="column_one "] > ul > li > div > a::attr(href)' ).extract() col2_links = response.css( 'div[class="column_two"] > ul > li > div > a::attr(href)').extract( ) other_links = response.css( 'div[class="flex-feature | container container_column--desktop "] > a::attr(href)' ).extract() box_links = response.css( 'div[class="envelope-container"] a::attr(href)').extract() all_links = col1_links + col2_links + other_links + box_links all_links = list(set(all_links)) all_links = [i for i in all_links if '/story' in i] for link in all_links: url = self.base_url + link yield scrapy.Request( url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def parse(self, response): print("procesing:" + response.url) # extract links top_link = response.xpath( "//a[@class = 'gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-paragon-bold gs-u-mt+ nw-o-link-split__anchor']/@href" ).extract() other_links = response.css( 'a[class="gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"]::attr(href)' ).extract() list_links = response.css( 'a[class="qa-heading-link lx-stream-post__header-link"]::attr(href)' ).extract() # create urls all_links = top_link + other_links + list_links all_links = [i for i in all_links if '/news/' in i and '/av/' not in i] all_links = list(set(all_links)) for link in all_links: url = self.base_url + link yield scrapy.Request( url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def parse(self, response): print("procesing:" + response.url) # extract links top_links = response.css( 'div[class="sc-dnqmqq euQtQS pages__Row-r8po92-0 pages__TopContentRow-r8po92-1 fHFVhw"] a::attr(href)' ).extract() top_links = [ i for i in top_links if len(i) > 20 and '/authors' not in i ] other_links = response.css( 'div[class="sc-dnqmqq euQtQS pages__Row-r8po92-0 pages__CenterContentRow-r8po92-3 iZXSgS"] a::attr(href)' ).extract() other_links = [i for i in other_links if '/authors' not in i] all_links = top_links + other_links all_links = list(set(all_links)) for link in all_links: url = self.base_url + link yield scrapy.Request( url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def parse(self, response): print("procesing:"+response.url) # extract links all_links = response.css('.card__link::attr(href)').extract() for link in all_links: url = self.base_url + link yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def parse(self, response): print("procesing:"+response.url) # extract links all_urls = response.css('h3 > a::attr(href)').extract() all_urls = list(set(all_urls)) for url in all_urls: yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def extract_links(self, response):
    # NOTE: assumes page_no is present in the URL; int(None) would raise otherwise.
    page_no = url_query_parameter(response.url, 'page_no', None)
    if not response.css('.Result a'):
        return []
    return [
        Link(url=add_or_replace_parameter(response.url, 'page_no', int(page_no) + 1))
    ]
def parse_curr(self, resp):
    # print(resp.text)
    curr_val = resp.xpath('/html/body/div/div/div/h2/strong/text()').extract()
    # print(curr_val)
    if curr_val:
        # print(curr_val)
        curr_val = "".join(curr_val)
        curr_val = "".join(curr_val.split())
        try:
            curr_val = float(curr_val)
        except Exception:
            curr_val = None
    else:
        curr_val = None
    # print(curr_val)
    if curr_val:
        from_cur = url_query_parameter(resp.url, "primary")
        to_cur = url_query_parameter(resp.url, "secondary")
        corr_id = self.conn.get_corr_id(from_cur.upper(), to_cur.upper())
        # print(corr_id)
        if corr_id:
            item = TestSpiderItem()
            item["value"] = curr_val
            item["corr_id"] = corr_id
            yield item
    base_urls = self.conn.get_urls()
    for url in base_urls:
        yield SplashRequest(
            url=url,
            # url="https://ru.cryptonator.com/rates/convert/?amount=1&primary=btc&secondary=ltc&source=liverates",
            # callback=self.get_coocie,
            callback=self.parse_curr,
            endpoint="execute",
            cache_args=["lua_source"],
            args={"lua_source": script},
            meta={'dont_redirect': True},
            dont_filter=True)
def parse(self, response): print("procesing:"+response.url) # extract links all_urls = response.css('div[class="PageBuilder-col-9 PageBuilder-col"] a::attr(href)').extract() all_urls = [i for i in all_urls if '/2020/' in i] all_urls = list(set(all_urls)) for url in all_urls: yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def get_search_results_requests(self, response):
    sel = scrapy.Selector(response)
    for url in sel.css('h3.r a::attr(href)').extract():
        if 'google.com/url' in url:
            url = url_query_parameter(url, 'url')
            referer = urljoin(response.url, url)
        else:
            referer = 'https://www.google.com/url?url=%s' % url
        headers = {'Referer': referer}
        yield scrapy.Request(url, headers=headers)
def get_page(response):
    from_page = response.meta.get('from_page', None)
    if from_page:
        page = from_page + 1
    else:
        page = url_query_parameter(response.url, 'p', None)
        if page:
            page = str_to_int(page)
    return page
def parse(self, response): div = response.xpath( "/html/body/div[9]/div[2]/section/div[1]/ul//div[@class='small-card search-Result-Card col-lg-6 col-md-6 col-sm-12 col-xs-12']" ) urls = div.xpath("//div/div/div[1]/div[2]/a[@href]").get().split( "/")[3].split('-')[0], for code in urls: yield scrapy.Request( code, meta={'deltafetch_key': url_query_parameter(code, 'code')}, callback=self.parse_element)
def parse(self, response): print("procesing:"+response.url) # extract links all_links = response.css('.sdc-site-tile__headline > a::attr(href)').extract() all_links = [i for i in all_links if '/video' not in i] all_links = list(set(all_links)) for link in all_links: url = self.base_url + link yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def parse(self, response): print("procesing:"+response.url) # extract links all_links = response.css('h2[class="title"] > a::attr(href), h4[class="title"] > a::attr(href)').extract() all_links = [i for i in all_links if '/science' in i and 'pictures' not in i] all_links = list(set(all_links)) for link in all_links: url = self.base_url + link yield scrapy.Request(url, meta={'deltafetch_key': url_query_parameter(url, 'id')}, callback=self.parse_attr)
def get_pollutant_name(url):
    code_name = url_query_parameter(url, u"contaminante")
    name = None
    if code_name == u"1":
        name = u"co"
    elif code_name == u"2":
        name = u"no2"
    elif code_name == u"3":
        name = u"pm10"
    return name
def parse_options(self, response):
    data = json.loads(response.body)
    identifier = url_query_parameter(response.url, 'productId')
    sku = url_query_parameter(response.url, 'skuId')
    loader = ProductLoader(Product(), response=response)
    loader.add_value(None, response.meta['item'])
    loader.replace_value('identifier', '.'.join((identifier, sku)))
    loader.replace_value('sku', sku)
    loader.replace_value('name', data['skuName'])
    if not data['skuName'].endswith(data['size']) and \
            not data['skuName'].endswith(data['size'].replace(' ', '')):
        loader.add_value('name', data['size'])
    loader.replace_value('image_url', response.urljoin(data['thumbnail_url']))
    loader.replace_value('price', str(data['unit_sale_price']))
    loader.replace_value('stock', data['stock'])
    if Decimal(data['unit_sale_price']) < 20:
        loader.add_value('shipping_cost', '2.99')
    yield loader.load_item()
def parse(self, response):
    data = json.loads(response.text)
    usernames = [user['components']['username']['val']
                 for user in data['completions']]
    for username in usernames:
        yield scrapy.Request(
            url='https://keybase.io/{}'.format(username),
            callback=self.parse_profile,
        )
    # A full page of 100 completions means more matches may exist, so fan out
    # by appending each letter to the current query.
    if len(usernames) == 100:
        q = url_query_parameter(response.url, 'q')
        for c in ascii_lowercase:
            yield scrapy.Request(url=SEARCH_URL.format(q + c))
def parse_hotel(self, response):
    hxs = Selector(response)
    hotel = HtmlParser.extract_hotel(response.url, hxs)
    checkin = url_query_parameter(response.url, "checkin")
    checkout = url_query_parameter(response.url, "checkout")
    checkinDatetime = None
    checkoutDatetime = None
    today = datetime.date.today()
    if checkin is not None:
        checkinDatetime = datetime.datetime.strptime(checkin, "%Y-%m-%d").date()
        checkinDatetime = self.add_months(checkinDatetime, 1)
    else:
        checkinDatetime = datetime.date(today.year, today.month, 15)
    if checkout is not None:
        checkoutDatetime = datetime.datetime.strptime(checkout, "%Y-%m-%d").date()
        checkoutDatetime = self.add_months(checkoutDatetime, 1)
    else:
        checkoutDatetime = datetime.date(today.year, today.month, 16)
    maxDatetime = self.add_months(today, 18)
    if checkinDatetime < maxDatetime:
        url = url_query_cleaner(response.url)
        url = add_or_replace_parameter(url, "checkin", str(checkinDatetime))
        url = add_or_replace_parameter(url, "checkout", str(checkoutDatetime))
        # logging.warning('---------------------------- %s' % url)
        yield Request(url, callback=self.parse_hotel)
    yield hotel["hotel"]
    if len(hotel["rooms"]) > 0:
        for room in hotel["rooms"]:
            yield room
def parse_term(self, response):
    '''
    @url https://api.oyez.org/cases?filter=term:2014&page=0
    @returns requests 31 31
    @returns items 0 0
    '''
    cases = json.loads(response.body)
    results = []
    for case in cases:
        url = case['href']
        results.append(scrapy.Request(url=url, callback=self.parse_case))
    if len(cases) >= 30:
        page = int(url_query_parameter(response.url, 'page')) + 1
        results.append(scrapy.Request(url=self.term_url(page), callback=self.parse_term))
    return results
def parse(self, content):
    for href in content.css('.moment-table__more > a:nth-child(1)::attr("href")'):
        parts = {}
        source_url = content.urljoin(href.extract())
        parts['source_url'] = source_url
        parts['source_id'] = url_query_parameter(source_url, 'id')
        parts['source_name'] = self.name
        # <div class="">24.01.2016 um 17:00 Uhr • Peterpower</div>
        pub_date = content.css('.moment-table__datetime::text').extract_first().partition(' Uhr')[0]
        parts['time'] = datetime.strptime(pub_date, "%d.%m.%Y um %H:%M")
        request = scrapy.Request(source_url, callback=self.parse_item_page)
        request.meta['parts'] = parts
        yield request

    # and open the next page
    if os.getenv('SCRAPY_BVG_RECURSIVE', False):
        next_page = content.css('a.paging__control--next::attr("href")').extract_first()
        logging.log(logging.INFO, "Recursing into next page %s" % next_page)
        yield scrapy.Request(next_page, callback=self.parse)
def movie_page(self, response):
    hxs = Selector(response)
    item = ThreedmmcomItem()
    item['thread_url'] = response.url
    item['thread_id'] = url_query_parameter(response.url, 't')
    item['name'] = firstOrNone(hxs.select('//div[@class="bigusername"]/text()'))
    files = item['files'] = hxs.select('//a[@data-location]/@data-location').extract()
    if not files:
        return
    meta = item['meta'] = {}
    metadivs = response.xpath('//table[starts-with(@id,"post")]'
                              '//td[@class="alt1" and @width="125"]/div')
    for i, entry in enumerate(metadivs):
        name = firstOrNone(entry.css('div.smallfont').xpath('text()'))
        if name and i < len(metadivs) - 2:
            meta[name] = firstOrNone(metadivs[i + 1].xpath(
                './descendant-or-self::*[name() != "script" and name() != "style"]'
                '/text()[normalize-space()]'))
    version = response.xpath('//table[starts-with(@id,"post")]'
                             '//td[@class="alt1" and @width="125"]/table/tr[1]'
                             '//strong/text()').extract()
    if version:
        item['meta']['version'] = version[0].strip()
    return item