def logged_in(self, response):
    # Splash was redirected back to the login page, so the credentials failed.
    if response.url == 'https://accounts.pixiv.net/login':
        raise CloseSpider('username or password error!')
    yield SplashRequest(self.generate_search_url(), self.parse)
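A logged_in callback like this is normally chained from a login submission. The original login step is not shown; a minimal sketch of what it might look like with scrapy_splash.SplashFormRequest, using illustrative form-field names (pixiv_id, password) rather than anything taken from the original spider:

def login(self, response):
    # Hypothetical companion step: submit the login form through Splash and
    # let logged_in() check whether we landed back on the login page.
    yield SplashFormRequest.from_response(
        response,
        formdata={'pixiv_id': self.username, 'password': self.password},
        callback=self.logged_in,
    )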
def start_requests(self):
    for url in self.start_urls:
        # Uncomment below if you want a screenshot of the response:
        # yield SplashRequest(url, self.parse, endpoint='render.json',
        #                     args={"wait": 5, "png": 1, "render_all": 1})
        yield SplashRequest(url, self.parse, args={"wait": 5})
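If the commented-out render.json variant is used, the screenshot arrives base64-encoded inside the JSON payload rather than as the response body. A sketch of a matching callback (the method name and output filename are illustrative):

def parse_screenshot(self, response):
    # render.json responses are SplashJsonResponse objects; with png=1 the
    # .data dict carries the screenshot as a base64-encoded string.
    import base64
    with open('screenshot.png', 'wb') as f:
        f.write(base64.b64decode(response.data['png']))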
def start_requests(self):
    yield SplashRequest(self.start_urls[0],
                        callback=self.scrap_search_result_page,
                        args={'wait': 0.5})
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url=url,
                            callback=self.parse,
                            args={'headers': self.headers, 'wait': 10, 'timeout': 20},
                            encoding='utf-8')
def start_requests(self):
    yield SplashRequest(
        url="https://groceries.asda.com/shelf/health-beauty/hair-care/shampoo-conditioner/shampoo/103730",
        callback=self.parse,
        endpoint="execute",
        args={'lua_source': self.script})
def start_requests(self):
    yield SplashRequest(
        url='http://www.rslvic.com.au/rsl-network/victorian-map-of-all-branches/',
        callback=self.parse,
    )
def start_requests(self):
    print("lua script - " + self.expand_and_scroll_lua)
    for url in self.start_urls:
        yield SplashRequest(url, self.parse, args={'wait': 0.5})
def go_to_listings(self, response):
    listings_url = response.xpath('//*[@id="menu2"]/ul/li[4]/a/@href').extract_first()
    yield SplashRequest(url=response.urljoin(listings_url),
                        callback=self.parse,
                        args={'wait': 0.5, 'timeout': 60})
def start_requests(self):
    yield SplashRequest(self.start_url, args={"images": 0, "wait": 3})
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, self.parse, endpoint='execute',
                            args={'lua_source': script})
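The module-level `script` variable referenced here is not shown. For the 'execute' endpoint, args['lua_source'] must be a Splash Lua script whose main function returns whatever the callback should receive; a minimal wait-and-return-HTML sketch (not the original author's script):

script = """
function main(splash, args)
    assert(splash:go(args.url))
    assert(splash:wait(1.0))
    return splash:html()
end
"""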
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, callback=self.login)
def splash_request(self, request):
    return SplashRequest(url=request.url,
                         callback=self.parse_conference,
                         args={'wait': 2})
def modify_realtime_request(self, request):
    user_url_input = request.meta["url"]
    return SplashRequest(user_url_input, self.parse,
                         args={'lua_source': self.script},
                         endpoint='execute')
def start_requests(self):
    yield SplashRequest(
        url='http://globoesporte.globo.com/mg/futebol/'
            'copa-do-brasil/jogo/27-09-2017/cruzeiro-flamengo/',
        callback=self.parse,
        args={'wait': 2}
    )
def adidas_parse(self, response):
    products = response.xpath('//*[@id="hc-container"]/div')
    for product in products:
        # Skip products that carry a "coming soon" tag; scrape the rest.
        tag = product.xpath("./div[2]/div[3]/div[2]/span/text()").extract_first()
        if tag is None or "coming soon" not in tag.lower():
            sneaker = Sneaker()
            root_url = "https://www.adidas.com"
            data = product.xpath("./div/@data-context").extract_first()
            # Name
            m = re.search('name:(.*);', data)
            sneaker["name"] = m.group(1)
            # Model
            m = re.search('model:(.*)', data)
            description = 'Model: ' + m.group(1)
            # Id
            m = re.search('id:(.*);name', data)
            description += ' ID: ' + m.group(1)
            sneaker["description"] = description
            sneaker["image"] = product.xpath(
                "./div[2]/div[3]/div[3]/a/img[1]/@data-original").extract_first()
            sneaker["currency"] = product.xpath(
                "./div[2]/div[3]/div[4]/div[4]/div/span[1]/text()").extract_first().strip()
            sneaker["price"] = product.xpath(
                "./div[2]/div[3]/div[4]/div[4]/div/span[2]/text()").extract_first().strip()
            url = product.xpath("./div[2]/div[3]/div[3]/a/@href").extract_first()
            sneaker["url"] = root_url + url
            sneaker["tag"] = 'adidas'
            yield sneaker
    # Paginate: each page lists 120 products.
    self.page += 120
    if products:
        next_page = "http://www.adidas.com/us/men-shoes?sz=120&start=" + str(self.page)
        # With proxy
        if self.settings.get('ADIDAS_PROXY_ENABLED'):
            yield SplashRequest(next_page, self.adidas_parse,
                                headers=self.adidas_headers(),
                                args={'images_enabled': 'false',
                                      'proxy': self.random_proxy()})
        # Without proxy
        else:
            yield SplashRequest(next_page, self.adidas_parse,
                                headers=self.adidas_headers(),
                                args={'images_enabled': 'false'})
def start_requests(self):
    yield SplashRequest(url=self.url + '/sapi/category/getAllBrand',
                        callback=self.parse,
                        meta={'splash': {'endpoint': 'render.html'}})
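SplashRequest is itself a convenience wrapper that fills in meta['splash'] on a plain scrapy.Request, so the snippet above mixes the two levels. An equivalent request written entirely at the lower level might look like this (the wait value is illustrative):

def start_requests(self):
    # Same effect without the SplashRequest helper: splash options are
    # plain request meta that the scrapy-splash middleware picks up.
    yield scrapy.Request(self.url + '/sapi/category/getAllBrand',
                         callback=self.parse,
                         meta={'splash': {'endpoint': 'render.html',
                                          'args': {'wait': 0.5}}})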
def start_requests(self):
    urls = list()
    non_shopify_list = list()
    bots_list = list()
    # Get all urls to scrape
    with open(os.path.dirname(__file__) + self.url_file, "rt") as f:
        urls = [url.strip() for url in f.readlines()]
    # Supported non-shopify sites list
    with open(os.path.dirname(__file__) + self.non_shopify_file, "rt") as f:
        non_shopify_list = [url.strip() for url in f.readlines()]
    # Supported bots sites list
    with open(os.path.dirname(__file__) + self.bots_file, "rt") as f:
        bots_list = [url.strip() for url in f.readlines()]
    for url in urls:
        t = tldextract.extract(url)
        root = t.domain + '.' + t.suffix
        proxy_enabled = self.settings.get('PROXY_ENABLED')
        adidas_proxy_enabled = self.settings.get('ADIDAS_PROXY_ENABLED')
        # Adidas site (uses scrapy-splash)
        if "adidas.com" in url:
            # With proxy
            if adidas_proxy_enabled:
                yield SplashRequest(url, self.adidas_parse,
                                    headers=self.adidas_headers(),
                                    args={'images_enabled': 'false',
                                          'proxy': self.random_proxy()})
            # Without proxy
            else:
                yield SplashRequest(url, self.adidas_parse,
                                    headers=self.adidas_headers(),
                                    args={'images_enabled': 'false'})
        # Non-shopify site
        elif any(root in s for s in non_shopify_list):
            # With proxy
            if proxy_enabled:
                yield scrapy.Request(url, self.non_shoify,
                                     meta={'proxy': self.random_proxy()})
            # Without proxy
            else:
                yield scrapy.Request(url, self.non_shoify)
        # Bots
        elif any(root in s for s in bots_list):
            # With proxy
            if proxy_enabled:
                yield scrapy.Request(url, self.bots_parse,
                                     meta={'proxy': self.random_proxy()})
            # Without proxy
            else:
                yield scrapy.Request(url, self.bots_parse)
        # Shopify sites
        else:
            # With proxy
            if proxy_enabled:
                yield scrapy.Request(url, self.shopify_parse,
                                     meta={'proxy': self.random_proxy()})
            # Without proxy
            else:
                yield scrapy.Request(url, self.shopify_parse)
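Note how the proxy is wired differently in the two paths above: a plain scrapy.Request is proxied by Scrapy's own downloader via meta['proxy'], while a SplashRequest must pass the proxy in the Splash args, because the actual fetch happens inside the Splash service. Schematically (the proxy URL is a placeholder):

# Plain Scrapy: the downloader honours meta['proxy'].
yield scrapy.Request(url, meta={'proxy': 'http://host:port'})

# Through Splash: the fetch happens inside the Splash service,
# so the proxy goes into the Splash args instead.
yield SplashRequest(url, args={'proxy': 'http://host:port'})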
def start_requests(self):
    sampleURL = 'https://www.fahasa.com/sach-trong-nuoc/van-hoc-trong-nuoc/page/'
    for i in range(0, self.numOfPage):
        yield SplashRequest(sampleURL + str(i + 1) + '.html', self.parse,
                            args={"wait": 5})
def start_requests(self):
    yield SplashRequest(
        url='https://angel.co/companies?locations[]=1688-United+States&tab=hiring&stage[]=Series+A&stage[]=Series+B&stage[]=Series+C',
        callback=self.parse,
    )
def start_requests(self):
    urls = ['http://stock.qq.com/l/stock/ywq/list20150423143546.htm']
    for url in urls:
        yield SplashRequest(url=url, callback=self.parse, args={'wait': 0.5})
def parse(self, response):
    # This callback determines whether the selected menu is at the top of
    # the list. If it is, it stores the menu urls and keeps going; if it is
    # not, it runs the Lua script to prepare the page for scraping, and
    # then scrapes it.
    url = response.url
    menu = response.css(".category-filter__link")
    print("processing response.url - " + response.url)
    if len(menu) > 0 and menu[0].css('[aria-current="page"]'):
        # The top page is active, so scrape the links of every menu entry
        # and store them for later parsing.
        print(f"inside menu page for url - {url}")
        menu_name = menu[0].css('.category-filter__text ::text').get()
        for item in menu:
            heading = item.css('.category-filter__text ::text').get()
            scraped_url = self.base_url + item.css('::attr(href)').get()
            section = menu_name
            subsection = heading
            category = lookup_category("", section, subsection)
            store_url(self.conn, scraped_url, self.store_id, category,
                      section, subsection)
    elif len(menu) == 0:
        inspect_response(response, self)
    else:
        # We are on a subpage, so now we can start scraping.
        GROCERY_SELECTOR = '.grid-item'
        NAME_SELECTOR = '.small-type.detail-card-description ::text'
        PRICE_SELECTOR = '.price ::text'
        PRICE_PER_UNIT_SELECTOR = '.sub-headline.detail-card-subtext ::text'
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[0]
        subsection = metadata[1]
        print("subpage - scraping " + url + ", from section - " + section)
        for grocery in response.css(GROCERY_SELECTOR):
            self.name = grocery.css(NAME_SELECTOR).extract_first()
            self.price = grocery.css(PRICE_SELECTOR).extract_first()
            if self.price is not None:
                self.price = self.price.replace('*', '').replace('$', '')
            self.ppu = grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first()
            if self.ppu is not None:
                self.ppu = convert_ppu(self.ppu)
            yield {
                'name': self.name,
                'price': self.price,
                'price-per-unit': self.ppu,
                'section': section,
                'subsection': subsection,
                'url': response.url,
            }
    # Mark this url as done and chain to the next stored url, if any.
    finish_url(self.conn, self.store_id, url)
    print("finishing url - " + url)
    next_url = get_next_url(self.cursor, 1)
    if next_url is not None:
        print("got next_url - " + next_url)
        yield SplashRequest(next_url, self.parse,
                            endpoint='execute',
                            dont_filter=True,
                            args={'lua_source': self.expand_and_scroll_lua})
    else:
        print("Next url is none, therefore we must be finished!")
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url)
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, self.parse, endpoint='render.html',
                            args={'wait': 5})
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, endpoint="render.html", callback=self.parse)
def get_request(self, url):
    return SplashRequest(url=url,
                         endpoint='execute',
                         cache_args=['lua_source'],
                         args={'lua_source': self.lua_script},
                         cb_kwargs={'provider': self})
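cache_args=['lua_source'] tells scrapy-splash to send the Lua script to Splash only once and reference it by hash on later requests, which keeps a large script out of every request body. This only works when the deduplication spider middleware from the scrapy-splash README is enabled; a sketch of the relevant settings:

# settings.py -- wiring recommended by the scrapy-splash README.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
SPLASH_URL = 'http://localhost:8050'  # adjust to your Splash instance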
def start_requests(self):
    for page in range(1, 11):
        url = self.base_url % page
        yield SplashRequest(url=url, args={'wait': 3, 'images': 0})
def start_requests(self):
    for title in self.start_urls:
        url = self.start_urls[title]
        # Identify the stock area, like HK, US, SAR, etc.
        if 'sau' in title.lower():
            stock_area = 'SAU'
            stock_come = 'SAU'
            yield SplashRequest(url, endpoint='execute',
                                args={'lua_source': self.lua_extract_page,
                                      'images': 0,
                                      'timeout': self.rendering_page_timeout + 30},
                                callback=self.extract_page,
                                meta={'stock_area': stock_area,
                                      'stock_come': stock_come})
        elif 'hk' in title.lower():
            stock_area = 'HK'
            stock_come = 'CN'
            if 'sp' not in title.lower():
                yield SplashRequest(url, endpoint='execute',
                                    args={'lua_source': self.lua_extract_page,
                                          'images': 0,
                                          'timeout': self.rendering_page_timeout},
                                    callback=self.parse_page_num,
                                    meta={'stock_area': stock_area,
                                          'stock_come': stock_come})
            else:
                # Extract the page directly; get the stock_id first.
                stock_id = url.split('/')[-1].split('.')[0]
                yield SplashRequest(url, endpoint='execute',
                                    args={'lua_source': self.lua_extract_page,
                                          'images': 0,
                                          'timeout': self.rendering_page_timeout},
                                    callback=self.extract_page,
                                    dont_filter=True,
                                    meta={'stock_name': 'None',
                                          'stock_id': stock_id,
                                          'stock_area': stock_area,
                                          'stock_come': stock_come})
        elif 'us_chinese' in title.lower():
            stock_area = 'US'
            stock_come = 'CN'
            yield SplashRequest(url, endpoint='execute',
                                args={'lua_source': self.lua_extract_page,
                                      'images': 0,
                                      'timeout': self.rendering_page_timeout},
                                callback=self.parse_page_num,
                                meta={'stock_area': stock_area,
                                      'stock_come': stock_come})
        elif 'united_states' in title.lower():
            stock_area = 'US'
            stock_come = 'US'
            for i in range(1, self.united_states_pages + 1):
                real_lua_source = self.lua_United_states_pages.format(i)
                yield SplashRequest(url, endpoint='execute',
                                    args={'lua_source': real_lua_source,
                                          'images': 0,
                                          'timeout': self.rendering_page_timeout},
                                    callback=self.parse,
                                    meta={'stock_area': stock_area,
                                          'stock_come': stock_come})
        elif 'hs_a' in title.lower():
            stock_area = 'CN'
            stock_come = 'CN'
            for i in range(1, self.hsa_default_pages + 1):
                real_lua_source = self.lua_HSA_pages.format(i)
                yield SplashRequest(url, endpoint='execute',
                                    args={'lua_source': real_lua_source,
                                          'images': 0,
                                          'timeout': self.rendering_page_timeout},
                                    callback=self.parse,
                                    dont_filter=True,
                                    meta={'stock_area': stock_area,
                                          'stock_come': stock_come})
def start_requests(self):
    for url in self.start_url:
        yield SplashRequest(url, callback=self.parse)
def start_requests(self):
    for url in self.start_urls:
        yield SplashRequest(url, self.parse, args={'wait': 0.5})
def start_requests(self):
    setting = self.settings
    page = setting['CRAWL_PAGE']
    for p in range(1, page + 1):
        yield SplashRequest(self.generate_search_url(page=p), self.parse)