def parse(self, response: TextResponse):
    latest_articles = response.css(
        'main#content article[class*="expanded-feature"] *[class*="title"] a'
    )
    more_stories = response.css('ol[class*="list"] *[class*="title"] a')
    yield from response.follow_all(
        chain(latest_articles, more_stories), callback=self.parse_news)
def parse(self, response: TextResponse, **kwargs):
    if self.n_pages == self.max_pages:
        return
    self.n_pages += 1
    links = response.css("#recent p.title > a::attr(href)").extract()
    if links:
        # listing page: follow every rental link that is not already stored
        for link in links:
            if self.db.rentals.find({"_id": link}, {"_id": 1}).count() == 0:
                yield response.follow(link, callback=self.parse)
    else:
        # ad page: parse it and persist the result
        ad = parse_ad_page_html(response.body)
        with contextlib.suppress(DuplicateKeyError):
            self.db.rentals.insert_one(asdict(ad))
        yield ad
    next_page = response.css(".next a")
    if next_page:
        yield response.follow(next_page[0], callback=self.parse)
def run():
    response = requests.get("https://en.wikipedia.org/wiki/Quantum_mechanics")
    response.raise_for_status()
    response = TextResponse(
        body=response.content,
        url="https://en.wikipedia.org/wiki/Quantum_mechanics")
    response.css('div.printfooter > a::text').extract_first()
    soup = BeautifulSoup(response.text, 'html.parser')
    a_tags = soup.findAll('a')
    s = []
    for tag in a_tags:
        # print(len(tag.contents))
        s.append(tag.get('href', None))
    wikis = []
    for i in range(len(s)):
        if re.search(r'^/wiki/.+', str(s[i])) or re.search(
                r'^https://en.wikipedia.org.+', str(s[i])):
            continue
        else:
            if re.search(r'^https://.+', str(s[i])):
                wikis.append(s[i])
    del wikis[2]
    print(wikis)
def jobs_scraper(url):
    page = requests.get(url)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    companies_name = response.xpath(
        "//p[@class='job_list_company_title']/text()").extract()
    vac_name = response.xpath("//p[@class='font_bold']/text()").extract()
    base_url = "https://staff.am"
    urls = response.xpath(
        "//div[@class='web_item_card hs_job_list_item']/a/@href").extract()
    vacs_url = [base_url + i for i in urls]
    deadline1 = response.css(
        "div[class='job-inner job-list-deadline'] > p:not([class='job_location'])")
    deadline2 = [i.css('::text').extract()[1] for i in deadline1]
    deadline = [i.replace('\n', " ") for i in deadline2]
    location1 = response.css(
        "div[class='job-inner job-list-deadline'] > p[class='job_location']")
    location2 = [i.css("::text").extract()[1] for i in location1]
    location = [i.replace('\n', "").strip() for i in location2]
    return pd.DataFrame({
        "Companies": companies_name,
        "Vacancies": vac_name,
        'Links': vacs_url,
        "Deadline": deadline,
        'Location': location
    })
def parse(self, response: TextResponse):
    # getting data for the ProductData object
    r_url = response.url
    r_page = response.text
    r_time = datetime.now()
    print("scraping for productData: {}".format(r_url))
    dct = product_data_html_parse(response.css(".prodtable").get())
    remove = False
    try:
        # get price
        r_price = dct["RSP"]
        r_price = "".join(
            re.findall(r"([\d,.])", r_price))  # use regex to remove the currency symbol
        # get brand
        r_brand = dct['Brand']
        # get item name
        r_itemname = response.css(
            ".productpagedetail-inner .prodname::text").get()
        # get item size
        r_size = dct['Pack Size']
        # r_size = "".join(re.findall(r"([\d,.])", r_size))
    except KeyError as e:
        print(e)
        print("Missing data, remove")
        remove = True
    # since product and productdata have a 1-to-1 relationship, the url of product and productdata is the same;
    # iterate through the product table, find the matching url and create the ProductData SQLAlchemy object
    # linked to the Product object
    Product_table = crawlProduct.dbSession.query(Product).all()
    for i in Product_table:
        # if there is a matching url, we found the matching product
        if i.url == r_url:
            # check if any of the scraped data has None values; if true, delete the product entry
            if remove:
                crawlProduct.dbSession.query(Product).filter(
                    Product.id == i.id).delete()
            else:
                product_data_object = ProductData(
                    url=r_url,
                    html=r_page,
                    date=r_time,
                    price=r_price,
                    brand=r_brand,
                    itemName=r_itemname,
                    size=r_size,
                    product=i,
                )
                crawlProduct.dbSession.add(product_data_object)
    crawlProduct.dbSession.commit()
def parse(self, response):
    links = []
    for url in self.start_urls:
        time.sleep(3)
        self.driver.get(url)
        i = 0
        while i < 60:
            next = self.driver.find_element_by_class_name('load-more')
            try:
                next.click()
            except Exception:
                print("\n\n#######\n", "SAIU", "\n\n#########\n")
                break
            i = i + 1
        resp = TextResponse(url=self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8')
        links = links + resp.css('.feed-post-link::attr(href)').extract()
        tam = len(links)
        print("\n\n************** ", tam, " **************\n\n")
        if tam >= 1000:
            break
    c = 1
    setLinks = set(links)
    n = len(setLinks)
    for link in setLinks:
        c += 1
        print("**********************", c / (n * 1.0),
              "**********************", "\n")
        self.driver.get(link)
        resp = TextResponse(url=self.driver.current_url,
                            body=self.driver.page_source,
                            encoding='utf-8')
        noticia = {
            'titulo': resp.css('.content-head__title::text').extract_first(),
            'subtitulo': resp.css('.content-head__subtitle::text').extract_first(),
            'texto': "".join(resp.css('.content-text__container::text').extract()),
        }
        if noticia['titulo'] is not None and noticia['subtitulo'] is not None:
            self.news.append(noticia)
            yield noticia
    self.driver.close()
def quote_scraper(url):
    page = requests.get(url)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    author = response.css("small.author::text").extract()
    quotes = response.css("div.quote > span.text::text").extract()
    tags = [i.css("a.tag::text").extract() for i in response.css("div.tags")]
    base_url = "http://quotes.toscrape.com/"
    rel_hyperlinks = response.css("small.author ~ a::attr(href)").extract()
    hyperlink = [base_url + i for i in rel_hyperlinks]
    return pd.DataFrame({
        "quotes": quotes,
        "author": author,
        "tags": tags,
        "author_page": hyperlink
    })
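# Usage sketch (not part of the original): quote_scraper above returns a DataFrame,
# so a single listing page of the quotes.toscrape.com demo site can be scraped like this.
if __name__ == "__main__":
    quotes_df = quote_scraper("http://quotes.toscrape.com/page/1/")
    print(quotes_df.head())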
def parse(self, response):
    self.driver.get(response.url)
    website = self.website
    while True:
        if self.page_number > 10000:
            break
        try:
            prefix_size = 0
            next = self.driver.find_element_by_id(website.next_button_id)
            next.click()
            time.sleep(5)
            sel_response = TextResponse(
                url=self.driver.current_url,
                body=self.driver.page_source,
                encoding='utf-8')
            if prefix_size == 0:
                suffix_size = get_suffix_size(
                    response.css('a::attr(href)').re(website.url_patterns),
                    sel_response.css('a::attr(href)').re(website.url_patterns))
            # extract all links from the current page that match the pattern;
            # remove common links found previously
            links = sel_response.css('a::attr(href)').re(
                website.url_patterns)[prefix_size:-suffix_size]
            prefix_size += len(links)
            links = set(links)
            if not links:
                self.logger.info('No link found. Stopping scraper.')
                return
            self.page_number += 1
            # if the website uses relative urls, prepend all links with the domain name
            if website.relative_url:
                links = (response.urljoin(link) for link in links if link)
            # only keep unvisited links
            links = (link for link in links
                     if not Article.objects.filter(url=link))
            for link in links:
                yield response.follow(
                    link,
                    callback=parse_article,
                    meta={'spider': self, 'website': website},
                    priority=1)
        except Exception:
            break
    self.driver.close()
    self.driver.quit()
def booksto_scrape(url, base_url="http://books.toscrape.com/"):
    page = requests.get(url)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    book_title = response.css("h3 > a::attr(title)").extract()
    book_rating = response.css("p[class^='star-rating']::attr(class)").extract()
    instock = response.css("p.price_color ~ p.instock::attr(class)").extract()
    instock_or_not = [i.replace("availability", " ") for i in instock]
    p_price = response.css("p.price_color::text").extract()
    price = [i.replace("Â", "") for i in p_price]
    book_page_URL = response.css("h3 > a::attr(href)").extract()
    book_picture_URL = response.css("img::attr(src)").extract()
    base_url = "http://books.toscrape.com/catalogue/"
    book_page_URL2 = [base_url + i for i in book_page_URL]
    book_picture_URL2 = [base_url + i for i in book_picture_URL]
    rating = []
    for i in book_rating:
        rating.append(i.replace("star-rating", ""))
    book_genre = []
    book_description = []
    for i in book_page_URL2:
        page = requests.get(i)
        response = TextResponse(body=page.text, url=i, encoding="utf-8")
        book_genre.append(response.css("li~li~li > a::text")[0].extract())
        book_description.append(
            response.css("article[class='product_page'] > p::text").extract_first())
    return pd.DataFrame({
        "price": price,
        "book_title": book_title,
        "rating": rating,
        "instock_or_not": instock_or_not,
        "book_genre": book_genre,
        "book_description": book_description,
        "book_page_URL2": book_page_URL2,
        "book_picture_URL2": book_picture_URL2
    })
def parse_unit(self, response: TextResponse):
    resp = TextResponse(
        url=response.url,
        request=response.url,
        body=json.loads(response.body.decode('utf-8'))['goods']['html'],
        encoding='utf-8')
    base_url = response.request.headers['user_base_url']
    units = []
    for index, item in enumerate(
            resp.css('table#search_results tr.highlight') +
            resp.css('table#search_results tr.editors_choise')):
        url = self.strip(item.css('a.t::attr(href)').extract_first())
        title = self.strip(item.css('a.t span::text').extract_first())
        units.append({'base_url': base_url, 'url': url, 'title': title})
    return {'data': units}
def movie_scraper(url, base_url="https://www.imdb.com/chart/moviemeter/"):
    page = requests.get(url)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    title = response.css("td.titleColumn > a ::text").extract()
    release_year = response.css("td > span.secondaryInfo::text").extract()
    year = [i.replace("(", "").replace(")", "") for i in release_year]
    ratings_table = response.css("td.ratingColumn.imdbRating")
    rating = [i.css('strong::text').extract_first() for i in ratings_table]
    movurl = response.css("td.titleColumn ::attr(href)").extract()
    hyperlink = 'https://www.imdb.com/'
    movie_hyperlink = [hyperlink + i for i in movurl]
    rank_div = response.css("div.velocity")
    rank_ext = [i.css("::text").extract_first() for i in rank_div]
    ranks = [i.replace('\n', "").replace("(no change)", "") for i in rank_ext]
    return pd.DataFrame({
        "Title": title,
        "Year": year,
        "Ratings": rating,
        "Movie_Hyperlink": movie_hyperlink,
        'Rank': ranks,
    })
class Movies:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def get_movies(self):
        titles = self.response.css("td.titleColumn > a::text").extract()
        year = self.response.css("td.titleColumn > span.secondaryInfo::text").extract()
        ranking = [i.css("div.velocity::text").extract()
                   for i in self.response.css("td.titleColumn")]
        rating_td = self.response.css('td[class="ratingColumn imdbRating"]')
        rating = [i.css('strong::text').extract() for i in rating_td]
        rel_hyperlinks = self.response.css("td.titleColumn > a::attr(href)").extract()
        hyperlink_movie = [base_url + i for i in rel_hyperlinks]
        return pd.DataFrame({
            "Titles": titles,
            "Year": year,
            "Hyperlink_movie": hyperlink_movie,
            "Ranking": ranking,
            "Rating": rating
        })
class Jobs:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def get_vacancy(self):
        vac = self.response.xpath(
            '//div[@class="job-inner job-item-title"]/p[@class="font_bold"]/text()').extract()
        return vac

    def get_company(self):
        comp = self.response.xpath(
            '//div[@class="job-inner job-item-title"]/p[@class="job_list_company_title"]/text()').extract()
        return comp

    def get_deadline(self):
        dl1 = self.response.css(
            'div[class="job-inner job-list-deadline"] p::text').extract()
        dl2 = [''.join(x) for x in zip(dl1[0::2], dl1[1::2])]
        del dl2[1::2]
        dl = [i.replace("\n\n", "").replace("\n", " ").strip() for i in dl2]
        return dl

    def get_location(self):
        loc = self.response.xpath(
            '//div[@class="job-inner job-list-deadline"]/p[@class="job_location"]/text()').extract()
        loc = [i.replace('\n', '').strip() for i in loc]
        return loc

    def get_ind_page(self):
        ind_page = [base_url + i for i in self.response.xpath(
            '//div[@class="list-view"]/div/div/a/@href').extract()]
        return ind_page

    def get_next(self):
        page = self.response.xpath(
            '//ul[@class="pagination"]/li[@class="next"]/a/@href').extract()
        return page
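# Hedged usage sketch (not from the original source): paginate with the Jobs class above
# by following get_next() until no "next" link remains. It assumes the module-level
# `base_url` the class already relies on (e.g. "https://staff.am") and a hypothetical
# start path; column lengths are assumed to match on each listing page.
def crawl_all_jobs(start_path="/jobs"):
    frames = []
    path = start_path
    while path:
        jobs = Jobs(base_url + path)
        frames.append(pd.DataFrame({
            "Vacancy": jobs.get_vacancy(),
            "Company": jobs.get_company(),
            "Deadline": jobs.get_deadline(),
            "Location": jobs.get_location(),
        }))
        next_pages = jobs.get_next()  # list of hrefs; empty on the last page
        path = next_pages[0] if next_pages else None
    return pd.concat(frames, ignore_index=True)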
def parse(self, response: TextResponse) -> [Request, YelpService]:
    """
    This is the default callback used by Scrapy to process downloaded responses,
    when their requests don't specify a callback. The parse method is in charge of
    processing the response and returning scraped data and/or more URLs to follow.

    Args:
        :param response: the response to parse
    """
    # Checks if we are on the search result page
    if response.url.startswith("https://www.yelp.com/search?"):
        info_page_urls = response.css(".biz-name::attr(href)")
        # Checks if we have some result
        if info_page_urls is not None:
            for url in info_page_urls[:self.max_results]:
                # Joins the url found with the domain url and returns a new Request for it,
                # which is going to be parsed by this same method.
                info_page = response.urljoin(url.extract())
                yield Request(info_page)
    # We are on the info page, therefore we can already extract the information
    else:
        yield self._map_response(response)
def parse(self, response):
    self.driver.get(response.url)
    selector = TextResponse(url=response.url,
                            body=self.driver.page_source,
                            encoding='utf-8')
    time.sleep(4)
    items = selector.css(
        '#education-EeiiqoQoZTGTLg2SBVVUiQ span::text').extract()
    time.sleep(2)
    print(items)
    '''
    driver.get("https://resumes.indeed.com/resume/91d7f36dfe695dee?s=l%3Dnoida%26q%3Dpython%2520developer%26searchFields%3D")
    html = driver.page_source
    title = driver.find_element_by_class_name('locality').text

    start_urls = [
        'https://resumes.indeed.com/resume/91d7f36dfe695dee?s=l%3Dnoida%26q%3Dpython%2520developer%26searchFields%3D'
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        print("response ===============================>", response.status, "<=========================")
        validating = response.xpath('//a')
        print(response.text)
    '''
def parse(self, response: TextResponse):
    input_hidden = response.css('input[type="hidden"]')
    form_data = {
        'command': 'get_goods',
        'page': 'all',
        'store': 'msk-0_1721_1',
        'thumbnail_view': '2',
        'sch_good_id': '',
        'sch_id': '',
        'c_id': input_hidden.css(
            'input[name="c_id"]::attr(value)').extract_first(),
        'fn': input_hidden.css(
            'input[name="fn"]::attr(value)').extract_first(),
        'g_id': input_hidden.css(
            'input[name="g_id"]::attr(value)').extract_first(),
        'def_sort': input_hidden.css(
            'input[name="def_sort"]::attr(value)').extract_first(),
        'sys_all': input_hidden.css(
            'input[name="sys_all"]::attr(value)').extract_first()
    }
    return FormRequest(self.api_url,
                       headers={'user_base_url': response.url},
                       formdata=form_data,
                       callback=self.parse_unit)
class Movies:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def scrape_movies(self):
        "Scrapes the movies, ratings, ranks and the hyperlink"
        title = self.response.css("td.titleColumn a::text").extract()
        year = [str(i).strip('()') for i in
                self.response.css(".secondaryInfo:nth-child(2)::text").extract()]
        rank = list(range(1, 101))
        rating = []
        for i in self.response.css(".imdbRating"):
            rating.append(str(i.css("strong::text").extract()).strip('[]'))
        hyperlink = [base_url + i for i in
                     self.response.css("td.titleColumn a::attr(href)").extract()]
        return title, year, rank, rating, hyperlink
class Quotes:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def get_quotes(self):
        quotes = self.response.css("span.text::text").extract()
        authors = self.response.css("small.author::text").extract()
        tags = [i.css("a.tag::text").extract() for i in self.response.css("div.tags")]
        hyperlinks = [base_url + i for i in
                      self.response.css("small.author ~ a::attr(href)").extract()]
        return quotes, authors, tags, hyperlinks

    def get_next(self):
        next_url = self.response.css("li.next a::attr(href)").extract()
        return next_url
class Book:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def get_next(self):
        "Returns the link of the following page, if it exists."
        next_url = self.response.css("li.next a::attr(href)").extract()
        return next_url

    def get_title(self):
        title = self.response.css(
            'article[class="product_pod"] h3 a::attr(title)').extract()
        return title

    def get_rating(self):
        rating = self.response.css(
            'p[class*="star-rating"]::attr(class)').extract()
        rating = [i.replace('star-rating', '').strip() for i in rating]
        return rating

    def get_price(self):
        price = self.response.css(".price_color::text").extract()
        price = [i.replace('£', '') for i in price]
        return price

    def get_book_url(self):
        book_url = [
            temp_url + i for i in self.response.css(
                'article[class="product_pod"] h3 a::attr(href)').extract()
        ]
        return book_url

    def get_img_url(self):
        img_url = [
            temp_url + i
            for i in self.response.css('.thumbnail::attr(src)').extract()
        ]
        return img_url

    def get_inStock(self):
        in_stock = self.response.css('.instock::text').extract()
        in_stock = [i.replace('\n', '').strip() for i in in_stock]
        return in_stock[1::2]

    def get_genre(self):
        book_genre = self.response.css(
            '.breadcrumb li:nth-child(3) a::text').extract()
        return book_genre

    def get_desc(self):
        book_desc = self.response.xpath('//article/p/text()').extract()
        return book_desc
def get_chromedriver_version(chrome_version: str):
    url = 'https://chromedriver.storage.googleapis.com/'
    response = requests.get(url, headers=headers, params=params)
    response = TextResponse(body=response.content, url=url)
    for item in response.css('Prefix').extract():
        if chrome_version[0:7] in item:
            return re.compile(r'(\d.*\d)').search(item)
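# Usage sketch (assumption, not from the original): the function returns a re.Match
# (or None), so the matching driver version is in group(1). The module-level
# `headers`/`params` are assumed to be defined elsewhere, as in the snippet above.
match = get_chromedriver_version("114.0.5735")
if match:
    print("matching chromedriver:", match.group(1))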
class Book:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def scrape_book(self):
        "Scrapes the book with all its info."
        title = self.response.css(
            'article[class="product_pod"] h3 a::attr(title)').extract()
        rating = self.response.css(
            'p[class*="star-rating"]::attr(class)').extract()
        rating = [i.replace('star-rating', '').strip() for i in rating]
        price = self.response.css(".price_color::text").extract()
        price = [i.replace('£', '') for i in price]
        book_url = [
            URL + i for i in self.response.css(
                'article[class="product_pod"] h3 a::attr(href)').extract()
        ]
        img_url = [
            URL + i for i in self.response.css('img::attr(src)').extract()
        ]
        in_stock = self.response.css('.instock::text').extract()
        in_stock = [i.replace('\n', '').strip() for i in in_stock]
        return title, rating, price, book_url, img_url, in_stock[1::2]

    def get_next(self):
        next_url = self.response.css("li.next a::attr(href)").extract()
        return next_url
def parse_node(self, response, node):
    item = NewsItem()
    item["title"] = node.xpath("title/text()").get()
    item["link"] = node.xpath("link/text()").get()
    item["date"] = datetime.datetime.strptime(
        node.xpath("pubDate/text()").get(), '%a, %d %b %Y %X +%f')
    item["category"] = node.xpath("category/text()").get()
    item["author"] = node.xpath("dc/author/text()").get()
    description = node.xpath("description/text()").get()
    description = TextResponse(response.url, body=description, encoding='utf-8')
    item["image"] = description.css("img ::attr('src')").get()
    item["content"] = get_description(
        description.css(":not(script)::text").getall())
    item["resume"] = item["content"]
    yield Request(item["link"], self.parse_item, meta={"item": item})
def __init__(self, response: TextResponse):
    self.response = response

    # parse the class table
    self.content_all = [tr.css('td') for tr in response.css('tr')]
    self.content = filter(lambda x: len(x) == 2, self.content_all)  # non-fields have only 1 td tag
    self.fields = {field_name.css('::text').get(): field_value
                   for field_name, field_value in self.content}

    # main fields
    self.class_id = [p for p in response.url.split('/') if len(p) > 0][-1]
    self.department_code = [p for p in response.url.split('/') if len(p) > 0][-2]
    self.course_code = self.department_code + " " + self._get_field("Number")

    # parse all fields
    self.instructor = self._get_field('Instructor', first_line_only=True)
    if self.instructor:
        self.instructor = re.sub(r'[\s-]+$', '', self.instructor)  # clean up
    self.course_title = self._get_course_title()
    self.course_subtitle = self._get_course_subtitle()

    # schedule
    self.scheduled_days = None
    self.scheduled_time_start = None
    self.scheduled_time_end = None
    self.location = None
    date_and_location = self._get_field('Day & Time')
    if date_and_location:
        tmp = date_and_location.split("\n")  # e.g.: TR 4:10pm-5:25pm\n825 Seeley W. Mudd Building
        date_and_time = tmp[0].split()  # e.g.: TR 4:10pm-5:25pm
        self.scheduled_days = date_and_time[0]  # e.g. TR
        t = date_and_time[1]  # e.g. 4:10pm-5:25pm
        self.scheduled_time_start, self.scheduled_time_end = t.split('-')
        self.location = None
        if len(tmp) > 1:
            self.location = tmp[1]

    self.section_key = self._get_field("Section key")
    self.open_to = self._get_field("Open To")
    if self.open_to:
        self.open_to = [s.strip() for s in self.open_to.split(',')]
    self.course_descr = self._get_field("Course Description")
    self.prerequisites = []
    if self.course_descr:
        self.prerequisites = ColumbiaClassListing.get_prerequisites(self.course_descr)
    self.points = self._get_field("Points")
    self.class_type = self._get_field("Type")
    self.method_of_instruction = self._get_field("Method of Instruction")
    self.department = self._get_field("Department")
    self.call_number = self._get_field("Call Number")
    self.campus = self._get_field("Campus")
    enrollment_string = self._get_field('Enrollment')
    self.enrollment_date, self.enrollment_current, self.enrollment_max \
        = self._get_enrollment(enrollment_string)
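# Hedged sketch (assumption, not the original implementation): a _get_field helper
# consistent with how self.fields is built above -- look up the value cell by its
# label text and return the cell's text, optionally only its first text node.
def _get_field(self, name, first_line_only=False):
    value_cell = self.fields.get(name)
    if value_cell is None:
        return None
    if first_line_only:
        return value_cell.css('::text').get()
    return ' '.join(t.strip() for t in value_cell.css('::text').getall()).strip()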
def get_proxies():
    r = requests.get("https://github.com/fate0/proxylist/blob/master/proxy.list")
    response = TextResponse(r.url, body=r.text, encoding='utf-8')
    res = []
    for i in response.css("td::text").getall():
        ii = json.loads(i)
        country = ii["country"] or ""
        if country == "US" and ii["response_time"] < 5:
            res.append(f"{ii['host']}:{ii['port']}")
    return res
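# Usage sketch (assumption): feed one of the returned "host:port" strings to requests
# as an HTTP/HTTPS proxy; httpbin.org/ip simply echoes the caller's address for a quick check.
proxies = get_proxies()
if proxies:
    proxy = {"http": f"http://{proxies[0]}", "https": f"http://{proxies[0]}"}
    print(requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10).json())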
def parse(self, response):
    self.driver.get(
        'http://www.hannovermesse.de/en/exhibition/exhibitors-products/advanced-search/'
    )
    wait = WebDriverWait(self.driver, 5)
    wait.until(
        EC.element_to_be_clickable((
            By.XPATH,
            '//*[@id="searchAP:search"]/section/div[6]/div/div/div[2]/div[2]'
        )))
    showallbutton = WebDriverWait(self.driver, 10).until(
        EC.element_to_be_clickable((
            By.XPATH,
            '//*[@id="searchAP:search"]/section/div[6]/div/div/div[2]/div[2]/a'
        )))
    showallbutton.click()
    self.driver.execute_script(
        "document.getElementById('searchAP:zb:442:r').click()")
    self.driver.find_element_by_xpath(
        '//*[@id="searchAP:searchButton2"]').click()

    # searchAP:zb:1:r    Agriculture, forestry and fishing
    # searchAP:zb:18:r   Mining and extracting rocks and earth
    # searchAP:zb:35:r   Manufacturing industry
    # searchAP:zb:220:r  Energy supply
    # searchAP:zb:229:r  Water supply, sewage and refuse disposal, sanitation and similar activities
    # searchAP:zb:239:r  Construction/construction industry
    # searchAP:zb:251:r  Sale, maintenance and repair of motor vehicles
    # searchAP:zb:291:r  Transportation and storage
    # searchAP:zb:312:r  Hotels and restaurants/lodging and catering
    # searchAP:zb:320:r  Information and communication
    # searchAP:zb:351:r  Provision of financial and insurance services
    # searchAP:zb:357:r  Real estate activities
    # searchAP:zb:361:r  Provision of freelance, scientific and technical services
    # searchAP:zb:379:r  Provision of other business activities
    # searchAP:zb:394:r  Public administration, defence and social security
    # searchAP:zb:404:r  Education
    # searchAP:zb:412:r  Human health and social work activities
    # searchAP:zb:425:r  Arts, entertainment and recreation
    # searchAP:zb:430:r  Recreational, cultural and sporting activities; other
    # searchAP:zb:439:r  Households
    # searchAP:zb:440:r  Extra-territorial organisations and bodies
    # searchAP:zb:441:r  All sectors, sector independent
    # searchAP:zb:442:r  Pupils, students

    # Now that the webpage is fully revealed, Scrapy can collect all the company URLs,
    # i.e. we need to follow the link for every company to get onto its page and extract our data.
    response1 = TextResponse(url=response.url,
                             body=self.driver.page_source,
                             encoding='utf-8')
    for href in response1.css('.search-link ::attr(href)'):
        url = response1.urljoin(href.extract())
        yield scrapy.Request(url, callback=self.parse_dir_contents)
def getimgdec(movies):
    try:
        r = requests.get('http://www.imdb.com/find?ref_=nv_sr_fn&q=' + movies)
        response = TextResponse(r.url, body=r.text, encoding='utf-8')
        spcial = response.css(
            '.findList .findResult .result_text a::attr(href)').extract_first()
        spcial = 'http://www.imdb.com/' + spcial
    except AttributeError:
        spcial = 'http://www.imdb.com/'
    return MoviesNameUrl[movies][1], MoviesNameUrl[movies][0], spcial
class Quotes:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def get_quotes(self):
        return self.response.css("span.text::text").extract()

    def get_authors(self):
        return self.response.css("small.author::text").extract()

    def get_tags(self):
        "Gets the tags, all in one list."
        return self.response.css("div.tags > a.tag::text").extract()

    def get_author_link(self):
        return self.response.css("small.author ~ a::attr(href)").extract()
class Pages:
    def __init__(self, URL):
        self.URL = URL
        self.page = requests.get(self.URL)
        self.response = TextResponse(body=self.page.text, url=self.URL, encoding="utf-8")

    def url_scraper(self):
        """Gets the urls of all top restaurants in Yerevan."""
        url = [base_url + i for i in self.response.css(
            'div.wQjYiB7z>span>a._15_ydu6b::attr(href)').extract()]
        return url

    def get_next(self):
        """If a NEXT button exists, gets the next page's URL."""
        next_url = self.response.css(
            "div[class='unified pagination js_pageLinks'] a[class='nav next rndBtn ui_button primary taLnk']::attr(href)").extract()
        return next_url
def parse(self, response: TextResponse):
    categories = []
    base_url = response.url
    for index, item in enumerate(response.css('ul.nix-menu').xpath('li')):
        url = item.css('a::attr(href)').extract_first()
        title = item.css('a::text').extract_first()
        categories.append({
            'base_url': base_url,
            'url': url,
            'title': title
        })
    return {'data': categories}
def books_scraper(url, base_url="http://books.toscrape.com/"):
    page = requests.get(url)
    response = TextResponse(body=page.text, url=url, encoding="utf-8")
    prices = response.css("p.price_color::text").extract()
    price = [float(i.replace("£", "")) for i in prices]
    book_url = response.css("h3 > a::attr(href)").extract()
    pic_url = response.css("img::attr(src)").extract()
    star = response.css("p[class^='star-rating']::attr(class)").extract()
    star_rating = []
    for i in star:
        star_rating.append(i.replace("star-rating", ""))
    base_url = "http://books.toscrape.com/catalogue/"
    bookurl = [base_url + i for i in book_url]
    picurl = [base_url + i for i in pic_url]
    genere = []
    desc = []
    for i in bookurl:
        page = requests.get(i)
        response = TextResponse(body=page.text, url=i, encoding="utf-8")
        genere.append(response.css("li~li~li > a::text")[0].extract())
        desc.append(
            response.css(
                "article[class='product_page'] > p::text").extract_first())
    return pd.DataFrame({
        "Price": price,
        "BooksHyperlinks": bookurl,
        "PicsHyperlinks": picurl,
        "Star_Ratings": star_rating,
        'Generes': genere,
        "AboutBook": desc
    })
def parse_node(self, response, node):
    item = NewsItem()
    item['title'] = node.xpath('title/text()').get()
    item['link'] = node.xpath('link/text()').get()
    description = node.xpath('description/text()').get()
    description = TextResponse(response.url, body=description, encoding='utf-8')
    item['image'] = description.css("img ::attr('src')").get()
    item['resume'] = description.css(".K2FeedIntroText strong::text").get()
    item['content'] = description.css(".K2FeedFullText ::text").getall()
    item['category'] = node.xpath('category/text()').get()
    item['author'] = node.xpath('author/text()').get()
    item['date'] = datetime.datetime.strptime(
        node.xpath("pubDate/text()").get(), '%a, %d %b %Y %X +%f')
    if not item["content"]:
        item["content"] = description.css("::text").get()
    if item["resume"] is None:
        item["resume"] = description.css("::text").get()
    if '.jpg' not in item["image"]:
        return
    yield item
def _extract_service_phone(self, response: TextResponse) -> str:
    """
    Extracts the service phone from the response if it can be found,
    otherwise returns an empty string.

    Args:
        :param response: the response received from a `Request` object

    :return: the service phone if it can be found, otherwise an empty string
    """
    phone = response.css(".biz-phone::text").extract_first()
    if not phone:
        self.log("Cannot find the phone of the service: " + response.url,
                 logging.ERROR)
        return ""
    else:
        return phone.strip()
def _extract_service_address(self, response: TextResponse) -> str:
    """
    Extracts the service address from the response if it can be found,
    otherwise returns an empty string.

    Args:
        :param response: the response received from a `Request` object

    :return: the service address if it can be found, otherwise an empty string
    """
    # The address information is formatted using "<br>" tags, so we need to extract all
    # items within the "<address>" tag and merge them at the end, separated by commas.
    address = response.css(".street-address address::text").extract()
    if not address:
        self.log("Cannot find the address of the service: " + response.url,
                 logging.ERROR)
        return ""
    else:
        return ', '.join(address).strip()