import requests
from bs4 import BeautifulSoup


def caronaInfo(self):
    """Scrape worldometers.info for worldwide and India COVID-19 figures."""
    resp = requests.get('https://www.worldometers.info/coronavirus/')
    values = {
        'world': 'N/A',
        'india': 'N/A',
        'deathCases': 'N/A',
        'recCases': 'N/A',
        'newCases': 'N/A',
        'totalDeath': 'N/A',
        'totalRec': 'N/A'
    }
    soup = BeautifulSoup(resp.text, "html.parser")
    table = soup.find(id="main_table_countries_today")
    india_row = table.find("a", string="India").parent.parent

    # html.parser keeps the whitespace text nodes between <td> elements, so
    # each column is reached by stepping two siblings at a time.
    world = table.find("td", string="World").parent.td.next_sibling.next_sibling.string
    india = india_row.td.next_sibling.next_sibling.string
    newCases = india_row.td.next_sibling.next_sibling.next_sibling.next_sibling.string
    deathCases = (india_row.td.next_sibling.next_sibling.next_sibling
                  .next_sibling.next_sibling.next_sibling.string)
    recCases = (india_row.td.next_sibling.next_sibling.next_sibling.next_sibling
                .next_sibling.next_sibling.next_sibling.next_sibling
                .next_sibling.next_sibling.string)

    # The "maincounter-wrap" boxes hold total cases, total deaths and total recoveries.
    totalDeath = soup.find_all(id="maincounter-wrap")[1].span.string
    totalRec = soup.find_all(id="maincounter-wrap")[2].span.string

    values.update({
        "world": world,
        "india": india,
        "newCases": newCases,
        "deathCases": deathCases,
        "recCases": recCases,
        "totalDeath": totalDeath,
        "totalRec": totalRec
    })
    return values
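# Hypothetical usage sketch (not part of the original project): caronaInfo
# never touches `self`, so for a quick manual test it can be called with None
# and the returned dict of display strings printed directly.
if __name__ == "__main__":
    stats = caronaInfo(None)
    print("Worldwide cases: {world}".format(**stats))
    print("India - total: {india}, new: {newCases}, deaths: {deathCases}, "
          "recovered: {recCases}".format(**stats))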
# Requires at module level: from scrapy.selector import Selector, plus the
# project's ElementSelectors helper (sketched below).
def parse(self, response):
    """
    Abstract method of the Spider object, overridden in this script - see documentation.

    :param response: response generated by the request callback
    :return:
    """
    # inspect_response(response, self)
    es = ElementSelectors()
    item_batch = response.css(es.sale_items_batch).getall()
    # batch_len = len(item_batch)
    # batch_sel = Selector(text=item_batch[71])
    # print("DEBUG1: {}".format(batch_sel.get()))
    dict_items_num = {}
    dict_items_name = {}  # (unused)
    for idx in range(len(item_batch)):
        dict_item_info = {}
        # Re-wrap the raw HTML of each batch entry so it can be queried on its own.
        item_link = Selector(text=item_batch[idx]).css('body > ' + es.item_link)
        item_link_elem = item_link.get()
        item_link_sel = Selector(text=item_link_elem)
        dict_item_info['item_link'] = item_link.attrib['href']
        dict_item_info['item_old_price'] = item_link_sel.css(es.item_old_cost).get()
        dict_item_info['item_sale_price'] = item_link_sel.css(es.item_current_cost).get()
        dict_items_num[idx] = dict_item_info

    for idx in dict_items_num:
        print("ITEM {}: {} {} {}".format(
            idx,
            dict_items_num[idx]['item_link'],
            dict_items_num[idx]['item_old_price'],
            dict_items_num[idx]['item_sale_price']))
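# ElementSelectors is referenced above but not defined in this snippet. A
# minimal sketch of what it might look like, assuming it simply groups the
# CSS selectors used by parse(); the selector strings below are placeholders,
# not the real ones from the target site.
class ElementSelectors:
    # CSS selector returning one HTML blob per sale item (placeholder value)
    sale_items_batch = 'div.sale-item'
    # Anchor wrapping a single item, queried inside that blob (placeholder value)
    item_link = 'a.item-link'
    # Old (pre-sale) price element inside the anchor (placeholder value)
    item_old_cost = 'span.old-price::text'
    # Current (discounted) price element inside the anchor (placeholder value)
    item_current_cost = 'span.sale-price::text'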
# Requires at module level: re, os, json, scrapy, dateparser and
# scrapy.selector.Selector, plus the shared `visited` dict, BASE_URL and the
# Comment/Post helpers (sketched below).
def parse(self, response):
    # Find the post number (id) in the thread URL
    try:
        match = re.search(r't=(\d*)', response.url)
        post_number = match.group(1)
    except Exception as e:
        print(response.url)
        print(e)
        return

    folder_path = "download/" + str(post_number)

    # Keep track of all the visited pages
    if post_number not in visited:
        visited[post_number] = {
            "comments": [],
            "n_pages": 0
        }

    # Bail out on the board's "topic does not exist" error page
    not_found = Selector(response).xpath(
        "//table[@class='forumline']//td[@class='row1']/table/tr[2]/td/span[@class='gen']").extract()
    if len(not_found) > 0:
        print("ERROR PAGE")
        return

    # Create the folder if it doesn't exist
    try:
        if not os.path.isdir(folder_path):
            os.mkdir(folder_path)
    except Exception as e:
        print(e)

    # Save page N of the current post id
    with open(folder_path + "/" + str(visited[post_number]['n_pages']) + ".html", "wb") as f:
        f.write(response.body)

    # Start scraping the data
    row = Selector(response).xpath("//table[@class='forumline']/tr").extract()
    title = Selector(response).xpath("//a[@class='maintitle']/text()").get()
    next_page = Selector(response).xpath(
        "//span[@class='nav']//a[contains(text(),'Next')]/@href")

    for item in row[2:]:
        author_1 = Selector(text=item).xpath("//td[1]//b/span/text()").get()
        author_2 = Selector(text=item).xpath("//td[1]//b/text()").get()
        author = author_1 if author_1 is not None else author_2
        if author:
            date_text = Selector(text=item).xpath(
                "//td[2]/table//span[@class='postdetails']/text()"
            ).get().replace("Posted: ", "").strip()
            comment_content = Selector(text=item).xpath("//td[2]/table/tr[3]/td").get()
            visited[post_number]['comments'].append(
                Comment(author, dateparser.parse(date_text), comment_content))

    # If the thread has more comment pages, queue them up to be visited
    if len(next_page) > 0:
        visited[post_number]['n_pages'] += 1
        next_page_url = BASE_URL + next_page.get()
        yield scrapy.Request(next_page_url, callback=self.parse)
    # On the last page, store the accumulated data as JSON and delete the
    # comments so they do not clog RAM
    else:
        post = visited[post_number]['comments'][0]
        new_post = Post(post_number, title, post.author, post.content,
                        post.date, visited[post_number]['comments'][1:])
        with open(folder_path + "/items.json", "w") as f:
            f.write(json.dumps(new_post.toJson()))
        del visited[post_number]['comments']
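# Comment and Post are project classes not included in this snippet. A minimal
# sketch of what they might look like, inferred from the call sites above;
# attribute names and argument order are assumptions. toJson returns plain
# dicts so json.dumps can serialize the result.
class Comment:
    def __init__(self, author, date, content):
        self.author = author
        self.date = date          # datetime produced by dateparser.parse
        self.content = content    # raw HTML of the post body

    def toJson(self):
        return {
            "author": self.author,
            "date": self.date.isoformat() if self.date else None,
            "content": self.content,
        }


class Post:
    def __init__(self, post_number, title, author, content, date, comments):
        self.post_number = post_number
        self.title = title
        self.author = author
        self.content = content
        self.date = date
        self.comments = comments  # list of Comment (replies to the opening post)

    def toJson(self):
        return {
            "id": self.post_number,
            "title": self.title,
            "author": self.author,
            "date": self.date.isoformat() if self.date else None,
            "content": self.content,
            "comments": [c.toJson() for c in self.comments],
        }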