def parse_category(self, response):
    """Parse one category listing page and schedule item + next-page Requests.

    Expects ``response.meta`` to carry ``category``, ``page_idx`` and
    ``category_url_fmt`` (a format string taking the page index).
    """
    meta = dict(response.meta)

    # Navigate to item: collect (brand, model, url) for each product card.
    items = []
    for li in response.css("li.pl18-item-li"):
        # extract_first() returns None on malformed cards; fall back to ""
        # instead of crashing with AttributeError / TypeError on concat.
        brand = (li.css(".pl18-item-brand ::text").extract_first() or "").strip()
        names = [name.strip()
                 for name in li.css(".pl18-item-name ::text").extract()]
        model = " ".join(names).strip()
        href = li.css(".pl18-item-name a::attr(href)").extract_first() or ""
        items.append(dict(brand=brand, model=model, url=self.base_url + href))

    self.logger.info("Parse url {}, Num item urls : {}".format(
        response.url, len(items)))

    for item in items:
        item_url = item.get("url")
        if utils.is_valid_url(item_url):
            item.update({"category": meta["category"]})
            yield Request(item_url, self.parse_item, meta=item,
                          errback=self.errback)

    # Navigate to next page while under the per-category page limit and
    # the current page still produced items.
    if meta["page_idx"] < self.page_per_category_limit and len(items) > 0:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(meta["page_idx"])
        yield Request(next_page, self.parse_category, meta=meta,
                      errback=self.errback)
def parse_category(self, response):
    """Yield a Request per item link on this category page, then the next page."""
    meta = dict(response.meta)

    item_urls = response.css(".product-box-list>div>a::attr(href)").extract()
    self.logger.info("Parse url {}, Num item urls : {}".format(
        response.url, len(item_urls)))

    for link in item_urls:
        if not utils.is_valid_url(link):
            continue
        yield Request(link, self.parse_item, meta=meta,
                      errback=self.errback, headers=self.headers)

    # Keep paging while the limit allows and this page was non-empty.
    under_limit = meta["page_idx"] < self.page_per_category_limit
    if under_limit and item_urls:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(meta["page_idx"])
        yield Request(next_page, self.parse_category, meta=meta,
                      errback=self.errback, headers=self.headers)
def crawl_reviews(self, url):
    """Fetch a Lazada review-list JSON endpoint and normalise its payload.

    Args:
        url: review API url, e.g.
            "https://my.lazada.vn/pdp/review/getReviewList?itemId=...".

    Returns:
        Tuple ``(ratings, reviews)`` where ``ratings`` maps star value
        (5 down to 1) to its count, and ``reviews`` is a list of dicts
        with ``rating``, ``review_time``, ``bought_time`` and ``comment``
        keys. Both are empty when ``url`` is not valid.
    """
    ratings, reviews = {}, []
    if utils.is_valid_url(url):
        json_data = json.loads(
            self.get_response(url).content.decode("utf-8"))
        # "scores" lists counts ordered 5-star first; re-key by star value.
        scores = json_data["model"]["ratings"]["scores"] or []
        ratings = {5 - i: rating for i, rating in enumerate(scores)}
        for full_review in json_data["model"]["items"] or []:
            review_time = utils.convert_unix_time(
                full_review["zonedReviewTime"])
            bought_time = utils.convert_unix_time(
                full_review["zonedBoughtDate"])
            # Title/content may be absent or explicitly null in the JSON.
            review_title = full_review.get("reviewTitle", "") or ""
            review_content = full_review.get("reviewContent", "") or ""
            reviews.append(dict(rating=full_review["rating"],
                                review_time=review_time,
                                comment=review_title + " " + review_content,
                                bought_time=bought_time))
    return ratings, reviews
def parse_category(self, response):
    """Parse a Lazada category page whose item list is embedded in a script.

    The product list lives in a tag of the form
    ``<script>window.pageData={...}</script>``; this extracts the JSON,
    yields one Request per item, then schedules the next page.
    """
    meta = dict(response.meta)

    # Find item data in script tag.
    prefix = "<script>window.pageData="
    postfix = "</script>"
    data = None
    for script in response.css("script").extract():
        if script.startswith(prefix):
            # Strip the wrapper only when a match was actually found; the
            # old code sliced the "{}" fallback too, producing an empty
            # string that made json.loads() raise.
            data = script[len(prefix):-len(postfix)]
            break
    if data is None:
        self.logger.error(
            "window.pageData not found in {}".format(response.url))
        return

    items = json.loads(data)["mods"]["listItems"]
    self.logger.info("Parse url {}, Num item urls : {}".format(
        response.url, len(items)))

    for item in items:
        # productUrl is protocol-relative ("//..."), so prepend the scheme.
        item_url = "https:" + item["productUrl"]
        item_data = dict(product_id=item["itemId"],
                         model=item["name"],
                         price=item["priceShow"],
                         description=item["description"],
                         num_reviews=item["review"],
                         brand=item["brandName"],
                         seller=item["sellerName"],
                         category=meta["category"])
        if utils.is_valid_url(item_url):
            yield Request(item_url, self.parse_item,
                          meta=dict(item=item_data), errback=self.errback)
        else:
            self.logger.error("Invalid item url : {}".format(item_url))

    # Navigate to next page.
    if meta["page_idx"] < self.page_per_category_limit and len(items) > 0:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(meta["page_idx"])
        yield Request(next_page, self.parse_category, meta=meta,
                      errback=self.errback)
def parse_category(self, response):
    """Follow every product thumbnail on an Adayroi category page."""
    meta = dict(response.meta)

    # Navigate to item.
    item_urls = response.css(
        "div.product-item a.product-item__thumbnail::attr(href)").extract()
    if not item_urls:
        # Keep a copy of unexpectedly empty pages for offline debugging.
        utils.save_str(response.text, "./Temp/adayroi_empty_category.html")
    self.logger.info("Parse url {}, Num item urls : {}".format(
        response.url, len(item_urls)))

    for href in item_urls:
        full_url = self.base_url + href
        if utils.is_valid_url(full_url):
            yield Request(full_url, self.parse_item, meta=meta,
                          errback=self.errback)

    # Navigate to next page.
    # NOTE(review): unlike the sibling spiders this compares page_idx + 1
    # against the limit — presumably zero-based paging; confirm before
    # unifying.
    if item_urls and (meta["page_idx"] + 1) < self.page_per_category_limit:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(meta["page_idx"])
        yield Request(next_page, self.parse_category, meta=meta,
                      errback=self.errback)
def parse_category_from_id(self, category_id, num_items):
    """Fetch a Sendo category's items via the wap_v2 API and follow each one.

    Args:
        category_id: numeric Sendo category id.
        num_items: page size requested from the API (the ``s`` parameter).

    Yields:
        Requests to each item's wap_v2 detail endpoint, with the item
        metadata attached as ``meta``.
    """
    # Get all item.
    print("\nCategory id : {}, Number items : {}".format(
        category_id, num_items))
    item_urls_fmt = "https://www.sendo.vn/m/wap_v2/category/product?" \
        "category_id={}&p=1&s={}&sortType=default_listing_desc"
    all_item_url = item_urls_fmt.format(category_id, num_items)
    try:
        json_data = json.loads(
            self.get_response(all_item_url).content.decode("utf-8"))
        full_items = json_data["result"]["data"]
    except Exception:
        # Narrowed from a bare except: that also swallowed
        # KeyboardInterrupt/SystemExit.
        self.logger.error(
            "\nError when all items of cat_id : {}, total_items : {}".
            format(category_id, num_items))
        return

    # Only the first few items of each category are crawled here —
    # presumably a sampling cap; confirm before changing.
    for full_item in full_items[:7]:
        cat_path = full_item["cat_path"]
        item_url_key = cat_path.replace(".html/", "")
        item_url = "https://www.sendo.vn/m/wap_v2/full/san-pham/{}".format(
            item_url_key)
        url = self.base_url + "/" + cat_path
        category = self.map_id_category.get(category_id, {}).get(
            "Category name", "")
        item = dict(category=category, category_id=category_id, url=url,
                    product_id=full_item["product_id"],
                    model=full_item["name"],
                    price=full_item["final_price"],
                    seller=full_item["shop_name"])
        if utils.is_valid_url(item_url):
            yield Request(item_url, self.parse_item, meta=item,
                          errback=self.errback)
def parse_item(self, response):
    """Attach the page url to the item and follow the embedded full-info link.

    The full product info lives on a secondary page whose address is
    embedded in an inline script as ``pageUrl":"//..."``.
    """
    item = response.meta["item"]
    url = response.url
    item.update({"url": url})

    # Extract full info of product: find the pageUrl value in the scripts.
    page_url = ""
    try:
        keyword = "pageUrl"
        for script in response.css("script").extract():
            start_index = script.find(keyword)
            if start_index >= 0:
                # Skip the keyword plus the `":"` separator (3 chars).
                start_index = start_index + len(keyword) + 3
                end_index = script.find('"', start_index)
                page_url = "https:" + script[start_index:end_index]
                break
    except Exception:
        # Narrowed from a bare except that also wrapped the yield — a bare
        # except around a yield can swallow GeneratorExit. Log via the
        # spider logger instead of print.
        self.logger.error("Error when extract info of item {}".format(url))
        return

    # Yield outside the try so scheduling errors are not silently eaten.
    if utils.is_valid_url(page_url):
        yield Request(page_url, self.parse_info, meta=response.meta,
                      errback=self.errback)
def parse_category(self, response):
    """Parse a Sendo category API response (JSON) into item Requests.

    ``response.text`` is the wap_v2 ``category/product`` JSON; each item
    is followed via its wap_v2 ``full/san-pham`` endpoint, then the next
    page of the category is scheduled.
    """
    meta = dict(response.meta)

    try:
        json_data = json.loads(response.text)
        full_items = json_data["result"]["data"]
    except Exception:
        # Narrowed from a bare except: that also swallowed
        # KeyboardInterrupt/SystemExit.
        self.logger.error(
            "\nError when all items of category {}, cat_id : {}".format(
                meta["category"], meta["category_id"]))
        return

    self.logger.info("Parse url {}, Num item urls : {}".format(
        response.url, len(full_items)))

    for full_item in full_items:
        cat_path = full_item["cat_path"]
        item_url_key = cat_path.replace(".html/", "")
        item_url = "https://www.sendo.vn/m/wap_v2/full/san-pham/{}".format(
            item_url_key)
        url = self.base_url + "/" + cat_path
        item = dict(category=meta["category"],
                    category_id=meta["category_id"], url=url,
                    product_id=full_item["product_id"],
                    model=full_item["name"],
                    price=full_item["final_price"],
                    seller=full_item["shop_name"])
        if utils.is_valid_url(item_url):
            yield Request(item_url, self.parse_item, meta=item,
                          errback=self.errback)

    # Navigate to next page.
    if meta["page_idx"] < self.page_per_category_limit and len(
            full_items) > 0:
        meta["page_idx"] += 1
        next_page = meta["category_url_fmt"].format(
            meta["category_id"], meta["page_idx"])
        yield Request(next_page, self.parse_category, meta=meta,
                      errback=self.errback)