async def crawl_pages(self, category):
    cat_id = self.categories.get(category)
    offset = 0
    max_results = 50
    auctions = list()
    # Page through the category JSON endpoint, 50 items per request.
    while True:
        url = self.search_category_url_format.format(
            cat_id=cat_id, skip=offset, max_num_of_results=max_results)
        _, page_content = await self.extract_async(url)
        if page_content is None:
            # Stop paging on a failed request instead of retrying the
            # same offset forever.
            break
        json_obj = json.loads(page_content.decode("utf-8"))
        items = json_obj.get("Items") or []
        auctions.extend(items)
        offset += max_results
        if len(items) < max_results:
            break
    log.debug("Found: %d auctions of category: %s" % (len(auctions), category))
    output_dir = self.output_dir_path_format.format(category=category)
    csv_file_path = os.path.join(
        output_dir, "{category}.csv".format(category=category))
    log.info("Csv output directory path: %s, csv file: %s"
             % (output_dir, csv_file_path))
    Util.create_directory(output_dir)
    csv_manager = CsvManager(csv_file_path, self.fields, "id")
    csv_manager.open_file()
    # Parse all collected auctions, not just the items from the last page.
    tasks = (self.parse_item(category, item) for item in auctions)
    for res in AsyncCrawler.limited_as_completed(tasks, 5):
        extracted_data = await res
        if csv_manager.check_row_exist(extracted_data):
            extracted_data["flag"] = self.flags.get("updated")
        else:
            extracted_data["flag"] = self.flags.get("new")
        csv_manager.update_row(extracted_data)
        auction_output_dir = os.path.join(output_dir, extracted_data.get("id"))
        Util.create_directory(auction_output_dir)
        if extracted_data.get("images") is not None:
            images_urls = extracted_data.get("images").split('|')
            local_img = list()
            for img_url in images_urls:
                local_img_file_path = os.path.join(
                    auction_output_dir,
                    "{img_id}.jpg".format(img_id=self.get_image_id(img_url)))
                # Only download images that are not already on disk.
                if not Util.check_file_exist(local_img_file_path):
                    local_img.append((img_url, local_img_file_path))
            download_tasks = (self.download_file(img_url, img_file_path)
                              for img_url, img_file_path in local_img)
            for r in AsyncCrawler.limited_as_completed(download_tasks):
                await r
    csv_manager.close_file()
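# ---------------------------------------------------------------------------
# All three crawlers in this section bound their concurrency through
# AsyncCrawler.limited_as_completed. The project's own implementation is not
# shown here; the following is a minimal sketch of the well-known
# "limited as_completed" recipe that matches how it is called above (an
# iterable of coroutines plus an optional limit, yielding one awaitable per
# coroutine in completion order). The default limit of 10 is an assumption.
# ---------------------------------------------------------------------------
import asyncio
from itertools import islice


def limited_as_completed(coros, limit=10):
    # Keep at most `limit` coroutines in flight at any time.
    coros = iter(coros)
    futures = [asyncio.ensure_future(c) for c in islice(coros, limit)]

    async def first_to_finish():
        while True:
            await asyncio.sleep(0)  # yield control so scheduled tasks can run
            for f in futures:
                if f.done():
                    futures.remove(f)
                    try:
                        # Refill the window from the (lazy) input generator.
                        futures.append(asyncio.ensure_future(next(coros)))
                    except StopIteration:
                        pass
                    return f.result()

    while futures:
        yield first_to_finish()

# Polling with sleep(0) keeps the recipe short; a Semaphore-based variant
# would avoid the busy loop, but the callers above only rely on the yielded
# awaitables resolving in completion order.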
async def crawl_pages(self, category, max_pages):
    pages = (self.search_category_url_format.format(
        category=category, page_number=page_number)
        for page_number in range(1, max_pages + 1))
    auctions_links = list()
    # Fetch the search result pages concurrently and collect auction links.
    tasks = (self.extract_async(url) for url in pages)
    for page in AsyncCrawler.limited_as_completed(tasks, 5):
        url, page_content = await page
        if url is not None and page_content is not None:
            auctions_links.extend(
                self.parse_search_result_page(page_content))
    if not auctions_links:
        log.warning("No results found for category: %s" % category)
        return
    log.debug("Found: %d auctions in %d pages of category: %s"
              % (len(auctions_links), max_pages, category))
    output_dir = self.output_dir_path_format.format(category=category)
    csv_file_path = os.path.join(
        output_dir, "{category}.csv".format(category=category))
    Util.create_directory(output_dir)
    csv_manager = CsvManager(csv_file_path, self.fields, "id")
    csv_manager.open_file()
    # Auction pages are rendered through Selenium, so they are visited
    # sequentially; driver.get() is a blocking call.
    for auction_url in auctions_links:
        self.driver.get(auction_url)
        extracted_data = self.parse_data(category, auction_url,
                                         self.driver.page_source)
        if csv_manager.check_row_exist(extracted_data):
            log.debug("Row already exists in csv")
            extracted_data["flag"] = self.flags.get("updated")
        else:
            log.debug("Row is new")
            extracted_data["flag"] = self.flags.get("new")
        csv_manager.update_row(extracted_data)
        auction_output_dir = os.path.join(output_dir, extracted_data.get("id"))
        Util.create_directory(auction_output_dir)
        if extracted_data.get("images") is not None:
            images_urls = extracted_data.get("images").split('|')
            local_img = list()
            for img_url in images_urls:
                local_img_file_path = os.path.join(
                    auction_output_dir,
                    "{img_id}.png".format(img_id=self.get_image_id(img_url)))
                # Only download images that are not already on disk.
                if not Util.check_file_exist(local_img_file_path):
                    local_img.append((img_url, local_img_file_path))
            download_tasks = (self.download_file(img_url, img_file_path)
                              for img_url, img_file_path in local_img)
            for r in AsyncCrawler.limited_as_completed(download_tasks):
                await r
    csv_manager.close_file()
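# ---------------------------------------------------------------------------
# The crawlers above call self.extract_async(url) and expect an
# (url, body_bytes_or_None) tuple back, so failed downloads can be skipped.
# The real helper is not part of this section; the sketch below is one
# plausible aiohttp-based implementation of that contract, written as a
# standalone coroutine (in the project it would be a method on the crawler).
# ---------------------------------------------------------------------------
import logging

import aiohttp

log = logging.getLogger(__name__)


async def extract_async(url):
    # Return (url, raw bytes) on success, (url, None) on any failure.
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    log.warning("GET %s returned HTTP %d"
                                % (url, response.status))
                    return url, None
                return url, await response.read()
    except aiohttp.ClientError as exc:
        log.error("Request to %s failed: %s" % (url, exc))
        return url, None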
async def crawl_pages(self, category, max_pages):
    pages = (self.search_link_format.format(
        category=category, page_number=page_number)
        for page_number in range(1, max_pages + 1))
    auctions_links = list()
    # Fetch the search result pages concurrently and collect auction links.
    tasks = (self.extract_async(url) for url in pages)
    for page in AsyncCrawler.limited_as_completed(tasks, 5):
        url, page_content = await page
        if url is not None and page_content is not None:
            auctions_links.extend(
                self.parse_search_result_page(page_content))
    if not auctions_links:
        log.warning("No results found for category: %s" % category)
        return
    log.debug("Found: %d auctions in %d pages of category: %s"
              % (len(auctions_links), max_pages, category))
    output_dir = self.output_dir_path_format.format(category=category)
    csv_file_path = os.path.join(
        output_dir, "{category}.csv".format(category=category))
    Util.create_directory(output_dir)
    csv_manager = CsvManager(csv_file_path, self.fields, "id")
    csv_manager.open_file()
    # Alternative approach kept for reference: fetch the photo page and the
    # auction page in a single task.
    # tasks = (self.extract_multi_async([url.replace("aukcja", "zdjecia"), url])
    #          for url in auctions_links)
    # for pages in AsyncCrawler.limited_as_completed(tasks):
    #     results = await pages
    #     images_url, images_page_content = results[0]
    #     url, page_content = results[1]
    tasks = (self.extract_async(url) for url in auctions_links)
    for page in AsyncCrawler.limited_as_completed(tasks, 5):
        url, page_content = await page
        if url is not None and page_content is not None:
            extracted_data = self.parse_data(category, url, page_content)
            # Full-size images live on a separate "zdjecia" (photos) page.
            images_links = list()
            images_url = url.replace("aukcja", "zdjecia")
            _, images_page_content = await self.extract_async(images_url)
            if images_page_content is not None:
                images_links = self.parse_full_images_page(
                    images_page_content)
            extracted_data["images"] = '|'.join(images_links)
            if csv_manager.check_row_exist(extracted_data):
                if _translate.get("finished") in (
                        extracted_data.get("stop") or "").lower():
                    extracted_data["flag"] = self.flags.get("sold")
                else:
                    extracted_data["flag"] = self.flags.get("updated")
            else:
                extracted_data["flag"] = self.flags.get("new")
            csv_manager.update_row(extracted_data)
            auction_output_dir = os.path.join(output_dir,
                                              extracted_data.get("id"))
            Util.create_directory(auction_output_dir)
            if extracted_data.get("images"):
                images_urls = extracted_data.get("images").split('|')
                local_img = list()
                for img_url in images_urls:
                    local_img_file_path = os.path.join(
                        auction_output_dir,
                        "{img_id}.jpg".format(
                            img_id=self.get_image_id(img_url)))
                    # Only download images that are not already on disk.
                    if not Util.check_file_exist(local_img_file_path):
                        local_img.append((img_url, local_img_file_path))
                download_tasks = (self.download_file(img_url, img_file_path)
                                  for img_url, img_file_path in local_img)
                for r in AsyncCrawler.limited_as_completed(download_tasks):
                    await r
        else:
            log.error("Url or page_content none: %s" % url)
    csv_manager.close_file()
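# ---------------------------------------------------------------------------
# Image downloads in all three crawlers go through
# self.download_file(url, file_path). Its implementation is not shown here;
# this is a minimal sketch under the assumption that aiohttp is used,
# streaming the body to disk in chunks and skipping the write entirely when
# the response status is not 200.
# ---------------------------------------------------------------------------
import logging

import aiohttp

log = logging.getLogger(__name__)


async def download_file(url, file_path, chunk_size=64 * 1024):
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    log.warning("Skipping %s, HTTP %d" % (url, response.status))
                    return
                # Plain blocking file writes are acceptable here because the
                # chunks are small; aiofiles could be swapped in if needed.
                with open(file_path, "wb") as out:
                    async for chunk in response.content.iter_chunked(chunk_size):
                        out.write(chunk)
    except aiohttp.ClientError as exc:
        log.error("Download of %s failed: %s" % (url, exc))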