import logging
from multiprocessing import Process

# PUBLISHERS_URL, CSVHandler and SeleniumDriver are imported from the
# project's own modules; split_to_sublist is sketched below.


def multi_book_extractor_by_publishers_url(self, worker=2):
    """Create `worker` processes, run _extract_books_by_publishers in each,
    then aggregate the per-worker CSV files into a single output file.

    Parameters
    ----------
    worker : int
        number of worker processes (each one drives its own Selenium instance)
    """
    _publishers = self._scrape_publishers(PUBLISHERS_URL)
    # for test:
    # _split_publishers_list = split_to_sublist(the_list=_publishers[-2:], number_of_sublist=worker)
    _split_publishers_list = split_to_sublist(
        the_list=_publishers, number_of_sublist=worker)

    worker_files = []
    processes = []
    for index, publishers in enumerate(_split_publishers_list):
        # give every worker its own CSV so they never write to the same file
        file_name = "worker-" + str(index)
        worker_files.append(file_name + ".csv")
        process = Process(target=self._extract_books_by_publishers,
                          args=(publishers, file_name))
        process.start()
        processes.append(process)
    # start all workers first, then wait for them; joining inside the start
    # loop would run the workers one at a time and defeat the parallelism
    for process in processes:
        process.join()

    csv_handler = CSVHandler()
    joined_file_path = csv_handler.join_csv_files(worker_files,
                                                  out_put=self.output_file)
    csv_handler.remove_files(worker_files)
    return joined_file_path
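# `split_to_sublist` is used above but not defined in this section. A minimal
# sketch of the behaviour it is assumed to have (distribute a list into
# `number_of_sublist` roughly equal chunks, one per worker); the project's
# real helper may differ:
def split_to_sublist(the_list, number_of_sublist):
    """Distribute `the_list` round-robin into `number_of_sublist` sublists."""
    sublists = [[] for _ in range(number_of_sublist)]
    for index, item in enumerate(the_list):
        sublists[index % number_of_sublist].append(item)
    # drop empty sublists so no worker is started with nothing to do
    return [sublist for sublist in sublists if sublist]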
def _extract_books_by_publishers(self, publishers, file_name):
    """Extract books publisher by publisher and append the result of
    _scrape_books_by_publishers to a CSV file.

    Parameters
    ----------
    publishers : list
        list of publisher urls handled by this worker
    file_name : str
        name of the CSV file this worker writes to
    """
    try:
        driver = SeleniumDriver().chrome_driver(
            without_browser=self.without_browser,
            optimized_mode=self.optimized_mode)
        csv_handler = CSVHandler()
        for publisher in publishers:
            books = self._scrape_books_by_publishers(publisher, driver)
            csv_handler.export_book_to_csv(books=books, file_name=file_name)
        driver.close()
    except Exception as error:
        logging.error(error)
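# A sketch of how the two methods above might be driven. The class that owns
# them is not shown in this section, so `PublisherBookScrapper` is a
# hypothetical stand-in; it is assumed to expose the constructor flags and the
# `output_file` attribute the methods rely on. The __main__ guard matters
# because multiprocessing re-imports the module in each child process on
# spawn-based platforms (Windows, recent macOS).
if __name__ == "__main__":
    scrapper = PublisherBookScrapper(without_browser=True, optimized_mode=True)
    merged_csv = scrapper.multi_book_extractor_by_publishers_url(worker=4)
    logging.info("merged CSV written to %s", merged_csv)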
import json
import logging
import os
import xml.etree.ElementTree as ET
from urllib.parse import unquote

import ijson
import requests
from furl import furl

# Project-local names (WEBSITE, TEMPLATE_URL, JSON_FILE_PATH, Book,
# Pagination, CSVHandler, SeleniumDriver) come from the project's own modules.


class BookScrapper(object):
    def __init__(self, without_browser=True, optimized_mode=True):
        self.without_browser = without_browser
        self.optimized_mode = optimized_mode
        self.csv_handler = CSVHandler()

    def __scrape_genres_urls(self):
        driver = SeleniumDriver().chrome_driver(
            without_browser=self.without_browser,
            optimized_mode=self.optimized_mode)
        driver.get(url=WEBSITE)
        menu_items = driver.find_element_by_class_name("side-nav")
        items = menu_items.find_elements_by_xpath(".//a[@href]")
        _list_genres = []
        for item in items:
            if "genre" in item.get_attribute("href"):
                _list_genres.append(item.get_attribute("href"))
        _list_genres_without_duplication = list(set(_list_genres))
        driver.close()
        return _list_genres_without_duplication

    def __extract_genres_id(self):
        genre_urls = self.__scrape_genres_urls()
        genres = []
        for url in genre_urls:
            unquote_url = furl(unquote(url))
            # path segment 2 holds the genre id
            genres.append(unquote_url.path.segments[2])
        return genres

    def extract_books(self):
        genres_id = self.__extract_genres_id()
        genres_id.sort()
        out_put = ""
        # for test:
        # genres_id = genres_id[-1:]
        for id in genres_id:
            out_put = self.extract_books_api_by_category(id)
        return out_put

    def extract_books_api_by_category(self, category_id):
        category_url = self.__generate_category_url(category_id)
        books = self.__get_books_from_api(category_url)
        return self.csv_handler.export_book_to_csv(books, file_name="navar")

    def __get_books_from_api(self, category_url):
        logging.info("API CALLED")
        self.__remove_json_file()
        pagination = self.__get_next_offset_from_json()
        url = category_url
        logging.info(pagination.offset)
        list_books = []
        while pagination.hasMore:
            response = requests.get(url)
            response.encoding = 'UTF-8'
            logging.info(pagination.offset)
            self.__save_json_to_file(response.json(), JSON_FILE_PATH)
            # the "offset" here is the nextPageLink url of the page just saved
            pagination = self.__get_next_offset_from_json()
            url = pagination.offset
            books = self.__extract_books_from_json()
            list_books.extend(books)
        return list_books

    def __extract_books_from_json(self):
        with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
            logging.info('reading from json file')
            parser = ijson.parse(input_file)
            books = []
            __found_book = False
            for prefix, event, value in parser:
                if prefix == "items.item.title":
                    __found_book = True
                    book = Book()
                    book.title = value
                    book.author = " "
                    book.publisher = " "
                    logging.info(book.title)
                if prefix == "items.item.products.item.price":
                    book.price = value
                if prefix == "items.item.authors.item.firstName":
                    book.author = value
                if prefix == "items.item.authors.item.lastName":
                    book.author = str(book.author) + " " + value
                if prefix == "items.item.audioPublisherTitle":
                    book.publisher = value
                if __found_book:
                    # the book is appended as soon as its title event arrives;
                    # the price/author/publisher events that follow mutate the
                    # same object already held in the list
                    __found_book = False
                    books.append(book)
            return books

    @staticmethod
    def __parse_xml(response):
        try:
            root_namespace = "{http://schemas.datacontract.org/2004/07/System.Web.Http.OData}"
            item_namespace = "{http://schemas.datacontract.org/2004/07/Navaar.Data}"
            root = ET.fromstring(response.data)
            url = root.find(root_namespace + "NextPageLink").text
            print(url)
            items = list(root.find(root_namespace + "Items"))
            for item in items:
                book = Book()
                book.publisher = item.find(
                    item_namespace + "AudioPublisherTitle").text
                book.title = item.find(item_namespace + "Title").text
                products = item.find(item_namespace + 'Products')
                for product in products:
                    book.price = product.find(item_namespace + "Price").text
                authors = item.find(item_namespace + "Authors")
                for author in authors:
                    book.author = (
                        author.find(item_namespace + "FirstName").text
                        + " "
                        + author.find(item_namespace + "LastName").text)
        except Exception as e:
            print(getattr(e, 'message', repr(e)))

    @staticmethod
    def __save_json_to_file(json_object, file_name):
        try:
            with open(file_name, 'w') as json_file:
                json.dump(json_object, json_file)
        except Exception as error:
            logging.error("JSON could not be saved")
            raise error

    @staticmethod
    def __remove_json_file():
        if os.path.exists(JSON_FILE_PATH):
            os.remove(JSON_FILE_PATH)

    @staticmethod
    def __get_next_offset_from_json():
        try:
            with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
                json_object = json.load(input_file)
                pagination = Pagination()
                pagination.offset = json_object["nextPageLink"]
                pagination.hasMore = pagination.offset is not None
                return pagination
        except Exception:
            # no saved page yet: start from the category url itself
            pagination = Pagination()
            pagination.offset = " "
            pagination.hasMore = True
            return pagination

    @staticmethod
    def __generate_category_url(category_id):
        unquote_url = furl(unquote(TEMPLATE_URL))
        # path segment 3 holds the category id
        unquote_url.path.segments[3] = str(category_id)
        return unquote_url.url
import logging

# Project-local names (CATEGORY_URL, PUBLISHERS_URL, Book, Category,
# Publisher, CSVHandler, SeleniumDriver) come from the project's own modules.


class BookScrapper(object):
    def __init__(self, without_browser=False, optimized_mode=True):
        self.driver = SeleniumDriver().chrome_driver(
            without_browser=without_browser, optimized_mode=optimized_mode)
        logging.info("initializing finished")
        self.csv_handler = CSVHandler()

    def extract_books_by_category(self):
        logging.info("extract_books_by_category started")
        categories = self.__scrape_categories_link(category_url=CATEGORY_URL)
        out_put_file = ""
        for category in categories:
            out_put_file = self.csv_handler.export_book_to_csv(
                self.__scrape_books_by_category(category),
                file_name="ketabrah")
        return out_put_file

    def extract_books_by_publishers(self):
        logging.info("extract_books_by_publishers started")
        publishers = self.__scrape_publishers_link(
            publishers_url=PUBLISHERS_URL)
        out_put_file = ""
        # for test:
        # publishers = publishers[-2:]
        for publisher in publishers:
            out_put_file = self.csv_handler.export_book_to_csv(
                self.__scrape_books_by_publishers(publisher),
                file_name="ketabrah")
        return out_put_file

    def __scrape_books_by_category(self, category):
        list_books = []
        page_index = 1
        while True:
            url = self.__generate_url(category.url, page_index)
            self.driver.get(url)
            try:
                book_list_element = self.driver.find_element_by_class_name(
                    "book-list")
                book_elements = book_list_element.find_elements_by_class_name(
                    "item")
                try:
                    for book in book_elements:
                        book_instance = Book()
                        book_instance.title = book.find_element_by_class_name(
                            "title").text
                        logging.info(book_instance.title)
                        book_instance.author = book.find_element_by_class_name(
                            "authors").text
                        book_instance.price = book.find_element_by_class_name(
                            "price").text
                        book_instance.category = category.title
                        list_books.append(book_instance)
                except Exception:
                    # a malformed item aborts this page but keeps earlier books
                    pass
                page_index = page_index + 1
            except Exception:
                # no "book-list" element: we paged past the last page
                return list_books

    @staticmethod
    def __generate_url(url, page_index):
        _url = url + "/page-" + str(page_index)
        return _url

    def __scrape_categories_link(self, category_url):
        categories = []
        self.driver.get(url=category_url)
        menu_items = self.driver.find_element_by_class_name("cr-menu")
        items = menu_items.find_elements_by_class_name("crm-item")
        for item in items:
            item_url = item.find_element_by_xpath(
                ".//a[@href]").get_attribute("href")
            if "book-category" in item_url:
                category = Category()
                category.url = item_url
                logging.info(item.text)
                category.title = item.text
                categories.append(category)
        return categories

    def __scrape_publishers_link(self, publishers_url):
        self.driver.get(publishers_url)
        publishers_list_element = self.driver.find_element_by_class_name(
            "publishers-list")
        publisher_blocks_elements = publishers_list_element.find_elements_by_class_name(
            "publisher-block")
        list_publishers = []
        for publisher_block in publisher_blocks_elements:
            publisher = Publisher()
            publisher.url = publisher_block.get_attribute("href")
            publisher.name = publisher_block.find_element_by_class_name(
                "publisher-block-name").text
            logging.info(publisher.name)
            list_publishers.append(publisher)
        return list_publishers

    def __scrape_books_by_publishers(self, publisher):
        list_books = []
        page_index = 1
        while True:
            url = self.__generate_url(publisher.url, page_index)
            self.driver.get(url)
            try:
                book_list_element = self.driver.find_element_by_class_name(
                    "book-list")
                book_elements = book_list_element.find_elements_by_class_name(
                    "item")
                try:
                    for book in book_elements:
                        book_instance = Book()
                        book_instance.title = book.find_element_by_class_name(
                            "title").text
                        logging.info(book_instance.title)
                        book_instance.author = book.find_element_by_class_name(
                            "authors").text
                        book_instance.price = book.find_element_by_class_name(
                            "price").text
                        book_instance.publisher = publisher.name
                        list_books.append(book_instance)
                except Exception:
                    # a malformed item aborts this page but keeps earlier books
                    pass
                page_index = page_index + 1
            except Exception:
                # no "book-list" element: we paged past the last page
                return list_books
import json
import logging
import os
from urllib.parse import unquote

import ijson
import requests
from furl import furl

# Project-local names (TEMPLATE_URL, HOST, JSON_FILE_PATH, Book, Pagination,
# CSVHandler) come from the project's own modules.


class BookScrapper(object):
    def __init__(self):
        self.__remove_json_file()
        self.response_count = 100
        self.INITIAL_OFFSET = "0-0-0-100"
        self.csv_handler = CSVHandler()

    def extract_books_api_by_category(self, category_id):
        __pagination = self.__get_next_offset_from_json()
        out_put_file = ""
        while True:
            if __pagination.hasMore:
                url = self.__generate_url_pagination_by_category(
                    category_id, offset=__pagination.offset)
                out_put_file = self.csv_handler.export_book_to_csv(
                    file_name="taghche",
                    books=self.__get_books_from_api(url=url))
                __pagination = self.__get_next_offset_from_json()
            else:
                return out_put_file

    def set_response_count(self, count: int):
        # the last dash-separated field of the offset is the page size
        self.INITIAL_OFFSET = "0-0-0-" + str(count)
        self.response_count = count

    @staticmethod
    def __generate_url_pagination_by_category(category_id, offset):
        _unquote_url = furl(unquote(TEMPLATE_URL))
        _filters = json.loads(_unquote_url.query.params["filters"])
        _filters["list"][0]["value"] = str(category_id)
        _category_filter = json.dumps(_filters)
        _paginated_furl = furl(unquote(HOST)).add(args={
            'filters': _category_filter,
            'offset': offset
        })
        return _paginated_furl.url

    def __get_books_from_api(self, url):
        logging.info("API Called")
        r = requests.get(url)
        r.encoding = 'UTF-8'
        self.__save_json_to_file(r.json(), JSON_FILE_PATH)
        with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
            logging.info('reading from json file')
            json_books = ijson.items(input_file, "bookList.books.item")
            books = []
            for book in json_books:
                book_instance = Book()
                book_instance.title = book["title"]
                book_instance.page_number = book["numberOfPages"]
                book_instance.price = book["price"]
                book_instance.publisher = book["publisher"]
                book_instance.author = self.convert_authors_to_string(
                    book["authors"])
                book_instance.category = self.convert_categories_to_string(
                    book["categories"])
                # book_instance.beforeOffPrice = book["beforeOffPrice"]
                # book_instance.rating = book["rating"]
                # book_instance.physicalPrice = book["PhysicalPrice"]
                # book_instance.publish_date = book["publishDate"]
                books.append(book_instance)
            return books

    @staticmethod
    def convert_authors_to_string(list_authors):
        authors_name = " "
        for author in list_authors:
            authors_name = (authors_name + str(author["firstName"]) + " "
                            + str(author["lastName"]) + " ")
        return authors_name

    @staticmethod
    def convert_categories_to_string(list_categories):
        category_name = " "
        for category in list_categories:
            category_name = category_name + str(category["title"]) + " "
        return category_name

    @staticmethod
    def change_offset_length(offset, length=20):
        # rewrite the page-size field (last dash-separated part) of the offset
        _custom_offset = offset.split('-')
        _custom_offset[-1] = str(length)
        _offset = '-'.join(_custom_offset)
        logging.info(offset)
        return _offset

    def __get_next_offset_from_json(self):
        try:
            with open(JSON_FILE_PATH, encoding='utf-8-sig') as input_file:
                json_object = json.load(input_file)
                pagination = Pagination()
                pagination.hasMore = json_object["hasMore"]
                pagination.offset = self.change_offset_length(
                    json_object["nextOffset"], self.response_count)
                return pagination
        except Exception:
            # no saved response yet: start from the initial offset
            pagination = Pagination()
            pagination.hasMore = True
            pagination.offset = self.INITIAL_OFFSET
            return pagination

    @staticmethod
    def __save_json_to_file(json_object, file_name):
        try:
            with open(file_name, 'w') as json_file:
                json.dump(json_object, json_file)
        except Exception as error:
            logging.error("JSON could not be saved")
            raise error

    @staticmethod
    def __remove_json_file():
        if os.path.exists(JSON_FILE_PATH):
            os.remove(JSON_FILE_PATH)
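# A hypothetical run of the offset-paginated API scraper above; the category
# id is a placeholder, and TEMPLATE_URL / HOST / JSON_FILE_PATH are assumed
# to come from the project's settings:
if __name__ == "__main__":
    scrapper = BookScrapper()
    scrapper.set_response_count(50)  # ask the API for 50 books per page
    csv_path = scrapper.extract_books_api_by_category(category_id=29)
    logging.info("books exported to %s", csv_path)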