class Engine(object): """ Engine to process: https://coderprog.com/ """ __host__ = 'coderprog' baseurl: str = "https://coderprog.com" total_of_pages: int = 0 total_of_pages_classified: int = 0 orm: str = '' data_engine: str = object def __init__(self, orm: str = '', **kwargs) -> None: self.orm = orm self.data_engine = DataEngine(orm=self.orm) def item_save(self, book_data: list) -> bool: try: result = self.data_engine.save(book_data) except Exception: result = False return result def process_item(self, code: str, referer: str = '') -> object: item_url = self.baseurl + "/" + code + "/" #item_url = "https://coderprog.com/red-hat-certified-engineer-rhel-8-rhce/" bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser') du = bs.find("span", { 'class': 'thetime date updated' }).get_text().strip() try: du = bs.find("span", { 'class': 'thetime date updated' }).get_text().strip() date_posted = parser.parse(du).date() except Exception: date_posted = None try: thumb = self.baseurl + bs.find("div", { 'class': 'thecontent' }).find("img")['src'] except Exception: thumb = 'none' try: description = 'none' except Exception: description = 'none' try: title = bs.find("div", {'class': 'thecontent'}).find("img")['alt'] except Exception: title = 'none' metadata = bs.find("div", { 'class': 'thecontent' }).findAll("div")[0].get_text().strip().split("\n")[1] submetadata = metadata.split("|") video = True if "MP4" in metadata else False if video is True: date_published = None pages = 0 try: language = submetadata[0].strip() except Exception: language = 0 try: size = 0 size_literal = submetadata[-1].strip() except Exception: size = 0 size_literal = None try: duration = 0 duration_literal = submetadata[-2].strip() except Exception: duration = None duration_literal = None isbn10 = isbn13 = 0 else: try: mdate = submetadata[1].strip() date_published = datetime.strptime(mdate, '%Y').date() except Exception: date_published = None try: pages = submetadata[3].strip() pages = re.search("([0-9]+) Pages", pages)[1] except Exception: pages = 0 try: language = submetadata[0].strip() except Exception: language = 0 try: s = submetadata[5].strip() size = int(re.search("([0-9]+) MB", s)[1]) * 1024 * 1024 size_literal = s except Exception: size = 0 size_literal = None try: isbn = submetadata[2].strip() ib = re.search("ISBN: ([0-9]+)", isbn)[1] isbn13 = "978-" + ib isbn10 = ib except Exception: isbn13 = 0 isbn10 = 0 duration_literal = duration = None data = { 'title': title, 'date_published': date_published, 'date_posted': date_posted, 'pages': pages, 'language': language, 'code': code, 'url': item_url, 'author': "none", 'publisher': "none", 'isbn10': isbn10, 'isbn13': isbn13, 'thumbnail': thumb, 'engine': self.__host__, 'format': 'text' if video is False else "video", 'size': size, 'size_literal': size_literal, 'duration': duration, 'duration_literal': duration_literal, 'description': str(description) } return data def process_page(self, page_number: int = 1, progressbar: object = None) -> []: #print("Processing Page: " + str(page_number) + " of " + str(self.total_of_pages)) page_url = self.baseurl + "/page/" + str( page_number) + "/" if page_number > 1 else self.baseurl bs = BeautifulSoup(wget(page_url), 'html.parser') nameList = bs.find("div", { 'id': 'content_box' }).findAll('article', {'class': 'latestPost'}) data = [] for _index, i in enumerate(nameList): if progressbar is not None: progressbar() data = i.find('h2').find('a') data_text = data.get_text() code = data['href'].replace(self.baseurl, "").replace("/", "") 
#print(f"\t\t[page={page_number}]item: " + str(index + 1) + " of " + str(len(nameList))) isset = self.data_engine.isset_code(code=code, engine=self.__host__) #print(f"{code} => {isset}") if isset is False: try: book_data = self.process_item(code=code, referer=page_url) self.item_save(book_data=book_data) pass except Exception as e: print( f"Error processing page: {page_url} , title: {data_text}, item: " + self.baseurl + "/" + code + "/") print(e) return True def count_total_pages(self) -> int: bs = BeautifulSoup(wget(self.baseurl), 'html.parser') content = bs.findAll("a", {'class': 'page-numbers'}) total_pages = int(content[-2].get_text().strip().replace(",", "")) total_items = bs.find("div", { 'id': 'content_box' }).findAll('article', {'class': 'latestPost'}) self.total_of_pages = total_pages self.totat_items_per_page = len(total_items) return total_pages, self.totat_items_per_page def num_of_pages_to_process(self, start_from_page: int = 1) -> ([], int): """ Return all the sanitized pages Keyword Arguments: start_from_page {int} -- What page are going to start (default: {1}) Returns: list -- All the pages to be processed """ total_pages, num_items_per_page = self.count_total_pages() entries = [] for i in range(total_pages): current_page = i + 1 if current_page >= start_from_page: entries.append(i + 1) self.total_of_pages_classified = len(entries) return entries, num_items_per_page def run(self, start_from_page: int = 1) -> None: pages, _ = self.num_of_pages_to_process( start_from_page=start_from_page) for current_page in pages: self.process_page(current_page) def fix(self): import pprint as pp d = DataEngine() session, table = d.get_engine() r = session.query(table).filter(table.date == "0000-00-00").all() for i in r: processed = self.process_item(i.code) print("------------------ begin ----------------------") #table.__table__.update().where(table.id==i.id).values(date=processed['date']) session.query(table).filter(table.id == i.id).update( {table.date: processed['date']}, synchronize_session=False) pp.pprint((processed['url'], ": ", processed['date'])) print("------------------ end -----------------------") session.commit()
class Engine(object): """ Engine to process: https://www.letmeread.net """ __host__ = 'letmeread' baseurl: str = "https://www.letmeread.net" total_of_pages: int = 0 total_of_pages_classified: int = 0 orm: str = '' data_engine: str = object def __init__(self, orm: str = '', **kwargs) -> None: self.orm = orm self.data_engine = DataEngine(orm=self.orm) def item_save(self, book_data: list) -> bool: try: result = self.data_engine.save(book_data) except Exception: result = False return result def process_item(self, code: str) -> object: item_url = self.baseurl + "/" + code + "/" bs = BeautifulSoup(wget(item_url), 'html.parser') try: du = bs.find("meta", {'property': 'article:published_time'})['content'] date_posted = parser.parse(du).date() except Exception: date_posted = None try: thumb = bs.find("img", {'class': 'align-self-start img-fluid'})['src'] except Exception: thumb = 'none' try: description = bs.find("div", { 'class': 'col-md-8' }).find("div", { 'class': 'card mb-4' }).find("div", {'class': 'card-body'}) except Exception: description = 'none' data = { 'title': "none", 'date_published': None, 'date_posted': date_posted, 'pages': 0, 'language': "none", 'code': code, 'url': item_url, 'author': "none", 'publisher': "none", 'isbn10': "", 'isbn13': "none", 'thumbnail': thumb, 'engine': 'letmeread', 'format': 'text', 'size': 0, 'description': (description) } c = bs.find("ul", {'class': 'list-unstyled mb-0'}).findAll("li") for i in c: cc = i.get_text().strip() item = re.findall("([a-zA-Z0-9\- ]+): (.*)", cc) # print(item) ititle = item[0][0].strip() ivalue = item[0][1].strip() if (ititle == "Title"): data['title'] = ivalue elif (ititle == "Author"): data['author'] = ivalue elif (ititle == "Length"): num_of_pages = re.search("([0-9]+) pages", ivalue)[1] data['pages'] = num_of_pages elif (ititle == "Language"): data['language'] = ivalue elif (ititle == "Publisher"): data['publisher'] = ivalue elif (ititle == "Publication Date"): try: d = datetime.strptime(ivalue, '%Y').date() data['date_published'] = d except Exception: try: d = datetime.strptime(ivalue, '%Y-%m-%d').date() data['date_published'] = d except Exception: try: d = datetime.strptime(ivalue, '%Y-%m').date() data['date_published'] = d except Exception: pass elif (ititle == "ISBN-10"): data['isbn10'] = ivalue elif (ititle == "ISBN-13"): data['isbn13'] = ivalue return data def process_page(self, page_number: int = 1, progressbar: object = None) -> []: #print("Processing Page: " + str(page_number) + " of " + str(self.total_of_pages)) page_url = self.baseurl + "/page/" + str( page_number) + "/" if page_number > 1 else self.baseurl bs = BeautifulSoup(wget(page_url), 'html.parser') nameList = bs.findAll('div', {'class': 'card-body p-2'}) data = [] for _index, i in enumerate(nameList): if progressbar is not None: progressbar() data = i.find('a') data_text = data.get_text() code = data['href'].replace("/", "") #print(f"\t\t[page={page_number}]item: " + str(index + 1) + " of " + str(len(nameList))) isset = self.data_engine.isset_code(code=code, engine=self.__host__) if isset is False: try: book_data = self.process_item(code=code) self.item_save(book_data=book_data) pass except Exception as e: print( f"Error processing page: {page_url} , title: {data_text}, item: " + self.baseurl + "/" + code + "/") print(e) return True def count_total_pages(self) -> int: bs = BeautifulSoup(wget(self.baseurl), 'html.parser') content = bs.find("li", {'class': 'page-item disabled d-none d-lg-block'}) sp = re.search('of ([0-9]+)', content.get_text().strip(), 
flags=re.IGNORECASE) total_pages = int(sp[1]) total_items = bs.findAll('div', {'class': 'card-body p-2'}) self.total_of_pages = total_pages self.totat_items_per_page = len(total_items) return total_pages, self.totat_items_per_page def num_of_pages_to_process(self, start_from_page: int = 1) -> ([], int): """ Return all the sanitized pages Keyword Arguments: start_from_page {int} -- What page are going to start (default: {1}) Returns: list -- All the pages to be processed """ total_pages, num_items_per_page = self.count_total_pages() entries = [] for i in range(total_pages): current_page = i + 1 if current_page >= start_from_page: entries.append(i + 1) self.total_of_pages_classified = len(entries) return entries, num_items_per_page def run(self, start_from_page: int = 1) -> None: pages, _ = self.num_of_pages_to_process( start_from_page=start_from_page) for current_page in pages: self.process_page(current_page) def fix(self): import pprint as pp d = DataEngine() session, table = d.get_engine() r = session.query(table).filter(table.date == "0000-00-00").all() for i in r: processed = self.process_item(i.code) print("------------------ begin ----------------------") #table.__table__.update().where(table.id==i.id).values(date=processed['date']) session.query(table).filter(table.uid == i.uid).update( {table.date_published: processed['date_published']}, synchronize_session=False) pp.pprint((processed['url'], ": ", processed['date'])) print("------------------ end -----------------------") session.commit()
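# The "Publication Date" branch above tries '%Y', then '%Y-%m-%d', then '%Y-%m' in nested
# try/except blocks. A flat loop over candidate formats does the same thing and is easier
# to extend; this is only a sketch, and the helper name is illustrative (not used elsewhere
# in the repo).
def parse_publication_date(value: str):
    """Return a date parsed with the first matching format, or None if none match."""
    from datetime import datetime
    for fmt in ('%Y', '%Y-%m-%d', '%Y-%m'):
        try:
            return datetime.strptime(value, fmt).date()
        except ValueError:
            continue
    return None

# Example: parse_publication_date("2020-05") -> datetime.date(2020, 5, 1)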
class Engine(object): """ Engine to process: http://www.allitebooks.org/ """ __host__ = 'allitebooks' baseurl: str = "http://www.allitebooks.org/" total_of_pages: int = 0 total_of_pages_classified: int = 0 orm: str = '' data_engine: str = object def __init__(self, orm: str = '', **kwargs) -> None: self.orm = orm self.data_engine = DataEngine(orm=self.orm) def item_save(self, book_data: list) -> bool: try: result = self.data_engine.save(book_data) except Exception: result = False return result def process_item(self, code: str, referer: str = '', url: str = None) -> object: if url is None: return False item_url = url bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser') try: title = bs.find("h1", {'class': 'single-title'}).get_text() try: sub = bs.find("header", {'class': 'entry-header'}).find("h4").get_text().strip() sub = ": " + sub except Exception: sub = "" title = f"{title}{sub}" except Exception: title = 'none' try: du = re.search("/uploads/([0-9]+)/([0-9]+)/", bs.find("img", {'class': 'attachment-post-thumbnail'})['src'].strip()) date_posted = datetime.strptime(f"{du[1]}-{du[2]}", "%Y-%m").date() except Exception: date_posted = None try: thumb = bs.find("img", {'class': 'attachment-post-thumbnail'})['src'].strip() except Exception: thumb = 'none' try: description = bs.find("div", "entry-content") description.find("h3").decompose() except Exception: description = 'none' submetadata = bs.find("div", {'class': 'book-detail'}).findAll("dd") #pp(submetadata) #exit() try: mdate = submetadata[2].get_text().strip() date_published = datetime.strptime(mdate, '%Y').date() except Exception: date_published = None try: author = submetadata[0].get_text().strip() except Exception: author = None try: publisher = None except Exception: publisher = None try: pages = submetadata[3].get_text().strip() except Exception: pages = 0 try: language = submetadata[4].get_text().strip() except Exception: language = None s = submetadata[5].get_text().strip() try: s = submetadata[5].get_text().strip() size = int(round(float(re.search("(.*) MB", s)[1]))) * 1024 * 1024 size_literal = s except Exception: size = 0 size_literal = None try: isbn = submetadata[1].get_text().strip().replace("-", "").split(",")[0] isbn13 = f"978{isbn}" if len(isbn) < 13 else isbn isbn10 = isbn except Exception: isbn13 = 0 isbn10 = 0 duration_literal = duration = None data = { 'title': title, 'date_published': date_published, 'date_posted': date_posted, 'pages': pages, 'language': language, 'code': code, 'url': item_url, 'author': author, 'publisher': publisher, 'isbn10': isbn10, 'isbn13': isbn13, 'thumbnail': thumb, 'engine': self.__host__, 'format': 'text', 'size': size, 'size_literal': size_literal, 'duration': duration, 'duration_literal': duration_literal, 'description': str(description) } return data def process_page(self, page_number: int = 1, progressbar: object = None) -> []: #print("Processing Page: " + str(page_number) + " of " + str(self.total_of_pages)) page_url = f"{self.baseurl}/page/{page_number}/" if page_number > 1 else self.baseurl bs = BeautifulSoup(wget(page_url), 'html.parser') nameList = bs.find("div", {'class': 'main-content-inner'}).findAll("article", {'class': 'post'}) data = [] for _index, i in enumerate(nameList): if progressbar is not None: progressbar() data = i.find('h2').find('a') code = data['href'].replace(self.baseurl, "").replace("/", "") url = data['href'] isset = self.data_engine.isset_code(code=code, engine=self.__host__) if isset is False: try: book_data = self.process_item(code=code, 
referer=page_url, url=url) self.item_save(book_data=book_data) pass except Exception as e: print(f"Error processing page: {page_url} , title: {data.get_text()}, item: " + url) print(e) return True def count_total_pages(self) -> int: bs = BeautifulSoup(wget(self.baseurl), 'html.parser') content = bs.find("div", {'class': 'main-content-inner'}).findAll("article") total_pages = int(bs.find("div", {'class': 'pagination'}).findAll("a")[-1].get_text()) self.total_of_pages = total_pages self.total_items_per_page = len(content) return total_pages, self.total_items_per_page def num_of_pages_to_process(self, start_from_page: int = 1) -> ([], int): """ Return all the sanitized pages Keyword Arguments: start_from_page {int} -- What page are going to start (default: {1}) Returns: list -- All the pages to be processed """ total_pages, num_items_per_page = self.count_total_pages() entries = [] for i in range(total_pages): current_page = i + 1 if current_page >= start_from_page: entries.append(i + 1) self.total_of_pages_classified = len(entries) return entries, num_items_per_page def run(self, start_from_page: int = 1) -> None: pages, _ = self.num_of_pages_to_process(start_from_page=start_from_page) for current_page in pages: self.process_page(current_page) def fix(self): import pprint as pp d = DataEngine() session, table = d.get_engine() r = session.query(table).filter(table.date == "0000-00-00").all() for i in r: processed = self.process_item(i.code) print("------------------ begin ----------------------") #table.__table__.update().where(table.id==i.id).values(date=processed['date']) session.query(table).filter(table.id == i.id).update({table.date: processed['date']}, synchronize_session = False) pp.pprint((processed['url'], ": ", processed['date'])) print("------------------ end -----------------------") session.commit()
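# The ISBN handling above prefixes "978" to a bare ISBN-10 without recomputing the check
# digit, so the resulting ISBN-13 may not validate. If a proper conversion is wanted, the
# standard algorithm is sketched below; the helper name is illustrative and not used
# elsewhere in the repo.
def isbn10_to_isbn13(isbn10: str) -> str:
    """Convert a bare ISBN-10 (no hyphens) to an ISBN-13 with a recomputed check digit."""
    core = "978" + isbn10[:9]                  # drop the old check digit, add the EAN prefix
    total = sum((1 if i % 2 == 0 else 3) * int(d) for i, d in enumerate(core))
    check = (10 - total % 10) % 10             # ISBN-13 check digit
    return core + str(check)

# Example: isbn10_to_isbn13("0306406152") -> "9780306406157"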
class Engine(object): """ Engine to process: https://coderprog.com/ """ __host__ = 'freedwnlds' baseurl: str = "https://freedwnlds.com" total_of_pages: int = 0 total_of_pages_classified: int = 0 orm: str = '' data_engine: str = object def __init__(self, orm: str = '', **kwargs) -> None: self.orm = orm self.data_engine = DataEngine(orm=self.orm) def item_save(self, book_data: list) -> bool: try: result = self.data_engine.save(book_data) except Exception: result = False return result def process_item(self, code: str, referer: str = '', url: str = None) -> object: if url is None: return False item_url = url #item_url = "https://freedwnlds.com/2020/05/hidden-figures-songbook-music-from-the-motion-picture-soundtrack-201/" bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser') try: title = bs.find("h1", {'class': 'entry-title'}).get_text() except Exception: title = 'none' try: du = bs.find( "time", {'class': 'entry-date published'})['datetime'].strip() date_posted = parser.parse(du).date() except Exception: date_posted = None try: thumb = bs.find("div", { 'class': 'page-header-image-single' }).find('img', {'class': 'attachment-full'})['data-lazy-src'] except Exception: thumb = 'none' try: description = 'none' except Exception: description = 'none' metadata = bs.find("meta", {'property': 'og:description'})['content'] submetadata = metadata.split("|") print(submetadata) try: mdate = submetadata[-3].strip() date_published = datetime.strptime(mdate, '%Y').date() except Exception: date_published = None try: author = submetadata[-2].strip() except Exception: author = None try: publisher = submetadata[-4].strip() except Exception: publisher = None try: pages = submetadata[-1].strip() pages = re.search("Page: ([0-9]+)", pages)[1] except Exception: pages = 0 try: language = submetadata[2].strip() except Exception: language = None try: s = submetadata[1].strip() size = int(re.search("([0-9]+) MB", s)[1]) * 1024 * 1024 size_literal = s except Exception: size = 0 size_literal = None try: isbn = submetadata[3].strip() ib = re.search("([0-9]+)", isbn)[1] isbn13 = ib isbn10 = ib[3:] except Exception: isbn13 = 0 isbn10 = 0 duration_literal = duration = None data = { 'title': title, 'date_published': date_published, 'date_posted': date_posted, 'pages': pages, 'language': language, 'code': code, 'url': item_url, 'author': author, 'publisher': publisher, 'isbn10': isbn10, 'isbn13': isbn13, 'thumbnail': thumb, 'engine': self.__host__, 'format': 'text', 'size': size, 'size_literal': size_literal, 'duration': duration, 'duration_literal': duration_literal, 'description': str(description) } #pp(data) #exit() return data def process_page(self, page_number: int = 1, progressbar: object = None) -> []: #print("Processing Page: " + str(page_number) + " of " + str(self.total_of_pages)) page_url = self.baseurl + "/category/ebooks/page/" + str( page_number ) + "/" if page_number > 1 else self.baseurl + "/category/ebooks/" bs = BeautifulSoup(wget(page_url), 'html.parser') nameList = bs.findAll("article", {'class': 'post'}) data = [] for _index, i in enumerate(nameList): if progressbar is not None: progressbar() data = i.find('h2').find('a') data_text = data.get_text() code = re.search("/([0-9]+)/([0-9]+)/([0-9a-z\-]+)/", data['href'])[3] url = data['href'] isset = self.data_engine.isset_code(code=code, engine=self.__host__) if isset is False: try: book_data = self.process_item(code=code, referer=page_url, url=url) self.item_save(book_data=book_data) pass except Exception as e: print( f"Error processing page: 
{page_url} , title: {data_text}, item: " + url) print(e) return True def count_total_pages(self) -> int: bs = BeautifulSoup(wget(self.baseurl + "/category/ebooks/"), 'html.parser') content = bs.findAll("a", {'class': 'page-numbers'}) total_pages = int(content[-2].get_text().strip().replace(",", "")) total_items = bs.findAll("article", {'class': 'post'}) self.total_of_pages = total_pages self.totat_items_per_page = len(total_items) return total_pages, self.totat_items_per_page def num_of_pages_to_process(self, start_from_page: int = 1) -> ([], int): """ Return all the sanitized pages Keyword Arguments: start_from_page {int} -- What page are going to start (default: {1}) Returns: list -- All the pages to be processed """ total_pages, num_items_per_page = self.count_total_pages() entries = [] for i in range(total_pages): current_page = i + 1 if current_page >= start_from_page: entries.append(i + 1) self.total_of_pages_classified = len(entries) return entries, num_items_per_page def run(self, start_from_page: int = 1) -> None: pages, _ = self.num_of_pages_to_process( start_from_page=start_from_page) for current_page in pages: self.process_page(current_page) def fix(self): import pprint as pp d = DataEngine() session, table = d.get_engine() r = session.query(table).filter(table.date == "0000-00-00").all() for i in r: processed = self.process_item(i.code) print("------------------ begin ----------------------") #table.__table__.update().where(table.id==i.id).values(date=processed['date']) session.query(table).filter(table.id == i.id).update( {table.date: processed['date']}, synchronize_session=False) pp.pprint((processed['url'], ": ", processed['date'])) print("------------------ end -----------------------") session.commit()
class Engine(object): """ Engine to process: https://www.wowebook.org/ """ __host__ = 'wowebook' baseurl: str = "https://www.wowebook.org" total_of_pages: int = 0 total_items_per_page: int = 0 total_of_pages_classified: int = 0 orm: str = '' data_engine: str = object def __init__(self, orm: str = '', **kwargs) -> None: self.orm = orm self.data_engine = DataEngine(orm=self.orm) def item_save(self, book_data: list) -> bool: try: result = self.data_engine.save(book_data) except Exception: result = False return result def process_item(self, code: str = None, referer: str = '', url: str = None) -> object: if url is None: return False item_url = url bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser') try: title = bs.find("h1", {'class': 'post-title'}).get_text() except Exception: title = 'none' try: du = bs.find("time", {'class': 'published'}).get_text() date_posted = parser.parse(du).date() except Exception: date_posted = None try: thumb = bs.find('div', { 'class': 'entry-inner' }).find("img", {'class': 'size-full'})['src'].strip() except Exception: thumb = 'none' try: description = 'none' except Exception: description = 'none' try: submetadata = bs.find("div", { 'class': 'entry-inner' }).find("ul").get_text() except Exception: submetadata = "" #print(submetadata) #s = re.search(r"ISBN-13:\s([0-9a-zA-Z\-]+)", submetadata)[1] #print(s) #exit() try: date_published = re.search(r"\s\(([0-9a-zA-Z,\s]+)\)", submetadata)[1] date_published = parser.parse(date_published).date() except Exception: date_published = None try: author = None except Exception: author = None try: publisher = None except Exception: publisher = None try: pages = int(re.search(r":\s([0-9]+) pages", submetadata)[1]) except Exception: pages = 0 try: language = re.search(r"Language:\s([a-zA-Z]+)", submetadata)[1] except Exception: language = None try: size = None size_literal = size except Exception: size = 0 size_literal = None try: isbn13 = re.search(r"ISBN-13:\s([0-9a-zA-Z\-]+)", submetadata)[1] isbn10 = re.search(r"ISBN-10:\s([0-9a-zA-Z]+)", submetadata)[1] except Exception: isbn13 = 0 isbn10 = 0 duration_literal = duration = None data = { 'title': title, 'date_published': date_published, 'date_posted': date_posted, 'pages': pages, 'language': language, 'code': code, 'url': item_url, 'author': author, 'publisher': publisher, 'isbn10': isbn10, 'isbn13': isbn13, 'thumbnail': thumb, 'engine': self.__host__, 'format': 'text', 'size': size, 'size_literal': size_literal, 'duration': duration, 'duration_literal': duration_literal, 'description': str(description) } #pp(data) #exit() return data def process_page(self, page_number: int = 1, progressbar: object = None) -> []: #print("Processing Page: " + str(page_number) + " of " + str(self.total_of_pages)) page_url = f"{self.baseurl}/page/{page_number}/" if page_number > 1 else self.baseurl bs = BeautifulSoup(wget(page_url), 'html.parser') nameList = bs.find("div", { 'class': 'post-list-standard' }).findAll("article") data = [] for _index, i in enumerate(nameList): if progressbar is not None: progressbar() data = i.find('h2').find('a') code = data['href'].replace(self.baseurl, "").replace("/", "") url = data['href'] isset = self.data_engine.isset_code(code=code, engine=self.__host__) if isset is False: try: book_data = self.process_item(code=code, referer=page_url, url=url) self.item_save(book_data=book_data) pass except Exception as e: print( f"Error processing page: {page_url} , title: {data.get_text()}, item: " + url) print(e) return True def count_total_pages(self) -> 
int: if self.total_of_pages > 0: return self.total_of_pages, self.total_items_per_page bs = BeautifulSoup(wget(self.baseurl), 'html.parser') content = bs.find("div", { 'class': 'post-list-standard' }).findAll("article") self.total_of_pages = self.inner_total_pages() self.total_items_per_page = len(content) return self.total_of_pages, self.total_items_per_page def b(self, lista: list = []): tam = len(lista) if tam <= 1: return lista[0] mid = tam // 2 r = self.check_pn(lista[mid]) if r == 1: return self.b(lista[mid:]) elif r == 2: return self.b(lista[:mid]) return lista[mid] def check_pn(self, data: object = {}) -> int: p = data['page'] uri = f"{self.baseurl}/page/{p}" print(f" Checking: {uri}") r = wget(url=uri, only_status=True) if r == 200: rb = wget(url=f"{self.baseurl}/page/{p+1}", only_status=True) if rb != 200: return 3 else: return 1 else: return 2 return 3 def inner_total_pages(self): return 1500 #Forced a = [] e = 10000 for i in range(e): a.append({'page': i + 1}) r = self.b(a) return r['page'] def num_of_pages_to_process(self, start_from_page: int = 1) -> ([], int): """ Return all the sanitized pages Keyword Arguments: start_from_page {int} -- What page are going to start (default: {1}) Returns: list -- All the pages to be processed """ total_pages, num_items_per_page = self.count_total_pages() entries = [] for i in range(total_pages): current_page = i + 1 if current_page >= start_from_page: entries.append(i + 1) self.total_of_pages_classified = len(entries) return entries, num_items_per_page def run(self, start_from_page: int = 1) -> None: pages, _ = self.num_of_pages_to_process( start_from_page=start_from_page) for current_page in pages: self.process_page(current_page) def fix(self): import pprint as pp d = DataEngine() session, table = d.get_engine() r = session.query(table).filter(table.date == "0000-00-00").all() for i in r: processed = self.process_item(i.code) print("------------------ begin ----------------------") #table.__table__.update().where(table.id==i.id).values(date=processed['date']) session.query(table).filter(table.id == i.id).update( {table.date: processed['date']}, synchronize_session=False) pp.pprint((processed['url'], ": ", processed['date'])) print("------------------ end -----------------------") session.commit()
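# `inner_total_pages()` above returns a hard-coded 1500, which disables the binary search
# implemented by `b()`/`check_pn()`. The same idea, written as a standalone iterative
# bisection over page numbers, looks roughly like the sketch below. It reuses the project's
# wget(url, only_status=True) call exactly as `check_pn()` does; the function name and the
# upper bound of 10000 are illustrative choices, not part of the repo.
def find_last_page(baseurl: str, upper: int = 10000) -> int:
    """Return the highest page number under baseurl/page/<n> that answers HTTP 200."""
    lo, hi = 1, upper
    while lo < hi:
        mid = (lo + hi + 1) // 2                              # bias upward so the loop terminates
        if wget(url=f"{baseurl}/page/{mid}", only_status=True) == 200:
            lo = mid                                          # page exists: search higher
        else:
            hi = mid - 1                                      # page missing: search lower
    return lo

# Example: find_last_page("https://www.wowebook.org") could replace the forced 1500.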