Пример #1
0
 def fix(self):
     """
     Re-process every stored row whose date equals "0000-00-00".

     Each row is processed again through ``process_item`` using its code,
     the ``date`` column is overwritten with the processed ``'date'``
     value, and the session is committed after every row.
     """
     from pprint import pprint
     session, table = DataEngine().get_engine()
     pending = session.query(table).filter(table.date == "0000-00-00").all()
     for row in pending:
         processed = self.process_item(row.code)
         print("------------------ begin ----------------------")
         matching = session.query(table).filter(table.id == row.id)
         matching.update({table.date: processed['date']},
                         synchronize_session=False)
         pprint((processed['url'], ": ", processed['date']))
         print("------------------ end -----------------------")
         # Commit per row so earlier repairs survive a later failure.
         session.commit()
Пример #2
0
class Engine(object):
    """
    Engine to process: https://coderprog.com/

    Walks the paginated post listing, extracts the metadata of every post
    that is not stored yet and saves it through the data engine.
    """
    __host__ = 'coderprog'
    baseurl: str = "https://coderprog.com"
    total_of_pages: int = 0
    total_of_pages_classified: int = 0
    orm: str = ''
    # Placeholder value; __init__ replaces it with a DataEngine instance.
    data_engine: object = object

    def __init__(self, orm: str = '', **kwargs) -> None:
        """Remember the ORM name and build the DataEngine that uses it."""
        self.orm = orm
        self.data_engine = DataEngine(orm=self.orm)

    def item_save(self, book_data: dict) -> bool:
        """
        Save one item through the data engine.

        Returns:
            Whatever ``data_engine.save`` returns, or False when it raises.
        """
        try:
            return self.data_engine.save(book_data)
        except Exception as error:
            # Best effort: a failed save must not stop the crawl, but it
            # should not disappear silently either.
            print(f"Error saving item: {error}")
            return False

    @staticmethod
    def _parse_metadata(metadata: str) -> dict:
        """
        Turn the "a | b | c" metadata line of a post into typed fields.

        A line containing "MP4" is treated as a video, anything else as a
        book. Fields that are missing or malformed fall back to the same
        defaults the item dictionary has always used (0 / None).
        """
        fields = metadata.split("|")
        # str.split always yields at least one element, so these two
        # lookups cannot fail.
        language = fields[0].strip()
        if "MP4" in metadata:
            try:
                duration_literal = fields[-2].strip()
                duration = 0
            except IndexError:
                duration = duration_literal = None
            return {
                'video': True,
                'date_published': None,
                'pages': 0,
                'language': language,
                'isbn10': 0,
                'isbn13': 0,
                'size': 0,
                'size_literal': fields[-1].strip(),
                'duration': duration,
                'duration_literal': duration_literal,
            }

        try:
            date_published = datetime.strptime(fields[1].strip(),
                                               '%Y').date()
        except (IndexError, ValueError):
            date_published = None
        try:
            # re.search returns None when nothing matches -> TypeError.
            pages = re.search("([0-9]+) Pages", fields[3].strip())[1]
        except (IndexError, TypeError):
            pages = 0
        try:
            size_literal = fields[5].strip()
            size = int(re.search("([0-9]+) MB",
                                 size_literal)[1]) * 1024 * 1024
        except (IndexError, TypeError):
            size, size_literal = 0, None
        try:
            isbn10 = re.search("ISBN: ([0-9]+)", fields[2].strip())[1]
            isbn13 = "978-" + isbn10
        except (IndexError, TypeError):
            isbn10 = isbn13 = 0
        return {
            'video': False,
            'date_published': date_published,
            'pages': pages,
            'language': language,
            'isbn10': isbn10,
            'isbn13': isbn13,
            'size': size,
            'size_literal': size_literal,
            'duration': None,
            'duration_literal': None,
        }

    def process_item(self, code: str, referer: str = '') -> object:
        """
        Download one post and extract its metadata.

        Arguments:
            code {str} -- Path segment of the post; the URL is
                ``baseurl/code/``
            referer {str} -- Passed through to ``wget`` (default: {''})

        Returns:
            dict -- The item fields expected by ``item_save``

        Raises:
            Whatever the metadata lookup raises when the "thecontent"
            block or its metadata line is missing; ``process_page``
            catches and reports it.
        """
        item_url = self.baseurl + "/" + code + "/"
        bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser')
        try:
            du = bs.find("span", {
                'class': 'thetime date updated'
            }).get_text().strip()
            date_posted = parser.parse(du).date()
        except Exception:
            # Missing span or unparseable text: the date is optional.
            date_posted = None
        content = bs.find("div", {'class': 'thecontent'})
        try:
            thumb = self.baseurl + content.find("img")['src']
        except (AttributeError, TypeError, KeyError):
            thumb = 'none'
        try:
            title = content.find("img")['alt']
        except (AttributeError, TypeError, KeyError):
            title = 'none'
        metadata = content.findAll("div")[0].get_text().strip().split(
            "\n")[1]
        meta = self._parse_metadata(metadata)
        return {
            'title': title,
            'date_published': meta['date_published'],
            'date_posted': date_posted,
            'pages': meta['pages'],
            'language': meta['language'],
            'code': code,
            'url': item_url,
            'author': "none",
            'publisher': "none",
            'isbn10': meta['isbn10'],
            'isbn13': meta['isbn13'],
            'thumbnail': thumb,
            'engine': self.__host__,
            'format': "video" if meta['video'] else 'text',
            'size': meta['size'],
            'size_literal': meta['size_literal'],
            'duration': meta['duration'],
            'duration_literal': meta['duration_literal'],
            'description': 'none'
        }

    def process_page(self,
                     page_number: int = 1,
                     progressbar: object = None) -> bool:
        """
        Process every post listed on one listing page.

        Posts whose code is already known to the data engine are skipped.
        A failure while processing one post is printed and does not stop
        the remaining posts.

        Arguments:
            page_number {int} -- Listing page to read (default: {1})
            progressbar {callable} -- Called once per listed post when
                given (default: {None})

        Returns:
            bool -- Always True
        """
        if page_number > 1:
            page_url = self.baseurl + "/page/" + str(page_number) + "/"
        else:
            page_url = self.baseurl
        bs = BeautifulSoup(wget(page_url), 'html.parser')
        articles = bs.find("div", {
            'id': 'content_box'
        }).findAll('article', {'class': 'latestPost'})
        for article in articles:
            if progressbar is not None:
                progressbar()

            link = article.find('h2').find('a')
            data_text = link.get_text()
            code = link['href'].replace(self.baseurl, "").replace("/", "")
            isset = self.data_engine.isset_code(code=code,
                                                engine=self.__host__)
            if isset is False:
                try:
                    book_data = self.process_item(code=code, referer=page_url)
                    self.item_save(book_data=book_data)
                except Exception as e:
                    print(
                        f"Error processing page: {page_url} , title: {data_text}, item: "
                        + self.baseurl + "/" + code + "/")
                    print(e)
        return True

    def count_total_pages(self) -> tuple:
        """
        Read the front page to learn how many listing pages exist.

        Returns:
            tuple -- (total pages, number of posts on the front page)
        """
        bs = BeautifulSoup(wget(self.baseurl), 'html.parser')
        content = bs.findAll("a", {'class': 'page-numbers'})
        # The second-to-last pagination link carries the page count.
        total_pages = int(content[-2].get_text().strip().replace(",", ""))

        total_items = bs.find("div", {
            'id': 'content_box'
        }).findAll('article', {'class': 'latestPost'})
        self.total_of_pages = total_pages
        # Attribute name (including its spelling) kept as is, since code
        # outside this class may read it.
        self.totat_items_per_page = len(total_items)
        return total_pages, self.totat_items_per_page

    def num_of_pages_to_process(self, start_from_page: int = 1) -> tuple:
        """
        Return all the sanitized pages

        Keyword Arguments:
            start_from_page {int} -- What page are going to start (default: {1})

        Returns:
            tuple -- (list of page numbers to process, items per page)
        """
        total_pages, num_items_per_page = self.count_total_pages()
        # Pages are numbered from 1; a start below 1 means "everything".
        entries = list(range(max(start_from_page, 1), total_pages + 1))
        self.total_of_pages_classified = len(entries)
        return entries, num_items_per_page

    def run(self, start_from_page: int = 1) -> None:
        """Process every listing page from ``start_from_page`` onwards."""
        pages, _ = self.num_of_pages_to_process(
            start_from_page=start_from_page)
        for current_page in pages:
            self.process_page(current_page)

    def fix(self):
        """
        Re-process stored rows whose date equals "0000-00-00".

        NOTE(review): ``process_item`` returns 'date_posted' and
        'date_published' but no 'date' key, so ``processed['date']`` below
        raises KeyError as written. Confirm which value the ``date``
        column should receive before relying on this method.
        """
        import pprint as pp
        d = DataEngine()
        session, table = d.get_engine()
        r = session.query(table).filter(table.date == "0000-00-00").all()
        for i in r:
            processed = self.process_item(i.code)
            print("------------------ begin ----------------------")
            session.query(table).filter(table.id == i.id).update(
                {table.date: processed['date']}, synchronize_session=False)
            pp.pprint((processed['url'], ": ", processed['date']))
            print("------------------ end -----------------------")
            session.commit()
Пример #3
0
 def __init__(self, orm: str = '', **kwargs) -> None:
     """Remember the ORM name and build the DataEngine that uses it.

     Extra keyword arguments are accepted and ignored.
     """
     self.orm = orm
     engine = DataEngine(orm=self.orm)
     self.data_engine = engine
Пример #4
0
class Engine(object):
    """
    Engine to process: https://www.letmeread.net

    Walks the paginated listing, extracts the metadata of every book that
    is not stored yet and saves it through the data engine.
    """
    __host__ = 'letmeread'
    baseurl: str = "https://www.letmeread.net"
    total_of_pages: int = 0
    total_of_pages_classified: int = 0
    orm: str = ''
    # Placeholder value; __init__ replaces it with a DataEngine instance.
    data_engine: object = object

    # Detail labels that are copied verbatim into the item dictionary.
    _PLAIN_FIELDS = {
        'Title': 'title',
        'Author': 'author',
        'Language': 'language',
        'Publisher': 'publisher',
        'ISBN-10': 'isbn10',
        'ISBN-13': 'isbn13',
    }
    # strptime layouts tried, in this order, for "Publication Date".
    _DATE_FORMATS = ('%Y', '%Y-%m-%d', '%Y-%m')

    def __init__(self, orm: str = '', **kwargs) -> None:
        """Remember the ORM name and build the DataEngine that uses it."""
        self.orm = orm
        self.data_engine = DataEngine(orm=self.orm)

    def item_save(self, book_data: dict) -> bool:
        """
        Save one item through the data engine.

        Returns:
            Whatever ``data_engine.save`` returns, or False when it raises.
        """
        try:
            return self.data_engine.save(book_data)
        except Exception as error:
            # Best effort: a failed save must not stop the crawl, but it
            # should not disappear silently either.
            print(f"Error saving item: {error}")
            return False

    @staticmethod
    def _split_detail(text: str):
        """
        Split a "Label: value" detail line.

        Returns:
            tuple -- (label, value), both stripped, or None when the line
            does not have that shape
        """
        found = re.findall(r"([a-zA-Z0-9\- ]+): (.*)", text)
        if not found:
            return None
        label, value = found[0]
        return label.strip(), value.strip()

    @classmethod
    def _parse_publication_date(cls, value: str):
        """Parse ``value`` with the first matching layout; None if none fit."""
        for layout in cls._DATE_FORMATS:
            try:
                return datetime.strptime(value, layout).date()
            except ValueError:
                continue
        return None

    def process_item(self, code: str) -> object:
        """
        Download one book page and extract its metadata.

        Arguments:
            code {str} -- Path segment of the book; the URL is
                ``baseurl/code/``

        Returns:
            dict -- The item fields expected by ``item_save``

        Raises:
            AttributeError -- When the page has no details list
                (``ul.list-unstyled.mb-0``); ``process_page`` catches and
                reports it.
        """
        item_url = self.baseurl + "/" + code + "/"
        bs = BeautifulSoup(wget(item_url), 'html.parser')
        try:
            du = bs.find("meta",
                         {'property': 'article:published_time'})['content']
            date_posted = parser.parse(du).date()
        except Exception:
            # Missing meta tag or unparseable text: the date is optional.
            date_posted = None
        try:
            thumb = bs.find("img",
                            {'class': 'align-self-start img-fluid'})['src']
        except (TypeError, KeyError):
            thumb = 'none'
        try:
            description = bs.find("div", {
                'class': 'col-md-8'
            }).find("div", {
                'class': 'card mb-4'
            }).find("div", {'class': 'card-body'})
        except AttributeError:
            description = 'none'
        data = {
            'title': "none",
            'date_published': None,
            'date_posted': date_posted,
            'pages': 0,
            'language': "none",
            'code': code,
            'url': item_url,
            'author': "none",
            'publisher': "none",
            'isbn10': "",
            'isbn13': "none",
            'thumbnail': thumb,
            'engine': self.__host__,
            'format': 'text',
            'size': 0,
            'description': (description)
        }
        details = bs.find("ul", {'class': 'list-unstyled mb-0'}).findAll("li")
        for entry in details:
            detail = self._split_detail(entry.get_text().strip())
            if detail is None:
                # A line without "Label: value" carries nothing usable and
                # must not abort the whole item.
                continue
            label, value = detail
            if label in self._PLAIN_FIELDS:
                data[self._PLAIN_FIELDS[label]] = value
            elif label == "Length":
                pages = re.search("([0-9]+) pages", value)
                if pages is not None:
                    data['pages'] = pages[1]
            elif label == "Publication Date":
                published = self._parse_publication_date(value)
                if published is not None:
                    data['date_published'] = published
        return data

    def process_page(self,
                     page_number: int = 1,
                     progressbar: object = None) -> bool:
        """
        Process every book listed on one listing page.

        Books whose code is already known to the data engine are skipped.
        A failure while processing one book is printed and does not stop
        the remaining books.

        Arguments:
            page_number {int} -- Listing page to read (default: {1})
            progressbar {callable} -- Called once per listed book when
                given (default: {None})

        Returns:
            bool -- Always True
        """
        if page_number > 1:
            page_url = self.baseurl + "/page/" + str(page_number) + "/"
        else:
            page_url = self.baseurl
        bs = BeautifulSoup(wget(page_url), 'html.parser')
        cards = bs.findAll('div', {'class': 'card-body p-2'})
        for card in cards:
            if progressbar is not None:
                progressbar()
            link = card.find('a')
            data_text = link.get_text()
            code = link['href'].replace("/", "")
            isset = self.data_engine.isset_code(code=code,
                                                engine=self.__host__)
            if isset is False:
                try:
                    book_data = self.process_item(code=code)
                    self.item_save(book_data=book_data)
                except Exception as e:
                    print(
                        f"Error processing page: {page_url} , title: {data_text}, item: "
                        + self.baseurl + "/" + code + "/")
                    print(e)
        return True

    def count_total_pages(self) -> tuple:
        """
        Read the front page to learn how many listing pages exist.

        Returns:
            tuple -- (total pages, number of books on the front page)
        """
        bs = BeautifulSoup(wget(self.baseurl), 'html.parser')
        content = bs.find("li",
                          {'class': 'page-item disabled d-none d-lg-block'})
        # The pager text contains "of <total>".
        sp = re.search('of ([0-9]+)',
                       content.get_text().strip(),
                       flags=re.IGNORECASE)
        total_pages = int(sp[1])
        total_items = bs.findAll('div', {'class': 'card-body p-2'})
        self.total_of_pages = total_pages
        # Attribute name (including its spelling) kept as is, since code
        # outside this class may read it.
        self.totat_items_per_page = len(total_items)
        return total_pages, self.totat_items_per_page

    def num_of_pages_to_process(self, start_from_page: int = 1) -> tuple:
        """
        Return all the sanitized pages

        Keyword Arguments:
            start_from_page {int} -- What page are going to start (default: {1})

        Returns:
            tuple -- (list of page numbers to process, items per page)
        """
        total_pages, num_items_per_page = self.count_total_pages()
        # Pages are numbered from 1; a start below 1 means "everything".
        entries = list(range(max(start_from_page, 1), total_pages + 1))
        self.total_of_pages_classified = len(entries)
        return entries, num_items_per_page

    def run(self, start_from_page: int = 1) -> None:
        """Process every listing page from ``start_from_page`` onwards."""
        pages, _ = self.num_of_pages_to_process(
            start_from_page=start_from_page)
        for current_page in pages:
            self.process_page(current_page)

    def fix(self):
        """
        Re-process stored rows whose date equals "0000-00-00".

        Each row is processed again by its code and its ``date_published``
        column is overwritten with the freshly parsed value; the session
        is committed after every row.
        """
        import pprint as pp
        d = DataEngine()
        session, table = d.get_engine()
        r = session.query(table).filter(table.date == "0000-00-00").all()
        for i in r:
            processed = self.process_item(i.code)
            print("------------------ begin ----------------------")
            session.query(table).filter(table.uid == i.uid).update(
                {table.date_published: processed['date_published']},
                synchronize_session=False)
            # process_item returns no 'date' key; report the value that
            # was just written instead of failing before the commit.
            pp.pprint((processed['url'], ": ", processed['date_published']))
            print("------------------ end -----------------------")
            session.commit()
Пример #5
0
class Engine(object):
    """
    Engine to process: http://www.allitebooks.org/

    Walks the paginated listing, extracts the metadata of every book that
    is not stored yet and saves it through the data engine.
    """
    __host__ = 'allitebooks'
    baseurl: str = "http://www.allitebooks.org/"
    total_of_pages: int = 0
    total_of_pages_classified: int = 0
    orm: str = ''
    # Placeholder value; __init__ replaces it with a DataEngine instance.
    data_engine: object = object

    def __init__(self, orm: str = '', **kwargs) -> None:
        """Remember the ORM name and build the DataEngine that uses it."""
        self.orm = orm
        self.data_engine = DataEngine(orm=self.orm)

    def item_save(self, book_data: dict) -> bool:
        """
        Save one item through the data engine.

        Returns:
            Whatever ``data_engine.save`` returns, or False when it raises.
        """
        try:
            return self.data_engine.save(book_data)
        except Exception as error:
            # Best effort: a failed save must not stop the crawl, but it
            # should not disappear silently either.
            print(f"Error saving item: {error}")
            return False

    @staticmethod
    def _detail_text(details, index: int):
        """Return the stripped text of ``details[index]``, or None."""
        try:
            return details[index].get_text().strip()
        except (IndexError, AttributeError):
            return None

    @staticmethod
    def _parse_size(text):
        """
        Convert a "<number> MB" literal into bytes.

        Returns:
            tuple -- (size in bytes, original literal), or (0, None) when
            the text is missing or not in that form
        """
        try:
            megabytes = int(round(float(re.search("(.*) MB", text)[1])))
        except (TypeError, ValueError, OverflowError):
            return 0, None
        return megabytes * 1024 * 1024, text

    @staticmethod
    def _posted_date_from_thumbnail(thumb):
        """Derive the posting month from a ".../uploads/YYYY/MM/..." URL."""
        try:
            du = re.search("/uploads/([0-9]+)/([0-9]+)/", thumb)
            return datetime.strptime(f"{du[1]}-{du[2]}", "%Y-%m").date()
        except (TypeError, ValueError):
            return None

    def _page_url(self, page_number: int) -> str:
        """Build the listing URL for ``page_number`` (page 1 is baseurl)."""
        if page_number > 1:
            # baseurl already ends with "/"; avoid producing "//page/".
            return f"{self.baseurl.rstrip('/')}/page/{page_number}/"
        return self.baseurl

    def process_item(self, code: str, referer: str = '', url: str = None) -> object:
        """
        Download one book page and extract its metadata.

        Arguments:
            code {str} -- Identifier stored with the item
            referer {str} -- Passed through to ``wget`` (default: {''})
            url {str} -- Address of the book page (default: {None})

        Returns:
            dict -- The item fields expected by ``item_save``, or False
            when no url is given

        Raises:
            AttributeError -- When the page has no "book-detail" block;
                ``process_page`` catches and reports it.
        """
        if url is None:
            return False
        item_url = url
        bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser')
        try:
            title = bs.find("h1", {'class': 'single-title'}).get_text()
        except AttributeError:
            title = 'none'
        else:
            try:
                sub = ": " + bs.find("header", {
                    'class': 'entry-header'
                }).find("h4").get_text().strip()
            except AttributeError:
                sub = ""
            title = f"{title}{sub}"
        try:
            thumb = bs.find("img", {'class': 'attachment-post-thumbnail'})['src'].strip()
        except (AttributeError, TypeError, KeyError):
            thumb = 'none'
        # The upload path of the thumbnail is the only posting date hint.
        date_posted = self._posted_date_from_thumbnail(thumb)
        description = bs.find("div", "entry-content")
        if description is None:
            description = 'none'
        else:
            heading = description.find("h3")
            # Drop the heading when there is one; a description without a
            # heading is still a description.
            if heading is not None:
                heading.decompose()
        submetadata = bs.find("div", {'class': 'book-detail'}).findAll("dd")
        try:
            date_published = datetime.strptime(
                self._detail_text(submetadata, 2), '%Y').date()
        except (TypeError, ValueError):
            date_published = None
        author = self._detail_text(submetadata, 0)
        pages = self._detail_text(submetadata, 3)
        if pages is None:
            pages = 0
        language = self._detail_text(submetadata, 4)
        size, size_literal = self._parse_size(
            self._detail_text(submetadata, 5))
        raw_isbn = self._detail_text(submetadata, 1)
        if raw_isbn is None:
            isbn13 = isbn10 = 0
        else:
            # Several ISBNs may be listed; only the first one is kept.
            isbn10 = raw_isbn.replace("-", "").split(",")[0]
            isbn13 = f"978{isbn10}" if len(isbn10) < 13 else isbn10
        return {
            'title': title,
            'date_published': date_published,
            'date_posted': date_posted,
            'pages': pages,
            'language': language,
            'code': code,
            'url': item_url,
            'author': author,
            'publisher': None,
            'isbn10': isbn10,
            'isbn13': isbn13,
            'thumbnail': thumb,
            'engine': self.__host__,
            'format': 'text',
            'size': size,
            'size_literal': size_literal,
            'duration': None,
            'duration_literal': None,
            'description': str(description)
        }

    def process_page(self, page_number: int = 1, progressbar: object = None) -> bool:
        """
        Process every book listed on one listing page.

        Books whose code is already known to the data engine are skipped.
        A failure while processing one book is printed and does not stop
        the remaining books.

        Arguments:
            page_number {int} -- Listing page to read (default: {1})
            progressbar {callable} -- Called once per listed book when
                given (default: {None})

        Returns:
            bool -- Always True
        """
        page_url = self._page_url(page_number)
        bs = BeautifulSoup(wget(page_url), 'html.parser')
        articles = bs.find("div", {'class': 'main-content-inner'}).findAll("article", {'class': 'post'})
        for article in articles:
            if progressbar is not None:
                progressbar()
            link = article.find('h2').find('a')
            url = link['href']
            code = url.replace(self.baseurl, "").replace("/", "")
            isset = self.data_engine.isset_code(code=code, engine=self.__host__)
            if isset is False:
                try:
                    book_data = self.process_item(code=code, referer=page_url, url=url)
                    self.item_save(book_data=book_data)
                except Exception as e:
                    print(f"Error processing page: {page_url} , title: {link.get_text()}, item: " + url)
                    print(e)
        return True

    def count_total_pages(self) -> tuple:
        """
        Read the front page to learn how many listing pages exist.

        Returns:
            tuple -- (total pages, number of articles on the front page)
        """
        bs = BeautifulSoup(wget(self.baseurl), 'html.parser')
        content = bs.find("div", {'class': 'main-content-inner'}).findAll("article")
        # The last pagination link carries the page count.
        total_pages = int(bs.find("div", {'class': 'pagination'}).findAll("a")[-1].get_text())
        self.total_of_pages = total_pages
        self.total_items_per_page = len(content)
        return total_pages, self.total_items_per_page

    def num_of_pages_to_process(self, start_from_page: int = 1) -> tuple:
        """
        Return all the sanitized pages

        Keyword Arguments:
            start_from_page {int} -- What page are going to start (default: {1})

        Returns:
            tuple -- (list of page numbers to process, items per page)
        """
        total_pages, num_items_per_page = self.count_total_pages()
        # Pages are numbered from 1; a start below 1 means "everything".
        entries = list(range(max(start_from_page, 1), total_pages + 1))
        self.total_of_pages_classified = len(entries)
        return entries, num_items_per_page

    def run(self, start_from_page: int = 1) -> None:
        """Process every listing page from ``start_from_page`` onwards."""
        pages, _ = self.num_of_pages_to_process(start_from_page=start_from_page)
        for current_page in pages:
            self.process_page(current_page)

    def fix(self):
        """
        Re-process stored rows whose date equals "0000-00-00".

        NOTE(review): ``process_item`` is called without ``url`` and
        therefore returns False, and its dictionary has no 'date' key
        either, so ``processed['date']`` below fails as written. Confirm
        the intended url source and column before relying on this method.
        """
        import pprint as pp
        d = DataEngine()
        session, table = d.get_engine()
        r = session.query(table).filter(table.date == "0000-00-00").all()
        for i in r:
            processed = self.process_item(i.code)
            print("------------------ begin ----------------------")
            session.query(table).filter(table.id == i.id).update({table.date: processed['date']}, synchronize_session = False)
            pp.pprint((processed['url'], ": ", processed['date']))
            print("------------------ end -----------------------")
            session.commit()
Пример #6
0
class Engine(object):
    """
    Engine to process: https://freedwnlds.com

    Walks the paginated "/category/ebooks/" listing, extracts the metadata
    of every book that is not stored yet and saves it through the data
    engine.
    """
    __host__ = 'freedwnlds'
    baseurl: str = "https://freedwnlds.com"
    total_of_pages: int = 0
    total_of_pages_classified: int = 0
    orm: str = ''
    # Placeholder value; __init__ replaces it with a DataEngine instance.
    data_engine: object = object

    def __init__(self, orm: str = '', **kwargs) -> None:
        """Remember the ORM name and build the DataEngine that uses it."""
        self.orm = orm
        self.data_engine = DataEngine(orm=self.orm)

    def item_save(self, book_data: dict) -> bool:
        """
        Save one item through the data engine.

        Returns:
            Whatever ``data_engine.save`` returns, or False when it raises.
        """
        try:
            return self.data_engine.save(book_data)
        except Exception as error:
            # Best effort: a failed save must not stop the crawl, but it
            # should not disappear silently either.
            print(f"Error saving item: {error}")
            return False

    @staticmethod
    def _extract_code(href: str):
        """Return the slug of a ".../<digits>/<digits>/<slug>/" link, or None."""
        match = re.search(r"/([0-9]+)/([0-9]+)/([0-9a-z\-]+)/", href)
        return match[3] if match else None

    @staticmethod
    def _parse_metadata(metadata: str) -> dict:
        """
        Turn the "a | b | c" og:description line into typed fields.

        Size, language and ISBN are read from positions 1-3; publisher,
        year, author and page count are read from the last four positions.
        Fields that are missing or malformed fall back to None (or 0 for
        pages, size and the ISBNs).
        """
        fields = metadata.split("|")

        def field(index):
            try:
                return fields[index].strip()
            except IndexError:
                return None

        try:
            date_published = datetime.strptime(field(-3), '%Y').date()
        except (TypeError, ValueError):
            date_published = None
        try:
            # re.search returns None when nothing matches -> TypeError.
            pages = re.search("Page: ([0-9]+)", field(-1))[1]
        except TypeError:
            pages = 0
        try:
            size_literal = field(1)
            size = int(re.search("([0-9]+) MB",
                                 size_literal)[1]) * 1024 * 1024
        except TypeError:
            size, size_literal = 0, None
        try:
            isbn13 = re.search("([0-9]+)", field(3))[1]
            # The 10-digit form is taken as the 13-digit one without its
            # three leading digits.
            isbn10 = isbn13[3:]
        except TypeError:
            isbn13 = isbn10 = 0
        return {
            'date_published': date_published,
            'author': field(-2),
            'publisher': field(-4),
            'pages': pages,
            'language': field(2),
            'size': size,
            'size_literal': size_literal,
            'isbn10': isbn10,
            'isbn13': isbn13,
        }

    def process_item(self,
                     code: str,
                     referer: str = '',
                     url: str = None) -> object:
        """
        Download one book page and extract its metadata.

        Arguments:
            code {str} -- Identifier stored with the item
            referer {str} -- Passed through to ``wget`` (default: {''})
            url {str} -- Address of the book page (default: {None})

        Returns:
            dict -- The item fields expected by ``item_save``, or False
            when no url is given

        Raises:
            TypeError -- When the page has no og:description meta tag;
                ``process_page`` catches and reports it.
        """
        if url is None:
            return False
        item_url = url
        bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser')
        try:
            title = bs.find("h1", {'class': 'entry-title'}).get_text()
        except AttributeError:
            title = 'none'
        try:
            du = bs.find(
                "time", {'class': 'entry-date published'})['datetime'].strip()
            date_posted = parser.parse(du).date()
        except Exception:
            # Missing tag or unparseable text: the date is optional.
            date_posted = None
        try:
            thumb = bs.find("div", {
                'class': 'page-header-image-single'
            }).find('img', {'class': 'attachment-full'})['data-lazy-src']
        except (AttributeError, TypeError, KeyError):
            thumb = 'none'

        metadata = bs.find("meta", {'property': 'og:description'})['content']
        meta = self._parse_metadata(metadata)
        return {
            'title': title,
            'date_published': meta['date_published'],
            'date_posted': date_posted,
            'pages': meta['pages'],
            'language': meta['language'],
            'code': code,
            'url': item_url,
            'author': meta['author'],
            'publisher': meta['publisher'],
            'isbn10': meta['isbn10'],
            'isbn13': meta['isbn13'],
            'thumbnail': thumb,
            'engine': self.__host__,
            'format': 'text',
            'size': meta['size'],
            'size_literal': meta['size_literal'],
            'duration': None,
            'duration_literal': None,
            'description': 'none'
        }

    def process_page(self,
                     page_number: int = 1,
                     progressbar: object = None) -> bool:
        """
        Process every book listed on one listing page.

        Books whose code is already known to the data engine are skipped.
        A link without the expected shape, or a failure while processing
        one book, is printed and does not stop the remaining books.

        Arguments:
            page_number {int} -- Listing page to read (default: {1})
            progressbar {callable} -- Called once per listed book when
                given (default: {None})

        Returns:
            bool -- Always True
        """
        if page_number > 1:
            page_url = self.baseurl + "/category/ebooks/page/" + str(
                page_number) + "/"
        else:
            page_url = self.baseurl + "/category/ebooks/"
        bs = BeautifulSoup(wget(page_url), 'html.parser')
        articles = bs.findAll("article", {'class': 'post'})
        for article in articles:
            if progressbar is not None:
                progressbar()
            link = article.find('h2').find('a')
            data_text = link.get_text()
            url = link['href']
            code = self._extract_code(url)
            if code is None:
                # One odd link must not abort the rest of the page.
                print(f"Skipping item with unexpected link: {url}")
                continue
            isset = self.data_engine.isset_code(code=code,
                                                engine=self.__host__)
            if isset is False:
                try:
                    book_data = self.process_item(code=code,
                                                  referer=page_url,
                                                  url=url)
                    self.item_save(book_data=book_data)
                except Exception as e:
                    print(
                        f"Error processing page: {page_url} , title: {data_text}, item: "
                        + url)
                    print(e)
        return True

    def count_total_pages(self) -> tuple:
        """
        Read the first listing page to learn how many pages exist.

        Returns:
            tuple -- (total pages, number of articles on the first page)
        """
        bs = BeautifulSoup(wget(self.baseurl + "/category/ebooks/"),
                           'html.parser')
        content = bs.findAll("a", {'class': 'page-numbers'})
        # The second-to-last pagination link carries the page count.
        total_pages = int(content[-2].get_text().strip().replace(",", ""))
        total_items = bs.findAll("article", {'class': 'post'})
        self.total_of_pages = total_pages
        # Attribute name (including its spelling) kept as is, since code
        # outside this class may read it.
        self.totat_items_per_page = len(total_items)
        return total_pages, self.totat_items_per_page

    def num_of_pages_to_process(self, start_from_page: int = 1) -> tuple:
        """
        Return all the sanitized pages

        Keyword Arguments:
            start_from_page {int} -- What page are going to start (default: {1})

        Returns:
            tuple -- (list of page numbers to process, items per page)
        """
        total_pages, num_items_per_page = self.count_total_pages()
        # Pages are numbered from 1; a start below 1 means "everything".
        entries = list(range(max(start_from_page, 1), total_pages + 1))
        self.total_of_pages_classified = len(entries)
        return entries, num_items_per_page

    def run(self, start_from_page: int = 1) -> None:
        """Process every listing page from ``start_from_page`` onwards."""
        pages, _ = self.num_of_pages_to_process(
            start_from_page=start_from_page)
        for current_page in pages:
            self.process_page(current_page)

    def fix(self):
        """
        Re-process stored rows whose date equals "0000-00-00".

        NOTE(review): ``process_item`` is called without ``url`` and
        therefore returns False, and its dictionary has no 'date' key
        either, so ``processed['date']`` below fails as written. Confirm
        the intended url source and column before relying on this method.
        """
        import pprint as pp
        d = DataEngine()
        session, table = d.get_engine()
        r = session.query(table).filter(table.date == "0000-00-00").all()
        for i in r:
            processed = self.process_item(i.code)
            print("------------------ begin ----------------------")
            session.query(table).filter(table.id == i.id).update(
                {table.date: processed['date']}, synchronize_session=False)
            pp.pprint((processed['url'], ": ", processed['date']))
            print("------------------ end -----------------------")
            session.commit()
Пример #7
0
class Engine(object):
    """
    Engine to process: https://www.wowebook.org/

    Walks the paginated post listing of the site, scrapes the page of every
    book whose code is not stored yet and hands the extracted metadata to
    the ``DataEngine`` for saving.
    """
    __host__ = 'wowebook'
    baseurl: str = "https://www.wowebook.org"
    # Cached by count_total_pages(); 0 means "not computed yet".
    total_of_pages: int = 0
    total_items_per_page: int = 0
    # Number of pages selected by the last num_of_pages_to_process() call.
    total_of_pages_classified: int = 0
    # Page count returned by inner_total_pages() when probing is disabled.
    forced_total_pages: int = 1500
    orm: str = ''
    # Placeholder until __init__ installs a DataEngine instance.
    data_engine: object = object

    def __init__(self, orm: str = '', **kwargs) -> None:
        """
        Keyword Arguments:
            orm {str} -- Value forwarded to ``DataEngine(orm=...)`` (default: {''})
        """
        self.orm = orm
        self.data_engine = DataEngine(orm=self.orm)

    def item_save(self, book_data: dict) -> bool:
        """
        Save one scraped item through the data engine.

        Returns:
            Whatever ``data_engine.save()`` returns, or False when it raised.
        """
        try:
            return self.data_engine.save(book_data)
        except Exception as error:
            # Saving stays best-effort, but the failure is no longer silent.
            print(f"Error saving item: {error}")
            return False

    @staticmethod
    def _first_group(pattern: str, text: str, default=None):
        """Return group 1 of the first match of pattern in text, else default."""
        match = re.search(pattern, text)
        return match[1] if match is not None else default

    def process_item(self,
                     code: str = None,
                     referer: str = '',
                     url: str = None) -> object:
        """
        Scrape a single book page.

        Keyword Arguments:
            code {str} -- Code stored with the item (default: {None})
            referer {str} -- Referer passed to wget (default: {''})
            url {str} -- Address of the book page (default: {None})

        Returns:
            dict -- The scraped metadata, or False when no url was given
        """
        if url is None:
            return False
        item_url = url
        bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser')

        # Every field is extracted on a best-effort basis: a missing or
        # malformed element yields the field's fallback value instead of
        # aborting the whole item.
        try:
            title = bs.find("h1", {'class': 'post-title'}).get_text()
        except Exception:
            title = 'none'
        try:
            du = bs.find("time", {'class': 'published'}).get_text()
            date_posted = parser.parse(du).date()
        except Exception:
            date_posted = None
        try:
            thumb = bs.find('div', {
                'class': 'entry-inner'
            }).find("img", {'class': 'size-full'})['src'].strip()
        except Exception:
            thumb = 'none'
        try:
            submetadata = bs.find("div", {
                'class': 'entry-inner'
            }).find("ul").get_text()
        except Exception:
            submetadata = ""

        date_published = None
        published_literal = self._first_group(r"\s\(([0-9a-zA-Z,\s]+)\)",
                                              submetadata)
        if published_literal is not None:
            try:
                date_published = parser.parse(published_literal).date()
            except Exception:
                date_published = None

        pages = int(self._first_group(r":\s([0-9]+) pages", submetadata, 0))
        language = self._first_group(r"Language:\s([a-zA-Z]+)", submetadata)
        # The two ISBNs are looked up independently so that a missing one
        # does not discard the other.
        isbn13 = self._first_group(r"ISBN-13:\s([0-9a-zA-Z\-]+)", submetadata,
                                   0)
        isbn10 = self._first_group(r"ISBN-10:\s([0-9a-zA-Z]+)", submetadata, 0)

        # Fields this engine does not extract.
        description = 'none'
        author = None
        publisher = None
        size = size_literal = None
        duration_literal = duration = None

        return {
            'title': title,
            'date_published': date_published,
            'date_posted': date_posted,
            'pages': pages,
            'language': language,
            'code': code,
            'url': item_url,
            'author': author,
            'publisher': publisher,
            'isbn10': isbn10,
            'isbn13': isbn13,
            'thumbnail': thumb,
            'engine': self.__host__,
            'format': 'text',
            'size': size,
            'size_literal': size_literal,
            'duration': duration,
            'duration_literal': duration_literal,
            'description': str(description)
        }

    def process_page(self,
                     page_number: int = 1,
                     progressbar: object = None) -> bool:
        """
        Scrape and save every not-yet-stored item linked from a listing page.

        Keyword Arguments:
            page_number {int} -- Listing page to process (default: {1})
            progressbar {object} -- Optional callable invoked once per
                listed article (default: {None})

        Returns:
            bool -- True when the page was walked, False when the page has
                no post listing
        """
        page_url = f"{self.baseurl}/page/{page_number}/" if page_number > 1 else self.baseurl
        bs = BeautifulSoup(wget(page_url), 'html.parser')
        listing = bs.find("div", {'class': 'post-list-standard'})
        if listing is None:
            # find() returns None when the container is absent (e.g. a page
            # number past the last real page); skip instead of crashing.
            print(f"Error processing page: {page_url} , no post listing found")
            return False
        for article in listing.findAll("article"):
            if progressbar is not None:
                progressbar()
            heading = article.find('h2')
            link = heading.find('a') if heading is not None else None
            if link is None or not link.get('href'):
                # Article without a usable link: nothing to process.
                continue
            url = link['href']
            code = url.replace(self.baseurl, "").replace("/", "")
            isset = self.data_engine.isset_code(code=code,
                                                engine=self.__host__)
            if isset is False:
                try:
                    book_data = self.process_item(code=code,
                                                  referer=page_url,
                                                  url=url)
                    self.item_save(book_data=book_data)
                except Exception as e:
                    # One failing item must not stop the rest of the page.
                    print(
                        f"Error processing page: {page_url} , title: {link.get_text()}, item: "
                        + url)
                    print(e)
        return True

    def count_total_pages(self) -> tuple:
        """
        Return (total pages, items per page), computing them only once.

        The values are cached in ``total_of_pages`` and
        ``total_items_per_page``; the items-per-page figure is the number of
        articles listed on the base URL.
        """
        if self.total_of_pages > 0:
            return self.total_of_pages, self.total_items_per_page
        bs = BeautifulSoup(wget(self.baseurl), 'html.parser')
        listing = bs.find("div", {'class': 'post-list-standard'})
        articles = listing.findAll("article") if listing is not None else []
        self.total_of_pages = self.inner_total_pages()
        self.total_items_per_page = len(articles)
        return self.total_of_pages, self.total_items_per_page

    def b(self, lista: list = None):
        """
        Binary-search ``lista`` for the entry check_pn() reports as last page.

        Keyword Arguments:
            lista {list} -- Entries accepted by check_pn(), ordered by page

        Returns:
            The entry for which check_pn() returned neither 1 nor 2, or the
            single remaining entry once the range cannot be halved further.

        Raises:
            IndexError -- When ``lista`` is empty or omitted
        """
        if not lista:
            raise IndexError("b() needs a non-empty list of page entries")
        # Index bounds replace the slice-and-recurse of the earlier version:
        # same probes, no list copies and no recursion depth.
        low, high = 0, len(lista)
        while high - low > 1:
            mid = low + (high - low) // 2
            verdict = self.check_pn(lista[mid])
            if verdict == 1:
                low = mid
            elif verdict == 2:
                high = mid
            else:
                return lista[mid]
        return lista[low]

    def check_pn(self, data: dict = None) -> int:
        """
        Classify a page number by requesting it and its successor.

        Keyword Arguments:
            data {dict} -- Mapping with the page number under 'page'

        Returns:
            int -- 2 when the page does not answer 200, 1 when both the page
                and the next one answer 200, 3 when the page answers 200 but
                the next one does not

        Raises:
            KeyError -- When ``data`` has no 'page' entry
        """
        data = {} if data is None else data
        p = data['page']
        uri = f"{self.baseurl}/page/{p}"
        print(f" Checking: {uri}")
        if wget(url=uri, only_status=True) != 200:
            return 2
        following = wget(url=f"{self.baseurl}/page/{p+1}", only_status=True)
        return 1 if following == 200 else 3

    def inner_total_pages(self,
                          probe: bool = False,
                          upper_bound: int = 10000) -> int:
        """
        Return the number of listing pages.

        By default the hard-coded ``forced_total_pages`` is returned, which
        is what this method has always done. Probing the site is opt-in.

        Keyword Arguments:
            probe {bool} -- Search for the last page with b() instead of
                returning the forced value (default: {False})
            upper_bound {int} -- Highest page number considered when
                probing (default: {10000})
        """
        if not probe:
            return self.forced_total_pages
        candidates = [{'page': number} for number in range(1, upper_bound + 1)]
        return self.b(candidates)['page']

    def num_of_pages_to_process(self, start_from_page: int = 1) -> tuple:
        """
        Return all the sanitized pages

        Keyword Arguments:
            start_from_page {int} -- What page are going to start (default: {1})

        Returns:
            tuple -- (list of page numbers to be processed, items per page)
        """
        total_pages, num_items_per_page = self.count_total_pages()
        entries = [
            page for page in range(1, total_pages + 1)
            if page >= start_from_page
        ]
        self.total_of_pages_classified = len(entries)
        return entries, num_items_per_page

    def run(self, start_from_page: int = 1) -> None:
        """Process every listing page from ``start_from_page`` onwards."""
        pages, _ = self.num_of_pages_to_process(
            start_from_page=start_from_page)
        for current_page in pages:
            self.process_page(current_page)

    def fix(self):
        """
        Re-scrape the date of every stored row whose date is "0000-00-00".
        """
        import pprint as pp
        d = DataEngine()
        session, table = d.get_engine()
        r = session.query(table).filter(table.date == "0000-00-00").all()
        for i in r:
            # process_item() returns False without a url, which made this
            # loop fail on its first row.
            # NOTE(review): the url is rebuilt on the assumption that codes
            # are single-segment slugs under baseurl - confirm.
            item_url = f"{self.baseurl}/{i.code}/"
            processed = self.process_item(code=i.code, url=item_url)
            # NOTE(review): this engine's items carry no 'date' key; the
            # posting date is presumably what the table's date column
            # holds - confirm against DataEngine.save().
            new_date = processed.get('date', processed.get('date_posted'))
            if new_date is None:
                print(f"No date found for: {item_url}")
                continue
            print("------------------ begin ----------------------")
            session.query(table).filter(table.id == i.id).update(
                {table.date: new_date}, synchronize_session=False)
            pp.pprint((processed['url'], ": ", new_date))
            print("------------------ end -----------------------")
            session.commit()