Пример #1
0
 def check_pn(self, data: object = None) -> int:
     """Probe a listing page and the page after it, and classify the result.

     Args:
         data: Mapping whose ``'page'`` entry is the page number to probe.
             The page number is also used as ``page + 1`` for the
             follow-up request.

     Returns:
         ``1`` when both the page and the following page yield 200,
         ``3`` when the page yields 200 but the following page does not,
         ``2`` when the page itself does not yield 200.

     Raises:
         KeyError: If ``data`` is omitted or has no ``'page'`` entry.
     """
     # A None default replaces the former shared mutable ``{}`` default;
     # an omitted argument still fails with KeyError on the lookup below.
     if data is None:
         data = {}
     p = data['page']
     uri = f"{self.baseurl}/page/{p}"
     print(f" Checking: {uri}")
     if wget(url=uri, only_status=True) != 200:
         return 2
     next_status = wget(url=f"{self.baseurl}/page/{p+1}", only_status=True)
     return 1 if next_status == 200 else 3
Пример #2
0
    def process_page(self,
                     page_number: int = 1,
                     progressbar: object = None) -> bool:
        """Scan one listing page and save every item not already recorded.

        Args:
            page_number: Listing page to scan. Page 1 (or lower) uses
                ``self.baseurl`` itself; higher pages use
                ``<baseurl>/page/<n>/``.
            progressbar: Optional callable invoked once per listed article.

        Returns:
            Always ``True``. A failure while processing or saving a single
            item is printed and does not stop the scan.
        """
        if page_number > 1:
            page_url = self.baseurl + "/page/" + str(page_number) + "/"
        else:
            page_url = self.baseurl
        bs = BeautifulSoup(wget(page_url), 'html.parser')
        articles = bs.find("div", {'id': 'content_box'}).findAll(
            'article', {'class': 'latestPost'})
        for article in articles:
            if progressbar is not None:
                progressbar()

            link = article.find('h2').find('a')
            link_text = link.get_text()
            # The item code is the link path with the base URL and all
            # slashes stripped out.
            code = link['href'].replace(self.baseurl, "").replace("/", "")
            isset = self.data_engine.isset_code(code=code,
                                                engine=self.__host__)
            # Only an explicit False marks the item as not yet stored.
            if isset is not False:
                continue
            try:
                book_data = self.process_item(code=code, referer=page_url)
                self.item_save(book_data=book_data)
            except Exception as e:
                print(
                    f"Error processing page: {page_url} , title: {link_text}, item: "
                    + self.baseurl + "/" + code + "/")
                print(e)
        return True
Пример #3
0
 def count_total_pages(self) -> tuple:
     """Read the page count and per-page item count from the front page.

     The page count is the text of the last ``<a>`` inside the
     ``pagination`` div; the item count is the number of ``<article>``
     elements inside the ``main-content-inner`` div. Both values are also
     stored on ``self.total_of_pages`` and ``self.total_items_per_page``.

     Returns:
         A ``(total_pages, items_per_page)`` tuple.
     """
     bs = BeautifulSoup(wget(self.baseurl), 'html.parser')
     articles = bs.find("div", {
         'class': 'main-content-inner'
     }).findAll("article")
     last_link = bs.find("div", {'class': 'pagination'}).findAll("a")[-1]
     total_pages = int(last_link.get_text())
     items_per_page = len(articles)
     self.total_of_pages = total_pages
     self.total_items_per_page = items_per_page
     return total_pages, items_per_page
Пример #4
0
 def process_page(self,
                  page_number: int = 1,
                  progressbar: object = None) -> bool:
     """Scan one ebook-category listing page and save unrecorded items.

     Args:
         page_number: Listing page to scan. Page 1 (or lower) uses
             ``<baseurl>/category/ebooks/``; higher pages append
             ``page/<n>/`` to that URL.
         progressbar: Optional callable invoked once per listed article.

     Returns:
         Always ``True``. A failure while processing or saving a single
         item is printed and does not stop the scan.
     """
     listing_root = self.baseurl + "/category/ebooks/"
     if page_number > 1:
         page_url = listing_root + "page/" + str(page_number) + "/"
     else:
         page_url = listing_root
     bs = BeautifulSoup(wget(page_url), 'html.parser')
     for article in bs.findAll("article", {'class': 'post'}):
         if progressbar is not None:
             progressbar()
         link = article.find('h2').find('a')
         link_text = link.get_text()
         url = link['href']
         # The item code is the third path segment after two numeric
         # segments. Raw string avoids the invalid "\-" escape warning.
         code = re.search(r"/([0-9]+)/([0-9]+)/([0-9a-z\-]+)/", url)[3]
         isset = self.data_engine.isset_code(code=code,
                                             engine=self.__host__)
         # Only an explicit False marks the item as not yet stored.
         if isset is not False:
             continue
         try:
             book_data = self.process_item(code=code,
                                           referer=page_url,
                                           url=url)
             self.item_save(book_data=book_data)
         except Exception as e:
             print(
                 f"Error processing page: {page_url} , title: {link_text}, item: "
                 + url)
             print(e)
     return True
Пример #5
0
 def process_page(self,
                  page_number: int = 1,
                  progressbar: object = None) -> bool:
     """Scan one listing page and save every item not already recorded.

     Args:
         page_number: Listing page to scan. Page 1 (or lower) uses
             ``self.baseurl`` itself; higher pages use
             ``<baseurl>/page/<n>/``.
         progressbar: Optional callable invoked once per listed article.

     Returns:
         Always ``True``. A failure while processing or saving a single
         item is printed and does not stop the scan.
     """
     if page_number > 1:
         page_url = f"{self.baseurl}/page/{page_number}/"
     else:
         page_url = self.baseurl
     bs = BeautifulSoup(wget(page_url), 'html.parser')
     articles = bs.find("div", {
         'class': 'post-list-standard'
     }).findAll("article")
     for article in articles:
         if progressbar is not None:
             progressbar()
         link = article.find('h2').find('a')
         url = link['href']
         # The item code is the link path with the base URL and all
         # slashes stripped out.
         code = url.replace(self.baseurl, "").replace("/", "")
         isset = self.data_engine.isset_code(code=code,
                                             engine=self.__host__)
         # Only an explicit False marks the item as not yet stored.
         if isset is not False:
             continue
         try:
             book_data = self.process_item(code=code,
                                           referer=page_url,
                                           url=url)
             self.item_save(book_data=book_data)
         except Exception as e:
             print(
                 f"Error processing page: {page_url} , title: {link.get_text()}, item: "
                 + url)
             print(e)
     return True
Пример #6
0
 def count_total_pages(self) -> tuple:
     """Read the page count and per-page item count from the ebook listing.

     The page count is the text of the second-to-last ``page-numbers``
     link (thousands separators removed); the item count is the number of
     ``<article class="post">`` elements on the page. The page count is
     stored on ``self.total_of_pages``.

     Returns:
         A ``(total_pages, items_per_page)`` tuple.
     """
     bs = BeautifulSoup(wget(self.baseurl + "/category/ebooks/"),
                        'html.parser')
     page_links = bs.findAll("a", {'class': 'page-numbers'})
     total_pages = int(page_links[-2].get_text().strip().replace(",", ""))
     items_per_page = len(bs.findAll("article", {'class': 'post'}))
     self.total_of_pages = total_pages
     # The historical attribute name is misspelled ("totat"). Keep it for
     # any existing reader and also set the correctly spelled name.
     self.totat_items_per_page = items_per_page
     self.total_items_per_page = items_per_page
     return total_pages, items_per_page
Пример #7
0
 def count_total_pages(self) -> tuple:
     """Return the page count and per-page item count, fetching them once.

     When ``self.total_of_pages`` is already positive the stored values
     are returned without any request. Otherwise the front page is
     fetched, the item count is taken from the ``<article>`` elements
     inside the ``post-list-standard`` div, and the page count comes from
     ``self.inner_total_pages()``.

     Returns:
         A ``(total_pages, items_per_page)`` tuple.
     """
     if self.total_of_pages > 0:
         return self.total_of_pages, self.total_items_per_page
     bs = BeautifulSoup(wget(self.baseurl), 'html.parser')
     articles = bs.find("div", {
         'class': 'post-list-standard'
     }).findAll("article")
     self.total_of_pages = self.inner_total_pages()
     self.total_items_per_page = len(articles)
     return self.total_of_pages, self.total_items_per_page
Пример #8
0
    def count_total_pages(self) -> tuple:
        """Read the page count and per-page item count from the front page.

        The page count is the text of the second-to-last ``page-numbers``
        link (thousands separators removed); the item count is the number
        of ``<article class="latestPost">`` elements inside the
        ``content_box`` div. The page count is stored on
        ``self.total_of_pages``.

        Returns:
            A ``(total_pages, items_per_page)`` tuple.
        """
        bs = BeautifulSoup(wget(self.baseurl), 'html.parser')
        page_links = bs.findAll("a", {'class': 'page-numbers'})
        total_pages = int(page_links[-2].get_text().strip().replace(",", ""))

        articles = bs.find("div", {
            'id': 'content_box'
        }).findAll('article', {'class': 'latestPost'})
        items_per_page = len(articles)
        self.total_of_pages = total_pages
        # The historical attribute name is misspelled ("totat"). Keep it
        # for any existing reader and also set the correctly spelled name.
        self.totat_items_per_page = items_per_page
        self.total_items_per_page = items_per_page
        return total_pages, items_per_page
Пример #9
0
 def count_total_pages(self) -> tuple:
     """Read the page count and per-page item count from the front page.

     The page count is parsed from the "of <N>" text (case-insensitive)
     of the disabled pagination ``<li>``; the item count is the number of
     ``<div class="card-body p-2">`` elements. The page count is stored
     on ``self.total_of_pages``.

     Returns:
         A ``(total_pages, items_per_page)`` tuple.
     """
     bs = BeautifulSoup(wget(self.baseurl), 'html.parser')
     pager = bs.find("li",
                     {'class': 'page-item disabled d-none d-lg-block'})
     found = re.search('of ([0-9]+)',
                       pager.get_text().strip(),
                       flags=re.IGNORECASE)
     total_pages = int(found[1])
     items_per_page = len(bs.findAll('div', {'class': 'card-body p-2'}))
     self.total_of_pages = total_pages
     # The historical attribute name is misspelled ("totat"). Keep it for
     # any existing reader and also set the correctly spelled name.
     self.totat_items_per_page = items_per_page
     self.total_items_per_page = items_per_page
     return total_pages, items_per_page
Пример #10
0
    def process_item(self, code: str, referer: str = '') -> object:
        """Fetch one item page and extract its metadata into a dict.

        The item URL is ``<baseurl>/<code>/``. Most fields come from a
        "|"-separated metadata line: the second line of the text of the
        first ``<div>`` inside the ``thecontent`` div. An item is treated
        as video when that line contains "MP4"; otherwise it is treated
        as a text item.

        Args:
            code: Item slug appended to ``self.baseurl``.
            referer: Value forwarded to ``wget`` as ``referer``.

        Returns:
            A dict with the keys built at the end of this method. Fields
            that cannot be extracted fall back to ``None``, ``0`` or
            ``'none'``.

        Raises:
            AttributeError, IndexError: If the ``thecontent`` div or its
                metadata line is missing; that lookup is not guarded.
        """
        item_url = self.baseurl + "/" + code + "/"
        bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser')

        # A missing or unparsable date is tolerated. The former unguarded
        # duplicate of this lookup made a missing span fatal.
        try:
            posted_text = bs.find("span", {
                'class': 'thetime date updated'
            }).get_text().strip()
            date_posted = parser.parse(posted_text).date()
        except Exception:
            date_posted = None

        content_div = bs.find("div", {'class': 'thecontent'})
        try:
            thumb = self.baseurl + content_div.find("img")['src']
        except Exception:
            thumb = 'none'
        try:
            title = content_div.find("img")['alt']
        except Exception:
            title = 'none'
        description = 'none'

        metadata = content_div.findAll("div")[0].get_text().strip().split(
            "\n")[1]
        submetadata = metadata.split("|")
        video = "MP4" in metadata
        # str.split always yields at least one element, so this cannot fail.
        language = submetadata[0].strip()

        if video:
            date_published = None
            pages = 0
            size = 0
            size_literal = submetadata[-1].strip()
            try:
                duration_literal = submetadata[-2].strip()
                duration = 0
            except IndexError:
                duration = None
                duration_literal = None
            isbn10 = isbn13 = 0
        else:
            try:
                date_published = datetime.strptime(submetadata[1].strip(),
                                                   '%Y').date()
            except (IndexError, ValueError):
                date_published = None
            try:
                pages = re.search("([0-9]+) Pages",
                                  submetadata[3].strip())[1]
            except (IndexError, TypeError):
                pages = 0
            try:
                size_text = submetadata[5].strip()
                size = int(re.search("([0-9]+) MB",
                                     size_text)[1]) * 1024 * 1024
                size_literal = size_text
            except (IndexError, TypeError):
                size = 0
                size_literal = None
            try:
                ib = re.search("ISBN: ([0-9]+)", submetadata[2].strip())[1]
                isbn13 = "978-" + ib
                isbn10 = ib
            except (IndexError, TypeError):
                isbn13 = 0
                isbn10 = 0
            duration_literal = duration = None

        return {
            'title': title,
            'date_published': date_published,
            'date_posted': date_posted,
            'pages': pages,
            'language': language,
            'code': code,
            'url': item_url,
            'author': "none",
            'publisher': "none",
            'isbn10': isbn10,
            'isbn13': isbn13,
            'thumbnail': thumb,
            'engine': self.__host__,
            'format': "video" if video else 'text',
            'size': size,
            'size_literal': size_literal,
            'duration': duration,
            'duration_literal': duration_literal,
            'description': str(description)
        }
Пример #11
0
 def process_item(self, code: str) -> object:
     """Fetch one item page and extract its metadata into a dict.

     The item URL is ``<baseurl>/<code>/``. The posted date, thumbnail
     and description are read from fixed elements. The remaining fields
     come from "Label: value" entries of the ``list-unstyled mb-0``
     list.

     Args:
         code: Item slug appended to ``self.baseurl``.

     Returns:
         A dict with the keys initialised below. Fields with no matching
         list entry keep their defaults.

     Raises:
         AttributeError: If the details list is missing; that lookup is
             not guarded.
     """
     item_url = self.baseurl + "/" + code + "/"
     bs = BeautifulSoup(wget(item_url), 'html.parser')
     try:
         posted_text = bs.find(
             "meta", {'property': 'article:published_time'})['content']
         date_posted = parser.parse(posted_text).date()
     except Exception:
         date_posted = None
     try:
         thumb = bs.find("img",
                         {'class': 'align-self-start img-fluid'})['src']
     except Exception:
         thumb = 'none'
     try:
         description = bs.find("div", {
             'class': 'col-md-8'
         }).find("div", {
             'class': 'card mb-4'
         }).find("div", {'class': 'card-body'})
     except Exception:
         description = 'none'
     data = {
         'title': "none",
         'date_published': None,
         'date_posted': date_posted,
         'pages': 0,
         'language': "none",
         'code': code,
         'url': item_url,
         'author': "none",
         'publisher': "none",
         'isbn10': "",
         'isbn13': "none",
         'thumbnail': thumb,
         'engine': 'letmeread',
         'format': 'text',
         'size': 0,
         'description': (description)
     }
     # Labels whose value is copied verbatim into the result.
     plain_fields = {
         "Title": 'title',
         "Author": 'author',
         "Language": 'language',
         "Publisher": 'publisher',
         "ISBN-10": 'isbn10',
         "ISBN-13": 'isbn13',
     }
     # Tried in the same order as the original nested fallbacks.
     date_formats = ('%Y', '%Y-%m-%d', '%Y-%m')
     for entry in bs.find("ul", {
             'class': 'list-unstyled mb-0'
     }).findAll("li"):
         pairs = re.findall(r"([a-zA-Z0-9\- ]+): (.*)",
                            entry.get_text().strip())
         if not pairs:
             # An entry without a "Label: value" shape is skipped instead
             # of aborting the whole item with an IndexError.
             continue
         label = pairs[0][0].strip()
         value = pairs[0][1].strip()
         if label in plain_fields:
             data[plain_fields[label]] = value
         elif label == "Length":
             length_match = re.search("([0-9]+) pages", value)
             # The default page count is kept when no "N pages" is found.
             if length_match is not None:
                 data['pages'] = length_match[1]
         elif label == "Publication Date":
             for date_format in date_formats:
                 try:
                     data['date_published'] = datetime.strptime(
                         value, date_format).date()
                 except ValueError:
                     continue
                 break
     return data
Пример #12
0
 def process_item(self, code: str, referer: str = '', url: str = None) -> object:
     """Fetch one item page and extract its metadata into a dict.

     Title, thumbnail and description come from fixed elements. The
     posted date is derived from the ``/uploads/<year>/<month>/`` part of
     the thumbnail URL. Author, ISBN, year, pages, language and size are
     read by position from the ``<dd>`` elements of the ``book-detail``
     div.

     Args:
         code: Item code, copied into the result unchanged.
         referer: Value forwarded to ``wget`` as ``referer``.
         url: Item page URL.

     Returns:
         ``False`` when ``url`` is ``None``; otherwise a dict with the
         keys built at the end of this method. Fields that cannot be
         extracted fall back to ``None``, ``0`` or ``'none'``.

     Raises:
         AttributeError: If the ``book-detail`` div is missing; that
             lookup is not guarded.
     """
     if url is None:
         return False
     item_url = url
     bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser')
     try:
         title = bs.find("h1", {'class': 'single-title'}).get_text()
     except Exception:
         title = 'none'
     else:
         # An optional <h4> subtitle is appended as ": <subtitle>".
         try:
             sub = ": " + bs.find("header", {
                 'class': 'entry-header'
             }).find("h4").get_text().strip()
         except Exception:
             sub = ""
         title = f"{title}{sub}"
     try:
         thumb = bs.find("img", {'class': 'attachment-post-thumbnail'})['src'].strip()
     except Exception:
         thumb = 'none'
     try:
         # The 'none' fallback has no /uploads/ segment, so a missing
         # thumbnail still ends up with date_posted = None.
         du = re.search("/uploads/([0-9]+)/([0-9]+)/", thumb)
         date_posted = datetime.strptime(f"{du[1]}-{du[2]}", "%Y-%m").date()
     except Exception:
         date_posted = None
     try:
         description = bs.find("div", "entry-content")
         description.find("h3").decompose()
     except Exception:
         description = 'none'
     submetadata = bs.find("div", {'class': 'book-detail'}).findAll("dd")
     try:
         date_published = datetime.strptime(
             submetadata[2].get_text().strip(), '%Y').date()
     except Exception:
         date_published = None
     try:
         author = submetadata[0].get_text().strip()
     except Exception:
         author = None
     publisher = None
     try:
         pages = submetadata[3].get_text().strip()
     except Exception:
         pages = 0
     try:
         language = submetadata[4].get_text().strip()
     except Exception:
         language = None
     # The size lookup is fully guarded. The former unguarded duplicate
     # made an item with fewer than six <dd> entries fatal.
     try:
         size_text = submetadata[5].get_text().strip()
         size = int(round(float(re.search("(.*) MB", size_text)[1]))) * 1024 * 1024
         size_literal = size_text
     except Exception:
         size = 0
         size_literal = None
     try:
         isbn = submetadata[1].get_text().strip().replace("-", "").split(",")[0]
         isbn13 = f"978{isbn}" if len(isbn) < 13 else isbn
         isbn10 = isbn
     except Exception:
         isbn13 = 0
         isbn10 = 0
     duration_literal = duration = None
     return {
         'title': title,
         'date_published': date_published,
         'date_posted': date_posted,
         'pages': pages,
         'language': language,
         'code': code,
         'url': item_url,
         'author': author,
         'publisher': publisher,
         'isbn10': isbn10,
         'isbn13': isbn13,
         'thumbnail': thumb,
         'engine': self.__host__,
         'format': 'text',
         'size': size,
         'size_literal': size_literal,
         'duration': duration,
         'duration_literal': duration_literal,
         'description': str(description)
     }
Пример #13
0
    def process_item(self,
                     code: str,
                     referer: str = '',
                     url: str = None) -> object:
        """Fetch one item page and extract its metadata into a dict.

        Title, posted date and thumbnail come from fixed elements. The
        remaining fields are read by position from the "|"-separated
        ``og:description`` meta content.

        Args:
            code: Item code, copied into the result unchanged.
            referer: Value forwarded to ``wget`` as ``referer``.
            url: Item page URL.

        Returns:
            ``False`` when ``url`` is ``None``; otherwise a dict with the
            keys built at the end of this method. Fields that cannot be
            extracted fall back to ``None``, ``0`` or ``'none'``.

        Raises:
            TypeError: If the ``og:description`` meta tag is missing;
                that lookup is not guarded.
        """
        if url is None:
            return False
        item_url = url
        bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser')
        try:
            title = bs.find("h1", {'class': 'entry-title'}).get_text()
        except Exception:
            title = 'none'
        try:
            posted_text = bs.find(
                "time", {'class': 'entry-date published'})['datetime'].strip()
            date_posted = parser.parse(posted_text).date()
        except Exception:
            date_posted = None
        try:
            thumb = bs.find("div", {
                'class': 'page-header-image-single'
            }).find('img', {'class': 'attachment-full'})['data-lazy-src']
        except Exception:
            thumb = 'none'
        description = 'none'

        metadata = bs.find("meta", {'property': 'og:description'})['content']
        submetadata = metadata.split("|")
        # Leftover debug print of submetadata removed.
        try:
            date_published = datetime.strptime(submetadata[-3].strip(),
                                               '%Y').date()
        except (IndexError, ValueError):
            date_published = None
        try:
            author = submetadata[-2].strip()
        except IndexError:
            author = None
        try:
            publisher = submetadata[-4].strip()
        except IndexError:
            publisher = None
        try:
            pages = re.search("Page: ([0-9]+)", submetadata[-1].strip())[1]
        except TypeError:
            pages = 0
        try:
            language = submetadata[2].strip()
        except IndexError:
            language = None
        try:
            size_text = submetadata[1].strip()
            size = int(re.search("([0-9]+) MB", size_text)[1]) * 1024 * 1024
            size_literal = size_text
        except (IndexError, TypeError):
            size = 0
            size_literal = None
        try:
            ib = re.search("([0-9]+)", submetadata[3].strip())[1]
            isbn13 = ib
            # The ISBN-10 value is the matched digits minus the first three.
            isbn10 = ib[3:]
        except (IndexError, TypeError):
            isbn13 = 0
            isbn10 = 0
        duration_literal = duration = None
        return {
            'title': title,
            'date_published': date_published,
            'date_posted': date_posted,
            'pages': pages,
            'language': language,
            'code': code,
            'url': item_url,
            'author': author,
            'publisher': publisher,
            'isbn10': isbn10,
            'isbn13': isbn13,
            'thumbnail': thumb,
            'engine': self.__host__,
            'format': 'text',
            'size': size,
            'size_literal': size_literal,
            'duration': duration,
            'duration_literal': duration_literal,
            'description': str(description)
        }
Пример #14
0
    def process_item(self,
                     code: str = None,
                     referer: str = '',
                     url: str = None) -> object:
        """Fetch one item page and extract its metadata into a dict.

        Title, posted date and thumbnail come from fixed elements. The
        publication date, page count, language and ISBNs are matched by
        regular expression against the text of the first ``<ul>`` inside
        the ``entry-inner`` div. If that list is missing, the text is
        empty and every one of those fields takes its fallback.

        Args:
            code: Item code, copied into the result unchanged.
            referer: Value forwarded to ``wget`` as ``referer``.
            url: Item page URL.

        Returns:
            ``False`` when ``url`` is ``None``; otherwise a dict with the
            keys built at the end of this method. Author, publisher,
            size and duration are always ``None``.
        """
        if url is None:
            return False
        item_url = url
        bs = BeautifulSoup(wget(item_url, referer=referer), 'html.parser')
        try:
            title = bs.find("h1", {'class': 'post-title'}).get_text()
        except Exception:
            title = 'none'
        try:
            posted_text = bs.find("time", {'class': 'published'}).get_text()
            date_posted = parser.parse(posted_text).date()
        except Exception:
            date_posted = None
        try:
            thumb = bs.find('div', {
                'class': 'entry-inner'
            }).find("img", {'class': 'size-full'})['src'].strip()
        except Exception:
            thumb = 'none'
        description = 'none'

        try:
            submetadata = bs.find("div", {
                'class': 'entry-inner'
            }).find("ul").get_text()
        except Exception:
            submetadata = ""

        try:
            published_text = re.search(r"\s\(([0-9a-zA-Z,\s]+)\)",
                                       submetadata)[1]
            date_published = parser.parse(published_text).date()
        except Exception:
            date_published = None
        author = None
        publisher = None
        try:
            pages = int(re.search(r":\s([0-9]+) pages", submetadata)[1])
        except TypeError:
            pages = 0
        try:
            language = re.search(r"Language:\s([a-zA-Z]+)", submetadata)[1]
        except TypeError:
            language = None
        size = None
        size_literal = None
        # Each ISBN is looked up on its own, so a missing ISBN-10 no longer
        # discards a present ISBN-13, and vice versa.
        try:
            isbn13 = re.search(r"ISBN-13:\s([0-9a-zA-Z\-]+)", submetadata)[1]
        except TypeError:
            isbn13 = 0
        try:
            isbn10 = re.search(r"ISBN-10:\s([0-9a-zA-Z]+)", submetadata)[1]
        except TypeError:
            isbn10 = 0
        duration_literal = duration = None
        return {
            'title': title,
            'date_published': date_published,
            'date_posted': date_posted,
            'pages': pages,
            'language': language,
            'code': code,
            'url': item_url,
            'author': author,
            'publisher': publisher,
            'isbn10': isbn10,
            'isbn13': isbn13,
            'thumbnail': thumb,
            'engine': self.__host__,
            'format': 'text',
            'size': size,
            'size_literal': size_literal,
            'duration': duration,
            'duration_literal': duration_literal,
            'description': str(description)
        }