def __init__(self, url): self.title=''# self.chapters=[''] #initial author only for title page self.author=''# #the h1 tag self.temp=[] self.rawstoryhtml=[''] self.truestoryhttml=[] self.length=1 self.pbar=None self.url=url self.images=[] #testing images self.hasimages = True self.isize=0 self.duplicate = False self.queue = queue.Queue() page = Common.RequestPage(url) if page is None: print('Could not complete request for page: ' + url) return None soup=BeautifulSoup(page.content, 'html.parser') self.title = soup.find('meta', attrs={'itemprop':'name'}).get('content') if Common.dup: if Common.CheckDuplicate(self.title): self.duplicate = True return None for au in soup.find_all('div', attrs={'class':'tag-container'}): #print('HERE1') for au2 in au.find_all('a'): #print('HERE2') if au2.get('href')[:7]=='/artist': #print('HERE') self.author=au2.get('href')[8:-1] #print(self.author) Common.prnt(self.title+' by '+self.author) self.truestoryhttml.append('') self.isize=len(soup.find_all('a', attrs={'rel':'nofollow'})) if any(x in ('html', 'HTML', 'txt', 'TXT') for x in Common.opf): self.pbar = Common.Progress(self.isize) for i in soup.find_all('a', attrs={'rel':'nofollow'}): self.GetURLS(i.get('href')) break self.AddPage() if any(x in ('txt', 'html', 'TXT', 'HTML') for x in Common.opf) and Common.mt: for i in range(0, len(self.images)): self.queue.get() if self.pbar is not None: self.pbar.End()
def requestPage(self, url):
    """Fetch *url* through Common.RequestPage with a randomly picked User-Agent.

    Rotating the User-Agent string makes the scraper look less like a bot.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/41.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    )
    pick = randint(0, len(user_agents) - 1)
    return Common.RequestPage(url, headers={'user-agent': user_agents[pick]})
def __init__(self, url):
    """Scrape a story page (title, author, body) and follow 'Next Page' links.

    Populates self.story (plain text) and self.storyhtml (prettified HTML).
    Sets self.duplicate and returns early when duplicate checking matches.
    """
    self.title = ''
    self.author = ''
    self.story = ''
    # placeholder 0 is overwritten with the first page's soup below
    self.rawstoryhtml = [0]
    self.storyhtml = ''
    self.url = url
    self.duplicate = False
    #page = Common.RequestPage(url)
    '''if page is None: print('Could not complete request for page: ' + url) return None '''
    #while page.status_code!=200:
    #print("Error getting page, trying again: status code: "+str(page.status_code))
    #time.sleep(5)
    # requestPage adds a randomized User-Agent header before delegating to Common
    soup = BeautifulSoup(self.requestPage(self.url).content, 'html.parser')
    #print(soup.prettify())
    titlehtml = soup.find('h1')
    self.title = titlehtml.text.strip()
    #print(self.title)
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    # NOTE(review): 'y_eU' / 'aa_ht' look like obfuscated site CSS classes —
    # expect these selectors to break when the site rebuilds its front end
    authorhtml = soup.find('a', attrs={'class': 'y_eU'})
    #print(authorhtml.prettify())
    self.author = authorhtml.text.strip()
    #print(self.author)
    self.rawstoryhtml[0] = soup.find('div', attrs={'class': 'aa_ht'})
    self.story = self.rawstoryhtml[0].get_text(separator=Common.lineEnding)
    Common.prnt(self.title + ' by ' + self.author)
    # recursively pull every subsequent page into self.rawstoryhtml
    nextLinkSoup = soup.find('a', attrs={'title': 'Next Page'})
    if nextLinkSoup is not None:
        self.AddNextPage(nextLinkSoup.get('href'))
    for i in self.rawstoryhtml:
        self.storyhtml += str(i.contents[0].prettify())
def AddPage(self):
    """Emit <img> tags for every gallery image and download the image files.

    For html/epub output, each image is referenced as a zero-padded
    '<num>.jpg' filename; for html/txt output the files themselves are
    downloaded via Common.imageDL (threaded when Common.mt is set, with
    self.queue used by __init__ to join the workers).
    """
    i = 1
    for thisimage in self.images:
        #print(thisimage)
        if any(x in ('html', 'HTML', 'epub', 'EPUB') for x in Common.opf):
            # zero-pad the image number to the width of the page count,
            # e.g. 5 -> '005' for a 150-page gallery.
            # NOTE(review): for galleries of 1000+ pages this padding looks
            # inconsistent (2-digit numbers get one zero, 3-digit get none);
            # it must stay in sync with the filenames Common.imageDL writes,
            # so it is left untouched — confirm against Common.imageDL.
            zeros = '0' * (len(str(self.isize))-1)
            num = i
            if len(zeros)>1 and num > 9:
                zeros='0'
            elif len(zeros)==1 and num > 9:
                zeros = ''
            if num > 99:
                zeros = ''
            self.truestoryhttml[0]=self.truestoryhttml[0]+'<p><img src="'+zeros+str(num)+'.jpg" /></p>\n'
        if any(x in ('html', 'HTML', 'txt', 'TXT') for x in Common.opf):
            if Common.mt:
                # daemon=False so downloads finish even if the main thread exits first
                t=threading.Thread(target=Common.imageDL, args=(self.title, thisimage, i, self.isize, self.pbar, self.queue), daemon=False)
                t.start()
            else:
                Common.imageDL(self.title, thisimage, i, self.isize, self.pbar)
        i+=1
def GetURLS(self, url):
    """Resolve the first image URL of the gallery and extrapolate the rest.

    Fetches the first image page, reads its direct image src, then derives
    the URLs of pages 2..isize by substituting the page number in the
    filename (nhentai images are numbered sequentially).
    Returns None (after printing) on request or parse failure.
    """
    page=Common.RequestPage('https://nhentai.net'+url.rstrip(), headers={'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'})
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup=BeautifulSoup(page.content, 'html.parser')
    try:
        thisimage=soup.find('section', attrs={'id':'image-container'}).find('img').get('src')
    except AttributeError:
        # find() returned None somewhere along the chain.
        # BUGFIX: previously execution fell through to the loop below and
        # crashed with a NameError on the undefined 'thisimage'; bail out now.
        print('Error in: '+url)
        return None
    self.images.append(thisimage)
    # filename pattern is '<base><n>.<ext>'; swap the page number in place
    for i in range(2, self.isize+1):
        self.images.append(thisimage[:-5]+str(i)+thisimage[-4:])
def AddNextPage(self, link):
    """Append the paragraphs of one follow-on page to the story.

    Fetches *link*, strips intra-paragraph newlines from every <p>, appends
    the plain text to self.story and the rebuilt HTML to self.rawstoryhtml.
    Returns None (after printing) when the request fails.
    """
    page=Common.RequestPage(link)
    if page is None:
        # BUGFIX: this message previously referenced the undefined name 'url'
        # (the parameter is 'link'), raising NameError on the error path.
        print('Could not complete request for page: ' + link)
        return None
    soup=BeautifulSoup(page.content, 'html.parser')
    paragraphs=soup.find_all('p')
    #print(paragraphs)
    text=''
    for p in paragraphs:
        # collapse hard-wrapped lines inside a paragraph into one line
        self.story+=re.sub(r'\n\s*', r'', p.get_text(), flags=re.M)+'\n\n'
        #print(p.get_text())
        text+='<p>'+re.sub(r'\n\s*', r'', p.get_text(), flags=re.M)+'</p>\n'
    temp=BeautifulSoup(text, 'html.parser')
    self.rawstoryhtml.append(temp)
def AddNextPage(self, soup):
    """Follow the 'Next >' button and recursively collect chapter HTML.

    The next-chapter URL is embedded in the button's onclick attribute;
    rawnexturl[15:-1] strips the JS wrapper around the path.  Each page's
    '#storytext' div is appended to self.rawstoryhtml.
    """
    for i in soup.find_all('button'):
        if i.text.strip() == 'Next >':
            rawnexturl = i.get('onclick')
            # the onclick handler carries only the path; prepend the right host
            if urllib.parse.urlparse(self.url)[1] == 'www.fanfiction.net':
                nexturl = 'https://www.fanfiction.net' + rawnexturl[15:-1]
            else:
                nexturl = 'https://www.fictionpress.com' + rawnexturl[15:-1]
            #print(nexturl)
            page = Common.RequestPage(nexturl)
            if page is None:
                # BUGFIX: this message previously referenced the undefined
                # name 'url', raising NameError on the error path.
                print('Could not complete request for page: ' + nexturl)
                return None
            soup = BeautifulSoup(page.content, 'html.parser')
            self.rawstoryhtml.append(
                soup.find('div', attrs={'id': 'storytext'}))
            self.pbar.Update()
            self.AddNextPage(soup)
            break
def AddPrevPage(self, url):
    """Walk the story backwards one page and prepend its content.

    Inserts author, chapter title, chapter HTML and the trailing question at
    index 0 of their respective lists, then recurses on the 'Previous
    Chapter' link until the first page is reached.
    """
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): fixed anchor indexes [7]/[5] assume a stable page layout —
    # verify against the live site if authors come out wrong
    self.authors.insert(0, soup.find_all('a')[7].get_text())
    self.chapters.insert(0, soup.find('h1').get_text())
    if Common.images:
        if soup.find('div', attrs={'class': 'chapter-content'}).find('img'):
            # rewrite <img> srcs to local 'imgN.jpg' names, remembering originals
            for simg in soup.find('div', attrs={'class': 'chapter-content'}).find_all('img'):
                self.images.append(simg.get('src'))
                simg['src'] = 'img' + str(len(self.images)) + '.jpg'
            self.hasimages = True
    temp = str(soup.find('div', attrs={'class': 'chapter-content'}))
    self.questions.insert(
        0, soup.find('header', attrs={'class': "question-header"}).get_text())
    temp += '<h2>' + self.questions[0] + '</h2>'
    self.temp.insert(0, temp)
    self.pbar.Update()
    # keep recursing while a 'Previous Chapter' link exists
    for i in soup.find_all('a'):
        if i.text.strip() == 'Previous Chapter':
            self.AddPrevPage(i.get('href'))
            return
    #gets author name if on last/first page I guess
    self.authors[0] = soup.find_all('a')[5].get_text()
def __init__(self, url):
    """Scrape a classicreader.com story.

    Handles three layouts: a single-page story, a story page without a table
    of contents, and a multi-chapter story reached via its table of contents.
    Sets self.duplicate and returns early when duplicate checking matches.
    """
    self.title=''
    self.author=''
    self.story=''
    self.rawstoryhtml=[]
    self.chapters=[]
    self.pbar=None
    self.url=url
    self.duplicate = False
    page=Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup=BeautifulSoup(page.content, 'html.parser')
    #grabs important metadata information
    self.title=soup.find('span', attrs={'class': 'book-header'}).get_text()
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    Common.prnt(self.title)
    self.author=soup.find('span', attrs={'class': 'by-line'}).contents[1].get_text()
    Common.prnt(self.author)
    #looks to see if on table of contents page
    if soup.find('h2') is None: #and len(soup.find_all('a', attrs={'class':'categories'}))>15:
        #checks to see if single page story
        if len(soup.find_all('a', attrs={'class':'categories'}))==15:
            self._capture_single_page(soup)
            return
    try:
        # 8th 'categories' link points at the table of contents
        url='https://www.classicreader.com'+soup.find_all('a', attrs={'class':'categories'})[7].get('href')
        # NOTE(review): uses requests.get directly instead of Common.RequestPage
        # like the rest of the file — kept as-is to preserve behavior
        page=requests.get(url)
        soup=BeautifulSoup(page.content, 'html.parser')
        Common.prnt('got table of contents page')
    except Exception:
        # no table of contents link: treat the current page as the whole story
        # (was a bare 'except:'; narrowed so Ctrl-C still interrupts)
        self._capture_single_page(soup)
        return
    links=soup.find_all('a', attrs={'class': 'chapter-title'})
    self.pbar=Common.Progress(len(links))
    #self.pbar.Update()
    for i in links:
        self.AddNextPage('https://www.classicreader.com'+i.get('href'))
        self.chapters.append(i.get_text())
        self.pbar.Update()
    self.pbar.End()

def _capture_single_page(self, soup):
    """Capture every <p> of *soup* as a complete one-chapter story.

    Extracted helper: this exact logic previously appeared verbatim in both
    the single-page branch and the no-table-of-contents fallback.
    """
    paragraphs=soup.find_all('p')
    #print(paragraphs)
    text=''
    for p in paragraphs:
        # collapse hard-wrapped lines inside a paragraph into one line
        self.story+=re.sub(r'\n\s*', r'', p.get_text(), flags=re.M)+'\n\n'
        #print(p.get_text())
        text+='<p>'+re.sub(r'\n\s*', r'', p.get_text(), flags=re.M)+'</p>\n'
    temp=BeautifulSoup(text, 'html.parser')
    self.chapters.append(self.title)
    self.rawstoryhtml.append(temp)
def __init__(self, url):
    """Scrape a Wattpad-style story starting from its first part.

    Collects title, author, summary and every part's <pre> content (following
    'next-part-link' anchors via addNextPage), then flattens everything into
    self.story with Common.lineEnding line endings.
    """
    self.title = ''
    self.author = ''
    self.story = ''
    self.rawstoryhtml = []
    self.length = 1
    self.summary = ''
    self.pbar = None
    self.url = url
    self.chapters = []
    self.page = None
    self.duplicate = False
    #try:
    #    page=requests.get(self.url)
    #except:
    #    print('Error accessing '+self.url+' Try checking internet connection and url')
    #return None
    # requestPage adds a randomized User-Agent header
    soup = BeautifulSoup(self.requestPage(self.url).content, 'html.parser')
    #print(soup.prettify())
    self.title = soup.find('h1').get_text()
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    # [3:] drops the leading 'by ' prefix from the author span
    self.author = soup.find('span', attrs={'class': 'author h6'}).get_text()[3:]
    self.chapters.append(soup.find('h2').get_text())
    self.summary = soup.find('p', attrs={'class': 'item-description'}).get_text()
    self.rawstoryhtml.append(soup.find('pre'))
    Common.prnt(self.title + '\nby ' + self.author + '\n' + self.summary)
    # one <li> per part in the table of contents
    self.length = len(
        soup.find('ul', attrs={'class': 'table-of-contents'}).find_all('li'))
    self.pbar = Common.Progress(self.length)
    self.pbar.Update()
    #print(self.rawstoryhtml[0].prettify())
    if soup.find('a', attrs={'class': 'next-part-link'}):
        #print(soup.find('a', attrs={'class': 'next-part-link'}).get('href'))
        self.addNextPage(
            soup.find('a', attrs={'class': 'next-part-link'}).get('href'))
    self.pbar.End()
    for j in range(0, len(self.rawstoryhtml)):
        # [5:] presumably strips the opening '<pre>' tag from the prettified
        # markup — TODO confirm
        tmp = self.rawstoryhtml[j].prettify()[5:]
        # NOTE(review): replaces '&apos' without the trailing ';', which would
        # leave a stray ';' behind for '&apos;' entities — confirm intended
        tmp = tmp.replace('&apos', '\'')
        self.rawstoryhtml[j] = BeautifulSoup(tmp, 'html.parser')
    for i in range(0, len(self.rawstoryhtml)):
        self.story = self.story + self.chapters[i] + '\n'
        self.story = self.story + self.rawstoryhtml[i].get_text()
    self.story = self.story.replace('\n', Common.lineEnding)
def __init__(self, url):
    """Scrape a fanfiction.net / fictionpress.com story.

    Grabs chapter names from the chapter-select dropdown, story metadata from
    the 'xcontrast_txt' elements, then walks every chapter through
    AddNextPage.  Sets self.duplicate and returns early on a duplicate match.
    """
    #simple string for the title
    self.title = ''
    #simple string for the author
    self.author = ''
    #Extra long string containing the text of the story
    self.story = ''
    #each node of the list contains the raw html for one page of the story
    self.rawstoryhtml = [0]
    #the raw html but prettified and concatenated together
    self.storyhtml = ''
    #array of chapter names
    self.chapters = []
    #summary
    self.summary = ''
    self.pbar = None
    self.url = url
    self.duplicate = False
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    self.rawstoryhtml[0] = soup.find('div', attrs={'id': 'storytext'})
    #self.chapters=soup.find_all('option', attrs={'selected':''})
    #F*****g magic that collects the chapter titles
    #probably doesn't work for all stories
    #seems to work for all stories, adds extra chapter title to end, oh well
    try:
        for child in soup.find(attrs={'id': 'chap_select'}).descendants:
            if child.string is None:
                continue
            else:
                self.chapters.append(child.string)
        #we end up with an extra chapter at the end of the file, so the band-aid fix is to delete the last node
        del self.chapters[len(self.chapters) - 1]
    except:
        # single-chapter story (no chap_select dropdown): use the story title
        print('Chapter name couldn\'t be grabbed')
        self.chapters.append(
            soup.find('b', attrs={
                'class': 'xcontrast_txt'
            }).text.strip())
    '''So here's the deal. fanfiction.net doesn't close any of the <option>
    tags that contain the chapter names, so BeautifulSoup closes them all at
    the end. This means that each option is the child of the option above it.
    so good luck extracting the name of each chapter individually
    There's also two (2) chapter selection fields on each web page, which
    makes the output look worse than it really is, since we're only ever
    going to use the first one we won't have to worry about it
    '''
    #print("Chapters:")
    #print(self.chapters)
    self.summary = soup.find_all('div', attrs={'class': 'xcontrast_txt'})[0].text.strip()
    self.author = soup.find_all('a', attrs={'class': 'xcontrast_txt'})[2].text.strip()
    self.title = soup.find('b', attrs={
        'class': 'xcontrast_txt'
    }).text.strip()
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    Common.prnt(self.title + '\nby ' + self.author + '\n' + self.summary)
    #setup progress bar
    #exception handling to avoid errors on single page stories
    if soup.find('button', attrs={
            'type': 'BUTTON'
    }) is not None and soup.find('button', attrs={
            'type': 'BUTTON'
    }).text.strip() == '< Prev':
        print(
            "Non-first page entered. Ebook-Publisher will only add subsequent pages and chapter titles will be wrong"
        )
    for i in soup.find_all('button', attrs={'type': 'BUTTON'}):
        if i.text.strip() == 'Next >':
            self.pbar = Common.Progress(len(self.chapters))
            self.pbar.Update()
            self.AddNextPage(soup)
            break
    try:
        self.pbar.End()
    except:
        pass
    # flatten each page's children to text; non-tag nodes (plain strings)
    # have no get_text(), hence the except fallback
    for i in self.rawstoryhtml:
        for j in i.contents:
            try:
                self.storyhtml += j.get_text() + '\n\n'
            except:
                self.storyhtml += j
    #print(self.storyhtml)
    self.story = self.storyhtml
    self.story = BeautifulSoup(self.story, 'html.parser').text
    # squeeze runs of blank lines down to a single blank line
    self.story = re.sub(r'\n\s*\n', r'\n\n', self.story, flags=re.M)
def AddNextPage(self, url, depth):
    """Fetch one Chyoa chapter page and spawn threads for its child chapters.

    Builds the chapter's HTML fragment (tagged with a hierarchical *depth* id
    like '1.2.1'), records next-chapter links, and starts a ThreadAdd worker
    per child link.  Stops recursing when the page links backwards (chapter
    number does not increase) or when the previous-link id no longer matches.
    """
    #print(url)
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): fixed anchor index [7] assumes a stable page layout
    self.author = (soup.find_all('a')[7].get_text())
    self.chapter = (soup.find('h1').get_text())
    if Common.images:
        if soup.find('div', attrs={'class': 'chapter-content'}).find('img'):
            # lock2 guards the shared Common.urlDict across download threads
            with lock2:
                for simg in soup.find('div', attrs={'class': 'chapter-content'}).find_all('img'):
                    imgtemp = simg.get('src')
                    simg['src'] = 'img' + str(
                        len(Common.urlDict[self.ogUrl]) + 1) + '.jpg'
                    Common.urlDict[self.ogUrl][len(
                        Common.urlDict[self.ogUrl])] = imgtemp
                self.hasimages = True
    temp2 = soup.find('div', attrs={'class': 'chapter-content'})
    #self.depth+=(str(depth))
    Common.prnt(str(depth))
    temp = '<div id="' + str(depth) + '">' + str(temp2)
    self.questions.append(
        soup.find('header', attrs={'class': "question-header"}).get_text())
    temp += '<h2>' + self.questions[-1] + '</h2>\n</div>'
    #Common.prnt(str(depth))
    j = 1
    nextpages = []
    epubnextpages = []
    nextpagesurl = []
    nextpagesdepth = []
    urls = []
    temp += '<br />'
    epubtemp = temp
    nextLinks = []
    #epubNextLinks=[]
    epubCurrLink = '\n<a href="' + str(
        depth) + '.xhtml">' + 'Previous Chapter' + '</a>\n<br />'
    temp += self.prevLink
    # every anchor in the question block is a child chapter link
    for i in soup.find('div', attrs={'class': 'question-content'}).find_all('a'):
        if i.get_text().strip() != 'Add a new chapter':
            link = i.get_text()
            #Band aid fix for replaceable text in the next chapter links
            for l in range(len(self.renames)):
                link = link.replace(self.oldnames[l], self.renames[l])
            if any(x in ('epub', 'EPUB') for x in Common.opf):
                epubnextpages.append('\n<a href="' + str(depth) + '.' + str(j)
                                     + '.xhtml">' + link.strip() + '</a>\n<br />')
            nextLink = '\n<a href="#' + str(depth) + '.' + str(
                j) + '">' + 'Previous Chapter' + '</a>\n<br />'
            nextLinks.append(nextLink)
            nextpages.append('\n<a href="#' + str(depth) + '.'
                             + str(j) + '">' + link.strip() + '</a>\n<br />')
            nextpagesurl.append(i)
            urls.append(i.get('href'))
            nextpagesdepth.append(j)
            j += 1
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        epubtemp += self.epubPrevLink
        for j in epubnextpages:
            epubtemp += j
        self.epubtemp.append(epubtemp)
    for j in nextpages:
        temp += j
    self.temp.append(temp)
    try:
        self.pbar.Update()
    except:
        pass
    #Checks if new page was a link backwards and exits if so
    self.chapNum = int(
        soup.find('p', attrs={'class': 'meta'}).get_text().split()[1])
    if self.prevChapNum >= self.chapNum:
        return None
    #Other check if current page is a link and doesn't continue if so
    prevLinkCheck1 = soup.find('span', attrs={'class': 'controls-left'})
    prevLinkCheck2 = prevLinkCheck1.find_all('a')[0].get('href')
    prevLinkId = urllib.parse.urlparse(prevLinkCheck2)[2].split('.')[1]
    currLinkId = urllib.parse.urlparse(url)[2].split('.')[1]
    if self.prevLinkId is not None and prevLinkId != self.prevLinkId:
        #print(self.prevLinkId)
        #print(prevLinkId)
        return
    self.children.extend(urls)
    # fan out: one daemon worker per child chapter
    for i in range(0, len(nextpagesurl)):  #zip(nextpagesurl, nextpagesdepth):
        threading.Thread(target=self.ThreadAdd,
                         args=(nextpagesurl[i].get('href'),
                               str(depth) + '.' + str(nextpagesdepth[i]),
                               self.renames, self.oldnames, self.currLink,
                               epubCurrLink, nextLinks[i], currLinkId),
                         daemon=True).start()
def AddNextPage(self, url, depth, prevChapNum, prevLink, epubPrevLink, currLink, prevLinkId):
    """Recursively fetch Chyoa chapter pages depth-first (single-threaded).

    Same chapter-assembly logic as the threaded variant, but recurses
    directly on each child link.  Recursion stops when the chapter number
    stops increasing (backwards link) or the previous-link id mismatches.
    """
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): fixed anchor index [7] assumes a stable page layout
    self.authors.append(soup.find_all('a')[7].get_text())
    self.chapters.append(soup.find('h1').get_text())
    epubCurrLink = '\n<a href="' + str(
        depth) + '.xhtml">' + 'Previous Chapter' + '</a>\n<br />'
    if Common.images:
        if soup.find('div', attrs={'class': 'chapter-content'}).find('img'):
            # rewrite <img> srcs to local 'imgN.jpg' names, remembering originals
            for simg in soup.find('div', attrs={'class': 'chapter-content'}).find_all('img'):
                self.images.append(simg.get('src'))
                simg['src'] = 'img' + str(len(self.images)) + '.jpg'
            self.hasimages = True
    temp2 = soup.find('div', attrs={'class': 'chapter-content'})
    self.depth.append(str(depth))
    temp = '<div id="' + str(depth) + '">' + str(temp2)
    self.questions.append(
        soup.find('header', attrs={'class': "question-header"}).get_text())
    temp += '<h2>' + self.questions[-1] + '</h2>\n</div>'
    if self.partial:
        Common.prnt(str(depth))
    j = 1
    nextpages = []
    epubnextpages = []
    nextpagesurl = []
    nextpagesdepth = []
    nextLinks = []
    temp += '<br />'
    epubtemp = temp
    for i in soup.find('div', attrs={'class': 'question-content'}).find_all('a'):
        if i.get_text().strip() != 'Add a new chapter':
            link = i.get_text()
            #Band aid fix for replaceable text in the next chapter links
            for l in range(len(self.renames)):
                link = link.replace(self.oldnames[l], self.renames[l])
            nextLink = '\n<a href="#' + str(depth) + '.' + str(
                j) + '">' + 'Previous Chapter' + '</a>\n<br />'
            #nextLinks.append(nextLink)
            if any(x in ('epub', 'EPUB') for x in Common.opf):
                epubnextpages.append('\n<a href="' + str(depth) + '.' + str(j)
                                     + '.xhtml">' + link.strip() + '</a>\n<br />')
            nextpages.append('\n<a href="#' + str(depth) + '.'
                             + str(j) + '">' + link.strip() + '</a>\n<br />')
            #nextpages.append(prevLink)
            nextpagesurl.append(i)
            nextpagesdepth.append(j)
            j += 1
    temp += prevLink
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        epubtemp += epubPrevLink
        for j in epubnextpages:
            epubtemp += j
        self.epubtemp.append(epubtemp)
    for j in nextpages:
        temp += j
    self.temp.append(temp)
    try:
        self.pbar.Update()
    except:
        pass
    #Checks if new page was a link backwards and exits if so
    chapNum = int(
        soup.find('p', attrs={'class': 'meta'}).get_text().split()[1])
    if prevChapNum >= chapNum:
        return None
    #Other check if current page is a link and doesn't continue if so
    prevLinkCheck1 = soup.find('span', attrs={'class': 'controls-left'})
    prevLinkCheck2 = prevLinkCheck1.find_all('a')[0].get('href')
    prevLinkId1 = urllib.parse.urlparse(prevLinkCheck2)[2].split('.')[1]
    currLinkId = urllib.parse.urlparse(url)[2].split('.')[1]
    if prevLinkId is not None and prevLinkId1 != prevLinkId:
        #print(prevLinkId1)
        #print(prevLinkId)
        return
    # NOTE(review): 'nextLink' here is whatever the LAST loop iteration left
    # behind (the appends above are commented out), so every child receives
    # the same back-link — looks suspicious; confirm before changing.
    for i, j in zip(nextpagesurl, nextpagesdepth):
        self.AddNextPage(i.get('href'), str(depth) + '.' + str(j), chapNum,
                         currLink, epubCurrLink, nextLink, currLinkId)
def __init__(self, url):
    """Scrape a Chyoa interactive story.

    Two traversal modes: backwards (entered mid-story, walk 'Previous
    Chapter' links via AddPrevPage) and forwards from the intro page (walk
    child links via AddNextPage, or ThreadAdd workers when Common.mt is set).
    Also handles 'immersion' variable renaming, per-chapter author headers,
    epub link variants, and optional image downloading.
    """
    self.title = ''
    #initial author only for title page
    self.author = ''
    #author for each individual chapter
    self.authors = []
    #the h1 tag
    self.chapters = []
    self.story = ''
    self.temp = []
    self.epubtemp = []
    self.rawstoryhtml = []
    self.epubrawstoryhtml = []
    #the question at the end of each page
    self.questions = []
    self.summary = ''
    self.renames = []
    self.oldnames = []
    self.truestoryhttml = []
    self.epubtruestoryhttml = []
    self.length = 1
    self.pbar = None
    self.url = url
    self.images = []
    #testing images
    self.hasimages = False
    self.duplicate = False
    self.backwards = not Common.chyoa_force_forwards
    self.depth = []
    self.quiet = Common.quiet
    self.epubnextpages = []
    self.nextLinks = []
    self.partial = False
    self.partialStart = 1
    self.ogUrl = self.url
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    self.title = soup.find('h3').get_text()
    # 'Log In' h3 means we landed on the intro page; fall back to the h1 title
    if self.title == 'Log In':
        try:
            self.title = soup.find('h1').get_text()
            self.backwards = False
        except:
            pass
    elif not self.backwards:
        # forced forwards but not on the intro page: partial download
        self.partial = True
    #get update timestamp:
    if (self.backwards or self.partial) and Common.chyoaDupCheck:
        date = soup.find('p', attrs={'class': 'dates'}).strong.get_text()
        #date='Jun 18, 2022'
        timestamp = datetime.strptime(date, "%b %d, %Y")
        #print(timestamp)
        if not Common.CheckDuplicateTime(self.title, timestamp):
            Common.prnt('Story not updated: ' + self.url, f=True)
            self.duplicate = True
            return None
    #check duplicate with timestamp
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    # NOTE(review): fixed anchor indexes [7]/[5] assume a stable page layout
    if self.backwards or self.partial:
        self.authors.insert(0, soup.find_all('a')[7].get_text())
    else:
        self.authors.insert(0, soup.find_all('a')[5].get_text())
    self.chapters.insert(0, soup.find('h1').get_text())
    self.summary = soup.find('p', attrs={'class': 'synopsis'}).get_text()
    # chapter count is the first number in the 'meta' line
    tmp = soup.find('p', attrs={'class': 'meta'}).get_text()
    t = [s for s in tmp.split() if s.isdigit()]
    self.length = int(t[0])
    self.partialStart = self.length
    # immersion variables: prompt the user for replacement names
    if soup.find('form', attrs={'id': 'immersion-form'}) is not None:
        inputs = soup.find('form', attrs={
            'id': 'immersion-form'
        }).find_all('input', attrs={'value': ''})
        # lock serializes the interactive prompts across story threads
        with lock:
            if Common.mt == True:
                Common.quiet = True
            for i in range(len(inputs)):
                print(self.title)
                print('Input immersion variable ' + str(i) + ' '
                      + soup.find('label', attrs={
                          'for': 'c' + str(i)
                      }).get_text() + ' (' + inputs[i].get('placeholder')
                      + ') (Leave blank to keep placeholder name)')
                try:
                    newname = input()
                    self.renames.append(newname)
                except:
                    self.renames.append('')
                self.oldnames.append(inputs[i].get('placeholder'))
                if self.renames[i] == '':
                    self.renames[i] = self.oldnames[i]
            if Common.mt == True:
                Common.quiet = self.quiet
    #if q:
    #    sys.stdout=open(os.devnull, 'w')
    #if args.quiet:
    Common.prnt(self.title + '\n' + str(self.authors) + '\n' + self.summary)
    #print(self.chapters)
    if self.backwards:
        self.pbar = Common.Progress(self.length)
    #for name in self.renames:
    if Common.images:
        if soup.find('div', attrs={'class': 'chapter-content'}).find('img'):
            # lock2 guards the shared Common.urlDict across download threads
            with lock2:
                for simg in soup.find('div', attrs={'class': 'chapter-content'}).find_all('img'):
                    imgtemp = simg.get('src')
                    simg['src'] = 'img' + str(
                        len(Common.urlDict[self.ogUrl]) + 1) + '.jpg'
                    Common.urlDict[self.ogUrl][len(
                        Common.urlDict[self.ogUrl])] = imgtemp
                self.hasimages = True
    temp = str(soup.find('div', attrs={'class': 'chapter-content'}))
    #The second H2 tag may not exist if there is no sub title on a story, so we grab the first in such an event
    try:
        self.questions.insert(
            0, soup.find('header', attrs={
                'class': "question-header"
            }).get_text())
    except IndexError as IE:
        self.questions.insert(0, soup.find_all('h2')[0].get_text())
    temp += '<h2>' + self.questions[0] + '</h2>'
    self.temp.insert(0, temp)
    if self.backwards:
        self.pbar.Update()
    #if soup.find('a').text.strip()==
    #self.backwards = not Common.chyoa_force_forwards
    for i in soup.find_all('a'):
        if i.text.strip() == 'Previous Chapter' and self.backwards:
            self.AddPrevPage(i.get('href'))
            self.backwards = True
            break
    #Gets here if it's the intro page that is used
    if not self.backwards:
        self.Pages = []
        urls = []
        #Starting the Progress Bar
        numChaptersTempTemp = soup.find_all('li')
        for i in numChaptersTempTemp:
            if i.find('i', attrs={'class': 'bt-book-open'}):
                numChapters = i.get_text().split()[0]
                #Removes commas from stories with over 999 pages
                numChapters = numChapters.replace(',', '')
        try:
            if not Common.mt:
                if self.partial:
                    print('Downloading an unknown number of pages')
                else:
                    self.pbar = Common.Progress(int(numChapters))
                    self.pbar.Update()
        except:
            pass
        # workers signal completion through this queue
        self.q = queue.Queue()
        #print(threading.active_count())
        j = 1
        self.temp[0] += '\n<br />'
        self.epubtemp = self.temp.copy()
        # collect child-chapter links from the intro page's question block
        for i in soup.find('div', attrs={'class': 'question-content'}).find_all('a'):
            link = i.get_text()
            if link.strip() != 'Add a new chapter':
                #Band aid fix for replaceable text in the next chapter links
                for l in range(len(self.renames)):
                    link = link.replace(self.oldnames[l], self.renames[l])
                if any(x in ('epub', 'EPUB') for x in Common.opf):
                    self.epubtemp[0] += '\n<a href="' + str(
                        j) + '.xhtml">' + link.strip() + '</a>\n<br />'
                nextLink = '\n<a href="#' + str(
                    j) + '">' + 'Previous Chapter' + '</a>\n<br />'
                self.temp[0] += '\n<a href="#' + str(
                    j) + '">' + link.strip() + '</a>\n<br />'
                self.nextLinks.append(nextLink)
                urls.append(i.get('href'))
                j += 1
        self.Pages.extend(urls)
        j = 1
        for u in urls:
            if Common.mt and not self.partial:
                chapNum = int(
                    soup.find('p', attrs={
                        'class': 'meta'
                    }).get_text().split()[1])
                firstLinkId = None
                threading.Thread(
                    target=self.ThreadAdd,
                    args=(
                        u, j, self.renames, self.oldnames, chapNum,
                        '<a href="#Chapter 0">Previous Chapter</a>\n<br />',
                        '\n<a href="' + 'Chapter 1' + '.xhtml">'
                        + 'Previous Chapter' + '</a>\n<br />',
                        self.nextLinks[j - 1], firstLinkId, self.url),
                    daemon=True).start()
                #TODO
            else:
                if Common.mt:
                    Common.prnt(
                        'Warning: Cannot multithread partial Chyoa story: ' + self.url
                        + '\nUsing default method to download an unknown number of pages'
                    )
                self.AddNextPage(
                    u, j, 1,
                    '<a href="#Chapter 0">Previous Chapter</a>\n<br />',
                    '\n<a href="' + 'Chapter 1' + '.xhtml">'
                    + 'Previous Chapter' + '</a>\n<br />',
                    self.nextLinks[j - 1], None)
            j += 1
        if Common.mt and not self.partial:
            # wait for every worker thread, then assemble the collected pages
            i = int(numChapters) - 1
            print("Pages to add: " + str(i))
            while i > 0:
                #print(str(i))
                self.q.get()
                i -= 1
            #print(threading.active_count())
            for page in self.Pages:
                self.addPage(page)
    try:
        self.pbar.End()
    except:
        pass
    if self.backwards:
        self.epubtemp = self.temp.copy()
    #band-aid fix for names in chapter titles
    #WARNING DO NOT PUT THIS TO PRODUCTION
    for i in range(len(self.chapters)):
        for j in range(len(self.renames)):
            #print(self.chapters[i])
            self.chapters[i] = self.chapters[i].replace(
                self.oldnames[j], self.renames[j])
            #print(self.chapters[i])
    #TODO regular expressions go here
    # prepend a per-chapter author header and parse each fragment
    for i in range(len(self.temp)):
        self.temp[
            i] = '\n<h4>by ' + self.authors[i] + '</h4>' + self.temp[i]
        if any(x in ('epub', 'EPUB') for x in Common.opf):
            self.epubtemp[i] = '\n<h4>by ' + self.authors[
                i] + '</h4>' + self.epubtemp[i]
        self.rawstoryhtml.append(BeautifulSoup(self.temp[i], 'html.parser'))
        if any(x in ('epub', 'EPUB') for x in Common.opf):
            self.epubrawstoryhtml.append(
                BeautifulSoup(self.epubtemp[i], 'html.parser'))
    #print(self.rawstoryhtml[len(self.rawstoryhtml)-1].get_text())
    self.author = self.authors[0]
    #print(self.chapters)
    #replaces replaceable text in the story
    for i in self.rawstoryhtml:
        for j in range(len(self.renames)):
            for k in i.find_all(
                    'span', attrs={'class': 'js-immersion-receiver-c' + str(j)}):
                k.string = self.renames[j]
        self.story += self.chapters[self.rawstoryhtml.index(
            i)] + i.get_text()
        self.truestoryhttml.append(str(i))
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        for i in self.epubrawstoryhtml:
            for j in range(len(self.renames)):
                for l in i.find_all('span', attrs={
                        'class': 'js-immersion-receiver-c' + str(j)
                }):
                    l.string = self.renames[j]
            self.epubtruestoryhttml.append(str(i))
    # whitespace clean-up around the rewritten <span> elements
    for i in range(len(self.truestoryhttml)):
        self.truestoryhttml[i] = self.truestoryhttml[i].replace(
            '\n <span', '<span')
        self.truestoryhttml[i] = self.truestoryhttml[i].replace(
            '<span', ' <span')
        for j in self.renames:
            self.truestoryhttml[i] = self.truestoryhttml[i].replace(
                '\n ' + j + '\n', j)
        self.truestoryhttml[i] = self.truestoryhttml[i].replace(
            ' </span>\n ', '</span> ')
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        for i in range(len(self.epubtruestoryhttml)):
            self.epubtruestoryhttml[i] = self.epubtruestoryhttml[
                i].replace('\n <span', '<span')
            self.epubtruestoryhttml[i] = self.epubtruestoryhttml[
                i].replace('<span', ' <span')
            for j in self.renames:
                self.epubtruestoryhttml[i] = self.epubtruestoryhttml[
                    i].replace('\n ' + j + '\n', j)
            self.epubtruestoryhttml[i] = self.epubtruestoryhttml[
                i].replace(' </span>\n ', '</span> ')
    self.story = self.story.replace('\n', Common.lineEnding)
    # re-parse the cleaned fragments so downstream output uses the fixed HTML
    for i in range(0, len(self.truestoryhttml)):
        self.rawstoryhtml[i] = BeautifulSoup(self.truestoryhttml[i],
                                             'html.parser')
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        for i in range(0, len(self.epubtruestoryhttml)):
            self.epubrawstoryhtml[i] = BeautifulSoup(
                self.epubtruestoryhttml[i], 'html.parser')
    # html output downloads the collected images up front
    if Common.images and self.hasimages and any(x in ('html', 'HTML')
                                                for x in Common.opf):
        for i in range(0, len(Common.urlDict[self.url])):
            Common.prnt("Getting image " + str(i + 1) + " at: "
                        + str(Common.urlDict[self.url][i]))
            try:
                Common.imageDL(self.title,
                               Common.urlDict[self.url][i],
                               i + 1,
                               size=len(Common.urlDict[self.url]))
            except urllib.error.HTTPError as FE:
                continue