def __init__(self, url): self.title=''# self.chapters=[''] #initial author only for title page self.author=''# #the h1 tag self.temp=[] self.rawstoryhtml=[''] self.truestoryhttml=[] self.length=1 self.pbar=None self.url=url self.images=[] #testing images self.hasimages = True self.isize=0 self.duplicate = False self.queue = queue.Queue() page = Common.RequestPage(url) if page is None: print('Could not complete request for page: ' + url) return None soup=BeautifulSoup(page.content, 'html.parser') self.title = soup.find('meta', attrs={'itemprop':'name'}).get('content') if Common.dup: if Common.CheckDuplicate(self.title): self.duplicate = True return None for au in soup.find_all('div', attrs={'class':'tag-container'}): #print('HERE1') for au2 in au.find_all('a'): #print('HERE2') if au2.get('href')[:7]=='/artist': #print('HERE') self.author=au2.get('href')[8:-1] #print(self.author) Common.prnt(self.title+' by '+self.author) self.truestoryhttml.append('') self.isize=len(soup.find_all('a', attrs={'rel':'nofollow'})) if any(x in ('html', 'HTML', 'txt', 'TXT') for x in Common.opf): self.pbar = Common.Progress(self.isize) for i in soup.find_all('a', attrs={'rel':'nofollow'}): self.GetURLS(i.get('href')) break self.AddPage() if any(x in ('txt', 'html', 'TXT', 'HTML') for x in Common.opf) and Common.mt: for i in range(0, len(self.images)): self.queue.get() if self.pbar is not None: self.pbar.End()
def requestPage(self, url):
    """Fetch *url* through Common.RequestPage with a randomly picked User-Agent.

    Rotating the User-Agent string makes the scraper look less like a bot.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
        'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/41.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    )
    pick = randint(0, len(user_agents) - 1)
    return Common.RequestPage(url, headers={'user-agent': user_agents[pick]})
def __init__(self, url):
    """Scrape a story page (title, author, body) and follow 'Next Page' links.

    Populates self.story (plain text) and self.storyhtml (prettified HTML).
    Sets self.duplicate and returns early when duplicate checking matches.
    """
    self.title = ''
    self.author = ''
    self.story = ''
    # placeholder 0 is overwritten with the first page's soup below
    self.rawstoryhtml = [0]
    self.storyhtml = ''
    self.url = url
    self.duplicate = False
    #page = Common.RequestPage(url)
    '''if page is None: print('Could not complete request for page: ' + url) return None '''
    #while page.status_code!=200:
    #print("Error getting page, trying again: status code: "+str(page.status_code))
    #time.sleep(5)
    # requestPage adds a randomized User-Agent header before delegating to Common
    soup = BeautifulSoup(self.requestPage(self.url).content, 'html.parser')
    #print(soup.prettify())
    titlehtml = soup.find('h1')
    self.title = titlehtml.text.strip()
    #print(self.title)
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    # NOTE(review): 'y_eU' / 'aa_ht' look like obfuscated site CSS classes —
    # expect these selectors to break when the site rebuilds its front end
    authorhtml = soup.find('a', attrs={'class': 'y_eU'})
    #print(authorhtml.prettify())
    self.author = authorhtml.text.strip()
    #print(self.author)
    self.rawstoryhtml[0] = soup.find('div', attrs={'class': 'aa_ht'})
    self.story = self.rawstoryhtml[0].get_text(separator=Common.lineEnding)
    Common.prnt(self.title + ' by ' + self.author)
    # recursively pull every subsequent page into self.rawstoryhtml
    nextLinkSoup = soup.find('a', attrs={'title': 'Next Page'})
    if nextLinkSoup is not None:
        self.AddNextPage(nextLinkSoup.get('href'))
    for i in self.rawstoryhtml:
        self.storyhtml += str(i.contents[0].prettify())
def AddPage(self):
    """Emit <img> tags for every gallery image and download the image files.

    For html/epub output, each image is referenced as a zero-padded
    '<num>.jpg' filename; for html/txt output the files themselves are
    downloaded via Common.imageDL (threaded when Common.mt is set, with
    self.queue used by __init__ to join the workers).
    """
    i = 1
    for thisimage in self.images:
        #print(thisimage)
        if any(x in ('html', 'HTML', 'epub', 'EPUB') for x in Common.opf):
            # zero-pad the image number to the width of the page count,
            # e.g. 5 -> '005' for a 150-page gallery.
            # NOTE(review): for galleries of 1000+ pages this padding looks
            # inconsistent (2-digit numbers get one zero, 3-digit get none);
            # it must stay in sync with the filenames Common.imageDL writes,
            # so it is left untouched — confirm against Common.imageDL.
            zeros = '0' * (len(str(self.isize))-1)
            num = i
            if len(zeros)>1 and num > 9:
                zeros='0'
            elif len(zeros)==1 and num > 9:
                zeros = ''
            if num > 99:
                zeros = ''
            self.truestoryhttml[0]=self.truestoryhttml[0]+'<p><img src="'+zeros+str(num)+'.jpg" /></p>\n'
        if any(x in ('html', 'HTML', 'txt', 'TXT') for x in Common.opf):
            if Common.mt:
                # daemon=False so downloads finish even if the main thread exits first
                t=threading.Thread(target=Common.imageDL, args=(self.title, thisimage, i, self.isize, self.pbar, self.queue), daemon=False)
                t.start()
            else:
                Common.imageDL(self.title, thisimage, i, self.isize, self.pbar)
        i+=1
def GetURLS(self, url):
    """Resolve the first image URL of the gallery and extrapolate the rest.

    Fetches the first image page, reads its direct image src, then derives
    the URLs of pages 2..isize by substituting the page number in the
    filename (nhentai images are numbered sequentially).
    Returns None (after printing) on request or parse failure.
    """
    page=Common.RequestPage('https://nhentai.net'+url.rstrip(), headers={'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'})
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup=BeautifulSoup(page.content, 'html.parser')
    try:
        thisimage=soup.find('section', attrs={'id':'image-container'}).find('img').get('src')
    except AttributeError:
        # find() returned None somewhere along the chain.
        # BUGFIX: previously execution fell through to the loop below and
        # crashed with a NameError on the undefined 'thisimage'; bail out now.
        print('Error in: '+url)
        return None
    self.images.append(thisimage)
    # filename pattern is '<base><n>.<ext>'; swap the page number in place
    for i in range(2, self.isize+1):
        self.images.append(thisimage[:-5]+str(i)+thisimage[-4:])
def AddNextPage(self, link):
    """Append the paragraphs of one follow-on page to the story.

    Fetches *link*, strips intra-paragraph newlines from every <p>, appends
    the plain text to self.story and the rebuilt HTML to self.rawstoryhtml.
    Returns None (after printing) when the request fails.
    """
    page=Common.RequestPage(link)
    if page is None:
        # BUGFIX: this message previously referenced the undefined name 'url'
        # (the parameter is 'link'), raising NameError on the error path.
        print('Could not complete request for page: ' + link)
        return None
    soup=BeautifulSoup(page.content, 'html.parser')
    paragraphs=soup.find_all('p')
    #print(paragraphs)
    text=''
    for p in paragraphs:
        # collapse hard-wrapped lines inside a paragraph into one line
        self.story+=re.sub(r'\n\s*', r'', p.get_text(), flags=re.M)+'\n\n'
        #print(p.get_text())
        text+='<p>'+re.sub(r'\n\s*', r'', p.get_text(), flags=re.M)+'</p>\n'
    temp=BeautifulSoup(text, 'html.parser')
    self.rawstoryhtml.append(temp)
def AddNextPage(self, soup):
    """Follow the 'Next >' button and recursively collect chapter HTML.

    The next-chapter URL is embedded in the button's onclick attribute;
    rawnexturl[15:-1] strips the JS wrapper around the path.  Each page's
    '#storytext' div is appended to self.rawstoryhtml.
    """
    for i in soup.find_all('button'):
        if i.text.strip() == 'Next >':
            rawnexturl = i.get('onclick')
            # the onclick handler carries only the path; prepend the right host
            if urllib.parse.urlparse(self.url)[1] == 'www.fanfiction.net':
                nexturl = 'https://www.fanfiction.net' + rawnexturl[15:-1]
            else:
                nexturl = 'https://www.fictionpress.com' + rawnexturl[15:-1]
            #print(nexturl)
            page = Common.RequestPage(nexturl)
            if page is None:
                # BUGFIX: this message previously referenced the undefined
                # name 'url', raising NameError on the error path.
                print('Could not complete request for page: ' + nexturl)
                return None
            soup = BeautifulSoup(page.content, 'html.parser')
            self.rawstoryhtml.append(
                soup.find('div', attrs={'id': 'storytext'}))
            self.pbar.Update()
            self.AddNextPage(soup)
            break
def AddPrevPage(self, url):
    """Walk the story backwards one page and prepend its content.

    Inserts author, chapter title, chapter HTML and the trailing question at
    index 0 of their respective lists, then recurses on the 'Previous
    Chapter' link until the first page is reached.
    """
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): fixed anchor indexes [7]/[5] assume a stable page layout —
    # verify against the live site if authors come out wrong
    self.authors.insert(0, soup.find_all('a')[7].get_text())
    self.chapters.insert(0, soup.find('h1').get_text())
    if Common.images:
        if soup.find('div', attrs={'class': 'chapter-content'}).find('img'):
            # rewrite <img> srcs to local 'imgN.jpg' names, remembering originals
            for simg in soup.find('div', attrs={'class': 'chapter-content'}).find_all('img'):
                self.images.append(simg.get('src'))
                simg['src'] = 'img' + str(len(self.images)) + '.jpg'
            self.hasimages = True
    temp = str(soup.find('div', attrs={'class': 'chapter-content'}))
    self.questions.insert(
        0, soup.find('header', attrs={'class': "question-header"}).get_text())
    temp += '<h2>' + self.questions[0] + '</h2>'
    self.temp.insert(0, temp)
    self.pbar.Update()
    # keep recursing while a 'Previous Chapter' link exists
    for i in soup.find_all('a'):
        if i.text.strip() == 'Previous Chapter':
            self.AddPrevPage(i.get('href'))
            return
    #gets author name if on last/first page I guess
    self.authors[0] = soup.find_all('a')[5].get_text()
def __init__(self, url):
    """Scrape a classicreader.com story.

    Handles three layouts: a single-page story, a story page without a table
    of contents, and a multi-chapter story reached via its table of contents.
    Sets self.duplicate and returns early when duplicate checking matches.
    """
    self.title=''
    self.author=''
    self.story=''
    self.rawstoryhtml=[]
    self.chapters=[]
    self.pbar=None
    self.url=url
    self.duplicate = False
    page=Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup=BeautifulSoup(page.content, 'html.parser')
    #grabs important metadata information
    self.title=soup.find('span', attrs={'class': 'book-header'}).get_text()
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    Common.prnt(self.title)
    self.author=soup.find('span', attrs={'class': 'by-line'}).contents[1].get_text()
    Common.prnt(self.author)
    #looks to see if on table of contents page
    if soup.find('h2') is None: #and len(soup.find_all('a', attrs={'class':'categories'}))>15:
        #checks to see if single page story
        if len(soup.find_all('a', attrs={'class':'categories'}))==15:
            self._capture_single_page(soup)
            return
    try:
        # 8th 'categories' link points at the table of contents
        url='https://www.classicreader.com'+soup.find_all('a', attrs={'class':'categories'})[7].get('href')
        # NOTE(review): uses requests.get directly instead of Common.RequestPage
        # like the rest of the file — kept as-is to preserve behavior
        page=requests.get(url)
        soup=BeautifulSoup(page.content, 'html.parser')
        Common.prnt('got table of contents page')
    except Exception:
        # no table of contents link: treat the current page as the whole story
        # (was a bare 'except:'; narrowed so Ctrl-C still interrupts)
        self._capture_single_page(soup)
        return
    links=soup.find_all('a', attrs={'class': 'chapter-title'})
    self.pbar=Common.Progress(len(links))
    #self.pbar.Update()
    for i in links:
        self.AddNextPage('https://www.classicreader.com'+i.get('href'))
        self.chapters.append(i.get_text())
        self.pbar.Update()
    self.pbar.End()

def _capture_single_page(self, soup):
    """Capture every <p> of *soup* as a complete one-chapter story.

    Extracted helper: this exact logic previously appeared verbatim in both
    the single-page branch and the no-table-of-contents fallback.
    """
    paragraphs=soup.find_all('p')
    #print(paragraphs)
    text=''
    for p in paragraphs:
        # collapse hard-wrapped lines inside a paragraph into one line
        self.story+=re.sub(r'\n\s*', r'', p.get_text(), flags=re.M)+'\n\n'
        #print(p.get_text())
        text+='<p>'+re.sub(r'\n\s*', r'', p.get_text(), flags=re.M)+'</p>\n'
    temp=BeautifulSoup(text, 'html.parser')
    self.chapters.append(self.title)
    self.rawstoryhtml.append(temp)
def __init__(self, url):
    """Scrape a Wattpad-style story starting from its first part.

    Collects title, author, summary and every part's <pre> content (following
    'next-part-link' anchors via addNextPage), then flattens everything into
    self.story with Common.lineEnding line endings.
    """
    self.title = ''
    self.author = ''
    self.story = ''
    self.rawstoryhtml = []
    self.length = 1
    self.summary = ''
    self.pbar = None
    self.url = url
    self.chapters = []
    self.page = None
    self.duplicate = False
    #try:
    #    page=requests.get(self.url)
    #except:
    #    print('Error accessing '+self.url+' Try checking internet connection and url')
    #return None
    # requestPage adds a randomized User-Agent header
    soup = BeautifulSoup(self.requestPage(self.url).content, 'html.parser')
    #print(soup.prettify())
    self.title = soup.find('h1').get_text()
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    # [3:] drops the leading 'by ' prefix from the author span
    self.author = soup.find('span', attrs={'class': 'author h6'}).get_text()[3:]
    self.chapters.append(soup.find('h2').get_text())
    self.summary = soup.find('p', attrs={'class': 'item-description'}).get_text()
    self.rawstoryhtml.append(soup.find('pre'))
    Common.prnt(self.title + '\nby ' + self.author + '\n' + self.summary)
    # one <li> per part in the table of contents
    self.length = len(
        soup.find('ul', attrs={'class': 'table-of-contents'}).find_all('li'))
    self.pbar = Common.Progress(self.length)
    self.pbar.Update()
    #print(self.rawstoryhtml[0].prettify())
    if soup.find('a', attrs={'class': 'next-part-link'}):
        #print(soup.find('a', attrs={'class': 'next-part-link'}).get('href'))
        self.addNextPage(
            soup.find('a', attrs={'class': 'next-part-link'}).get('href'))
    self.pbar.End()
    for j in range(0, len(self.rawstoryhtml)):
        # [5:] presumably strips the opening '<pre>' tag from the prettified
        # markup — TODO confirm
        tmp = self.rawstoryhtml[j].prettify()[5:]
        # NOTE(review): replaces '&apos' without the trailing ';', which would
        # leave a stray ';' behind for '&apos;' entities — confirm intended
        tmp = tmp.replace('&apos', '\'')
        self.rawstoryhtml[j] = BeautifulSoup(tmp, 'html.parser')
    for i in range(0, len(self.rawstoryhtml)):
        self.story = self.story + self.chapters[i] + '\n'
        self.story = self.story + self.rawstoryhtml[i].get_text()
    self.story = self.story.replace('\n', Common.lineEnding)
def __init__(self, url):
    """Scrape a fanfiction.net / fictionpress.com story.

    Grabs chapter names from the chapter-select dropdown, story metadata from
    the 'xcontrast_txt' elements, then walks every chapter through
    AddNextPage.  Sets self.duplicate and returns early on a duplicate match.
    """
    #simple string for the title
    self.title = ''
    #simple string for the author
    self.author = ''
    #Extra long string containing the text of the story
    self.story = ''
    #each node of the list contains the raw html for one page of the story
    self.rawstoryhtml = [0]
    #the raw html but prettified and concatenated together
    self.storyhtml = ''
    #array of chapter names
    self.chapters = []
    #summary
    self.summary = ''
    self.pbar = None
    self.url = url
    self.duplicate = False
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    self.rawstoryhtml[0] = soup.find('div', attrs={'id': 'storytext'})
    #self.chapters=soup.find_all('option', attrs={'selected':''})
    #F*****g magic that collects the chapter titles
    #probably doesn't work for all stories
    #seems to work for all stories, adds extra chapter title to end, oh well
    try:
        for child in soup.find(attrs={'id': 'chap_select'}).descendants:
            if child.string is None:
                continue
            else:
                self.chapters.append(child.string)
        #we end up with an extra chapter at the end of the file, so the band-aid fix is to delete the last node
        del self.chapters[len(self.chapters) - 1]
    except:
        # single-chapter story (no chap_select dropdown): use the story title
        print('Chapter name couldn\'t be grabbed')
        self.chapters.append(
            soup.find('b', attrs={
                'class': 'xcontrast_txt'
            }).text.strip())
    '''So here's the deal. fanfiction.net doesn't close any of the <option>
    tags that contain the chapter names, so BeautifulSoup closes them all at
    the end. This means that each option is the child of the option above it.
    so good luck extracting the name of each chapter individually
    There's also two (2) chapter selection fields on each web page, which
    makes the output look worse than it really is, since we're only ever
    going to use the first one we won't have to worry about it
    '''
    #print("Chapters:")
    #print(self.chapters)
    self.summary = soup.find_all('div', attrs={'class': 'xcontrast_txt'})[0].text.strip()
    self.author = soup.find_all('a', attrs={'class': 'xcontrast_txt'})[2].text.strip()
    self.title = soup.find('b', attrs={
        'class': 'xcontrast_txt'
    }).text.strip()
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    Common.prnt(self.title + '\nby ' + self.author + '\n' + self.summary)
    #setup progress bar
    #exception handling to avoid errors on single page stories
    if soup.find('button', attrs={
            'type': 'BUTTON'
    }) is not None and soup.find('button', attrs={
            'type': 'BUTTON'
    }).text.strip() == '< Prev':
        print(
            "Non-first page entered. Ebook-Publisher will only add subsequent pages and chapter titles will be wrong"
        )
    for i in soup.find_all('button', attrs={'type': 'BUTTON'}):
        if i.text.strip() == 'Next >':
            self.pbar = Common.Progress(len(self.chapters))
            self.pbar.Update()
            self.AddNextPage(soup)
            break
    try:
        self.pbar.End()
    except:
        pass
    # flatten each page's children to text; non-tag nodes (plain strings)
    # have no get_text(), hence the except fallback
    for i in self.rawstoryhtml:
        for j in i.contents:
            try:
                self.storyhtml += j.get_text() + '\n\n'
            except:
                self.storyhtml += j
    #print(self.storyhtml)
    self.story = self.storyhtml
    self.story = BeautifulSoup(self.story, 'html.parser').text
    # squeeze runs of blank lines down to a single blank line
    self.story = re.sub(r'\n\s*\n', r'\n\n', self.story, flags=re.M)
def AddNextPage(self, url, depth):
    """Fetch one Chyoa chapter page and spawn threads for its child chapters.

    Builds the chapter's HTML fragment (tagged with a hierarchical *depth* id
    like '1.2.1'), records next-chapter links, and starts a ThreadAdd worker
    per child link.  Stops recursing when the page links backwards (chapter
    number does not increase) or when the previous-link id no longer matches.
    """
    #print(url)
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): fixed anchor index [7] assumes a stable page layout
    self.author = (soup.find_all('a')[7].get_text())
    self.chapter = (soup.find('h1').get_text())
    if Common.images:
        if soup.find('div', attrs={'class': 'chapter-content'}).find('img'):
            # lock2 guards the shared Common.urlDict across download threads
            with lock2:
                for simg in soup.find('div', attrs={'class': 'chapter-content'}).find_all('img'):
                    imgtemp = simg.get('src')
                    simg['src'] = 'img' + str(
                        len(Common.urlDict[self.ogUrl]) + 1) + '.jpg'
                    Common.urlDict[self.ogUrl][len(
                        Common.urlDict[self.ogUrl])] = imgtemp
                self.hasimages = True
    temp2 = soup.find('div', attrs={'class': 'chapter-content'})
    #self.depth+=(str(depth))
    Common.prnt(str(depth))
    temp = '<div id="' + str(depth) + '">' + str(temp2)
    self.questions.append(
        soup.find('header', attrs={'class': "question-header"}).get_text())
    temp += '<h2>' + self.questions[-1] + '</h2>\n</div>'
    #Common.prnt(str(depth))
    j = 1
    nextpages = []
    epubnextpages = []
    nextpagesurl = []
    nextpagesdepth = []
    urls = []
    temp += '<br />'
    epubtemp = temp
    nextLinks = []
    #epubNextLinks=[]
    epubCurrLink = '\n<a href="' + str(
        depth) + '.xhtml">' + 'Previous Chapter' + '</a>\n<br />'
    temp += self.prevLink
    # every anchor in the question block is a child chapter link
    for i in soup.find('div', attrs={'class': 'question-content'}).find_all('a'):
        if i.get_text().strip() != 'Add a new chapter':
            link = i.get_text()
            #Band aid fix for replaceable text in the next chapter links
            for l in range(len(self.renames)):
                link = link.replace(self.oldnames[l], self.renames[l])
            if any(x in ('epub', 'EPUB') for x in Common.opf):
                epubnextpages.append('\n<a href="' + str(depth) + '.' + str(j)
                                     + '.xhtml">' + link.strip() + '</a>\n<br />')
            nextLink = '\n<a href="#' + str(depth) + '.' + str(
                j) + '">' + 'Previous Chapter' + '</a>\n<br />'
            nextLinks.append(nextLink)
            nextpages.append('\n<a href="#' + str(depth) + '.'
                             + str(j) + '">' + link.strip() + '</a>\n<br />')
            nextpagesurl.append(i)
            urls.append(i.get('href'))
            nextpagesdepth.append(j)
            j += 1
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        epubtemp += self.epubPrevLink
        for j in epubnextpages:
            epubtemp += j
        self.epubtemp.append(epubtemp)
    for j in nextpages:
        temp += j
    self.temp.append(temp)
    try:
        self.pbar.Update()
    except:
        pass
    #Checks if new page was a link backwards and exits if so
    self.chapNum = int(
        soup.find('p', attrs={'class': 'meta'}).get_text().split()[1])
    if self.prevChapNum >= self.chapNum:
        return None
    #Other check if current page is a link and doesn't continue if so
    prevLinkCheck1 = soup.find('span', attrs={'class': 'controls-left'})
    prevLinkCheck2 = prevLinkCheck1.find_all('a')[0].get('href')
    prevLinkId = urllib.parse.urlparse(prevLinkCheck2)[2].split('.')[1]
    currLinkId = urllib.parse.urlparse(url)[2].split('.')[1]
    if self.prevLinkId is not None and prevLinkId != self.prevLinkId:
        #print(self.prevLinkId)
        #print(prevLinkId)
        return
    self.children.extend(urls)
    # fan out: one daemon worker per child chapter
    for i in range(0, len(nextpagesurl)):  #zip(nextpagesurl, nextpagesdepth):
        threading.Thread(target=self.ThreadAdd,
                         args=(nextpagesurl[i].get('href'),
                               str(depth) + '.' + str(nextpagesdepth[i]),
                               self.renames, self.oldnames, self.currLink,
                               epubCurrLink, nextLinks[i], currLinkId),
                         daemon=True).start()
def AddNextPage(self, url, depth, prevChapNum, prevLink, epubPrevLink, currLink, prevLinkId):
    """Recursively fetch Chyoa chapter pages depth-first (single-threaded).

    Same chapter-assembly logic as the threaded variant, but recurses
    directly on each child link.  Recursion stops when the chapter number
    stops increasing (backwards link) or the previous-link id mismatches.
    """
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): fixed anchor index [7] assumes a stable page layout
    self.authors.append(soup.find_all('a')[7].get_text())
    self.chapters.append(soup.find('h1').get_text())
    epubCurrLink = '\n<a href="' + str(
        depth) + '.xhtml">' + 'Previous Chapter' + '</a>\n<br />'
    if Common.images:
        if soup.find('div', attrs={'class': 'chapter-content'}).find('img'):
            # rewrite <img> srcs to local 'imgN.jpg' names, remembering originals
            for simg in soup.find('div', attrs={'class': 'chapter-content'}).find_all('img'):
                self.images.append(simg.get('src'))
                simg['src'] = 'img' + str(len(self.images)) + '.jpg'
            self.hasimages = True
    temp2 = soup.find('div', attrs={'class': 'chapter-content'})
    self.depth.append(str(depth))
    temp = '<div id="' + str(depth) + '">' + str(temp2)
    self.questions.append(
        soup.find('header', attrs={'class': "question-header"}).get_text())
    temp += '<h2>' + self.questions[-1] + '</h2>\n</div>'
    if self.partial:
        Common.prnt(str(depth))
    j = 1
    nextpages = []
    epubnextpages = []
    nextpagesurl = []
    nextpagesdepth = []
    nextLinks = []
    temp += '<br />'
    epubtemp = temp
    for i in soup.find('div', attrs={'class': 'question-content'}).find_all('a'):
        if i.get_text().strip() != 'Add a new chapter':
            link = i.get_text()
            #Band aid fix for replaceable text in the next chapter links
            for l in range(len(self.renames)):
                link = link.replace(self.oldnames[l], self.renames[l])
            nextLink = '\n<a href="#' + str(depth) + '.' + str(
                j) + '">' + 'Previous Chapter' + '</a>\n<br />'
            #nextLinks.append(nextLink)
            if any(x in ('epub', 'EPUB') for x in Common.opf):
                epubnextpages.append('\n<a href="' + str(depth) + '.' + str(j)
                                     + '.xhtml">' + link.strip() + '</a>\n<br />')
            nextpages.append('\n<a href="#' + str(depth) + '.'
                             + str(j) + '">' + link.strip() + '</a>\n<br />')
            #nextpages.append(prevLink)
            nextpagesurl.append(i)
            nextpagesdepth.append(j)
            j += 1
    temp += prevLink
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        epubtemp += epubPrevLink
        for j in epubnextpages:
            epubtemp += j
        self.epubtemp.append(epubtemp)
    for j in nextpages:
        temp += j
    self.temp.append(temp)
    try:
        self.pbar.Update()
    except:
        pass
    #Checks if new page was a link backwards and exits if so
    chapNum = int(
        soup.find('p', attrs={'class': 'meta'}).get_text().split()[1])
    if prevChapNum >= chapNum:
        return None
    #Other check if current page is a link and doesn't continue if so
    prevLinkCheck1 = soup.find('span', attrs={'class': 'controls-left'})
    prevLinkCheck2 = prevLinkCheck1.find_all('a')[0].get('href')
    prevLinkId1 = urllib.parse.urlparse(prevLinkCheck2)[2].split('.')[1]
    currLinkId = urllib.parse.urlparse(url)[2].split('.')[1]
    if prevLinkId is not None and prevLinkId1 != prevLinkId:
        #print(prevLinkId1)
        #print(prevLinkId)
        return
    # NOTE(review): 'nextLink' here is whatever the LAST loop iteration left
    # behind (the appends above are commented out), so every child receives
    # the same back-link — looks suspicious; confirm before changing.
    for i, j in zip(nextpagesurl, nextpagesdepth):
        self.AddNextPage(i.get('href'), str(depth) + '.' + str(j), chapNum,
                         currLink, epubCurrLink, nextLink, currLinkId)
def __init__(self, url):
    """Scrape a Chyoa interactive story.

    Two traversal modes: backwards (entered mid-story, walk 'Previous
    Chapter' links via AddPrevPage) and forwards from the intro page (walk
    child links via AddNextPage, or ThreadAdd workers when Common.mt is set).
    Also handles 'immersion' variable renaming, per-chapter author headers,
    epub link variants, and optional image downloading.
    """
    self.title = ''
    #initial author only for title page
    self.author = ''
    #author for each individual chapter
    self.authors = []
    #the h1 tag
    self.chapters = []
    self.story = ''
    self.temp = []
    self.epubtemp = []
    self.rawstoryhtml = []
    self.epubrawstoryhtml = []
    #the question at the end of each page
    self.questions = []
    self.summary = ''
    self.renames = []
    self.oldnames = []
    self.truestoryhttml = []
    self.epubtruestoryhttml = []
    self.length = 1
    self.pbar = None
    self.url = url
    self.images = []
    #testing images
    self.hasimages = False
    self.duplicate = False
    self.backwards = not Common.chyoa_force_forwards
    self.depth = []
    self.quiet = Common.quiet
    self.epubnextpages = []
    self.nextLinks = []
    self.partial = False
    self.partialStart = 1
    self.ogUrl = self.url
    page = Common.RequestPage(url)
    if page is None:
        print('Could not complete request for page: ' + url)
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    self.title = soup.find('h3').get_text()
    # 'Log In' h3 means we landed on the intro page; fall back to the h1 title
    if self.title == 'Log In':
        try:
            self.title = soup.find('h1').get_text()
            self.backwards = False
        except:
            pass
    elif not self.backwards:
        # forced forwards but not on the intro page: partial download
        self.partial = True
    #get update timestamp:
    if (self.backwards or self.partial) and Common.chyoaDupCheck:
        date = soup.find('p', attrs={'class': 'dates'}).strong.get_text()
        #date='Jun 18, 2022'
        timestamp = datetime.strptime(date, "%b %d, %Y")
        #print(timestamp)
        if not Common.CheckDuplicateTime(self.title, timestamp):
            Common.prnt('Story not updated: ' + self.url, f=True)
            self.duplicate = True
            return None
    #check duplicate with timestamp
    if Common.dup:
        if Common.CheckDuplicate(self.title):
            self.duplicate = True
            return None
    # NOTE(review): fixed anchor indexes [7]/[5] assume a stable page layout
    if self.backwards or self.partial:
        self.authors.insert(0, soup.find_all('a')[7].get_text())
    else:
        self.authors.insert(0, soup.find_all('a')[5].get_text())
    self.chapters.insert(0, soup.find('h1').get_text())
    self.summary = soup.find('p', attrs={'class': 'synopsis'}).get_text()
    # chapter count is the first number in the 'meta' line
    tmp = soup.find('p', attrs={'class': 'meta'}).get_text()
    t = [s for s in tmp.split() if s.isdigit()]
    self.length = int(t[0])
    self.partialStart = self.length
    # immersion variables: prompt the user for replacement names
    if soup.find('form', attrs={'id': 'immersion-form'}) is not None:
        inputs = soup.find('form', attrs={
            'id': 'immersion-form'
        }).find_all('input', attrs={'value': ''})
        # lock serializes the interactive prompts across story threads
        with lock:
            if Common.mt == True:
                Common.quiet = True
            for i in range(len(inputs)):
                print(self.title)
                print('Input immersion variable ' + str(i) + ' '
                      + soup.find('label', attrs={
                          'for': 'c' + str(i)
                      }).get_text() + ' (' + inputs[i].get('placeholder')
                      + ') (Leave blank to keep placeholder name)')
                try:
                    newname = input()
                    self.renames.append(newname)
                except:
                    self.renames.append('')
                self.oldnames.append(inputs[i].get('placeholder'))
                if self.renames[i] == '':
                    self.renames[i] = self.oldnames[i]
            if Common.mt == True:
                Common.quiet = self.quiet
    #if q:
    #    sys.stdout=open(os.devnull, 'w')
    #if args.quiet:
    Common.prnt(self.title + '\n' + str(self.authors) + '\n' + self.summary)
    #print(self.chapters)
    if self.backwards:
        self.pbar = Common.Progress(self.length)
    #for name in self.renames:
    if Common.images:
        if soup.find('div', attrs={'class': 'chapter-content'}).find('img'):
            # lock2 guards the shared Common.urlDict across download threads
            with lock2:
                for simg in soup.find('div', attrs={'class': 'chapter-content'}).find_all('img'):
                    imgtemp = simg.get('src')
                    simg['src'] = 'img' + str(
                        len(Common.urlDict[self.ogUrl]) + 1) + '.jpg'
                    Common.urlDict[self.ogUrl][len(
                        Common.urlDict[self.ogUrl])] = imgtemp
                self.hasimages = True
    temp = str(soup.find('div', attrs={'class': 'chapter-content'}))
    #The second H2 tag may not exist if there is no sub title on a story, so we grab the first in such an event
    try:
        self.questions.insert(
            0, soup.find('header', attrs={
                'class': "question-header"
            }).get_text())
    except IndexError as IE:
        self.questions.insert(0, soup.find_all('h2')[0].get_text())
    temp += '<h2>' + self.questions[0] + '</h2>'
    self.temp.insert(0, temp)
    if self.backwards:
        self.pbar.Update()
    #if soup.find('a').text.strip()==
    #self.backwards = not Common.chyoa_force_forwards
    for i in soup.find_all('a'):
        if i.text.strip() == 'Previous Chapter' and self.backwards:
            self.AddPrevPage(i.get('href'))
            self.backwards = True
            break
    #Gets here if it's the intro page that is used
    if not self.backwards:
        self.Pages = []
        urls = []
        #Starting the Progress Bar
        numChaptersTempTemp = soup.find_all('li')
        for i in numChaptersTempTemp:
            if i.find('i', attrs={'class': 'bt-book-open'}):
                numChapters = i.get_text().split()[0]
                #Removes commas from stories with over 999 pages
                numChapters = numChapters.replace(',', '')
        try:
            if not Common.mt:
                if self.partial:
                    print('Downloading an unknown number of pages')
                else:
                    self.pbar = Common.Progress(int(numChapters))
                    self.pbar.Update()
        except:
            pass
        # workers signal completion through this queue
        self.q = queue.Queue()
        #print(threading.active_count())
        j = 1
        self.temp[0] += '\n<br />'
        self.epubtemp = self.temp.copy()
        # collect child-chapter links from the intro page's question block
        for i in soup.find('div', attrs={'class': 'question-content'}).find_all('a'):
            link = i.get_text()
            if link.strip() != 'Add a new chapter':
                #Band aid fix for replaceable text in the next chapter links
                for l in range(len(self.renames)):
                    link = link.replace(self.oldnames[l], self.renames[l])
                if any(x in ('epub', 'EPUB') for x in Common.opf):
                    self.epubtemp[0] += '\n<a href="' + str(
                        j) + '.xhtml">' + link.strip() + '</a>\n<br />'
                nextLink = '\n<a href="#' + str(
                    j) + '">' + 'Previous Chapter' + '</a>\n<br />'
                self.temp[0] += '\n<a href="#' + str(
                    j) + '">' + link.strip() + '</a>\n<br />'
                self.nextLinks.append(nextLink)
                urls.append(i.get('href'))
                j += 1
        self.Pages.extend(urls)
        j = 1
        for u in urls:
            if Common.mt and not self.partial:
                chapNum = int(
                    soup.find('p', attrs={
                        'class': 'meta'
                    }).get_text().split()[1])
                firstLinkId = None
                threading.Thread(
                    target=self.ThreadAdd,
                    args=(
                        u, j, self.renames, self.oldnames, chapNum,
                        '<a href="#Chapter 0">Previous Chapter</a>\n<br />',
                        '\n<a href="' + 'Chapter 1' + '.xhtml">'
                        + 'Previous Chapter' + '</a>\n<br />',
                        self.nextLinks[j - 1], firstLinkId, self.url),
                    daemon=True).start()
                #TODO
            else:
                if Common.mt:
                    Common.prnt(
                        'Warning: Cannot multithread partial Chyoa story: ' + self.url
                        + '\nUsing default method to download an unknown number of pages'
                    )
                self.AddNextPage(
                    u, j, 1,
                    '<a href="#Chapter 0">Previous Chapter</a>\n<br />',
                    '\n<a href="' + 'Chapter 1' + '.xhtml">'
                    + 'Previous Chapter' + '</a>\n<br />',
                    self.nextLinks[j - 1], None)
            j += 1
        if Common.mt and not self.partial:
            # wait for every worker thread, then assemble the collected pages
            i = int(numChapters) - 1
            print("Pages to add: " + str(i))
            while i > 0:
                #print(str(i))
                self.q.get()
                i -= 1
            #print(threading.active_count())
            for page in self.Pages:
                self.addPage(page)
    try:
        self.pbar.End()
    except:
        pass
    if self.backwards:
        self.epubtemp = self.temp.copy()
    #band-aid fix for names in chapter titles
    #WARNING DO NOT PUT THIS TO PRODUCTION
    for i in range(len(self.chapters)):
        for j in range(len(self.renames)):
            #print(self.chapters[i])
            self.chapters[i] = self.chapters[i].replace(
                self.oldnames[j], self.renames[j])
            #print(self.chapters[i])
    #TODO regular expressions go here
    # prepend a per-chapter author header and parse each fragment
    for i in range(len(self.temp)):
        self.temp[
            i] = '\n<h4>by ' + self.authors[i] + '</h4>' + self.temp[i]
        if any(x in ('epub', 'EPUB') for x in Common.opf):
            self.epubtemp[i] = '\n<h4>by ' + self.authors[
                i] + '</h4>' + self.epubtemp[i]
        self.rawstoryhtml.append(BeautifulSoup(self.temp[i], 'html.parser'))
        if any(x in ('epub', 'EPUB') for x in Common.opf):
            self.epubrawstoryhtml.append(
                BeautifulSoup(self.epubtemp[i], 'html.parser'))
    #print(self.rawstoryhtml[len(self.rawstoryhtml)-1].get_text())
    self.author = self.authors[0]
    #print(self.chapters)
    #replaces replaceable text in the story
    for i in self.rawstoryhtml:
        for j in range(len(self.renames)):
            for k in i.find_all(
                    'span', attrs={'class': 'js-immersion-receiver-c' + str(j)}):
                k.string = self.renames[j]
        self.story += self.chapters[self.rawstoryhtml.index(
            i)] + i.get_text()
        self.truestoryhttml.append(str(i))
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        for i in self.epubrawstoryhtml:
            for j in range(len(self.renames)):
                for l in i.find_all('span', attrs={
                        'class': 'js-immersion-receiver-c' + str(j)
                }):
                    l.string = self.renames[j]
            self.epubtruestoryhttml.append(str(i))
    # whitespace clean-up around the rewritten <span> elements
    for i in range(len(self.truestoryhttml)):
        self.truestoryhttml[i] = self.truestoryhttml[i].replace(
            '\n <span', '<span')
        self.truestoryhttml[i] = self.truestoryhttml[i].replace(
            '<span', ' <span')
        for j in self.renames:
            self.truestoryhttml[i] = self.truestoryhttml[i].replace(
                '\n ' + j + '\n', j)
        self.truestoryhttml[i] = self.truestoryhttml[i].replace(
            ' </span>\n ', '</span> ')
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        for i in range(len(self.epubtruestoryhttml)):
            self.epubtruestoryhttml[i] = self.epubtruestoryhttml[
                i].replace('\n <span', '<span')
            self.epubtruestoryhttml[i] = self.epubtruestoryhttml[
                i].replace('<span', ' <span')
            for j in self.renames:
                self.epubtruestoryhttml[i] = self.epubtruestoryhttml[
                    i].replace('\n ' + j + '\n', j)
            self.epubtruestoryhttml[i] = self.epubtruestoryhttml[
                i].replace(' </span>\n ', '</span> ')
    self.story = self.story.replace('\n', Common.lineEnding)
    # re-parse the cleaned fragments so downstream output uses the fixed HTML
    for i in range(0, len(self.truestoryhttml)):
        self.rawstoryhtml[i] = BeautifulSoup(self.truestoryhttml[i],
                                             'html.parser')
    if any(x in ('epub', 'EPUB') for x in Common.opf):
        for i in range(0, len(self.epubtruestoryhttml)):
            self.epubrawstoryhtml[i] = BeautifulSoup(
                self.epubtruestoryhttml[i], 'html.parser')
    # html output downloads the collected images up front
    if Common.images and self.hasimages and any(x in ('html', 'HTML')
                                                for x in Common.opf):
        for i in range(0, len(Common.urlDict[self.url])):
            Common.prnt("Getting image " + str(i + 1) + " at: "
                        + str(Common.urlDict[self.url][i]))
            try:
                Common.imageDL(self.title,
                               Common.urlDict[self.url][i],
                               i + 1,
                               size=len(Common.urlDict[self.url]))
            except urllib.error.HTTPError as FE:
                continue