def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Fill in *fic* with the (mostly hard-coded) metadata for The Waves Arisen.

    Title, author, rating, description and fandom are constants; only the
    chapter list, the publish/update dates and the word count are derived
    from the scraped page and its chapters.

    Args:
        fic: the fic record to populate; it is mutated and returned.
        html: raw HTML of the table-of-contents page.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: if no chapter urls can be extracted from ``html``.
    """
    html = html.replace('\r\n', '\n')

    # wooh hardcoding
    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")

    fic.title = 'The Waves Arisen'
    fic.ageRating = 'M'

    self.setAuthor(fic, 'wertifloke', 'https://wertifloke.wordpress.com/', str(2))

    # NOTE(review): the previous comment here credited
    # https://www.parahumans.net/about/ as the source of this text, which
    # looks like a copy-paste leftover from another adapter -- confirm.
    fic.description = ''' A young Naruto found refuge in the village library, and grew up smart, but by blood he is Ninja, and what place is there for curiosity and calculation in this brutal world of warring states? The Waves Arisen is a complete novel-length work of Rationalist Naruto Fanfiction. No prior knowledge of the Naruto universe is necessary to follow along. '''

    chapterUrls = self.getChapterUrls(html)
    if len(chapterUrls) < 1:
        # fail with a descriptive message instead of an IndexError below
        raise Exception('unable to find any chapters: {}'.format(fic.url))

    oldChapterCount = fic.chapterCount
    fic.chapterCount = len(chapterUrls)

    # TODO?
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    if fic.ficStatus is None or fic.ficStatus == FicStatus.broken:
        fic.ficStatus = FicStatus.ongoing

    # first chapter marks publication, last chapter marks the latest update
    fic.published = self.getChapterPublishDate(chapterUrls[0])
    fic.updated = self.getChapterPublishDate(chapterUrls[-1])

    # a grown chapter list invalidates the cached word count
    if oldChapterCount is None or fic.chapterCount > oldChapterCount:
        fic.wordCount = 0
    if fic.wordCount == 0:
        # the fic must exist before its chapters can be cached
        fic.upsert()
        for cid in range(1, fic.chapterCount + 1):
            c = fic.chapter(cid)
            c.cache()
            chtml = c.html()
            if chtml is not None:
                fic.wordCount += len(chtml.split())

    fic.add(Fandom.define('Naruto'))
    # TODO: chars/relationship?

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate *fic* from a XenForo thread page and store its chapters.

    The chapter list is resolved through a chain of fallbacks:

    1. the threadmarks listing page (plus reader pages for post bodies),
    2. the reader pages alone,
    3. every post by the author found by walking the whole thread.

    Each chapter post is then located (from the reader soups when present,
    otherwise by fetching the thread page that contains it), its content
    div is extracted, and title/url/html are stored per chapter.

    Args:
        fic: the fic record to populate; it is mutated and returned.
        wwwHtml: raw HTML of the thread's first page.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: when the title, author, a chapter post, a post's content
            or the updated date cannot be determined.
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(wwwHtml, 'html5lib')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?
    if fic.ficStatus is None or fic.ficStatus == FicStatus.broken:
        fic.ficStatus = FicStatus.ongoing

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ageRating = 'M'  # TODO?

    # grab title from <title> element
    titles = soup.find('head').find_all('title')
    if len(titles) != 1:
        raise Exception(f'error: cannot find title: {len(titles)}')
    ntitle = ''
    try:
        ntitle = titles[0].get_text()
    except Exception:
        # best effort: an unreadable <title> keeps any previous title
        pass  # TODO FIXME
    if fic.title is None or len(ntitle.strip()) > 0:
        fic.title = ntitle
    if len(self.titleSuffix) > 0 and fic.title.endswith(self.titleSuffix):
        fic.title = fic.title[:-len(self.titleSuffix)]
    fic.title = fic.title.strip()

    # determine author
    authorPost = self.getRealAuthorPost(fic)
    authorPostUsernames = authorPost.find_all('a', {'class': 'username'})
    if len(authorPostUsernames) < 1:
        raise Exception('error: unable to find author username')
    author = authorPostUsernames[0].get_text()
    auth_href = authorPostUsernames[0].get('href')
    authorUrl = urllib.parse.urljoin(self.baseUrl, auth_href)
    if not authorUrl.startswith(self.baseUrl):
        raise Exception('error: unknown username href format')
    # the author id is the path segment right after 'members/'
    authorId = authorUrl[len(self.baseUrl):]
    if not authorId.startswith('members/'):
        raise Exception(f'error: unknown author id format: {authorId}')
    authorId = authorId.split('/')[1]
    self.setAuthor(fic, author, authorUrl, authorId)

    if fic.description is None:
        # TODO?
        fic.description = htmlEscape(fic.title + ' by ' + fic.getAuthorName())

    # try grabbing reader version, fallback to full pages
    threadmarksHtml = None
    try:
        sep = '?' if self.baseUrl.find('?') < 0 else '&'
        url = f'{self.baseUrl}threads/{fic.localId}/threadmarks{sep}category_id=1'
        threadmarksHtml = self.scrapeLike(url)
        self.readerSoftScrape(fic)
    except Exception:
        # Only real errors trigger the fallback; KeyboardInterrupt and
        # SystemExit propagate instead of starting a full deep scrape.
        # note: we do this before the threadmarks check for old-style fics
        # soft scrape all thread pages to ensure we have everything
        self.deepSoftScrape(fic)

    postSoups: Dict[str, Any] = {}
    postUrls: List[str] = []
    chapterTitles = {}
    try:
        # scrape the threadmarks page, assuming there is one
        # (threadmarksHtml is None when the fetch above failed; that error
        # is caught below and routes us to the reader-page fallback)
        threadmarksSoup = BeautifulSoup(threadmarksHtml, 'html5lib')

        # attempt to extract a fic description
        threadmarkExtraInfo = threadmarksSoup.find(
            'div', {'class': 'threadmarkListingHeader-extraInfo'})
        if threadmarkExtraInfo is not None:
            bbWrapper = threadmarkExtraInfo.find('div', {'class': 'bbWrapper'})
            if bbWrapper is not None:
                desc = bbWrapper.decode_contents()
                descView = HtmlView(desc, markdown=False)
                fic.description = ''.join(
                    [f'<p>{l}</p>' for l in descView.text])

        # determine chapter count based on threadmarks
        threadmarkList = threadmarksSoup.find('div', {'class': 'threadmarkList'})
        threadmarks = None
        if threadmarkList is not None:
            threadmarks = threadmarkList.find_all(
                'li', {'class': 'threadmarkListItem'})
        else:
            # alternate markup: block-body container with li (or tr) rows
            threadmarkList = threadmarksSoup.find(
                'div', {'class': 'block-body--threadmarkBody'})
            if threadmarkList is None:
                raise Exception('error: unable to find threadmark menu')
            if threadmarkList.find(class_='fa-ellipsis-h') is not None:
                raise Exception('unable to handle elided threamdarks')
            threadmarks = threadmarkList.find_all('li')
            if len(threadmarks) == 0:
                threadmarks = threadmarkList.find_all('tr')
            util.logMessage(
                f'XenForo|new threadmarks count|{len(threadmarks)}')

        for threadmark in threadmarks:
            if threadmark.find(
                    'span', {'class': 'message-newIndicator'}) is not None:
                continue
            a = threadmark.find('a')
            purl = a.get('href')
            # normalize relative thread links against the site base url
            if purl.startswith('threads/'):
                purl = '{}{}'.format(self.baseUrl, purl)
            elif purl.startswith('/threads/'):
                purl = '{}{}'.format(self.baseUrl, purl[1:])
            postUrls.append(purl)
            # chapter titles are keyed by 1-based chapter id
            chapterTitles[len(postUrls)] = a.getText().strip()

        try:
            postSoups, _ = self.getReaderPosts(fic)
        except Exception as ie:
            # FIXME oh boy:
            # https://forum.questionablequesting.com/threads/worm-cyoa-things-to-do-in-brockton-bay-when-youre-a-bored-demigod.1247/reader
            # Reader page says 36 threadmarks, but actual threadmark list says 33
            # First reader page abruptly stops at 27 threadmarks
            util.logMessage(
                'XenForoAdapter: unable to getReaderPosts: {}\n{}'.format(
                    ie, traceback.format_exc()))
    except Exception as e:
        util.logMessage(
            'XenForoAdapter: unable to parse threadmarks: {}\n{}'.format(
                e, traceback.format_exc()))
        try:
            postUrls = self.getReaderPostUrls(fic)
            postSoups, chapterTitles = self.getReaderPosts(fic)
        except Exception as ie:
            util.logMessage(
                'XenForoAdapter: unable to parse reader posts: {}\n{}'.
                format(ie, traceback.format_exc()))
            postUrls = self.getDeepAuthorPostUrls(fic)
            # if we fallback to here, don't immediately setup postSoups at all;
            # they'll be fetched as needed later

    fic.chapterCount = len(postUrls)

    chapterPosts: List[Optional[str]] = []
    chapterUrls: List[str] = []
    chapterPostIds: List[str] = []

    # one-entry cache so consecutive posts on the same page reuse its soup
    lastSoupUrl: Optional[str] = None
    lastSoup: Optional[Any] = None

    for purl in postUrls:
        parts = purl.split('#')
        burl = parts[0]
        # a url without a fragment refers to the author's post
        postId = authorPost.get('id') if len(parts) < 2 else parts[1]

        rawPost = None
        # first try getting the post from the reader pages
        if postId in postSoups and postSoups[postId] is not None:
            rawPost = str(postSoups[postId])
        else:
            # if needed, fallback to grabbing that page from the entire thread
            pageSoup = None
            if lastSoupUrl is not None and lastSoupUrl == burl:
                pageSoup = lastSoup
            else:
                pageContent = self.scrapeLike(burl)
                pageSoup = BeautifulSoup(pageContent, 'html5lib')
                lastSoupUrl = burl
                lastSoup = pageSoup
            assert (pageSoup is not None)
            if postId is not None:
                poss = pageSoup.find_all(self.postContainer, {'id': postId})
                if len(poss) != 1:
                    # XenForo2 often has js- prefixed on the actual id attr
                    poss = pageSoup.find_all(
                        self.postContainer, {'id': 'js-' + postId})
                if len(poss) != 1:
                    raise Exception(
                        f'error: cannot find post for chapter {postId}')
                rawPost = str(poss[0])
            else:
                rawPost = str(
                    pageSoup.find_all(self.postContainer, {'class': 'message'})[0])

        chapterPosts.append(rawPost)
        chapterUrls.append(burl)
        chapterPostIds.append(postId)

    # word count and dates are recomputed from scratch on every parse
    fic.wordCount = 0
    fic.published = None
    fic.updated = None

    chapterContents: List[str] = []
    for rawPost in chapterPosts:
        post = BeautifulSoup(rawPost, 'html5lib')
        content = post.find_all(
            'div', {'class': ['messageContent', 'message-content']})
        if len(content) != 1:
            raise Exception('error: cannot find content for chapter post')
        content = content[0]

        # put a <br> in front of every "last edited" footer inside the content
        lastEditedDivs = content.find_all('div', {'class': 'message-lastEdit'})
        for lastEditedDiv in lastEditedDivs:
            br = soup.new_tag("br")
            lastEditedDiv.insert_before(br)

        chapterContents.append(str(content))
        fic.wordCount += len(str(content).split())

        uts = self.getPostUpdatedOrPublished(post)

        # first post sets published; every post overwrites updated
        if fic.published is None:
            fic.published = OilTimestamp(uts)
        fic.updated = OilTimestamp(uts)

    if fic.updated is None:
        raise Exception(
            f'unable to determine updated date: {len(chapterPosts)} {len(postUrls)}'
        )

    fic.upsert()
    for cid in range(fic.chapterCount):
        chapter = fic.chapter(cid + 1)
        chapter.url = chapterUrls[cid]
        chapter.localChapterId = chapterPostIds[cid]
        if (cid + 1) in chapterTitles:
            chapter.title = chapterTitles[(cid + 1)]
        chapter.upsert()

        chapter.setHtml(str(chapterContents[cid]))

    # TODO: word count, published, updated can only be found once all chapters

    # each post is inside an li id="post-{number}" class="message"
    # each post has data-author="{author}"

    self.updateTitle(fic)

    return fic
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Populate *fic* from a Sugar Quill chapter page and the author profile.

    The chapter page supplies the author link, the story title and the
    chapter dropdown. The author's profile page is then scraped for the
    story's description, review count and last-updated date. Every chapter
    is cached to compute the total word count.

    Args:
        fic: the fic record to populate; it is mutated and returned.
        html: raw HTML of a chapter page of the story.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: when the info pane, author, title, story table, review
            count or last-updated date cannot be located.
    """
    from bs4 import BeautifulSoup
    html = html.replace('\r\n', '\n')
    soup = BeautifulSoup(html, 'html.parser')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    infoPane = soup.findAll('td', {'class': 'info2_pane'})
    if len(infoPane) != 1:
        raise Exception('unable to find info2_pane: {}'.format(fic.url))
    infoPane = infoPane[0]

    authorHrefPrefix = 'index.php?action=profile&id='
    authorUrl = None
    for authorLink in infoPane.findAll('a'):
        # anchors without an href (e.g. named anchors) are simply skipped
        href = authorLink.get('href') or ''
        if not href.startswith(authorHrefPrefix):
            continue

        authorUrl = self.baseUrl + '/' + href
        author = authorLink.getText()
        authorLocalId = href[len(authorHrefPrefix):]

        self.setAuthor(fic, author, authorUrl, authorLocalId)
        break
    else:
        raise Exception('unable to find author: {}'.format(fic.url))

    # the title is the raw markup between the "Story" and "Chapter" labels
    titleMatch = re.search(
        '<b>Story</b>:((.|\r|\n)*)<b>Chapter</b>:', str(infoPane), re.MULTILINE
    )
    if titleMatch is None:
        edumpContent(str(infoPane), 'sugarquill_title')
        raise Exception('could not locate title')

    # normalize non-breaking spaces before trimming
    fic.title = titleMatch.group(1).replace('\xa0', ' ').strip()

    chapterOptions = infoPane.findAll('option')
    chapterTitles = {}
    for chapterOption in chapterOptions:
        cid = int(chapterOption.get('value'))
        chapterTitles[cid] = chapterOption.getText().strip()

    fic.chapterCount = len(chapterOptions)

    fic.ageRating = '<unkown>'  # TODO
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ficStatus = FicStatus.ongoing  # TODO: no uniform way to detect?

    authorProfileHtml = scrape.scrape(authorUrl)['raw']
    authorProfileHtml = authorProfileHtml.replace('\r', '')
    authorSoup = BeautifulSoup(authorProfileHtml, 'html5lib')

    storyHrefPrefix = 'read.php?storyid='
    storyTables = authorSoup.findAll('table', {'width': '90%'})
    ourStoryTable = None
    for storyTable in storyTables:
        storyId = None
        for a in storyTable.findAll('a'):
            href = a.get('href') or ''
            if not href.startswith(storyHrefPrefix):
                continue
            # Keep everything up to the first '&'. Slicing at find('&')
            # would cut off the last digit when there is no '&' at all,
            # because find() returns -1.
            storyId = href[len(storyHrefPrefix):].split('&', 1)[0]
            storyId = str(int(storyId))
        if storyId is None:
            continue
        if storyId != str(fic.localId):
            continue
        ourStoryTable = storyTable
    if ourStoryTable is None:
        raise Exception(f'unable to find story table: {fic.localId} {authorUrl}')

    # expected rows: [0] header with reviews, [1] description, [2] dates
    trs = ourStoryTable.findAll('tr')
    if len(trs) != 3:
        raise Exception(
            f'ourStoryTable does not have 3 trs: {fic.localId} {authorUrl}'
        )
    fic.description = trs[1].find('td').getText().strip()

    reviewsMatch = re.search(
        r'\( Reviews: <a[^>]*>(\d+)</a> \)</td>', str(trs[0]), re.MULTILINE
    )
    if reviewsMatch is None:
        edumpContent(str(trs[0]), 'sugarquill_reviews')
        raise Exception('could not locate reviews')
    fic.reviewCount = int(reviewsMatch.group(1).strip())

    updatedMatch = re.search(r'Last updated (\d+/\d+/\d+)', str(trs[2]))
    if updatedMatch is None:
        edumpContent(str(trs[2]), 'sugarquill_updated')
        raise Exception('could not locate last updated')
    fic.updated = OilTimestamp(
        util.parseDateAsUnix(updatedMatch.group(1), fic.fetched)
    )
    # no publish date is parsed here; fall back to the last update
    if fic.published is None:
        fic.published = fic.updated

    fic.wordCount = 0
    fic.upsert()
    for cid in range(fic.chapterCount):
        ch = fic.chapter(cid + 1)
        ch.localChapterId = str(cid + 1)
        ch.title = chapterTitles[cid + 1]
        ch.cache()
        ch.upsert()
        chtml = ch.html()
        if chtml is not None:
            fic.wordCount += len(chtml.split())

    fic.add(Fandom.define('Harry Potter'))
    # TODO: chars/relationship?

    return fic
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Populate *fic* with metadata parsed from an AO3 work page.

    Extracts title, summary, chapter/word/kudos stats, dates, status and
    author. For multi-chapter works it creates one chapter record per entry
    of the chapter dropdown. Fandom tags are mapped to canonical fandoms;
    when exactly one fandom results, relationship tags are split into
    characters of that fandom.

    Args:
        fic: the fic record to populate; it is mutated and returned.
        html: raw HTML of the work page.

    Returns:
        The same ``fic`` instance.

    Raises:
        Exception: when the title, a required stat or the status label
            cannot be parsed, when the chapter dropdown disagrees with the
            chapter count, or when a relationship tag has too many members.
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    def ddText(cls: str) -> str:
        # get_text() flattens nested markup (e.g. a link inside the stat),
        # which joining the raw .contents cannot do
        dd = soup.find('dd', {'class': cls})
        if dd is None:
            raise Exception('unable to find ao3 {}: {}'.format(cls, fic.url))
        return dd.get_text().strip()

    def toInt(text: str) -> int:
        # tolerate thousands separators such as "12,345"
        return int(text.replace(',', '').strip())

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    titleHeadings = soup.findAll('h2', {'class': 'title heading'})
    if len(titleHeadings) != 1:
        raise Exception('unable to find ao3 title {}'.format(fic.url))
    fic.title = titleHeadings[0].get_text().strip()

    summaryModules = soup.findAll('div', {'class': 'summary module'})
    if len(summaryModules) != 1:
        # several summaries on the page: prefer the one inside the preface
        prefaceGroups = soup.findAll('div', {'class': 'preface group'})
        if len(prefaceGroups) == 1:
            summaryModules = prefaceGroups[0].findAll(
                'div', {'class': 'summary module'}
            )

    if len(summaryModules) == 1:
        summaryBq = summaryModules[0].find('blockquote')
        fic.description = summaryBq.decode_contents(formatter='html').strip()
    elif fic.description is None:
        fic.description = "{no summary}"
        # raise Exception('unable to find ao3 summary {}'.format(fic.localId))

    fic.ageRating = '<unkown>'

    # chapters stat looks like "<completed>/<total>", total may be "?"
    ps = ddText('chapters').split('/')
    completedChapters = toInt(ps[0])
    # Parsed only to validate the stat. The status is taken from the
    # status label below, so a redundant "ongoing" assignment that used
    # this value was dropped.
    totalChapters = None if ps[1].strip() == '?' else toInt(ps[1])
    fic.chapterCount = completedChapters

    fic.wordCount = toInt(ddText('words'))

    fic.reviewCount = 0

    fic.favoriteCount = 0
    kDefinition = soup.find('dd', {'class': 'kudos'})
    if kDefinition is not None:
        fic.favoriteCount = toInt(kDefinition.get_text())

    fic.followCount = 0

    publishedUts = util.parseDateAsUnix(ddText('published'), fic.fetched)
    fic.published = OilTimestamp(publishedUts)

    if fic.updated is None:
        fic.updated = fic.published
    if fic.updated is not None:
        # NOTE(review): this hands the current fic.updated value (possibly
        # an OilTimestamp) to parseDateAsUnix -- kept as-is, confirm intent.
        updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    fic.ficStatus = FicStatus.ongoing  # TODO chapter/chapters?

    statusDt = soup.find('dt', {'class': 'status'})
    if statusDt is not None:
        statusLabel = statusDt.contents[0]
        if statusLabel == 'Completed:':
            fic.ficStatus = FicStatus.complete
        elif statusLabel == 'Updated:':
            fic.ficStatus = FicStatus.ongoing
        else:
            raise Exception('unkown status: {}'.format(statusLabel))
        # both labels carry the date of the latest change
        updatedUts = util.parseDateAsUnix(ddText('status'), fic.fetched)
        fic.updated = OilTimestamp(updatedUts)

    byline = soup.find('h3', {'class': 'byline heading'})
    authorLink = byline.find('a')
    if authorLink is None:
        if fic.authorId is not None and len(fic.getAuthorName()) > 0:
            pass  # updated author to anon, don't make changes
        else:
            # first loaded after it was already set to anonymous
            authorUrl = ''
            author = 'Anonymous'
            authorId = 'Anonymous'
            self.setAuthor(fic, author, authorUrl, authorId)
    else:
        authorUrl = authorLink.get('href')
        author = ' '.join(authorLink.contents)
        authorId = author  # map pseudo to real?
        self.setAuthor(fic, author, authorUrl, authorId)

    if fic.chapterCount > 1:
        fic.upsert()
        localChapterIdSelect = soup.find(id='selected_id').findAll('option')
        # note: ao3 sometimes says there are less chapters than there really
        # are, possibly due to caching on their end. We just ensure there's _at
        # least_ chapterCount chapters, then fetch whatever the dropdown tells
        # us to
        if len(localChapterIdSelect) > fic.chapterCount:
            fic.chapterCount = len(localChapterIdSelect)
            fic.upsert()
        if len(localChapterIdSelect) != fic.chapterCount:
            raise Exception('mismatching localChapterId count?')

        for cid in range(1, fic.chapterCount + 1):
            option = localChapterIdSelect[cid - 1]
            chap = fic.chapter(cid)
            chap.url = '{}{}/chapters/{}?view_adult=true'.format(
                self.baseUrl, fic.localId, option.get('value')
            )
            chap.localChapterId = option.get('value')
            chap.title = option.getText().strip()
            if chap.title is not None:
                chap.title = util.cleanChapterTitle(chap.title, cid)
            chap.upsert()

    # Lower-cased AO3 fandom tag -> canonical fandom names, added in order.
    # An empty tuple means "known tag, deliberately ignored".
    # TODO: this seriously needs reworked
    knownFandomTags = {
        'harry potter - fandom': ('Harry Potter',),
        'fantastic beasts and where to find them (movies)': ('Harry Potter',),
        'harry potter next generation - fandom': ('Harry Potter',),
        'sherlock - fandom': ('Sherlock Holmes',),
        'sherlock (tv)': ('Sherlock Holmes',),
        'sherlock holmes & related fandoms': ('Sherlock Holmes',),
        'sherlock holmes - arthur conan doyle': ('Sherlock Holmes',),
        'sherlock holmes (downey films)': ('Sherlock Holmes',),
        'furry (fandom)': (),  # skip
        'harry - fandom': (),  # skip
        'fleurmione - fandom': (),  # skip
        'skyfall (2012) - fandom': ('James Bond',),
        'orphan black (tv)': ('Orphan Black',),
        'naruto': ('Naruto',),
        'naruto shippuden': ('Naruto',),
        'naruto shippuuden - fandom': ('Naruto',),
        'naruto/harry potter': ('Naruto', 'Harry Potter'),
        'bleach': ('Bleach',),
        'iron man (movies)': ('Iron Man',),
        'iron man - all media types': ('Iron Man',),
        'iron man (comic)': ('Iron Man',),
        'iron man - fandom': ('Iron Man',),
        'iron man (comics)': ('Iron Man',),
        'the avengers (marvel) - all media types': ('Avengers',),
        'the avengers (marvel movies)': ('Avengers',),
        'the avengers - ambiguous fandom': ('Avengers',),
        'the avengers (2012)': ('Avengers',),
        'the avengers': ('Avengers',),
        'avengers (marvel) - all media types': ('Avengers',),
        'marvel avengers movies universe': ('Avengers',),
        'avengers': ('Avengers',),
        'marvel 616': ('Marvel', 'Marvel 616'),
        'thor (movies)': ('Thor',),
        'thor - all media types': ('Thor',),
        'captain america (movies)': ('Captain America',),
        'captain america - all media types': ('Captain America',),
        'captain america (comics)': ('Captain America',),
        'avatar: the last airbender': ('Avatar',),
        'avatar: legend of korra': ('Avatar',),
        'avatar the last airbender - fandom': ('Avatar',),
        'original work': ('Original Work',),
        'stargate atlantis': ('Stargate Atlantis',),
        'stargate sg-1': ('Stargate SG-1',),
        'stargate - all series': ('Stargate Atlantis', 'Stargate SG-1'),
        'agents of s.h.i.e.l.d. (tv)': ('Avengers',),
        'supernatural': ('Supernatural',),
        'teen wolf (tv)': ('Teen Wolf',),
        'grimm (tv)': ('Grimm',),
        'the amazing spider-man (movies - webb)': ('Spiderman',),
        'spider-man - all media types': ('Spiderman',),
        'spider-man: homecoming (2017)': ('Spiderman',),
        'x-men - all media types': ('X-Men',),
        'x-men (movieverse)': ('X-Men',),
        'x-men (comicverse)': ('X-Men',),
        'lord of the rings - j. r. r. tolkien': ('Lord of the Rings',),
        'the lord of the rings - j. r. r. tolkien': ('Lord of the Rings',),
        'crisis core: final fantasy vii': (
            'Final Fantasy VII', 'Final Fantasy'),
        'compilation of final fantasy vii': (
            'Final Fantasy VII', 'Final Fantasy'),
        'final fantasy vii': ('Final Fantasy VII', 'Final Fantasy'),
        'sen to chihiro no kamikakushi | spirited away': ('Spirited Away',),
        "howl no ugoku shiro | howl's moving castle": (
            "Howl's Moving Castle",),
        'rise of the guardians (2012)': ('Rise of the Guardians',),
        'doctor who': ('Doctor Who',),
        'doctor who (2005)': ('Doctor Who',),
        'doctor who & related fandoms': ('Doctor Who',),
        'daredevil (tv)': ('DareDevil',),
        'daredevil (comics)': ('DareDevil',),
        'labyrinth (1986)': ('Labyrinth',),
        'gravity falls': ('Gravity Falls',),
        'once upon a time (tv)': ('Once Upon a Time',),
        'doctor strange (comics)': ('Doctor Strange',),
        'the sentinel': ('The Sentinel',),
        'teen titans (animated series)': ('Teen Titans',),
        'dcu': ('DC',),
        'dcu animated': ('DC',),
        'dcu (comics)': ('DC',),
        'dc extended universe': ('DC',),
        'dc animated universe': ('DC',),
        'vampire hunter d': ('Vampire Hunter D',),
        'homestuck': ('Homestuck',),
        'one piece': ('One Piece',),
        'batman (movies - nolan)': ('Batman',),
        'die hard (movies)': ('Die Hard',),
        'discworld - terry pratchett': ('Discworld',),
        'gossip girl': ('Gossip Girl',),
        'a song of ice and fire - george r. r. martin': (
            'A Song of Ice and Fire',),
        'a song of ice and fire & related fandoms': (
            'A Song of Ice and Fire',),
        'supergirl (tv 2015)': ('Supergirl',),
        'merlin (tv)': ('Merlin',),
        'star trek': ('Star Trek',),
        'steven universe (cartoon)': ('Steven Universe',),
        'hellsing': ('Hellsing',),
        'the breaker': ('The Breaker',),
        'smallville': ('Smallville',),
        '베리타스 | veritas (manhwa)': ('Veritas (manhwa)',),
        'guardians of childhood - william joyce': ('Guardians of Childhood',),
        'person of interest (tv)': ('Person of Interest',),
        'james bond (craig movies)': ('James Bond',),
        'the bourne legacy (2012)': ('Jason Bourne',),
        'numb3rs': ('Numb3rs',),
        'temeraire - naomi novik': ('Temeraire',),
        'twilight series - stephenie meyer': ('Twilight',),
        'dungeons and dragons - fandom': ('Dungeons and Dragons',),
        'american horror story': ('American Horror Story',),
        'american horror story: cult': ('American Horror Story',),
        'worm (web serial novel)': ('Worm',),
        'worm - wildbow': ('Worm',),
        'parahumans series - wildbow': ('Worm',),
        'worm (web serial) | wildbow': ('Worm',),
        'worm - fandom': ('Worm',),
        'parahumans - fandom': ('Worm',),
        'worm (parahumans)': ('Worm',),
        'worm (web serial)': ('Worm',),
        'worm | parahumans': ('Worm',),
        'worm (web novel)': ('Worm',),
        'toaru kagaku no railgun | a certain scientific railgun': (
            'A Certain Scientific Railgun',),
        'toaru majutsu no index | a certain magical index': (
            'A Certain Magical Index',),
        'cthulhu mythos - h. p. lovecraft': ('Cthulhu',),
        'transformers - all media types': ('Transformers',),
        'destiny (video game)': ('Destiny',),
        'fandom - fandom': (),  # >_>
        'meta - fandom': (),  # >_>
        'house m.d.': ('House, M.D.',),
        'the hobbit (jackson movies)': ('The Hobbit',),
        'doctor strange (2016)': ('Doctor Strange',),
        'arrow (tv 2012)': ('Arrow',),
        'the flash (tv 2014)': ('Flash',),
        'senki zesshou symphogear': ('Symphogear',),
        'fullmetal alchemist: brotherhood & manga': ('Fullmetal Alchemist',),
        'fullmetal alchemist - all media types': ('Fullmetal Alchemist',),
        'fullmetal alchemist (anime 2003)': ('Fullmetal Alchemist',),
        'star wars - all media types': ('Star Wars',),
        'star wars episode vii: the force awakens (2015)': ('Star Wars',),
        'star wars prequel trilogy': ('Star Wars',),
        'guardians of the galaxy (2014)': ('Guardians of the Galaxy',),
        'guardians of the galaxy - all media types': (
            'Guardians of the Galaxy',),
        'guardians of the galaxy (movies)': ('Guardians of the Galaxy',),
        'ant man (2015)': ('Ant Man',),
        'ant-man (movies)': ('Ant Man',),
        'the defenders (marvel tv)': ('The Defenders',),
        'elementary (tv)': ('Elementary',),
        'good omens - neil gaiman & terry pratchett': ('Good Omens',),
        'danny phantom': ('Danny Phantom',),
        'katekyou hitman reborn!': ('Katekyo Hitman Reborn!',),
        'welcome to night vale': ('Welcome to Night Vale',),
        'ncis': ('NCIS',),
        'torchwood': ('Torchwood',),
        'magic: the gathering': ('Magic: The Gathering',),
        'overwatch (video game)': ('Overwatch',),
        'detroit: become human (video game)': ('Detroit: Become Human',),
        'greek and roman mythology': (),
        'life is strange (video game)': ('life is strange (video game)',),
        'akatsuki no yona | yona of the dawn': ('Yona of the Dawn',),
        '僕のヒーローアカデミア | boku no hero academia | my hero academia': (
            'My Hero Academia',),
        'voltron: legendary defender': ('Voltron',),
        'selfie (tv)': ('Selfie',),
        'suits (tv)': ('Suits',),
        'fruits basket': ('Fruits Basket',),
        'hetalia: axis powers': ('Hetalia: Axis Powers',),
        'carmilla (web series)': ('Carmilla',),
        'the dresden files - jim butcher': ('Dresden Files',),
        'girl genius': ('Girl Genius',),
        'unspecified fandom': (),  # TODO?
        'nightwing (comics)': ('Nightwing',),
        'books of the raksura - martha wells': ('Books of the Raksura',),
        'fall of ile-rien - martha wells': ('Fall of Ile-Rien',),
        'vorkosigan saga - lois mcmaster bujold': ('Vorkosigan Saga',),
        'highlander: the series': ('Highlander',),
        'highlander - all media types': ('Highlander',),
        'yoroiden samurai troopers | ronin warriors': ('Ronin Warriors',),
        'hockey rpf': ('Hockey RPF',),
        'pacific rim (2013)': ('Pacific Rim',),
        'enchanted forest chronicles - patricia wrede': (
            'Enchanted Forest Chronicles',),
        'tortall - tamora pierce': ('Tortall',),
        'protector of the small - tamora pierce': ('Protector of the Small',),
        'leverage': ('Leverage',),
        'valdemar series - mercedes lackey': ('Valdemar Series',),
        'b.p.r.d.': ('B.P.R.D.',),
        'bureau for paranormal research and defense': ('B.P.R.D.',),
        'hellboy (comic)': ('Hellboy',),
        'sga/avatar': ('Stargate Atlantis', 'Avatar'),
        'annihilation (2018 garland)': ('Annihilation',),
        'craft sequence - max gladstone': ('Craft Sequence',),
        'the good place (tv)': ('The Good Place',),
        'jessica jones (tv)': ('Jessica Jones',),
        'mad max series (movies)': ('Mad Max',),
        'american gods (tv)': ('American Gods',),
        'terminator: the sarah connor chronicles': (
            'Terminator: The Sarah Connor Chronicles', 'Terminator'),
        'wolf 359 (radio)': ('Wolf 359',),
        'shadowrun: dragonfall': ('Shadowrun',),
        'ars paradoxica (podcast)': ('Ars Paradoxica',),
        'love is strange - fandom': ('Love is Strange',),
        'dune - all media types': ('Dune',),
        'dragon age: origins': ('Dragon Age: Origins',),
        'game of thrones (tv)': ('Game of Thrones',),
        'chronicles of amber - roger zelazny': ('Chronicles of Amber',),
        'the southern reach trilogy - jeff vandermeer': (
            'The Southern Reach Trilogy',),
        'continuum (tv)': ('Continuum',),
        'mage: the ascension': ('Mage: The Ascension',),
        'the good wife (tv)': ('The Good Wife',),
        'good wife (tv)': ('The Good Wife',),
        'alliance-union - c. j. cherryh': ('Alliance-Union',),
        'indexing - seanan mcguire': ('Indexing',),
        'ultraviolet (tv)': ('Ultraviolet',),
        'veronica mars (tv)': ('Veronica Mars',),
        'secret circle (tv)': ('Secret Circle',),
        'mahou shoujo madoka magika | puella magi madoka magica': (
            'Madoka Magica',),
        'agent carter (tv)': ('Agent Carter',),
        'dracula & related fandoms': ('Dracula',),
        'dragon ball': ('Dragon Ball',),
        'mass effect - all media types': ('Mass Effect',),
        'firefly': ('Firefly',),
        'serenity (2005)': ('Firefly',),
    }

    fandomDd = soup.find('dd', {'class': 'fandom tags'})
    if fandomDd is not None:
        fandomTags = fandomDd.findAll('a', {'class': 'tag'})
        for ft in fandomTags:
            originalF = ft.contents[0].strip()
            f = originalF.lower()

            # pattern rule first: "harry potter ... rowling" style tags
            if f.startswith("harry potter ") and f.endswith("rowling"):
                fic.add(Fandom.define('Harry Potter'))
                continue

            if f in knownFandomTags:
                for mapped in knownFandomTags[f]:
                    fic.add(Fandom.define(mapped))
                continue

            # fall back to the module-level ao3FandomsMap; each entry pairs
            # a list of tag spellings with the fandoms they map to
            anyHere = False
            for fm in ao3FandomsMap:
                if not any(f == uf.lower().strip() for uf in fm[0]):
                    continue
                anyHere = True
                for mf in fm[1]:
                    fic.add(Fandom.define(mf))
            if not anyHere:
                util.logMessage(f'ao3|unknown fandom|{fic.url}|{originalF}')
                #raise Exception('unknown fandom: {} "{}"'.format(fic.url, originalF))

    ourDoms = fic.fandoms()
    # we have a canonical fandom, try to find our characters
    if len(ourDoms) == 1:
        relationshipDd = soup.find('dd', {'class': 'relationship tags'})
        if relationshipDd is not None:
            relationshipTags = relationshipDd.findAll('a', {'class': 'tag'})
            for rt in relationshipTags:
                r = rt.contents[0]
                chars = r.split('/')
                if len(chars) > 8:  # TODO: sometimes more?
                    raise Exception('unable to parse relationship: {}'.format(r))
                for char in chars:
                    fic.add(Character.defineInFandom(ourDoms[0], char, self.ftype))

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate `fic` from a story info page (currently disabled).

    The first statement raises unconditionally because the site's markup
    changed, so nothing below it runs. The remainder is the previous parser,
    kept so it can be ported to the new format.

    Args:
        fic: the fic record to fill in and persist.
        wwwHtml: raw HTML of the story page.

    Returns:
        The same `fic` object (once the parser is re-enabled).

    Raises:
        Exception: always, until the parser is updated for the new format.
    """
    raise Exception('FIXME TODO fanfics me format has changed')

    from bs4 import BeautifulSoup  # type: ignore
    soup = BeautifulSoup(wwwHtml, 'html5lib')

    ficHead = soup.find('div', {'class': 'FicHead'})

    titleH1 = ficHead.find('h1')
    fic.title = titleH1.getText().strip()

    fandoms: List[str] = []
    trs = ficHead.findAll('div', {'class': 'tr'})
    author = None
    for tr in trs:
        divTitle = tr.find('div', {'class': 'title'})
        divContent = tr.find('div', {'class': 'content'})

        t = str(divTitle.getText()).strip()
        v = str(divContent.getText()).strip()

        if t == 'Автор:':  # author
            author = v
        elif t == 'Фандом:':  # fandom
            if v == 'Harry Potter' or v == 'Harry Potter - J. K. Rowling':
                fandoms += ['Harry Potter']
            else:
                raise Exception('unknown fandom: ' + v)
        elif t == 'Статус:':  # status
            if v == 'В процессе':  # in progress
                fic.ficStatus = FicStatus.ongoing
            elif v == 'Закончен':  # finished
                fic.ficStatus = FicStatus.complete
            else:
                raise Exception('unknown write status: ' + v)
        elif t == 'Опубликован:':  # published
            fic.published = self.parseRussianDate(v)
        elif t == 'Изменен:':  # changed
            fic.updated = self.parseRussianDate(v)
        elif t == 'Ссылка:':
            # source archive url; recognised but not stored anywhere
            pass
        elif t == 'Читателей:':  # readers
            fic.followCount = int(v)
        elif t == 'Персонажи:':
            # characters, parse relationship?
            pass
        elif t == 'Рейтинг:':  # rating
            fic.ageRating = v
        elif t == 'Предупреждения:':
            # warnings?
            pass
        else:
            raise Exception('unknown metadata: ' + t)

    # TODO?
    assert (author is not None)
    authorUrl = author
    authorId = author
    self.setAuthor(fic, author, authorUrl, authorId)

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    if fic.url is None:
        fic.url = self.constructUrl(fic.localId)

    summaryTextDiv = soup.find('div', {'class': 'summary_text'})
    if summaryTextDiv is None:
        summaryTextDiv = soup.find('div', {'class': 'summary_text_fic3'})
    fic.description = summaryTextDiv.getText()

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    if fic.followCount is None:
        fic.followCount = 0

    # Only default the rating: it used to be overwritten unconditionally,
    # discarding the value parsed from the metadata table above.
    if fic.ageRating is None:
        fic.ageRating = 'M'

    ficContentsUl = soup.find('ul', {'class': 'FicContents'})
    chapterLinks = ficContentsUl.findAll('li', {'class': 't-b-dotted'})
    fic.chapterCount = len(chapterLinks)

    if fic.wordCount is None:
        fic.wordCount = 0
    fic.upsert()

    wordCount = 0
    for cid in range(1, fic.chapterCount + 1):
        chapter = fic.chapter(cid)
        chapter.localChapterId = str(cid)
        chapter.url = self.constructUrl(fic.localId, cid)

        # chapter content divs are looked up by a zero-based id: c0, c1, ...
        contentId = 'c{}'.format(cid - 1)

        # try to get it out of current blob first
        if chapter.html() is None:
            contentDiv = soup.find('div', {'id': contentId})
            if contentDiv is not None:
                chapter.setHtml(
                    '<div class="ReadContent">' + str(contentDiv) + '</div>'
                )

        if chapter.title is None or len(chapter.title) < 1:
            contentDiv = soup.find('div', {'id': contentId})
            if contentDiv is not None:
                chapterTitle = contentDiv.previous_sibling
                if chapterTitle is not None and chapterTitle.name == 'h2':
                    chapter.title = chapterTitle.getText()

        # fallback to scraping it directly
        if chapter.html() is None:
            cdata = scrape.softScrape(chapter.url)
            assert (cdata is not None)
            chapter.setHtml(self.extractContent(fic, cdata))
            csoup = BeautifulSoup(cdata, 'html5lib')
            contentDiv = csoup.find('div', {'id': contentId})
            # guard like the in-page lookup above: the div may be absent
            chapterTitle = (
                None if contentDiv is None else contentDiv.previous_sibling
            )
            if chapterTitle is not None and chapterTitle.name == 'h2':
                chapter.title = chapterTitle.getText()

        if chapter.title is not None and len(chapter.title) > 0:
            chapter.title = util.cleanChapterTitle(chapter.title, cid)

        chapter.upsert()
        wordCount += len(chapter.cachedContent().split())

    fic.wordCount = wordCount

    for fandom in fandoms:
        fic.add(Fandom.define(fandom))

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate `fic` and its chapter stubs from a story index page.

    `fic.localId` is expected to look like ``<author>/<story>``. Title and
    author come from the ``page-header`` div; summary, status, rating,
    chapter count and word count come from the ``well`` div; per-chapter
    review counts and upload dates come from the ``<p>`` siblings of that
    div.

    Args:
        fic: the fic record to fill in and persist.
        wwwHtml: raw HTML of the story index page.

    Returns:
        The same `fic` object, after `upsert()`.

    Raises:
        Exception: if the status is anything other than 'In progress', or
            a chapter link does not match the expected URL pattern.
    """
    from bs4 import BeautifulSoup
    authorLid = fic.localId.split('/')[0]
    storyLid = fic.localId.split('/')[1]

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    fic.url = self.constructUrl(fic.localId)

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ageRating = 'M'

    soup = BeautifulSoup(wwwHtml, 'html5lib')

    pageHeader = soup.find('div', {'class': 'page-header'})
    titleH2 = pageHeader.find('h2')
    fic.title = titleH2.getText().strip()

    authorLink = pageHeader.find('a')
    author = authorLink.getText().strip()
    authorId = authorLid
    authorUrl = self.baseStoryUrl.format(authorLid, 'contact/')
    self.setAuthor(fic, author, authorUrl, authorId)

    divWell = soup.find('div', {'class': 'well'})

    summaryQuote = divWell.find('blockquote')

    # Collapse every run of spaces/tabs/newlines into one space. The previous
    # find/replace loop swapped a single space for itself, so it could never
    # terminate once the summary contained a space.
    fic.description = re.sub(
        r'[ \t\r\n]+', ' ', str(summaryQuote.getText())
    ).strip()

    divWellText = divWell.getText().strip()

    match = re.search(r'Status:\s*([^-]*) -', divWellText)
    if match is not None and match.group(1) == 'In progress':
        fic.ficStatus = FicStatus.ongoing
    else:
        raise Exception('unable to find fic status')

    RegexMatcher(
        divWellText, {
            'ageRating': (r'Rating\s*:\s+([^-]+) -', str),
            'chapterCount': (r'Chapters\s*:\s+(\d+) -', int),
            'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
        }
    ).matchAll(fic)
    assert (fic.chapterCount is not None)

    # The word count is matched as text ("12,345"); always normalise it to an
    # int rather than only when it happens to contain a comma.
    if fic.wordCount is not None:
        fic.wordCount = int(str(fic.wordCount).replace(',', ''))

    wellParent = divWell.parent
    cid = 0
    reviewCount = 0
    chapterDates: List[int] = []

    for child in wellParent.children:
        if child.name != 'p':
            continue
        # NOTE(review): cid advances for every <p>, even ones that turn out
        # not to be chapter entries — confirm this matches the page layout.
        cid += 1
        if str(child).find('Chapter {}'.format(cid)) == -1:
            continue
        chapterLink = child.find('a')
        expectedUrl = '/{}/Chapter_{}/'.format(storyLid, cid).lower()
        if chapterLink.get('href').lower() != expectedUrl:
            raise Exception(
                'unexpected chapter url: ' + chapterLink.get('href')
            )
        chInfo = ChapterInfo()

        RegexMatcher(
            child.getText(), {
                'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
                'reviewCount': (r'Reviews\s*:\s+([^-]+) -', int),
                'updated': (r'Uploaded on\s*:\s+(.+)', str),
            }
        ).matchAll(chInfo)
        assert (chInfo.updated is not None)

        # The per-chapter word count is still matched above, but no longer
        # summed: the running total was never used afterwards.
        reviewCount += chInfo.reviewCount

        dt = util.parseDateAsUnix(chInfo.updated, int(time.time()))
        chapterDates += [dt]

    # wordCount is already set from overall metadata
    fic.reviewCount = reviewCount

    fic.published = OilTimestamp(min(chapterDates))
    fic.updated = OilTimestamp(max(chapterDates))

    fic.upsert()
    for cid in range(1, fic.chapterCount + 1):
        ch = fic.chapter(cid)
        ch.localChapterId = 'Chapter_{}'.format(cid)
        ch.url = self.constructUrl(fic.localId, cid)
        ch.upsert()

    return fic
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
    """Populate `fic` from a story page, mostly by regex over the raw HTML.

    Title comes from the first ``width="95%"`` table; author, summary,
    rating, update dates and word counts are pulled from the raw markup.
    The chapter count is the highest ``&chapter=N`` link found on the page.

    Args:
        fic: the fic record to fill in.
        html: raw HTML of the story page.

    Returns:
        The same `fic` object.

    Raises:
        Exception: if the page does not contain exactly three 95%-width
            tables, or the author link or summary cannot be located.
    """
    from bs4 import BeautifulSoup
    html = html.replace('\r\n', '\n')
    soup = BeautifulSoup(html, 'html.parser')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    w95tables = soup.findAll('table', {'width': '95%'})
    if len(w95tables) != 3:
        raise Exception(
            'wrong number of w95 tables: {}'.format(len(w95tables))
        )

    ficInfoTable = w95tables[0]

    ficTitleH3 = ficInfoTable.find('h3')
    fic.title = ficTitleH3.get_text().strip()

    authorUrlMatch = re.search(r'"viewuser.php\?uid=(\d+)">([^<]*)<', html)
    if authorUrlMatch is None:
        raise Exception('could not locate author url')

    author = authorUrlMatch.group(2)
    authorId = authorUrlMatch.group(1)
    authorUrl = self.baseUrl + '/viewuser.php?uid=' + authorId
    self.setAuthor(fic, author, authorUrl, authorId)

    # TODO: this may miss multiline summaries :(
    summaryMatch = re.search(
        '<b>Summary:</b>((.|\r|\n)*)<b>Hitcount: </b>', html, re.MULTILINE
    )
    if summaryMatch is None:
        edumpContent(html, 'siye_summary')
        raise Exception('could not locate summary')
        # alternatively: fic.description = "{no summary}" ?

    fic.description = summaryMatch.group(1).strip()

    fic.ageRating = '<unkown>'
    ageRatingMatch = re.search('<b>Rating:</b>(.*)<br>', html)
    if ageRatingMatch is not None:
        fic.ageRating = ageRatingMatch.group(1).strip()

    maxChapter = 0
    baseChapterHref = 'viewstory.php?sid={}&chapter='.format(fic.localId)
    # single-chapter stories link to the literal "chapter=Array"
    singleChapterHref = 'viewstory.php?sid={}&chapter=Array'.format(
        fic.localId
    )
    isSingleChapterFic = False
    for a in soup.find_all('a'):
        href = a.get('href')
        if href is None or not href.startswith(baseChapterHref):
            continue
        if href.startswith(singleChapterHref):
            isSingleChapterFic = True
            maxChapter = max(1, maxChapter)
            continue
        cid = int(href[len(baseChapterHref):])
        maxChapter = max(cid, maxChapter)

    fic.chapterCount = maxChapter

    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    # str.find() returns -1 (truthy) when the marker is absent, so testing
    # its result directly marked nearly every story complete. Use `in`.
    fic.ficStatus = FicStatus.ongoing
    if 'Story is Complete' in html:
        fic.ficStatus = FicStatus.complete

    updatedOnPattern = re.compile(r'updated on (\d+).(\d+).(\d+)')
    # NOTE(review): this seed passes an int where the other calls pass a date
    # string — confirm util.parseDateAsUnix accepts that.
    minUpdate = util.parseDateAsUnix(int(time.time()), fic.fetched)
    maxUpdate = util.parseDateAsUnix('1970/01/01', fic.fetched)
    for (year, month, day) in updatedOnPattern.findall(html):
        date = '{}/{}/{}'.format(year, month, day)
        dt = util.parseDateAsUnix(date, fic.fetched)

        minUpdate = min(minUpdate, dt)
        maxUpdate = max(maxUpdate, dt)

    # only ever widen the stored published/updated range
    if fic.published is None or fic.published.toUTS() > minUpdate:
        fic.published = OilTimestamp(minUpdate)
    if fic.updated is None or fic.updated.toUTS() < maxUpdate:
        fic.updated = OilTimestamp(maxUpdate)
    if fic.updated < fic.published:
        fic.updated = fic.published

    fic.wordCount = 0
    wordsPattern = re.compile(r'(\d+) words')
    for words in wordsPattern.findall(html):
        fic.wordCount += int(words)

    if fic.wordCount == 0 and isSingleChapterFic:
        # Best effort: count words from the cached chapter. A failure here
        # must not abort the metadata parse, but unlike a bare `except:` it
        # no longer swallows KeyboardInterrupt/SystemExit.
        try:
            fic.upsert()
            ch1 = fic.chapter(1)
            ch1.cache()
            chtml = ch1.html()
            if chtml is not None:
                fic.wordCount = len(chtml.split())
        except Exception:
            pass

    fic.add(Fandom.define('Harry Potter'))
    # TODO: chars/relationship?

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate `fic` and its chapters from a fiction overview page.

    Reads the title and author from the ``fic-title`` div, the description
    from the ``description`` div, the status from the ``fiction-info`` div,
    follower/favorite counts from the ``stats-content`` div and chapter
    links and dates from the ``chapters`` table. Each chapter is then cached
    to compute the total word count.

    Args:
        fic: the fic record to fill in and persist.
        wwwHtml: raw HTML of the fiction overview page.

    Returns:
        The same `fic` object.

    Raises:
        Exception: if none of the known status labels is present.
    """
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(wwwHtml, 'html5lib')

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    fic.url = self.constructUrl(fic.localId)

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ageRating = 'M'  # TODO?

    ficTitleDiv = soup.find('div', {'class': 'fic-title'})
    fic.title = ficTitleDiv.find('h1').getText().strip()

    authorLink = ficTitleDiv.find('h4', {'property': 'author'}).find('a')
    author = authorLink.getText().strip()
    authorUrl = self.baseUrl + authorLink.get('href')
    authorId = authorUrl.split('/')[-1]
    self.setAuthor(fic, author, authorUrl, authorId)

    divDescription = soup.find('div', {'class': 'description'})
    try:
        descView = HtmlView(str(divDescription), markdown=False)
        fic.description = ''.join(
            '<p>{}</p>'.format(line) for line in descView.text
        )
    except Exception:
        # fall back to the plain text if the structured view fails
        fic.description = divDescription.getText().strip()

    fictionInfo = str(soup.find('div', {'class': 'fiction-info'}))
    if fictionInfo.find('>ONGOING<') != -1:
        fic.ficStatus = FicStatus.ongoing
    elif fictionInfo.find('>COMPLETED<') != -1:
        fic.ficStatus = FicStatus.complete
    elif fictionInfo.find('>HIATUS<') != -1:
        fic.ficStatus = FicStatus.ongoing  # TODO?
    elif fictionInfo.find('>STUB<') != -1:
        fic.ficStatus = FicStatus.ongoing  # TODO?
    elif fictionInfo.find('>DROPPED<') != -1:
        fic.ficStatus = FicStatus.abandoned
    else:
        raise Exception('unable to find fic status')

    divStatsContent = soup.find('div', {'class': 'stats-content'})
    followers = divStatsContent.find(text='Followers :')
    ul = followers.parent.parent

    RegexMatcher(
        ul.getText(), {
            'followCount?': (r'Followers\s+:\s+([\d,]+)', str),
            'favoriteCount?': (r'Favorites\s+:\s+([\d,]+)', str),
        }
    ).matchAll(fic)

    # The counts are matched as text such as "1,234". Always normalise them
    # to int: the old `if str(x).find(','):` guard tested find()'s result for
    # truthiness, which is -1 (truthy) when there is no comma, so it only
    # skipped the conversion when the text started with a comma.
    fic.followCount = int(str(fic.followCount).replace(',', ''))
    fic.favoriteCount = int(str(fic.favoriteCount).replace(',', ''))

    tableChapters = soup.find('table', {'id': 'chapters'})
    chapterLinks = tableChapters.findAll('a')

    # links without a <time> child carry the chapter url and title
    chapterUrls: List[str] = []
    chapterTitles: List[str] = []
    for chapterLink in chapterLinks:
        # TODO FIXME is this inverted?
        if chapterLink.find('time') is not None:
            continue
        chapterUrls += [chapterLink.get('href')]
        chapterTitles += [chapterLink.getText().strip()]

    # links with a <time> child carry the chapter date
    chapterDates: List[int] = []
    for chapterLink in chapterLinks:
        timeElement = chapterLink.find('time')
        if timeElement is None:
            continue
        if timeElement.get('unixtime'):
            chapterDates += [int(timeElement.get('unixtime'))]
        else:
            chapterDates += [
                util.parseDateAsUnix(timeElement.get('title'), fic.fetched)
            ]

    fic.published = OilTimestamp(min(chapterDates))
    fic.updated = OilTimestamp(max(chapterDates))
    fic.chapterCount = len(chapterUrls)

    if fic.wordCount is None:
        fic.wordCount = 0
    fic.upsert()

    for cid in range(1, fic.chapterCount + 1):
        chapterUrl = chapterUrls[cid - 1]
        chapter = fic.chapter(cid)
        chapter.url = self.baseUrl + chapterUrl
        if chapterUrl.startswith('/fiction/chapter/'):
            # alternate chapter syntax if the chapter itself has no slug
            # /fiction/chapter/<lcid>fid=<lid>&fslug=<fic slug>
            chapter.localChapterId = chapterUrl.split('/')[3].split('?')[0]
        else:
            # standard chapter syntax
            # /fiction/<lid>/<fic slug>/chapter/<lcid>/<chapter slug>
            chapter.localChapterId = chapterUrl.split('/')[5]
        chapter.title = chapterTitles[cid - 1]
        if chapter.title is not None and len(chapter.title) > 0:
            chapter.title = util.cleanChapterTitle(chapter.title, cid)
        chapter.upsert()

    # second pass: make sure every chapter is cached, then count words
    wordCount = 0
    for cid in range(1, fic.chapterCount + 1):
        chapter = fic.chapter(cid)
        if chapter.html() is None:
            chapter.cache()

        chapter.upsert()
        chtml = chapter.html()
        if chtml is not None:
            wordCount += len(chtml.split())

    fic.wordCount = wordCount

    return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
    """Populate `fic` from an adult-fanfiction.org story page plus search.

    `fic.localId` is expected to look like ``<archive>/<story number>``.
    Title, author and the chapter list come from the story page; every
    chapter is soft-scraped to store its HTML and count words. Dates, review
    count, status, description, tags and fandoms are then taken from the
    site's search results, because the story page does not carry them.

    Args:
        fic: the fic record to fill in and persist.
        wwwHtml: raw HTML of the story page.

    Returns:
        The same `fic` object.

    Raises:
        Exception: if the story cannot be found in either the
            author+title search or the author-only search.
    """
    import urllib.parse
    from bs4 import BeautifulSoup
    archive = fic.localId.split('/')[0]
    storyNo = fic.localId.split('/')[1]

    soup = BeautifulSoup(wwwHtml, 'html5lib')

    titleH2 = soup.find('a', {'href': '/story.php?no={}'.format(storyNo)})
    fic.title = str(titleH2.getText())

    membersUrl = 'http://members.adult-fanfiction.org/profile.php?no='
    memberLink = soup.find(
        lambda t: (
            t.name == 'a' and t.has_attr("href")
            and t.get("href") is not None
            and (t.get("href").startswith(membersUrl))
        )
    )

    author = memberLink.getText()
    authorId = memberLink.get('href')[len(membersUrl):]
    authorUrl = memberLink.get('href')
    self.setAuthor(fic, author, authorUrl, authorId)

    # TODO
    fic.ficStatus = FicStatus.ongoing

    fic.fetched = OilTimestamp.now()
    fic.languageId = Language.getId("English")  # TODO: don't hard code?

    fic.url = self.constructUrl(fic.localId, 1)

    # TODO: description is on search page
    if fic.description is None:
        fic.description = 'TODO: on the search page?'

    # default optional fields
    fic.reviewCount = 0
    fic.favoriteCount = 0
    fic.followCount = 0

    fic.ageRating = 'M'

    # TODO
    if fic.published is None:
        fic.published = OilTimestamp.now()
    if fic.updated is None:
        fic.updated = fic.published

    chapterDropdown = soup.find('div', {'class': 'dropdown-content'})
    chapterLinks = chapterDropdown.findAll('a')
    oldChapterCount = fic.chapterCount
    fic.chapterCount = len(chapterLinks)

    if fic.wordCount is None:
        fic.wordCount = 0
    fic.upsert()

    wordCount = 0
    for cid in range(1, fic.chapterCount + 1):
        chapterContent = scrape.softScrape(
            self.constructUrl(fic.localId, cid)
        )
        chapter = fic.chapter(cid)
        if chapterContent is not None:
            chapter.setHtml(chapterContent)
        chapter.localChapterId = str(cid)
        chapter.url = self.constructUrl(fic.localId, cid)

        chapter.title = chapterLinks[cid - 1].getText().strip()
        if chapter.title is not None:
            chapter.title = util.cleanChapterTitle(chapter.title, cid)

        chapter.upsert()
        if chapterContent is not None:
            wordCount += len(chapterContent.split())

    fic.wordCount = wordCount

    if oldChapterCount is not None and oldChapterCount < fic.chapterCount:
        fic.updated = OilTimestamp.now()  # TODO
    fic.upsert()

    storyUrl = self.constructUrl(fic.localId, chapterId=None)

    # URL-encode the values placed in the query string: a raw '&', '#', '+'
    # or space in the author name or title would otherwise corrupt the
    # search request. quote_plus still turns spaces into '+', as before.
    authorParam = urllib.parse.quote_plus(author)
    titleParam = urllib.parse.quote_plus(fic.title)

    # more metadata from search page
    searchUrl = (
        'http://{}.adult-fanfiction.org/search.php?' +
        'auth={}&title={}&summary=&tags=&cats=0&search=Search'
    )
    searchUrl = searchUrl.format(archive, authorParam, titleParam)
    data = scrape.scrape(searchUrl)['raw']

    metas = self.extractSearchMetadata(data)

    # fallback to pure author search
    if storyUrl not in metas:
        searchUrl = (
            'http://{}.adult-fanfiction.org/search.php?' +
            'auth={}&title=&summary=&tags=&cats=0&search=Search'
        )
        searchUrl = searchUrl.format(archive, authorParam)
        data = scrape.scrape(searchUrl)['raw']
        metas = self.extractSearchMetadata(data)

    if storyUrl not in metas:
        raise Exception('cannot find search metadata')

    meta = metas[storyUrl]

    assert (meta.published is not None and meta.updated is not None)
    fic.published = OilTimestamp(meta.published)
    fic.updated = OilTimestamp(meta.updated)

    fic.reviewCount = meta.reviewCount
    fic.favoriteCount = meta.views  # TODO

    fic.ficStatus = meta.ficStatus

    assert (meta.description is not None)
    fic.description = meta.description
    assert (fic.description is not None)
    if len(meta.tags) > 0:
        fic.description += '\n<hr />\nContent Tags: ' + ' '.join(meta.tags)

    for fan in meta.fandoms:
        fic.add(Fandom.define(fan))

    return fic