def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a fanfiction.net story page into `fic` and its chapters.

	Fills metadata (title, description, counts, dates, status, author,
	fandoms) from the page's #profile_top block, upserts the fic, then
	creates/updates one Chapter row per chapter.

	:param fic: the Fic record to populate (mutated and upserted)
	:param wwwHtml: raw HTML of the story's first-chapter page
	:return: the (re-selected) upserted Fic
	:raises Exception: when a required page element cannot be located
	"""
	from bs4 import BeautifulSoup  # type: ignore
	deletedFicTexts = [
		# probably deleted by user
		'Story Not FoundUnable to locate story. Code 1.',
		# probably deleted by admin
		'Story Not FoundUnable to locate story. Code 2.',
		# unknown
		'Story Not FoundStory is unavailable for reading. (A)',
	]
	soup = BeautifulSoup(wwwHtml, 'html5lib')
	profile_top = soup.find(id='profile_top')

	# story might've been deleted: the page carries a gui_warning instead
	# of a profile block; mark abandoned (unless already complete) and stop
	if profile_top is None:
		gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
		for gui_warning in gui_warnings:
			for deletedFicText in deletedFicTexts:
				if gui_warning.get_text() == deletedFicText:
					if fic.ficStatus != FicStatus.complete:
						fic.ficStatus = FicStatus.abandoned
					fic.upsert()
					return fic

	text = profile_top.get_text()
	pt_str = str(profile_top)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	# the title is the lone <b class="xcontrast_txt"> in the profile block
	for b in profile_top.find_all('b'):
		b_class = b.get('class')
		if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
			fic.title = b.get_text()
			break
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

	fic.url = self.constructUrl(fic.localId, 1, fic.title)

	descriptionFound = False
	for div in profile_top.find_all('div'):
		div_class = div.get('class')
		if (
			div.get('style') == 'margin-top:2px' and len(div_class) == 1
			and div_class[0] == 'xcontrast_txt'
		):
			fic.description = div.get_text()
			descriptionFound = True
			break
	# idiom fix: was `if descriptionFound == False:`
	if not descriptionFound:
		raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	# TODO we should match this only on the section following the description
	matcher = RegexMatcher(
		text, {
			'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
			'chapterCount?': (r'Chapters:\s+(\d+)', int),
			'wordCount': (r'Words:\s+(\S+)', int),
			'reviewCount?': (r'Reviews:\s+(\S+)', int),
			'favoriteCount?': (r'Favs:\s+(\S+)', int),
			'followCount?': (r'Follows:\s+(\S+)', int),
			'updated?': (r'Rated:.*Updated:\s+(\S+)', str),
			'published': (r'Published:\s+([^-]+)', str),
		}
	)
	matcher.matchAll(fic)

	# convert the scraped date strings into OilTimestamps; a fic with no
	# Updated field is treated as updated when it was published
	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	match = re.search(
		r'(Rated|Chapters|Words|Updated|Published):.*Status:\s+(\S+)', text
	)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		status = match.group(2)
		if status == 'Complete':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown status: {}: {}'.format(fic.url, status))

	# the first /u/ link in the profile block is the author
	for a in profile_top.find_all('a'):
		a_href = a.get('href')
		if a_href.startswith('/u/'):
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))

	# fandom links live in #pre_story_links; it may be absent entirely
	preStoryLinks = soup.find(id='pre_story_links')
	preStoryLinksLinks = []
	if preStoryLinks is not None:
		preStoryLinksLinks = preStoryLinks.find_all('a')
	pendingFandoms: List[Fandom] = []
	for a in preStoryLinksLinks:
		href = a.get('href')
		hrefParts = href.split('/')
		# if it's a top level category
		if (
			len(hrefParts) == 3 and len(hrefParts[0]) == 0
			and len(hrefParts[2]) == 0
		):
			cat = hrefParts[1]
			if cat in ffNetFandomCategories:
				continue  # skip categories
			raise Exception('unknown category: {}'.format(cat))
		# if it's a crossover /Fandom1_and_Fandm2_Crossovers/f1id/f2id/
		if (
			len(hrefParts) == 5 and hrefParts[1].endswith("_Crossovers")
			and len(hrefParts[0]) == 0 and len(hrefParts[4]) == 0
		):
			fIds = [int(hrefParts[2]), int(hrefParts[3])]
			pendingFandoms += self.handleCrossoverFandom(
				fic, hrefParts[1], fIds, href
			)
			continue
		# if it's a regular fandom in some category
		if (
			len(hrefParts) == 4 and len(hrefParts[0]) == 0
			and len(hrefParts[3]) == 0
		):
			# ensure category is in our map
			if hrefParts[1] not in ffNetFandomCategories:
				raise Exception('unknown category: {}'.format(hrefParts[1]))
			pendingFandoms += self.handleFandom(fic, hrefParts[2])
			continue
		util.logMessage('unknown fandom {0}: {1}'.format(fic.id, href))

	fic.upsert()
	poss = Fic.select({'sourceId': fic.sourceId, 'localId': fic.localId})
	if len(poss) != 1:
		# fix: was a placeholder-less f-string
		raise Exception('unable to upsert fic?')
	fic = poss[0]
	for pfandom in pendingFandoms:
		fic.add(pfandom)

	if fic.chapterCount is None:
		return fic

	chapterTitles = []
	if fic.chapterCount > 1:
		chapterSelect = soup.find(id='chap_select')
		chapterOptions = []
		if chapterSelect is not None:
			chapterOptions = chapterSelect.findAll('option')
		chapterTitles = [co.getText().strip() for co in chapterOptions]

	for cid in range(1, fic.chapterCount + 1):
		ch = fic.chapter(cid)
		ch.localChapterId = str(cid)
		ch.url = self.constructUrl(fic.localId, cid)
		# off-by-one fix: cid is 1-based and indexes chapterTitles[cid - 1],
		# so the last chapter has a title exactly when len(...) >= cid
		# (previously `> cid`, which always skipped the final chapter)
		if len(chapterTitles) >= cid:
			ch.title = util.cleanChapterTitle(chapterTitles[cid - 1], cid)
		elif fic.chapterCount == 1 and cid == 1:
			ch.title = fic.title
		ch.upsert()

	metaSpan = profile_top.find('span', {'class': 'xgray'})
	if metaSpan is not None:
		# best-effort: reconstruct a normalized metadata line from the
		# structured span; failures are logged, never fatal
		try:
			res = self.parseFicMetaSpan(metaSpan.decode_contents())
			#fic.language = res["language"]
			# reconstruct
			fields = [
				('rated', 'Rated: Fiction ZZZ'),
				('language', 'Language: ZZZ'),
				('genres', 'Genre: ZZZ'),
				('characters', 'Characters: ZZZ'),
				('reviews', 'Reviews: ZZZ'),
				('favorites', 'Favs: ZZZ'),
				('follows', 'Follows: ZZZ'),
			]
			rmeta = ' - '.join(
				[f[1].replace('ZZZ', res[f[0]]) for f in fields if f[0] in res]
			)
			fic.extraMeta = rmeta
			publishedUts = util.parseDateAsUnix(res['published'], fic.fetched)
			fic.published = OilTimestamp(publishedUts)
			fic.updated = fic.published
			if 'updated' in res:
				updatedUts = util.parseDateAsUnix(res['updated'], fic.fetched)
				fic.updated = OilTimestamp(updatedUts)
			fic.upsert()
		except Exception as e:
			util.logMessage(
				f'FFNAdapter.parseInfoInto: .parseFicMetaSpan:\n{e}\n{traceback.format_exc()}'
			)
			util.logMessage(
				f'FFNAdapter.parseFicMetaSpan: {metaSpan.decode_contents()}'
			)
			pass

	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a story page (author-id/story-id style archive) into `fic`.

	The fic's localId is '<authorLid>/<storyLid>'. Overall metadata comes
	from the .well block; per-chapter review counts and upload dates come
	from the sibling <p> blocks, one per chapter.

	:raises Exception: on missing status or an unexpected chapter URL
	"""
	from bs4 import BeautifulSoup
	authorLid = fic.localId.split('/')[0]
	storyLid = fic.localId.split('/')[1]
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	fic.ageRating = 'M'

	soup = BeautifulSoup(wwwHtml, 'html5lib')
	pageHeader = soup.find('div', {'class': 'page-header'})
	titleH2 = pageHeader.find('h2')
	fic.title = titleH2.getText().strip()

	authorLink = pageHeader.find('a')
	author = authorLink.getText().strip()
	authorId = authorLid
	authorUrl = self.baseStoryUrl.format(authorLid, 'contact/')
	self.setAuthor(fic, author, authorUrl, authorId)

	divWell = soup.find('div', {'class': 'well'})
	summaryQuote = divWell.find('blockquote')
	fic.description = str(
		summaryQuote.getText()
	).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
	# collapse runs of whitespace into single spaces
	# FIX: this loop previously found/replaced a SINGLE space with a single
	# space -- a no-op replace that never terminates once any space is
	# present; the intent is clearly to squeeze double spaces
	while fic.description.find('  ') != -1:
		fic.description = fic.description.replace('  ', ' ')
	fic.description = fic.description.strip()

	divWellText = divWell.getText().strip()
	# NOTE(review): any status other than 'In progress' (including a
	# completed fic) raises here -- presumably deliberate strictness; confirm
	match = re.search(r'Status:\s*([^-]*) -', divWellText)
	if match is not None and match.group(1) == 'In progress':
		fic.ficStatus = FicStatus.ongoing
	else:
		raise Exception('unable to find fic status')

	RegexMatcher(
		divWellText, {
			'ageRating': (r'Rating\s*:\s+([^-]+) -', str),
			'chapterCount': (r'Chapters\s*:\s+(\d+) -', int),
			'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
		}
	).matchAll(fic)
	assert (fic.chapterCount is not None)
	# wordCount is matched as a str to allow thousands separators
	if str(fic.wordCount).find(',') != -1:
		fic.wordCount = int(str(fic.wordCount).replace(',', ''))

	wellParent = divWell.parent
	cid = 0
	wordCount = 0
	reviewCount = 0
	chapterDates: List[int] = []
	for child in wellParent.children:
		if child.name != 'p':
			continue
		cid += 1
		if str(child).find('Chapter {}'.format(cid)) == -1:
			continue
		chapterLink = child.find('a')
		expectedUrl = '/{}/Chapter_{}/'.format(storyLid, cid).lower()
		if chapterLink.get('href').lower() != expectedUrl:
			raise Exception('unexpected chapter url: ' + chapterLink.get('href'))
		chInfo = ChapterInfo()
		RegexMatcher(
			child.getText(), {
				'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
				'reviewCount': (r'Reviews\s+:\s+([^-]+) -', int),
				'updated': (r'Uploaded on\s*:\s+(.+)', str),
			}
		).matchAll(chInfo)
		assert (chInfo.updated is not None)
		if str(chInfo.wordCount).find(',') != -1:
			chInfo.wordCount = int(str(chInfo.wordCount).replace(',', ''))
		wordCount += chInfo.wordCount  # accumulated but unused; see below
		reviewCount += chInfo.reviewCount
		dt = (util.parseDateAsUnix(chInfo.updated, int(time.time())))
		chapterDates += [dt]

	# wordCount is already set from overall metadata
	fic.reviewCount = reviewCount
	fic.published = OilTimestamp(min(chapterDates))
	fic.updated = OilTimestamp(max(chapterDates))
	fic.upsert()

	for cid in range(1, fic.chapterCount + 1):
		ch = fic.chapter(cid)
		ch.localChapterId = 'Chapter_{}'.format(cid)
		ch.url = self.constructUrl(fic.localId, cid)
		ch.upsert()

	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Populate `fic` from a fictionhunt story page and return it.

	fictionhunt mirrors fanfiction.net metadata; the author link points
	back at fanfiction.net. Raises when details/title cannot be located.
	"""
	from bs4 import BeautifulSoup  # type: ignore
	soup = BeautifulSoup(wwwHtml, 'html.parser')

	detailDivs = soup.find_all('div', {'class': 'details'})
	if len(detailDivs) != 1:
		raise Exception('error: unable to find details\n')
	details = detailDivs[0]
	detailsText = details.get_text()
	detailsHtml = str(details)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	titleDivs = soup.find_all('div', {'class': 'title'})
	if len(titleDivs) != 1:
		raise Exception(
			'error: unable to find title:\n{}\n'.format(detailsHtml))
	fic.title = titleDivs[0].get_text().strip()

	fic.url = self.constructUrl(fic.localId, 1)

	# TODO: this may not exist on fictionhunt?
	fic.description = 'archive of {} from fictionhunt TODO'.format(fic.title)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	RegexMatcher(
		detailsText, {
			'ageRating': ('Rated:\s+(\S+)', str),
			'chapterCount?': ('Chapters:\s+(\d+)', int),
			'wordCount': ('Words:\s+(\S+)', int),
			'reviewCount?': ('Reviews:\s+(\S+)', int),
			'favoriteCount?': ('Favs:\s+(\S+)', int),
			'followCount?': ('Follows:\s+(\S+)', int),
			'updated?': ('Updated:\s+(\S+)', str),
			'published': ('Published:\s+(\S+)', str),
		}
	).matchAll(fic)

	# normalize scraped date strings into OilTimestamps
	if fic.published is not None:
		fic.published = OilTimestamp(
			util.parseDateAsUnix(fic.published, fic.fetched)
		)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		fic.updated = OilTimestamp(
			util.parseDateAsUnix(fic.updated, fic.fetched)
		)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	# the details line carries '- Complete -' iff the fic is finished
	if re.search('- Complete -', detailsText) is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		fic.ficStatus = FicStatus.complete

	for link in details.find_all('a'):
		linkHref = link.get('href')
		if linkHref.find('fanfiction.net/u/') == -1:
			continue
		self.setAuthor(
			fic, link.get_text(), linkHref, linkHref.split('/')[-1]
		)
		break
	else:
		raise Exception('unable to find author:\n{}'.format(detailsText))

	# TODO: hardcode Harry Potter fanfic?
	return fic
def parseZListInfoInto(self, fic: Fic, ts: int, html: str) -> Fic:
	"""Parse a fanfiction.net z-list (search/browse result) entry into `fic`.

	`ts` is the unix timestamp the listing was fetched at; if the existing
	fic data is newer than `ts`, the entry is ignored. Upserts the fic and
	its fandom before returning it.
	"""
	# existing data is newer, do nothing
	if fic.fetched is not None and fic.fetched.toUTS() > ts:
		return fic
	from bs4 import BeautifulSoup
	soup = BeautifulSoup(html, 'html5lib')
	text = soup.get_text()
	pt_str = str(html)
	fic.fetched = OilTimestamp(ts)
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId, 1, fic.title)
	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	# the story title is the first a.stitle link (for/else raises if none)
	for a in soup.find_all('a', {'class': 'stitle'}):
		fic.title = a.getText()
		break
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))
	# the description is the first text node of the z-padtop div
	for div in soup.find_all('div', {'class': 'z-padtop'}):
		fic.description = div.contents[0]
		break
	else:
		raise Exception('error: unable to find description:\n{}\n'.format(pt_str))
	# z-list entries may say just 'Rated: K' instead of 'Rated: Fiction K'
	matcher = RegexMatcher(
		text, {
			'ageRating': ('Rated:\s+(?:Fiction)?\s*(\S+)', str),
			'chapterCount?': ('Chapters:\s+(\d+)', int),
			'wordCount': ('Words:\s+(\S+)', int),
			'reviewCount?': ('Reviews:\s+(\S+)', int),
			'favoriteCount?': ('Favs:\s+(\S+)', int),
			'followCount?': ('Follows:\s+(\S+)', int),
			'updated?': ('Updated:\s+(\S+)', str),
			'published': ('Published:\s+([^-]+)', str),
		}
	)
	matcher.matchAll(fic)
	# convert scraped date strings into OilTimestamps; a missing Updated
	# field falls back to the published date
	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)
	if fic.chapterCount is None:
		fic.chapterCount = 1
	# completion is flagged by a trailing '- Complete' after the stats run
	match = re.search(
		'(Rated|Chapters|Words|Updated|Published):.*-\s+(Complete)', text
	)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		status = match.group(2)
		if status == 'Complete':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown status: {}: {}'.format(fic.url, status))
	# the first /u/ link is the author profile
	for a in soup.find_all('a'):
		a_href = a.get('href')
		if a_href.startswith('/u/'):
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))
	# the z-list div carries the fandom in its data-category attribute
	zl = soup.find('div', {'class': 'z-list'})
	fan = None if zl is None else zl.get('data-category')
	pendingFandoms: List[Fandom] = []
	if fan is not None:
		pendingFandoms += self.handleFandom(fic, fan)
		# TODO: crossovers?
	#print('---')
	#print(fic.__dict__)
	#raise Exception('todo')
	fic.upsert()
	for pfandom in pendingFandoms:
		fic.add(pfandom)
	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a royalroad fiction page into `fic` and its chapters.

	Scrapes title/author/description, status, follower/favorite counts and
	the chapter table; then caches each chapter's html to compute an exact
	word count. Upserts the fic and every chapter.
	"""
	from bs4 import BeautifulSoup
	soup = BeautifulSoup(wwwHtml, 'html5lib')
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	fic.ageRating = 'M'  # TODO?

	ficTitleDiv = soup.find('div', {'class': 'fic-title'})
	fic.title = ficTitleDiv.find('h1').getText().strip()

	authorLink = ficTitleDiv.find('h4', {'property': 'author'}).find('a')
	author = authorLink.getText().strip()
	authorUrl = self.baseUrl + authorLink.get('href')
	authorId = authorUrl.split('/')[-1]
	self.setAuthor(fic, author, authorUrl, authorId)

	# prefer a structured description; fall back to bare text
	divDescription = soup.find('div', {'class': 'description'})
	try:
		descView = HtmlView(str(divDescription), markdown=False)
		desc = ''.join(['<p>{}</p>'.format(l) for l in descView.text])
		fic.description = desc
	except Exception:  # fix: was a bare except (also caught KeyboardInterrupt)
		fic.description = divDescription.getText().strip()

	fictionInfo = str(soup.find('div', {'class': 'fiction-info'}))
	if fictionInfo.find('>ONGOING<') != -1:
		fic.ficStatus = FicStatus.ongoing
	elif fictionInfo.find('>COMPLETED<') != -1:
		fic.ficStatus = FicStatus.complete
	elif fictionInfo.find('>HIATUS<') != -1:
		fic.ficStatus = FicStatus.ongoing  # TODO?
	elif fictionInfo.find('>STUB<') != -1:
		fic.ficStatus = FicStatus.ongoing  # TODO?
	elif fictionInfo.find('>DROPPED<') != -1:
		fic.ficStatus = FicStatus.abandoned
	else:
		raise Exception('unable to find fic status')

	divStatsContent = soup.find('div', {'class': 'stats-content'})
	followers = divStatsContent.find(text='Followers :')
	ul = followers.parent.parent
	RegexMatcher(
		ul.getText(), {
			'followCount?': (r'Followers\s+:\s+([\d,]+)', str),
			'favoriteCount?': (r'Favorites\s+:\s+([\d,]+)', str),
		}
	).matchAll(fic)
	# the counts are matched as strings to allow thousands separators;
	# always convert to int. FIX: previously guarded by
	# `if str(...).find(','):` which misuses truthiness (-1 == not found is
	# truthy, 0 == comma at index 0 is falsy); the conversion is simply
	# unconditional
	fic.followCount = int(str(fic.followCount).replace(',', ''))
	fic.favoriteCount = int(str(fic.favoriteCount).replace(',', ''))

	# chapter table: non-<time> links give url+title, <time> links give dates
	tableChapters = soup.find('table', {'id': 'chapters'})
	chapterLinks = tableChapters.findAll('a')
	chapterUrls: List[str] = []
	chapterTitles: List[str] = []
	for chapterLink in chapterLinks:
		# TODO FIXME is this inverted?
		if chapterLink.find('time') is not None:
			continue
		chapterUrls += [chapterLink.get('href')]
		chapterTitles += [chapterLink.getText().strip()]

	chapterDates: List[int] = []
	for chapterLink in chapterLinks:
		if chapterLink.find('time') is None:
			continue
		timeElement = chapterLink.find('time')
		if timeElement.get('unixtime'):
			chapterDates += [int(timeElement.get('unixtime'))]
		else:
			chapterDates += [
				util.parseDateAsUnix(timeElement.get('title'), fic.fetched)
			]

	fic.published = OilTimestamp(min(chapterDates))
	fic.updated = OilTimestamp(max(chapterDates))
	fic.chapterCount = len(chapterUrls)
	if fic.wordCount is None:
		fic.wordCount = 0
	fic.upsert()

	for cid in range(1, fic.chapterCount + 1):
		chapter = fic.chapter(cid)
		chapter.url = self.baseUrl + chapterUrls[cid - 1]
		if chapterUrls[cid - 1].startswith('/fiction/chapter/'):
			# alternate chapter syntax if the chapter itself has no slug
			# /fiction/chapter/<lcid>fid=<lid>&fslug=<fic slug>
			chapter.localChapterId = (
				chapterUrls[cid - 1].split('/')[3].split('?')[0]
			)
		else:
			# standard chapter syntax
			# /fiction/<lid>/<fic slug>/chapter/<lcid>/<chapter slug>
			chapter.localChapterId = chapterUrls[cid - 1].split('/')[5]
		chapter.title = chapterTitles[cid - 1]
		if chapter.title is not None and len(chapter.title) > 0:
			chapter.title = util.cleanChapterTitle(chapter.title, cid)
		chapter.upsert()

	# recompute the word count from the cached chapter html
	wordCount = 0
	for cid in range(1, fic.chapterCount + 1):
		chapter = fic.chapter(cid)
		if chapter.html() is None:
			chapter.cache()
			chapter.upsert()
		chtml = chapter.html()
		if chtml is not None:
			wordCount += len(chtml.split())
	fic.wordCount = wordCount

	return fic
def extractSearchMetadata(
	self,
	html: str,
	metas: 'Optional[Dict[str, AdultFanfictionMeta]]' = None
) -> Dict[str, AdultFanfictionMeta]:
	"""Extract story metadata from an adultfanfiction search results page.

	Each result table becomes an AdultFanfictionMeta keyed by story url in
	`metas`; an existing entry is only replaced by a newer one. Pass an
	existing dict to accumulate across pages.

	FIX: the default used to be a mutable `{}`, silently shared across all
	calls that omitted `metas` (classic mutable-default-argument bug).
	"""
	from bs4 import BeautifulSoup
	if metas is None:
		metas = {}
	# archive slug -> fandom for whole-archive fandoms
	archiveFandomMap = {
		'naruto': 'Naruto',
		'hp': 'Harry Potter',
		'xmen': 'X-Men',
	}
	# 'Located' suffix -> fandom for category-located fandoms
	locatedFandomMap = [
		('Mass Effect', 'Mass Effect'),
		('Metroid', 'Metroid'),
		('Pokemon', 'Pokemon'),
		('Sonic', 'Sonic'),
		('Witcher 3: Wild Hunt', 'Witcher'),
	]
	# character names recognized in 'Located: .../Char1/Char2' pairings
	chars = [
		'Harry', 'Hermione', 'Snape', 'Draco', 'Sirius', 'Remus', 'Lucius',
		'Ron', 'Voldemort', 'Ginny', 'Charlie', 'Lily', 'Scorpius', 'James',
		'George', 'Fred', 'Narcissa', 'Blaise', 'Bill', 'Luna', 'Albus',
		'Severus', 'Fenrir', 'Tonks', 'Rose', 'Neville', 'Cho', 'Cedric',
		'Tom', 'Seamus', 'Pansy', 'Bellatrix', 'Viktor', 'Percy', 'Dudley',
		'McGonagall', 'Lavendar', 'Dumbledore', 'Naruto', 'Sasuke',
		'Kakashi', 'Iruka', 'Sakura', 'Itachi', 'Gaara', 'Shikamaru',
		'Neji', 'Rock Lee', 'Hinata', 'Ino', 'Shino', 'Danzo'
	]
	spaceSqueezeRe = re.compile(r'\s+')  # renamed from typo'd spaceSqeeezeRe
	searchSoup = BeautifulSoup(html, 'html5lib')
	resultTables = searchSoup.findAll('table', {'width': '90%'})
	for resultTable in resultTables:
		meta = AdultFanfictionMeta()
		links = resultTable.findAll('a')
		titleLink = links[0]
		meta.title = titleLink.getText()
		meta.url = titleLink.get('href')
		authorLink = links[1]
		meta.author = authorLink.getText().strip()
		meta.authorUrl = authorLink.get('href').strip()
		assert (meta.authorUrl is not None)
		meta.authorId = meta.authorUrl.split('=')[-1]
		trs = resultTable.findAll('tr')
		publishedText = trs[0].getText()
		RegexMatcher(publishedText, {
			'published': (r'Published\s+:\s+(.+)', str),
		}).matchAll(meta)
		assert (meta.published is not None)
		meta.published = util.parseDateAsUnix(meta.published, int(time.time()))
		extendedMetadata = trs[1].getText()
		util.logMessage(extendedMetadata, 'tmp_e_meta_aff.log')
		# TODO: dragon prints are actually views, not followCount/favoriteCount
		RegexMatcher(
			extendedMetadata, {
				'chapterCount': (r'Chapters\s*:\s*(\d+)', int),
				'updated': (r'Updated\s+:\s+(.+?)-:-', str),
				'reviewCount?': (r'Reviews\s+:\s+(\d+)', int),
				'views?': (r'Dragon prints\s+:\s+(\d+)', int),
				'located?': (r'Located\s*:\s*(.*)', str)
			}
		).matchAll(meta)
		assert (meta.updated is not None)
		meta.updated = util.parseDateAsUnix(meta.updated, int(time.time()))
		meta.description = str(trs[2])
		meta.description = util.filterUnicode(meta.description)
		meta.description = spaceSqueezeRe.sub(' ', meta.description)
		meta.setTags(str(trs[3]))
		if 'COMPLETE' in meta.tags or 'Complete.' in meta.tags:
			meta.ficStatus = FicStatus.complete
		assert (meta.url is not None)
		ficId = FicId.tryParseUrl(meta.url)
		assert (ficId is not None)
		meta.localId = ficId.localId
		meta.archive = meta.localId.split('/')[0]
		meta.storyNo = meta.localId.split('/')[1]
		if meta.archive.lower() in archiveFandomMap:
			meta.fandoms += [archiveFandomMap[meta.archive.lower()]]
		meta.located = meta.located or ''
		loclow = meta.located.lower()
		for locFan in locatedFandomMap:
			if loclow.endswith(locFan[0].lower()):
				meta.fandoms += [locFan[1]]
		for c1 in chars:
			for c2 in chars:
				if loclow.endswith('{}/{}'.format(c1, c2).lower()):
					meta.chars += [c1, c2]
		# TODO: try parse category, get chars
		#meta.info()
		# keep only the newest metadata seen for each story url
		if meta.url not in metas or meta.isNewerThan(metas[meta.url]):
			metas[meta.url] = meta
	return metas
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
	"""Parse an eFiction-style viewstory page into `fic`.

	Title and author come from the #pagetitle links, the description from
	the SUMMARY comment markers, counts from the 'Story Information' block,
	and dates from PUBLISHED/UPDATED comment markers in that block's html.
	Non-crossover stories are tagged with the Harry Potter fandom.
	"""
	from bs4 import BeautifulSoup
	soup = BeautifulSoup(html, 'html.parser')
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	pagetitle = soup.find(id='pagetitle')
	aTags = pagetitle.findAll('a')
	author = None
	# viewstory link => title; viewuser.php?uid= link => author
	for a in aTags:
		href = a.get('href')
		if href.startswith('viewstory'):
			fic.title = a.contents[0].strip()
		elif href.startswith('viewuser.php?uid='):
			author = a.contents[0]
			authorUrl = self.baseUrl + href
			authorId = str(int(href[len('viewuser.php?uid='):]))
			self.setAuthor(fic, author, authorUrl, authorId)
	if fic.title is None:
		raise Exception('unable to find title')
	if author is None:
		raise Exception('unable to find author')
	# split the raw html so each tag starts a line, then capture everything
	# between the SUMMARY START/END html comment markers
	# NOTE(review): the line carrying the START marker itself is also
	# appended (the flag is checked after being set) -- confirm intended
	lines = html.replace('\r', '\n').replace('<', '\n<').split('\n')
	inDescription = False
	description = ''
	for line in lines:
		cur = line.strip()
		if cur.find('!-- SUMMARY START --') != -1:
			inDescription = True
		elif cur.find('!-- SUMMARY END --') != -1:
			inDescription = False
		if inDescription == True:
			description += cur + '\n'
	fic.description = description
	fic.ageRating = '<unkown>'
	# locate the 'Story Information' block among the page's div.block's
	infoBlock = None
	infoText = None
	blocks = soup.findAll('div', {'class': 'block'})
	for block in blocks:
		title = block.find('div', {'class': 'title'})
		if title is None:
			continue
		if title.contents[0] != 'Story Information':
			continue
		infoBlock = block
		infoText = block.get_text()
		break
	else:
		raise Exception('unable to find info text')
	matcher = RegexMatcher(
		infoText, {
			'chapterCount': ('Chapters:\s+(\d+)', int),
			'wordCount': ('Word count:\s+(\S+)', int),
		}
	)
	matcher.matchAll(fic)
	# review count appears in the #sort div as 'Reviews - N]'
	sortDiv = soup.find(id='sort')
	match = re.search('Reviews\s*-\s*([^\]]+)', sortDiv.get_text())
	if match is not None:
		fic.reviewCount = int(match.group(1).replace(',', ''))
	else:
		fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	# dates are embedded as html comment markers inside the info block
	infoBlockHtml = str(infoBlock)
	match = re.search(
		'<!-- PUBLISHED START -->([^<]*)<!-- PUBLISHED END -->', infoBlockHtml
	)
	if match is not None:
		publishedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	match = re.search(
		'<!-- UPDATED START -->([^<]*)<!-- UPDATED END -->', infoBlockHtml
	)
	if match is not None:
		updatedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
		fic.updated = OilTimestamp(updatedUts)
	if fic.updated is None:
		fic.updated = fic.published
	match = re.search('Completed:\s+(\S+)', infoText)
	if match is not None:
		complete = match.group(1)
		if complete == 'No':
			fic.ficStatus = FicStatus.ongoing
		elif complete == 'Yes':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown complete value: {}'.format(complete))
	match = re.search('Crossovers', infoText)
	if match is not None:
		pass
		# raise Exception('Found unknown crossover in {0}: {1}'.format(fic.id, fic.url))
	else:
		# otherwise not a crossover and just harry potter
		fic.add(Fandom.define('Harry Potter'))
	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a ?psid= style archive story page into `fic`.

	Metadata comes from the single table.storymaininfo block; the title is
	found via the adult-content disclaimer link (or a plain ?psid= link).
	Always tagged with the Harry Potter fandom.
	"""
	from bs4 import BeautifulSoup  # type: ignore
	soup = BeautifulSoup(wwwHtml, 'html.parser')
	storyMainInfo = soup.findAll('table', {'class': 'storymaininfo'})
	if len(storyMainInfo) != 1:
		raise Exception('unable to find main story info')
	storyMainInfo = storyMainInfo[0]
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	# the title link either goes through the js age disclaimer or points
	# directly at this story's ?psid= url
	disclaimerJs = "javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid="
	for a in soup.findAll('a'):
		href = a.get('href')
		if (not href.startswith(disclaimerJs)
			and href != '?psid={}'.format(fic.localId)):
			continue
		fic.title = a.getText()
		break
	else:
		raise Exception('error: unable to find title')
	fic.url = self.constructUrl(fic.localId)
	storySummaryTable = soup.findAll('table', {'class': 'storysummary'})
	if len(storySummaryTable) != 1:
		raise Exception('cannot find story summary table')
	storySummaryTable = storySummaryTable[0]
	fic.description = (storySummaryTable.getText().strip())
	if fic.description is None:
		raise Exception('error: unable to find description')
	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	# \xa0 is &nbsp;; normalize so the regexes below match plain spaces
	text = storyMainInfo.getText().replace('\xa0', ' ')
	matcher = RegexMatcher(
		text, {
			'ageRating': ('Rating:\s+(Mature|15\+|12\+)', str),
			'chapterCount': ('Chapters:\s+(\d+)', int),
			'wordCount': ('Words:\s+(\d+)', int),
			'reviewCount': ('Story Reviews:\s*(\d+)', int),
			'favoriteCount': ('Favorite Story Of:\s+(\d+) users', int),
			'updated': ('Last Updated:\s+(\S+)', str),
			'published': ('First Published:\s+(\S+)', str),
		}
	)
	matcher.matchAll(fic)
	# convert scraped date strings into OilTimestamps; a missing Updated
	# field falls back to the published date
	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)
	if fic.chapterCount is None:
		fic.chapterCount = 1
	match = re.search('Status:\s+(Completed|Work In Progress|Abandoned)', text)
	if match is None:
		raise Exception('cannot find write status')
	status = match.group(1)
	if status == 'Completed':
		fic.ficStatus = FicStatus.complete
	elif status == 'Work In Progress':
		fic.ficStatus = FicStatus.ongoing  # should these be abandoned?
	elif status == 'Abandoned':
		fic.ficStatus = FicStatus.abandoned
	else:
		raise Exception('unknown status: {}'.format(status))
	# the author is the first viewuser.php?showuid= link on the page
	for a in soup.findAll('a'):
		a_href = a.get('href')
		if a_href.startswith('viewuser.php?showuid='):
			author = a.get_text()
			authorUrl = self.baseUrl + '/' + a_href
			authorId = a_href[len('viewuser.php?showuid='):]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))
	# TODO: chars/pairings?
	fic.add(Fandom.define('Harry Potter'))
	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a fictionpress.com story page into `fic` and its chapters.

	Mirrors the fanfiction.net profile_top layout: scrapes title,
	description, stats, status, author and genre links, then upserts the
	fic and one chapter row per chapter.
	"""
	from bs4 import BeautifulSoup  # type: ignore
	deletedFicText = 'Story Not FoundUnable to locate story. Code 1.'
	soup = BeautifulSoup(wwwHtml, 'html5lib')
	profile_top = soup.find(id='profile_top')

	# story might've been deleted
	if profile_top is None:
		gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
		for gui_warning in gui_warnings:
			if gui_warning.get_text() == deletedFicText:
				fic.ficStatus = FicStatus.abandoned
				fic.upsert()
				return fic

	text = profile_top.get_text()
	pt_str = str(profile_top)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	# the title is the lone <b class="xcontrast_txt"> in the profile block
	for b in profile_top.find_all('b'):
		b_class = b.get('class')
		if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
			fic.title = b.get_text()
			break
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

	fic.url = self.constructUrl(fic.localId, 1, fic.title)

	for div in profile_top.find_all('div'):
		div_class = div.get('class')
		if (
			div.get('style') == 'margin-top:2px' and len(div_class) == 1
			and div_class[0] == 'xcontrast_txt'
		):
			fic.description = div.get_text()
			break
	else:
		raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	matcher = RegexMatcher(
		text, {
			'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
			'chapterCount?': (r'Chapters:\s+(\d+)', int),
			'wordCount': (r'Words:\s+(\S+)', int),
			'reviewCount?': (r'Reviews:\s+(\S+)', int),
			'favoriteCount?': (r'Favs:\s+(\S+)', int),
			'followCount?': (r'Follows:\s+(\S+)', int),
			'updated?': (r'Updated:\s+(\S+)', str),
			'published': (r'Published:\s+(\S+)', str),
		}
	)
	matcher.matchAll(fic)

	# convert scraped date strings into OilTimestamps; a missing Updated
	# field falls back to the published date
	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	match = re.search(r'Status:\s+(\S+)', text)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		status = match.group(1)
		if status == 'Complete':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown status: {}'.format(status))

	# the first /u/ link is the author profile
	for a in profile_top.find_all('a'):
		a_href = a.get('href')
		if a_href.startswith('/u/'):
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))

	# FIX: guard against a missing #pre_story_links block; the ffnet
	# variant of this parser already guards it -- previously this crashed
	# with AttributeError when the block was absent
	preStoryLinks = soup.find(id='pre_story_links')
	preStoryLinksLinks = []
	if preStoryLinks is not None:
		preStoryLinksLinks = preStoryLinks.find_all('a')
	for a in preStoryLinksLinks:
		href = a.get('href')
		hrefParts = href.split('/')
		# if it's a top level category
		if (
			len(hrefParts) == 3 and len(hrefParts[0]) == 0
			and len(hrefParts[2]) == 0
		):
			cat = hrefParts[1]
			if cat in fictionPressCategories:
				continue  # skip categories
			raise Exception('unknown category: {}'.format(cat))
		# if it's a regular genre in some category
		if (
			len(hrefParts) == 4 and len(hrefParts[0]) == 0
			and len(hrefParts[3]) == 0
		):
			# ensure category is in our map
			if hrefParts[1] not in fictionPressCategories:
				raise Exception('unknown category: {}'.format(hrefParts[1]))
			# ensure it's in our whitelist
			if hrefParts[2] not in fictionPressGenres:
				util.logMessage(f'FictionPressAdapter: unknown genre {hrefParts[2]}')
				continue
			fic.add(Fandom.define(hrefParts[2]))
			continue
		util.logMessage(f'FictionPressAdapter: unknown genre {fic.id}: {href}')
		continue

	fic.upsert()

	chapterTitles = []
	if fic.chapterCount > 1:
		chapterSelect = soup.find(id='chap_select')
		chapterOptions = []
		if chapterSelect is not None:
			chapterOptions = chapterSelect.findAll('option')
		chapterTitles = [co.getText().strip() for co in chapterOptions]

	for cid in range(fic.chapterCount):
		ch = fic.chapter(cid + 1)
		ch.localChapterId = str(cid + 1)
		# NOTE(review): unlike the ffnet variant, ch.url is not set here --
		# presumably filled elsewhere; confirm
		if len(chapterTitles) > cid:
			ch.title = util.cleanChapterTitle(chapterTitles[cid], cid + 1)
		elif fic.chapterCount == 1 and cid == 0:
			ch.title = fic.title
		ch.upsert()

	return fic