def getCurrentInfo(self, fic: Fic) -> Fic:
	"""Re-scrape the story page and refresh this fic's metadata."""
	storyUrl = self.constructUrl(fic.localId)
	# scrape fresh info and archive the raw page for later inspection
	page = scrape.scrape(storyUrl)
	raw = page['raw']
	edumpContent('<!-- {} -->\n{}'.format(storyUrl, raw), 'sugarquill_ec')
	return self.parseInfoInto(fic, raw)
def getCurrentInfo(self, fic: Fic) -> Fic:
	"""Re-scrape the story page (with a politeness delay) and refresh fic info."""
	storyUrl = self.constructUrl(fic.localId)
	# scrape fresh info
	page = scrape.scrape(storyUrl)
	# rate-limit between requests to the archive
	time.sleep(self.baseDelay)
	raw = page['raw']
	edumpContent('<!-- {} -->\n{}'.format(storyUrl, raw), 'hpffa_ec')
	return self.parseInfoInto(fic, raw)
def extractContent(self, fic: Fic, html: str) -> str:
	"""Return the chapter-content div (name="Normal") from a raw chapter page."""
	from bs4 import BeautifulSoup  # type: ignore
	doc = BeautifulSoup(html, 'html.parser')
	content = doc.find('div', {'name': 'Normal'})
	if content is None:
		# dump the page so the markup change can be inspected later
		edumpContent(html, 'fa_ec')
		raise Exception('unable to find normalDiv, e-dumped')
	return str(content)
def getCurrentInfo(self, fic: Fic) -> Fic:
	"""Scrape the table of contents, rebuild fic metadata, and persist it."""
	fic.url = self.constructUrl(fic.localId)
	tocUrl = self.tocUrl
	page = scrape.scrape(tocUrl)
	edumpContent('<!-- {} -->\n{}'.format(tocUrl, page['raw']), 'wavesarisen_ec')
	fic = self.parseInfoInto(fic, page['raw'])
	fic.upsert()
	# re-read from the db so the caller gets the persisted row
	return Fic.lookup((fic.id, ))
def extractContent(self, fic: Fic, html: str) -> str:
	"""Extract the story-text row from the page's fixed five-table layout."""
	from bs4 import BeautifulSoup  # type: ignore
	doc = BeautifulSoup(html, 'html5lib')
	tables = doc.findAll('table', {'width': '100%'})
	if len(tables) != 5:
		# layout changed; dump for inspection and bail
		edumpContent(html, 'aff')
		raise Exception('table count mismatch: {}'.format(len(tables)))
	# the third full-width table holds the fic; its sixth row is the text
	rows = tables[2].findAll('tr')
	return str(rows[5])
def getRealAuthorPost(self, fic: 'Fic') -> Any:
	"""Fetch the fic's forum thread and return its first (author) post."""
	from bs4 import BeautifulSoup
	url = self.baseUrl + 'threads/' + str(fic.localId)
	html = self.scrapeLike(url)
	doc = BeautifulSoup(html, 'html5lib')
	posts = doc.find_all(self.postContainer, {'class': 'message'})
	if not posts:
		edumpContent(html, 'xen')
		raise Exception(f'error: unable to find author from {url}')
	return posts[0]
def create(self, fic: Fic) -> Fic:
	"""First-time import: scrape the story page, parse it, and persist the fic."""
	fic.url = self.constructUrl(fic.localId)
	# scrape fresh info
	page = scrape.scrape(fic.url)
	raw = page['raw']
	edumpContent(raw, 'sugarquill')
	fic = self.parseInfoInto(fic, raw)
	fic.upsert()
	# return the persisted row
	return Fic.lookup((fic.id, ))
def extractContent(self, fic: Fic, html: str) -> str:
	"""Return the #chapters element with boilerplate headings stripped."""
	from bs4 import BeautifulSoup
	doc = BeautifulSoup(html, 'html.parser')
	chapters = doc.find(id='chapters')
	if chapters is None:
		edumpContent(html, 'ao3_ec')
		raise Exception('unable to find chapters, e-dumped')
	# delete 'Notes' and 'Chapter Text' headings
	headings = chapters.find_all('h3', {'class': 'heading'})
	for heading in headings:
		heading.extract()
	return str(chapters)
def create(self, fic: Fic) -> Fic:
	"""First-time import: scrape (rate-limited), parse, and persist the fic."""
	fic.url = self.constructUrl(fic.localId)
	# scrape fresh info
	page = scrape.scrape(fic.url)
	# politeness delay between archive requests
	time.sleep(self.baseDelay)
	raw = page['raw']
	edumpContent(raw, 'hpffa')
	fic = self.parseInfoInto(fic, raw)
	fic.upsert()
	return Fic.lookup((fic.id, ))
def create(self, fic: Fic) -> Fic:
	"""First-time import: scrape past the adult gate, parse, and cache chapter 1."""
	fic.url = self.baseUrl + str(fic.localId)
	# scrape fresh info, forcing past the adult-content interstitial
	adultUrl = fic.url.split('?')[0] + '?view_adult=true'
	page = scrape.scrape(adultUrl)
	raw = page['raw']
	edumpContent(raw, 'ao3')
	fic = self.parseInfoInto(fic, raw)
	fic.upsert()
	# the scraped page already contains chapter 1's text; cache it now
	chapter = fic.chapter(1)
	chapter.setHtml(raw)
	chapter.upsert()
	return Fic.lookup((fic.id, ))
def extractContent(self, fic: Fic, html: str) -> str:
	"""Locate the 'Story' block inside #mainpage and return its content div."""
	from bs4 import BeautifulSoup  # type: ignore
	doc = BeautifulSoup(html, 'html.parser')
	mainpage = doc.find(id='mainpage')
	if mainpage is None:
		edumpContent(html, 'hpffa_ec')
		raise Exception('unable to find mainpage, e-dumped')
	for block in mainpage.findAll('div', {'class': 'block'}):
		title = block.find('div', {'class': 'title'})
		# only the block titled 'Story' carries the chapter text
		if title is None or title.contents[0] != 'Story':
			continue
		content = block.find('div', {'class': 'content'})
		if content is not None:
			return str(content)
	edumpContent(html, 'hpffa_ec')
	raise Exception('unable to find content, e-dumped')
def getPostUpdatedOrPublished(self, post: Any) -> int:
	"""Return the unix timestamp a forum post was last edited or published.

	Handles both old-style and new-style XenForo markup; e-dumps the post
	and raises if no timestamp can be located.
	"""
	# old style xen foro
	messageMeta = post.find_all('div', {'class': 'messageMeta'})
	if len(messageMeta) == 1:
		dt = messageMeta[0].find_all('span', {'class': 'DateTime'})
		ts = None
		if len(dt) == 1:
			# the <span> form carries the full timestamp in its title attr
			dt = dt[0]
			ts = dt.get('title')
		else:
			# fall back to the <abbr> form, whose text is the timestamp
			dt = messageMeta[0].find_all('abbr', {'class': 'DateTime'})
			if len(dt) != 1:
				raise Exception(
					'error: unable to find message meta datetime')
			dt = dt[0]
			ts = dt.get_text()
		tsp = dateutil.parser.parse(ts)
		uts = util.dtToUnix(tsp)
		return uts
	if len(messageMeta) > 1:
		# NOTE(review): message is misleading — meta was found more than
		# once, not missing
		raise Exception('error: unable to find message meta')
	# new xen foro style: prefer the last-edit time over the publish time
	lastEdit = post.find('div', {'class': 'message-lastEdit'})
	if lastEdit is not None:
		t = lastEdit.find('time')
		return int(t.get('data-time'))
	postPublish = post.find('div', {'class': 'message-attribution-main'})
	if postPublish is not None:
		t = postPublish.find('time')
		return int(t.get('data-time'))
	postPublish = post.find('header', {'class': 'message-attribution'})
	if postPublish is not None:
		t = postPublish.find('time')
		return int(t.get('data-time'))
	edumpContent(str(post), 'xen_post' + util.randomString())
	raise Exception('unable to find post update or publish ts')
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
	"""Parse a Sugar Quill story page (plus the author's profile page)
	into fic metadata, cache every chapter, and return the updated fic.

	Raises on pages missing the expected markup; suspect fragments are
	e-dumped first so the failure can be inspected later.
	"""
	from bs4 import BeautifulSoup
	html = html.replace('\r\n', '\n')
	soup = BeautifulSoup(html, 'html.parser')
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	infoPane = soup.findAll('td', {'class': 'info2_pane'})
	if len(infoPane) != 1:
		raise Exception('unable to find info2_pane: {}'.format(fic.url))
	infoPane = infoPane[0]
	# the author is the profile link inside the info pane
	authorHrefPrefix = 'index.php?action=profile&id='
	authorLinks = infoPane.findAll('a')
	authorUrl = None
	for authorLink in authorLinks:
		if not authorLink.get('href').startswith(authorHrefPrefix):
			continue
		authorUrl = self.baseUrl + '/' + authorLink.get('href')
		author = authorLink.getText()
		authorLocalId = authorLink.get('href')[len(authorHrefPrefix):]
		self.setAuthor(fic, author, authorUrl, authorLocalId)
		break
	else:
		raise Exception('unable to find author: {}'.format(fic.url))
	# the title sits between the 'Story' and 'Chapter' labels in the pane
	titleMatch = re.search(
		'<b>Story</b>:((.|\r|\n)*)<b>Chapter</b>:', str(infoPane),
		re.MULTILINE
	)
	if titleMatch is None:
		edumpContent(str(infoPane), 'sugarquill_title')
		raise Exception('could not locate title')
	# NOTE(review): the first replace argument appears to be a non-breaking
	# space (U+00A0) collapsed to a plain space — confirm
	fic.title = titleMatch.group(1).replace(' ', ' ').strip()
	# one <option> per chapter in the chapter-select dropdown
	chapterOptions = infoPane.findAll('option')
	chapterTitles = {}
	for chapterOption in chapterOptions:
		cid = int(chapterOption.get('value'))
		chapterTitles[cid] = chapterOption.getText().strip()
	fic.chapterCount = len(chapterOptions)
	fic.ageRating = '<unkown>'  # TODO
	fic.favoriteCount = 0
	fic.followCount = 0
	fic.ficStatus = FicStatus.ongoing  # TODO: no uniform way to detect?
	# description, review count, and update date come from the author's
	# profile page, which lists each story in a 90%-width table
	authorProfileHtml = scrape.scrape(authorUrl)['raw']
	authorProfileHtml = authorProfileHtml.replace('\r', '')
	authorSoup = BeautifulSoup(authorProfileHtml, 'html5lib')
	storyTables = authorSoup.findAll('table', {'width': '90%'})
	ourStoryTable = None
	for storyTable in storyTables:
		storyId = None
		for a in storyTable.findAll('a'):
			if not a.get('href').startswith('read.php?storyid='):
				continue
			storyId = a.get('href')[len('read.php?storyid='):]
			storyId = storyId[:storyId.find('&')]
			storyId = str(int(storyId))
		if storyId is None:
			continue
		if storyId != str(fic.localId):
			continue
		ourStoryTable = storyTable
	if ourStoryTable is None:
		raise Exception(f'unable to find story table: {fic.localId} {authorUrl}')
	trs = ourStoryTable.findAll('tr')
	if len(trs) != 3:
		raise Exception(
			f'ourStoryTable does not have 3 trs: {fic.localId} {authorUrl}'
		)
	fic.description = trs[1].find('td').getText().strip()
	reviewsMatch = re.search(
		'\( Reviews: <a[^>]*>(\\d+)</a> \)</td>', str(trs[0]), re.MULTILINE
	)
	if reviewsMatch is None:
		edumpContent(str(trs[0]), 'sugarquill_reviews')
		raise Exception('could not locate reviews')
	fic.reviewCount = int(reviewsMatch.group(1).strip())
	updatedMatch = re.search('Last updated (\\d+/\\d+/\\d+)', str(trs[2]))
	if updatedMatch is None:
		edumpContent(str(trs[2]), 'sugarquill_updated')
		raise Exception('could not locate last updated')
	fic.updated = OilTimestamp(
		util.parseDateAsUnix(updatedMatch.group(1), fic.fetched)
	)
	if fic.published is None:
		fic.published = fic.updated
	# word count is accumulated from every cached chapter's html
	fic.wordCount = 0
	fic.upsert()
	for cid in range(fic.chapterCount):
		ch = fic.chapter(cid + 1)
		ch.localChapterId = str(cid + 1)
		ch.title = chapterTitles[cid + 1]
		ch.cache()
		ch.upsert()
		chtml = ch.html()
		if chtml is not None:
			fic.wordCount += len(chtml.split())
	fic.add(Fandom.define('Harry Potter'))
	# TODO: chars/relationship?
	return fic
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
	"""Parse a SIYE story page into fic metadata and return the updated fic.

	Fills title, author, summary, age rating, chapter count, completion
	status, published/updated dates, and word count.  Raises when required
	metadata cannot be located; the summary failure e-dumps the page first.
	"""
	from bs4 import BeautifulSoup
	html = html.replace('\r\n', '\n')
	soup = BeautifulSoup(html, 'html.parser')
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	w95tables = soup.findAll('table', {'width': '95%'})
	if len(w95tables) != 3:
		raise Exception('wrong number of w95 tables: {}'.format(
			len(w95tables)))
	# the first 95%-width table holds the story header; title is its <h3>
	ficInfoTable = w95tables[0]
	ficTitleH3 = ficInfoTable.find('h3')
	fic.title = ficTitleH3.get_text().strip()
	authorUrlMatch = re.search(r'"viewuser.php\?uid=(\d+)">([^<]*)<', html)
	if authorUrlMatch is None:
		raise Exception('could not locate author url')
	author = authorUrlMatch.group(2)
	authorId = authorUrlMatch.group(1)
	authorUrl = self.baseUrl + '/viewuser.php?uid=' + authorId
	self.setAuthor(fic, author, authorUrl, authorId)
	# TODO: this may miss multiline summaries :(
	summaryMatch = re.search(
		'<b>Summary:</b>((.|\r|\n)*)<b>Hitcount: </b>', html, re.MULTILINE)
	if summaryMatch is None:
		edumpContent(html, 'siye_summary')
		raise Exception('could not locate summary')
		# alternatively: fic.description = "{no summary}" ?
	fic.description = summaryMatch.group(1).strip()
	fic.ageRating = '<unkown>'  # [sic] sentinel spelling used project-wide
	ageRatingMatch = re.search('<b>Rating:</b>(.*)<br>', html)
	if ageRatingMatch is not None:
		fic.ageRating = ageRatingMatch.group(1).strip()
	# count chapters by scanning chapter links; a 'chapter=Array' href
	# marks a single-chapter story
	maxChapter = 0
	baseChapterHref = 'viewstory.php?sid={}&chapter='.format(fic.localId)
	singleChapterHref = 'viewstory.php?sid={}&chapter=Array'.format(
		fic.localId)
	isSingleChapterFic = False
	for a in soup.find_all('a'):
		href = a.get('href')
		if href is None:
			continue
		if not href.startswith(baseChapterHref):
			continue
		if href.startswith(singleChapterHref):
			isSingleChapterFic = True
			maxChapter = max(1, maxChapter)
			continue
		cid = int(href[len(baseChapterHref):])
		maxChapter = max(cid, maxChapter)
	fic.chapterCount = maxChapter
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	fic.ficStatus = FicStatus.ongoing
	# bugfix: str.find returns -1 (truthy) when the text is absent, so the
	# old `if html.find('Story is Complete'):` marked nearly every story
	# complete; a membership test is the correct check
	if 'Story is Complete' in html:
		fic.ficStatus = FicStatus.complete
	# published = earliest 'updated on' date seen, updated = latest
	updatedOnPattern = re.compile(r'updated on (\d+).(\d+).(\d+)')
	minUpdate = util.parseDateAsUnix(int(time.time()), fic.fetched)
	maxUpdate = util.parseDateAsUnix('1970/01/01', fic.fetched)
	for (year, month, day) in re.findall(updatedOnPattern, html):
		date = '{}/{}/{}'.format(year, month, day)
		dt = util.parseDateAsUnix(date, fic.fetched)
		minUpdate = min(minUpdate, dt)
		maxUpdate = max(maxUpdate, dt)
	if fic.published is None or fic.published.toUTS() > minUpdate:
		fic.published = OilTimestamp(minUpdate)
	if fic.updated is None or fic.updated.toUTS() < maxUpdate:
		fic.updated = OilTimestamp(maxUpdate)
	if fic.updated < fic.published:
		fic.updated = fic.published
	# word count: sum every '<n> words' fragment on the page
	fic.wordCount = 0
	wordsPattern = re.compile(r'(\d+) words')
	for words in re.findall(wordsPattern, html):
		fic.wordCount += int(words)
	if fic.wordCount == 0 and isSingleChapterFic:
		# best-effort fallback: cache chapter 1 and count its words;
		# failures here must not abort the whole parse
		try:
			fic.upsert()
			ch1 = fic.chapter(1)
			ch1.cache()
			chtml = ch1.html()
			if chtml is not None:
				fic.wordCount = len(chtml.split())
		except Exception:
			pass
	fic.add(Fandom.define('Harry Potter'))  # TODO: chars/relationship?
	return fic