Пример #1
0
    def updateTitle(self, fic: Fic) -> None:
        """Strip status/rating markers out of ``fic.title`` and persist the fic.

        For every ``(open, close)`` pair in ``self.containers``:

        * the first case-insensitive occurrence of ``open + 'complete' + close``
          and of ``open + 'completed' + close`` is removed from the title and
          ``fic.ficStatus`` is set to ``FicStatus.complete``;
        * the first case-insensitive occurrence of ``open + 'nsfw' + close`` is
          removed and ``fic.ageRating`` is set to ``'M'``.

        The remaining title is handed to ``self.cleanTitle``; its result is read
        as ``(title, fandoms, tags)``. Each fandom and tag is attached to the
        fic, and finally ``fic.upsert()`` is called.

        Does nothing at all (no upsert either) when ``fic.title`` is ``None``.
        """
        # function-scope import so this method does not depend on file-level imports
        import re

        if fic.title is None:
            return

        def removeTag(title, tag):
            """Return (title without the first `tag`, whether `tag` was found)."""
            # Search the ORIGINAL title instead of a lowercased copy:
            # str.lower() may change the length of the string (e.g. 'İ' turns
            # into two code points), so offsets found in the copy can point at
            # the wrong characters of the original.
            found = re.search(re.escape(tag), title, re.IGNORECASE | re.ASCII)
            if found is not None:
                title = title[:found.start()] + title[found.end():]
            # whitespace is tidied on every pass, whether or not the tag matched
            title = title.strip().replace('  ', ' ')
            return title, found is not None

        # look for Complete tag in the title
        for cont in self.containers:
            for completeTag in ('complete', 'completed'):
                fic.title, hit = removeTag(fic.title,
                                           cont[0] + completeTag + cont[1])
                if hit:
                    fic.ficStatus = FicStatus.complete

        # strip '[nsfw]' tag from anywhere in title
        for cont in self.containers:
            fic.title, hit = removeTag(fic.title, cont[0] + 'nsfw' + cont[1])
            if hit:
                fic.ageRating = 'M'  # TODO?

        cleanedTitle, fandoms, tags = self.cleanTitle(fic.title)[:3]
        fic.title = cleanedTitle
        for fan in fandoms:
            fic.add(Fandom.define(fan))
        for tag in tags:
            fic.add(Tag.define(tag))
        fic.upsert()
Пример #2
0
    def parseInfoInto(self, fic: Fic, html: str) -> Fic:
        from bs4 import BeautifulSoup
        html = html.replace('\r\n', '\n')
        soup = BeautifulSoup(html, 'html.parser')

        # wooh hardcoding
        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")

        fic.title = 'The Waves Arisen'
        fic.ageRating = 'M'

        self.setAuthor(fic, 'wertifloke', 'https://wertifloke.wordpress.com/',
                       str(2))

        # taken from https://www.parahumans.net/about/
        fic.description = '''
A young Naruto found refuge in the village library, and grew up smart, but by blood he is Ninja, and what place is there for curiosity and calculation in this brutal world of warring states?

The Waves Arisen is a complete novel-length work of Rationalist Naruto Fanfiction. No prior knowledge of the Naruto universe is necessary to follow along. '''

        chapterUrls = self.getChapterUrls(html)
        oldChapterCount = fic.chapterCount
        fic.chapterCount = len(chapterUrls)

        # TODO?
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        if fic.ficStatus is None or fic.ficStatus == FicStatus.broken:
            fic.ficStatus = FicStatus.ongoing

        fic.published = self.getChapterPublishDate(chapterUrls[0])
        fic.updated = self.getChapterPublishDate(chapterUrls[-1])

        if oldChapterCount is None or fic.chapterCount > oldChapterCount:
            fic.wordCount = 0
        if fic.wordCount == 0:
            fic.upsert()
            for cid in range(1, fic.chapterCount + 1):
                c = fic.chapter(cid)
                c.cache()
                chtml = c.html()
                if chtml is not None:
                    fic.wordCount += len(chtml.split())

        fic.add(Fandom.define('Naruto'))
        # TODO: chars/relationship?

        return fic
Пример #3
0
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Populate ``fic`` from a XenForo thread page and store its chapters.

        Steps, in order:

        1. Title: taken from the single ``<title>`` element of ``wwwHtml``,
           with ``self.titleSuffix`` removed from the end when present.
        2. Author: read from the first ``a.username`` link inside the post
           returned by ``self.getRealAuthorPost(fic)``; the link must resolve
           to ``{self.baseUrl}members/...``.
        3. Chapter list: built from the threadmarks page when it can be
           scraped and parsed; otherwise from the reader pages
           (``getReaderPostUrls`` / ``getReaderPosts``); otherwise from
           ``getDeepAuthorPostUrls``.
        4. Chapter content: each post is taken from the reader-page soups when
           available, else fetched from its thread page via ``scrapeLike``.
        5. Word count and published/updated timestamps are recomputed from the
           chapter posts, the fic and every chapter are upserted, and
           ``self.updateTitle(fic)`` is called.

        Args:
            fic: the fic to fill in; it is mutated and also returned.
            wwwHtml: html of the thread page.

        Raises:
            Exception: when the title, author, a chapter post, a chapter's
                content, or the updated date cannot be determined.
        """
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(wwwHtml, 'html5lib')

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?
        if fic.ficStatus is None or fic.ficStatus == FicStatus.broken:
            fic.ficStatus = FicStatus.ongoing

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0
        fic.ageRating = 'M'  # TODO?

        # grab title from <title> element
        titles = soup.find('head').find_all('title')
        if len(titles) != 1:
            raise Exception(f'error: cannot find title: {len(titles)}')
        ntitle = ''
        try:
            ntitle = titles[0].get_text()
        except Exception:
            # best effort: keep the empty title, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare except would
            pass  # TODO FIXME
        # an empty new title only replaces the old one when there is no old one
        if fic.title is None or len(ntitle.strip()) > 0:
            fic.title = ntitle
        if len(self.titleSuffix) > 0 and fic.title.endswith(self.titleSuffix):
            fic.title = fic.title[:-len(self.titleSuffix)]
        fic.title = fic.title.strip()

        # determine author
        authorPost = self.getRealAuthorPost(fic)
        authorPostUsernames = authorPost.find_all('a', {'class': 'username'})
        if len(authorPostUsernames) < 1:
            raise Exception('error: unable to find author username')
        author = authorPostUsernames[0].get_text()
        auth_href = authorPostUsernames[0].get('href')
        authorUrl = urllib.parse.urljoin(self.baseUrl, auth_href)
        if not authorUrl.startswith(self.baseUrl):
            raise Exception('error: unknown username href format')
        # what follows baseUrl is expected to look like 'members/<id>/...'
        authorId = authorUrl[len(self.baseUrl):]
        if not authorId.startswith('members/'):
            raise Exception(f'error: unknown author id format: {authorId}')
        authorId = authorId.split('/')[1]
        self.setAuthor(fic, author, authorUrl, authorId)

        if fic.description is None:
            # TODO?
            fic.description = htmlEscape(fic.title + ' by ' +
                                         fic.getAuthorName())

        # try grabbing reader version, fallback to full pages
        threadmarksHtml = None
        try:
            sep = '?' if self.baseUrl.find('?') < 0 else '&'
            url = f'{self.baseUrl}threads/{fic.localId}/threadmarks{sep}category_id=1'
            threadmarksHtml = self.scrapeLike(url)
            self.readerSoftScrape(fic)
        except Exception:
            # note: we do this before the threadmarks check for old-style fics
            # soft scrape all thread pages to ensure we have everything
            self.deepSoftScrape(fic)

        # post id -> soup of that post, as far as the reader pages provided it
        postSoups: Dict[str, Any] = {}

        postUrls: List[str] = []
        # 1-based chapter number -> chapter title
        chapterTitles = {}
        try:
            # scrape the threadmarks page, assuming there is one
            # (threadmarksHtml is still None when the scrape above failed;
            # any error raised here lands in the except branch below)
            threadmarksSoup = BeautifulSoup(threadmarksHtml, 'html5lib')

            # attempt to extract a fic description
            threadmarkExtraInfo = threadmarksSoup.find(
                'div', {'class': 'threadmarkListingHeader-extraInfo'})
            if threadmarkExtraInfo is not None:
                bbWrapper = threadmarkExtraInfo.find('div',
                                                     {'class': 'bbWrapper'})
                if bbWrapper is not None:
                    desc = bbWrapper.decode_contents()
                    descView = HtmlView(desc, markdown=False)
                    fic.description = ''.join(
                        [f'<p>{l}</p>' for l in descView.text])

            # determine chapter count based on threadmarks
            threadmarkList = threadmarksSoup.find('div',
                                                  {'class': 'threadmarkList'})
            threadmarks = None
            if threadmarkList is not None:
                threadmarks = threadmarkList.find_all(
                    'li', {'class': 'threadmarkListItem'})
            else:
                # alternative markup: block-body--threadmarkBody with li or tr
                threadmarkList = threadmarksSoup.find(
                    'div', {'class': 'block-body--threadmarkBody'})
                if threadmarkList is None:
                    raise Exception('error: unable to find threadmark menu')
                if threadmarkList.find(class_='fa-ellipsis-h') is not None:
                    raise Exception('unable to handle elided threamdarks')
                threadmarks = threadmarkList.find_all('li')
                if len(threadmarks) == 0:
                    threadmarks = threadmarkList.find_all('tr')
                util.logMessage(
                    f'XenForo|new threadmarks count|{len(threadmarks)}')

            for threadmark in threadmarks:
                # entries carrying a message-newIndicator span are skipped
                if threadmark.find(
                        'span', {'class': 'message-newIndicator'}) is not None:
                    continue
                a = threadmark.find('a')
                purl = a.get('href')
                # turn site-relative links into absolute ones
                if purl.startswith('threads/'):
                    purl = '{}{}'.format(self.baseUrl, purl)
                elif purl.startswith('/threads/'):
                    purl = '{}{}'.format(self.baseUrl, purl[1:])
                postUrls += [purl]

                chapterTitles[len(postUrls)] = a.getText().strip()

            try:
                postSoups, _ = self.getReaderPosts(fic)
            except Exception as ie:
                # FIXME oh boy:
                # https://forum.questionablequesting.com/threads/worm-cyoa-things-to-do-in-brockton-bay-when-youre-a-bored-demigod.1247/reader
                # Reader page says 36 threadmarks, but actual threadmark list says 33
                # First reader page abruptly stops at 27 threadmarks
                util.logMessage(
                    'XenForoAdapter: unable to getReaderPosts: {}\n{}'.format(
                        ie, traceback.format_exc()))
        except Exception as e:
            util.logMessage(
                'XenForoAdapter: unable to parse threadmarks: {}\n{}'.format(
                    e, traceback.format_exc()))
            try:
                postUrls = self.getReaderPostUrls(fic)
                postSoups, chapterTitles = self.getReaderPosts(fic)
            except Exception as ie:
                util.logMessage(
                    'XenForoAdapter: unable to parse reader posts: {}\n{}'.
                    format(ie, traceback.format_exc()))
                postUrls = self.getDeepAuthorPostUrls(fic)
                # if we fallback to here, don't immediately setup postSoups at all;
                # they'll be fetched as needed later

        fic.chapterCount = len(postUrls)

        # three parallel lists, one entry per chapter, in postUrls order
        chapterPosts: List[Optional[str]] = []
        chapterUrls: List[str] = []
        chapterPostIds: List[str] = []

        # one-entry cache so consecutive posts on the same page share a fetch
        lastSoupUrl: Optional[str] = None
        lastSoup: Optional[Any] = None

        for purl in postUrls:
            parts = purl.split('#')
            burl = parts[0]
            # a url without a '#fragment' falls back to the author post's id
            postId = authorPost.get('id') if len(parts) < 2 else parts[1]

            rawPost = None
            # first try getting the post from the reader pages
            if postId in postSoups and postSoups[postId] is not None:
                rawPost = str(postSoups[postId])
            else:
                # if needed, fallback to grabbing that page from the entire thread
                pageSoup = None
                if lastSoupUrl is not None and lastSoupUrl == burl:
                    pageSoup = lastSoup
                else:
                    pageContent = self.scrapeLike(burl)
                    pageSoup = BeautifulSoup(pageContent, 'html5lib')
                    lastSoupUrl = burl
                    lastSoup = pageSoup
                assert (pageSoup is not None)
                if postId is not None:
                    poss = pageSoup.find_all(self.postContainer,
                                             {'id': postId})
                    if len(poss) != 1:
                        # XenForo2 often has js- prefixed on the actual id attr
                        poss = pageSoup.find_all(self.postContainer,
                                                 {'id': 'js-' + postId})
                    if len(poss) != 1:
                        raise Exception(
                            f'error: cannot find post for chapter {postId}')
                    rawPost = str(poss[0])
                else:
                    # no id at all: take the first 'message' container on the page
                    rawPost = str(
                        pageSoup.find_all(self.postContainer,
                                          {'class': 'message'})[0])

            chapterPosts += [rawPost]
            chapterUrls += [burl]
            chapterPostIds += [postId]

        # these are rebuilt from scratch out of the chapter posts
        fic.wordCount = 0
        fic.published = None
        fic.updated = None

        chapterContents: List[str] = []
        for rawPost in chapterPosts:
            post = BeautifulSoup(rawPost, 'html5lib')
            content = post.find_all(
                'div', {'class': ['messageContent', 'message-content']})
            if len(content) != 1:
                raise Exception('error: cannot find content for chapter post')
            content = content[0]

            # put a <br> in front of every "last edited" footer
            lastEditedDivs = content.find_all('div',
                                              {'class': 'message-lastEdit'})
            for lastEditedDiv in lastEditedDivs:
                br = soup.new_tag("br")
                lastEditedDiv.insert_before(br)

            chapterContents += [str(content)]
            fic.wordCount += len(str(content).split())

            uts = self.getPostUpdatedOrPublished(post)

            # first post sets published; every post overwrites updated, so the
            # last post in postUrls order wins
            if fic.published is None:
                fic.published = OilTimestamp(uts)
            fic.updated = OilTimestamp(uts)

        if fic.updated is None:
            raise Exception(
                f'unable to determine updated date: {len(chapterPosts)} {len(postUrls)}'
            )

        fic.upsert()
        for cid in range(fic.chapterCount):
            chapter = fic.chapter(cid + 1)
            chapter.url = chapterUrls[cid]
            chapter.localChapterId = chapterPostIds[cid]
            if (cid + 1) in chapterTitles:
                chapter.title = chapterTitles[(cid + 1)]
            chapter.upsert()

            chapter.setHtml(str(chapterContents[cid]))

        # TODO: word count, published, updated can only be found once all chapters

        # each post is inside an li id="post-{number}" class="message"
        # each post has data-author="{author}"

        self.updateTitle(fic)

        return fic
Пример #4
0
	def parseInfoInto(self, fic: Fic, html: str) -> Fic:
		"""Populate ``fic`` from a story page plus the author's profile page.

		From ``html`` (the story page) the single ``td.info2_pane`` yields the
		author link, the story title and the chapter ``<option>`` list. The
		author's profile page is then fetched with ``scrape.scrape`` to find
		the story's table, which provides description, review count and the
		last-updated date. Finally every chapter is cached and upserted, and
		the word count is summed over the chapters' html.

		Args:
			fic: the fic to fill in; it is mutated and also returned.
			html: html of the story page.

		Raises:
			Exception: when the info pane, author link, title, story table,
				review count or last-updated date cannot be located.
		"""
		from bs4 import BeautifulSoup
		html = html.replace('\r\n', '\n')
		soup = BeautifulSoup(html, 'html.parser')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		infoPane = soup.findAll('td', {'class': 'info2_pane'})
		if len(infoPane) != 1:
			raise Exception('unable to find info2_pane: {}'.format(fic.url))
		infoPane = infoPane[0]

		authorHrefPrefix = 'index.php?action=profile&id='
		authorLinks = infoPane.findAll('a')
		authorUrl = None
		for authorLink in authorLinks:
			authorHref = authorLink.get('href')
			# anchors without an href (get() returns None) cannot be the author
			if authorHref is None or not authorHref.startswith(authorHrefPrefix):
				continue

			authorUrl = self.baseUrl + '/' + authorHref
			author = authorLink.getText()
			authorLocalId = authorHref[len(authorHrefPrefix):]

			self.setAuthor(fic, author, authorUrl, authorLocalId)
			break
		else:
			# loop finished without hitting `break`: no profile link found
			raise Exception('unable to find author: {}'.format(fic.url))

		titleMatch = re.search(
			'<b>Story</b>:((.|\r|\n)*)<b>Chapter</b>:', str(infoPane), re.MULTILINE
		)
		if titleMatch is None:
			edumpContent(str(infoPane), 'sugarquill_title')
			raise Exception('could not locate title')

		fic.title = titleMatch.group(1).replace('&nbsp;', ' ').strip()

		# option value (as int) -> chapter title
		chapterOptions = infoPane.findAll('option')
		chapterTitles = {}
		for chapterOption in chapterOptions:
			cid = int(chapterOption.get('value'))
			chapterTitles[cid] = chapterOption.getText().strip()
		fic.chapterCount = len(chapterOptions)

		fic.ageRating = '<unkown>'  # TODO
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ficStatus = FicStatus.ongoing  # TODO: no uniform way to detect?

		authorProfileHtml = scrape.scrape(authorUrl)['raw']
		authorProfileHtml = authorProfileHtml.replace('\r', '')
		authorSoup = BeautifulSoup(authorProfileHtml, 'html5lib')

		storyHrefPrefix = 'read.php?storyid='
		storyTables = authorSoup.findAll('table', {'width': '90%'})
		ourStoryTable = None
		for storyTable in storyTables:
			storyId = None
			for a in storyTable.findAll('a'):
				storyHref = a.get('href')
				if storyHref is None or not storyHref.startswith(storyHrefPrefix):
					continue
				# keep only the id, dropping any '&...' query remainder; unlike
				# slicing at find('&'), this stays correct when there is no '&'
				storyId = storyHref[len(storyHrefPrefix):].split('&')[0]
				storyId = str(int(storyId))
			if storyId is None:
				continue
			if storyId != str(fic.localId):
				continue
			# no break: if several tables match, the last one is used
			ourStoryTable = storyTable
		if ourStoryTable is None:
			raise Exception(f'unable to find story table: {fic.localId} {authorUrl}')

		# the code below reads row 0 for reviews, row 1 for the description
		# and row 2 for the last-updated date
		trs = ourStoryTable.findAll('tr')
		if len(trs) != 3:
			raise Exception(
				f'ourStoryTable does not have 3 trs: {fic.localId} {authorUrl}'
			)

		fic.description = trs[1].find('td').getText().strip()

		# raw string: same pattern as before, without invalid '\(' escapes
		reviewsMatch = re.search(
			r'\( Reviews: <a[^>]*>(\d+)</a> \)</td>', str(trs[0]), re.MULTILINE
		)
		if reviewsMatch is None:
			edumpContent(str(trs[0]), 'sugarquill_reviews')
			raise Exception('could not locate reviews')

		fic.reviewCount = int(reviewsMatch.group(1).strip())

		updatedMatch = re.search('Last updated (\\d+/\\d+/\\d+)', str(trs[2]))
		if updatedMatch is None:
			edumpContent(str(trs[2]), 'sugarquill_updated')
			raise Exception('could not locate last updated')

		fic.updated = OilTimestamp(
			util.parseDateAsUnix(updatedMatch.group(1), fic.fetched)
		)
		# no separate publish date is parsed here; fall back to the update date
		if fic.published is None:
			fic.published = fic.updated

		fic.wordCount = 0
		fic.upsert()

		for cid in range(fic.chapterCount):
			ch = fic.chapter(cid + 1)
			ch.localChapterId = str(cid + 1)
			# presumably option values run 1..chapterCount; a gap would raise
			# KeyError here -- verify against the site's markup
			ch.title = chapterTitles[cid + 1]
			ch.cache()
			ch.upsert()
			chtml = ch.html()
			if chtml is not None:
				fic.wordCount += len(chtml.split())

		fic.add(Fandom.define('Harry Potter'))
		# TODO: chars/relationship?

		return fic
Пример #5
0
	def parseInfoInto(self, fic: Fic, html: str) -> Fic:
		from bs4 import BeautifulSoup
		soup = BeautifulSoup(html, 'html.parser')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		titleHeadings = soup.findAll('h2', {'class': 'title heading'})
		if len(titleHeadings) != 1:
			raise Exception('unable to find ao3 title {}'.format(fic.url))
		fic.title = titleHeadings[0].get_text().strip()

		summaryModules = soup.findAll('div', {'class': 'summary module'})
		if len(summaryModules) != 1:
			prefaceGroups = soup.findAll('div', {'class': 'preface group'})
			if len(prefaceGroups) == 1:
				summaryModules = prefaceGroups[0].findAll(
					'div', {'class': 'summary module'}
				)

		if len(summaryModules) == 1:
			summaryBq = summaryModules[0].find('blockquote')
			fic.description = summaryBq.decode_contents(formatter='html').strip()
		elif fic.description is None:
			fic.description = "{no summary}"
			# raise Exception('unable to find ao3 summary {}'.format(fic.localId))

		fic.ageRating = '<unkown>'

		# TODO: error handling
		cText = ' '.join(soup.find('dd', {'class': 'chapters'}).contents).strip()
		ps = cText.split('/')
		completedChapters = int(ps[0])
		totalChapters = None if ps[1] == '?' else int(ps[1])
		fic.chapterCount = completedChapters

		wText = ' '.join(soup.find('dd', {'class': 'words'}).contents).strip()
		fic.wordCount = int(wText)

		fic.reviewCount = 0

		fic.favoriteCount = 0
		kDefinition = soup.find('dd', {'class': 'kudos'})
		if kDefinition is not None:
			kText = ' '.join(kDefinition.contents).strip()
			fic.favoriteCount = int(kText)

		fic.followCount = 0

		pText = ' '.join(soup.find('dd', {'class': 'published'}).contents).strip()
		publishedUts = util.parseDateAsUnix(pText, fic.fetched)
		fic.published = OilTimestamp(publishedUts)

		if fic.updated is None:
			fic.updated = fic.published
		if fic.updated is not None:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		fic.ficStatus = FicStatus.ongoing  # TODO chapter/chapters?

		if totalChapters is None or completedChapters < totalChapters:
			fic.ficStatus = FicStatus.ongoing

		statusDt = soup.find('dt', {'class': 'status'})
		if statusDt is not None:
			if statusDt.contents[0] == 'Completed:':
				fic.ficStatus = FicStatus.complete
				cText = ' '.join(soup.find('dd', {'class': 'status'}).contents).strip()
				updatedUts = util.parseDateAsUnix(cText, fic.fetched)
				fic.updated = OilTimestamp(updatedUts)
			elif statusDt.contents[0] == 'Updated:':
				fic.ficStatus = FicStatus.ongoing
				uText = ' '.join(soup.find('dd', {'class': 'status'}).contents).strip()
				updatedUts = util.parseDateAsUnix(uText, fic.fetched)
				fic.updated = OilTimestamp(updatedUts)
			else:
				raise Exception('unkown status: {}'.format(statusDt.contents[0]))

		byline = soup.find('h3', {'class': 'byline heading'})
		authorLink = byline.find('a')
		if authorLink is None:
			if fic.authorId is not None and len(fic.getAuthorName()) > 0:
				pass  # updated author to anon, don't make changes
			else:
				# first loaded after it was already set to anonymous
				authorUrl = ''
				author = 'Anonymous'
				authorId = 'Anonymous'
				self.setAuthor(fic, author, authorUrl, authorId)
		else:
			authorUrl = authorLink.get('href')
			author = ' '.join(byline.find('a').contents)
			authorId = author  # map pseudo to real?
			self.setAuthor(fic, author, authorUrl, authorId)

		if fic.chapterCount > 1:
			fic.upsert()
			localChapterIdSelect = soup.find(id='selected_id').findAll('option')
			# note: ao3 sometimes says there are less chapters than there really
			# are, possibly due to caching on their end. We just ensure there's _at
			# least_ chapterCount chapters, then fetch whatever the dropdown tells
			# us to
			if len(localChapterIdSelect) > fic.chapterCount:
				fic.chapterCount = len(localChapterIdSelect)
				fic.upsert()
			if len(localChapterIdSelect) != fic.chapterCount:
				raise Exception('mismatching localChapterId count?')

			for cid in range(1, fic.chapterCount + 1):
				chap = fic.chapter(cid)
				chap.url = '{}{}/chapters/{}?view_adult=true'.format(
					self.baseUrl, fic.localId, localChapterIdSelect[cid - 1].get('value')
				)
				chap.localChapterId = localChapterIdSelect[cid - 1].get('value')
				chap.title = localChapterIdSelect[cid - 1].getText().strip()
				if chap.title is not None:
					chap.title = util.cleanChapterTitle(chap.title, cid)
				chap.upsert()

		fandomDd = soup.find('dd', {'class': 'fandom tags'})
		if fandomDd is not None:
			fandomTags = fandomDd.findAll('a', {'class': 'tag'})
			for ft in fandomTags:
				originalF = ft.contents[0].strip()
				f = originalF.lower()
				# TODO: this seriously needs reworked
				if (
					(f.startswith("harry potter ") and f.endswith("rowling"))
					or f == 'harry potter - fandom'
					or f == 'fantastic beasts and where to find them (movies)'
					or f == 'harry potter next generation - fandom'
				):
					fic.add(Fandom.define('Harry Potter'))
				elif (
					f == 'sherlock - fandom' or f == 'sherlock (tv)'
					or f == 'sherlock holmes & related fandoms'
					or f == 'sherlock holmes - arthur conan doyle'
					or f == 'sherlock holmes (downey films)'
				):
					fic.add(Fandom.define('Sherlock Holmes'))
				elif f == 'furry (fandom)' or f == 'harry - fandom':
					continue  # skip
				elif f == 'fleurmione - fandom':
					continue  # skip
				elif f == 'skyfall (2012) - fandom':
					fic.add(Fandom.define('James Bond'))
				elif f == 'orphan black (tv)':
					fic.add(Fandom.define('Orphan Black'))
				elif (
					f == 'naruto' or f == 'naruto shippuden'
					or f == 'naruto shippuuden - fandom'
				):
					fic.add(Fandom.define('Naruto'))
				elif f == 'naruto/harry potter':
					fic.add(Fandom.define('Naruto'))
					fic.add(Fandom.define('Harry Potter'))
				elif f == 'bleach':
					fic.add(Fandom.define('Bleach'))
				elif (
					f == 'iron man (movies)' or f == 'iron man - all media types'
					or f == 'iron man (comic)' or f == 'iron man - fandom'
					or f == 'iron man (comics)'
				):
					fic.add(Fandom.define('Iron Man'))
				elif (
					f == 'the avengers (marvel) - all media types'
					or f == 'the avengers (marvel movies)'
					or f == 'the avengers - ambiguous fandom'
					or f == 'the avengers (2012)' or f == 'the avengers'
					or f == 'avengers (marvel) - all media types'
					or f == 'marvel avengers movies universe' or f == 'avengers'
				):
					fic.add(Fandom.define('Avengers'))
				elif f == 'marvel 616':
					fic.add(Fandom.define('Marvel'))
					fic.add(Fandom.define('Marvel 616'))
				elif f == 'thor (movies)' or f == 'thor - all media types':
					fic.add(Fandom.define('Thor'))
				elif (
					f == 'captain america (movies)'
					or f == 'captain america - all media types'
					or f == 'captain america (comics)'
				):
					fic.add(Fandom.define('Captain America'))
				elif (
					f == 'avatar: the last airbender' or f == 'avatar: legend of korra'
					or f == 'avatar the last airbender - fandom'
				):
					fic.add(Fandom.define('Avatar'))
				elif f == 'original work':
					fic.add(Fandom.define('Original Work'))
				elif f == 'stargate atlantis':
					fic.add(Fandom.define('Stargate Atlantis'))
				elif f == 'stargate sg-1':
					fic.add(Fandom.define('Stargate SG-1'))
				elif f == 'stargate - all series':
					fic.add(Fandom.define('Stargate Atlantis'))
					fic.add(Fandom.define('Stargate SG-1'))
				elif f == 'agents of s.h.i.e.l.d. (tv)':
					fic.add(Fandom.define('Avengers'))
				elif f == 'supernatural':
					fic.add(Fandom.define('Supernatural'))
				elif f == 'teen wolf (tv)':
					fic.add(Fandom.define('Teen Wolf'))
				elif f == 'grimm (tv)':
					fic.add(Fandom.define('Grimm'))
				elif (
					f == 'the amazing spider-man (movies - webb)'
					or f == 'spider-man - all media types'
					or f == 'spider-man: homecoming (2017)'
				):
					fic.add(Fandom.define('Spiderman'))
				elif (
					f == 'x-men - all media types' or f == 'x-men (movieverse)'
					or f == 'x-men (comicverse)'
				):
					fic.add(Fandom.define('X-Men'))
				elif (
					f == 'lord of the rings - j. r. r. tolkien'
					or f == 'the lord of the rings - j. r. r. tolkien'
				):
					fic.add(Fandom.define('Lord of the Rings'))
				elif (
					f == 'crisis core: final fantasy vii'
					or f == 'compilation of final fantasy vii' or f == 'final fantasy vii'
				):
					fic.add(Fandom.define('Final Fantasy VII'))
					fic.add(Fandom.define('Final Fantasy'))
				elif f == 'sen to chihiro no kamikakushi | spirited away':
					fic.add(Fandom.define('Spirited Away'))
				elif f == 'howl no ugoku shiro | howl\'s moving castle':
					fic.add(Fandom.define('Howl\'s Moving Castle'))
				elif f == 'rise of the guardians (2012)':
					fic.add(Fandom.define('Rise of the Guardians'))
				elif (
					f == 'doctor who' or f == 'doctor who (2005)'
					or f == 'doctor who & related fandoms'
				):
					fic.add(Fandom.define('Doctor Who'))
				elif f == 'daredevil (tv)' or f == 'daredevil (comics)':
					fic.add(Fandom.define('DareDevil'))
				elif f == 'labyrinth (1986)':
					fic.add(Fandom.define('Labyrinth'))
				elif f == 'gravity falls':
					fic.add(Fandom.define('Gravity Falls'))
				elif f == 'once upon a time (tv)':
					fic.add(Fandom.define('Once Upon a Time'))
				elif f == 'doctor strange (comics)':
					fic.add(Fandom.define('Doctor Strange'))
				elif f == 'the sentinel':
					fic.add(Fandom.define('The Sentinel'))
				elif f == 'teen titans (animated series)':
					fic.add(Fandom.define('Teen Titans'))
				elif (
					f == 'dcu' or f == 'dcu animated' or f == 'dcu (comics)'
					or f == 'dc extended universe' or f == 'dc animated universe'
				):
					fic.add(Fandom.define('DC'))
				elif f == 'vampire hunter d':
					fic.add(Fandom.define('Vampire Hunter D'))
				elif f == 'homestuck':
					fic.add(Fandom.define('Homestuck'))
				elif f == 'one piece':
					fic.add(Fandom.define('One Piece'))
				elif f == 'batman (movies - nolan)':
					fic.add(Fandom.define('Batman'))
				elif f == 'die hard (movies)':
					fic.add(Fandom.define('Die Hard'))
				elif f == 'discworld - terry pratchett':
					fic.add(Fandom.define('Discworld'))
				elif f == 'gossip girl':
					fic.add(Fandom.define('Gossip Girl'))
				elif (
					f == 'a song of ice and fire - george r. r. martin'
					or f == 'a song of ice and fire & related fandoms'
				):
					fic.add(Fandom.define('A Song of Ice and Fire'))
				elif f == 'supergirl (tv 2015)':
					fic.add(Fandom.define('Supergirl'))
				elif f == 'merlin (tv)':
					fic.add(Fandom.define('Merlin'))
				elif f == 'star trek':
					fic.add(Fandom.define('Star Trek'))
				elif f == 'steven universe (cartoon)':
					fic.add(Fandom.define('Steven Universe'))
				elif f == 'hellsing':
					fic.add(Fandom.define('Hellsing'))
				elif f == 'the breaker':
					fic.add(Fandom.define('The Breaker'))
				elif f == 'smallville':
					fic.add(Fandom.define('Smallville'))
				elif f == '베리타스 | veritas (manhwa)':
					fic.add(Fandom.define('Veritas (manhwa)'))
				elif f == 'guardians of childhood - william joyce':
					fic.add(Fandom.define('Guardians of Childhood'))
				elif f == 'person of interest (tv)':
					fic.add(Fandom.define('Person of Interest'))
				elif f == 'james bond (craig movies)':
					fic.add(Fandom.define('James Bond'))
				elif f == 'the bourne legacy (2012)':
					fic.add(Fandom.define('Jason Bourne'))
				elif f == 'numb3rs':
					fic.add(Fandom.define('Numb3rs'))
				elif f == 'temeraire - naomi novik':
					fic.add(Fandom.define('Temeraire'))
				elif f == 'twilight series - stephenie meyer':
					fic.add(Fandom.define('Twilight'))
				elif f == 'dungeons and dragons - fandom':
					fic.add(Fandom.define('Dungeons and Dragons'))
				elif f == 'american horror story' or f == 'american horror story: cult':
					fic.add(Fandom.define('American Horror Story'))
				elif (
					f == 'worm (web serial novel)' or f == 'worm - wildbow'
					or f == 'parahumans series - wildbow'
					or f == 'worm (web serial) | wildbow' or f == 'worm - fandom'
					or f == 'parahumans - fandom' or f == 'worm (parahumans)'
					or f == 'worm (web serial)' or f == 'worm | parahumans'
					or f == 'worm (web novel)'
				):
					fic.add(Fandom.define('Worm'))
				elif f == 'toaru kagaku no railgun | a certain scientific railgun':
					fic.add(Fandom.define('A Certain Scientific Railgun'))
				elif f == 'toaru majutsu no index | a certain magical index':
					fic.add(Fandom.define('A Certain Magical Index'))
				elif f == 'cthulhu mythos - h. p. lovecraft':
					fic.add(Fandom.define('Cthulhu'))
				elif f == 'transformers - all media types':
					fic.add(Fandom.define('Transformers'))
				elif f == 'destiny (video game)':
					fic.add(Fandom.define('Destiny'))
				elif f == 'fandom - fandom' or f == 'meta - fandom':
					pass  # >_>
				elif f == 'house m.d.':
					fic.add(Fandom.define('House, M.D.'))
				elif f == 'the hobbit (jackson movies)':
					fic.add(Fandom.define('The Hobbit'))
				elif f == 'doctor strange (2016)':
					fic.add(Fandom.define('Doctor Strange'))
				elif f == 'arrow (tv 2012)':
					fic.add(Fandom.define('Arrow'))
				elif f == 'the flash (tv 2014)':
					fic.add(Fandom.define('Flash'))
				elif f == 'senki zesshou symphogear':
					fic.add(Fandom.define('Symphogear'))
				elif (
					f == 'fullmetal alchemist: brotherhood & manga'
					or f == 'fullmetal alchemist - all media types'
					or f == 'fullmetal alchemist (anime 2003)'
				):
					fic.add(Fandom.define('Fullmetal Alchemist'))
				elif (
					f == 'star wars - all media types'
					or f == 'star wars episode vii: the force awakens (2015)'
					or f == 'star wars prequel trilogy'
				):
					fic.add(Fandom.define('Star Wars'))
				elif (
					f == 'guardians of the galaxy (2014)'
					or f == 'guardians of the galaxy - all media types'
					or f == 'guardians of the galaxy (movies)'
				):
					fic.add(Fandom.define('Guardians of the Galaxy'))
				elif f == 'ant man (2015)' or f == 'ant-man (movies)':
					fic.add(Fandom.define('Ant Man'))
				elif f == 'the defenders (marvel tv)':
					fic.add(Fandom.define('The Defenders'))
				elif f == 'elementary (tv)':
					fic.add(Fandom.define('Elementary'))
				elif f == 'good omens - neil gaiman & terry pratchett':
					fic.add(Fandom.define('Good Omens'))
				elif f == 'danny phantom':
					fic.add(Fandom.define('Danny Phantom'))
				elif f == 'katekyou hitman reborn!':
					fic.add(Fandom.define('Katekyo Hitman Reborn!'))
				elif f == 'welcome to night vale':
					fic.add(Fandom.define('Welcome to Night Vale'))
				elif f == 'ncis':
					fic.add(Fandom.define('NCIS'))
				elif f == 'torchwood':
					fic.add(Fandom.define('Torchwood'))
				elif f == 'magic: the gathering':
					fic.add(Fandom.define('Magic: The Gathering'))
				elif f == 'overwatch (video game)':
					fic.add(Fandom.define('Overwatch'))
				elif f == 'detroit: become human (video game)':
					fic.add(Fandom.define('Detroit: Become Human'))
				elif f == 'greek and roman mythology':
					pass
				elif f == 'life is strange (video game)':
					fic.add(Fandom.define('life is strange (video game)'))
				elif f == 'akatsuki no yona | yona of the dawn':
					fic.add(Fandom.define('Yona of the Dawn'))
				elif f == '僕のヒーローアカデミア | boku no hero academia | my hero academia':
					fic.add(Fandom.define('My Hero Academia'))
				elif f == 'voltron: legendary defender':
					fic.add(Fandom.define('Voltron'))
				elif f == 'selfie (tv)':
					fic.add(Fandom.define('Selfie'))
				elif f == 'suits (tv)':
					fic.add(Fandom.define('Suits'))
				elif f == 'fruits basket':
					fic.add(Fandom.define('Fruits Basket'))
				elif f == 'hetalia: axis powers':
					fic.add(Fandom.define('Hetalia: Axis Powers'))
				elif f == 'carmilla (web series)':
					fic.add(Fandom.define('Carmilla'))
				elif f == 'the dresden files - jim butcher':
					fic.add(Fandom.define('Dresden Files'))
				elif f == 'girl genius':
					fic.add(Fandom.define('Girl Genius'))
				elif f == 'unspecified fandom':
					pass  # TODO?
				elif f == 'nightwing (comics)':
					fic.add(Fandom.define('Nightwing'))
				elif f == 'books of the raksura - martha wells':
					fic.add(Fandom.define('Books of the Raksura'))
				elif f == 'fall of ile-rien - martha wells':
					fic.add(Fandom.define('Fall of Ile-Rien'))
				elif f == 'vorkosigan saga - lois mcmaster bujold':
					fic.add(Fandom.define('Vorkosigan Saga'))
				elif (
					f == 'highlander: the series' or f == 'highlander - all media types'
				):
					fic.add(Fandom.define('Highlander'))
				elif f == 'yoroiden samurai troopers | ronin warriors':
					fic.add(Fandom.define('Ronin Warriors'))
				elif f == 'hockey rpf':
					fic.add(Fandom.define('Hockey RPF'))
				elif f == 'pacific rim (2013)':
					fic.add(Fandom.define('Pacific Rim'))
				elif f == 'enchanted forest chronicles - patricia wrede':
					fic.add(Fandom.define('Enchanted Forest Chronicles'))
				elif f == 'tortall - tamora pierce':
					fic.add(Fandom.define('Tortall'))
				elif f == 'protector of the small - tamora pierce':
					fic.add(Fandom.define('Protector of the Small'))
				elif f == 'leverage':
					fic.add(Fandom.define('Leverage'))
				elif f == 'valdemar series - mercedes lackey':
					fic.add(Fandom.define('Valdemar Series'))
				elif (
					f == 'b.p.r.d.' or f == 'bureau for paranormal research and defense'
				):
					fic.add(Fandom.define('B.P.R.D.'))
				elif f == 'hellboy (comic)':
					fic.add(Fandom.define('Hellboy'))
				elif f == 'sga/avatar':
					fic.add(Fandom.define('Stargate Atlantis'))
					fic.add(Fandom.define('Avatar'))
				elif f == 'annihilation (2018 garland)':
					fic.add(Fandom.define('Annihilation'))
				elif f == 'craft sequence - max gladstone':
					fic.add(Fandom.define('Craft Sequence'))
				elif f == 'the good place (tv)':
					fic.add(Fandom.define('The Good Place'))
				elif f == 'jessica jones (tv)':
					fic.add(Fandom.define('Jessica Jones'))
				elif f == 'mad max series (movies)':
					fic.add(Fandom.define('Mad Max'))
				elif f == 'american gods (tv)':
					fic.add(Fandom.define('American Gods'))
				elif f == 'terminator: the sarah connor chronicles':
					fic.add(Fandom.define('Terminator: The Sarah Connor Chronicles'))
					fic.add(Fandom.define('Terminator'))
				elif f == 'wolf 359 (radio)':
					fic.add(Fandom.define('Wolf 359'))
				elif f == 'shadowrun: dragonfall':
					fic.add(Fandom.define('Shadowrun'))
				elif f == 'ars paradoxica (podcast)':
					fic.add(Fandom.define('Ars Paradoxica'))
				elif f == 'love is strange - fandom':
					fic.add(Fandom.define('Love is Strange'))
				elif f == 'dune - all media types':
					fic.add(Fandom.define('Dune'))
				elif f == 'dragon age: origins':
					fic.add(Fandom.define('Dragon Age: Origins'))
				elif f == 'game of thrones (tv)':
					fic.add(Fandom.define('Game of Thrones'))
				elif f == 'chronicles of amber - roger zelazny':
					fic.add(Fandom.define('Chronicles of Amber'))
				elif f == 'the southern reach trilogy - jeff vandermeer':
					fic.add(Fandom.define('The Southern Reach Trilogy'))
				elif f == 'continuum (tv)':
					fic.add(Fandom.define('Continuum'))
				elif f == 'mage: the ascension':
					fic.add(Fandom.define('Mage: The Ascension'))
				elif f == 'the good wife (tv)' or f == 'good wife (tv)':
					fic.add(Fandom.define('The Good Wife'))
				elif f == 'alliance-union - c. j. cherryh':
					fic.add(Fandom.define('Alliance-Union'))
				elif f == 'indexing - seanan mcguire':
					fic.add(Fandom.define('Indexing'))
				elif f == 'ultraviolet (tv)':
					fic.add(Fandom.define('Ultraviolet'))
				elif f == 'veronica mars (tv)':
					fic.add(Fandom.define('Veronica Mars'))
				elif f == 'secret circle (tv)':
					fic.add(Fandom.define('Secret Circle'))
				elif f == 'mahou shoujo madoka magika | puella magi madoka magica':
					fic.add(Fandom.define('Madoka Magica'))
				elif f == 'agent carter (tv)':
					fic.add(Fandom.define('Agent Carter'))
				elif f == 'dracula & related fandoms':
					fic.add(Fandom.define('Dracula'))
				elif f == 'dragon ball':
					fic.add(Fandom.define('Dragon Ball'))
				elif f == 'mass effect - all media types':
					fic.add(Fandom.define('Mass Effect'))
				elif f == 'firefly' or f == 'serenity (2005)':
					fic.add(Fandom.define('Firefly'))
				else:
					anyHere = False
					global ao3FandomsMap
					for fm in ao3FandomsMap:
						here = False
						for uf in fm[0]:
							if f == uf.lower().strip():
								here = True
								break
						if not here:
							continue
						anyHere = True
						for mf in fm[1]:
							fic.add(Fandom.define(mf))
					if not anyHere:
						util.logMessage(f'ao3|unknown fandom|{fic.url}|{originalF}')
						#raise Exception('unknown fandom: {} "{}"'.format(fic.url, originalF))

		ourDoms = fic.fandoms()
		# we have a canonical fandom, try to find our characters
		if len(ourDoms) == 1:
			relationshipDd = soup.find('dd', {'class': 'relationship tags'})
			if relationshipDd is not None:
				relationshipTags = relationshipDd.findAll('a', {'class': 'tag'})
				for rt in relationshipTags:
					r = rt.contents[0]
					chars = r.split('/')
					if len(chars) > 8:  # TODO: sometimes more?
						raise Exception('unable to parse relationship: {}'.format(r))
					for char in chars:
						fic.add(Character.defineInFandom(ourDoms[0], char, self.ftype))

		return fic
Пример #6
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Fill `fic` with metadata scraped from a story's index page.

		Reads the title, author, summary, status, rating, chapter count and
		word count from `wwwHtml`, sums the per-chapter review counts, derives
		the published/updated timestamps from the per-chapter upload dates,
		upserts `fic` and then upserts one chapter row per chapter.

		Args:
			fic: fic to populate; `fic.localId` is split on '/' into an author
				part and a story part.
			wwwHtml: html of the story index page.

		Returns:
			The `fic` that was passed in.

		Raises:
			Exception: if the status is not 'In progress', if a chapter link
				has an unexpected url, or if no chapter upload dates were found.
		"""
		from bs4 import BeautifulSoup
		lidParts = fic.localId.split('/')
		authorLid = lidParts[0]
		storyLid = lidParts[1]

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		# placeholder; presumably replaced by the 'ageRating' matcher below
		fic.ageRating = 'M'

		soup = BeautifulSoup(wwwHtml, 'html5lib')

		pageHeader = soup.find('div', {'class': 'page-header'})
		titleH2 = pageHeader.find('h2')
		fic.title = titleH2.getText().strip()

		authorLink = pageHeader.find('a')
		author = authorLink.getText().strip()
		authorId = authorLid
		authorUrl = self.baseStoryUrl.format(authorLid, 'contact/')
		self.setAuthor(fic, author, authorUrl, authorId)

		divWell = soup.find('div', {'class': 'well'})

		summaryQuote = divWell.find('blockquote')

		# flatten tabs/newlines to spaces, then squeeze runs of spaces
		fic.description = str(
			summaryQuote.getText()
		).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
		while fic.description.find('  ') != -1:
			fic.description = fic.description.replace('  ', ' ')
		fic.description = fic.description.strip()

		divWellText = divWell.getText().strip()

		# only 'In progress' is recognised; any other status is rejected
		match = re.search(r'Status:\s*([^-]*) -', divWellText)
		if match is None or match.group(1) != 'In progress':
			raise Exception('unable to find fic status')
		fic.ficStatus = FicStatus.ongoing

		RegexMatcher(
			divWellText, {
				'ageRating': (r'Rating\s*:\s+([^-]+) -', str),
				'chapterCount': (r'Chapters\s*:\s+(\d+) -', int),
				'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
			}
		).matchAll(fic)
		assert (fic.chapterCount is not None)

		# 'wordCount' is captured with `str` and may contain thousands
		# separators; always convert it so a value without a comma does not
		# stay textual
		if fic.wordCount is not None:
			fic.wordCount = int(str(fic.wordCount).replace(',', ''))

		wellParent = divWell.parent
		cid = 0
		reviewCount = 0
		chapterDates: List[int] = []

		for child in wellParent.children:
			if child.name != 'p':
				continue
			# every <p> advances the counter, even one that turns out not to
			# describe a chapter
			cid += 1
			if str(child).find('Chapter {}'.format(cid)) == -1:
				continue
			chapterLink = child.find('a')
			expectedUrl = '/{}/Chapter_{}/'.format(storyLid, cid).lower()
			if chapterLink.get('href').lower() != expectedUrl:
				raise Exception('unexpected chapter url: ' + chapterLink.get('href'))

			chInfo = ChapterInfo()

			RegexMatcher(
				child.getText(), {
					'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
					'reviewCount': (r'Reviews\s*:\s+([^-]+) -', int),
					'updated': (r'Uploaded on\s*:\s+(.+)', str),
				}
			).matchAll(chInfo)
			assert (chInfo.updated is not None)

			# per-chapter word counts are not summed: the overall word count
			# is already taken from the story metadata above
			reviewCount += chInfo.reviewCount

			chapterDates.append(
				util.parseDateAsUnix(chInfo.updated, int(time.time()))
			)

		fic.reviewCount = reviewCount

		if not chapterDates:
			raise Exception('unable to find any chapter upload dates')
		fic.published = OilTimestamp(min(chapterDates))
		fic.updated = OilTimestamp(max(chapterDates))

		fic.upsert()
		for cid in range(1, fic.chapterCount + 1):
			ch = fic.chapter(cid)
			ch.localChapterId = 'Chapter_{}'.format(cid)
			ch.url = self.constructUrl(fic.localId, cid)
			ch.upsert()

		return fic
Пример #7
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Parse a story page into `fic`; currently disabled.

		The method raises on its first statement (the message says the site
		format has changed), so everything after the `raise` is unreachable.
		That code is kept as the previous implementation: it read the header
		rows, summary and chapter list, cached chapter content and summed a
		word count.

		Raises:
			Exception: always.
		"""
		raise Exception('FIXME TODO fanfics me format has changed')
		from bs4 import BeautifulSoup  # type: ignore
		soup = BeautifulSoup(wwwHtml, 'html5lib')

		ficHead = soup.find('div', {'class': 'FicHead'})
		fic.title = ficHead.find('h1').getText().strip()

		knownHarryPotter = ('Harry Potter', 'Harry Potter - J. K. Rowling')
		statusByText = {
			'В процессе': FicStatus.ongoing,
			'Закончен': FicStatus.complete,
		}
		# labels that are recognised but carry nothing we store yet
		# (characters / warnings)
		ignoredLabels = ('Персонажи:', 'Предупреждения:')

		fandoms: List[str] = []
		author = None
		for row in ficHead.findAll('div', {'class': 'tr'}):
			label = str(row.find('div', {'class': 'title'}).getText()).strip()
			value = str(row.find('div', {'class': 'content'}).getText()).strip()

			if label == 'Автор:':
				author = value
			elif label == 'Фандом:':
				if value not in knownHarryPotter:
					raise Exception('unknown fandom: ' + value)
				fandoms.append('Harry Potter')
			elif label == 'Статус:':
				if value not in statusByText:
					raise Exception('unknown write status: ' + value)
				fic.ficStatus = statusByText[value]
			elif label == 'Опубликован:':
				fic.published = self.parseRussianDate(value)
			elif label == 'Изменен:':
				fic.updated = self.parseRussianDate(value)
			elif label == 'Ссылка:':
				src = value  # source archive url (not used further)
			elif label == 'Читателей:':
				fic.followCount = int(value)
			elif label == 'Рейтинг:':
				fic.ageRating = value
			elif label in ignoredLabels:
				continue
			else:
				raise Exception('unknown metadata: ' + label)

		# TODO?
		assert (author is not None)
		# the author name doubles as both url and id
		self.setAuthor(fic, author, author, author)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		if fic.url is None:
			fic.url = self.constructUrl(fic.localId)

		summaryTextDiv = soup.find('div', {'class': 'summary_text'})
		if summaryTextDiv is None:
			summaryTextDiv = soup.find('div', {'class': 'summary_text_fic3'})
		fic.description = summaryTextDiv.getText()

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		if fic.followCount is None:
			fic.followCount = 0

		# overrides whatever rating the header rows supplied
		fic.ageRating = 'M'

		ficContentsUl = soup.find('ul', {'class': 'FicContents'})
		fic.chapterCount = len(
			ficContentsUl.findAll('li', {'class': 't-b-dotted'})
		)

		if fic.wordCount is None:
			fic.wordCount = 0
		fic.upsert()

		totalWords = 0
		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			chapter.localChapterId = str(cid)
			chapter.url = self.constructUrl(fic.localId, cid)
			# content divs are numbered from zero: c0, c1, ...
			divQuery = {'id': f'c{cid - 1}'}

			# try to get it out of current blob first
			if chapter.html() is None:
				inlineDiv = soup.find('div', divQuery)
				if inlineDiv is not None:
					chapter.setHtml(
						f'<div class="ReadContent">{str(inlineDiv)}</div>'
					)

			if chapter.title is None or len(chapter.title) < 1:
				inlineDiv = soup.find('div', divQuery)
				if inlineDiv is not None:
					heading = inlineDiv.previous_sibling
					if heading is not None and heading.name == 'h2':
						chapter.title = heading.getText()

			# fallback to scraping it directly
			if chapter.html() is None:
				cdata = scrape.softScrape(chapter.url)
				assert (cdata is not None)
				chapter.setHtml(self.extractContent(fic, cdata))
				csoup = BeautifulSoup(cdata, 'html5lib')
				heading = csoup.find('div', divQuery).previous_sibling
				if heading is not None and heading.name == 'h2':
					chapter.title = heading.getText()

			if chapter.title is not None and len(chapter.title) > 0:
				chapter.title = util.cleanChapterTitle(chapter.title, cid)

			chapter.upsert()
			totalWords += len(chapter.cachedContent().split())

		fic.wordCount = totalWords

		for fandomName in fandoms:
			fic.add(Fandom.define(fandomName))

		return fic
Пример #8
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Populate `fic` from a story page's `profile_top` block.

		If the page has no `profile_top` element but shows one of the known
		"Story Not Found" warnings, the fic is marked abandoned (unless it is
		already complete), upserted and returned early. Otherwise title,
		description, counters, dates, status, author, fandoms and chapter rows
		are extracted, and finally the `xgray` meta span is parsed on a
		best-effort basis to fill `extraMeta` and refine the dates.

		Args:
			fic: fic to populate and upsert.
			wwwHtml: html of the story page.

		Returns:
			The populated fic. After the upsert this is the row re-selected by
			`sourceId`/`localId`, not necessarily the object passed in.

		Raises:
			Exception: if `profile_top` is missing and the page is not a known
				deletion notice, if the title, description or author cannot be
				found, on an unknown status or category, or if the upserted
				fic cannot be selected back exactly once.
		"""
		from bs4 import BeautifulSoup  # type: ignore
		deletedFicTexts = [
			# probably deleted by user
			'Story Not FoundUnable to locate story. Code 1.',
			# probably deleted by admin
			'Story Not FoundUnable to locate story. Code 2.',
			# unknown
			'Story Not FoundStory is unavailable for reading. (A)',
		]
		soup = BeautifulSoup(wwwHtml, 'html5lib')
		profile_top = soup.find(id='profile_top')
		# story might've been deleted
		if profile_top is None:
			for gui_warning in soup.find_all('span', {'class': 'gui_warning'}):
				if gui_warning.get_text() in deletedFicTexts:
					if fic.ficStatus != FicStatus.complete:
						fic.ficStatus = FicStatus.abandoned
					fic.upsert()
					return fic
			# neither a story page nor a recognised deletion notice; fail with
			# a clear message instead of dereferencing None below
			raise Exception(
				'error: unable to find profile_top: {}'.format(fic.localId)
			)

		text = profile_top.get_text()
		pt_str = str(profile_top)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		for b in profile_top.find_all('b'):
			# tags without a class attribute yield None from .get()
			b_class = b.get('class') or []
			if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
				fic.title = b.get_text()
				break
		else:
			raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

		fic.url = self.constructUrl(fic.localId, 1, fic.title)

		for div in profile_top.find_all('div'):
			div_class = div.get('class') or []
			if (
				div.get('style') == 'margin-top:2px' and len(div_class) == 1
				and div_class[0] == 'xcontrast_txt'
			):
				fic.description = div.get_text()
				break
		else:
			raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		# TODO we should match this only on the section following the description
		matcher = RegexMatcher(
			text, {
				'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
				'chapterCount?': (r'Chapters:\s+(\d+)', int),
				'wordCount': (r'Words:\s+(\S+)', int),
				'reviewCount?': (r'Reviews:\s+(\S+)', int),
				'favoriteCount?': (r'Favs:\s+(\S+)', int),
				'followCount?': (r'Follows:\s+(\S+)', int),
				'updated?': (r'Rated:.*Updated:\s+(\S+)', str),
				'published': (r'Published:\s+([^-]+)', str),
			}
		)
		matcher.matchAll(fic)

		if fic.published is not None:
			publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
			fic.published = OilTimestamp(publishedUts)

		if fic.updated is None:
			fic.updated = fic.published
		else:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		if fic.chapterCount is None:
			fic.chapterCount = 1

		# no "Status:" field is treated as ongoing
		match = re.search(
			r'(Rated|Chapters|Words|Updated|Published):.*Status:\s+(\S+)', text
		)
		if match is None:
			fic.ficStatus = FicStatus.ongoing
		else:
			status = match.group(2)
			if status == 'Complete':
				fic.ficStatus = FicStatus.complete
			else:
				raise Exception('unknown status: {}: {}'.format(fic.url, status))

		for a in profile_top.find_all('a'):
			a_href = a.get('href')
			# anchors without an href cannot be the author link
			if a_href is None or not a_href.startswith('/u/'):
				continue
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
		else:
			raise Exception('unable to find author:\n{}'.format(text))

		preStoryLinks = soup.find(id='pre_story_links')
		preStoryLinksLinks = []
		if preStoryLinks is not None:
			preStoryLinksLinks = preStoryLinks.find_all('a')
		pendingFandoms: List[Fandom] = []
		for a in preStoryLinksLinks:
			href = a.get('href')
			if href is None:
				continue
			hrefParts = href.split('/')

			# if it's a top level category
			if (
				len(hrefParts) == 3 and len(hrefParts[0]) == 0
				and len(hrefParts[2]) == 0
			):
				cat = hrefParts[1]
				if cat in ffNetFandomCategories:
					continue  # skip categories
				raise Exception('unknown category: {}'.format(cat))

			# if it's a crossover /Fandom1_and_Fandm2_Crossovers/f1id/f2id/
			if (
				len(hrefParts) == 5 and hrefParts[1].endswith("_Crossovers")
				and len(hrefParts[0]) == 0 and len(hrefParts[4]) == 0
			):
				fIds = [int(hrefParts[2]), int(hrefParts[3])]
				pendingFandoms += self.handleCrossoverFandom(
					fic, hrefParts[1], fIds, href
				)
				continue

			# if it's a regular fandom in some category
			if (
				len(hrefParts) == 4 and len(hrefParts[0]) == 0
				and len(hrefParts[3]) == 0
			):
				# ensure category is in our map
				if hrefParts[1] not in ffNetFandomCategories:
					raise Exception('unknown category: {}'.format(hrefParts[1]))

				pendingFandoms += self.handleFandom(fic, hrefParts[2])
				continue

			util.logMessage('unknown fandom {0}: {1}'.format(fic.id, href))

		# fandoms are attached only after the upsert, to the re-selected row
		fic.upsert()
		poss = Fic.select({'sourceId': fic.sourceId, 'localId': fic.localId})
		if len(poss) != 1:
			raise Exception('unable to upsert fic?')
		fic = poss[0]
		for pfandom in pendingFandoms:
			fic.add(pfandom)

		if fic.chapterCount is None:
			return fic

		chapterTitles = []
		if fic.chapterCount > 1:
			chapterSelect = soup.find(id='chap_select')
			chapterOptions = []
			if chapterSelect is not None:
				chapterOptions = chapterSelect.findAll('option')
			chapterTitles = [co.getText().strip() for co in chapterOptions]

		for cid in range(1, fic.chapterCount + 1):
			ch = fic.chapter(cid)
			ch.localChapterId = str(cid)
			ch.url = self.constructUrl(fic.localId, cid)
			# chapterTitles is 0-indexed, so index cid - 1 exists exactly when
			# len(chapterTitles) >= cid (this includes the last chapter)
			if len(chapterTitles) >= cid:
				ch.title = util.cleanChapterTitle(chapterTitles[cid - 1], cid)
			elif fic.chapterCount == 1 and cid == 1:
				ch.title = fic.title
			ch.upsert()

		metaSpan = profile_top.find('span', {'class': 'xgray'})
		if metaSpan is not None:
			# best effort: a failure here is logged and does not abort the parse
			try:
				res = self.parseFicMetaSpan(metaSpan.decode_contents())
				#fic.language = res["language"]

				# reconstruct
				fields = [
					('rated', 'Rated: Fiction ZZZ'),
					('language', 'Language: ZZZ'),
					('genres', 'Genre: ZZZ'),
					('characters', 'Characters: ZZZ'),
					('reviews', 'Reviews: ZZZ'),
					('favorites', 'Favs: ZZZ'),
					('follows', 'Follows: ZZZ'),
				]
				rmeta = ' - '.join(
					template.replace('ZZZ', res[key])
					for key, template in fields if key in res
				)

				fic.extraMeta = rmeta
				publishedUts = util.parseDateAsUnix(res['published'], fic.fetched)
				fic.published = OilTimestamp(publishedUts)
				fic.updated = fic.published
				if 'updated' in res:
					updatedUts = util.parseDateAsUnix(res['updated'], fic.fetched)
					fic.updated = OilTimestamp(updatedUts)
				fic.upsert()

			except Exception as e:
				util.logMessage(
					f'FFNAdapter.parseInfoInto: .parseFicMetaSpan:\n{e}\n{traceback.format_exc()}'
				)
				util.logMessage(
					f'FFNAdapter.parseFicMetaSpan: {metaSpan.decode_contents()}'
				)

		return fic
Пример #9
0
    def parseInfoInto(self, fic: Fic, html: str) -> Fic:
        """Populate `fic` from a story info page by scraping its raw html.

        Extracts title, author, summary, rating, chapter count, completion
        status, published/updated dates and a summed word count, then tags
        the fic with the 'Harry Potter' fandom.

        Args:
            fic: fic to populate.
            html: html of the story info page.

        Returns:
            The `fic` that was passed in.

        Raises:
            Exception: if the page does not contain exactly three tables with
                width="95%", or if the author link or summary cannot be found.
        """
        from bs4 import BeautifulSoup
        html = html.replace('\r\n', '\n')
        soup = BeautifulSoup(html, 'html.parser')

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        w95tables = soup.findAll('table', {'width': '95%'})
        if len(w95tables) != 3:
            raise Exception('wrong number of w95 tables: {}'.format(
                len(w95tables)))

        ficInfoTable = w95tables[0]
        ficTitleH3 = ficInfoTable.find('h3')
        fic.title = ficTitleH3.get_text().strip()

        authorUrlMatch = re.search(r'"viewuser.php\?uid=(\d+)">([^<]*)<', html)
        if authorUrlMatch is None:
            raise Exception('could not locate author url')

        author = authorUrlMatch.group(2)
        authorId = authorUrlMatch.group(1)
        authorUrl = self.baseUrl + '/viewuser.php?uid=' + authorId

        self.setAuthor(fic, author, authorUrl, authorId)

        # TODO: this may miss multiline summaries :(
        summaryMatch = re.search(
            '<b>Summary:</b>((.|\r|\n)*)<b>Hitcount: </b>', html, re.MULTILINE)
        if summaryMatch is None:
            edumpContent(html, 'siye_summary')
            raise Exception('could not locate summary')
        # alternatively: fic.description = "{no summary}" ?

        fic.description = summaryMatch.group(1).strip()

        # placeholder used when no rating is present on the page
        fic.ageRating = '<unkown>'

        ageRatingMatch = re.search('<b>Rating:</b>(.*)<br>', html)
        if ageRatingMatch is not None:
            fic.ageRating = ageRatingMatch.group(1).strip()

        # the chapter count is the highest chapter number linked from the page
        maxChapter = 0
        baseChapterHref = 'viewstory.php?sid={}&chapter='.format(fic.localId)
        singleChapterHref = 'viewstory.php?sid={}&chapter=Array'.format(
            fic.localId)
        isSingleChapterFic = False
        for a in soup.find_all('a'):
            href = a.get('href')
            if href is None:
                continue
            if not href.startswith(baseChapterHref):
                continue
            # a 'chapter=Array' link is treated as a single-chapter story
            if href.startswith(singleChapterHref):
                isSingleChapterFic = True
                maxChapter = max(1, maxChapter)
                continue
            cid = int(href[len(baseChapterHref):])
            maxChapter = max(cid, maxChapter)

        fic.chapterCount = maxChapter

        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        # str.find returns -1 (truthy) when the text is absent, so it must not
        # be used as a boolean; test membership instead
        fic.ficStatus = FicStatus.ongoing
        if 'Story is Complete' in html:
            fic.ficStatus = FicStatus.complete

        updatedOnPattern = re.compile(r'updated on (\d+).(\d+).(\d+)')
        # NOTE(review): the first argument here is an int timestamp while the
        # other calls pass a date string; confirm parseDateAsUnix accepts both
        minUpdate = util.parseDateAsUnix(int(time.time()), fic.fetched)
        maxUpdate = util.parseDateAsUnix('1970/01/01', fic.fetched)
        for (year, month, day) in updatedOnPattern.findall(html):
            date = '{}/{}/{}'.format(year, month, day)
            dt = util.parseDateAsUnix(date, fic.fetched)

            minUpdate = min(minUpdate, dt)
            maxUpdate = max(maxUpdate, dt)

        # only ever widen the known [published, updated] range
        if fic.published is None or fic.published.toUTS() > minUpdate:
            fic.published = OilTimestamp(minUpdate)
        if fic.updated is None or fic.updated.toUTS() < maxUpdate:
            fic.updated = OilTimestamp(maxUpdate)
        if fic.updated < fic.published:
            fic.updated = fic.published

        fic.wordCount = 0
        wordsPattern = re.compile(r'(\d+) words')
        for words in wordsPattern.findall(html):
            fic.wordCount += int(words)

        if fic.wordCount == 0 and isSingleChapterFic:
            # best effort: estimate the word count from the cached first
            # chapter; a failure leaves the count at 0
            try:
                fic.upsert()
                ch1 = fic.chapter(1)
                ch1.cache()
                chtml = ch1.html()
                if chtml is not None:
                    fic.wordCount = len(chtml.split())
            except Exception:
                pass

        fic.add(Fandom.define('Harry Potter'))
        # TODO: chars/relationship?

        return fic
Пример #10
0
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Populate `fic` from the single `details` div of a story page.

        Extracts the title, rating, counters, dates, completion status and
        author link. The description is a generated placeholder.

        Args:
            fic: fic to populate.
            wwwHtml: html of the story page.

        Returns:
            The `fic` that was passed in.

        Raises:
            Exception: if the page does not have exactly one `details` div or
                exactly one `title` div, or if no author link is found.
        """
        from bs4 import BeautifulSoup  # type: ignore
        soup = BeautifulSoup(wwwHtml, 'html.parser')
        detailDivs = soup.find_all('div', {'class': 'details'})
        if len(detailDivs) != 1:
            raise Exception('error: unable to find details\n')
        divDetails = detailDivs[0]

        text = divDetails.get_text()
        pt_str = str(divDetails)

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        divTitle = soup.find_all('div', {'class': 'title'})
        if len(divTitle) != 1:
            raise Exception(
                'error: unable to find title:\n{}\n'.format(pt_str))
        fic.title = divTitle[0].get_text().strip()

        fic.url = self.constructUrl(fic.localId, 1)

        # TODO: this may not exist on fictionhunt?
        fic.description = 'archive of {} from fictionhunt TODO'.format(
            fic.title)

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        matcher = RegexMatcher(
            text, {
                'ageRating': (r'Rated:\s+(\S+)', str),
                'chapterCount?': (r'Chapters:\s+(\d+)', int),
                'wordCount': (r'Words:\s+(\S+)', int),
                'reviewCount?': (r'Reviews:\s+(\S+)', int),
                'favoriteCount?': (r'Favs:\s+(\S+)', int),
                'followCount?': (r'Follows:\s+(\S+)', int),
                'updated?': (r'Updated:\s+(\S+)', str),
                'published': (r'Published:\s+(\S+)', str),
            })
        matcher.matchAll(fic)

        if fic.published is not None:
            publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
            fic.published = OilTimestamp(publishedUts)

        if fic.updated is None:
            fic.updated = fic.published
        else:
            updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
            fic.updated = OilTimestamp(updatedUts)

        if fic.chapterCount is None:
            fic.chapterCount = 1

        # the pattern has no regex metacharacters, so a plain substring test
        # is equivalent
        if '- Complete -' in text:
            fic.ficStatus = FicStatus.complete
        else:
            fic.ficStatus = FicStatus.ongoing

        for a in divDetails.find_all('a'):
            a_href = a.get('href')
            # anchors without an href cannot be the author link
            if a_href is None or a_href.find('fanfiction.net/u/') == -1:
                continue
            author = a.get_text()
            authorUrl = a_href
            authorId = a_href.split('/')[-1]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
        else:
            raise Exception('unable to find author:\n{}'.format(text))

        # TODO: hardcode Harry Potter fanfic?

        return fic
Пример #11
0
	def parseZListInfoInto(self, fic: Fic, ts: int, html: str) -> Fic:
		"""Populate `fic` from a listing-entry html fragment fetched at `ts`.

		Does nothing when `fic` was already fetched more recently than `ts`.
		Otherwise extracts title, description, counters, dates, status, author
		and (from the `z-list` div's `data-category`) the fandom, then upserts
		the fic and attaches the fandoms.

		Args:
			fic: fic to populate and upsert.
			ts: timestamp stored as `fic.fetched`; compared against
				`fic.fetched.toUTS()`.
			html: html fragment of the listing entry.

		Returns:
			The `fic` that was passed in.

		Raises:
			Exception: if the title, description or author cannot be found.
		"""
		# existing data is newer, do nothing
		if fic.fetched is not None and fic.fetched.toUTS() > ts:
			return fic
		from bs4 import BeautifulSoup

		soup = BeautifulSoup(html, 'html5lib')

		text = soup.get_text()
		pt_str = str(html)

		fic.fetched = OilTimestamp(ts)
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		titleLink = soup.find('a', {'class': 'stitle'})
		if titleLink is None:
			raise Exception('error: unable to find title:\n{}\n'.format(pt_str))
		fic.title = titleLink.getText()

		# build the url only once the title is known, so it uses the title
		# parsed from this fragment rather than a stale or missing one
		fic.url = self.constructUrl(fic.localId, 1, fic.title)

		descDiv = soup.find('div', {'class': 'z-padtop'})
		if descDiv is None:
			raise Exception('error: unable to find description:\n{}\n'.format(pt_str))
		# only the first child node of the div is used as the description
		fic.description = descDiv.contents[0]

		matcher = RegexMatcher(
			text, {
				'ageRating': (r'Rated:\s+(?:Fiction)?\s*(\S+)', str),
				'chapterCount?': (r'Chapters:\s+(\d+)', int),
				'wordCount': (r'Words:\s+(\S+)', int),
				'reviewCount?': (r'Reviews:\s+(\S+)', int),
				'favoriteCount?': (r'Favs:\s+(\S+)', int),
				'followCount?': (r'Follows:\s+(\S+)', int),
				'updated?': (r'Updated:\s+(\S+)', str),
				'published': (r'Published:\s+([^-]+)', str),
			}
		)
		matcher.matchAll(fic)

		if fic.published is not None:
			publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
			fic.published = OilTimestamp(publishedUts)

		if fic.updated is None:
			fic.updated = fic.published
		else:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		if fic.chapterCount is None:
			fic.chapterCount = 1

		# the pattern can only ever capture the literal 'Complete', so a match
		# means complete and no match means ongoing
		match = re.search(
			r'(Rated|Chapters|Words|Updated|Published):.*-\s+(Complete)', text
		)
		if match is None:
			fic.ficStatus = FicStatus.ongoing
		else:
			fic.ficStatus = FicStatus.complete

		for a in soup.find_all('a'):
			a_href = a.get('href')
			# anchors without an href cannot be the author link
			if a_href is None or not a_href.startswith('/u/'):
				continue
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
		else:
			raise Exception('unable to find author:\n{}'.format(text))

		zl = soup.find('div', {'class': 'z-list'})
		fan = None if zl is None else zl.get('data-category')
		pendingFandoms: List[Fandom] = []
		if fan is not None:
			pendingFandoms += self.handleFandom(fic, fan)
			# TODO: crossovers?

		# fandoms are attached only after the fic itself has been upserted
		fic.upsert()
		for pfandom in pendingFandoms:
			fic.add(pfandom)

		return fic
Пример #12
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Fill in `fic` and its chapters from a fiction overview page.

		Parses `wwwHtml` for the title, author, description, status,
		follower/favorite counts and the chapter table, upserts the fic and one
		chapter per listed chapter link, then sums a whitespace-token count over
		every chapter's html (calling `chapter.cache()` for chapters whose
		`html()` is None).

		Returns the same `fic` instance. Raises Exception when no known status
		marker is present or when the chapter table yields no dates.
		"""
		from bs4 import BeautifulSoup
		soup = BeautifulSoup(wwwHtml, 'html5lib')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ageRating = 'M'  # TODO?

		ficTitleDiv = soup.find('div', {'class': 'fic-title'})
		fic.title = ficTitleDiv.find('h1').getText().strip()

		authorLink = ficTitleDiv.find('h4', {'property': 'author'}).find('a')
		author = authorLink.getText().strip()
		authorUrl = self.baseUrl + authorLink.get('href')
		authorId = authorUrl.split('/')[-1]
		self.setAuthor(fic, author, authorUrl, authorId)

		divDescription = soup.find('div', {'class': 'description'})
		try:
			descView = HtmlView(str(divDescription), markdown=False)
			fic.description = ''.join(
				'<p>{}</p>'.format(line) for line in descView.text
			)
		except Exception:
			# best effort: fall back to the plain text of the description block
			# if the HtmlView conversion fails (but let KeyboardInterrupt and
			# SystemExit propagate)
			fic.description = divDescription.getText().strip()

		# status markers are checked in this order; the first hit wins
		statusMarkers = [
			('>ONGOING<', FicStatus.ongoing),
			('>COMPLETED<', FicStatus.complete),
			('>HIATUS<', FicStatus.ongoing),  # TODO?
			('>STUB<', FicStatus.ongoing),  # TODO?
			('>DROPPED<', FicStatus.abandoned),
		]
		fictionInfo = str(soup.find('div', {'class': 'fiction-info'}))
		for marker, status in statusMarkers:
			if marker in fictionInfo:
				fic.ficStatus = status
				break
		else:
			raise Exception('unable to find fic status')

		divStatsContent = soup.find('div', {'class': 'stats-content'})
		followers = divStatsContent.find(text='Followers :')
		ul = followers.parent.parent

		RegexMatcher(
			ul.getText(), {
				'followCount?': (r'Followers\s+:\s+([\d,]+)', str),
				'favoriteCount?': (r'Favorites\s+:\s+([\d,]+)', str),
			}
		).matchAll(fic)

		# The counts are captured as strings that may contain thousands
		# separators (or are still the integer default 0); always normalise
		# them to int. The old `if str(x).find(','):` guard tested the
		# truthiness of str.find, which is -1 (truthy) when nothing is found.
		fic.followCount = int(str(fic.followCount).replace(',', ''))
		fic.favoriteCount = int(str(fic.favoriteCount).replace(',', ''))

		tableChapters = soup.find('table', {'id': 'chapters'})
		chapterLinks = tableChapters.findAll('a')

		# Links without a <time> child contribute a chapter url and title;
		# links with a <time> child contribute a chapter date.
		chapterUrls: List[str] = []
		chapterTitles: List[str] = []
		chapterDates: List[int] = []
		for chapterLink in chapterLinks:
			timeElement = chapterLink.find('time')
			# TODO FIXME is this inverted?
			if timeElement is None:
				chapterUrls.append(chapterLink.get('href'))
				chapterTitles.append(chapterLink.getText().strip())
				continue
			unixtime = timeElement.get('unixtime')
			if unixtime:
				chapterDates.append(int(unixtime))
			else:
				chapterDates.append(
					util.parseDateAsUnix(timeElement.get('title'), fic.fetched)
				)

		if not chapterDates:
			raise Exception('unable to find chapter dates')

		fic.published = OilTimestamp(min(chapterDates))
		fic.updated = OilTimestamp(max(chapterDates))
		fic.chapterCount = len(chapterUrls)

		if fic.wordCount is None:
			fic.wordCount = 0
		fic.upsert()

		for cid, (chapterUrl, chapterTitle) in enumerate(
			zip(chapterUrls, chapterTitles), 1
		):
			chapter = fic.chapter(cid)
			chapter.url = self.baseUrl + chapterUrl
			if chapterUrl.startswith('/fiction/chapter/'):
				# alternate chapter syntax if the chapter itself has no slug
				# /fiction/chapter/<lcid>?fid=<lid>&fslug=<fic slug>
				chapter.localChapterId = chapterUrl.split('/')[3].split('?')[0]
			else:
				# standard chapter syntax
				# /fiction/<lid>/<fic slug>/chapter/<lcid>/<chapter slug>
				chapter.localChapterId = chapterUrl.split('/')[5]
			chapter.title = chapterTitle

			if chapter.title:
				chapter.title = util.cleanChapterTitle(chapter.title, cid)

			chapter.upsert()

		# total word count is the number of whitespace-separated tokens across
		# every chapter's html
		wordCount = 0
		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			if chapter.html() is None:
				chapter.cache()

			chapter.upsert()
			chtml = chapter.html()
			if chtml is not None:
				wordCount += len(chtml.split())

		fic.wordCount = wordCount

		return fic
Пример #13
0
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Fill in `fic` and its chapters from an adult-fanfiction.org story page.

        `fic.localId` is expected to look like ``<archive>/<story number>``.
        The story page supplies the title, author and chapter list; every
        chapter is soft-scraped and stored, and the word count is the number
        of whitespace-separated tokens across the scraped chapter content.
        Dates, review/view counts, status, description, tags and fandoms are
        then taken from the site's search page, first searching by author and
        title and falling back to an author-only search.

        Returns the same `fic` instance. Raises Exception when the title link,
        author link or chapter dropdown is missing from the page, or when the
        story cannot be found in the search results.
        """
        from urllib.parse import quote_plus

        from bs4 import BeautifulSoup

        localIdParts = fic.localId.split('/')
        archive = localIdParts[0]
        storyNo = localIdParts[1]

        soup = BeautifulSoup(wwwHtml, 'html5lib')

        titleLink = soup.find('a', {'href': '/story.php?no={}'.format(storyNo)})
        if titleLink is None:
            raise Exception('unable to find title')
        fic.title = str(titleLink.getText())

        membersUrl = 'http://members.adult-fanfiction.org/profile.php?no='

        def isMemberLink(tag) -> bool:
            # an anchor whose href points at a member profile
            if tag.name != 'a':
                return False
            href = tag.get('href')
            return href is not None and href.startswith(membersUrl)

        memberLink = soup.find(isMemberLink)
        if memberLink is None:
            raise Exception('unable to find author')

        author = memberLink.getText()
        authorUrl = memberLink.get('href')
        authorId = authorUrl[len(membersUrl):]
        self.setAuthor(fic, author, authorUrl, authorId)

        # TODO
        fic.ficStatus = FicStatus.ongoing

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        fic.url = self.constructUrl(fic.localId, 1)

        # TODO: description is on search page
        if fic.description is None:
            fic.description = 'TODO: on the search page?'

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        fic.ageRating = 'M'

        # TODO
        if fic.published is None:
            fic.published = OilTimestamp.now()
        if fic.updated is None:
            fic.updated = fic.published

        chapterDropdown = soup.find('div', {'class': 'dropdown-content'})
        if chapterDropdown is None:
            raise Exception('unable to find chapter list')
        chapterLinks = chapterDropdown.findAll('a')
        oldChapterCount = fic.chapterCount
        fic.chapterCount = len(chapterLinks)

        if fic.wordCount is None:
            fic.wordCount = 0
        fic.upsert()

        wordCount = 0
        for cid, chapterLink in enumerate(chapterLinks, 1):
            chapterUrl = self.constructUrl(fic.localId, cid)
            chapterContent = scrape.softScrape(chapterUrl)
            chapter = fic.chapter(cid)
            if chapterContent is not None:
                chapter.setHtml(chapterContent)
            chapter.localChapterId = str(cid)
            chapter.url = chapterUrl

            chapter.title = chapterLink.getText().strip()
            if chapter.title is not None:
                chapter.title = util.cleanChapterTitle(chapter.title, cid)

            chapter.upsert()
            if chapterContent is not None:
                wordCount += len(chapterContent.split())

        fic.wordCount = wordCount

        if oldChapterCount is not None and oldChapterCount < fic.chapterCount:
            fic.updated = OilTimestamp.now()  # TODO
        fic.upsert()

        storyUrl = self.constructUrl(fic.localId, chapterId=None)

        # more metadata from search page; author and title are URL-encoded so
        # that characters such as '&', '#' or '+' cannot corrupt the query
        searchBase = 'http://{}.adult-fanfiction.org/search.php?'.format(archive)
        searchTail = '&summary=&tags=&cats=0&search=Search'
        authorParam = quote_plus(author)

        searchUrl = (
            searchBase + 'auth={}&title={}'.format(
                authorParam, quote_plus(fic.title)) + searchTail)
        data = scrape.scrape(searchUrl)['raw']

        metas = self.extractSearchMetadata(data)

        # fallback to pure author search
        if storyUrl not in metas:
            searchUrl = (
                searchBase + 'auth={}&title='.format(authorParam) + searchTail)
            data = scrape.scrape(searchUrl)['raw']
            metas = self.extractSearchMetadata(data)

        if storyUrl not in metas:
            raise Exception('cannot find search metadata')

        meta = metas[storyUrl]

        assert (meta.published is not None and meta.updated is not None)
        fic.published = OilTimestamp(meta.published)
        fic.updated = OilTimestamp(meta.updated)

        fic.reviewCount = meta.reviewCount
        fic.favoriteCount = meta.views  # TODO

        fic.ficStatus = meta.ficStatus

        assert (meta.description is not None)
        fic.description = meta.description
        assert (fic.description is not None)
        if len(meta.tags) > 0:
            fic.description += '\n<hr />\nContent Tags: ' + ' '.join(meta.tags)

        for fan in meta.fandoms:
            fic.add(Fandom.define(fan))

        return fic
Пример #14
0
    def parseInfoInto(self, fic: Fic, html: str) -> Fic:
        """Fill in `fic` from a story page.

        Title and author come from the links inside the ``pagetitle``
        element, the description from the lines between the SUMMARY START/END
        comment markers, and chapter/word counts, dates and completion state
        from the 'Story Information' block. Review count is read from the
        ``sort`` element. A story whose info text does not mention
        'Crossovers' is tagged with the 'Harry Potter' fandom.

        Returns the same `fic` instance. Raises Exception when the title,
        author or story information block cannot be found, or when the
        'Completed:' value is neither 'Yes' nor 'No'.
        """
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        userPrefix = 'viewuser.php?uid='
        pagetitle = soup.find(id='pagetitle')
        author = None
        for a in pagetitle.findAll('a'):
            href = a.get('href')
            if href is None:
                # anchors without an href carry neither title nor author
                continue
            if href.startswith('viewstory'):
                fic.title = a.contents[0].strip()
            elif href.startswith(userPrefix):
                author = a.contents[0]
                authorUrl = self.baseUrl + href
                # int() round trip validates that the uid is numeric
                authorId = str(int(href[len(userPrefix):]))
                self.setAuthor(fic, author, authorUrl, authorId)

        if fic.title is None:
            raise Exception('unable to find title')
        if author is None:
            raise Exception('unable to find author')

        # Put every tag on its own line, then keep the lines from the START
        # marker (inclusive) up to the END marker (exclusive).
        lines = html.replace('\r', '\n').replace('<', '\n<').split('\n')
        inDescription = False
        descriptionLines = []
        for line in lines:
            cur = line.strip()
            if '!-- SUMMARY START --' in cur:
                inDescription = True
            elif '!-- SUMMARY END --' in cur:
                inDescription = False

            if inDescription:
                descriptionLines.append(cur + '\n')

        fic.description = ''.join(descriptionLines)

        fic.ageRating = '<unkown>'

        infoBlock = None
        infoText = None
        for block in soup.findAll('div', {'class': 'block'}):
            title = block.find('div', {'class': 'title'})
            if title is None:
                continue
            if title.contents[0] != 'Story Information':
                continue
            infoBlock = block
            infoText = block.get_text()
            break
        else:
            raise Exception('unable to find info text')

        matcher = RegexMatcher(
            infoText, {
                'chapterCount': (r'Chapters:\s+(\d+)', int),
                'wordCount': (r'Word count:\s+(\S+)', int),
            })
        matcher.matchAll(fic)

        sortDiv = soup.find(id='sort')
        match = re.search(r'Reviews\s*-\s*([^\]]+)', sortDiv.get_text())
        if match is not None:
            fic.reviewCount = int(match.group(1).replace(',', ''))
        else:
            fic.reviewCount = 0

        fic.favoriteCount = 0
        fic.followCount = 0

        infoBlockHtml = str(infoBlock)
        match = re.search(
            r'<!-- PUBLISHED START -->([^<]*)<!-- PUBLISHED END -->',
            infoBlockHtml)
        if match is not None:
            publishedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
            fic.published = OilTimestamp(publishedUts)

        match = re.search(
            r'<!-- UPDATED START -->([^<]*)<!-- UPDATED END -->',
            infoBlockHtml)
        if match is not None:
            updatedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
            fic.updated = OilTimestamp(updatedUts)

        if fic.updated is None:
            fic.updated = fic.published

        match = re.search(r'Completed:\s+(\S+)', infoText)
        if match is not None:
            complete = match.group(1)
            if complete == 'No':
                fic.ficStatus = FicStatus.ongoing
            elif complete == 'Yes':
                fic.ficStatus = FicStatus.complete
            else:
                raise Exception('unknown complete value: {}'.format(complete))

        if re.search('Crossovers', infoText) is None:
            # not a crossover, so just harry potter
            fic.add(Fandom.define('Harry Potter'))
        # crossovers are currently left without a fandom rather than raising

        return fic
Пример #15
0
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Fill in `fic` from a story page.

        The title comes from the link to the story itself (either a plain
        ``?psid=<localId>`` link or the age-disclaimer javascript link), the
        description from the single ``storysummary`` table, and rating,
        counts, dates and status from the text of the single
        ``storymaininfo`` table. The author comes from the first
        ``viewuser.php?showuid=`` link. The 'Harry Potter' fandom is always
        added.

        Returns the same `fic` instance. Raises Exception when the info or
        summary table is not present exactly once, or when the title, status
        or author cannot be found.
        """
        from bs4 import BeautifulSoup  # type: ignore
        soup = BeautifulSoup(wwwHtml, 'html.parser')
        storyMainInfoTables = soup.findAll('table', {'class': 'storymaininfo'})
        if len(storyMainInfoTables) != 1:
            raise Exception('unable to find main story info')
        storyMainInfo = storyMainInfoTables[0]

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        disclaimerJs = "javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid="
        storyHref = '?psid={}'.format(fic.localId)
        for a in soup.findAll('a'):
            href = a.get('href')
            if href is None:
                # anchors without an href (e.g. named anchors) cannot match
                continue
            if href.startswith(disclaimerJs) or href == storyHref:
                fic.title = a.getText()
                break
        else:
            raise Exception('error: unable to find title')

        fic.url = self.constructUrl(fic.localId)

        storySummaryTables = soup.findAll('table', {'class': 'storysummary'})
        if len(storySummaryTables) != 1:
            raise Exception('cannot find story summary table')
        # getText() always returns a string, so no None check is needed here
        fic.description = storySummaryTables[0].getText().strip()

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        # non-breaking spaces would defeat the \s+ separators only visually;
        # normalise them so the matched values are plain ascii-spaced text
        text = storyMainInfo.getText().replace('\xa0', ' ')
        matcher = RegexMatcher(
            text, {
                'ageRating': (r'Rating:\s+(Mature|15\+|12\+)', str),
                'chapterCount': (r'Chapters:\s+(\d+)', int),
                'wordCount': (r'Words:\s+(\d+)', int),
                'reviewCount': (r'Story Reviews:\s*(\d+)', int),
                'favoriteCount': (r'Favorite Story Of:\s+(\d+) users', int),
                'updated': (r'Last Updated:\s+(\S+)', str),
                'published': (r'First Published:\s+(\S+)', str),
            })
        matcher.matchAll(fic)

        # the matcher stores the dates as raw strings; convert to timestamps
        if fic.published is not None:
            publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
            fic.published = OilTimestamp(publishedUts)

        if fic.updated is None:
            fic.updated = fic.published
        else:
            updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
            fic.updated = OilTimestamp(updatedUts)

        if fic.chapterCount is None:
            fic.chapterCount = 1

        match = re.search(
            r'Status:\s+(Completed|Work In Progress|Abandoned)', text)
        if match is None:
            raise Exception('cannot find write status')

        statusMap = {
            'Completed': FicStatus.complete,
            'Work In Progress': FicStatus.ongoing,  # should these be abandoned?
            'Abandoned': FicStatus.abandoned,
        }
        status = match.group(1)
        if status not in statusMap:
            raise Exception('unknown status: {}'.format(status))
        fic.ficStatus = statusMap[status]

        userPrefix = 'viewuser.php?showuid='
        for a in soup.findAll('a'):
            a_href = a.get('href')
            if a_href is None or not a_href.startswith(userPrefix):
                continue
            author = a.get_text()
            authorUrl = self.baseUrl + '/' + a_href
            authorId = a_href[len(userPrefix):]
            self.setAuthor(fic, author, authorUrl, authorId)
            break
        else:
            raise Exception('unable to find author:\n{}'.format(text))

        # TODO: chars/pairings?
        fic.add(Fandom.define('Harry Potter'))
        return fic
Пример #16
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Fill in `fic` and its chapter stubs from a story page.

		Everything except genres and chapter titles is read from the
		``profile_top`` element: title, description, rating, counts, dates,
		status and author. Genres come from the links in ``pre_story_links``
		and are added as fandoms; chapter titles come from the ``chap_select``
		dropdown when there is more than one chapter.

		If ``profile_top`` is missing and the page shows the deleted-story
		warning, the fic is marked abandoned, upserted and returned as-is.

		Returns the same `fic` instance. Raises Exception when ``profile_top``
		is missing for any other reason, when the title, description or author
		cannot be found, or on an unknown status or category.
		"""
		from bs4 import BeautifulSoup  # type: ignore
		deletedFicText = 'Story Not FoundUnable to locate story. Code 1.'
		soup = BeautifulSoup(wwwHtml, 'html5lib')
		profile_top = soup.find(id='profile_top')
		# story might've been deleted
		if profile_top is None:
			for gui_warning in soup.find_all('span', {'class': 'gui_warning'}):
				if gui_warning.get_text() == deletedFicText:
					fic.ficStatus = FicStatus.abandoned
					fic.upsert()
					return fic
			# not a recognised deletion notice: fail with a clear message
			# instead of an AttributeError on None further down
			raise Exception('error: unable to find profile_top')

		text = profile_top.get_text()
		pt_str = str(profile_top)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		for b in profile_top.find_all('b'):
			# get('class') is None for a <b> without a class attribute
			b_class = b.get('class') or []
			if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
				fic.title = b.get_text()
				break
		else:
			raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

		fic.url = self.constructUrl(fic.localId, 1, fic.title)

		for div in profile_top.find_all('div'):
			div_class = div.get('class') or []
			if (
				div.get('style') == 'margin-top:2px' and len(div_class) == 1
				and div_class[0] == 'xcontrast_txt'
			):
				fic.description = div.get_text()
				break
		else:
			raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		matcher = RegexMatcher(
			text, {
				'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
				'chapterCount?': (r'Chapters:\s+(\d+)', int),
				'wordCount': (r'Words:\s+(\S+)', int),
				'reviewCount?': (r'Reviews:\s+(\S+)', int),
				'favoriteCount?': (r'Favs:\s+(\S+)', int),
				'followCount?': (r'Follows:\s+(\S+)', int),
				'updated?': (r'Updated:\s+(\S+)', str),
				'published': (r'Published:\s+(\S+)', str),
			}
		)
		matcher.matchAll(fic)

		# the matcher stores the dates as raw strings; convert to timestamps
		if fic.published is not None:
			publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
			fic.published = OilTimestamp(publishedUts)

		if fic.updated is None:
			fic.updated = fic.published
		else:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		if fic.chapterCount is None:
			fic.chapterCount = 1

		# no 'Status:' field at all means the story is still ongoing
		match = re.search(r'Status:\s+(\S+)', text)
		if match is None:
			fic.ficStatus = FicStatus.ongoing
		else:
			status = match.group(1)
			if status != 'Complete':
				raise Exception('unknown status: {}'.format(status))
			fic.ficStatus = FicStatus.complete

		for a in profile_top.find_all('a'):
			a_href = a.get('href')
			if a_href is None or not a_href.startswith('/u/'):
				continue
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
		else:
			raise Exception('unable to find author:\n{}'.format(text))

		preStoryLinks = soup.find(id='pre_story_links')
		for a in preStoryLinks.find_all('a'):
			href = a.get('href')
			if href is None:
				continue
			hrefParts = href.split('/')
			# both link shapes start and end with a slash, i.e. the first and
			# last split parts are empty
			slashWrapped = len(hrefParts[0]) == 0 and len(hrefParts[-1]) == 0

			# if it's a top level category: /<category>/
			if len(hrefParts) == 3 and slashWrapped:
				cat = hrefParts[1]
				if cat in fictionPressCategories:
					continue  # skip categories
				raise Exception('unknown category: {}'.format(cat))

			# if it's a regular genre in some category: /<category>/<genre>/
			if len(hrefParts) == 4 and slashWrapped:
				# ensure category is in our map
				if hrefParts[1] not in fictionPressCategories:
					raise Exception('unknown category: {}'.format(hrefParts[1]))

				# ensure it's in our whitelist
				if hrefParts[2] not in fictionPressGenres:
					util.logMessage(f'FictionPressAdapter: unknown genre {hrefParts[2]}')
					continue

				fic.add(Fandom.define(hrefParts[2]))
				continue

			util.logMessage(f'FictionPressAdapter: unknown genre {fic.id}: {href}')

		fic.upsert()

		chapterTitles = []
		if fic.chapterCount > 1:
			chapterSelect = soup.find(id='chap_select')
			if chapterSelect is not None:
				chapterTitles = [
					co.getText().strip() for co in chapterSelect.findAll('option')
				]

		for cid in range(1, fic.chapterCount + 1):
			ch = fic.chapter(cid)
			ch.localChapterId = str(cid)
			if len(chapterTitles) >= cid:
				ch.title = util.cleanChapterTitle(chapterTitles[cid - 1], cid)
			elif fic.chapterCount == 1:
				# a single-chapter story has no dropdown; reuse the fic title
				ch.title = fic.title
			ch.upsert()

		return fic