Python Fic.wordCount примеры использования

Язык программирования: Python

Пространство имен/Пакет: store

Класс/Тип: Fic

Метод/Функция: wordCount

Примеров на hotexamples.com: 9

Python Fic.wordCount — найдено 9 примеров. Это лучшие примеры кода на Python для store.Fic.wordCount, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

upsert(23)

url(21)

ficStatus(16)

title(16)

published(15)

languageId(15)

reviewCount(15)

followCount(15)

fetched(15)

favoriteCount(15)

updated(15)

description(15)

chapter(15)

lookup(13)

chapterCount(13)

select(12)

add(12)

ageRating(11)

wordCount(9)

new(6)

list(4)

getAuthorName(4)

load(3)

get(3)

tryLoad(2)

getUserFic(1)

fandoms(1)

extraMeta(1)

authorId(1)

insert(1)

Пример #1

Показать файл

Файл: wavesArisenAdapter.py Проект: FanFicDev/hermes

    def parseInfoInto(self, fic: Fic, html: str) -> Fic:
        from bs4 import BeautifulSoup
        html = html.replace('\r\n', '\n')
        soup = BeautifulSoup(html, 'html.parser')

        # wooh hardcoding
        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")

        fic.title = 'The Waves Arisen'
        fic.ageRating = 'M'

        self.setAuthor(fic, 'wertifloke', 'https://wertifloke.wordpress.com/',
                       str(2))

        # taken from https://www.parahumans.net/about/
        fic.description = '''
A young Naruto found refuge in the village library, and grew up smart, but by blood he is Ninja, and what place is there for curiosity and calculation in this brutal world of warring states?

The Waves Arisen is a complete novel-length work of Rationalist Naruto Fanfiction. No prior knowledge of the Naruto universe is necessary to follow along. '''

        chapterUrls = self.getChapterUrls(html)
        oldChapterCount = fic.chapterCount
        fic.chapterCount = len(chapterUrls)

        # TODO?
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        if fic.ficStatus is None or fic.ficStatus == FicStatus.broken:
            fic.ficStatus = FicStatus.ongoing

        fic.published = self.getChapterPublishDate(chapterUrls[0])
        fic.updated = self.getChapterPublishDate(chapterUrls[-1])

        if oldChapterCount is None or fic.chapterCount > oldChapterCount:
            fic.wordCount = 0
        if fic.wordCount == 0:
            fic.upsert()
            for cid in range(1, fic.chapterCount + 1):
                c = fic.chapter(cid)
                c.cache()
                chtml = c.html()
                if chtml is not None:
                    fic.wordCount += len(chtml.split())

        fic.add(Fandom.define('Naruto'))
        # TODO: chars/relationship?

        return fic

Пример #2

Показать файл

Файл: xenForoAdapter.py Проект: FanFicDev/hermes

    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Populate ``fic`` from a XenForo thread page and store its chapters.

        The title comes from the page's ``<title>`` element and the author
        from the post returned by ``self.getRealAuthorPost``. Chapter post
        urls are taken from the threadmarks page when it can be scraped and
        parsed, otherwise from the reader pages, otherwise from
        ``self.getDeepAuthorPostUrls``. Each chapter post is then located,
        its content extracted, and the fic plus every chapter upserted.

        Args:
            fic: the record to populate; it is mutated in place and upserted.
            wwwHtml: markup of the thread page.

        Returns:
            The same ``fic`` object that was passed in.

        Raises:
            Exception: when the title, author username, a chapter post, a
                post's content, or the updated date cannot be determined.
        """
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(wwwHtml, 'html5lib')

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?
        if fic.ficStatus is None or fic.ficStatus == FicStatus.broken:
            fic.ficStatus = FicStatus.ongoing

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0
        fic.ageRating = 'M'  # TODO?

        # grab title from <title> element
        titles = soup.find('head').find_all('title')
        if len(titles) != 1:
            raise Exception(f'error: cannot find title: {len(titles)}')
        ntitle = ''
        try:
            ntitle = titles[0].get_text()
        except Exception:
            # best effort: keep the empty title, but do not swallow
            # KeyboardInterrupt/SystemExit the way a bare except would
            pass  # TODO FIXME
        # only overwrite an existing title when the new one is non-blank
        if fic.title is None or len(ntitle.strip()) > 0:
            fic.title = ntitle
        if len(self.titleSuffix) > 0 and fic.title.endswith(self.titleSuffix):
            fic.title = fic.title[:-len(self.titleSuffix)]
        fic.title = fic.title.strip()

        # determine author
        authorPost = self.getRealAuthorPost(fic)
        authorPostUsernames = authorPost.find_all('a', {'class': 'username'})
        if len(authorPostUsernames) < 1:
            raise Exception('error: unable to find author username')
        author = authorPostUsernames[0].get_text()
        auth_href = authorPostUsernames[0].get('href')
        authorUrl = urllib.parse.urljoin(self.baseUrl, auth_href)
        if not authorUrl.startswith(self.baseUrl):
            raise Exception('error: unknown username href format')
        # the author id is the path segment following 'members/'
        authorId = authorUrl[len(self.baseUrl):]
        if not authorId.startswith('members/'):
            raise Exception(f'error: unknown author id format: {authorId}')
        authorId = authorId.split('/')[1]
        self.setAuthor(fic, author, authorUrl, authorId)

        if fic.description is None:
            # TODO?
            fic.description = htmlEscape(fic.title + ' by ' +
                                         fic.getAuthorName())

        # try grabbing reader version, fallback to full pages
        threadmarksHtml = None
        try:
            # append the query with '&' when baseUrl already carries a '?'
            sep = '?' if self.baseUrl.find('?') < 0 else '&'
            url = f'{self.baseUrl}threads/{fic.localId}/threadmarks{sep}category_id=1'
            threadmarksHtml = self.scrapeLike(url)
            self.readerSoftScrape(fic)
        except Exception:
            # note: we do this before the threadmarks check for old-style fics
            # soft scrape all thread pages to ensure we have everything
            self.deepSoftScrape(fic)

        postSoups: Dict[str, Any] = {}

        postUrls: List[str] = []
        # maps 1-based chapter number -> threadmark title
        chapterTitles = {}
        try:
            # scrape the threadmarks page, assuming there is one
            # (if threadmarksHtml is still None this raises and we fall
            # through to the reader-page fallback in the except below)
            threadmarksSoup = BeautifulSoup(threadmarksHtml, 'html5lib')

            # attempt to extract a fic description
            threadmarkExtraInfo = threadmarksSoup.find(
                'div', {'class': 'threadmarkListingHeader-extraInfo'})
            if threadmarkExtraInfo is not None:
                bbWrapper = threadmarkExtraInfo.find('div',
                                                     {'class': 'bbWrapper'})
                if bbWrapper is not None:
                    desc = bbWrapper.decode_contents()
                    descView = HtmlView(desc, markdown=False)
                    fic.description = ''.join(
                        [f'<p>{l}</p>' for l in descView.text])

            # determine chapter count based on threadmarks
            threadmarkList = threadmarksSoup.find('div',
                                                  {'class': 'threadmarkList'})
            threadmarks = None
            if threadmarkList is not None:
                threadmarks = threadmarkList.find_all(
                    'li', {'class': 'threadmarkListItem'})
            else:
                # alternate markup: look for the threadmark body block instead
                threadmarkList = threadmarksSoup.find(
                    'div', {'class': 'block-body--threadmarkBody'})
                if threadmarkList is None:
                    raise Exception('error: unable to find threadmark menu')
                if threadmarkList.find(class_='fa-ellipsis-h') is not None:
                    raise Exception('unable to handle elided threamdarks')
                threadmarks = threadmarkList.find_all('li')
                if len(threadmarks) == 0:
                    threadmarks = threadmarkList.find_all('tr')
                util.logMessage(
                    f'XenForo|new threadmarks count|{len(threadmarks)}')

            for threadmark in threadmarks:
                # entries carrying a new-indicator span are skipped
                if threadmark.find(
                        'span', {'class': 'message-newIndicator'}) is not None:
                    continue
                a = threadmark.find('a')
                purl = a.get('href')
                # make relative thread links absolute against baseUrl
                if purl.startswith('threads/'):
                    purl = '{}{}'.format(self.baseUrl, purl)
                elif purl.startswith('/threads/'):
                    purl = '{}{}'.format(self.baseUrl, purl[1:])
                postUrls += [purl]

                chapterTitles[len(postUrls)] = a.getText().strip()

            try:
                postSoups, _ = self.getReaderPosts(fic)
            except Exception as ie:
                # FIXME oh boy:
                # https://forum.questionablequesting.com/threads/worm-cyoa-things-to-do-in-brockton-bay-when-youre-a-bored-demigod.1247/reader
                # Reader page says 36 threadmarks, but actual threadmark list says 33
                # First reader page abruptly stops at 27 threadmarks
                util.logMessage(
                    'XenForoAdapter: unable to getReaderPosts: {}\n{}'.format(
                        ie, traceback.format_exc()))
        except Exception as e:
            util.logMessage(
                'XenForoAdapter: unable to parse threadmarks: {}\n{}'.format(
                    e, traceback.format_exc()))
            try:
                postUrls = self.getReaderPostUrls(fic)
                postSoups, chapterTitles = self.getReaderPosts(fic)
            except Exception as ie:
                util.logMessage(
                    'XenForoAdapter: unable to parse reader posts: {}\n{}'.
                    format(ie, traceback.format_exc()))
                postUrls = self.getDeepAuthorPostUrls(fic)
                # if we fallback to here, don't immediately setup postSoups at all;
                # they'll be fetched as needed later

        fic.chapterCount = len(postUrls)

        # parallel lists, one entry per chapter in postUrls order
        chapterPosts: List[Optional[str]] = []
        chapterUrls: List[str] = []
        chapterPostIds: List[str] = []

        # one-entry cache so consecutive posts on the same page share a fetch
        lastSoupUrl: Optional[str] = None
        lastSoup: Optional[Any] = None

        for purl in postUrls:
            parts = purl.split('#')
            burl = parts[0]
            # urls without a '#fragment' fall back to the author post's id
            postId = authorPost.get('id') if len(parts) < 2 else parts[1]

            rawPost = None
            # first try getting the post from the reader pages
            if postId in postSoups and postSoups[postId] is not None:
                rawPost = str(postSoups[postId])
            else:
                # if needed, fallback to grabbing that page from the entire thread
                pageSoup = None
                if lastSoupUrl is not None and lastSoupUrl == burl:
                    pageSoup = lastSoup
                else:
                    pageContent = self.scrapeLike(burl)
                    pageSoup = BeautifulSoup(pageContent, 'html5lib')
                    lastSoupUrl = burl
                    lastSoup = pageSoup
                assert (pageSoup is not None)
                if postId is not None:
                    poss = pageSoup.find_all(self.postContainer,
                                             {'id': postId})
                    if len(poss) != 1:
                        # XenForo2 often has js- prefixed on the actual id attr
                        poss = pageSoup.find_all(self.postContainer,
                                                 {'id': 'js-' + postId})
                    if len(poss) != 1:
                        raise Exception(
                            f'error: cannot find post for chapter {postId}')
                    rawPost = str(poss[0])
                else:
                    # no id at all: take the first 'message' container on the page
                    rawPost = str(
                        pageSoup.find_all(self.postContainer,
                                          {'class': 'message'})[0])

            chapterPosts += [rawPost]
            chapterUrls += [burl]
            chapterPostIds += [postId]

        # these are recomputed from scratch while walking the chapter posts
        fic.wordCount = 0
        fic.published = None
        fic.updated = None

        chapterContents: List[str] = []
        for rawPost in chapterPosts:
            post = BeautifulSoup(rawPost, 'html5lib')
            content = post.find_all(
                'div', {'class': ['messageContent', 'message-content']})
            if len(content) != 1:
                raise Exception('error: cannot find content for chapter post')
            content = content[0]

            # put a <br> in front of every "last edited" div in the content
            lastEditedDivs = content.find_all('div',
                                              {'class': 'message-lastEdit'})
            for lastEditedDiv in lastEditedDivs:
                br = soup.new_tag("br")
                lastEditedDiv.insert_before(br)

            chapterContents += [str(content)]
            # counts whitespace-separated tokens of the content markup
            fic.wordCount += len(str(content).split())

            uts = self.getPostUpdatedOrPublished(post)

            # first post sets published; every post overwrites updated, so
            # updated ends up being the last post's timestamp
            if fic.published is None:
                fic.published = OilTimestamp(uts)
            fic.updated = OilTimestamp(uts)

        # still None only when there were no chapter posts at all
        if fic.updated is None:
            raise Exception(
                f'unable to determine updated date: {len(chapterPosts)} {len(postUrls)}'
            )

        fic.upsert()
        for cid in range(fic.chapterCount):
            # chapters are 1-based, the parallel lists are 0-based
            chapter = fic.chapter(cid + 1)
            chapter.url = chapterUrls[cid]
            chapter.localChapterId = chapterPostIds[cid]
            if (cid + 1) in chapterTitles:
                chapter.title = chapterTitles[(cid + 1)]
            chapter.upsert()

            chapter.setHtml(str(chapterContents[cid]))

        # TODO: word count, published, updated can only be found once all chapters

        # each post is inside an li id="post-{number}" class="message"
        # each post has data-author="{author}"

        self.updateTitle(fic)

        return fic

Пример #3

Показать файл

Файл: sugarQuillAdapter.py Проект: FanFicDev/hermes

	def parseInfoInto(self, fic: Fic, html: str) -> Fic:
		"""Populate ``fic`` from a Sugar Quill story page and return it.

		Author, title and the chapter list are read from the single
		``info2_pane`` cell of ``html``. Description, review count and the
		last-updated date are read from the story's table on the author
		profile page, which is fetched with ``scrape.scrape``. The fic and
		every chapter are upserted, and the word count is summed over the
		chapters' html.

		Args:
			fic: the record to populate; it is mutated in place and upserted.
			html: markup of the story page.

		Returns:
			The same ``fic`` object that was passed in.

		Raises:
			Exception: when the info pane, author link, title, story table,
				review count or last-updated date cannot be located.
		"""
		from bs4 import BeautifulSoup
		html = html.replace('\r\n', '\n')
		soup = BeautifulSoup(html, 'html.parser')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		infoPane = soup.findAll('td', {'class': 'info2_pane'})
		if len(infoPane) != 1:
			raise Exception('unable to find info2_pane: {}'.format(fic.url))
		infoPane = infoPane[0]

		authorHrefPrefix = 'index.php?action=profile&id='
		authorLinks = infoPane.findAll('a')
		authorUrl = None
		for authorLink in authorLinks:
			href = authorLink.get('href')
			# anchors without an href yield None; skip them instead of crashing
			if href is None or not href.startswith(authorHrefPrefix):
				continue

			authorUrl = self.baseUrl + '/' + href
			author = authorLink.getText()
			# the local id is whatever follows the profile prefix
			authorLocalId = href[len(authorHrefPrefix):]

			self.setAuthor(fic, author, authorUrl, authorLocalId)
			break
		else:
			# loop ended without a break: no profile link was found
			raise Exception('unable to find author: {}'.format(fic.url))

		# the title is the text between the "Story" and "Chapter" labels
		titleMatch = re.search(
			'<b>Story</b>:((.|\r|\n)*)<b>Chapter</b>:', str(infoPane), re.MULTILINE
		)
		if titleMatch is None:
			edumpContent(str(infoPane), 'sugarquill_title')
			raise Exception('could not locate title')

		fic.title = titleMatch.group(1).replace('&nbsp;', ' ').strip()

		# every <option> in the pane is treated as one chapter
		chapterOptions = infoPane.findAll('option')
		chapterTitles = {}
		for chapterOption in chapterOptions:
			cid = int(chapterOption.get('value'))
			chapterTitles[cid] = chapterOption.getText().strip()
		fic.chapterCount = len(chapterOptions)

		fic.ageRating = '<unkown>'  # TODO
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ficStatus = FicStatus.ongoing  # TODO: no uniform way to detect?

		authorProfileHtml = scrape.scrape(authorUrl)['raw']
		authorProfileHtml = authorProfileHtml.replace('\r', '')
		authorSoup = BeautifulSoup(authorProfileHtml, 'html5lib')

		# find the table on the profile page that links to this story
		storyHrefPrefix = 'read.php?storyid='
		storyTables = authorSoup.findAll('table', {'width': '90%'})
		ourStoryTable = None
		for storyTable in storyTables:
			storyId = None
			for a in storyTable.findAll('a'):
				href = a.get('href')
				if href is None or not href.startswith(storyHrefPrefix):
					continue
				# keep only the id, dropping any further '&...' parameters;
				# slicing with find('&') would cut the last character off
				# when the href has no '&' at all
				storyId = href[len(storyHrefPrefix):].split('&', 1)[0]
				# normalize through int so it compares cleanly with localId
				storyId = str(int(storyId))
			if storyId is None:
				continue
			if storyId != str(fic.localId):
				continue
			ourStoryTable = storyTable
		if ourStoryTable is None:
			raise Exception(f'unable to find story table: {fic.localId} {authorUrl}')

		# row 0 is searched for reviews, row 1 for the description and
		# row 2 for the last-updated date
		trs = ourStoryTable.findAll('tr')
		if len(trs) != 3:
			raise Exception(
				f'ourStoryTable does not have 3 trs: {fic.localId} {authorUrl}'
			)

		fic.description = trs[1].find('td').getText().strip()

		reviewsMatch = re.search(
			r'\( Reviews: <a[^>]*>(\d+)</a> \)</td>', str(trs[0]), re.MULTILINE
		)
		if reviewsMatch is None:
			edumpContent(str(trs[0]), 'sugarquill_reviews')
			raise Exception('could not locate reviews')

		fic.reviewCount = int(reviewsMatch.group(1).strip())

		updatedMatch = re.search(r'Last updated (\d+/\d+/\d+)', str(trs[2]))
		if updatedMatch is None:
			edumpContent(str(trs[2]), 'sugarquill_updated')
			raise Exception('could not locate last updated')

		fic.updated = OilTimestamp(
			util.parseDateAsUnix(updatedMatch.group(1), fic.fetched)
		)
		# no separate published date is parsed; reuse updated if none is set
		if fic.published is None:
			fic.published = fic.updated

		fic.wordCount = 0
		fic.upsert()

		for cid in range(fic.chapterCount):
			# chapter numbers are 1-based
			ch = fic.chapter(cid + 1)
			ch.localChapterId = str(cid + 1)
			ch.title = chapterTitles[cid + 1]
			ch.cache()
			ch.upsert()
			chtml = ch.html()
			if chtml is not None:
				# counts whitespace-separated tokens of the chapter html
				fic.wordCount += len(chtml.split())

		fic.add(Fandom.define('Harry Potter'))
		# TODO: chars/relationship?

		return fic

Пример #4

Показать файл

	def parseInfoInto(self, fic: Fic, html: str) -> Fic:
		from bs4 import BeautifulSoup
		soup = BeautifulSoup(html, 'html.parser')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		titleHeadings = soup.findAll('h2', {'class': 'title heading'})
		if len(titleHeadings) != 1:
			raise Exception('unable to find ao3 title {}'.format(fic.url))
		fic.title = titleHeadings[0].get_text().strip()

		summaryModules = soup.findAll('div', {'class': 'summary module'})
		if len(summaryModules) != 1:
			prefaceGroups = soup.findAll('div', {'class': 'preface group'})
			if len(prefaceGroups) == 1:
				summaryModules = prefaceGroups[0].findAll(
					'div', {'class': 'summary module'}
				)

		if len(summaryModules) == 1:
			summaryBq = summaryModules[0].find('blockquote')
			fic.description = summaryBq.decode_contents(formatter='html').strip()
		elif fic.description is None:
			fic.description = "{no summary}"
			# raise Exception('unable to find ao3 summary {}'.format(fic.localId))

		fic.ageRating = '<unkown>'

		# TODO: error handling
		cText = ' '.join(soup.find('dd', {'class': 'chapters'}).contents).strip()
		ps = cText.split('/')
		completedChapters = int(ps[0])
		totalChapters = None if ps[1] == '?' else int(ps[1])
		fic.chapterCount = completedChapters

		wText = ' '.join(soup.find('dd', {'class': 'words'}).contents).strip()
		fic.wordCount = int(wText)

		fic.reviewCount = 0

		fic.favoriteCount = 0
		kDefinition = soup.find('dd', {'class': 'kudos'})
		if kDefinition is not None:
			kText = ' '.join(kDefinition.contents).strip()
			fic.favoriteCount = int(kText)

		fic.followCount = 0

		pText = ' '.join(soup.find('dd', {'class': 'published'}).contents).strip()
		publishedUts = util.parseDateAsUnix(pText, fic.fetched)
		fic.published = OilTimestamp(publishedUts)

		if fic.updated is None:
			fic.updated = fic.published
		if fic.updated is not None:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		fic.ficStatus = FicStatus.ongoing  # TODO chapter/chapters?

		if totalChapters is None or completedChapters < totalChapters:
			fic.ficStatus = FicStatus.ongoing

		statusDt = soup.find('dt', {'class': 'status'})
		if statusDt is not None:
			if statusDt.contents[0] == 'Completed:':
				fic.ficStatus = FicStatus.complete
				cText = ' '.join(soup.find('dd', {'class': 'status'}).contents).strip()
				updatedUts = util.parseDateAsUnix(cText, fic.fetched)
				fic.updated = OilTimestamp(updatedUts)
			elif statusDt.contents[0] == 'Updated:':
				fic.ficStatus = FicStatus.ongoing
				uText = ' '.join(soup.find('dd', {'class': 'status'}).contents).strip()
				updatedUts = util.parseDateAsUnix(uText, fic.fetched)
				fic.updated = OilTimestamp(updatedUts)
			else:
				raise Exception('unkown status: {}'.format(statusDt.contents[0]))

		byline = soup.find('h3', {'class': 'byline heading'})
		authorLink = byline.find('a')
		if authorLink is None:
			if fic.authorId is not None and len(fic.getAuthorName()) > 0:
				pass  # updated author to anon, don't make changes
			else:
				# first loaded after it was already set to anonymous
				authorUrl = ''
				author = 'Anonymous'
				authorId = 'Anonymous'
				self.setAuthor(fic, author, authorUrl, authorId)
		else:
			authorUrl = authorLink.get('href')
			author = ' '.join(byline.find('a').contents)
			authorId = author  # map pseudo to real?
			self.setAuthor(fic, author, authorUrl, authorId)

		if fic.chapterCount > 1:
			fic.upsert()
			localChapterIdSelect = soup.find(id='selected_id').findAll('option')
			# note: ao3 sometimes says there are less chapters than there really
			# are, possibly due to caching on their end. We just ensure there's _at
			# least_ chapterCount chapters, then fetch whatever the dropdown tells
			# us to
			if len(localChapterIdSelect) > fic.chapterCount:
				fic.chapterCount = len(localChapterIdSelect)
				fic.upsert()
			if len(localChapterIdSelect) != fic.chapterCount:
				raise Exception('mismatching localChapterId count?')

			for cid in range(1, fic.chapterCount + 1):
				chap = fic.chapter(cid)
				chap.url = '{}{}/chapters/{}?view_adult=true'.format(
					self.baseUrl, fic.localId, localChapterIdSelect[cid - 1].get('value')
				)
				chap.localChapterId = localChapterIdSelect[cid - 1].get('value')
				chap.title = localChapterIdSelect[cid - 1].getText().strip()
				if chap.title is not None:
					chap.title = util.cleanChapterTitle(chap.title, cid)
				chap.upsert()

		fandomDd = soup.find('dd', {'class': 'fandom tags'})
		if fandomDd is not None:
			fandomTags = fandomDd.findAll('a', {'class': 'tag'})
			for ft in fandomTags:
				originalF = ft.contents[0].strip()
				f = originalF.lower()
				# TODO: this seriously needs reworked
				if (
					(f.startswith("harry potter ") and f.endswith("rowling"))
					or f == 'harry potter - fandom'
					or f == 'fantastic beasts and where to find them (movies)'
					or f == 'harry potter next generation - fandom'
				):
					fic.add(Fandom.define('Harry Potter'))
				elif (
					f == 'sherlock - fandom' or f == 'sherlock (tv)'
					or f == 'sherlock holmes & related fandoms'
					or f == 'sherlock holmes - arthur conan doyle'
					or f == 'sherlock holmes (downey films)'
				):
					fic.add(Fandom.define('Sherlock Holmes'))
				elif f == 'furry (fandom)' or f == 'harry - fandom':
					continue  # skip
				elif f == 'fleurmione - fandom':
					continue  # skip
				elif f == 'skyfall (2012) - fandom':
					fic.add(Fandom.define('James Bond'))
				elif f == 'orphan black (tv)':
					fic.add(Fandom.define('Orphan Black'))
				elif (
					f == 'naruto' or f == 'naruto shippuden'
					or f == 'naruto shippuuden - fandom'
				):
					fic.add(Fandom.define('Naruto'))
				elif f == 'naruto/harry potter':
					fic.add(Fandom.define('Naruto'))
					fic.add(Fandom.define('Harry Potter'))
				elif f == 'bleach':
					fic.add(Fandom.define('Bleach'))
				elif (
					f == 'iron man (movies)' or f == 'iron man - all media types'
					or f == 'iron man (comic)' or f == 'iron man - fandom'
					or f == 'iron man (comics)'
				):
					fic.add(Fandom.define('Iron Man'))
				elif (
					f == 'the avengers (marvel) - all media types'
					or f == 'the avengers (marvel movies)'
					or f == 'the avengers - ambiguous fandom'
					or f == 'the avengers (2012)' or f == 'the avengers'
					or f == 'avengers (marvel) - all media types'
					or f == 'marvel avengers movies universe' or f == 'avengers'
				):
					fic.add(Fandom.define('Avengers'))
				elif f == 'marvel 616':
					fic.add(Fandom.define('Marvel'))
					fic.add(Fandom.define('Marvel 616'))
				elif f == 'thor (movies)' or f == 'thor - all media types':
					fic.add(Fandom.define('Thor'))
				elif (
					f == 'captain america (movies)'
					or f == 'captain america - all media types'
					or f == 'captain america (comics)'
				):
					fic.add(Fandom.define('Captain America'))
				elif (
					f == 'avatar: the last airbender' or f == 'avatar: legend of korra'
					or f == 'avatar the last airbender - fandom'
				):
					fic.add(Fandom.define('Avatar'))
				elif f == 'original work':
					fic.add(Fandom.define('Original Work'))
				elif f == 'stargate atlantis':
					fic.add(Fandom.define('Stargate Atlantis'))
				elif f == 'stargate sg-1':
					fic.add(Fandom.define('Stargate SG-1'))
				elif f == 'stargate - all series':
					fic.add(Fandom.define('Stargate Atlantis'))
					fic.add(Fandom.define('Stargate SG-1'))
				elif f == 'agents of s.h.i.e.l.d. (tv)':
					fic.add(Fandom.define('Avengers'))
				elif f == 'supernatural':
					fic.add(Fandom.define('Supernatural'))
				elif f == 'teen wolf (tv)':
					fic.add(Fandom.define('Teen Wolf'))
				elif f == 'grimm (tv)':
					fic.add(Fandom.define('Grimm'))
				elif (
					f == 'the amazing spider-man (movies - webb)'
					or f == 'spider-man - all media types'
					or f == 'spider-man: homecoming (2017)'
				):
					fic.add(Fandom.define('Spiderman'))
				elif (
					f == 'x-men - all media types' or f == 'x-men (movieverse)'
					or f == 'x-men (comicverse)'
				):
					fic.add(Fandom.define('X-Men'))
				elif (
					f == 'lord of the rings - j. r. r. tolkien'
					or f == 'the lord of the rings - j. r. r. tolkien'
				):
					fic.add(Fandom.define('Lord of the Rings'))
				elif (
					f == 'crisis core: final fantasy vii'
					or f == 'compilation of final fantasy vii' or f == 'final fantasy vii'
				):
					fic.add(Fandom.define('Final Fantasy VII'))
					fic.add(Fandom.define('Final Fantasy'))
				elif f == 'sen to chihiro no kamikakushi | spirited away':
					fic.add(Fandom.define('Spirited Away'))
				elif f == 'howl no ugoku shiro | howl\'s moving castle':
					fic.add(Fandom.define('Howl\'s Moving Castle'))
				elif f == 'rise of the guardians (2012)':
					fic.add(Fandom.define('Rise of the Guardians'))
				elif (
					f == 'doctor who' or f == 'doctor who (2005)'
					or f == 'doctor who & related fandoms'
				):
					fic.add(Fandom.define('Doctor Who'))
				elif f == 'daredevil (tv)' or f == 'daredevil (comics)':
					fic.add(Fandom.define('DareDevil'))
				elif f == 'labyrinth (1986)':
					fic.add(Fandom.define('Labyrinth'))
				elif f == 'gravity falls':
					fic.add(Fandom.define('Gravity Falls'))
				elif f == 'once upon a time (tv)':
					fic.add(Fandom.define('Once Upon a Time'))
				elif f == 'doctor strange (comics)':
					fic.add(Fandom.define('Doctor Strange'))
				elif f == 'the sentinel':
					fic.add(Fandom.define('The Sentinel'))
				elif f == 'teen titans (animated series)':
					fic.add(Fandom.define('Teen Titans'))
				elif (
					f == 'dcu' or f == 'dcu animated' or f == 'dcu (comics)'
					or f == 'dc extended universe' or f == 'dc animated universe'
				):
					fic.add(Fandom.define('DC'))
				elif f == 'vampire hunter d':
					fic.add(Fandom.define('Vampire Hunter D'))
				elif f == 'homestuck':
					fic.add(Fandom.define('Homestuck'))
				elif f == 'one piece':
					fic.add(Fandom.define('One Piece'))
				elif f == 'batman (movies - nolan)':
					fic.add(Fandom.define('Batman'))
				elif f == 'die hard (movies)':
					fic.add(Fandom.define('Die Hard'))
				elif f == 'discworld - terry pratchett':
					fic.add(Fandom.define('Discworld'))
				elif f == 'gossip girl':
					fic.add(Fandom.define('Gossip Girl'))
				elif (
					f == 'a song of ice and fire - george r. r. martin'
					or f == 'a song of ice and fire & related fandoms'
				):
					fic.add(Fandom.define('A Song of Ice and Fire'))
				elif f == 'supergirl (tv 2015)':
					fic.add(Fandom.define('Supergirl'))
				elif f == 'merlin (tv)':
					fic.add(Fandom.define('Merlin'))
				elif f == 'star trek':
					fic.add(Fandom.define('Star Trek'))
				elif f == 'steven universe (cartoon)':
					fic.add(Fandom.define('Steven Universe'))
				elif f == 'hellsing':
					fic.add(Fandom.define('Hellsing'))
				elif f == 'the breaker':
					fic.add(Fandom.define('The Breaker'))
				elif f == 'smallville':
					fic.add(Fandom.define('Smallville'))
				elif f == '베리타스 | veritas (manhwa)':
					fic.add(Fandom.define('Veritas (manhwa)'))
				elif f == 'guardians of childhood - william joyce':
					fic.add(Fandom.define('Guardians of Childhood'))
				elif f == 'person of interest (tv)':
					fic.add(Fandom.define('Person of Interest'))
				elif f == 'james bond (craig movies)':
					fic.add(Fandom.define('James Bond'))
				elif f == 'the bourne legacy (2012)':
					fic.add(Fandom.define('Jason Bourne'))
				elif f == 'numb3rs':
					fic.add(Fandom.define('Numb3rs'))
				elif f == 'temeraire - naomi novik':
					fic.add(Fandom.define('Temeraire'))
				elif f == 'twilight series - stephenie meyer':
					fic.add(Fandom.define('Twilight'))
				elif f == 'dungeons and dragons - fandom':
					fic.add(Fandom.define('Dungeons and Dragons'))
				elif f == 'american horror story' or f == 'american horror story: cult':
					fic.add(Fandom.define('American Horror Story'))
				elif (
					f == 'worm (web serial novel)' or f == 'worm - wildbow'
					or f == 'parahumans series - wildbow'
					or f == 'worm (web serial) | wildbow' or f == 'worm - fandom'
					or f == 'parahumans - fandom' or f == 'worm (parahumans)'
					or f == 'worm (web serial)' or f == 'worm | parahumans'
					or f == 'worm (web novel)'
				):
					fic.add(Fandom.define('Worm'))
				elif f == 'toaru kagaku no railgun | a certain scientific railgun':
					fic.add(Fandom.define('A Certain Scientific Railgun'))
				elif f == 'toaru majutsu no index | a certain magical index':
					fic.add(Fandom.define('A Certain Magical Index'))
				elif f == 'cthulhu mythos - h. p. lovecraft':
					fic.add(Fandom.define('Cthulhu'))
				elif f == 'transformers - all media types':
					fic.add(Fandom.define('Transformers'))
				elif f == 'destiny (video game)':
					fic.add(Fandom.define('Destiny'))
				elif f == 'fandom - fandom' or f == 'meta - fandom':
					pass  # >_>
				elif f == 'house m.d.':
					fic.add(Fandom.define('House, M.D.'))
				elif f == 'the hobbit (jackson movies)':
					fic.add(Fandom.define('The Hobbit'))
				elif f == 'doctor strange (2016)':
					fic.add(Fandom.define('Doctor Strange'))
				elif f == 'arrow (tv 2012)':
					fic.add(Fandom.define('Arrow'))
				elif f == 'the flash (tv 2014)':
					fic.add(Fandom.define('Flash'))
				elif f == 'senki zesshou symphogear':
					fic.add(Fandom.define('Symphogear'))
				elif (
					f == 'fullmetal alchemist: brotherhood & manga'
					or f == 'fullmetal alchemist - all media types'
					or f == 'fullmetal alchemist (anime 2003)'
				):
					fic.add(Fandom.define('Fullmetal Alchemist'))
				elif (
					f == 'star wars - all media types'
					or f == 'star wars episode vii: the force awakens (2015)'
					or f == 'star wars prequel trilogy'
				):
					fic.add(Fandom.define('Star Wars'))
				elif (
					f == 'guardians of the galaxy (2014)'
					or f == 'guardians of the galaxy - all media types'
					or f == 'guardians of the galaxy (movies)'
				):
					fic.add(Fandom.define('Guardians of the Galaxy'))
				elif f == 'ant man (2015)' or f == 'ant-man (movies)':
					fic.add(Fandom.define('Ant Man'))
				elif f == 'the defenders (marvel tv)':
					fic.add(Fandom.define('The Defenders'))
				elif f == 'elementary (tv)':
					fic.add(Fandom.define('Elementary'))
				elif f == 'good omens - neil gaiman & terry pratchett':
					fic.add(Fandom.define('Good Omens'))
				elif f == 'danny phantom':
					fic.add(Fandom.define('Danny Phantom'))
				elif f == 'katekyou hitman reborn!':
					fic.add(Fandom.define('Katekyo Hitman Reborn!'))
				elif f == 'welcome to night vale':
					fic.add(Fandom.define('Welcome to Night Vale'))
				elif f == 'ncis':
					fic.add(Fandom.define('NCIS'))
				elif f == 'torchwood':
					fic.add(Fandom.define('Torchwood'))
				elif f == 'magic: the gathering':
					fic.add(Fandom.define('Magic: The Gathering'))
				elif f == 'overwatch (video game)':
					fic.add(Fandom.define('Overwatch'))
				elif f == 'detroit: become human (video game)':
					fic.add(Fandom.define('Detroit: Become Human'))
				elif f == 'greek and roman mythology':
					pass
				elif f == 'life is strange (video game)':
					fic.add(Fandom.define('life is strange (video game)'))
				elif f == 'akatsuki no yona | yona of the dawn':
					fic.add(Fandom.define('Yona of the Dawn'))
				elif f == '僕のヒーローアカデミア | boku no hero academia | my hero academia':
					fic.add(Fandom.define('My Hero Academia'))
				elif f == 'voltron: legendary defender':
					fic.add(Fandom.define('Voltron'))
				elif f == 'selfie (tv)':
					fic.add(Fandom.define('Selfie'))
				elif f == 'suits (tv)':
					fic.add(Fandom.define('Suits'))
				elif f == 'fruits basket':
					fic.add(Fandom.define('Fruits Basket'))
				elif f == 'hetalia: axis powers':
					fic.add(Fandom.define('Hetalia: Axis Powers'))
				elif f == 'carmilla (web series)':
					fic.add(Fandom.define('Carmilla'))
				elif f == 'the dresden files - jim butcher':
					fic.add(Fandom.define('Dresden Files'))
				elif f == 'girl genius':
					fic.add(Fandom.define('Girl Genius'))
				elif f == 'unspecified fandom':
					pass  # TODO?
				elif f == 'nightwing (comics)':
					fic.add(Fandom.define('Nightwing'))
				elif f == 'books of the raksura - martha wells':
					fic.add(Fandom.define('Books of the Raksura'))
				elif f == 'fall of ile-rien - martha wells':
					fic.add(Fandom.define('Fall of Ile-Rien'))
				elif f == 'vorkosigan saga - lois mcmaster bujold':
					fic.add(Fandom.define('Vorkosigan Saga'))
				elif (
					f == 'highlander: the series' or f == 'highlander - all media types'
				):
					fic.add(Fandom.define('Highlander'))
				elif f == 'yoroiden samurai troopers | ronin warriors':
					fic.add(Fandom.define('Ronin Warriors'))
				elif f == 'hockey rpf':
					fic.add(Fandom.define('Hockey RPF'))
				elif f == 'pacific rim (2013)':
					fic.add(Fandom.define('Pacific Rim'))
				elif f == 'enchanted forest chronicles - patricia wrede':
					fic.add(Fandom.define('Enchanted Forest Chronicles'))
				elif f == 'tortall - tamora pierce':
					fic.add(Fandom.define('Tortall'))
				elif f == 'protector of the small - tamora pierce':
					fic.add(Fandom.define('Protector of the Small'))
				elif f == 'leverage':
					fic.add(Fandom.define('Leverage'))
				elif f == 'valdemar series - mercedes lackey':
					fic.add(Fandom.define('Valdemar Series'))
				elif (
					f == 'b.p.r.d.' or f == 'bureau for paranormal research and defense'
				):
					fic.add(Fandom.define('B.P.R.D.'))
				elif f == 'hellboy (comic)':
					fic.add(Fandom.define('Hellboy'))
				elif f == 'sga/avatar':
					fic.add(Fandom.define('Stargate Atlantis'))
					fic.add(Fandom.define('Avatar'))
				elif f == 'annihilation (2018 garland)':
					fic.add(Fandom.define('Annihilation'))
				elif f == 'craft sequence - max gladstone':
					fic.add(Fandom.define('Craft Sequence'))
				elif f == 'the good place (tv)':
					fic.add(Fandom.define('The Good Place'))
				elif f == 'jessica jones (tv)':
					fic.add(Fandom.define('Jessica Jones'))
				elif f == 'mad max series (movies)':
					fic.add(Fandom.define('Mad Max'))
				elif f == 'american gods (tv)':
					fic.add(Fandom.define('American Gods'))
				elif f == 'terminator: the sarah connor chronicles':
					fic.add(Fandom.define('Terminator: The Sarah Connor Chronicles'))
					fic.add(Fandom.define('Terminator'))
				elif f == 'wolf 359 (radio)':
					fic.add(Fandom.define('Wolf 359'))
				elif f == 'shadowrun: dragonfall':
					fic.add(Fandom.define('Shadowrun'))
				elif f == 'ars paradoxica (podcast)':
					fic.add(Fandom.define('Ars Paradoxica'))
				elif f == 'love is strange - fandom':
					fic.add(Fandom.define('Love is Strange'))
				elif f == 'dune - all media types':
					fic.add(Fandom.define('Dune'))
				elif f == 'dragon age: origins':
					fic.add(Fandom.define('Dragon Age: Origins'))
				elif f == 'game of thrones (tv)':
					fic.add(Fandom.define('Game of Thrones'))
				elif f == 'chronicles of amber - roger zelazny':
					fic.add(Fandom.define('Chronicles of Amber'))
				elif f == 'the southern reach trilogy - jeff vandermeer':
					fic.add(Fandom.define('The Southern Reach Trilogy'))
				elif f == 'continuum (tv)':
					fic.add(Fandom.define('Continuum'))
				elif f == 'mage: the ascension':
					fic.add(Fandom.define('Mage: The Ascension'))
				elif f == 'the good wife (tv)' or f == 'good wife (tv)':
					fic.add(Fandom.define('The Good Wife'))
				elif f == 'alliance-union - c. j. cherryh':
					fic.add(Fandom.define('Alliance-Union'))
				elif f == 'indexing - seanan mcguire':
					fic.add(Fandom.define('Indexing'))
				elif f == 'ultraviolet (tv)':
					fic.add(Fandom.define('Ultraviolet'))
				elif f == 'veronica mars (tv)':
					fic.add(Fandom.define('Veronica Mars'))
				elif f == 'secret circle (tv)':
					fic.add(Fandom.define('Secret Circle'))
				elif f == 'mahou shoujo madoka magika | puella magi madoka magica':
					fic.add(Fandom.define('Madoka Magica'))
				elif f == 'agent carter (tv)':
					fic.add(Fandom.define('Agent Carter'))
				elif f == 'dracula & related fandoms':
					fic.add(Fandom.define('Dracula'))
				elif f == 'dragon ball':
					fic.add(Fandom.define('Dragon Ball'))
				elif f == 'mass effect - all media types':
					fic.add(Fandom.define('Mass Effect'))
				elif f == 'firefly' or f == 'serenity (2005)':
					fic.add(Fandom.define('Firefly'))
				else:
					anyHere = False
					global ao3FandomsMap
					for fm in ao3FandomsMap:
						here = False
						for uf in fm[0]:
							if f == uf.lower().strip():
								here = True
								break
						if not here:
							continue
						anyHere = True
						for mf in fm[1]:
							fic.add(Fandom.define(mf))
					if not anyHere:
						util.logMessage(f'ao3|unknown fandom|{fic.url}|{originalF}')
						#raise Exception('unknown fandom: {} "{}"'.format(fic.url, originalF))

		ourDoms = fic.fandoms()
		# we have a canonical fandom, try to find our characters
		if len(ourDoms) == 1:
			relationshipDd = soup.find('dd', {'class': 'relationship tags'})
			if relationshipDd is not None:
				relationshipTags = relationshipDd.findAll('a', {'class': 'tag'})
				for rt in relationshipTags:
					r = rt.contents[0]
					chars = r.split('/')
					if len(chars) > 8:  # TODO: sometimes more?
						raise Exception('unable to parse relationship: {}'.format(r))
					for char in chars:
						fic.add(Character.defineInFandom(ourDoms[0], char, self.ftype))

		return fic

Пример #5

Показать файл

Файл: fanficsMeAdapter.py Проект: FanFicDev/hermes

	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Fill in `fic` from a fanfics.me story page.

		Currently disabled: the unconditional raise below fires before any
		parsing happens, so every call fails with that exception. The rest of
		the body is the parser for the previous page layout and is unreachable
		until the raise is removed.
		"""
		raise Exception('FIXME TODO fanfics me format has changed')
		from bs4 import BeautifulSoup  # type: ignore
		page = BeautifulSoup(wwwHtml, 'html5lib')

		header = page.find('div', {'class': 'FicHead'})
		fic.title = header.find('h1').getText().strip()

		def h2TitleBefore(div):
			# chapter titles live in an <h2> directly preceding the content div
			heading = div.previous_sibling
			if heading is not None and heading.name == 'h2':
				return heading.getText()
			return None

		# the header is a list of "label / value" rows with Russian labels
		harryPotterNames = ('Harry Potter', 'Harry Potter - J. K. Rowling')
		ignoredLabels = ('Персонажи:', 'Предупреждения:')  # characters, warnings
		fandoms: List[str] = []
		author = None
		for row in header.findAll('div', {'class': 'tr'}):
			labelDiv = row.find('div', {'class': 'title'})
			valueDiv = row.find('div', {'class': 'content'})
			label = str(labelDiv.getText()).strip()
			value = str(valueDiv.getText()).strip()

			if label == 'Автор:':  # author
				author = value
				continue
			if label == 'Фандом:':  # fandom
				if value not in harryPotterNames:
					raise Exception('unknown fandom: ' + value)
				fandoms.append('Harry Potter')
				continue
			if label == 'Статус:':  # status
				if value == 'В процессе':  # in progress
					fic.ficStatus = FicStatus.ongoing
				elif value == 'Закончен':  # finished
					fic.ficStatus = FicStatus.complete
				else:
					raise Exception('unknown write status: ' + value)
				continue
			if label == 'Опубликован:':  # published
				fic.published = self.parseRussianDate(value)
				continue
			if label == 'Изменен:':  # modified
				fic.updated = self.parseRussianDate(value)
				continue
			if label == 'Ссылка:':  # link
				src = value  # source archive url
				continue
			if label == 'Читателей:':  # readers
				fic.followCount = int(value)
				continue
			if label == 'Рейтинг:':  # rating
				fic.ageRating = value
				continue
			if label in ignoredLabels:
				# not parsed yet (relationships / warnings)
				continue
			raise Exception('unknown metadata: ' + label)

		# TODO?
		# the author name doubles as both the author url and the author id
		assert (author is not None)
		self.setAuthor(fic, author, author, author)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		if fic.url is None:
			fic.url = self.constructUrl(fic.localId)

		# the summary container has two known class names
		summaryDiv = page.find('div', {'class': 'summary_text'})
		if summaryDiv is None:
			summaryDiv = page.find('div', {'class': 'summary_text_fic3'})
		fic.description = summaryDiv.getText()

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		if fic.followCount is None:
			fic.followCount = 0

		# note: this overwrites any rating read from the header rows above
		fic.ageRating = 'M'

		contentsList = page.find('ul', {'class': 'FicContents'})
		fic.chapterCount = len(contentsList.findAll('li', {'class': 't-b-dotted'}))

		if fic.wordCount is None:
			fic.wordCount = 0
		fic.upsert()

		totalWords = 0
		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			chapter.localChapterId = str(cid)
			chapter.url = self.constructUrl(fic.localId, cid)

			# chapter N is held in the div with id "c<N-1>"
			divAttrs = {'id': 'c{}'.format(cid - 1)}

			# try to get it out of current blob first
			if chapter.html() is None:
				inlineDiv = page.find('div', divAttrs)
				if inlineDiv is not None:
					chapter.setHtml(
						'<div class="ReadContent">' + str(inlineDiv) + '</div>'
					)

			if not chapter.title:
				inlineDiv = page.find('div', divAttrs)
				if inlineDiv is not None:
					foundTitle = h2TitleBefore(inlineDiv)
					if foundTitle is not None:
						chapter.title = foundTitle

			# fallback to scraping it directly
			if chapter.html() is None:
				cdata = scrape.softScrape(chapter.url)
				assert (cdata is not None)
				chapter.setHtml(self.extractContent(fic, cdata))
				chapterPage = BeautifulSoup(cdata, 'html5lib')
				foundTitle = h2TitleBefore(chapterPage.find('div', divAttrs))
				if foundTitle is not None:
					chapter.title = foundTitle

			if chapter.title:
				chapter.title = util.cleanChapterTitle(chapter.title, cid)

			chapter.upsert()
			totalWords += len(chapter.cachedContent().split())

		fic.wordCount = totalWords

		for fandom in fandoms:
			fic.add(Fandom.define(fandom))

		return fic

Пример #6

Показать файл

	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Populate `fic` and its chapter records from a story index page.

		`fic.localId` is split on '/' into an author part and a story part.
		Title, author, summary, status, rating, chapter count and word count
		come from the page header and the `well` div; review count and the
		published/updated dates are aggregated from the per-chapter `<p>`
		entries that are siblings of that div.

		Raises:
			Exception: if the status is anything other than 'In progress', or
				if a chapter link does not have the expected href.
		"""
		from bs4 import BeautifulSoup
		authorLid = fic.localId.split('/')[0]
		storyLid = fic.localId.split('/')[1]

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ageRating = 'M'

		soup = BeautifulSoup(wwwHtml, 'html5lib')

		pageHeader = soup.find('div', {'class': 'page-header'})
		titleH2 = pageHeader.find('h2')
		fic.title = titleH2.getText().strip()

		authorLink = pageHeader.find('a')
		author = authorLink.getText().strip()
		authorId = authorLid
		authorUrl = self.baseStoryUrl.format(authorLid, 'contact/')
		self.setAuthor(fic, author, authorUrl, authorId)

		divWell = soup.find('div', {'class': 'well'})

		summaryQuote = divWell.find('blockquote')

		# flatten the summary to a single line with single spaces
		fic.description = str(
			summaryQuote.getText()
		).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
		while fic.description.find('  ') != -1:
			fic.description = fic.description.replace('  ', ' ')
		fic.description = fic.description.strip()

		divWellText = divWell.getText().strip()

		# only the 'In progress' status is recognised; anything else is an error
		match = re.search(r'Status:\s*([^-]*) -', divWellText)
		if match is not None and match.group(1) == 'In progress':
			fic.ficStatus = FicStatus.ongoing
		else:
			raise Exception('unable to find fic status')

		RegexMatcher(
			divWellText, {
				'ageRating': (r'Rating\s*:\s+([^-]+) -', str),
				'chapterCount': (r'Chapters\s*:\s+(\d+) -', int),
				'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
			}
		).matchAll(fic)
		assert (fic.chapterCount is not None)

		# The word count is matched as text because it may carry thousands
		# separators. Always normalise it to an int, not only when a comma is
		# present, so small counts do not stay a str.
		if fic.wordCount is not None:
			fic.wordCount = int(str(fic.wordCount).replace(',', ''))

		wellParent = divWell.parent
		cid = 0
		reviewCount = 0
		chapterDates: List[int] = []

		for child in wellParent.children:
			if child.name != 'p':
				continue
			cid += 1
			if 'Chapter {}'.format(cid) not in str(child):
				continue
			chapterLink = child.find('a')
			expectedUrl = '/{}/Chapter_{}/'.format(storyLid, cid).lower()
			if chapterLink.get('href').lower() != expectedUrl:
				raise Exception('unexpected chapter url: ' + chapterLink.get('href'))

			chInfo = ChapterInfo()

			# wordCount stays in the pattern set so it is still matched for every
			# chapter entry, but its value is not used: the story total comes
			# from the overall metadata above.
			RegexMatcher(
				child.getText(), {
					'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
					'reviewCount': (r'Reviews\s*:\s+([^-]+) -', int),
					'updated': (r'Uploaded on\s*:\s+(.+)', str),
				}
			).matchAll(chInfo)
			assert (chInfo.updated is not None)

			reviewCount += chInfo.reviewCount

			dt = (util.parseDateAsUnix(chInfo.updated, int(time.time())))
			chapterDates += [dt]

		# wordCount is already set from overall metadata
		fic.reviewCount = reviewCount

		# earliest chapter upload is the publish date, latest is the update date
		fic.published = OilTimestamp(min(chapterDates))
		fic.updated = OilTimestamp(max(chapterDates))

		fic.upsert()
		for cid in range(1, fic.chapterCount + 1):
			ch = fic.chapter(cid)
			ch.localChapterId = 'Chapter_{}'.format(cid)
			ch.url = self.constructUrl(fic.localId, cid)
			ch.upsert()

		return fic

Пример #7

Показать файл

Файл: siyeAdapter.py Проект: FanFicDev/hermes

    def parseInfoInto(self, fic: Fic, html: str) -> Fic:
        """Populate `fic` from a SIYE story page.

        Title, author, summary and rating are pulled from the markup; the
        chapter count is the highest chapter number linked from the page;
        published/updated are derived from the "updated on" dates found in the
        page; the word count is the sum of all "N words" figures, falling back
        to counting the words of chapter 1 for single-chapter stories.

        Raises:
            Exception: if the page does not contain exactly three 95%-width
                tables, or the author link or summary cannot be located.
        """
        from bs4 import BeautifulSoup
        html = html.replace('\r\n', '\n')
        soup = BeautifulSoup(html, 'html.parser')

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        w95tables = soup.findAll('table', {'width': '95%'})
        if len(w95tables) != 3:
            raise Exception('wrong number of w95 tables: {}'.format(
                len(w95tables)))

        ficInfoTable = w95tables[0]
        ficTitleH3 = ficInfoTable.find('h3')
        fic.title = ficTitleH3.get_text().strip()

        authorUrlMatch = re.search(r'"viewuser.php\?uid=(\d+)">([^<]*)<', html)
        if authorUrlMatch is None:
            raise Exception('could not locate author url')

        author = authorUrlMatch.group(2)
        authorId = authorUrlMatch.group(1)
        authorUrl = self.baseUrl + '/viewuser.php?uid=' + authorId

        self.setAuthor(fic, author, authorUrl, authorId)

        # TODO: this may miss multiline summaries :(
        summaryMatch = re.search(
            r'<b>Summary:</b>((.|\r|\n)*)<b>Hitcount: </b>', html,
            re.MULTILINE)
        if summaryMatch is None:
            edumpContent(html, 'siye_summary')
            raise Exception('could not locate summary')
        # alternatively: fic.description = "{no summary}" ?

        fic.description = summaryMatch.group(1).strip()

        # placeholder used when the page has no rating line
        fic.ageRating = '<unkown>'

        ageRatingMatch = re.search(r'<b>Rating:</b>(.*)<br>', html)
        if ageRatingMatch is not None:
            fic.ageRating = ageRatingMatch.group(1).strip()

        # chapter count is the highest chapter number linked from the page;
        # single-chapter stories link to "chapter=Array" instead of a number
        maxChapter = 0
        baseChapterHref = 'viewstory.php?sid={}&chapter='.format(fic.localId)
        singleChapterHref = 'viewstory.php?sid={}&chapter=Array'.format(
            fic.localId)
        isSingleChapterFic = False
        for a in soup.find_all('a'):
            href = a.get('href')
            if href is None:
                continue
            if not href.startswith(baseChapterHref):
                continue
            if href.startswith(singleChapterHref):
                isSingleChapterFic = True
                maxChapter = max(1, maxChapter)
                continue
            cid = int(href[len(baseChapterHref):])
            maxChapter = max(cid, maxChapter)

        fic.chapterCount = maxChapter

        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        # str.find returns -1 (truthy) when the marker is absent, so it must
        # not be used as a boolean; use a containment test instead.
        fic.ficStatus = FicStatus.ongoing
        if 'Story is Complete' in html:
            fic.ficStatus = FicStatus.complete

        updatedOnPattern = re.compile(r'updated on (\d+).(\d+).(\d+)')
        # seed min with "now" and max with the epoch so any date found on the
        # page replaces them
        minUpdate = util.parseDateAsUnix(int(time.time()), fic.fetched)
        maxUpdate = util.parseDateAsUnix('1970/01/01', fic.fetched)
        for (year, month, day) in re.findall(updatedOnPattern, html):
            date = '{}/{}/{}'.format(year, month, day)
            dt = util.parseDateAsUnix(date, fic.fetched)

            minUpdate = min(minUpdate, dt)
            maxUpdate = max(maxUpdate, dt)

        # only ever widen the stored published/updated range
        if fic.published is None or fic.published.toUTS() > minUpdate:
            fic.published = OilTimestamp(minUpdate)
        if fic.updated is None or fic.updated.toUTS() < maxUpdate:
            fic.updated = OilTimestamp(maxUpdate)
        if fic.updated < fic.published:
            fic.updated = fic.published

        fic.wordCount = 0
        wordsPattern = re.compile(r'(\d+) words')
        for words in re.findall(wordsPattern, html):
            fic.wordCount += int(words)

        if fic.wordCount == 0 and isSingleChapterFic:
            # best effort: count the words of the only chapter ourselves; a
            # failure here just leaves the word count at 0
            try:
                fic.upsert()
                ch1 = fic.chapter(1)
                ch1.cache()
                chtml = ch1.html()
                if chtml is not None:
                    fic.wordCount = len(chtml.split())
            except Exception:
                pass

        fic.add(Fandom.define('Harry Potter'))
        # TODO: chars/relationship?

        return fic

Пример #8

Показать файл

Файл: royalroadlAdapter.py Проект: FanFicDev/hermes

	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Populate `fic` and its chapters from a Royal Road fiction page.

		Reads title, author, description, status and follower/favorite counts
		from the page, builds the chapter list from the `chapters` table, then
		caches every chapter that has no html yet and sets the word count to
		the number of whitespace-separated tokens across all chapter html.

		Raises:
			Exception: if none of the known status markers is present.
		"""
		from bs4 import BeautifulSoup
		soup = BeautifulSoup(wwwHtml, 'html5lib')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ageRating = 'M'  # TODO?

		ficTitleDiv = soup.find('div', {'class': 'fic-title'})
		fic.title = ficTitleDiv.find('h1').getText().strip()

		authorLink = ficTitleDiv.find('h4', {'property': 'author'}).find('a')
		author = authorLink.getText().strip()
		authorUrl = self.baseUrl + authorLink.get('href')
		authorId = authorUrl.split('/')[-1]
		self.setAuthor(fic, author, authorUrl, authorId)

		divDescription = soup.find('div', {'class': 'description'})
		try:
			descView = HtmlView(str(divDescription), markdown=False)
			desc = ''.join(['<p>{}</p>'.format(line) for line in descView.text])
			fic.description = desc
		except Exception:
			# fall back to the plain text if the description cannot be rendered
			fic.description = divDescription.getText().strip()

		fictionInfo = str(soup.find('div', {'class': 'fiction-info'}))
		if fictionInfo.find('>ONGOING<') != -1:
			fic.ficStatus = FicStatus.ongoing
		elif fictionInfo.find('>COMPLETED<') != -1:
			fic.ficStatus = FicStatus.complete
		elif fictionInfo.find('>HIATUS<') != -1:
			fic.ficStatus = FicStatus.ongoing  # TODO?
		elif fictionInfo.find('>STUB<') != -1:
			fic.ficStatus = FicStatus.ongoing  # TODO?
		elif fictionInfo.find('>DROPPED<') != -1:
			fic.ficStatus = FicStatus.abandoned
		else:
			raise Exception('unable to find fic status')

		divStatsContent = soup.find('div', {'class': 'stats-content'})
		followers = divStatsContent.find(text='Followers :')
		ul = followers.parent.parent

		RegexMatcher(
			ul.getText(), {
				'followCount?': (r'Followers\s+:\s+([\d,]+)', str),
				'favoriteCount?': (r'Favorites\s+:\s+([\d,]+)', str),
			}
		).matchAll(fic)

		# The counts are matched as text because they may contain thousands
		# separators; normalise them to int unconditionally. The result of
		# str.find must not be used as a boolean, since -1 is truthy.
		fic.followCount = int(str(fic.followCount).replace(',', ''))
		fic.favoriteCount = int(str(fic.favoriteCount).replace(',', ''))

		tableChapters = soup.find('table', {'id': 'chapters'})
		chapterLinks = tableChapters.findAll('a')

		# links without a <time> element give the chapter url and title
		chapterUrls: List[str] = []
		chapterTitles: List[str] = []
		for chapterLink in chapterLinks:
			# TODO FIXME is this inverted?
			if chapterLink.find('time') is not None:
				continue
			chapterUrls += [chapterLink.get('href')]
			chapterTitles += [chapterLink.getText().strip()]

		# links wrapping a <time> element give the chapter date
		chapterDates: List[int] = []
		for chapterLink in chapterLinks:
			timeElement = chapterLink.find('time')
			if timeElement is None:
				continue
			if timeElement.get('unixtime'):
				chapterDates += [int(timeElement.get('unixtime'))]
			else:
				chapterDates += [
					util.parseDateAsUnix(timeElement.get('title'), fic.fetched)
				]

		fic.published = OilTimestamp(min(chapterDates))
		fic.updated = OilTimestamp(max(chapterDates))
		fic.chapterCount = len(chapterUrls)

		if fic.wordCount is None:
			fic.wordCount = 0
		fic.upsert()

		for cid in range(1, fic.chapterCount + 1):
			chapterUrl = chapterUrls[cid - 1]
			chapter = fic.chapter(cid)
			chapter.url = self.baseUrl + chapterUrl
			if chapterUrl.startswith('/fiction/chapter/'):
				# alternate chapter syntax if the chapter itself has no slug
				# /fiction/chapter/<lcid>fid=<lid>&fslug=<fic slug>
				chapter.localChapterId = chapterUrl.split('/')[3].split('?')[0]
			else:
				# standard chapter syntax
				# /fiction/<lid>/<fic slug>/chapter/<lcid>/<chapter slug>
				chapter.localChapterId = chapterUrl.split('/')[5]
			chapter.title = chapterTitles[cid - 1]

			if chapter.title is not None and len(chapter.title) > 0:
				chapter.title = util.cleanChapterTitle(chapter.title, cid)

			chapter.upsert()

		# second pass: make sure each chapter is cached, then count its words
		wordCount = 0
		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			if chapter.html() is None:
				chapter.cache()

			chapter.upsert()
			chtml = chapter.html()
			if chtml is not None:
				wordCount += len(chtml.split())

		fic.wordCount = wordCount

		return fic

Пример #9

Показать файл

    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Populate `fic` and its chapters from an adult-fanfiction story page.

        `fic.localId` is split on '/' into an archive part and a story number.
        Title, author and the chapter list come from the story page; every
        chapter is scraped to compute the word count. Dates, review count,
        status, description, tags and fandoms are then taken from the site's
        search results, first searching by author and title and then by
        author only.

        Raises:
            Exception: if the story is not present in either search result.
        """
        from urllib.parse import quote_plus

        from bs4 import BeautifulSoup
        archive = fic.localId.split('/')[0]
        storyNo = fic.localId.split('/')[1]

        soup = BeautifulSoup(wwwHtml, 'html5lib')

        # the title is the text of the link back to the story itself
        titleLink = soup.find('a', {'href': '/story.php?no={}'.format(storyNo)})
        fic.title = str(titleLink.getText())

        membersUrl = 'http://members.adult-fanfiction.org/profile.php?no='
        memberLink = soup.find(
            lambda t: (t.name == 'a' and t.has_attr("href") and t.get("href")
                       is not None and (t.get("href").startswith(membersUrl))))

        author = memberLink.getText()
        authorId = memberLink.get('href')[len(membersUrl):]
        authorUrl = memberLink.get('href')
        self.setAuthor(fic, author, authorUrl, authorId)

        # TODO
        fic.ficStatus = FicStatus.ongoing

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        fic.url = self.constructUrl(fic.localId, 1)

        # TODO: description is on search page
        if fic.description is None:
            fic.description = 'TODO: on the search page?'

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        fic.ageRating = 'M'

        # TODO
        if fic.published is None:
            fic.published = OilTimestamp.now()
        if fic.updated is None:
            fic.updated = fic.published

        chapterDropdown = soup.find('div', {'class': 'dropdown-content'})
        chapterLinks = chapterDropdown.findAll('a')
        oldChapterCount = fic.chapterCount
        fic.chapterCount = len(chapterLinks)

        if fic.wordCount is None:
            fic.wordCount = 0
        fic.upsert()

        wordCount = 0
        for cid in range(1, fic.chapterCount + 1):
            chapterUrl = self.constructUrl(fic.localId, cid)
            chapterContent = scrape.softScrape(chapterUrl)
            chapter = fic.chapter(cid)
            if chapterContent is not None:
                chapter.setHtml(chapterContent)
            chapter.localChapterId = str(cid)
            chapter.url = chapterUrl

            # strip() always yields a str, so the title can be cleaned directly
            chapter.title = util.cleanChapterTitle(
                chapterLinks[cid - 1].getText().strip(), cid)

            chapter.upsert()
            if chapterContent is not None:
                wordCount += len(chapterContent.split())

        fic.wordCount = wordCount

        if oldChapterCount is not None and oldChapterCount < fic.chapterCount:
            fic.updated = OilTimestamp.now()  # TODO
        fic.upsert()

        storyUrl = self.constructUrl(fic.localId, chapterId=None)

        # Author and title are free text from the page: encode them so that
        # characters such as '&', '#', '+' or '%' cannot corrupt the query
        # string. quote_plus still turns spaces into '+'.
        authorParam = quote_plus(author)
        titleParam = quote_plus(fic.title)

        # more metadata from search page
        searchUrl = ('http://{}.adult-fanfiction.org/search.php?' +
                     'auth={}&title={}&summary=&tags=&cats=0&search=Search')
        searchUrl = searchUrl.format(archive, authorParam, titleParam)
        data = scrape.scrape(searchUrl)['raw']

        metas = self.extractSearchMetadata(data)

        # fallback to pure author search
        if storyUrl not in metas:
            searchUrl = ('http://{}.adult-fanfiction.org/search.php?' +
                         'auth={}&title=&summary=&tags=&cats=0&search=Search')
            searchUrl = searchUrl.format(archive, authorParam)
            data = scrape.scrape(searchUrl)['raw']
            metas = self.extractSearchMetadata(data)

        if storyUrl not in metas:
            raise Exception('cannot find search metadata')

        meta = metas[storyUrl]

        assert (meta.published is not None and meta.updated is not None)
        fic.published = OilTimestamp(meta.published)
        fic.updated = OilTimestamp(meta.updated)

        fic.reviewCount = meta.reviewCount
        fic.favoriteCount = meta.views  # TODO

        fic.ficStatus = meta.ficStatus

        assert (meta.description is not None)
        fic.description = meta.description
        assert (fic.description is not None)
        if len(meta.tags) > 0:
            fic.description += '\n<hr />\nContent Tags: ' + ' '.join(meta.tags)

        for fan in meta.fandoms:
            fic.add(Fandom.define(fan))

        return fic