Пример #1
0
	def createFromZList(self, fic: Fic, ts: int, data: str) -> Fic:
		"""Populate fic from z-list data, persist it, and return the stored row."""
		fic.url = self.constructUrl(fic.localId, 1)
		parsed = self.parseZListInfoInto(fic, ts, data)
		parsed.upsert()
		# re-read from storage so the caller gets the canonical row
		return Fic.lookup((parsed.id, ))
Пример #2
0
	def getCurrentInfo(self, fic: Fic) -> Fic:
		"""Scrape the live story page (adult view forced on) and parse it into fic."""
		fic.url = self.baseUrl + str(fic.localId)
		# drop any existing query string, then disable the adult interstitial
		adultUrl = fic.url.split('?')[0] + '?view_adult=true'
		data = scrape.scrape(adultUrl)
		return self.parseInfoInto(fic, data['raw'])
Пример #3
0
	def create(self, fic: Fic) -> Fic:
		"""Create fic from a soft scrape of its story page and return the stored row."""
		fic.url = self.constructUrl(fic.localId)
		raw = scrape.softScrape(fic.url)
		if raw is None:
			raise Exception('unable to scrape? FIXME')
		parsed = self.parseInfoInto(fic, raw)
		parsed.upsert()
		return Fic.lookup((parsed.id, ))
Пример #4
0
    def getCurrentInfo(self, fic: Fic) -> Fic:
        """Re-scrape the table of contents page and refresh fic's metadata."""
        fic.url = self.constructUrl(fic.localId)
        tocUrl = self.tocUrl
        data = scrape.scrape(tocUrl)
        # dump the raw page for debugging/audit purposes
        edumpContent(
            '<!-- {} -->\n{}'.format(tocUrl, data['raw']), 'wavesarisen_ec')
        updated = self.parseInfoInto(fic, data['raw'])
        updated.upsert()
        return Fic.lookup((updated.id, ))
Пример #5
0
	def create(self, fic: Fic) -> Fic:
		"""Scrape a fresh copy of the story page, persist its metadata, return it."""
		fic.url = self.constructUrl(fic.localId)
		data = scrape.scrape(fic.url)
		# dump the raw page for debugging/audit purposes
		edumpContent(data['raw'], 'sugarquill')
		parsed = self.parseInfoInto(fic, data['raw'])
		parsed.upsert()
		return Fic.lookup((parsed.id, ))
Пример #6
0
    def create(self, fic: Fic) -> Fic:
        """Scrape fresh story info (rate-limited) and persist it."""
        fic.url = self.constructUrl(fic.localId)
        data = scrape.scrape(fic.url)
        # be polite to the site: pause between requests
        time.sleep(self.baseDelay)
        # dump the raw page for debugging/audit purposes
        edumpContent(data['raw'], 'hpffa')
        parsed = self.parseInfoInto(fic, data['raw'])
        parsed.upsert()
        return Fic.lookup((parsed.id, ))
Пример #7
0
    def create(self, fic: Fic) -> Fic:
        """Create a fic record from a freshly scraped forum thread page."""
        # TODO: should we try to get the actual url here, including the url safe
        # version of the title before the lid? Needs done elsewhere in this
        # adapter as well
        fic.url = self.baseUrl + 'threads/' + str(fic.localId)
        raw = self.scrapeLike(fic.url)
        parsed = self.parseInfoInto(fic, raw)
        parsed.upsert()
        return Fic.lookup((parsed.id, ))
Пример #8
0
    def create(self, fic: Fic) -> Fic:
        """Create a fic and its first chapter from a single scrape of page 1."""
        fic.url = self.constructUrl(fic.localId, 1)
        raw = scrape.scrape(fic.url)['raw']
        fic = self.parseInfoInto(fic, raw)
        # brand-new record: insert (not upsert) as in the original flow
        fic.insert()
        firstChapter = fic.chapter(1)
        firstChapter.setHtml(raw)
        firstChapter.upsert()
        return Fic.lookup((fic.id, ))
Пример #9
0
	def getCurrentInfo(self, fic: Fic) -> Fic:
		"""Scrape the current story page and parse fresh metadata into fic."""
		# FIXME when fics are deleted they 404:
		# https://www.royalroad.com/fiction/38947/
		# 404
		# Page Not Found
		# The server has returned the following error:
		# This fiction has been deleted
		fic.url = self.constructUrl(fic.localId)
		res = self.scrape(fic.url)
		if 'raw' not in res:
			raise Exception('unable to scrape? FIXME')
		return self.parseInfoInto(fic, res['raw'])
Пример #10
0
	def create(self, fic: Fic) -> Fic:
		"""Scrape an AO3 work (adult view forced) and persist fic plus chapter 1."""
		fic.url = self.baseUrl + str(fic.localId)
		# drop any existing query string, then disable the adult interstitial
		adultUrl = fic.url.split('?')[0] + '?view_adult=true'
		raw = scrape.scrape(adultUrl)['raw']
		# dump the raw page for debugging/audit purposes
		edumpContent(raw, 'ao3')
		fic = self.parseInfoInto(fic, raw)
		fic.upsert()
		firstChapter = fic.chapter(1)
		firstChapter.setHtml(raw)
		firstChapter.upsert()
		return Fic.lookup((fic.id, ))
Пример #11
0
    def create(self, fic: Fic) -> Fic:
        """Soft-scrape chapter 1's page, persisting both the fic and chapter 1."""
        fic.url = self.constructUrl(fic.localId, 1)
        raw = scrape.softScrape(fic.url)
        if raw is None:
            raise Exception('unable to scrape? FIXME')
        fic = self.parseInfoInto(fic, raw)
        fic.upsert()
        firstChapter = fic.chapter(1)
        firstChapter.setHtml(raw)
        firstChapter.localChapterId = str(1)
        firstChapter.url = self.constructUrl(fic.localId, 1)
        firstChapter.upsert()
        return Fic.lookup((fic.id, ))
Пример #12
0
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Parse a fictionhunt story page into fic and return it.

        Extracts title, counts, dates, status, and author from the page's
        details div; raises if the expected markup is missing.
        """
        from bs4 import BeautifulSoup  # type: ignore
        soup = BeautifulSoup(wwwHtml, 'html.parser')
        divDetails = soup.find_all('div', {'class': 'details'})
        if len(divDetails) != 1:
            raise Exception('error: unable to find details\n')
        else:
            divDetails = divDetails[0]

        text = divDetails.get_text()
        pt_str = str(divDetails)

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        divTitle = soup.find_all('div', {'class': 'title'})
        if len(divTitle) == 1:
            fic.title = divTitle[0].get_text().strip()
        else:
            raise Exception(
                'error: unable to find title:\n{}\n'.format(pt_str))

        fic.url = self.constructUrl(fic.localId, 1)

        # TODO: this may not exist on fictionhunt?
        fic.description = 'archive of {} from fictionhunt TODO'.format(
            fic.title)

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        # raw strings: \s/\S/\d are regex escapes, not string escapes
        matcher = RegexMatcher(
            text, {
                'ageRating': (r'Rated:\s+(\S+)', str),
                'chapterCount?': (r'Chapters:\s+(\d+)', int),
                'wordCount': (r'Words:\s+(\S+)', int),
                'reviewCount?': (r'Reviews:\s+(\S+)', int),
                'favoriteCount?': (r'Favs:\s+(\S+)', int),
                'followCount?': (r'Follows:\s+(\S+)', int),
                'updated?': (r'Updated:\s+(\S+)', str),
                'published': (r'Published:\s+(\S+)', str),
            })
        matcher.matchAll(fic)

        # convert scraped date strings into timestamps
        if fic.published is not None:
            publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
            fic.published = OilTimestamp(publishedUts)

        if fic.updated is None:
            # never-updated fics fall back to their publish date
            fic.updated = fic.published
        else:
            updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
            fic.updated = OilTimestamp(updatedUts)

        if fic.chapterCount is None:
            fic.chapterCount = 1

        match = re.search('- Complete -', text)
        if match is None:
            fic.ficStatus = FicStatus.ongoing
        else:
            fic.ficStatus = FicStatus.complete

        # author is the first link pointing back at a fanfiction.net profile
        for a in divDetails.find_all('a'):
            a_href = a.get('href')
            if a_href.find('fanfiction.net/u/') != -1:
                author = a.get_text()
                authorUrl = a_href
                authorId = a_href.split('/')[-1]
                self.setAuthor(fic, author, authorUrl, authorId)
                break
        else:
            raise Exception('unable to find author:\n{}'.format(text))

        # TODO: hardcode Harry Potter fanfic?

        return fic
Пример #13
0
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Parse a story page (storymaininfo-style markup) into fic.

        Extracts title, description, counts, dates, status, and author;
        raises if any required element cannot be found.
        """
        from bs4 import BeautifulSoup  # type: ignore
        soup = BeautifulSoup(wwwHtml, 'html.parser')
        storyMainInfo = soup.findAll('table', {'class': 'storymaininfo'})
        if len(storyMainInfo) != 1:
            raise Exception('unable to find main story info')
        storyMainInfo = storyMainInfo[0]

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        # the title link is either the adult-content confirmation javascript
        # or a plain ?psid= self-link
        disclaimerJs = "javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid="
        for a in soup.findAll('a'):
            href = a.get('href')
            if (not href.startswith(disclaimerJs)
                    and href != '?psid={}'.format(fic.localId)):
                continue
            fic.title = a.getText()
            break
        else:
            raise Exception('error: unable to find title')

        fic.url = self.constructUrl(fic.localId)

        storySummaryTable = soup.findAll('table', {'class': 'storysummary'})
        if len(storySummaryTable) != 1:
            raise Exception('cannot find story summary table')
        storySummaryTable = storySummaryTable[0]
        fic.description = (storySummaryTable.getText().strip())
        if fic.description is None:
            raise Exception('error: unable to find description')

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        # &nbsp; renders as \xa0; normalize so the regexes below match
        text = storyMainInfo.getText().replace('\xa0', ' ')
        # raw strings: \s/\d/\+ are regex escapes, not string escapes
        matcher = RegexMatcher(
            text, {
                'ageRating': (r'Rating:\s+(Mature|15\+|12\+)', str),
                'chapterCount': (r'Chapters:\s+(\d+)', int),
                'wordCount': (r'Words:\s+(\d+)', int),
                'reviewCount': (r'Story Reviews:\s*(\d+)', int),
                'favoriteCount': (r'Favorite Story Of:\s+(\d+) users', int),
                'updated': (r'Last Updated:\s+(\S+)', str),
                'published': (r'First Published:\s+(\S+)', str),
            })
        matcher.matchAll(fic)

        # convert scraped date strings into timestamps
        if fic.published is not None:
            publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
            fic.published = OilTimestamp(publishedUts)

        if fic.updated is None:
            # never-updated fics fall back to their publish date
            fic.updated = fic.published
        else:
            updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
            fic.updated = OilTimestamp(updatedUts)

        if fic.chapterCount is None:
            fic.chapterCount = 1

        match = re.search(r'Status:\s+(Completed|Work In Progress|Abandoned)',
                          text)
        if match is None:
            raise Exception('cannot find write status')

        status = match.group(1)
        if status == 'Completed':
            fic.ficStatus = FicStatus.complete
        elif status == 'Work In Progress':
            fic.ficStatus = FicStatus.ongoing  # should these be abandoned?
        elif status == 'Abandoned':
            fic.ficStatus = FicStatus.abandoned
        else:
            raise Exception('unknown status: {}'.format(status))

        # author is the first viewuser.php link on the page
        for a in soup.findAll('a'):
            a_href = a.get('href')
            if a_href.startswith('viewuser.php?showuid='):
                author = a.get_text()
                authorUrl = self.baseUrl + '/' + a_href
                authorId = a_href[len('viewuser.php?showuid='):]
                self.setAuthor(fic, author, authorUrl, authorId)
                break
        else:
            raise Exception('unable to find author:\n{}'.format(text))

        # TODO: chars/pairings?
        fic.add(Fandom.define('Harry Potter'))
        return fic
Пример #14
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Parse a fanfics.me story page into fic.

		NOTE: currently disabled -- the unconditional raise on the first line
		makes everything below dead code until the parser is updated for the
		site's changed markup.
		"""
		raise Exception('FIXME TODO fanfics me format has changed')
		from bs4 import BeautifulSoup  # type: ignore
		soup = BeautifulSoup(wwwHtml, 'html5lib')

		ficHead = soup.find('div', {'class': 'FicHead'})

		titleH1 = ficHead.find('h1')
		fic.title = titleH1.getText().strip()

		fandoms: List[str] = []
		trs = ficHead.findAll('div', {'class': 'tr'})
		author = None
		# each .tr row is a (label, value) metadata pair; labels are the
		# site's Russian strings (translated in comments below)
		for tr in trs:
			divTitle = tr.find('div', {'class': 'title'})
			divContent = tr.find('div', {'class': 'content'})

			t = str(divTitle.getText()).strip()
			v = str(divContent.getText()).strip()

			if t == 'Автор:':  # "Author:"
				author = v
			elif t == 'Фандом:':  # "Fandom:"
				if v == 'Harry Potter' or v == 'Harry Potter - J. K. Rowling':
					fandoms += ['Harry Potter']
				else:
					raise Exception('unknown fandom: ' + v)
			elif t == 'Статус:':  # "Status:"
				if v == 'В процессе':  # "in progress"
					fic.ficStatus = FicStatus.ongoing
				elif v == 'Закончен':  # "finished"
					fic.ficStatus = FicStatus.complete
				else:
					raise Exception('unknown write status: ' + v)
			elif t == 'Опубликован:':  # "Published:"
				fic.published = self.parseRussianDate(v)
			elif t == 'Изменен:':  # "Updated:"
				fic.updated = self.parseRussianDate(v)
			elif t == 'Ссылка:':  # "Link:"
				src = v  # source archive url; NOTE(review): currently unused
			elif t == 'Читателей:':  # "Readers:"
				fic.followCount = int(v)
			elif t == 'Персонажи:':  # "Characters:"
				# characters, parse relationship?
				pass
			elif t == 'Рейтинг:':  # "Rating:"
				fic.ageRating = v
			elif t == 'Предупреждения:':  # "Warnings:"
				# warnings?
				pass
			else:
				raise Exception('unknown metadata: ' + t)

		# TODO?
		assert (author is not None)
		# no separate author url/id on this site; reuse the name for both
		authorUrl = author
		authorId = author
		self.setAuthor(fic, author, authorUrl, authorId)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		if fic.url is None:
			fic.url = self.constructUrl(fic.localId)

		summaryTextDiv = soup.find('div', {'class': 'summary_text'})
		if summaryTextDiv is None:
			summaryTextDiv = soup.find('div', {'class': 'summary_text_fic3'})
		fic.description = summaryTextDiv.getText()

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		if fic.followCount is None:
			fic.followCount = 0

		# NOTE(review): overwrites any rating scraped above -- confirm intended
		fic.ageRating = 'M'

		ficContentsUl = soup.find('ul', {'class': 'FicContents'})
		chapterLinks = ficContentsUl.findAll('li', {'class': 't-b-dotted'})
		fic.chapterCount = len(chapterLinks)

		if fic.wordCount is None:
			fic.wordCount = 0
		fic.upsert()

		wordCount = 0
		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			chapter.localChapterId = str(cid)
			chapter.url = self.constructUrl(fic.localId, cid)

			# try to get it out of current blob first
			if chapter.html() is None:
				contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
				if contentDiv is not None:
					chapter.setHtml(
						'<div class="ReadContent">' + str(contentDiv) + '</div>'
					)

			# chapter titles live in an h2 immediately before the content div
			if chapter.title is None or len(chapter.title) < 1:
				contentDiv = soup.find('div', {'id': 'c{}'.format(cid - 1)})
				if contentDiv is not None:
					chapterTitle = contentDiv.previous_sibling
					if chapterTitle is not None and chapterTitle.name == 'h2':
						chapter.title = chapterTitle.getText()

			# fallback to scraping it directly
			if chapter.html() is None:
				cdata = scrape.softScrape(chapter.url)
				assert (cdata is not None)
				chapter.setHtml(self.extractContent(fic, cdata))
				csoup = BeautifulSoup(cdata, 'html5lib')
				contentDiv = csoup.find('div', {'id': 'c{}'.format(cid - 1)})
				chapterTitle = contentDiv.previous_sibling
				if chapterTitle is not None and chapterTitle.name == 'h2':
					chapter.title = chapterTitle.getText()

			if chapter.title is not None and len(chapter.title) > 0:
				chapter.title = util.cleanChapterTitle(chapter.title, cid)

			chapter.upsert()
			wordCount += len(chapter.cachedContent().split())

		fic.wordCount = wordCount

		for fandom in fandoms:
			fic.add(Fandom.define(fandom))

		return fic
Пример #15
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Parse a story page (author/story compound localId) into fic.

		Walks the per-chapter paragraphs under the summary well to accumulate
		review counts and derive published/updated from chapter upload dates.
		"""
		from bs4 import BeautifulSoup
		# localId is "<authorLid>/<storyLid>"
		authorLid = fic.localId.split('/')[0]
		storyLid = fic.localId.split('/')[1]

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ageRating = 'M'

		soup = BeautifulSoup(wwwHtml, 'html5lib')

		pageHeader = soup.find('div', {'class': 'page-header'})
		titleH2 = pageHeader.find('h2')
		fic.title = titleH2.getText().strip()

		authorLink = pageHeader.find('a')
		author = authorLink.getText().strip()
		authorId = authorLid
		authorUrl = self.baseStoryUrl.format(authorLid, 'contact/')
		self.setAuthor(fic, author, authorUrl, authorId)

		divWell = soup.find('div', {'class': 'well'})

		summaryQuote = divWell.find('blockquote')

		# flatten whitespace in the summary down to single spaces
		fic.description = str(
			summaryQuote.getText()
		).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
		while fic.description.find('  ') != -1:
			fic.description = fic.description.replace('  ', ' ')
		fic.description = fic.description.strip()

		divWellText = divWell.getText().strip()

		# NOTE(review): any status other than 'In progress' (e.g. a completed
		# story) raises here -- confirm whether that is intended
		match = re.search('Status:\s*([^-]*) -', divWellText)
		if match is not None and match.group(1) == 'In progress':
			fic.ficStatus = FicStatus.ongoing
		else:
			raise Exception('unable to find fic status')

		RegexMatcher(
			divWellText, {
				'ageRating': ('Rating\s*:\s+([^-]+) -', str),
				'chapterCount': ('Chapters\s*:\s+(\d+) -', int),
				'wordCount': ('Word count\s*:\s+([\d,]+) -', str),
			}
		).matchAll(fic)
		assert (fic.chapterCount is not None)

		# word count is scraped as a string since it may contain commas
		if str(fic.wordCount).find(',') != -1:
			fic.wordCount = int(str(fic.wordCount).replace(',', ''))

		wellParent = divWell.parent
		cid = 0
		wordCount = 0
		reviewCount = 0
		chapterDates: List[int] = []

		# each chapter is a <p> sibling of the well containing "Chapter {n}"
		for child in wellParent.children:
			if child.name != 'p': continue
			cid += 1
			if str(child).find('Chapter {}'.format(cid)) == -1:
				continue
			chapterLink = child.find('a')
			expectedUrl = '/{}/Chapter_{}/'.format(storyLid, cid).lower()
			if chapterLink.get('href').lower() != expectedUrl:
				raise Exception('unexpected chapter url: ' + chapterLink.get('href'))

			chInfo = ChapterInfo()

			RegexMatcher(
				child.getText(), {
					'wordCount': ('Word count\s*:\s+([\d,]+) -', str),
					'reviewCount': ('Reviews\s*:\s+([^-]+) -', int),
					'updated': ('Uploaded on\s*:\s+(.+)', str),
				}
			).matchAll(chInfo)
			assert (chInfo.updated is not None)

			if str(chInfo.wordCount).find(',') != -1:
				chInfo.wordCount = int(str(chInfo.wordCount).replace(',', ''))

			wordCount += chInfo.wordCount
			reviewCount += chInfo.reviewCount

			dt = (util.parseDateAsUnix(chInfo.updated, int(time.time())))
			chapterDates += [dt]

		# wordCount is already set from overall metadata
		fic.reviewCount = reviewCount

		# oldest chapter date = published, newest = updated
		fic.published = OilTimestamp(min(chapterDates))
		fic.updated = OilTimestamp(max(chapterDates))

		fic.upsert()
		for cid in range(1, fic.chapterCount + 1):
			ch = fic.chapter(cid)
			ch.localChapterId = 'Chapter_{}'.format(cid)
			ch.url = self.constructUrl(fic.localId, cid)
			ch.upsert()

		return fic
Пример #16
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Parse a fanfiction.net story page into fic.

		Handles deleted stories, extracts title/description/counts/status and
		author from profile_top, resolves fandoms from pre_story_links, sets up
		chapter rows, and finally reparses the meta span for extra metadata.
		"""
		from bs4 import BeautifulSoup  # type: ignore
		deletedFicTexts = [
			# probably deleted by user
			'Story Not FoundUnable to locate story. Code 1.',
			# probably deleted by admin
			'Story Not FoundUnable to locate story. Code 2.',
			# unknown
			'Story Not FoundStory is unavailable for reading. (A)',
		]
		soup = BeautifulSoup(wwwHtml, 'html5lib')
		profile_top = soup.find(id='profile_top')
		# story might've been deleted
		if profile_top is None:
			gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
			for gui_warning in gui_warnings:
				for deletedFicText in deletedFicTexts:
					if gui_warning.get_text() == deletedFicText:
						# mark deleted-but-incomplete fics abandoned
						if fic.ficStatus != FicStatus.complete:
							fic.ficStatus = FicStatus.abandoned
						fic.upsert()
						return fic

		# NOTE(review): if profile_top is None and no known deletion text
		# matched, the next line raises AttributeError -- confirm intended
		text = profile_top.get_text()
		pt_str = str(profile_top)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		# title is the lone <b class="xcontrast_txt"> in profile_top
		for b in profile_top.find_all('b'):
			b_class = b.get('class')
			if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
				fic.title = b.get_text()
				break
		else:
			raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

		fic.url = self.constructUrl(fic.localId, 1, fic.title)

		descriptionFound = False
		for div in profile_top.find_all('div'):
			div_class = div.get('class')
			if (
				div.get('style') == 'margin-top:2px' and len(div_class) == 1
				and div_class[0] == 'xcontrast_txt'
			):
				fic.description = div.get_text()
				descriptionFound = True
				break
		if descriptionFound == False:
			raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		# TODO we should match this only on the section following the description
		matcher = RegexMatcher(
			text, {
				'ageRating': ('Rated:\s+Fiction\s*(\S+)', str),
				'chapterCount?': ('Chapters:\s+(\d+)', int),
				'wordCount': ('Words:\s+(\S+)', int),
				'reviewCount?': ('Reviews:\s+(\S+)', int),
				'favoriteCount?': ('Favs:\s+(\S+)', int),
				'followCount?': ('Follows:\s+(\S+)', int),
				'updated?': ('Rated:.*Updated:\s+(\S+)', str),
				'published': ('Published:\s+([^-]+)', str),
			}
		)
		matcher.matchAll(fic)

		# convert scraped date strings into timestamps
		if fic.published is not None:
			publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
			fic.published = OilTimestamp(publishedUts)

		if fic.updated is None:
			fic.updated = fic.published
		elif fic.updated is not None:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		if fic.chapterCount is None:
			fic.chapterCount = 1

		match = re.search(
			'(Rated|Chapters|Words|Updated|Published):.*Status:\s+(\S+)', text
		)
		if match is None:
			fic.ficStatus = FicStatus.ongoing
		else:
			status = match.group(2)
			if status == 'Complete':
				fic.ficStatus = FicStatus.complete
			else:
				raise Exception('unknown status: {}: {}'.format(fic.url, status))

		# author is the first /u/ profile link in profile_top
		for a in profile_top.find_all('a'):
			a_href = a.get('href')
			if a_href.startswith('/u/'):
				author = a.get_text()
				authorUrl = self.baseUrl + a_href
				authorId = a_href.split('/')[2]
				self.setAuthor(fic, author, authorUrl, authorId)
				break
		else:
			raise Exception('unable to find author:\n{}'.format(text))

		# fandoms come from the breadcrumb links above the story
		preStoryLinks = soup.find(id='pre_story_links')
		preStoryLinksLinks = []
		if preStoryLinks is not None:
			preStoryLinksLinks = preStoryLinks.find_all('a')
		pendingFandoms: List[Fandom] = []
		for a in preStoryLinksLinks:
			href = a.get('href')
			hrefParts = href.split('/')

			# if it's a top level category
			if (
				len(hrefParts) == 3 and len(hrefParts[0]) == 0
				and len(hrefParts[2]) == 0
			):
				cat = hrefParts[1]
				if cat in ffNetFandomCategories:
					continue  # skip categories
				raise Exception('unknown category: {}'.format(cat))

			# if it's a crossover /Fandom1_and_Fandm2_Crossovers/f1id/f2id/
			if (
				len(hrefParts) == 5 and hrefParts[1].endswith("_Crossovers")
				and len(hrefParts[0]) == 0 and len(hrefParts[4]) == 0
			):
				fIds = [int(hrefParts[2]), int(hrefParts[3])]
				pendingFandoms += self.handleCrossoverFandom(
					fic, hrefParts[1], fIds, href
				)
				continue

			# if it's a regular fandom in some category
			if (
				len(hrefParts) == 4 and len(hrefParts[0]) == 0
				and len(hrefParts[3]) == 0
			):
				# ensure category is in our map
				if hrefParts[1] not in ffNetFandomCategories:
					raise Exception('unknown category: {}'.format(hrefParts[1]))

				pendingFandoms += self.handleFandom(fic, hrefParts[2])
				continue

			util.logMessage('unknown fandom {0}: {1}'.format(fic.id, href))

		fic.upsert()
		# re-read the row so fandoms attach to the persisted record
		poss = Fic.select({'sourceId': fic.sourceId, 'localId': fic.localId})
		if len(poss) != 1:
			raise Exception(f'unable to upsert fic?')
		fic = poss[0]
		for pfandom in pendingFandoms:
			fic.add(pfandom)

		if fic.chapterCount is None:
			return fic

		# multi-chapter fics expose titles via the chapter <select> dropdown
		chapterTitles = []
		if fic.chapterCount > 1:
			chapterSelect = soup.find(id='chap_select')
			chapterOptions = []
			if chapterSelect is not None:
				chapterOptions = chapterSelect.findAll('option')
			chapterTitles = [co.getText().strip() for co in chapterOptions]

		for cid in range(1, fic.chapterCount + 1):
			ch = fic.chapter(cid)
			ch.localChapterId = str(cid)
			ch.url = self.constructUrl(fic.localId, cid)
			# NOTE(review): `> cid` skips the final chapter's title (index
			# cid-1 needs only len >= cid) -- confirm off-by-one is intended
			if len(chapterTitles) > cid:
				ch.title = util.cleanChapterTitle(chapterTitles[cid - 1], cid)
			elif fic.chapterCount == 1 and cid == 1:
				ch.title = fic.title
			ch.upsert()

		metaSpan = profile_top.find('span', {'class': 'xgray'})
		if metaSpan is not None:
			try:
				res = self.parseFicMetaSpan(metaSpan.decode_contents())
				#fic.language = res["language"]

				# reconstruct
				fields = [
					('rated', 'Rated: Fiction ZZZ'),
					('language', 'Language: ZZZ'),
					('genres', 'Genre: ZZZ'),
					('characters', 'Characters: ZZZ'),
					('reviews', 'Reviews: ZZZ'),
					('favorites', 'Favs: ZZZ'),
					('follows', 'Follows: ZZZ'),
				]
				rmeta = ' - '.join(
					[f[1].replace('ZZZ', res[f[0]]) for f in fields if f[0] in res]
				)

				fic.extraMeta = rmeta
				publishedUts = util.parseDateAsUnix(res['published'], fic.fetched)
				fic.published = OilTimestamp(publishedUts)
				fic.updated = fic.published
				if 'updated' in res:
					updatedUts = util.parseDateAsUnix(res['updated'], fic.fetched)
					fic.updated = OilTimestamp(updatedUts)
				fic.upsert()

			except Exception as e:
				# meta span parsing is best-effort; log and keep prior values
				util.logMessage(
					f'FFNAdapter.parseInfoInto: .parseFicMetaSpan:\n{e}\n{traceback.format_exc()}'
				)
				util.logMessage(
					f'FFNAdapter.parseFicMetaSpan: {metaSpan.decode_contents()}'
				)
				pass

		return fic
Пример #17
0
    def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
        """Parse an adult-fanfiction.org story page into fic.

        Scrapes every chapter for word counts, then augments dates, status and
        description from the archive's search results page.
        """
        from bs4 import BeautifulSoup
        # localId is "<archive>/<storyNo>"
        archive = fic.localId.split('/')[0]
        storyNo = fic.localId.split('/')[1]

        soup = BeautifulSoup(wwwHtml, 'html5lib')

        titleH2 = soup.find('a', {'href': '/story.php?no={}'.format(storyNo)})
        fic.title = str(titleH2.getText())

        # author is the first link into the members profile site
        membersUrl = 'http://members.adult-fanfiction.org/profile.php?no='
        memberLink = soup.find(
            lambda t: (t.name == 'a' and t.has_attr("href") and t.get("href")
                       is not None and (t.get("href").startswith(membersUrl))))

        author = memberLink.getText()
        authorId = memberLink.get('href')[len(membersUrl):]
        authorUrl = memberLink.get('href')
        self.setAuthor(fic, author, authorUrl, authorId)

        # TODO
        fic.ficStatus = FicStatus.ongoing

        fic.fetched = OilTimestamp.now()
        fic.languageId = Language.getId("English")  # TODO: don't hard code?

        fic.url = self.constructUrl(fic.localId, 1)

        # TODO: description is on search page
        if fic.description is None:
            fic.description = 'TODO: on the search page?'

        # default optional fields
        fic.reviewCount = 0
        fic.favoriteCount = 0
        fic.followCount = 0

        fic.ageRating = 'M'

        # TODO
        if fic.published is None:
            fic.published = OilTimestamp.now()
        if fic.updated is None:
            fic.updated = fic.published

        # chapter list comes from the navigation dropdown
        chapterDropdown = soup.find('div', {'class': 'dropdown-content'})
        chapterLinks = chapterDropdown.findAll('a')
        oldChapterCount = fic.chapterCount
        fic.chapterCount = len(chapterLinks)

        if fic.wordCount is None:
            fic.wordCount = 0
        fic.upsert()

        # scrape each chapter to store html and accumulate the word count
        wordCount = 0
        for cid in range(1, fic.chapterCount + 1):
            chapterContent = scrape.softScrape(
                self.constructUrl(fic.localId, cid))
            chapter = fic.chapter(cid)
            if chapterContent is not None:
                chapter.setHtml(chapterContent)
            chapter.localChapterId = str(cid)
            chapter.url = self.constructUrl(fic.localId, cid)

            chapter.title = chapterLinks[cid - 1].getText().strip()
            if chapter.title is not None:
                chapter.title = util.cleanChapterTitle(chapter.title, cid)

            chapter.upsert()
            if chapterContent is not None:
                wordCount += len(chapterContent.split())

        fic.wordCount = wordCount

        if oldChapterCount is not None and oldChapterCount < fic.chapterCount:
            fic.updated = OilTimestamp.now()  # TODO
        fic.upsert()

        storyUrl = self.constructUrl(fic.localId, chapterId=None)

        # more metadata from search page
        # NOTE(review): author/title are interpolated without url-encoding --
        # names with '&', '#' or spaces beyond the '+' substitution may break
        searchUrl = ('http://{}.adult-fanfiction.org/search.php?' +
                     'auth={}&title={}&summary=&tags=&cats=0&search=Search')
        searchUrl = searchUrl.format(archive, author,
                                     fic.title.replace(' ', '+'))
        data = scrape.scrape(searchUrl)['raw']

        metas = self.extractSearchMetadata(data)

        # fallback to pure author search
        if storyUrl not in metas:
            searchUrl = ('http://{}.adult-fanfiction.org/search.php?' +
                         'auth={}&title=&summary=&tags=&cats=0&search=Search')
            searchUrl = searchUrl.format(archive, author)
            data = scrape.scrape(searchUrl)['raw']
            metas = self.extractSearchMetadata(data)

        if storyUrl not in metas:
            raise Exception('cannot find search metadata')

        meta = metas[storyUrl]

        assert (meta.published is not None and meta.updated is not None)
        fic.published = OilTimestamp(meta.published)
        fic.updated = OilTimestamp(meta.updated)

        fic.reviewCount = meta.reviewCount
        fic.favoriteCount = meta.views  # TODO

        fic.ficStatus = meta.ficStatus

        assert (meta.description is not None)
        fic.description = meta.description
        assert (fic.description is not None)
        if len(meta.tags) > 0:
            fic.description += '\n<hr />\nContent Tags: ' + ' '.join(meta.tags)

        for fan in meta.fandoms:
            fic.add(Fandom.define(fan))

        return fic
Пример #18
0
 def create(self, fic: Fic) -> Fic:
     """Set the fic's canonical url, then delegate to getCurrentInfo."""
     fic.url = self.constructUrl(fic.localId)
     return self.getCurrentInfo(fic)
Пример #19
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Parse a royalroad fiction page into fic.

		Extracts title, author, description, status, follower/favorite counts
		and the chapter table, then caches each chapter to total word counts.
		"""
		from bs4 import BeautifulSoup
		soup = BeautifulSoup(wwwHtml, 'html5lib')

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		fic.ageRating = 'M'  # TODO?

		ficTitleDiv = soup.find('div', {'class': 'fic-title'})
		fic.title = ficTitleDiv.find('h1').getText().strip()

		authorLink = ficTitleDiv.find('h4', {'property': 'author'}).find('a')
		author = authorLink.getText().strip()
		authorUrl = self.baseUrl + authorLink.get('href')
		authorId = authorUrl.split('/')[-1]
		self.setAuthor(fic, author, authorUrl, authorId)

		divDescription = soup.find('div', {'class': 'description'})
		try:
			descView = HtmlView(str(divDescription), markdown=False)
			desc = ''.join(['<p>{}</p>'.format(l) for l in descView.text])
			fic.description = desc
		except Exception:
			# was a bare except: narrow it so SystemExit/KeyboardInterrupt
			# are not swallowed; fall back to the plain text description
			fic.description = divDescription.getText().strip()

		fictionInfo = str(soup.find('div', {'class': 'fiction-info'}))
		if fictionInfo.find('>ONGOING<') != -1:
			fic.ficStatus = FicStatus.ongoing
		elif fictionInfo.find('>COMPLETED<') != -1:
			fic.ficStatus = FicStatus.complete
		elif fictionInfo.find('>HIATUS<') != -1:
			fic.ficStatus = FicStatus.ongoing  # TODO?
		elif fictionInfo.find('>STUB<') != -1:
			fic.ficStatus = FicStatus.ongoing  # TODO?
		elif fictionInfo.find('>DROPPED<') != -1:
			fic.ficStatus = FicStatus.abandoned
		else:
			raise Exception('unable to find fic status')

		divStatsContent = soup.find('div', {'class': 'stats-content'})
		followers = divStatsContent.find(text='Followers :')
		ul = followers.parent.parent

		# raw strings: \s/\d are regex escapes, not string escapes
		RegexMatcher(
			ul.getText(), {
				'followCount?': (r'Followers\s+:\s+([\d,]+)', str),
				'favoriteCount?': (r'Favorites\s+:\s+([\d,]+)', str),
			}
		).matchAll(fic)

		# strip thousands separators; note: the original tested the truthiness
		# of str.find() (which is -1/truthy when absent, 0/falsy at index 0)
		# rather than comparing against -1
		if str(fic.followCount).find(',') != -1:
			fic.followCount = int(str(fic.followCount).replace(',', ''))
		if str(fic.favoriteCount).find(',') != -1:
			fic.favoriteCount = int(str(fic.favoriteCount).replace(',', ''))

		tableChapters = soup.find('table', {'id': 'chapters'})
		chapterLinks = tableChapters.findAll('a')

		# each chapter row has two links: one with the title, one wrapping a
		# <time> element with the release date
		chapterUrls: List[str] = []
		chapterTitles: List[str] = []
		for chapterLink in chapterLinks:
			# TODO FIXME is this inverted?
			if chapterLink.find('time') is not None:
				continue
			chapterUrls += [chapterLink.get('href')]
			chapterTitles += [chapterLink.getText().strip()]

		chapterDates: List[int] = []
		for chapterLink in chapterLinks:
			if chapterLink.find('time') is None:
				continue
			timeElement = chapterLink.find('time')
			if timeElement.get('unixtime'):
				chapterDates += [int(timeElement.get('unixtime'))]
			else:
				chapterDates += [
					util.parseDateAsUnix(timeElement.get('title'), fic.fetched)
				]

		# oldest chapter date = published, newest = updated
		fic.published = OilTimestamp(min(chapterDates))
		fic.updated = OilTimestamp(max(chapterDates))
		fic.chapterCount = len(chapterUrls)

		if fic.wordCount is None:
			fic.wordCount = 0
		fic.upsert()

		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			chapter.url = self.baseUrl + chapterUrls[cid - 1]
			if chapterUrls[cid - 1].startswith('/fiction/chapter/'):
				# alternate chapter syntax if the chapter itself has no slug
				# /fiction/chapter/<lcid>fid=<lid>&fslug=<fic slug>
				chapter.localChapterId = (
					chapterUrls[cid - 1].split('/')[3].split('?')[0]
				)
			else:
				# standard chapter syntax
				# /fiction/<lid>/<fic slug>/chapter/<lcid>/<chapter slug>
				chapter.localChapterId = chapterUrls[cid - 1].split('/')[5]
			chapter.title = chapterTitles[cid - 1]

			if chapter.title is not None and len(chapter.title) > 0:
				chapter.title = util.cleanChapterTitle(chapter.title, cid)

			chapter.upsert()

		# cache chapter bodies and total up the word count
		wordCount = 0
		for cid in range(1, fic.chapterCount + 1):
			chapter = fic.chapter(cid)
			if chapter.html() is None:
				chapter.cache()

			chapter.upsert()
			chtml = chapter.html()
			if chtml is not None:
				wordCount += len(chtml.split())

		fic.wordCount = wordCount

		return fic
Пример #20
0
	def parseZListInfoInto(self, fic: Fic, ts: int, html: str) -> Fic:
		"""Parse a story-listing ("z-list") snippet into fic and upsert it.

		If the existing fic data was fetched more recently than ts the
		snippet is stale and fic is returned unchanged. Otherwise title,
		description, stats, status, author and fandom are extracted from
		the snippet html. Raises when title, description or author cannot
		be located.
		"""
		# existing data is newer, do nothing
		if fic.fetched is not None and fic.fetched.toUTS() > ts:
			return fic
		from bs4 import BeautifulSoup

		soup = BeautifulSoup(html, 'html5lib')

		text = soup.get_text()
		pt_str = str(html)

		fic.fetched = OilTimestamp(ts)
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		fic.url = self.constructUrl(fic.localId, 1, fic.title)

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		# the story title is the text of the first <a class="stitle"> link
		for a in soup.find_all('a', {'class': 'stitle'}):
			fic.title = a.getText()
			break
		else:
			raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

		for div in soup.find_all('div', {'class': 'z-padtop'}):
			fic.description = div.contents[0]
			break
		else:
			raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

		# raw strings so the regex backslashes are literal; non-raw '\s'
		# and friends are invalid escapes that warn on modern Python
		matcher = RegexMatcher(
			text, {
				'ageRating': (r'Rated:\s+(?:Fiction)?\s*(\S+)', str),
				'chapterCount?': (r'Chapters:\s+(\d+)', int),
				'wordCount': (r'Words:\s+(\S+)', int),
				'reviewCount?': (r'Reviews:\s+(\S+)', int),
				'favoriteCount?': (r'Favs:\s+(\S+)', int),
				'followCount?': (r'Follows:\s+(\S+)', int),
				'updated?': (r'Updated:\s+(\S+)', str),
				'published': (r'Published:\s+([^-]+)', str),
			}
		)
		matcher.matchAll(fic)

		if fic.published is not None:
			publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
			fic.published = OilTimestamp(publishedUts)

		if fic.updated is None:
			# never updated separately; fall back to the publish date
			fic.updated = fic.published
		else:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		if fic.chapterCount is None:
			fic.chapterCount = 1

		# a trailing "- Complete" after the stats block marks finished fics
		match = re.search(
			r'(Rated|Chapters|Words|Updated|Published):.*-\s+(Complete)', text
		)
		if match is None:
			fic.ficStatus = FicStatus.ongoing
		else:
			status = match.group(2)
			if status == 'Complete':
				fic.ficStatus = FicStatus.complete
			else:
				raise Exception('unknown status: {}: {}'.format(fic.url, status))

		# the first /u/ link is the author profile
		for a in soup.find_all('a'):
			a_href = a.get('href')
			if a_href.startswith('/u/'):
				author = a.get_text()
				authorUrl = self.baseUrl + a_href
				authorId = a_href.split('/')[2]
				self.setAuthor(fic, author, authorUrl, authorId)
				break
		else:
			raise Exception('unable to find author:\n{}'.format(text))

		# fandom lives in the z-list container's data-category attribute
		zl = soup.find('div', {'class': 'z-list'})
		fan = None if zl is None else zl.get('data-category')
		pendingFandoms: List[Fandom] = []
		if fan is not None:
			pendingFandoms += self.handleFandom(fic, fan)
			# TODO: crossovers?

		fic.upsert()
		# fandoms can only be attached after the fic row exists
		for pfandom in pendingFandoms:
			fic.add(pfandom)

		return fic
Пример #21
0
	def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
		"""Parse a story page's profile into fic, upsert it, and build chapters.

		Handles deleted stories (marks them abandoned), extracts title,
		description, stats, status, author and genres, then creates or
		updates per-chapter rows. Raises when required page elements are
		missing or a category/status is unrecognized.
		"""
		from bs4 import BeautifulSoup  # type: ignore
		deletedFicText = 'Story Not FoundUnable to locate story. Code 1.'
		soup = BeautifulSoup(wwwHtml, 'html5lib')
		profile_top = soup.find(id='profile_top')
		# story might've been deleted
		if profile_top is None:
			gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
			for gui_warning in gui_warnings:
				if gui_warning.get_text() == deletedFicText:
					fic.ficStatus = FicStatus.abandoned
					fic.upsert()
					return fic
			# not the known deleted-story page; fail loudly instead of
			# falling through to an AttributeError on profile_top below
			raise Exception('error: unable to find profile_top in {}'.format(fic.url))

		text = profile_top.get_text()
		pt_str = str(profile_top)

		fic.fetched = OilTimestamp.now()
		fic.languageId = Language.getId("English")  # TODO: don't hard code?

		# title is the lone <b class="xcontrast_txt"> in the profile
		for b in profile_top.find_all('b'):
			b_class = b.get('class')
			if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
				fic.title = b.get_text()
				break
		else:
			raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

		fic.url = self.constructUrl(fic.localId, 1, fic.title)

		for div in profile_top.find_all('div'):
			div_class = div.get('class')
			if (
				div.get('style') == 'margin-top:2px' and len(div_class) == 1
				and div_class[0] == 'xcontrast_txt'
			):
				fic.description = div.get_text()
				break
		else:
			raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

		# default optional fields
		fic.reviewCount = 0
		fic.favoriteCount = 0
		fic.followCount = 0

		# raw strings so the regex backslashes are literal; non-raw '\s'
		# and friends are invalid escapes that warn on modern Python
		matcher = RegexMatcher(
			text, {
				'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
				'chapterCount?': (r'Chapters:\s+(\d+)', int),
				'wordCount': (r'Words:\s+(\S+)', int),
				'reviewCount?': (r'Reviews:\s+(\S+)', int),
				'favoriteCount?': (r'Favs:\s+(\S+)', int),
				'followCount?': (r'Follows:\s+(\S+)', int),
				'updated?': (r'Updated:\s+(\S+)', str),
				'published': (r'Published:\s+(\S+)', str),
			}
		)
		matcher.matchAll(fic)

		if fic.published is not None:
			publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
			fic.published = OilTimestamp(publishedUts)

		if fic.updated is None:
			# never updated separately; fall back to the publish date
			fic.updated = fic.published
		else:
			updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
			fic.updated = OilTimestamp(updatedUts)

		if fic.chapterCount is None:
			fic.chapterCount = 1

		match = re.search(r'Status:\s+(\S+)', text)
		if match is None:
			fic.ficStatus = FicStatus.ongoing
		else:
			status = match.group(1)
			if status == 'Complete':
				fic.ficStatus = FicStatus.complete
			else:
				raise Exception('unknown status: {}'.format(status))

		# the first /u/ link is the author profile
		for a in profile_top.find_all('a'):
			a_href = a.get('href')
			if a_href.startswith('/u/'):
				author = a.get_text()
				authorUrl = self.baseUrl + a_href
				authorId = a_href.split('/')[2]
				self.setAuthor(fic, author, authorUrl, authorId)
				break
		else:
			raise Exception('unable to find author:\n{}'.format(text))

		# breadcrumb links above the story map to category/genre
		preStoryLinks = soup.find(id='pre_story_links')
		preStoryLinksLinks = preStoryLinks.find_all('a')
		for a in preStoryLinksLinks:
			href = a.get('href')
			hrefParts = href.split('/')

			# if it's a top level category (/<cat>/)
			if (
				len(hrefParts) == 3 and len(hrefParts[0]) == 0
				and len(hrefParts[2]) == 0
			):
				cat = hrefParts[1]
				if cat in fictionPressCategories:
					continue  # skip categories
				raise Exception('unknown category: {}'.format(cat))

			# if it's a regular genre in some category (/<cat>/<genre>/)
			if (
				len(hrefParts) == 4 and len(hrefParts[0]) == 0
				and len(hrefParts[3]) == 0
			):
				# ensure category is in our map
				if hrefParts[1] not in fictionPressCategories:
					raise Exception('unknown category: {}'.format(hrefParts[1]))

				# ensure it's in our whitelist
				if hrefParts[2] not in fictionPressGenres:
					util.logMessage(f'FictionPressAdapter: unknown genre {hrefParts[2]}')
					continue

				fic.add(Fandom.define(hrefParts[2]))
				continue

			util.logMessage(f'FictionPressAdapter: unknown genre {fic.id}: {href}')
			continue

		fic.upsert()

		# multi-chapter fics expose per-chapter titles in the chap_select
		# dropdown; single-chapter fics have none
		chapterTitles = []
		if fic.chapterCount > 1:
			chapterSelect = soup.find(id='chap_select')
			chapterOptions = []
			if chapterSelect is not None:
				chapterOptions = chapterSelect.findAll('option')
			chapterTitles = [co.getText().strip() for co in chapterOptions]

		for cid in range(fic.chapterCount):
			ch = fic.chapter(cid + 1)
			ch.localChapterId = str(cid + 1)
			if len(chapterTitles) > cid:
				ch.title = util.cleanChapterTitle(chapterTitles[cid], cid + 1)
			elif fic.chapterCount == 1 and cid == 0:
				# single-chapter fic: reuse the fic title
				ch.title = fic.title
			ch.upsert()

		return fic