def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a fanfiction.net story page into `fic` and its chapters.

	Fills metadata (title, description, counts, dates, status, author,
	fandoms) from the page's #profile_top block, upserts the fic, then
	creates/updates one Chapter row per chapter.

	:param fic: the Fic record to populate (mutated and upserted)
	:param wwwHtml: raw HTML of the story's first-chapter page
	:return: the (re-selected) upserted Fic
	:raises Exception: when a required page element cannot be located
	"""
	from bs4 import BeautifulSoup  # type: ignore
	deletedFicTexts = [
		# probably deleted by user
		'Story Not FoundUnable to locate story. Code 1.',
		# probably deleted by admin
		'Story Not FoundUnable to locate story. Code 2.',
		# unknown
		'Story Not FoundStory is unavailable for reading. (A)',
	]
	soup = BeautifulSoup(wwwHtml, 'html5lib')
	profile_top = soup.find(id='profile_top')

	# story might've been deleted: the page carries a gui_warning instead
	# of a profile block; mark abandoned (unless already complete) and stop
	if profile_top is None:
		gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
		for gui_warning in gui_warnings:
			for deletedFicText in deletedFicTexts:
				if gui_warning.get_text() == deletedFicText:
					if fic.ficStatus != FicStatus.complete:
						fic.ficStatus = FicStatus.abandoned
					fic.upsert()
					return fic

	text = profile_top.get_text()
	pt_str = str(profile_top)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	# the title is the lone <b class="xcontrast_txt"> in the profile block
	for b in profile_top.find_all('b'):
		b_class = b.get('class')
		if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
			fic.title = b.get_text()
			break
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

	fic.url = self.constructUrl(fic.localId, 1, fic.title)

	descriptionFound = False
	for div in profile_top.find_all('div'):
		div_class = div.get('class')
		if (
			div.get('style') == 'margin-top:2px' and len(div_class) == 1
			and div_class[0] == 'xcontrast_txt'
		):
			fic.description = div.get_text()
			descriptionFound = True
			break
	# idiom fix: was `if descriptionFound == False:`
	if not descriptionFound:
		raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	# TODO we should match this only on the section following the description
	matcher = RegexMatcher(
		text, {
			'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
			'chapterCount?': (r'Chapters:\s+(\d+)', int),
			'wordCount': (r'Words:\s+(\S+)', int),
			'reviewCount?': (r'Reviews:\s+(\S+)', int),
			'favoriteCount?': (r'Favs:\s+(\S+)', int),
			'followCount?': (r'Follows:\s+(\S+)', int),
			'updated?': (r'Rated:.*Updated:\s+(\S+)', str),
			'published': (r'Published:\s+([^-]+)', str),
		}
	)
	matcher.matchAll(fic)

	# convert the scraped date strings into OilTimestamps; a fic with no
	# Updated field is treated as updated when it was published
	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	match = re.search(
		r'(Rated|Chapters|Words|Updated|Published):.*Status:\s+(\S+)', text
	)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		status = match.group(2)
		if status == 'Complete':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown status: {}: {}'.format(fic.url, status))

	# the first /u/ link in the profile block is the author
	for a in profile_top.find_all('a'):
		a_href = a.get('href')
		if a_href.startswith('/u/'):
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))

	# fandom links live in #pre_story_links; it may be absent entirely
	preStoryLinks = soup.find(id='pre_story_links')
	preStoryLinksLinks = []
	if preStoryLinks is not None:
		preStoryLinksLinks = preStoryLinks.find_all('a')
	pendingFandoms: List[Fandom] = []
	for a in preStoryLinksLinks:
		href = a.get('href')
		hrefParts = href.split('/')
		# if it's a top level category
		if (
			len(hrefParts) == 3 and len(hrefParts[0]) == 0
			and len(hrefParts[2]) == 0
		):
			cat = hrefParts[1]
			if cat in ffNetFandomCategories:
				continue  # skip categories
			raise Exception('unknown category: {}'.format(cat))
		# if it's a crossover /Fandom1_and_Fandm2_Crossovers/f1id/f2id/
		if (
			len(hrefParts) == 5 and hrefParts[1].endswith("_Crossovers")
			and len(hrefParts[0]) == 0 and len(hrefParts[4]) == 0
		):
			fIds = [int(hrefParts[2]), int(hrefParts[3])]
			pendingFandoms += self.handleCrossoverFandom(
				fic, hrefParts[1], fIds, href
			)
			continue
		# if it's a regular fandom in some category
		if (
			len(hrefParts) == 4 and len(hrefParts[0]) == 0
			and len(hrefParts[3]) == 0
		):
			# ensure category is in our map
			if hrefParts[1] not in ffNetFandomCategories:
				raise Exception('unknown category: {}'.format(hrefParts[1]))
			pendingFandoms += self.handleFandom(fic, hrefParts[2])
			continue
		util.logMessage('unknown fandom {0}: {1}'.format(fic.id, href))

	fic.upsert()
	poss = Fic.select({'sourceId': fic.sourceId, 'localId': fic.localId})
	if len(poss) != 1:
		# fix: was a placeholder-less f-string
		raise Exception('unable to upsert fic?')
	fic = poss[0]
	for pfandom in pendingFandoms:
		fic.add(pfandom)

	if fic.chapterCount is None:
		return fic

	chapterTitles = []
	if fic.chapterCount > 1:
		chapterSelect = soup.find(id='chap_select')
		chapterOptions = []
		if chapterSelect is not None:
			chapterOptions = chapterSelect.findAll('option')
		chapterTitles = [co.getText().strip() for co in chapterOptions]

	for cid in range(1, fic.chapterCount + 1):
		ch = fic.chapter(cid)
		ch.localChapterId = str(cid)
		ch.url = self.constructUrl(fic.localId, cid)
		# off-by-one fix: cid is 1-based and indexes chapterTitles[cid - 1],
		# so the last chapter has a title exactly when len(...) >= cid
		# (previously `> cid`, which always skipped the final chapter)
		if len(chapterTitles) >= cid:
			ch.title = util.cleanChapterTitle(chapterTitles[cid - 1], cid)
		elif fic.chapterCount == 1 and cid == 1:
			ch.title = fic.title
		ch.upsert()

	metaSpan = profile_top.find('span', {'class': 'xgray'})
	if metaSpan is not None:
		# best-effort: reconstruct a normalized metadata line from the
		# structured span; failures are logged, never fatal
		try:
			res = self.parseFicMetaSpan(metaSpan.decode_contents())
			#fic.language = res["language"]
			# reconstruct
			fields = [
				('rated', 'Rated: Fiction ZZZ'),
				('language', 'Language: ZZZ'),
				('genres', 'Genre: ZZZ'),
				('characters', 'Characters: ZZZ'),
				('reviews', 'Reviews: ZZZ'),
				('favorites', 'Favs: ZZZ'),
				('follows', 'Follows: ZZZ'),
			]
			rmeta = ' - '.join(
				[f[1].replace('ZZZ', res[f[0]]) for f in fields if f[0] in res]
			)
			fic.extraMeta = rmeta
			publishedUts = util.parseDateAsUnix(res['published'], fic.fetched)
			fic.published = OilTimestamp(publishedUts)
			fic.updated = fic.published
			if 'updated' in res:
				updatedUts = util.parseDateAsUnix(res['updated'], fic.fetched)
				fic.updated = OilTimestamp(updatedUts)
			fic.upsert()
		except Exception as e:
			util.logMessage(
				f'FFNAdapter.parseInfoInto: .parseFicMetaSpan:\n{e}\n{traceback.format_exc()}'
			)
			util.logMessage(
				f'FFNAdapter.parseFicMetaSpan: {metaSpan.decode_contents()}'
			)
			pass

	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a story page (author-id/story-id style archive) into `fic`.

	The fic's localId is '<authorLid>/<storyLid>'. Overall metadata comes
	from the .well block; per-chapter review counts and upload dates come
	from the sibling <p> blocks, one per chapter.

	:raises Exception: on missing status or an unexpected chapter URL
	"""
	from bs4 import BeautifulSoup
	authorLid = fic.localId.split('/')[0]
	storyLid = fic.localId.split('/')[1]
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	fic.ageRating = 'M'

	soup = BeautifulSoup(wwwHtml, 'html5lib')
	pageHeader = soup.find('div', {'class': 'page-header'})
	titleH2 = pageHeader.find('h2')
	fic.title = titleH2.getText().strip()

	authorLink = pageHeader.find('a')
	author = authorLink.getText().strip()
	authorId = authorLid
	authorUrl = self.baseStoryUrl.format(authorLid, 'contact/')
	self.setAuthor(fic, author, authorUrl, authorId)

	divWell = soup.find('div', {'class': 'well'})
	summaryQuote = divWell.find('blockquote')
	fic.description = str(
		summaryQuote.getText()
	).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')
	# collapse runs of whitespace into single spaces
	# FIX: this loop previously found/replaced a SINGLE space with a single
	# space -- a no-op replace that never terminates once any space is
	# present; the intent is clearly to squeeze double spaces
	while fic.description.find('  ') != -1:
		fic.description = fic.description.replace('  ', ' ')
	fic.description = fic.description.strip()

	divWellText = divWell.getText().strip()
	# NOTE(review): any status other than 'In progress' (including a
	# completed fic) raises here -- presumably deliberate strictness; confirm
	match = re.search(r'Status:\s*([^-]*) -', divWellText)
	if match is not None and match.group(1) == 'In progress':
		fic.ficStatus = FicStatus.ongoing
	else:
		raise Exception('unable to find fic status')

	RegexMatcher(
		divWellText, {
			'ageRating': (r'Rating\s*:\s+([^-]+) -', str),
			'chapterCount': (r'Chapters\s*:\s+(\d+) -', int),
			'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
		}
	).matchAll(fic)
	assert (fic.chapterCount is not None)
	# wordCount is matched as a str to allow thousands separators
	if str(fic.wordCount).find(',') != -1:
		fic.wordCount = int(str(fic.wordCount).replace(',', ''))

	wellParent = divWell.parent
	cid = 0
	wordCount = 0
	reviewCount = 0
	chapterDates: List[int] = []
	for child in wellParent.children:
		if child.name != 'p':
			continue
		cid += 1
		if str(child).find('Chapter {}'.format(cid)) == -1:
			continue
		chapterLink = child.find('a')
		expectedUrl = '/{}/Chapter_{}/'.format(storyLid, cid).lower()
		if chapterLink.get('href').lower() != expectedUrl:
			raise Exception('unexpected chapter url: ' + chapterLink.get('href'))
		chInfo = ChapterInfo()
		RegexMatcher(
			child.getText(), {
				'wordCount': (r'Word count\s*:\s+([\d,]+) -', str),
				'reviewCount': (r'Reviews\s+:\s+([^-]+) -', int),
				'updated': (r'Uploaded on\s*:\s+(.+)', str),
			}
		).matchAll(chInfo)
		assert (chInfo.updated is not None)
		if str(chInfo.wordCount).find(',') != -1:
			chInfo.wordCount = int(str(chInfo.wordCount).replace(',', ''))
		wordCount += chInfo.wordCount  # accumulated but unused; see below
		reviewCount += chInfo.reviewCount
		dt = (util.parseDateAsUnix(chInfo.updated, int(time.time())))
		chapterDates += [dt]

	# wordCount is already set from overall metadata
	fic.reviewCount = reviewCount
	fic.published = OilTimestamp(min(chapterDates))
	fic.updated = OilTimestamp(max(chapterDates))
	fic.upsert()

	for cid in range(1, fic.chapterCount + 1):
		ch = fic.chapter(cid)
		ch.localChapterId = 'Chapter_{}'.format(cid)
		ch.url = self.constructUrl(fic.localId, cid)
		ch.upsert()

	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Populate `fic` from a fictionhunt story page and return it.

	fictionhunt mirrors fanfiction.net metadata; the author link points
	back at fanfiction.net. Raises when details/title cannot be located.
	"""
	from bs4 import BeautifulSoup  # type: ignore
	soup = BeautifulSoup(wwwHtml, 'html.parser')

	detailDivs = soup.find_all('div', {'class': 'details'})
	if len(detailDivs) != 1:
		raise Exception('error: unable to find details\n')
	details = detailDivs[0]
	detailsText = details.get_text()
	detailsHtml = str(details)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	titleDivs = soup.find_all('div', {'class': 'title'})
	if len(titleDivs) != 1:
		raise Exception(
			'error: unable to find title:\n{}\n'.format(detailsHtml))
	fic.title = titleDivs[0].get_text().strip()

	fic.url = self.constructUrl(fic.localId, 1)

	# TODO: this may not exist on fictionhunt?
	fic.description = 'archive of {} from fictionhunt TODO'.format(fic.title)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	RegexMatcher(
		detailsText, {
			'ageRating': ('Rated:\s+(\S+)', str),
			'chapterCount?': ('Chapters:\s+(\d+)', int),
			'wordCount': ('Words:\s+(\S+)', int),
			'reviewCount?': ('Reviews:\s+(\S+)', int),
			'favoriteCount?': ('Favs:\s+(\S+)', int),
			'followCount?': ('Follows:\s+(\S+)', int),
			'updated?': ('Updated:\s+(\S+)', str),
			'published': ('Published:\s+(\S+)', str),
		}
	).matchAll(fic)

	# normalize scraped date strings into OilTimestamps
	if fic.published is not None:
		fic.published = OilTimestamp(
			util.parseDateAsUnix(fic.published, fic.fetched)
		)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		fic.updated = OilTimestamp(
			util.parseDateAsUnix(fic.updated, fic.fetched)
		)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	# the details line carries '- Complete -' iff the fic is finished
	if re.search('- Complete -', detailsText) is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		fic.ficStatus = FicStatus.complete

	for link in details.find_all('a'):
		linkHref = link.get('href')
		if linkHref.find('fanfiction.net/u/') == -1:
			continue
		self.setAuthor(
			fic, link.get_text(), linkHref, linkHref.split('/')[-1]
		)
		break
	else:
		raise Exception('unable to find author:\n{}'.format(detailsText))

	# TODO: hardcode Harry Potter fanfic?
	return fic
def parseZListInfoInto(self, fic: Fic, ts: int, html: str) -> Fic:
	"""Parse a fanfiction.net z-list (search/browse result) entry into `fic`.

	`ts` is the unix timestamp the listing was fetched at; if the existing
	fic data is newer than `ts`, the entry is ignored. Upserts the fic and
	its fandom before returning it.
	"""
	# existing data is newer, do nothing
	if fic.fetched is not None and fic.fetched.toUTS() > ts:
		return fic
	from bs4 import BeautifulSoup
	soup = BeautifulSoup(html, 'html5lib')
	text = soup.get_text()
	pt_str = str(html)
	fic.fetched = OilTimestamp(ts)
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId, 1, fic.title)
	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	# the story title is the first a.stitle link (for/else raises if none)
	for a in soup.find_all('a', {'class': 'stitle'}):
		fic.title = a.getText()
		break
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))
	# the description is the first text node of the z-padtop div
	for div in soup.find_all('div', {'class': 'z-padtop'}):
		fic.description = div.contents[0]
		break
	else:
		raise Exception('error: unable to find description:\n{}\n'.format(pt_str))
	# z-list entries may say just 'Rated: K' instead of 'Rated: Fiction K'
	matcher = RegexMatcher(
		text, {
			'ageRating': ('Rated:\s+(?:Fiction)?\s*(\S+)', str),
			'chapterCount?': ('Chapters:\s+(\d+)', int),
			'wordCount': ('Words:\s+(\S+)', int),
			'reviewCount?': ('Reviews:\s+(\S+)', int),
			'favoriteCount?': ('Favs:\s+(\S+)', int),
			'followCount?': ('Follows:\s+(\S+)', int),
			'updated?': ('Updated:\s+(\S+)', str),
			'published': ('Published:\s+([^-]+)', str),
		}
	)
	matcher.matchAll(fic)
	# convert scraped date strings into OilTimestamps; a missing Updated
	# field falls back to the published date
	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)
	if fic.chapterCount is None:
		fic.chapterCount = 1
	# completion is flagged by a trailing '- Complete' after the stats run
	match = re.search(
		'(Rated|Chapters|Words|Updated|Published):.*-\s+(Complete)', text
	)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		status = match.group(2)
		if status == 'Complete':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown status: {}: {}'.format(fic.url, status))
	# the first /u/ link is the author profile
	for a in soup.find_all('a'):
		a_href = a.get('href')
		if a_href.startswith('/u/'):
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))
	# the z-list div carries the fandom in its data-category attribute
	zl = soup.find('div', {'class': 'z-list'})
	fan = None if zl is None else zl.get('data-category')
	pendingFandoms: List[Fandom] = []
	if fan is not None:
		pendingFandoms += self.handleFandom(fic, fan)
		# TODO: crossovers?
	#print('---')
	#print(fic.__dict__)
	#raise Exception('todo')
	fic.upsert()
	for pfandom in pendingFandoms:
		fic.add(pfandom)
	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a royalroad fiction page into `fic` and its chapters.

	Scrapes title/author/description, status, follower/favorite counts and
	the chapter table; then caches each chapter's html to compute an exact
	word count. Upserts the fic and every chapter.
	"""
	from bs4 import BeautifulSoup
	soup = BeautifulSoup(wwwHtml, 'html5lib')
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	fic.url = self.constructUrl(fic.localId)

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	fic.ageRating = 'M'  # TODO?

	ficTitleDiv = soup.find('div', {'class': 'fic-title'})
	fic.title = ficTitleDiv.find('h1').getText().strip()

	authorLink = ficTitleDiv.find('h4', {'property': 'author'}).find('a')
	author = authorLink.getText().strip()
	authorUrl = self.baseUrl + authorLink.get('href')
	authorId = authorUrl.split('/')[-1]
	self.setAuthor(fic, author, authorUrl, authorId)

	# prefer a structured description; fall back to bare text
	divDescription = soup.find('div', {'class': 'description'})
	try:
		descView = HtmlView(str(divDescription), markdown=False)
		desc = ''.join(['<p>{}</p>'.format(l) for l in descView.text])
		fic.description = desc
	except Exception:  # fix: was a bare except (also caught KeyboardInterrupt)
		fic.description = divDescription.getText().strip()

	fictionInfo = str(soup.find('div', {'class': 'fiction-info'}))
	if fictionInfo.find('>ONGOING<') != -1:
		fic.ficStatus = FicStatus.ongoing
	elif fictionInfo.find('>COMPLETED<') != -1:
		fic.ficStatus = FicStatus.complete
	elif fictionInfo.find('>HIATUS<') != -1:
		fic.ficStatus = FicStatus.ongoing  # TODO?
	elif fictionInfo.find('>STUB<') != -1:
		fic.ficStatus = FicStatus.ongoing  # TODO?
	elif fictionInfo.find('>DROPPED<') != -1:
		fic.ficStatus = FicStatus.abandoned
	else:
		raise Exception('unable to find fic status')

	divStatsContent = soup.find('div', {'class': 'stats-content'})
	followers = divStatsContent.find(text='Followers :')
	ul = followers.parent.parent
	RegexMatcher(
		ul.getText(), {
			'followCount?': (r'Followers\s+:\s+([\d,]+)', str),
			'favoriteCount?': (r'Favorites\s+:\s+([\d,]+)', str),
		}
	).matchAll(fic)
	# the counts are matched as strings to allow thousands separators;
	# always convert to int. FIX: previously guarded by
	# `if str(...).find(','):` which misuses truthiness (-1 == not found is
	# truthy, 0 == comma at index 0 is falsy); the conversion is simply
	# unconditional
	fic.followCount = int(str(fic.followCount).replace(',', ''))
	fic.favoriteCount = int(str(fic.favoriteCount).replace(',', ''))

	# chapter table: non-<time> links give url+title, <time> links give dates
	tableChapters = soup.find('table', {'id': 'chapters'})
	chapterLinks = tableChapters.findAll('a')
	chapterUrls: List[str] = []
	chapterTitles: List[str] = []
	for chapterLink in chapterLinks:
		# TODO FIXME is this inverted?
		if chapterLink.find('time') is not None:
			continue
		chapterUrls += [chapterLink.get('href')]
		chapterTitles += [chapterLink.getText().strip()]

	chapterDates: List[int] = []
	for chapterLink in chapterLinks:
		if chapterLink.find('time') is None:
			continue
		timeElement = chapterLink.find('time')
		if timeElement.get('unixtime'):
			chapterDates += [int(timeElement.get('unixtime'))]
		else:
			chapterDates += [
				util.parseDateAsUnix(timeElement.get('title'), fic.fetched)
			]

	fic.published = OilTimestamp(min(chapterDates))
	fic.updated = OilTimestamp(max(chapterDates))
	fic.chapterCount = len(chapterUrls)
	if fic.wordCount is None:
		fic.wordCount = 0
	fic.upsert()

	for cid in range(1, fic.chapterCount + 1):
		chapter = fic.chapter(cid)
		chapter.url = self.baseUrl + chapterUrls[cid - 1]
		if chapterUrls[cid - 1].startswith('/fiction/chapter/'):
			# alternate chapter syntax if the chapter itself has no slug
			# /fiction/chapter/<lcid>fid=<lid>&fslug=<fic slug>
			chapter.localChapterId = (
				chapterUrls[cid - 1].split('/')[3].split('?')[0]
			)
		else:
			# standard chapter syntax
			# /fiction/<lid>/<fic slug>/chapter/<lcid>/<chapter slug>
			chapter.localChapterId = chapterUrls[cid - 1].split('/')[5]
		chapter.title = chapterTitles[cid - 1]
		if chapter.title is not None and len(chapter.title) > 0:
			chapter.title = util.cleanChapterTitle(chapter.title, cid)
		chapter.upsert()

	# recompute the word count from the cached chapter html
	wordCount = 0
	for cid in range(1, fic.chapterCount + 1):
		chapter = fic.chapter(cid)
		if chapter.html() is None:
			chapter.cache()
			chapter.upsert()
		chtml = chapter.html()
		if chtml is not None:
			wordCount += len(chtml.split())
	fic.wordCount = wordCount

	return fic
def extractSearchMetadata(
	self,
	html: str,
	metas: 'Optional[Dict[str, AdultFanfictionMeta]]' = None
) -> Dict[str, AdultFanfictionMeta]:
	"""Extract story metadata from an adultfanfiction search results page.

	Each result table becomes an AdultFanfictionMeta keyed by story url in
	`metas`; an existing entry is only replaced by a newer one. Pass an
	existing dict to accumulate across pages.

	FIX: the default used to be a mutable `{}`, silently shared across all
	calls that omitted `metas` (classic mutable-default-argument bug).
	"""
	from bs4 import BeautifulSoup
	if metas is None:
		metas = {}
	# archive slug -> fandom for whole-archive fandoms
	archiveFandomMap = {
		'naruto': 'Naruto',
		'hp': 'Harry Potter',
		'xmen': 'X-Men',
	}
	# 'Located' suffix -> fandom for category-located fandoms
	locatedFandomMap = [
		('Mass Effect', 'Mass Effect'),
		('Metroid', 'Metroid'),
		('Pokemon', 'Pokemon'),
		('Sonic', 'Sonic'),
		('Witcher 3: Wild Hunt', 'Witcher'),
	]
	# character names recognized in 'Located: .../Char1/Char2' pairings
	chars = [
		'Harry', 'Hermione', 'Snape', 'Draco', 'Sirius', 'Remus', 'Lucius',
		'Ron', 'Voldemort', 'Ginny', 'Charlie', 'Lily', 'Scorpius', 'James',
		'George', 'Fred', 'Narcissa', 'Blaise', 'Bill', 'Luna', 'Albus',
		'Severus', 'Fenrir', 'Tonks', 'Rose', 'Neville', 'Cho', 'Cedric',
		'Tom', 'Seamus', 'Pansy', 'Bellatrix', 'Viktor', 'Percy', 'Dudley',
		'McGonagall', 'Lavendar', 'Dumbledore', 'Naruto', 'Sasuke',
		'Kakashi', 'Iruka', 'Sakura', 'Itachi', 'Gaara', 'Shikamaru',
		'Neji', 'Rock Lee', 'Hinata', 'Ino', 'Shino', 'Danzo'
	]
	spaceSqueezeRe = re.compile(r'\s+')  # renamed from typo'd spaceSqeeezeRe
	searchSoup = BeautifulSoup(html, 'html5lib')
	resultTables = searchSoup.findAll('table', {'width': '90%'})
	for resultTable in resultTables:
		meta = AdultFanfictionMeta()
		links = resultTable.findAll('a')
		titleLink = links[0]
		meta.title = titleLink.getText()
		meta.url = titleLink.get('href')
		authorLink = links[1]
		meta.author = authorLink.getText().strip()
		meta.authorUrl = authorLink.get('href').strip()
		assert (meta.authorUrl is not None)
		meta.authorId = meta.authorUrl.split('=')[-1]
		trs = resultTable.findAll('tr')
		publishedText = trs[0].getText()
		RegexMatcher(publishedText, {
			'published': (r'Published\s+:\s+(.+)', str),
		}).matchAll(meta)
		assert (meta.published is not None)
		meta.published = util.parseDateAsUnix(meta.published, int(time.time()))
		extendedMetadata = trs[1].getText()
		util.logMessage(extendedMetadata, 'tmp_e_meta_aff.log')
		# TODO: dragon prints are actually views, not followCount/favoriteCount
		RegexMatcher(
			extendedMetadata, {
				'chapterCount': (r'Chapters\s*:\s*(\d+)', int),
				'updated': (r'Updated\s+:\s+(.+?)-:-', str),
				'reviewCount?': (r'Reviews\s+:\s+(\d+)', int),
				'views?': (r'Dragon prints\s+:\s+(\d+)', int),
				'located?': (r'Located\s*:\s*(.*)', str)
			}
		).matchAll(meta)
		assert (meta.updated is not None)
		meta.updated = util.parseDateAsUnix(meta.updated, int(time.time()))
		meta.description = str(trs[2])
		meta.description = util.filterUnicode(meta.description)
		meta.description = spaceSqueezeRe.sub(' ', meta.description)
		meta.setTags(str(trs[3]))
		if 'COMPLETE' in meta.tags or 'Complete.' in meta.tags:
			meta.ficStatus = FicStatus.complete
		assert (meta.url is not None)
		ficId = FicId.tryParseUrl(meta.url)
		assert (ficId is not None)
		meta.localId = ficId.localId
		meta.archive = meta.localId.split('/')[0]
		meta.storyNo = meta.localId.split('/')[1]
		if meta.archive.lower() in archiveFandomMap:
			meta.fandoms += [archiveFandomMap[meta.archive.lower()]]
		meta.located = meta.located or ''
		loclow = meta.located.lower()
		for locFan in locatedFandomMap:
			if loclow.endswith(locFan[0].lower()):
				meta.fandoms += [locFan[1]]
		for c1 in chars:
			for c2 in chars:
				if loclow.endswith('{}/{}'.format(c1, c2).lower()):
					meta.chars += [c1, c2]
		# TODO: try parse category, get chars
		#meta.info()
		# keep only the newest metadata seen for each story url
		if meta.url not in metas or meta.isNewerThan(metas[meta.url]):
			metas[meta.url] = meta
	return metas
def parseInfoInto(self, fic: Fic, html: str) -> Fic:
	"""Parse an eFiction-style viewstory page into `fic`.

	Title and author come from the #pagetitle links, the description from
	the SUMMARY comment markers, counts from the 'Story Information' block,
	and dates from PUBLISHED/UPDATED comment markers in that block's html.
	Non-crossover stories are tagged with the Harry Potter fandom.
	"""
	from bs4 import BeautifulSoup
	soup = BeautifulSoup(html, 'html.parser')
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	pagetitle = soup.find(id='pagetitle')
	aTags = pagetitle.findAll('a')
	author = None
	# viewstory link => title; viewuser.php?uid= link => author
	for a in aTags:
		href = a.get('href')
		if href.startswith('viewstory'):
			fic.title = a.contents[0].strip()
		elif href.startswith('viewuser.php?uid='):
			author = a.contents[0]
			authorUrl = self.baseUrl + href
			authorId = str(int(href[len('viewuser.php?uid='):]))
			self.setAuthor(fic, author, authorUrl, authorId)
	if fic.title is None:
		raise Exception('unable to find title')
	if author is None:
		raise Exception('unable to find author')
	# split the raw html so each tag starts a line, then capture everything
	# between the SUMMARY START/END html comment markers
	# NOTE(review): the line carrying the START marker itself is also
	# appended (the flag is checked after being set) -- confirm intended
	lines = html.replace('\r', '\n').replace('<', '\n<').split('\n')
	inDescription = False
	description = ''
	for line in lines:
		cur = line.strip()
		if cur.find('!-- SUMMARY START --') != -1:
			inDescription = True
		elif cur.find('!-- SUMMARY END --') != -1:
			inDescription = False
		if inDescription == True:
			description += cur + '\n'
	fic.description = description
	fic.ageRating = '<unkown>'
	# locate the 'Story Information' block among the page's div.block's
	infoBlock = None
	infoText = None
	blocks = soup.findAll('div', {'class': 'block'})
	for block in blocks:
		title = block.find('div', {'class': 'title'})
		if title is None:
			continue
		if title.contents[0] != 'Story Information':
			continue
		infoBlock = block
		infoText = block.get_text()
		break
	else:
		raise Exception('unable to find info text')
	matcher = RegexMatcher(
		infoText, {
			'chapterCount': ('Chapters:\s+(\d+)', int),
			'wordCount': ('Word count:\s+(\S+)', int),
		}
	)
	matcher.matchAll(fic)
	# review count appears in the #sort div as 'Reviews - N]'
	sortDiv = soup.find(id='sort')
	match = re.search('Reviews\s*-\s*([^\]]+)', sortDiv.get_text())
	if match is not None:
		fic.reviewCount = int(match.group(1).replace(',', ''))
	else:
		fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	# dates are embedded as html comment markers inside the info block
	infoBlockHtml = str(infoBlock)
	match = re.search(
		'<!-- PUBLISHED START -->([^<]*)<!-- PUBLISHED END -->', infoBlockHtml
	)
	if match is not None:
		publishedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	match = re.search(
		'<!-- UPDATED START -->([^<]*)<!-- UPDATED END -->', infoBlockHtml
	)
	if match is not None:
		updatedUts = util.parseDateAsUnix(match.group(1), fic.fetched)
		fic.updated = OilTimestamp(updatedUts)
	if fic.updated is None:
		fic.updated = fic.published
	match = re.search('Completed:\s+(\S+)', infoText)
	if match is not None:
		complete = match.group(1)
		if complete == 'No':
			fic.ficStatus = FicStatus.ongoing
		elif complete == 'Yes':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown complete value: {}'.format(complete))
	match = re.search('Crossovers', infoText)
	if match is not None:
		pass
		# raise Exception('Found unknown crossover in {0}: {1}'.format(fic.id, fic.url))
	else:
		# otherwise not a crossover and just harry potter
		fic.add(Fandom.define('Harry Potter'))
	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a ?psid= style archive story page into `fic`.

	Metadata comes from the single table.storymaininfo block; the title is
	found via the adult-content disclaimer link (or a plain ?psid= link).
	Always tagged with the Harry Potter fandom.
	"""
	from bs4 import BeautifulSoup  # type: ignore
	soup = BeautifulSoup(wwwHtml, 'html.parser')
	storyMainInfo = soup.findAll('table', {'class': 'storymaininfo'})
	if len(storyMainInfo) != 1:
		raise Exception('unable to find main story info')
	storyMainInfo = storyMainInfo[0]
	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?
	# the title link either goes through the js age disclaimer or points
	# directly at this story's ?psid= url
	disclaimerJs = "javascript:if (confirm('Please note. This story may contain adult themes. By clicking here you are stating that you are over 17. Click cancel if you do not meet this requirement.')) location = '?psid="
	for a in soup.findAll('a'):
		href = a.get('href')
		if (not href.startswith(disclaimerJs)
			and href != '?psid={}'.format(fic.localId)):
			continue
		fic.title = a.getText()
		break
	else:
		raise Exception('error: unable to find title')
	fic.url = self.constructUrl(fic.localId)
	storySummaryTable = soup.findAll('table', {'class': 'storysummary'})
	if len(storySummaryTable) != 1:
		raise Exception('cannot find story summary table')
	storySummaryTable = storySummaryTable[0]
	fic.description = (storySummaryTable.getText().strip())
	if fic.description is None:
		raise Exception('error: unable to find description')
	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0
	# \xa0 is &nbsp;; normalize so the regexes below match plain spaces
	text = storyMainInfo.getText().replace('\xa0', ' ')
	matcher = RegexMatcher(
		text, {
			'ageRating': ('Rating:\s+(Mature|15\+|12\+)', str),
			'chapterCount': ('Chapters:\s+(\d+)', int),
			'wordCount': ('Words:\s+(\d+)', int),
			'reviewCount': ('Story Reviews:\s*(\d+)', int),
			'favoriteCount': ('Favorite Story Of:\s+(\d+) users', int),
			'updated': ('Last Updated:\s+(\S+)', str),
			'published': ('First Published:\s+(\S+)', str),
		}
	)
	matcher.matchAll(fic)
	# convert scraped date strings into OilTimestamps; a missing Updated
	# field falls back to the published date
	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)
	if fic.chapterCount is None:
		fic.chapterCount = 1
	match = re.search('Status:\s+(Completed|Work In Progress|Abandoned)', text)
	if match is None:
		raise Exception('cannot find write status')
	status = match.group(1)
	if status == 'Completed':
		fic.ficStatus = FicStatus.complete
	elif status == 'Work In Progress':
		fic.ficStatus = FicStatus.ongoing  # should these be abandoned?
	elif status == 'Abandoned':
		fic.ficStatus = FicStatus.abandoned
	else:
		raise Exception('unknown status: {}'.format(status))
	# the author is the first viewuser.php?showuid= link on the page
	for a in soup.findAll('a'):
		a_href = a.get('href')
		if a_href.startswith('viewuser.php?showuid='):
			author = a.get_text()
			authorUrl = self.baseUrl + '/' + a_href
			authorId = a_href[len('viewuser.php?showuid='):]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))
	# TODO: chars/pairings?
	fic.add(Fandom.define('Harry Potter'))
	return fic
def parseInfoInto(self, fic: Fic, wwwHtml: str) -> Fic:
	"""Parse a fictionpress.com story page into `fic` and its chapters.

	Mirrors the fanfiction.net profile_top layout: scrapes title,
	description, stats, status, author and genre links, then upserts the
	fic and one chapter row per chapter.
	"""
	from bs4 import BeautifulSoup  # type: ignore
	deletedFicText = 'Story Not FoundUnable to locate story. Code 1.'
	soup = BeautifulSoup(wwwHtml, 'html5lib')
	profile_top = soup.find(id='profile_top')

	# story might've been deleted
	if profile_top is None:
		gui_warnings = soup.find_all('span', {'class': 'gui_warning'})
		for gui_warning in gui_warnings:
			if gui_warning.get_text() == deletedFicText:
				fic.ficStatus = FicStatus.abandoned
				fic.upsert()
				return fic

	text = profile_top.get_text()
	pt_str = str(profile_top)

	fic.fetched = OilTimestamp.now()
	fic.languageId = Language.getId("English")  # TODO: don't hard code?

	# the title is the lone <b class="xcontrast_txt"> in the profile block
	for b in profile_top.find_all('b'):
		b_class = b.get('class')
		if len(b_class) == 1 and b_class[0] == 'xcontrast_txt':
			fic.title = b.get_text()
			break
	else:
		raise Exception('error: unable to find title:\n{}\n'.format(pt_str))

	fic.url = self.constructUrl(fic.localId, 1, fic.title)

	for div in profile_top.find_all('div'):
		div_class = div.get('class')
		if (
			div.get('style') == 'margin-top:2px' and len(div_class) == 1
			and div_class[0] == 'xcontrast_txt'
		):
			fic.description = div.get_text()
			break
	else:
		raise Exception('error: unable to find description:\n{}\n'.format(pt_str))

	# default optional fields
	fic.reviewCount = 0
	fic.favoriteCount = 0
	fic.followCount = 0

	matcher = RegexMatcher(
		text, {
			'ageRating': (r'Rated:\s+Fiction\s*(\S+)', str),
			'chapterCount?': (r'Chapters:\s+(\d+)', int),
			'wordCount': (r'Words:\s+(\S+)', int),
			'reviewCount?': (r'Reviews:\s+(\S+)', int),
			'favoriteCount?': (r'Favs:\s+(\S+)', int),
			'followCount?': (r'Follows:\s+(\S+)', int),
			'updated?': (r'Updated:\s+(\S+)', str),
			'published': (r'Published:\s+(\S+)', str),
		}
	)
	matcher.matchAll(fic)

	# convert scraped date strings into OilTimestamps; a missing Updated
	# field falls back to the published date
	if fic.published is not None:
		publishedUts = util.parseDateAsUnix(fic.published, fic.fetched)
		fic.published = OilTimestamp(publishedUts)
	if fic.updated is None:
		fic.updated = fic.published
	elif fic.updated is not None:
		updatedUts = util.parseDateAsUnix(fic.updated, fic.fetched)
		fic.updated = OilTimestamp(updatedUts)

	if fic.chapterCount is None:
		fic.chapterCount = 1

	match = re.search(r'Status:\s+(\S+)', text)
	if match is None:
		fic.ficStatus = FicStatus.ongoing
	else:
		status = match.group(1)
		if status == 'Complete':
			fic.ficStatus = FicStatus.complete
		else:
			raise Exception('unknown status: {}'.format(status))

	# the first /u/ link is the author profile
	for a in profile_top.find_all('a'):
		a_href = a.get('href')
		if a_href.startswith('/u/'):
			author = a.get_text()
			authorUrl = self.baseUrl + a_href
			authorId = a_href.split('/')[2]
			self.setAuthor(fic, author, authorUrl, authorId)
			break
	else:
		raise Exception('unable to find author:\n{}'.format(text))

	# FIX: guard against a missing #pre_story_links block; the ffnet
	# variant of this parser already guards it -- previously this crashed
	# with AttributeError when the block was absent
	preStoryLinks = soup.find(id='pre_story_links')
	preStoryLinksLinks = []
	if preStoryLinks is not None:
		preStoryLinksLinks = preStoryLinks.find_all('a')
	for a in preStoryLinksLinks:
		href = a.get('href')
		hrefParts = href.split('/')
		# if it's a top level category
		if (
			len(hrefParts) == 3 and len(hrefParts[0]) == 0
			and len(hrefParts[2]) == 0
		):
			cat = hrefParts[1]
			if cat in fictionPressCategories:
				continue  # skip categories
			raise Exception('unknown category: {}'.format(cat))
		# if it's a regular genre in some category
		if (
			len(hrefParts) == 4 and len(hrefParts[0]) == 0
			and len(hrefParts[3]) == 0
		):
			# ensure category is in our map
			if hrefParts[1] not in fictionPressCategories:
				raise Exception('unknown category: {}'.format(hrefParts[1]))
			# ensure it's in our whitelist
			if hrefParts[2] not in fictionPressGenres:
				util.logMessage(f'FictionPressAdapter: unknown genre {hrefParts[2]}')
				continue
			fic.add(Fandom.define(hrefParts[2]))
			continue
		util.logMessage(f'FictionPressAdapter: unknown genre {fic.id}: {href}')
		continue

	fic.upsert()

	chapterTitles = []
	if fic.chapterCount > 1:
		chapterSelect = soup.find(id='chap_select')
		chapterOptions = []
		if chapterSelect is not None:
			chapterOptions = chapterSelect.findAll('option')
		chapterTitles = [co.getText().strip() for co in chapterOptions]

	for cid in range(fic.chapterCount):
		ch = fic.chapter(cid + 1)
		ch.localChapterId = str(cid + 1)
		# NOTE(review): unlike the ffnet variant, ch.url is not set here --
		# presumably filled elsewhere; confirm
		if len(chapterTitles) > cid:
			ch.title = util.cleanChapterTitle(chapterTitles[cid], cid + 1)
		elif fic.chapterCount == 1 and cid == 0:
			ch.title = fic.title
		ch.upsert()

	return fic