Пример #1
0
	def tryParseUrl(self, url: str) -> Optional[FicId]:
		"""Try to turn a story url into a FicId.

		Expected shape: ``http(s)://<author>.<site>/<story>[/Chapter_<n>]``.
		The first label of the host name is used as the author id and the
		first path segment as the story id; the local id is
		``<author>/<story>``.

		Returns None when the url is not http/https, when its host does not
		end with ``self.urlFragments[0]``, or when it has no story segment.
		Otherwise returns a FicId built from ``self.ftype`` and the local id.
		If the segment after the story is ``Chapter_<n>`` with an integer
		``n``, the FicId's chapterId is set to ``n`` and it is marked as not
		ambiguous; a malformed chapter number is ignored instead of raising.
		"""
		parts = url.split('/')
		# 'scheme:', '', host, story, ... -- anything shorter has no story part
		if len(parts) < 4:
			return None
		if parts[0] not in ('http:', 'https:'):
			return None

		host, storyLid = parts[2], parts[3]
		if not host.endswith(self.urlFragments[0]):
			return None
		if not storyLid:
			# e.g. 'https://author.site/': nothing after the host, so there
			# is no story id to build a local id from
			return None

		authorLid = host.split('.')[0]
		ficId = FicId(self.ftype, '{}/{}'.format(authorLid, storyLid))

		chapterPrefix = 'Chapter_'
		if len(parts) > 4 and parts[4].startswith(chapterPrefix):
			try:
				cid = int(parts[4][len(chapterPrefix):])
			except ValueError:
				# 'Chapter_' or 'Chapter_abc': the story part is still valid,
				# so fall back to the whole-story (ambiguous) FicId
				return ficId
			ficId.chapterId = cid
			ficId.ambiguous = False

		return ficId
Пример #2
0
	def tryParseUrl(self, url: str) -> Optional[FicId]:
		"""Try to turn a work, chapter or collection url into an ao3 FicId.

		The url is first normalized: ``http://`` and a leading ``www.`` are
		rewritten to a bare ``https://`` prefix, and everything from the first
		``#`` or ``?`` onwards is dropped.

		Returns None when the normalized url does not start with
		``self.baseUrl`` or when the work id following it is not a non-empty
		run of ASCII digits. Otherwise returns ``FicId(FicType.ao3, <id>)``.
		When ``Fic.tryLoad`` finds the fic and the url has the form
		``<id>/chapters/<n>``, the chapter is looked up through
		``FicChapter.select`` and, if exactly one row matches, the FicId's
		chapterId is set and it is marked as not ambiguous.

		Raises Exception when a chapter-only url (``/chapters/`` without
		``/works/``) cannot be scraped or the scraped page has no usable
		'Entire Work' link.
		"""
		# Collapse scheme/www variants onto 'https://'. Repeat until nothing
		# matches so stacked prefixes ('http://www.www.') are reduced as well.
		schemePrefixes = ('http://www.', 'http://', 'https://www.')
		rewritten = True
		while rewritten:
			rewritten = False
			for prefix in schemePrefixes:
				if url.startswith(prefix):
					url = 'https://' + url[len(prefix):]
					rewritten = True

		# Drop the fragment and the query string. This also covers the known
		# decorations such as '#main', '#work_endnotes', '?view_adult=true'
		# or '?view_full_work=true', so they need no special casing.
		url = url.split('#', 1)[0].split('?', 1)[0]

		# TODO: this should probably return a FicId pointing to this chapter and
		# not just this fic in general...
		if '/chapters/' in url and '/works/' not in url:
			# A chapter-only url does not contain the work id; scrape the page
			# and follow its 'Entire Work' link to find it.
			meta = scrape.softScrapeWithMeta(url, delay=10)
			if meta is None or meta['raw'] is None or meta['status'] != 200:
				raise Exception('unable to lookup chapter: {}'.format(url))
			from bs4 import BeautifulSoup  # type: ignore
			soup = BeautifulSoup(meta['raw'], 'html5lib')
			for a in soup.find_all('a'):
				href = a.get('href')
				# anchors without an href cannot point at the work; skipping
				# them avoids slicing None below
				if a.get_text() != 'Entire Work' or not href:
					continue
				return self.tryParseUrl(self.baseUrl + href[len('/works/'):])
			raise Exception('unable to lookup chapters entire work: {}'.format(url))

		# Collection urls embed the work path; rebase it onto baseUrl.
		if url.startswith(self.collectionUrl) and '/works/' in url:
			url = self.baseUrl + url[url.find('/works/') + len('/works/'):]
		if not url.startswith(self.baseUrl):
			return None

		pieces = url[len(self.baseUrl):].split('/')
		# str.isnumeric() also accepts characters such as fractions and CJK
		# numerals, which are never valid ids; only accept ASCII digits.
		asciiDigits = set('0123456789')
		lid = pieces[0]
		if not lid or not asciiDigits.issuperset(lid):
			return None

		ficId = FicId(FicType.ao3, lid)
		fic = Fic.tryLoad(ficId)
		if fic is None:
			# without a loaded fic there is no fic.id to look chapters up by
			return ficId

		if len(pieces) >= 3 and pieces[1] == 'chapters':
			localChapterId = pieces[2]
			if localChapterId and asciiDigits.issuperset(localChapterId):
				mchaps = FicChapter.select(
					{
						'ficId': fic.id,
						'localChapterId': localChapterId
					}
				)
				# only pin the chapter when the lookup is unambiguous
				if len(mchaps) == 1:
					ficId.chapterId = mchaps[0].chapterId
					ficId.ambiguous = False

		return ficId