예제 #1
0
    def tryParseUrl(self, url: str) -> Optional[FicId]:
        """Try to turn a ``story.php?no=<n>`` url into a FicId.

        The url is split on ``/``. It is accepted only when the scheme is
        http or https, the host ends with ``self.urlFragments[0]``, the first
        path segment starts with ``story.php?``, and the query holds exactly
        one ``no`` value that parses as an integer. The local id is
        ``<first host label>/<story number>``.

        Returns:
            The FicId, with ``chapterId`` set when the query carries exactly
            one integer ``chapter`` value, or None when the url does not
            match. A non-integer ``no`` yields None rather than an uncaught
            ValueError; a non-integer ``chapter`` is ignored.
        """
        parts = url.split('/')
        if len(parts) < 4:
            return None

        host, page = parts[2], parts[3]
        if not host.endswith(self.urlFragments[0]):
            return None
        if parts[0] not in ('http:', 'https:'):
            return None
        if not page.startswith('story.php?'):
            return None

        # The query is taken as the text after the last '?' in the segment.
        qs = urllib.parse.parse_qs(page.split('?')[-1])
        if 'no' not in qs or len(qs['no']) != 1:
            return None

        try:
            storyNumber = int(qs['no'][0])
        except ValueError:
            # Not a story number, so this is not a url we can identify.
            return None

        archive = host.split('.')[0]
        ficId = FicId(self.ftype, '{}/{}'.format(archive, storyNumber))

        if 'chapter' in qs and len(qs['chapter']) == 1:
            try:
                ficId.chapterId = int(qs['chapter'][0])
            except ValueError:
                # Malformed chapter on an otherwise valid story url: keep the
                # story-level id and leave the chapter unset.
                pass

        return ficId
예제 #2
0
	def tryParseUrl(self, url: str) -> Optional[FicId]:
		"""Try to turn ``<self.baseStoryUrl>?id=<n>`` into a FicId.

		The url must start with ``self.baseStoryUrl`` followed by ``?`` and
		its query must hold exactly one ``id`` value that parses as an
		integer. The local id is that integer rendered back to a string.

		Returns:
			The FicId, with ``chapterId`` set when the query carries exactly
			one integer ``chapter`` value, or None when the url does not
			match. A non-integer ``id`` yields None rather than an uncaught
			ValueError; a non-integer ``chapter`` is ignored.
		"""
		prefix = self.baseStoryUrl + '?'
		if not url.startswith(prefix):
			return None

		qs = urllib.parse.parse_qs(url[len(prefix):])
		if 'id' not in qs or len(qs['id']) != 1:
			return None

		try:
			lid = int(qs['id'][0])
		except ValueError:
			# Not a story id, so this is not a url we can identify.
			return None
		ficId = FicId(self.ftype, str(lid))

		if 'chapter' in qs and len(qs['chapter']) == 1:
			try:
				ficId.chapterId = int(qs['chapter'][0])
			except ValueError:
				# Malformed chapter on an otherwise valid story url: keep
				# the story-level id and leave the chapter unset.
				pass

		return ficId
예제 #3
0
	def tryParseUrl(self, url: str) -> Optional[FicId]:
		"""Try to turn ``<self.baseStoryUrl>?storyid=<n>`` into a FicId.

		The url must start with ``self.baseStoryUrl`` followed by ``?`` and
		its query must hold exactly one numeric ``storyid`` value, which is
		used verbatim as the local id.

		Returns:
			The FicId, with ``chapterId`` set when the query carries exactly
			one integer ``chapno`` value, or None when the url does not
			match. A non-numeric ``storyid`` yields None; it is checked
			explicitly rather than with ``assert`` so the check still runs
			under ``python -O``. A non-integer ``chapno`` is ignored.
		"""
		if not url.startswith(self.baseStoryUrl):
			return None

		leftover = url[len(self.baseStoryUrl):]
		if not leftover.startswith('?'):
			return None

		qs = urllib.parse.parse_qs(leftover[1:])
		if 'storyid' not in qs or len(qs['storyid']) != 1:
			return None

		storyId = qs['storyid'][0]
		if not storyId.isnumeric():
			# Not a story id, so this is not a url we can identify.
			return None
		ficId = FicId(self.ftype, storyId)

		if 'chapno' in qs and len(qs['chapno']) == 1:
			try:
				ficId.chapterId = int(qs['chapno'][0])
			except ValueError:
				# Malformed chapter on an otherwise valid story url: keep
				# the story-level id and leave the chapter unset.
				pass

		return ficId
예제 #4
0
	def tryParseUrl(self, url: str) -> Optional[FicId]:
		"""Try to turn ``http(s)://<author>.<site>/<story>[/Chapter_<n>]`` into a FicId.

		The url is split on ``/``. It is accepted only when the scheme is
		http or https, the host ends with ``self.urlFragments[0]``, and the
		first path segment (the story) is non-empty. The local id is
		``<first host label>/<story segment>``.

		Returns:
			The FicId, or None when the url does not match. An empty story
			segment (e.g. a bare ``http://author.site/``) yields None rather
			than an id with an empty story part. When the next segment is
			``Chapter_<integer>``, ``chapterId`` is set and ``ambiguous`` is
			cleared; a ``Chapter_`` segment without an integer is ignored.
		"""
		parts = url.split('/')
		if len(parts) < 4:
			return None

		host, storyLid = parts[2], parts[3]
		if not host.endswith(self.urlFragments[0]):
			return None
		if parts[0] not in ('http:', 'https:'):
			return None
		if not storyLid:
			# Nothing after the host: this points at no particular story.
			return None

		authorLid = host.split('.')[0]
		ficId = FicId(self.ftype, '{}/{}'.format(authorLid, storyLid))

		chapterPrefix = 'Chapter_'
		if len(parts) > 4 and parts[4].startswith(chapterPrefix):
			try:
				cid = int(parts[4][len(chapterPrefix):])
			except ValueError:
				# Malformed chapter segment: keep the story-level id as is.
				pass
			else:
				ficId.chapterId = cid
				ficId.ambiguous = False

		return ficId
예제 #5
0
    def tryParseUrl(self, url: str) -> Optional[FicId]:
        """Try to turn an hpfanficarchive ``?sid=<n>`` story url into a FicId.

        Before matching, a leading ``https://`` is rewritten to ``http://``
        and ``http://hpfanficarchive.com`` is rewritten to its ``www.`` form.
        The result must start with ``self.baseStoryUrl`` followed by ``?``
        and its query must hold exactly one ``sid`` value that parses as an
        integer. The local id is that integer rendered back to a string.

        Returns:
            The FicId, with ``chapterId`` set when the query carries exactly
            one integer ``chapter`` value, or None when the url does not
            match. A non-integer ``sid`` yields None rather than an uncaught
            ValueError; a non-integer ``chapter`` is ignored.
        """
        if url.startswith("https://"):
            url = "http://" + url[len("https://"):]
        url = url.replace('http://hpfanficarchive.com',
                          'http://www.hpfanficarchive.com')

        if not url.startswith(self.baseStoryUrl):
            return None
        leftover = url[len(self.baseStoryUrl):]
        if not leftover.startswith('?'):
            return None

        qs = urllib.parse.parse_qs(leftover[1:])
        if 'sid' not in qs or len(qs['sid']) != 1:
            return None

        try:
            sid = int(qs['sid'][0])
        except ValueError:
            # Not a story id, so this is not a url we can identify.
            return None
        ficId = FicId(self.ftype, str(sid))

        if 'chapter' in qs and len(qs['chapter']) == 1:
            try:
                ficId.chapterId = int(qs['chapter'][0])
            except ValueError:
                # Malformed chapter on an otherwise valid story url: keep the
                # story-level id and leave the chapter unset.
                pass

        return ficId
예제 #6
0
    def tryParseUrl(self, url: str) -> Optional[FicId]:
        """Try to turn a siye ``?sid=<n>`` story url into a FicId.

        Before matching, ``&textsize=0`` is dropped, ``http://`` is rewritten
        to ``https://``, ``https://siye`` gains a ``www.`` prefix, and a url
        under ``self.alternateBaseUrl`` is rebased onto ``self.baseUrl``. The
        result must start with ``self.baseStoryUrl`` followed by ``?`` and
        its query must hold exactly one ``sid`` value that parses as an
        integer. The local id is that integer rendered back to a string.

        Returns:
            The FicId, with ``chapterId`` set when the query carries exactly
            one integer ``chapter`` value, or None when the url does not
            match. A non-integer ``sid`` yields None rather than an uncaught
            ValueError; a non-integer ``chapter`` is ignored.
        """
        url = url.replace('&textsize=0', '')
        url = url.replace('http://', 'https://')
        url = url.replace('https://siye', 'https://www.siye')
        if url.startswith(self.alternateBaseUrl):
            url = self.baseUrl + url[len(self.alternateBaseUrl):]

        if not url.startswith(self.baseStoryUrl):
            return None
        leftover = url[len(self.baseStoryUrl):]
        if not leftover.startswith('?'):
            return None

        qs = urllib.parse.parse_qs(leftover[1:])
        if 'sid' not in qs or len(qs['sid']) != 1:
            return None

        try:
            sid = int(qs['sid'][0])
        except ValueError:
            # Not a story id, so this is not a url we can identify.
            return None
        ficId = FicId(self.ftype, str(sid))

        if 'chapter' in qs and len(qs['chapter']) == 1:
            try:
                ficId.chapterId = int(qs['chapter'][0])
            except ValueError:
                # Malformed chapter on an otherwise valid story url: keep the
                # story-level id and leave the chapter unset.
                pass

        return ficId
예제 #7
0
	def tryParseUrl(self, url: str) -> Optional[FicId]:
		"""Try to resolve a work, chapter, or collection url to an ao3 FicId.

		The url is first normalized: its scheme/``www.`` prefix is rewritten to
		a bare ``https://`` and everything from the first ``#`` or ``?`` on is
		dropped. A chapter-only url (contains ``/chapters/`` but not
		``/works/``) is scraped to find its "Entire Work" link and that link
		is parsed instead. A url under ``self.collectionUrl`` that contains
		``/works/`` is rebased onto ``self.baseUrl``.

		Returns:
			None when the normalized url does not start with ``self.baseUrl``
			or the segment following it is not numeric. Otherwise a
			``FicType.ao3`` FicId for that work id. When ``Fic.tryLoad`` finds
			the fic and the url names a numeric chapter that matches exactly
			one ``FicChapter`` row, ``chapterId`` is set and ``ambiguous`` is
			cleared.

		Raises:
			Exception: a chapter-only url could not be scraped (no metadata,
				no raw body, or a non-200 status), or the scraped page has
				no "Entire Work" link.
		"""
		# Collapse 'http://', 'http://www.' and 'https://www.' to 'https://'.
		# Repeats until no listed prefix matches, so stacked prefixes are
		# handled too.
		mapPrefixes = ['http://www.', 'http://', 'https://www.']
		hasPrefix = True
		while hasPrefix:
			hasPrefix = False
			for pref in mapPrefixes:
				if url.startswith(pref):
					hasPrefix = True
					url = 'https://' + url[len(pref):]

		# Known trailing anchors / query flags that do not affect identity.
		endsToStrip = [
			'#main',
			'#work_endnotes',
			'#bookmark-form',
			'?view_adult=true',
			'?view_full_work=true',
			'?viewfullwork=true',
			'?show_comments=true',
		]
		for send in endsToStrip:
			if url.endswith(send):
				url = url[:-len(send)]
		# Drop any remaining fragment, then any remaining query string.
		if url.find('#') >= 0:
			url = url[:url.find('#')]
		if url.find('?') >= 0:
			url = url[:url.find('?')]

		# TODO: this should probably return a FicId pointing to this chapter and
		# not just this fic in general...
		# A chapter url without a work id: fetch the page to discover the work.
		if url.find('/chapters/') >= 0 and url.find('/works/') < 0:
			meta = scrape.softScrapeWithMeta(url, delay=10)
			if meta is None or meta['raw'] is None or meta['status'] != 200:
				raise Exception('unable to lookup chapter: {}'.format(url))
			from bs4 import BeautifulSoup  # type: ignore
			soup = BeautifulSoup(meta['raw'], 'html5lib')
			for a in soup.find_all('a'):
				if a.get_text() == 'Entire Work':
					# NOTE(review): assumes the href starts with '/works/' and
					# that self.baseUrl already ends with the works path, so
					# the two join into a work url -- confirm.
					return self.tryParseUrl(self.baseUrl + a.get('href')[len('/works/'):])
			else:
				# The loop has no break, so this runs whenever no matching
				# link caused a return above.
				raise Exception('unable to lookup chapters entire work: {}'.format(url))

		# Collection urls embed the work path; rebase it onto baseUrl.
		if url.startswith(self.collectionUrl) and url.find('/works/') != -1:
			url = self.baseUrl + url[url.find('/works/') + len('/works/'):]
		if not url.startswith(self.baseUrl):
			return None

		# First segment after baseUrl is the work id; it must be numeric.
		pieces = url[len(self.baseUrl):].split('/')
		lid = pieces[0]
		if len(lid) < 1 or not lid.isnumeric():
			return None

		ficId = FicId(FicType.ao3, lid)
		# Without a loaded fic there is nothing to map a chapter against, so
		# return the work-level id as is.
		fic = Fic.tryLoad(ficId)
		if fic is None:
			return ficId

		# '<lid>/chapters/<n>': translate the site's chapter id into our
		# chapterId, but only when the lookup is unambiguous.
		if len(pieces) >= 3 and pieces[1] == 'chapters' and pieces[2].isnumeric():
			localChapterId = pieces[2]
			mchaps = FicChapter.select(
				{
					'ficId': fic.id,
					'localChapterId': localChapterId
				}
			)
			if len(mchaps) == 1:
				ficId.chapterId = mchaps[0].chapterId
				ficId.ambiguous = False

		return ficId