def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Parse a ``story.php?no=N`` style url into a FicId.

    The url is split on ``/``; it must use ``http:`` or ``https:``, its host
    must end with ``self.urlFragments[0]``, and the first path segment must
    start with ``story.php?``. The local id is ``"<archive>/<number>"`` where
    ``<archive>`` is the first dot-separated label of the host and
    ``<number>`` is the integer value of the ``no`` query parameter.

    Returns:
        A FicId of type ``self.ftype``, with ``chapterId`` set when exactly
        one integer ``chapter`` parameter is present; None when the url does
        not have the expected shape, or when ``no`` is missing, repeated, or
        not an integer.
    """
    parts = url.split('/')
    # Need at least: scheme, '', host, first path segment.
    if len(parts) < 4:
        return None
    if parts[0] not in ('http:', 'https:'):
        return None

    host = parts[2]
    if not host.endswith(self.urlFragments[0]):
        return None
    if not parts[3].startswith('story.php?'):
        return None

    qs = urllib.parse.parse_qs(parts[3].split('?')[-1])
    storyNumbers = qs.get('no', [])
    if len(storyNumbers) != 1:
        return None
    try:
        storyNumber = int(storyNumbers[0])
    except ValueError:
        # A non-integer story number is just another unparseable url.
        return None

    archive = host.split('.')[0]
    ficId = FicId(self.ftype, '{}/{}'.format(archive, storyNumber))

    chapters = qs.get('chapter', [])
    if len(chapters) == 1:
        try:
            chapterId = int(chapters[0])
        except ValueError:
            # Malformed chapter: keep the story id, leave the chapter unset,
            # the same way a repeated chapter parameter is ignored.
            pass
        else:
            ficId.chapterId = chapterId
    return ficId
def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Parse ``<baseStoryUrl>?id=N[&chapter=M]`` into a FicId.

    The url must start with ``self.baseStoryUrl`` immediately followed by
    ``?``. The local id is the ``id`` query parameter normalized through
    ``int`` and back to ``str``.

    Returns:
        A FicId of type ``self.ftype``, with ``chapterId`` set when exactly
        one integer ``chapter`` parameter is present; None when the prefix
        does not match, or when ``id`` is missing, repeated, or not an
        integer.
    """
    prefix = self.baseStoryUrl + '?'
    if not url.startswith(prefix):
        return None

    qs = urllib.parse.parse_qs(url[len(prefix):])
    storyIds = qs.get('id', [])
    if len(storyIds) != 1:
        return None
    try:
        lid = int(storyIds[0])
    except ValueError:
        # A non-integer id means this is not a url we can parse.
        return None

    ficId = FicId(self.ftype, str(lid))

    chapters = qs.get('chapter', [])
    if len(chapters) == 1:
        try:
            chapterId = int(chapters[0])
        except ValueError:
            # Malformed chapter: keep the story id, leave the chapter unset.
            pass
        else:
            ficId.chapterId = chapterId
    return ficId
def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Parse ``<baseStoryUrl>?storyid=N[&chapno=M]`` into a FicId.

    The url must start with ``self.baseStoryUrl`` followed by ``?``. The
    local id is the raw ``storyid`` query value, which must be numeric
    (``str.isnumeric``).

    Returns:
        A FicId of type ``self.ftype``, with ``chapterId`` set when exactly
        one integer ``chapno`` parameter is present; None when the prefix
        does not match, or when ``storyid`` is missing, repeated, or not
        numeric.
    """
    if not url.startswith(self.baseStoryUrl):
        return None
    leftover = url[len(self.baseStoryUrl):]
    if not leftover.startswith('?'):
        return None

    qs = urllib.parse.parse_qs(leftover[1:])
    storyIds = qs.get('storyid', [])
    if len(storyIds) != 1:
        return None
    storyId = storyIds[0]
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O`, and a bad url should yield None rather than an error.
    if not storyId.isnumeric():
        return None

    ficId = FicId(self.ftype, storyId)

    chapters = qs.get('chapno', [])
    if len(chapters) == 1:
        try:
            chapterId = int(chapters[0])
        except ValueError:
            # Malformed chapter: keep the story id, leave the chapter unset.
            pass
        else:
            ficId.chapterId = chapterId
    return ficId
def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Parse ``http(s)://<author>.<site>/<story>[/Chapter_N]`` into a FicId.

    The url is split on ``/``; it must use ``http:`` or ``https:`` and its
    host must end with ``self.urlFragments[0]``. The local id is
    ``"<author>/<story>"`` where ``<author>`` is the first dot-separated
    label of the host and ``<story>`` is the first path segment.

    Returns:
        A FicId of type ``self.ftype``. When the second path segment is
        ``Chapter_<int>``, ``chapterId`` is set and ``ambiguous`` is cleared.
        None when the url does not have the expected shape or the story
        segment is empty.
    """
    parts = url.split('/')
    # Need at least: scheme, '', host, story segment.
    if len(parts) < 4:
        return None
    if parts[0] not in ('http:', 'https:'):
        return None

    host = parts[2]
    if not host.endswith(self.urlFragments[0]):
        return None

    storyLid = parts[3]
    if not storyLid:
        # e.g. "http://author.site/" -- there is no story in this url, so
        # do not fabricate an id like "author/".
        return None

    authorLid = host.split('.')[0]
    ficId = FicId(self.ftype, '{}/{}'.format(authorLid, storyLid))

    chapterPrefix = 'Chapter_'
    if len(parts) > 4 and parts[4].startswith(chapterPrefix):
        try:
            cid = int(parts[4][len(chapterPrefix):])
        except ValueError:
            # Malformed chapter number: keep the story id, leave the chapter
            # unset.
            pass
        else:
            ficId.chapterId = cid
            ficId.ambiguous = False
    return ficId
def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Parse ``<baseStoryUrl>?sid=N[&chapter=M]`` into a FicId.

    Before matching, a leading ``https://`` is rewritten to ``http://`` and
    ``http://hpfanficarchive.com`` is rewritten to
    ``http://www.hpfanficarchive.com``. The normalized url must then start
    with ``self.baseStoryUrl`` followed by ``?``. The local id is the ``sid``
    query parameter normalized through ``int`` and back to ``str``.

    Returns:
        A FicId of type ``self.ftype``, with ``chapterId`` set when exactly
        one integer ``chapter`` parameter is present; None when the prefix
        does not match, or when ``sid`` is missing, repeated, or not an
        integer.
    """
    if url.startswith("https://"):
        url = "http://" + url[len("https://"):]
    url = url.replace('http://hpfanficarchive.com', 'http://www.hpfanficarchive.com')

    if not url.startswith(self.baseStoryUrl):
        return None
    leftover = url[len(self.baseStoryUrl):]
    if not leftover.startswith('?'):
        return None

    qs = urllib.parse.parse_qs(leftover[1:])
    storyIds = qs.get('sid', [])
    if len(storyIds) != 1:
        return None
    try:
        storyId = int(storyIds[0])
    except ValueError:
        # A non-integer sid means this is not a url we can parse.
        return None

    ficId = FicId(self.ftype, str(storyId))

    chapters = qs.get('chapter', [])
    if len(chapters) == 1:
        try:
            chapterId = int(chapters[0])
        except ValueError:
            # Malformed chapter: keep the story id, leave the chapter unset.
            pass
        else:
            ficId.chapterId = chapterId
    return ficId
def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Parse ``<baseStoryUrl>?sid=N[&chapter=M]`` into a FicId.

    Before matching, the url is normalized: ``&textsize=0`` is removed,
    ``http://`` becomes ``https://``, ``https://siye`` becomes
    ``https://www.siye``, and a leading ``self.alternateBaseUrl`` is swapped
    for ``self.baseUrl``. The result must start with ``self.baseStoryUrl``
    followed by ``?``. The local id is the ``sid`` query parameter normalized
    through ``int`` and back to ``str``.

    Returns:
        A FicId of type ``self.ftype``, with ``chapterId`` set when exactly
        one integer ``chapter`` parameter is present; None when the prefix
        does not match, or when ``sid`` is missing, repeated, or not an
        integer.
    """
    url = url.replace('&textsize=0', '')
    url = url.replace('http://', 'https://')
    url = url.replace('https://siye', 'https://www.siye')
    if url.startswith(self.alternateBaseUrl):
        url = self.baseUrl + url[len(self.alternateBaseUrl):]

    if not url.startswith(self.baseStoryUrl):
        return None
    leftover = url[len(self.baseStoryUrl):]
    if not leftover.startswith('?'):
        return None

    qs = urllib.parse.parse_qs(leftover[1:])
    storyIds = qs.get('sid', [])
    if len(storyIds) != 1:
        return None
    try:
        storyId = int(storyIds[0])
    except ValueError:
        # A non-integer sid means this is not a url we can parse.
        return None

    ficId = FicId(self.ftype, str(storyId))

    chapters = qs.get('chapter', [])
    if len(chapters) == 1:
        try:
            chapterId = int(chapters[0])
        except ValueError:
            # Malformed chapter: keep the story id, leave the chapter unset.
            pass
        else:
            ficId.chapterId = chapterId
    return ficId
def tryParseUrl(self, url: str) -> Optional[FicId]:
    """Parse a work, collection-work, or chapter url into an ao3 FicId.

    Steps, in order:
      1. Normalize the scheme/host prefix to a bare ``https://``.
      2. Strip known trailing fragments and query flags, then any remaining
         ``#...`` and ``?...`` tail.
      3. For a url containing ``/chapters/`` but no ``/works/``, scrape the
         page and recurse on the target of its "Entire Work" link.
      4. Rewrite a url under ``self.collectionUrl`` that contains
         ``/works/`` so it sits under ``self.baseUrl``.
      5. Take the first path piece after ``self.baseUrl`` as the local id.
      6. If the fic loads via ``Fic.tryLoad`` and the url names a numeric
         chapter, resolve it through ``FicChapter.select``.

    Returns:
        A FicId of type ``FicType.ao3``, or None when the url is not under
        ``self.baseUrl`` or the local id is not numeric.

    Raises:
        Exception: if a chapter-only page cannot be fetched with status 200,
            or it has no "Entire Work" link.
    """
    # Keep rewriting until a full pass over the prefixes changes nothing.
    rewritten = True
    while rewritten:
        rewritten = False
        for prefix in ('http://www.', 'http://', 'https://www.'):
            if url.startswith(prefix):
                url = 'https://' + url[len(prefix):]
                rewritten = True

    # Known trailing noise, removed in this exact order.
    trailingNoise = (
        '#main',
        '#work_endnotes',
        '#bookmark-form',
        '?view_adult=true',
        '?view_full_work=true',
        '?viewfullwork=true',
        '?show_comments=true',
    )
    for suffix in trailingNoise:
        if url.endswith(suffix):
            url = url[:-len(suffix)]

    # Drop whatever fragment and query string remain.
    url = url.split('#', 1)[0]
    url = url.split('?', 1)[0]

    # TODO: this should probably return a FicId pointing to this chapter and
    # not just this fic in general...
    if '/chapters/' in url and '/works/' not in url:
        meta = scrape.softScrapeWithMeta(url, delay=10)
        if meta is None or meta['raw'] is None or meta['status'] != 200:
            raise Exception('unable to lookup chapter: {}'.format(url))
        from bs4 import BeautifulSoup  # type: ignore
        soup = BeautifulSoup(meta['raw'], 'html5lib')
        for anchor in soup.find_all('a'):
            if anchor.get_text() != 'Entire Work':
                continue
            # href is sliced past its leading '/works/' before being
            # appended to baseUrl.
            workPath = anchor.get('href')[len('/works/'):]
            return self.tryParseUrl(self.baseUrl + workPath)
        # No anchor matched.
        raise Exception('unable to lookup chapters entire work: {}'.format(url))

    if url.startswith(self.collectionUrl) and '/works/' in url:
        # Keep only what follows the first '/works/'.
        url = self.baseUrl + url.split('/works/', 1)[1]

    if not url.startswith(self.baseUrl):
        return None

    pieces = url[len(self.baseUrl):].split('/')
    lid = pieces[0]
    # ''.isnumeric() is False, so this also rejects an empty id.
    if not lid.isnumeric():
        return None

    ficId = FicId(FicType.ao3, lid)
    fic = Fic.tryLoad(ficId)
    if fic is None:
        # Nothing loaded for this id, so no chapter lookup is attempted.
        return ficId

    namesChapter = (
        len(pieces) >= 3
        and pieces[1] == 'chapters'
        and pieces[2].isnumeric()
    )
    if namesChapter:
        matches = FicChapter.select(
            {'ficId': fic.id, 'localChapterId': pieces[2]}
        )
        # Only an unambiguous single match pins the chapter.
        if len(matches) == 1:
            ficId.chapterId = matches[0].chapterId
            ficId.ambiguous = False

    return ficId