def v0_fic_all(urlId: str) -> Any:
	"""Return JSONable fic info plus decoded content for every chapter.

	Returns Err.urlId_not_found when the urlId is unknown or the fic has no
	chapterCount, and Err.cid_not_found when a chapter row is missing.
	Chapters whose content fails to decode are skipped (best-effort) but
	now logged instead of silently swallowed.
	"""
	fics = Fic.select({'urlId': urlId})
	if len(fics) != 1:
		return Err.urlId_not_found.get()
	fic = fics[0]
	if fic.chapterCount is None:
		print(f'err: fic has no chapter count: {fic.id}')
		return Err.urlId_not_found.get()
	# map chapterId -> FicChapter up front so gaps can be detected cheaply
	ficChapters = {
		fc.chapterId: fc
		for fc in FicChapter.select({'ficId': fic.id})
	}
	chapters = {}
	for cid in range(1, fic.chapterCount + 1):
		if cid not in ficChapters:
			return Err.cid_not_found.get({'arg': f'{fic.id}/{cid}'})
		chapter = ficChapters[cid]
		cres = chapter.toJSONable()
		try:
			content = cres['content']
			if content is not None:
				content = util.decompress(content)
				content = scrape.decodeRequest(content, f'{fic.id}/{cid}')
				content = cleanHtml(content)
				# sanity check: cleanHtml is expected to be idempotent
				if content != cleanHtml(content):
					print(
						f'v0_fic_all: {fic.id}/{cid} did not round-trip through cleanHtml'
					)
			cres['content'] = content
			chapters[cid] = cres
		except Exception as e:
			# best-effort: skip chapters whose content cannot be decoded,
			# but log the failure (was a bare `except: pass`, which also
			# swallowed KeyboardInterrupt/SystemExit and hid real bugs)
			print(f'v0_fic_all: {fic.id}/{cid}: failed to decode content: {e}')
	res = fic.toJSONable()
	return Err.ok({'info': res, 'chapters': chapters})
def softScrape(self, chapter: FicChapter) -> str:
	"""Fetch chapter html, preferring a previously scraped copy.

	If the cached page looks like a 'chapter not found' placeholder and the
	last scrape is older than 30 minutes, rescrape live; raise if content
	still cannot be found.
	"""
	fic = chapter.getFic()
	curl = self.constructUrl(fic.localId, chapter.chapterId, None)
	#util.logMessage(f'FictionPressAdapter.scrape: {curl}')
	url = scrape.getLastUrlLike(curl)
	if url is None:
		url = curl
	# check the raw value _before_ str(): previously the code did
	# str(...['raw']) first, so a None payload became the string 'None'
	# and the None check below it was dead code
	raw = skitter.softScrape(url)['raw']
	if raw is None:
		raise Exception('unable to scrape? FIXME')
	data = str(raw)
	if (
		data.lower().find('chapter not found.') != -1
		and data.lower().find("id='storytext'") == -1
	):
		ts = scrape.getMostRecentScrapeTime(url)
		if ts is None:
			raise Exception('no most recent scrape time? FIXME')
		# if we last scraped more than half an hour ago rescrape
		if int(time.time()) - ts > (60 * 30):
			url = self.constructUrl(fic.localId, chapter.chapterId, None)
			data = self.scrape(url)['raw']
			if data is None:
				raise Exception('unable to scrape? FIXME')
		if (
			data.lower().find('chapter not found.') != -1
			and data.lower().find("id='storytext'") == -1
		):
			raise Exception('unable to find chapter content {}'.format(url))
	return data
def importFic(fdata):
	# Recreate one fic (and its fandoms/characters/genres/tags/chapters)
	# from a dump produced by dumpDB-style export data.
	# NOTE(review): mutates the database via insert(); assumes fdata has
	# 'fandoms', 'characters', 'genres', 'tags', 'chapters' keys — verify
	# against the exporter.
	global ficImportRename
	# undo the field renames applied at export time
	ofic = inflateObject(fdata.copy(), ficImportRename)
	fic = Fic.new()
	for field in ofic:
		print('setting "{}" to "{}"'.format(field, ofic[field]))
		# poke attributes directly; bypasses any property/validation logic
		fic.__dict__[field] = ofic[field]
	# dates arrive as strings; fall back to "now" when unparseable
	fic.published = util.parseDateAsUnix(fic.published, int(time.time()))
	fic.updated = util.parseDateAsUnix(fic.updated, int(time.time()))
	print('setting "{}" to "{}"'.format('published', fic.published))
	print('setting "{}" to "{}"'.format('updated', fic.updated))
	print('adding "{}" ({}/{})'.format(fic.title, fic.type, fic.localId))
	# fic must be inserted before relations can be attached to it
	fic.insert()
	for fandom in fdata['fandoms']:
		print(' adding fandom "{}"'.format(fandom))
		fic.add(Fandom.define(fandom))
	for character in fdata['characters']:
		print(
			' adding character "{}" from fandom "{}"'.format(
				character['name'], character['fandom']
			)
		)
		fic.add(
			Character.define(Fandom.define(character['fandom']), character['name'])
		)
	for genre in fdata['genres']:
		print(' adding genre "{}"'.format(genre))
		fic.add(Genre.define(genre))
	for tag in fdata['tags']:
		print(' adding tag "{}"'.format(tag))
		fic.add(Tag.define(tag))
	# chapter keys are string cids in the dump; insert in numeric order
	cids = [int(cid) for cid in fdata['chapters']]
	cids.sort()
	for cid in cids:
		print(' adding chapter {}'.format(cid))
		ochap = fdata['chapters'][str(cid)]
		chapter = FicChapter.new()
		chapter.fic = fic
		chapter.ficId = fic.id
		chapter.chapterId = cid
		for field in ochap:
			chapter.__dict__[field] = ochap[field]
		# chapter html is stored on disk next to the dump, not in the dump
		contentPath = './content/{}/{}/{}/content.html'.format(
			fic.type, fic.localId, cid
		)
		if os.path.isfile(contentPath):
			html = None
			with open(contentPath, 'r') as f:
				html = f.read()
			print(' has content: {}'.format(len(html)))
			chapter.setHtml(html)
		# chapter row is inserted even when no content file exists
		chapter.insert()
def softScrape(self, chapter: FicChapter) -> str:
	"""Fetch chapter html, preferring a previously scraped copy.

	Ensures chapter.url/localChapterId are persisted first. If the cached
	page looks like a 'chapter not found' placeholder and the last scrape
	is older than 30 minutes, rescrape live; raise if content still cannot
	be found.
	"""
	if chapter.url is None:
		chapter.url = self.buildUrl(chapter)  # type: ignore
		chapter.localChapterId = str(chapter.chapterId)
		chapter.upsert()
	fic = chapter.getFic()
	# TODO should we be passing '%' instead of chapter.fic.title ?
	#url = scrape.getLastUrlLikeOrDefault(
	#	(self.constructUrl(fic.localId, chapter.chapterId, None),
	#	self.constructUrl(fic.localId, chapter.chapterId, fic.title)))
	curl = self.constructUrl(fic.localId, chapter.chapterId, None)
	#util.logMessage(f'FFNAdapter.scrape: {curl}')
	url = scrape.getLastUrlLike(curl)
	if url is None:
		url = curl
	# check the raw value _before_ str(): previously the code did
	# str(...['raw']) first, so a None payload became the string 'None'
	# and the None check below it was dead code
	raw = skitter.softScrape(url)['raw']
	if raw is None:
		raise Exception('unable to scrape? FIXME')
	data = str(raw)
	if (
		data.lower().find('chapter not found.') != -1
		and data.lower().find("id='storytext'") == -1
	):
		ts = scrape.getMostRecentScrapeTime(url)
		if ts is None:
			raise Exception('no most recent scrape time? FIXME')
		# if we last scraped more than half an hour ago rescrape
		if int(time.time()) - ts > (60 * 30):
			url = self.constructUrl(fic.localId, chapter.chapterId, None)
			data = self.scrape(url)['raw']
			if data is None:
				raise Exception('unable to scrape? FIXME')
		if (
			data.lower().find('chapter not found.') != -1
			and data.lower().find("id='storytext'") == -1
		):
			raise Exception('unable to find chapter content {}'.format(url))
	return data
def softScrape(self, chapter: FicChapter) -> Optional[str]:
	"""Scrape chapter.url, following javascript window.location redirects.

	Persists the redirected url back onto the chapter before recursing;
	raises on a direct redirect loop (url redirecting to itself).
	"""
	import scrape
	html = scrape.softScrape(chapter.url)
	if html is None:
		return html
	# TODO well this is a nightmare...
	if html.find('You are being redirected') < 0:
		return html
	import re
	# raw string with escaped dot: the old pattern "window.location" let
	# the unescaped `.` match any character
	match = re.search(r"window\.location = ['\"]([^'\"]*)['\"];", html)
	if match is None or match.group(1) is None:
		return html
	if chapter.url == match.group(1):
		raise Exception('redirect loop')
	chapter.url = match.group(1)
	chapter.upsert()
	return self.softScrape(chapter)
def tryParseUrl(self, url: str) -> Optional[FicId]:
	# Default behavior: resolve the url against already-known chapters,
	# then against already-known fics; anything else is unsupported here.
	matchingChapters = FicChapter.select({'url': url})
	if len(matchingChapters) == 1:
		chap = matchingChapters[0]
		fic = Fic.get((chap.ficId, ))
		if fic is not None:
			ficType = FicType(fic.sourceId)
			return FicId(ficType, fic.localId, chap.chapterId, False)
	matchingFics = Fic.select({'url': url})
	if len(matchingFics) == 1:
		fic = matchingFics[0]
		return FicId(FicType(fic.sourceId), fic.localId)
	raise NotImplementedError()
def tryParseUrl(self, url: str) -> Optional[FicId]:
	# Only urls under this adapter's base are in scope.
	if not url.startswith(self.baseUrl):
		return None
	# First try resolving against already-known chapters, then known fics.
	knownChapters = FicChapter.select({'url': url})
	if len(knownChapters) == 1:
		chap = knownChapters[0]
		fic = Fic.get((chap.ficId, ))
		if fic is not None:
			return FicId(FicType(fic.sourceId), fic.localId, chap.chapterId, False)
	knownFics = Fic.select({'url': url})
	if len(knownFics) == 1:
		fic = knownFics[0]
		return FicId(FicType(fic.sourceId), fic.localId)
	# Unknown url: expect the shape authors/<author>/<storyId>...html
	leftover = url[len(self.baseUrl):]
	if not leftover.endswith('.html'):
		return None
	parts = leftover.split('/')
	if len(parts) != 3 or parts[0] != 'authors':
		return None
	author = parts[1]
	storyId = parts[2]
	# strip the chapter/page suffix off the story file name
	for suffix in ('01a.html', '.html'):
		if storyId.endswith(suffix):
			storyId = storyId[:-len(suffix)]
	# note: seems to be safe to lowercase these
	lid = (author + '/' + storyId).lower()
	#print(lid)
	# make lid author/story ?
	# TODO: we need some sort of local lid mapping...
	raise NotImplementedError()
def dumpDB():
	# Export the entire database to a plain-dict structure (plus chapter
	# html written to ./content/...), suitable for re-import via importFic.
	data = {}
	# id -> row maps for every taxonomy table, used for name lookups below
	fandomMap = {f.id: f for f in Fandom.select()}
	characterMap = {c.id: c for c in Character.select()}
	genreMap = {g.id: g for g in Genre.select()}
	tagMap = {t.id: t for t in Tag.select()}
	data['fandoms'] = [fandomMap[k].name for k in fandomMap]
	data['characters'] = [
		{
			'name': characterMap[k].name,
			'fandom': fandomMap[characterMap[k].fandom_id].name
		} for k in characterMap
	]
	data['genres'] = [genreMap[k].name for k in genreMap]
	data['tags'] = [tagMap[k].name for k in tagMap]
	data['fics'] = {}
	# deflate rules: None drops the field, a string renames it
	frename = {'id': None, 'chapters': 'chapterCount'}
	crename = {
		'id': None,
		'ficId': None,
		'cid': None,
		'raw': None,
		'fic': None,
		'lastLine': None
	}
	# per-chapter default values; 'url' and 'lastModified' are overwritten
	# on every loop iteration below (shared mutable dict — intentional?)
	cdefaults = {
		'line': 0,
		'subLine': 0,
		'notes': None,
		'status': Status.ongoing,
		'fetched': None,
		'url': None
	}
	fics = Fic.select()
	for fic in fics:
		# fics are keyed by "<type>/<localId>"
		k = '{}/{}'.format(fic.type, fic.localId)
		o = fic.__dict__.copy()
		o = deflateObject(o, frename)
		o['fandoms'] = [f.name for f in fic.fandoms()]
		o['characters'] = [
			{
				'name': c.name,
				'fandom': fandomMap[c.fandom_id].name
			} for c in fic.characters()
		]
		o['tags'] = [t.name for t in fic.tags()]
		o['genres'] = [g.name for g in fic.genres()]
		co = {}
		ficChapters = FicChapter.select({'ficId': fic.id})
		for chapter in ficChapters:
			here = chapter.__dict__.copy()
			# NOTE(review): default url assumes an ffnet-style fic — confirm
			ffNetUrl = 'https://www.fanfiction.net/s/{}/{}/{}'.format(
				fic.localId, chapter.chapterId, util.urlTitle(fic.title)
			)
			cdefaults['url'] = ffNetUrl
			cdefaults['lastModified'] = here['fetched']
			here = deflateObject(here, crename, cdefaults)
			co[chapter.chapterId] = here
			# chapters without raw content have no html to write out
			if chapter.raw is None:
				continue
			contentPath = './content/{}/{}/{}/'.format(
				fic.type, fic.localId, chapter.chapterId
			)
			if not os.path.isdir(contentPath):
				os.makedirs(contentPath)
			with open(contentPath + 'content.html', 'w') as f:
				f.write(chapter.content())
		o['chapters'] = co
		data['fics'][k] = o
	return data
def tryParseUrl(self, url: str) -> Optional[FicId]:
	# Normalize scheme/host prefixes down to https:// (loop until stable,
	# since e.g. http://www. needs two conceptual rewrites)
	mapPrefixes = ['http://www.', 'http://', 'https://www.']
	hasPrefix = True
	while hasPrefix:
		hasPrefix = False
		for pref in mapPrefixes:
			if url.startswith(pref):
				hasPrefix = True
				url = 'https://' + url[len(pref):]
	# strip known navigation anchors and query suffixes
	endsToStrip = [
		'#main',
		'#work_endnotes',
		'#bookmark-form',
		'?view_adult=true',
		'?view_full_work=true',
		'?viewfullwork=true',
		'?show_comments=true',
	]
	for send in endsToStrip:
		if url.endswith(send):
			url = url[:-len(send)]
	# drop any remaining fragment / query string wholesale
	if url.find('#') >= 0:
		url = url[:url.find('#')]
	if url.find('?') >= 0:
		url = url[:url.find('?')]
	# TODO: this should probably return a FicId pointing to this chapter and
	# not just this fic in general...
	if url.find('/chapters/') >= 0 and url.find('/works/') < 0:
		# bare /chapters/ url: scrape the page to find its parent work
		meta = scrape.softScrapeWithMeta(url, delay=10)
		if meta is None or meta['raw'] is None or meta['status'] != 200:
			raise Exception('unable to lookup chapter: {}'.format(url))
		from bs4 import BeautifulSoup  # type: ignore
		soup = BeautifulSoup(meta['raw'], 'html5lib')
		for a in soup.find_all('a'):
			if a.get_text() == 'Entire Work':
				return self.tryParseUrl(self.baseUrl + a.get('href')[len('/works/'):])
		else:
			# for-else: no 'Entire Work' link was found on the page
			raise Exception('unable to lookup chapters entire work: {}'.format(url))
	# collection urls embed the canonical /works/ path; rebase onto baseUrl
	if url.startswith(self.collectionUrl) and url.find('/works/') != -1:
		url = self.baseUrl + url[url.find('/works/') + len('/works/'):]
	if not url.startswith(self.baseUrl):
		return None
	pieces = url[len(self.baseUrl):].split('/')
	lid = pieces[0]
	if len(lid) < 1 or not lid.isnumeric():
		return None
	ficId = FicId(FicType.ao3, lid)
	fic = Fic.tryLoad(ficId)
	if fic is None:
		# unknown fic: return the ambiguous id as-is
		return ficId
	# known fic + .../chapters/<localChapterId>: resolve the chapter number
	if len(pieces) >= 3 and pieces[1] == 'chapters' and pieces[2].isnumeric():
		localChapterId = pieces[2]
		mchaps = FicChapter.select(
			{
				'ficId': fic.id,
				'localChapterId': localChapterId
			}
		)
		if len(mchaps) == 1:
			ficId.chapterId = mchaps[0].chapterId
			ficId.ambiguous = False
	return ficId
def buildUrl(self, chapter: FicChapter) -> str: if len(chapter.url.strip()) > 0: return chapter.url return self.constructUrl(chapter.getFic().localId, chapter.chapterId)
def __init__(self, chapter: FicChapter, header: bool = True, markdown: bool = True, footer: bool = True):
	# Build a renderable view of one chapter: optional header block,
	# the chapter body, optional footer block, plus cumulative-length
	# bookkeeping used by wrap().
	self.chapter = chapter
	fic = chapter.getFic()
	# acronym of the fic title (first letter of each word), used as an
	# extra title marker when parsing the chapter html
	extraTitle = ''.join([s[0] for s in (fic.title or '').split()]).lower()
	# FIXME this handles symbols badly like "Foo (complete)"
	#util.logMessage(f"using {extraTitle} as extraTitle in ChapterView")
	content = self.chapter.cachedContent()
	if content is None:
		raise Exception('missing content? FIXME')
	contentView = HtmlView(content, markdown=markdown, extraTitles=[extraTitle, f"-{extraTitle}-"])
	# total word count over all rendered lines of the body
	self.totalWords = sum([len(l.split()) for l in contentView.text])
	# mixed list: plain strings, or [left, center, right] column triples
	self.text: List[Union[str, List[str]]] = []
	if header == True:
		descriptionView = HtmlView(fic.description or '{missing description}')
		self.text += [['', '"{}"'.format(fic.title), ''],
			['', 'by {}'.format(fic.getAuthorName()), ''],
			[
				'chapter {}'.format(chapter.chapterId), 'words: {}'.format(
					util.formatNumber(self.totalWords))
			]]
		# NOTE(review): chapterId >= 1 seems to always hold — presumably
		# intended to show the description on every chapter; confirm
		if chapter.chapterId >= 1:
			self.text += descriptionView.text
		if chapter.title is not None and len(chapter.title) > 0:
			self.text += [[
				'', 'Chapter {}: {}'.format(chapter.chapterId, chapter.title), ''
			]]
		# separate header from body unless the body already starts with a rule
		if len(contentView.text) > 0 and contentView.text[0] != '<hr />':
			self.text += ['<hr />']
	# number of header lines, so callers can skip past the header
	self.headerLength = len(self.text)
	self.text += contentView.text
	if footer == True:
		# separate body from footer unless the body already ends with a rule
		if len(contentView.text) > 0 and contentView.text[-1] != '<hr />':
			self.text += ['<hr />']
		if chapter.title is not None and len(chapter.title) > 0:
			self.text += [[
				'', 'Chapter {}: {}'.format(chapter.chapterId, chapter.title), ''
			]]
		self.text += [
			[
				'chapter {}'.format(chapter.chapterId),
				'words: {}'.format(util.formatNumber(self.totalWords))
			],
			[
				'"{}"'.format(fic.title),
				'by {}'.format(fic.getAuthorName())
			],
		]
	# cumulativeLength[i] = total character length of text[0..i-1]
	self.cumulativeLength = [0] * len(self.text)
	cumLen = 0
	for idx in range(len(self.text)):
		self.cumulativeLength[idx] = cumLen
		cumLen += len(self.text[idx])
	self.totalLength = cumLen
	# text prepended/appended to wrapped lines
	self.preWrap = ' ' * 1
	self.postWrap = ''
	# wrap cache: line index -> wrapped sub-lines, filled in by wrap()
	self.wtext: Dict[int, List[str]] = {}
	self.width: int = -1
	self.totalWrappedLines: int = -1
	self.cumulativeTotalWrappedLines: List[int] = []
	self.wrap(80)