def getLastPage(db: 'psycopg2.connection', scraper: WebScraper, url: str) -> int:
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        raise Exception(f"uhoh {w.url}")
    html = dec[1]
    soup = BeautifulSoup(html, 'html5lib')
    lcWrap = soup.find('div', {'class': 'lc-wrapper'})
    if lcWrap is None:
        return 1
    maxSeen = 1
    stub = '/'.join([''] + url.split('/')[3:6] + [''])
    for a in lcWrap.findAll('a'):
        if a is None or a.getText() is None:
            continue
        href = a.get('href')
        if href is not None:
            if href.startswith(stub):
                maxSeen = max(maxSeen, int(href.split('/')[-2]))
        if a.getText().strip() != 'Last':
            continue
        ps = href.split('/')
        return int(ps[-2])
    return maxSeen
def prescrapeCommunity(db: 'psycopg2.connection', scraper: WebScraper,
        comm: FFNCommunity) -> None:
    assert (comm.id is not None)
    deathCode = FFNCommunity.isDead(db, comm.id)
    if deathCode != 0:
        plog(f"skipping community {comm.id} {comm.stub}, already dead: {deathCode}")
        return
    plog(f"prescraping community {comm.id} {comm.stub}")

    # grab the first page to get counts
    url = comm.getUrl()
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        plog(f" {comm.id} has unknown encoding")
        return
    html = dec[1]
    if len(html) < 1:
        plog(f" {comm.id} is freshly dead: 1")
        FFNCommunity.bury(db, comm.id, 1, w.created)
        return  # nothing further to scrape for a dead community

    pages = getPageCount(comm, html)
    if pages > 1:
        plog(f" total pages: {pages}")
    for page in range(1, pages + 1):
        if pages > 1:
            plog(f" grabbing page {page}/{pages}")
        scraper.softScrape(comm.getUrl(page))
def lookupAbbreviatedFandoms() -> None:
    global fandomStubMap
    ks = [k for k in dict.keys(fandomStubMap)]
    ks.sort()
    ks.reverse()
    cnt = 0
    for k in ks:
        if not fandomStubMap[k].endswith('...'):
            continue
        #print(f"{k}: {fandomStubMap[k]}")
        purl = f"{baseUrl}/{k}"
        #print(purl)
        cnt += 1
        w = scraper.softScrape(purl)
        assert (w.url is not None)
        dec = enc.decode(w.response, w.url)
        assert (dec is not None)
        html = dec[1]
        soup = BeautifulSoup(html, 'html5lib')
        title = soup.find('title').getText().strip()
        sufs = [' | FanFiction', 'FanFiction Archive']
        hasSuf = True
        while hasSuf:
            hasSuf = False
            for suf in sufs:
                if title.endswith(suf):
                    hasSuf = True
                    title = title[:-len(suf)].strip()
                    break
        #print(f"{k} => {title}")
        fandomStubMap[k] = title
def scrape(source: WebSource, workerId: int, url: str, triesLeft: int = 3) -> Web:
    if triesLeft <= 0:
        raise Exception('scrape: exceeded retry count')
    created = int(time.time() * 1000)
    r = make_req(workerId, url)
    w = Web(created_=created, url_=url, status_=r.status, sourceId_=source.id)
    w.response = r.response
    w.requestHeaders = None  # str(r.request.headers).encode('utf-8')
    w.responseHeaders = None  # str(r.headers).encode('utf-8')
    if w.status == 200:
        dec = enc.decode(w.response, url)
        if dec is not None and dec[0] is not None:
            w.encoding = Encoding.lookup(oil.open(), dec[0]).id
        if dec is not None and dec[1] is not None:
            title = ''
            try:
                title = extractTitle(dec[1]).strip()
            except:
                pass
            if title == 'Just a moment...' \
                    or title == 'Attention Required! | Cloudflare':
                plog(f'scrape: got 200 status CF page, retrying: {triesLeft - 1}')
                time.sleep(9 + random.random() * 2)
                return scrape(source, workerId, url, triesLeft - 1)
    w.save(oil.open())
    return w
def prescrape(scraper: RemoteWebScraper, url: str) -> None:
    print(url)
    w = scraper.softScrape(url)
    assert (w.url is not None)
    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]
    print(f' len: {len(html)}')
    print(html)
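# A minimal usage sketch for prescrape() above: wire the scraper up against a
# database handle. This assumes oil.open() returns the same psycopg2 connection
# used elsewhere in this codebase; the story URL below is purely hypothetical.
def prescrapeExample() -> None:
    db = oil.open()
    scraper = RemoteWebScraper(db)
    prescrape(scraper, 'https://www.fanfiction.net/s/1234/1')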
def main(db: 'psycopg2.connection') -> None:
    if len(sys.argv) != 2:
        raise Exception("expected wid")
    wid = int(sys.argv[1])
    some = Web.fetchIdRange(db, wid, wid + 1)
    if len(some) != 1:
        raise Exception("TODO")
    w = some[0]
    assert (w.url is not None and w.created is not None)
    if not w.url.startswith('https://www.fanfiction.net/s/'):
        raise Exception("not a ffn url")
    fid = int(w.url.split('/')[4])
    print(f"fid: {fid}")

    response = w.response
    if response is None and w.wbaseId is not None:
        wbase = WebBase.lookup(db, w.wbaseId)
        if wbase is None:
            raise Exception("has null web_base")
        response = wbase.response
    if response is None or len(response) < 1:
        print("response is null")
        return

    dec = enc.decode(response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f" dead: {code}")
        c = FFNFic.bury(db, fid, code, w.created, True)
        print(c)
        #print(html)
    else:
        plog(f" {fid} healthy?")
        print(html)
        try:
            ffnParser = FFNParser()
            ts = int(w.created / 1000)
            fic = ffnParser.get(db, fid, ts, BeautifulSoup(html, 'html5lib'))
            plog(f"{fic.__dict__}")
        except:
            plog(f"{w.url} is broken")
            #with open(f"./edump_{fid}_{cid}.html", 'w') as f:
            #    f.write(html)
            raise
def prescrapeUid(db: 'psycopg2.connection', scraper: WebScraper, uid: int) -> None:
    plog(f"prescraping uid {uid}")
    url = getUrl(uid)
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        plog(f" {uid} has unknown encoding")
        return
    html = dec[1]
    code = extractFFNUserDeathCode(html)
    if code != 0:
        plog(f" {uid} is freshly dead: {code}")
        FFNUser.bury(db, uid, code, w.created)
def dumpRequest(w: Web, f: IO) -> None:
    assert (w.url is not None and w.created is not None)
    #plog(f"{w.url} {len(w.response)}")
    url = w.url
    ts = int(w.created / 1000)
    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    # try to abbreviate down to just the meta info
    realStartIdx = html.find('id=pre_story_links')
    if realStartIdx > -1:
        realStartIdx = html.rfind('<div', 0, realStartIdx)
    startIdx = -2
    if realStartIdx > -1:
        startIdx = html.find('<div id=profile_top', realStartIdx)
    endIdx = -3
    if startIdx > -1:
        endIdx = html.find("class='lc-wrapper'", startIdx)
        if endIdx < 0:
            endIdx = html.find("id='storytextp'", startIdx)
    if endIdx > startIdx and startIdx > realStartIdx:
        html = html[realStartIdx:endIdx] + '>'

    soup = BeautifulSoup(html, 'html5lib')
    profile_top = soup.find(id='profile_top')
    if profile_top is None:
        return
    for t in ['script']:
        for e in soup.findAll(t):
            e.decompose()

    fid = url[len(urlPrefix):].split('/')[0]
    cid = url[len(urlPrefix):].split('/')[1]
    profile_top['id'] = f"profile_top_{fid}_{cid}"
    profile_top['data-fetched'] = ts
    profile_top_str = str(profile_top)

    f.write(f"<!-- start wid {w.id} -->\n".encode('utf-8'))
    #f.write(profile_top_str.encode('utf-8'))
    f.write(soup.find('body').encode_contents())
    f.write(f"<!-- {w.id} end wid -->\n".encode('utf-8'))
    return
def prescrape(scraper: WebScraper, url: str) -> None:
    print(f"url: {url}")
    w = scraper.softScrape(url)
    responseSize = len(w.response) if w.response is not None else 0
    print(f"\tresponse size: {responseSize}B")
    print(f"\trequest headers: {w.requestHeaders!r}")
    print(f"\tresponse headers: {w.responseHeaders!r}")
    dec = enc.decode(w.response, url)
    if dec is None:
        print("\tunknown encoding")
        return
    print(f"\tencoding: {dec[0]}")
    html = dec[1]
    soup = BeautifulSoup(html, 'html5lib')
    print(f"\tdecoded size: {len(html)}B")
def scrape(self, url: str) -> Web:
    if self.staleOnly:
        logMessage(f'staleScrape|{url}', 'scrape.log')
        wl = Web.latest(self.db, url)
        if wl is None:
            raise Exception(f'failed to stale scrape url: {url}')
        return wl

    logMessage(f'scrape|{url}', 'scrape.log')
    created = int(time.time()) * 1000
    w = Web(created_=created, url_=url, sourceId_=self.source.id)
    try:
        import requests
        global defaultRequestTimeout
        r = requests.get(url, headers=self.headers, cookies=self.cookies,
                timeout=defaultRequestTimeout)
        w.status = r.status_code
        w.response = r.content
        w.requestHeaders = str(r.request.headers).encode('utf-8')
        w.responseHeaders = str(r.headers).encode('utf-8')
        w.finalUrl = r.url
    except:
        logMessage(f'scrape|exception|{url}', 'scrape.log')
        raise

    fuzz = getFuzz()
    # subtract out request time from fuzz
    fuzz -= (int(time.time() * 1000) - created) / 1000
    # TODO: delay *before* scrape based on domain
    time.sleep(max(fuzz, .1) + getFuzz(0.01, 0.1))
    self.last_ts = created

    if w.status != 200:
        w.save(self.db)
        raise Exception(f'failed to download url {w.url}: {w.status}')

    dec = enc.decode(w.response, url)
    if dec is not None and dec[0] is not None:
        w.encoding = Encoding.lookup(self.db, dec[0]).id
    w.save(self.db)
    return w
def prescrapeFid(db: 'psycopg2.connection', scraper: RemoteWebScraper, fid: int,
        cid: int) -> None:
    plog(f"prescraping fid {fid} cid {cid}")
    code = FFNFic.isDead(db, fid)
    if code != 0:
        plog(f" {fid} is dead: {code}")
        return
    url = getUrl(fid, cid)
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        plog(f" {fid}/{cid} has unknown encoding")
        return
    html = dec[1]
    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f" {fid} is freshly dead: {code}")
        FFNFic.bury(db, fid, code)
def processCategory(db: 'psycopg2.connection', scraper: RemoteWebScraper,
        category: FFNCategory) -> None:
    assert (category.id is not None)
    url = category.getCrossoverUrl()
    print(url)
    w = scraper.softScrape(url)
    dec = enc.decode(w.response, url)
    if dec is None:
        plog(f" {category.id} has unknown encoding")
        return
    html = dec[1]
    if len(html) < 1:
        plog(f" {category.id} is freshly dead: 1")
        return

    baseUrl = 'https://www.fanfiction.net'
    soup = BeautifulSoup(html, 'html5lib')
    for a in soup.findAll('a'):
        if not a.has_attr('href'):
            continue
        href = str(a.get('href'))
        if not href.startswith('/crossovers/'):
            continue
        parts = href.split('/')
        if len(parts) != 5:
            continue
        stub = parts[2]
        fandomId = int(parts[3])
        name = str(a.text)
        if len(stub) > 254:
            continue  # TODO oh god why
            # https://www.fanfiction.net/anime/Do-You-Love-Your-Mom-and-Her-Two-Hit-Multi-Target-Attacks%3F-%E9%80%9A%E5%B8%B8%E6%94%BB%E6%92%83%E3%81%8C%E5%85%A8%E4%BD%93%E6%94%BB%E6%92%83%E3%81%A7%E4%BA%8C%E5%9B%9E%E6%94%BB%E6%92%83%E3%81%AE%E3%81%8A%E6%AF%8D%E3%81%95%E3%82%93%E3%81%AF%E5%A5%BD%E3%81%8D%E3%81%A7%E3%81%99%E3%81%8B%EF%BC%9F/?&srt=1&r=10
        print(baseUrl + href)
        print(f"{fandomId} {stub} => {name}")
        ffnFandom = FFNFandom.lookup(db, category.id, stub, remoteId=fandomId)
        print(f"{ffnFandom.remoteId} {ffnFandom.stub} => {ffnFandom.name}")
        ffnFandom.markHasCrossovers(db)
        print(ffnFandom.getAllCrossoversUrl())
def refreshMeta(db: 'psycopg2.connection', scraper: RemoteWebScraper, fid: int) -> int:
    plog(f" refreshing fid {fid} meta")
    fic = FFNFic.lookup(db, fid)
    if fic is not None and fic.chapterCount is not None:
        plog(f" old chapterCount: {fic.chapterCount}")

    url = getUrl(fid, 1)
    w = scraper.scrape(url)
    assert (w.url is not None and w.created is not None)

    response = w.response
    if response is None and w.wbaseId is not None:
        wbase = WebBase.lookup(db, w.wbaseId)
        if wbase is None:
            raise Exception("has null web_base")
        response = wbase.response
    if response is None or len(response) < 1:
        raise Exception(f'refreshMeta: unable to find response for {fid}')

    dec = enc.decode(response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f" dead: {code}")
        c = FFNFic.bury(db, fid, code, w.created, True)
        return code

    try:
        ffnParser = FFNParser()
        ts = int(w.created / 1000)
        pfic = ffnParser.get(db, fid, ts, BeautifulSoup(html, 'html5lib'))
    except:
        raise
    return 0
def testLid(db: 'psycopg2.connection', lid: int) -> None:
    url = f'https://www.fanfiction.net/s/{lid}/1'
    scraper = RemoteWebScraper(db)
    w = scraper.softScrape(url)
    assert (w.created is not None)
    dec = enc.decode(w.response, url)
    if dec is None:
        plog(f" {url} has unknown encoding")
        sys.exit(1)
    html = dec[1]
    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f" {url} is freshly dead: {code}")
        return
    soup = BeautifulSoup(html, 'html5lib')
    parser = minerva.ffn.parser.FFNParser()
    fic = parser.get(db, lid, w.created // 1000, soup)
    print(fic.__dict__)
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 3:
        raise Exception("expected wid range")
    wid_s = int(sys.argv[1])
    wid_e = int(sys.argv[2])
    some = Web.fetchIdRange(db, wid_s, wid_e)
    for w in some:
        if w.response is None or len(w.response) < 1:
            continue
        assert (w.url is not None)
        dec = enc.decode(w.response, w.url)
        if dec is None:
            continue
        html = dec[1]
        with open(f"./{w.id}.html", "w") as f:
            f.write(html)
    return 0
def getCategories(scraper: RemoteWebScraper) -> Set[str]:
    categoryBlacklist = [
        'support', 'cookies', 'privacy', 'tos', 'betareaders', 'forums',
        'communities', 'j', '', 'u', 's', 'crossovers'
    ]
    categories: Set[str] = set()
    root = scraper.softScrape(baseUrl)
    assert (root.url is not None)
    dec = enc.decode(root.response, root.url)
    assert (dec is not None)
    html = dec[1]
    soup = BeautifulSoup(html, 'html5lib')
    #print(len(html))
    for a in soup.findAll('a'):
        href = urllib.parse.urljoin(baseUrl, a.get('href'))
        href = stripAfter(href, '#')
        href = stripAfter(href, '?')
        if not href.startswith(baseUrl):
            continue
        end = href[len(baseUrl):]
        if end.find('/') < 0:
            continue
        category = end.split('/')[1]
        if category in categoryBlacklist:
            continue
        ffnCategory = FFNCategory.lookup(db, category, a.getText().strip())
        #print(f"{category}: {ffnCategory.id} {ffnCategory.name}")
        categories |= {category}
        #print(category)
        #print(f"{a.get('href')} {href}")
def handleContent(db: 'psycopg2.connection', c: FFNFicContent
        ) -> Optional[Dict[str, Any]]:
    #if fid % stripeCount != stripe: return
    id_ = f'{c.fid}/{c.cid}'
    dec = enc.decode(c.content, id_)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]
    try:
        # try to grab just the story content
        md = htmlToMd(html)
        return {
            '_id': id_,
            'fid': c.fid,
            'cid': c.cid,
            'content': md,
        }
        #res = es.index(index="ffn", id=id_, body=doc)
        #print(res['result'])
    except:
        plog(f"{c.wid} is broken")
        with open(f"./edump/edump_wid_{c.wid}.html", 'w') as f:
            f.write(html)
        plog(traceback.format_exc())
    return None
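# A hedged sketch of bulk-indexing handleContent() results into Elasticsearch.
# The '_id' key in the returned dict matches the action format expected by
# elasticsearch.helpers.bulk. This helper is illustrative only: it assumes a
# reachable local cluster, the elasticsearch-py client, and List from typing;
# the host and index name are placeholders.
def indexContentsExample(db: 'psycopg2.connection',
        contents: 'List[FFNFicContent]') -> None:
    from elasticsearch import Elasticsearch, helpers
    es = Elasticsearch('http://localhost:9200')
    actions = []
    for c in contents:
        doc = handleContent(db, c)
        if doc is None:
            continue
        # '_index' and '_id' become document metadata; the rest is the source
        actions.append({'_index': 'ffn', **doc})
    helpers.bulk(es, actions)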
def handleStoryPage(db: 'psycopg2.connection', w: Web, stripeCount: int,
        stripe: int) -> None:
    assert (w.url is not None and w.created is not None and w.id is not None)
    global storyUrlPrefix
    if not w.url.startswith(storyUrlPrefix):
        return
    url = w.url
    ts = int(w.created / 1000)
    fid = int(url[len(storyUrlPrefix):].split('/')[0])
    cid = int(url[len(storyUrlPrefix):].split('/')[1])
    if fid % stripeCount != stripe:
        return
    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]
    deathCode = extractFFNDeathCode(html)
    if deathCode != 0:
        #print(f" {fid} is dead: {deathCode}")
        return
    #plog(f"{w.url} {len(w.response)}: {fid}/{cid}")
    try:
        # try to grab just the story content
        content = extractContent(html)
        FFNFicContent.upsert(db, fid, cid, w.id, content, stripe)
        #plog(f"{w.url} has content len: {len(content)}")
    except:
        plog(f"{w.url} is broken")
        with open(f"./edump_{fid}_{cid}.html", 'w') as f:
            f.write(html)
        plog(traceback.format_exc())
        raise
def prescrape(scraper: WebScraper, wq: WebQueue) -> Optional[Web]:
    assert (wq.url is not None)
    print(f"url: {wq.url}")
    w = scraper.softScrape(wq.url)
    assert (w.created is not None)
    #print(f" {w.created} {wq.musty}")
    if wq.musty is not None and w.created < wq.musty:
        print(f" musty, rescraping")
        w = scraper.scrape(wq.url)
    assert (w.url is not None and w.response is not None)
    print(f"\tresponse size: {len(w.response)}B")
    #print(f"\trequest headers: {w.requestHeaders}")
    #print(f"\tresponse headers: {w.responseHeaders}")
    dec = enc.decode(w.response, w.url)
    if dec is None:
        print("\tunknown encoding")
        return None
    print(f"\tencoding: {dec[0]}")
    html = dec[1]
    print(f"\tdecoded size: {len(html)}B")
    return w
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 2:
        raise Exception("expected wid")
    wid = int(sys.argv[1])
    some = Web.fetchIdRange(db, wid, wid + 1)
    if len(some) != 1:
        raise Exception("TODO")
    w = some[0]
    if w.response is None or len(w.response) < 1:
        return 0
    assert (w.url is not None)
    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]
    print(html)
    return 0
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 3:
        raise Exception("expected wid range")
    wid_s = int(sys.argv[1])
    wid_e = int(sys.argv[2])

    maxId = Web.maxId(db)
    if wid_s > maxId:
        return 0
    # note whether the requested range runs past the newest id before clamping,
    # so the partial archive names below can actually trigger
    partial = wid_e > maxId
    wid_e = min(wid_e, maxId)

    wid_s_s = str(wid_s).zfill(10)
    wid_e_s = str(wid_e).zfill(10)
    xzfname = f"data_{wid_s_s}_{wid_e_s}.tar.xz"
    mfname = f"./manifest_{wid_s_s}_{wid_e_s}.tsv"
    if partial:
        xzfname = f"data_{wid_s_s}_{wid_e_s}_partial.tar.xz"
        mfname = f"./manifest_{wid_s_s}_{wid_e_s}_partial.tsv"

    ffnLike = 'https://www.fanfiction.net/%'
    with tarfile.open(xzfname, 'w:xz') as xzf:
        # compute manifest
        manifest_s = 'id\ttimestamp\turl\tlength\tmd5\n'
        for w in Web.fetchIdRange_g(db, wid_s, wid_e, ulike=ffnLike, status=200):
            if w.response is None or len(w.response) < 1:
                continue
            assert (w.url is not None and w.created is not None)
            dec = enc.decode(w.response, w.url)
            if dec is None:
                continue
            html = dec[1]
            ts = int(w.created / 1000)
            html = f"<!--\t{ts}\t{w.url}\t-->\n" + html
            h = hashlib.md5(html.encode('utf-8')).hexdigest()
            l = len(html.encode('utf-8'))
            manifest_s += f"{w.id}\t{ts}\t{w.url}\t{l}\t{h}\n"

        # write raw manifest
        with open(mfname, "w") as mf:
            mf.write(manifest_s)

        # save manifest to txz
        s = io.BytesIO(manifest_s.encode('utf-8'))
        ti = tarfile.TarInfo(name=mfname)
        ti.size = len(manifest_s.encode('utf-8'))
        xzf.addfile(tarinfo=ti, fileobj=s)

        # save individual requests to txz
        for w in Web.fetchIdRange_g(db, wid_s, wid_e, ulike=ffnLike, status=200):
            if w.response is None or len(w.response) < 1:
                continue
            assert (w.url is not None and w.created is not None)
            dec = enc.decode(w.response, w.url)
            if dec is None:
                continue
            html = dec[1]
            ts = int(w.created / 1000)
            html = f"<!--\t{ts}\t{w.url}\t-->\n" + html
            s = io.BytesIO(html.encode('utf-8'))
            ti = tarfile.TarInfo(name=f"./{w.id}.html")
            ti.mtime = int(w.created // 1000)
            ti.size = len(html.encode('utf-8'))
            xzf.addfile(tarinfo=ti, fileobj=s)
    return 0
def prescrapeFandom(db: 'psycopg2.connection', scraper: RemoteWebScraper,
        fandom: FFNFandom, scrollDate: int, recentThreshold: int,
        crossover: bool = False) -> None:
    # TODO fandom graveyard? look at community scraper
    assert (fandom.id is not None)
    plog(f"prescraping fandom {fandom.id} {fandom.stub}, crossover: {crossover}")

    lastCompleted = FFNFandomDeltaResult.lastCompleted(db, fandom.id,
            crossover=crossover)
    if lastCompleted is not None and lastCompleted > recentThreshold:
        plog(f" completed recently: {lastCompleted} > {recentThreshold}")
        return

    deltaResult = FFNFandomDeltaResult.create(db, fandom.id, crossover=crossover)

    page = 1
    pages = 1
    fanMinTs = None
    fanMaxTs = None
    while page <= pages:
        if pages > 1:
            plog(f" grabbing page {page}/{pages}")
        url = fandom.getUrl(db, page) if not crossover \
                else fandom.getAllCrossoversUrl(page)
        w = scraper.softScrape(url)
        dec = enc.decode(w.response, url)
        if dec is None:
            plog(f" {fandom.id} has unknown encoding")
            return
        html = dec[1]
        if len(html) < 1:
            plog(f" {fandom.id} is freshly dead: 1")
            #minerva.buryCommunity(comm.id, 1, w.created)
            return
        soup = BeautifulSoup(html, 'html5lib')
        pages = getPageCount(db, fandom, soup, crossover)
        page += 1

        ficTs = getFicTimestamps(soup)
        if len(ficTs) == 0:
            break
        minTs = getMinFicTs(ficTs)
        maxTs = getMaxFicTs(ficTs)
        if fanMinTs is None:
            fanMinTs = minTs
        if fanMaxTs is None:
            fanMaxTs = maxTs
        if minTs is not None:
            assert (fanMinTs is not None)
            fanMinTs = min(fanMinTs, minTs)
        if maxTs is not None:
            assert (fanMaxTs is not None)
            fanMaxTs = max(fanMaxTs, maxTs)

        deltaResult.update(db, page - 1, pages, fanMinTs, fanMaxTs)

        if maxTs is not None and maxTs <= scrollDate:
            break

    deltaResult.finish(db, page - 1, pages, fanMinTs, fanMaxTs)
    return categories


categories = getCategories(scraper)
#print(categories)

fandomNameMap: Dict[str, Optional[int]] = {}
fandomIdMap: Dict[int, str] = {}
fandomStubMap: Dict[str, str] = {}

for category in categories:
    url = f"{baseUrl}/{category}/"
    w = scraper.softScrape(url)
    assert (w.url is not None)
    dec = enc.decode(w.response, w.url)
    assert (dec is not None)
    html = dec[1]
    soup = BeautifulSoup(html, 'html5lib')
    for a in soup.findAll('a'):
        href = urllib.parse.urljoin(w.url, a.get('href'))
        href = stripAfter(href, '#')
        href = stripAfter(href, '?')
        if not href.startswith(url):
            continue
        if href == url:
            continue
        fandomName = stripAfter(href[len(url):], '/')
        fandomName = urllib.parse.unquote(fandomName)
        fandomName = f"{category}/{fandomName}"