def v0_cache() -> ResponseReturnValue:
    apiKey = get_request_value('apiKey', '')
    if apiKey not in API_KEYS:
        return make_response({'err': -401, 'msg': 'unauthorized'}, 401)
    q = get_request_value('q', None)
    u = get_request_value('u', None)
    print(f'v0_cache: {q=}, {u=}')
    if (q is None or len(q.strip()) < 1) \
            and (u is None or len(u.strip()) < 1):
        print('v0_cache: q and u are empty')
        return page_not_found(NotFound())
    latest = None
    db = oil.open()
    if u:
        latest = Web.latest(db, ulike=u, status=200)
    else:
        latest = Web.latest(db, q, status=200)
    if latest is None or latest.response is None or latest.created is None:
        print(f'v0_cache: {q=}, {u=}: not found')
    else:
        print(f'v0_cache: {q=}, {u=}: found: len: {len(latest.response)}, url: {latest.url}, id: {latest.id}, created: {latest.created}')
    # note: latest may still be None here and is passed through to make_response_web
    return make_response_web(latest)
def main(db: 'psycopg2.connection') -> None:
    fileName = "./meta.gz"
    if len(sys.argv) == 3:
        global logFileName
        logFileName = f"./dmpt2fs_{sys.argv[2]}.log"
        fileName = f"./meta_{sys.argv[2]}.gz"
        plog(f"using log {logFileName}")
    plog(f"writing to {fileName}")

    maxId = Web.maxId(db)
    plog(f"maxId: {maxId}")

    roundTo = 100
    overshoot = 20
    start = 0
    end = maxId
    # round end up to the nearest multiple of roundTo
    end = int((end + roundTo - 1) / roundTo) * roundTo

    if len(sys.argv) == 2:
        start = int(sys.argv[1])
    if len(sys.argv) == 3:
        partCount = int(sys.argv[1])
        partIdx = int(sys.argv[2])
        per = int(math.floor(end / partCount))
        start = per * partIdx - overshoot
        if partIdx == partCount - 1:
            end += overshoot
        else:
            end = per * partIdx + per + overshoot
    plog(f"from {start} to {end}")

    blockSize = 1000
    fidx = start - blockSize
    dumpedBlockCount = 0
    with gzip.open(fileName, "wb") as fgz:
        while fidx < end:
            fidx += blockSize
            eidx = min(fidx + blockSize, end)
            plog(f" doing ids [{fidx}, {eidx})")
            some = Web.fetchIdRange(db, fidx, eidx,
                    ulike='https://www.fanfiction.net/s/%/%')
            for s in some:
                if s.response is None or len(s.response) < 1:
                    continue
                try:
                    dumpRequest(s, cast(IO, fgz))
                except SystemExit:
                    raise
                except:
                    plog(f"{s.id}|problem with {s.id} {s.url}")
                    with open(f"./{s.id}.html", "wb") as f:
                        f.write(s.response)
            if len(some) > 0:
                dumpedBlockCount += 1
def main(db: 'psycopg2.connection') -> None:
    if len(sys.argv) not in {1, 2, 4}:
        print(f"usage: {sys.argv[0]} [start [stripeCount stripe]]")
        sys.exit(1)
    if len(sys.argv) == 4:
        global logFileName
        logFileName = f"./process_story_{processType}_{sys.argv[2]}_{sys.argv[3]}.log"
        plog(f"using log {logFileName}")

    maxId = Web.maxId(db)
    plog(f"maxId: {maxId}")

    roundTo = 100
    overshoot = 20
    start = 0
    end = maxId
    end = int((end + roundTo - 1) / roundTo) * roundTo

    stripeCount = 1
    stripe = 0
    if len(sys.argv) >= 2:
        start = int(sys.argv[1])
    if len(sys.argv) >= 4:
        stripeCount = int(sys.argv[2])
        stripe = int(sys.argv[3])

    plog(f"stripe: {stripe}")
    plog(f"stripeCount: {stripeCount}")
    plog(f"from {start} to {end}")

    blockSize = 1000 * stripeCount
    FFNFicContent.createStripeTable(db, stripe)

    fidx = start - blockSize
    while fidx < end:
        fidx += blockSize
        eidx = min(fidx + blockSize, end)
        plog(f" doing ids [{fidx}, {eidx})")
        try:
            with db:
                for s in Web.fetchIdRange_g(
                        db, fidx, eidx,
                        ulike='https://www.fanfiction.net/s/%/%'):
                    if s.response is None or len(s.response) < 1:
                        continue
                    handlePage(db, s, stripeCount, stripe)
        except SystemExit:
            raise
        except:
            plog(f" trouble in ids [{fidx}, {eidx})")
            raise
def scrape(source: WebSource, workerId: int, url: str, triesLeft: int = 3
        ) -> Web:
    if triesLeft <= 0:
        raise Exception('scrape: exceeded retry count')
    created = int(time.time() * 1000)
    r = make_req(workerId, url)
    w = Web(created_=created, url_=url, status_=r.status, sourceId_=source.id)
    w.response = r.response
    w.requestHeaders = None  # str(r.request.headers).encode('utf-8')
    w.responseHeaders = None  # str(r.headers).encode('utf-8')
    if w.status == 200:
        dec = enc.decode(w.response, url)
        if dec is not None and dec[0] is not None:
            w.encoding = Encoding.lookup(oil.open(), dec[0]).id
        if dec is not None and dec[1] is not None:
            title = ''
            try:
                title = extractTitle(dec[1]).strip()
            except:
                pass
            # Cloudflare interstitials can come back with a 200 status; detect
            # them by page title and retry after a delay.
            if title == 'Just a moment...' \
                    or title == 'Attention Required! | Cloudflare':
                plog(f'scrape: got 200 status CF page, retrying: {triesLeft - 1}')
                time.sleep(9 + random.random() * 2)
                return scrape(source, workerId, url, triesLeft - 1)
    w.save(oil.open())
    return w
def prescrapeUidBlock(db: 'psycopg2.connection', scraper: WebScraper,
        start: int, end: int, stripeCount: int, stripe: int,
        minId: int, maxId: int) -> None:
    uids = [uid for uid in range(start, end) if uid % stripeCount == stripe]
    urls = [getUrl(uid) for uid in uids]
    wcache = Web.wcache(db, urls)
    random.shuffle(uids)

    needsScraped = False
    for url in urls:
        if url not in wcache:
            needsScraped = True
            break
    if not needsScraped:
        plog(f"skipping block [{start}, {end})")
        return

    plog(f"prescraping block [{start}, {end})")
    for uid in uids:
        if uid < minId or uid > maxId:
            continue
        if getUrl(uid) in wcache:
            continue
        prescrapeUid(db, scraper, uid)
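# A hypothetical driver sketch (not part of the original module) showing how
# prescrapeUidBlock might be looped over the whole uid space, mirroring the
# blockSize loops used by the other scripts in this repo. prescrapeUid, getUrl,
# WebScraper, and the min/max uid bounds are assumed to come from the
# surrounding module or configuration.
def prescrapeAllUids(db: 'psycopg2.connection', scraper: WebScraper,
        stripeCount: int, stripe: int, minId: int, maxId: int) -> None:
    blockSize = 1000
    # start on a block boundary at or below minId so striping stays aligned
    start = minId - (minId % blockSize)
    while start <= maxId:
        prescrapeUidBlock(db, scraper, start, start + blockSize,
                stripeCount, stripe, minId, maxId)
        start += blockSize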
def v0_crawl() -> ResponseReturnValue:
    apiKey = get_request_value('apiKey', '')
    if apiKey not in API_KEYS:
        return make_response({'err': -401, 'msg': 'unauthorized'}, 401)
    q = get_request_value('q', None)
    print(f'v0_crawl: {q=}')
    if q is None or len(q.strip()) < 1:
        return page_not_found(NotFound())
    if not q.startswith('http://') and not q.startswith('https://'):
        return page_not_found(NotFound())

    ts = int(time.time()) - 1
    db = oil.open()
    scraper = RemoteWebScraper(db)
    scraper.scrape(q)

    latest = Web.latest(db, q, status=200)
    if latest is None or latest.created is None:
        print(f'v0_crawl: {q=}: error: no latest entry')
        return make_response({'err': -500, 'msg': 'internal server error'}, 500)
    lts = int(latest.created // 1000)
    if lts < ts:
        print(f'v0_crawl: {q=}: error getting fresh crawl: {lts} < {ts}')
        return make_response({'err': -500, 'msg': 'internal server error'}, 500)
    return make_response_web(latest)
def testBlock(db: 'psycopg2.connection', start: int, end: int, cid: int) -> float:
    urls = [getUrl(fid, cid) for fid in range(start, end)]

    s1 = time.time()
    Web.wcache(db, urls)
    e1 = time.time()

    s2 = time.time()
    Web.latestMany(db, urls)
    e2 = time.time()

    t1 = e1 - s1
    t2 = e2 - s2
    print(f"{t1} {t2}: {t2 / t1}")
    return t2 / t1
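# A minimal benchmarking sketch (assumed usage, not part of the original file):
# average the latestMany/wcache timing ratio reported by testBlock over a few
# consecutive fid blocks. The starting fid, block count, block size, and cid
# are arbitrary example parameters.
def testManyBlocks(db: 'psycopg2.connection', start: int, blocks: int,
        blockSize: int = 100, cid: int = 1) -> float:
    ratios = []
    for b in range(blocks):
        s = start + b * blockSize
        ratios.append(testBlock(db, s, s + blockSize, cid))
    avg = sum(ratios) / len(ratios)
    print(f"avg latestMany/wcache ratio over {blocks} blocks: {avg}")
    return avg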
def main(db: 'psycopg2.connection') -> None:
    if len(sys.argv) != 2:
        raise Exception("expected wid")
    wid = int(sys.argv[1])
    some = Web.fetchIdRange(db, wid, wid + 1)
    if len(some) != 1:
        raise Exception("TODO")
    w = some[0]
    assert (w.url is not None and w.created is not None)
    if not w.url.startswith('https://www.fanfiction.net/s/'):
        raise Exception("not a ffn url")
    fid = int(w.url.split('/')[4])
    print(f"fid: {fid}")

    # fall back to the shared web_base body when the row itself has no response
    response = w.response
    if response is None and w.wbaseId is not None:
        wbase = WebBase.lookup(db, w.wbaseId)
        if wbase is None:
            raise Exception("has null web_base")
        response = wbase.response
    if response is None or len(response) < 1:
        print("response is null")
        return

    dec = enc.decode(response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f" dead: {code}")
        c = FFNFic.bury(db, fid, code, w.created, True)
        print(c)
        #print(html)
    else:
        plog(f" {fid} healthy?")
        print(html)
        try:
            ffnParser = FFNParser()
            ts = int(w.created / 1000)
            fic = ffnParser.get(db, fid, ts, BeautifulSoup(html, 'html5lib'))
            plog(f"{fic.__dict__}")
        except:
            plog(f"{w.url} is broken")
            #with open(f"./edump_{fid}_{cid}.html", 'w') as f:
            #    f.write(html)
            raise
def prescrapeBlock(db: 'psycopg2.connection', scraper: RemoteWebScraper,
        start: int, end: int, cid: int, stripeCount: int, stripe: int,
        maxId: int) -> None:
    end = min(maxId, end)
    needsCachedCount = Web.countFFNNeedsCached(db, start, end, cid,
            stripeCount, stripe)
    if needsCachedCount == 0:
        return

    fids = [fid for fid in range(start, end) if fid % stripeCount == stripe]
    if cid != 1:
        fids = FFNFic.getLiveFids(db, start, end, cid)
    if len(fids) < 1:
        return
    random.shuffle(fids)

    urls = [getUrl(fid, cid) for fid in fids]
    wcache = Web.wcache(db, urls)
    needsScraped = False
    for url in urls:
        if url not in wcache:
            needsScraped = True
            break
    if not needsScraped:
        plog(f"skipping block [{start}, {end}) cid:{cid}")
        return

    plog(f"prescraping block [{start}, {end}) cid:{cid}")
    for fid in fids:
        if fid > maxId:
            continue
        if getUrl(fid, cid) in wcache:
            continue
        prescrapeFid(db, scraper, fid, cid)
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 3:
        raise Exception("expected wid range")
    wid_s = int(sys.argv[1])
    wid_e = int(sys.argv[2])
    some = Web.fetchIdRange(db, wid_s, wid_e)
    for w in some:
        if w.response is None or len(w.response) < 1:
            continue
        assert (w.url is not None)
        dec = enc.decode(w.response, w.url)
        if dec is None:
            continue
        html = dec[1]
        with open(f"./{w.id}.html", "w") as f:
            f.write(html)
    return 0
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 2:
        raise Exception("expected wid")
    wid = int(sys.argv[1])
    some = Web.fetchIdRange(db, wid, wid + 1)
    if len(some) != 1:
        raise Exception("TODO")
    w = some[0]
    if w.response is None or len(w.response) < 1:
        return 0
    assert (w.url is not None)
    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]
    print(html)
    return 0
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 3:
        raise Exception("expected wid range")
    wid_s = int(sys.argv[1])
    wid_e = int(sys.argv[2])

    maxId = Web.maxId(db)
    if wid_s > maxId:
        return 0
    # if the requested range runs past maxId, mark the dump as partial
    partial = wid_e > maxId
    wid_e = min(wid_e, maxId)

    wid_s_s = str(wid_s).zfill(10)
    wid_e_s = str(wid_e).zfill(10)
    xzfname = f"data_{wid_s_s}_{wid_e_s}.tar.xz"
    mfname = f"./manifest_{wid_s_s}_{wid_e_s}.tsv"
    if partial:
        xzfname = f"data_{wid_s_s}_{wid_e_s}_partial.tar.xz"
        mfname = f"./manifest_{wid_s_s}_{wid_e_s}_partial.tsv"

    ffnLike = 'https://www.fanfiction.net/%'
    with tarfile.open(xzfname, 'w:xz') as xzf:
        # compute manifest
        manifest_s = 'id\ttimestamp\turl\tlength\tmd5\n'
        for w in Web.fetchIdRange_g(db, wid_s, wid_e, ulike=ffnLike, status=200):
            if w.response is None or len(w.response) < 1:
                continue
            assert (w.url is not None and w.created is not None)
            dec = enc.decode(w.response, w.url)
            if dec is None:
                continue
            html = dec[1]
            ts = int(w.created / 1000)
            html = f"<!--\t{ts}\t{w.url}\t-->\n" + html
            h = hashlib.md5(html.encode('utf-8')).hexdigest()
            l = len(html.encode('utf-8'))
            manifest_s += f"{w.id}\t{ts}\t{w.url}\t{l}\t{h}\n"

        # write raw manifest
        with open(mfname, "w") as mf:
            mf.write(manifest_s)

        # save manifest to txz
        s = io.BytesIO(manifest_s.encode('utf-8'))
        ti = tarfile.TarInfo(name=mfname)
        ti.size = len(manifest_s.encode('utf-8'))
        xzf.addfile(tarinfo=ti, fileobj=s)

        # save individual requests to txz
        for w in Web.fetchIdRange_g(db, wid_s, wid_e, ulike=ffnLike, status=200):
            if w.response is None or len(w.response) < 1:
                continue
            assert (w.url is not None and w.created is not None)
            dec = enc.decode(w.response, w.url)
            if dec is None:
                continue
            html = dec[1]
            ts = int(w.created / 1000)
            html = f"<!--\t{ts}\t{w.url}\t-->\n" + html
            s = io.BytesIO(html.encode('utf-8'))
            ti = tarfile.TarInfo(name=f"./{w.id}.html")
            ti.mtime = int(w.created // 1000)
            ti.size = len(html.encode('utf-8'))
            xzf.addfile(tarinfo=ti, fileobj=s)
    return 0
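# A verification sketch (hypothetical helper, not part of the exporter above):
# re-read a dump produced by main() and check each manifest row's length and
# md5 against the corresponding ./<id>.html member. mfname must be the exact
# member name used when the archive was written (including the ./ prefix).
def verifyDump(xzfname: str, mfname: str) -> bool:
    ok = True
    with tarfile.open(xzfname, 'r:xz') as xzf:
        mf = xzf.extractfile(mfname)
        assert mf is not None
        rows = mf.read().decode('utf-8').splitlines()[1:]  # skip the header row
        for row in rows:
            wid, ts, url, l, h = row.split('\t')
            fo = xzf.extractfile(f"./{wid}.html")
            assert fo is not None
            data = fo.read()
            if len(data) != int(l) or hashlib.md5(data).hexdigest() != h:
                print(f"mismatch: {wid} {url}")
                ok = False
    return ok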
source = WebSource.lookup(db, 'iris-bulk', 'iris-bulk')
encoding = Encoding.lookup(db, 'utf8')
with tarfile.open(xzfname, 'r:xz') as xzf:
    for ti in xzf:
        total += 1
        if total % 100 == 0:
            print(total)
        fo = xzf.extractfile(ti)
        assert (fo is not None)
        html = str(fo.read().decode('utf-8'))
        # each archived page starts with a "<!--\t{ts}\t{url}\t-->" header line
        header, _, html = html.partition('\n')
        ts = int(header.split('\t')[1])
        url = str(header.split('\t')[2])
        if len(Web.wcache(db, [url])) < 1:
            print(f' {url}: {ts}')
            w = Web(
                created_=ts,
                url_=url,
                status_=200,
                sourceId_=source.id,
                encoding_=encoding.id,
                response_=html.encode('utf-8'),
                requestHeaders_=None,
                responseHeaders_=None,
                wbaseId_=None,
            )
            w.save(db)
def main(db: 'psycopg2.connection') -> None:
    baseDir = '/mnt/a2/fanfiction.net/s/'
    urlPrefix = 'https://www.fanfiction.net/s/'

    maxId = Web.maxId(db)
    print(f"maxId: {maxId}")

    roundTo = 100
    overshoot = 20
    start = 0
    end = maxId
    print(end)
    end = int((end + roundTo - 1) / roundTo) * roundTo
    print(end)

    if len(sys.argv) == 2:
        start = int(sys.argv[1])
    if len(sys.argv) == 3:
        partCount = int(sys.argv[1])
        partIdx = int(sys.argv[2])
        per = int(math.floor(end / partCount))
        start = per * partIdx - overshoot
        if partIdx == partCount - 1:
            end += overshoot
        else:
            end = per * partIdx + per + overshoot
    print(f"from {start} to {end}")

    blockSize = 100
    fidx = start - blockSize
    dumpedBlockCount = 0
    while fidx < end:
        fidx += blockSize
        eidx = min(fidx + blockSize, end)
        print(f" doing ids [{fidx}, {eidx})")
        some = Web.fetchIdRange(db, fidx, eidx,
                ulike='https://www.fanfiction.net/s/%/%')
        for s in some:
            if s.response is None or len(s.response) < 1:
                continue
            assert (s.url is not None and s.created is not None)
            #print(f"{s.url} {len(s.response)}")
            url = s.url
            ts = int(s.created / 1000)
            data = s.response
            fid = url[len(urlPrefix):].split('/')[0]
            cid = url[len(urlPrefix):].split('/')[1]
            fidz = fid.zfill(9)
            # shard path: the 9-digit zero-padded fid split into 3-digit
            # segments, then the chapter id, e.g. 000/012/345/2
            spath = '/'.join([fidz[i * 3:i * 3 + 3] for i in range(3)] + [cid])
            #print(f"{url} => {fid} => {fidz} => {spath}")
            fpath = baseDir + spath + f"/{ts}.html.gz"
            #print(fpath)
            os.makedirs(baseDir + spath, exist_ok=True)
            with gzip.open(fpath, 'wb') as f:
                f.write(data)
        if len(some) > 0:
            dumpedBlockCount += 1
            time.sleep(.1)
            if dumpedBlockCount % 100 == 0:
                time.sleep(.4)
def chunkList(l: list, cnt: int):
    # yield successive cnt-sized chunks of l
    for i in range(0, len(l), cnt):
        yield l[i:i + cnt]

urls = [line.strip() for line in sys.stdin]
print(len(urls))

totalLen = 0
totalChunks = 0
xzfname = 'min_bulk_dump.tar.xz'
with oil.open() as db:
    with tarfile.open(xzfname, 'w:xz') as xzf:
        for chunk in chunkList(urls, 1000):
            totalChunks += 1
            print(totalChunks)
            for w in Web.latestMany(db, chunk):
                assert (w.url is not None and w.created is not None)
                dec = enc.decode(w.response, w.url)
                if dec is None:
                    continue
                html = dec[1]
                totalLen += len(html)
                ts = int(w.created / 1000)
                html = f"<!--\t{ts}\t{w.url}\t-->\n" + html
                s = io.BytesIO(html.encode('utf-8'))
                ti = tarfile.TarInfo(name=f"./{w.id}.html")
                ti.size = len(html.encode('utf-8'))
                xzf.addfile(tarinfo=ti, fileobj=s)
with oil.open() as db:
    scraper = WebScraper(db)
    plog('==========')
    plog(f"source: {scraper.source.__dict__}")
    if baseDelay:
        scraper.baseDelay = baseDelay

    # we handle sleeping in our loop
    loopDelay = scraper.baseDelay
    scraper.baseDelay = 0.01

    while True:
        wq = WebQueue.next(db, workerId, stripeCount=stripeCount, stripe=stripe)
        if wq is None:
            time.sleep(.05)
            continue
        assert (wq.url is not None)
        w = prescrape(scraper, wq)
        if len(Web.wcache(db, [wq.url])) == 1:
            wq.dequeue(db)
        if w is not None:
            assert (w.created is not None)
            # only delay if the page was fetched within the last ~30 seconds;
            # anything older was served from cache and needs no politeness sleep
            if w.created > int((time.time() - 30) * 1000):
                time.sleep(loopDelay)
        else:
            time.sleep(loopDelay)