Example #1
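# serve the most recent cached 200 response for a requested url (q) or a url
# LIKE pattern (u)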
def v0_cache() -> ResponseReturnValue:
	apiKey = get_request_value('apiKey', '')
	if apiKey not in API_KEYS:
		return make_response({'err':-401,'msg':'unauthorized'}, 401)

	q = get_request_value('q', None)
	u = get_request_value('u', None)
	print(f'v0_cache: {q=}, {u=}')
	if (q is None or len(q.strip()) < 1) \
			and (u is None or len(u.strip()) < 1):
		print(f'v0_cache: q and u are empty')
		return page_not_found(NotFound())

	latest = None
	db = oil.open()
	if u:
		latest = Web.latest(db, ulike=u, status=200)
	else:
		latest = Web.latest(db, q, status=200)

	if latest is None or latest.response is None or latest.created is None:
		print(f'v0_cache: {q=}, {u=}: not found')
	else:
		print(f'v0_cache: {q=}, {u=}: found: len: {len(latest.response)}, url: {latest.url}, id: {latest.id}, created: {latest.created}')
	return make_response_web(latest)
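# dump cached fanfiction.net story pages in an id range into a gzip stream of
# serialized requests; argv selects either a start id or a
# (partCount, partIdx) split of the whole range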
def main(db: 'psycopg2.connection') -> None:
    fileName = "./meta.gz"
    if len(sys.argv) == 3:
        global logFileName
        logFileName = f"./dmpt2fs_{sys.argv[2]}.log"
        fileName = f"./meta_{sys.argv[2]}.gz"

    plog(f"using log {logFileName}")
    plog(f"writing to {fileName}")

    maxId = Web.maxId(db)
    plog(f"maxId: {maxId}")

    roundTo = 100
    overshoot = 20
    start = 0
    end = maxId
    end = (end + roundTo - 1) // roundTo * roundTo

    if len(sys.argv) == 2:
        start = int(sys.argv[1])
    if len(sys.argv) == 3:
        partCount = int(sys.argv[1])
        partIdx = int(sys.argv[2])
        per = int(math.floor(end / partCount))
        start = per * partIdx - overshoot
        if partIdx == partCount - 1:
            end += overshoot
        else:
            end = per * partIdx + per + overshoot

    plog(f"from {start} to {end}")
    blockSize = 1000

    fidx = start - blockSize
    dumpedBlockCount = 0
    with gzip.open(fileName, "wb") as fgz:
        while fidx < end:
            fidx += blockSize
            eidx = min(fidx + blockSize, end)
            plog(f"  doing ids [{fidx}, {eidx})")

            some = Web.fetchIdRange(db,
                                    fidx,
                                    eidx,
                                    ulike='https://www.fanfiction.net/s/%/%')
            for s in some:
                if s.response is None or len(s.response) < 1:
                    continue
                try:
                    dumpRequest(s, cast(IO, fgz))
                except SystemExit:
                    raise
                except:
                    plog(f"{s.id}|problem with {s.id} {s.url}")
                    with open(f"./{s.id}.html", "wb") as f:
                        f.write(s.response)

            if len(some) > 0:
                dumpedBlockCount += 1
Example #3
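# walk cached fanfiction.net story pages in blocks and feed each response to
# handlePage for this stripe; argv selects either a start id or a
# (stripeCount, stripe) split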
def main(db: 'psycopg2.connection') -> None:
    if len(sys.argv) not in {1, 2, 4}:
        print(f"usage: {sys.argv[0]} [start [stripeCount stripe]]")
        sys.exit(1)

    if len(sys.argv) == 4:
        global logFileName
        logFileName = f"./process_story_{processType}_{sys.argv[2]}_{sys.argv[3]}.log"

    plog(f"using log {logFileName}")

    maxId = Web.maxId(db)
    plog(f"maxId: {maxId}")

    roundTo = 100
    overshoot = 20
    start = 0
    end = maxId
    end = (end + roundTo - 1) // roundTo * roundTo

    stripeCount = 1
    stripe = 0

    if len(sys.argv) >= 2:
        start = int(sys.argv[1])
    if len(sys.argv) >= 4:
        stripeCount = int(sys.argv[2])
        stripe = int(sys.argv[3])

    plog(f"stripe: {stripe}")
    plog(f"stripeCount: {stripeCount}")

    plog(f"from {start} to {end}")
    blockSize = 1000 * stripeCount

    FFNFicContent.createStripeTable(db, stripe)

    fidx = start - blockSize
    while fidx < end:
        fidx += blockSize
        eidx = min(fidx + blockSize, end)
        plog(f"  doing ids [{fidx}, {eidx})")

        try:
            with db:
                for s in Web.fetchIdRange_g(
                        db, fidx, eidx,
                        ulike='https://www.fanfiction.net/s/%/%'):
                    if s.response is None or len(s.response) < 1:
                        continue
                    handlePage(db, s, stripeCount, stripe)
        except SystemExit:
            raise
        except:
            plog(f"  trouble in ids [{fidx}, {eidx})")
            raise
Example #4
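# fetch url via make_req, save the result as a Web row, and retry (up to
# triesLeft times) when a 200 response turns out to be a Cloudflare
# interstitial page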
def scrape(source: WebSource, workerId: int, url: str, triesLeft: int = 3
		) -> Web:
	if triesLeft <= 0:
		raise Exception('scrape: exceeded retry count')

	created = int(time.time() * 1000)
	r = make_req(workerId, url)

	w = Web(created_=created, url_=url, status_=r.status, sourceId_=source.id)
	w.response = r.response
	w.requestHeaders = None # str(r.request.headers).encode('utf-8')
	w.responseHeaders = None # str(r.headers).encode('utf-8')

	if w.status == 200:
		dec = enc.decode(w.response, url)
		if dec is not None and dec[0] is not None:
			w.encoding = Encoding.lookup(oil.open(), dec[0]).id

		if dec is not None and dec[1] is not None:
			title = ''
			try:
				title = extractTitle(dec[1]).strip()
			except:
				pass
			if title == 'Just a moment...' \
					or title == 'Attention Required! | Cloudflare':
				plog(f'scrape: got 200 status CF page, retrying: {triesLeft - 1}')
				time.sleep(9 + random.random() * 2)
				return scrape(source, workerId, url, triesLeft - 1)

	w.save(oil.open())
	return w
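# scrape every uid in [start, end) on this stripe that lies within
# [minId, maxId] and is not already in the web cache; skip the block entirely
# if every url is already cached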
def prescrapeUidBlock(db: 'psycopg2.connection', scraper: WebScraper,
                      start: int, end: int, stripeCount: int, stripe: int,
                      minId: int, maxId: int) -> None:
    uids = [uid for uid in range(start, end) if uid % stripeCount == stripe]
    urls = [getUrl(uid) for uid in uids]
    wcache = Web.wcache(db, urls)

    random.shuffle(uids)

    needsScraped = False
    for url in urls:
        if url not in wcache:
            needsScraped = True
            break
    if not needsScraped:
        plog(f"skipping block [{start}, {end})")
        return

    plog(f"prescraping block [{start}, {end})")

    for uid in uids:
        if uid < minId or uid > maxId:
            continue
        if getUrl(uid) in wcache:
            continue
        prescrapeUid(db, scraper, uid)
Example #6
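# scrape q on demand through the remote scraper and return the freshly cached
# response; answer 500 if no cache entry newer than the start of the request
# shows up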
def v0_crawl() -> ResponseReturnValue:
	apiKey = get_request_value('apiKey', '')
	if apiKey not in API_KEYS:
		return make_response({'err':-401,'msg':'unauthorized'}, 401)

	q = get_request_value('q', None)
	print(f'v0_crawl: {q=}')
	if (q is None or len(q.strip()) < 1):
		return page_not_found(NotFound())

	if not q.startswith('http://') and not q.startswith('https://'):
		return page_not_found(NotFound())

	ts = int(time.time()) - 1
	db = oil.open()
	scraper = RemoteWebScraper(db)
	scraper.scrape(q)
	latest = Web.latest(db, q, status=200)
	if latest is None or latest.created is None:
		print(f'v0_crawl: {q=}: error: no latest entry')
		return make_response({'err':-500,'msg':'internal server error'}, 500)
	lts = int(latest.created//1000)
	if lts < ts:
		print(f'v0_crawl: {q=}: error getting fresh crawl: {ts} >= {lts}')
		return make_response({'err':-500,'msg':'internal server error'}, 500)
	return make_response_web(latest)
Example #7
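# time Web.wcache against Web.latestMany over the same url set and return the
# ratio of the two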
def testBlock(db: 'psycopg2.connection', start: int, end: int,
              cid: int) -> float:
    urls = [getUrl(fid, cid) for fid in range(start, end)]

    s1 = time.time()
    Web.wcache(db, urls)
    e1 = time.time()

    s2 = time.time()
    Web.latestMany(db, urls)
    e2 = time.time()

    t1 = e1 - s1
    t2 = e2 - s2

    print(f"{t1} {t2}: {t2 / t1}")
    return t2 / t1
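# re-check a single cached fanfiction.net page: extract its fid, decode the
# response, then either bury the fic with its death code or re-parse it with
# FFNParser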
def main(db: 'psycopg2.connection') -> None:
    if len(sys.argv) != 2:
        raise Exception("expected wid")

    wid = int(sys.argv[1])

    some = Web.fetchIdRange(db, wid, wid + 1)
    if len(some) != 1:
        raise Exception("TODO")
    w = some[0]
    assert (w.url is not None and w.created is not None)

    if not w.url.startswith('https://www.fanfiction.net/s/'):
        raise Exception("not a ffn url")

    fid = int(w.url.split('/')[4])
    print(f"fid: {fid}")

    response = w.response
    if response is None and w.wbaseId is not None:
        wbase = WebBase.lookup(db, w.wbaseId)
        if wbase is None:
            raise Exception("has null web_base")
        response = wbase.response

    if response is None or len(response) < 1:
        print("response is null")
        return

    dec = enc.decode(response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]

    code = extractFFNDeathCode(html)
    if code != 0:
        plog(f"  dead: {code}")
        c = FFNFic.bury(db, fid, code, w.created, True)
        print(c)
        #print(html)
    else:
        plog(f"  {fid} healthy?")
        print(html)
        try:
            ffnParser = FFNParser()
            ts = int(w.created / 1000)
            fic = ffnParser.get(db, fid, ts, BeautifulSoup(html, 'html5lib'))
            plog(f"{fic.__dict__}")
        except:
            plog(f"{w.url} is broken")
            #with open(f"./edump_{fid}_{cid}.html", 'w') as f:
            #	f.write(html)
            raise
Example #9
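# scrape every fid in [start, end) on this stripe whose chapter url is not
# already cached; for cid != 1 only fids still known to be live are considered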
def prescrapeBlock(db: 'psycopg2.connection', scraper: RemoteWebScraper,
                   start: int, end: int, cid: int, stripeCount: int,
                   stripe: int, maxId: int) -> None:
    end = min(maxId, end)
    needsCachedCount = Web.countFFNNeedsCached(db, start, end, cid,
                                               stripeCount, stripe)
    if needsCachedCount == 0:
        return

    fids = [fid for fid in range(start, end) if fid % stripeCount == stripe]
    if cid != 1:
        fids = FFNFic.getLiveFids(db, start, end, cid)
        if len(fids) < 1:
            return
    random.shuffle(fids)

    urls = [getUrl(fid, cid) for fid in fids]
    wcache = Web.wcache(db, urls)

    needsScraped = False
    for url in urls:
        if url not in wcache:
            needsScraped = True
            break
    if not needsScraped:
        plog(f"skipping block [{start}, {end}) cid:{cid}")
        return

    plog(f"prescraping block [{start}, {end}) cid:{cid}")

    for fid in fids:
        if fid > maxId:
            continue
        if getUrl(fid, cid) in wcache:
            continue
        prescrapeFid(db, scraper, fid, cid)
Example #10
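# write the decoded html of every cached response in a wid range to ./<id>.html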
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 3:
        raise Exception("expected wid range")

    wid_s = int(sys.argv[1])
    wid_e = int(sys.argv[2])

    some = Web.fetchIdRange(db, wid_s, wid_e)
    for w in some:
        if w.response is None or len(w.response) < 1:
            continue
        assert (w.url is not None)

        dec = enc.decode(w.response, w.url)
        if dec is None:
            continue
        html = dec[1]
        with open(f"./{w.id}.html", "w") as f:
            f.write(html)

    return 0
Example #11
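# print the decoded html of a single cached response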
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 2:
        raise Exception("expected wid")

    wid = int(sys.argv[1])

    some = Web.fetchIdRange(db, wid, wid + 1)
    if len(some) != 1:
        raise Exception("TODO")

    w = some[0]
    if w.response is None or len(w.response) < 1:
        return 0
    assert (w.url is not None)

    dec = enc.decode(w.response, w.url)
    if dec is None:
        raise Exception("unknown encoding")
    html = dec[1]
    print(html)

    return 0
Example #12
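# archive every cached fanfiction.net 200 response in a wid range into a
# .tar.xz, preceded by a tsv manifest of id, timestamp, url, length and md5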
def main(db: 'psycopg2.connection') -> int:
    if len(sys.argv) != 3:
        raise Exception("expected wid range")

    wid_s = int(sys.argv[1])
    wid_e = int(sys.argv[2])

    maxId = Web.maxId(db)

    if wid_s > maxId:
        return 0
    # note whether the requested range runs past the newest id before clamping,
    # so the output files can be marked as partial
    partial = wid_e > maxId
    wid_e = min(wid_e, maxId)

    wid_s_s = str(wid_s).zfill(10)
    wid_e_s = str(wid_e).zfill(10)

    xzfname = f"data_{wid_s_s}_{wid_e_s}.tar.xz"
    if partial:
        xzfname = f"data_{wid_s_s}_{wid_e_s}_partial.tar.xz"

    mfname = f"./manifest_{wid_s_s}_{wid_e_s}.tsv"
    if partial:
        mfname = f"./manifest_{wid_s_s}_{wid_e_s}_partial.tsv"

    ffnLike = 'https://www.fanfiction.net/%'

    with tarfile.open(xzfname, 'w:xz') as xzf:
        # compute manifest
        manifest_s = 'id\ttimestamp\turl\tlength\tmd5\n'
        for w in Web.fetchIdRange_g(db,
                                    wid_s,
                                    wid_e,
                                    ulike=ffnLike,
                                    status=200):
            if w.response is None or len(w.response) < 1:
                continue
            assert (w.url is not None and w.created is not None)

            dec = enc.decode(w.response, w.url)
            if dec is None:
                continue
            html = dec[1]

            ts = int(w.created / 1000)
            html = f"<!--\t{ts}\t{w.url}\t-->\n" + html

            h = hashlib.md5(html.encode('utf-8')).hexdigest()
            l = len(html.encode('utf-8'))
            manifest_s += f"{w.id}\t{ts}\t{w.url}\t{l}\t{h}\n"

        # write raw manifest
        with open(mfname, "w") as mf:
            mf.write(manifest_s)

        # save manifest to txz
        s = io.BytesIO(manifest_s.encode('utf-8'))
        ti = tarfile.TarInfo(name=mfname)
        ti.size = len(manifest_s.encode('utf-8'))
        xzf.addfile(tarinfo=ti, fileobj=s)

        # save individual requests to txz
        for w in Web.fetchIdRange_g(db,
                                    wid_s,
                                    wid_e,
                                    ulike=ffnLike,
                                    status=200):
            if w.response is None or len(w.response) < 1:
                continue
            assert (w.url is not None and w.created is not None)

            dec = enc.decode(w.response, w.url)
            if dec is None:
                continue
            html = dec[1]

            ts = int(w.created / 1000)
            html = f"<!--\t{ts}\t{w.url}\t-->\n" + html

            s = io.BytesIO(html.encode('utf-8'))
            ti = tarfile.TarInfo(name=f"./{w.id}.html")
            ti.mtime = int(w.created // 1000)
            ti.size = len(html.encode('utf-8'))
            xzf.addfile(tarinfo=ti, fileobj=s)

    return 0
Example #13
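    # re-import archived pages: for each html member of the tar.xz, read the
    # leading "<!--\tts\turl\t-->" header line and save a new Web row for any
    # url not already in the cache, keeping the original timestamp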
    source = WebSource.lookup(db, 'iris-bulk', 'iris-bulk')
    encoding = Encoding.lookup(db, 'utf8')

    with tarfile.open(xzfname, 'r:xz') as xzf:
        for ti in xzf:
            total += 1
            if total % 100 == 0:
                print(total)
            fo = xzf.extractfile(ti)
            assert (fo is not None)
            html = fo.read().decode('utf-8')
            header, _, html = html.partition('\n')

            ts = int(header.split('\t')[1])
            url = header.split('\t')[2]

            if len(Web.wcache(db, [url])) < 1:
                print(f'  {url}: {ts}')
                w = Web(
                    created_=ts,
                    url_=url,
                    status_=200,
                    sourceId_=source.id,
                    encoding_=encoding.id,
                    response_=html.encode('utf-8'),
                    requestHeaders_=None,
                    responseHeaders_=None,
                    wbaseId_=None,
                )
                w.save(db)
Example #14
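# export cached fanfiction.net story pages into a directory tree of the form
# <baseDir>/<fid split into 3-digit groups>/<cid>/<ts>.html.gz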
def main(db: 'psycopg2.connection') -> None:
    baseDir = '/mnt/a2/fanfiction.net/s/'
    urlPrefix = 'https://www.fanfiction.net/s/'

    maxId = Web.maxId(db)
    print(f"maxId: {maxId}")

    roundTo = 100
    overshoot = 20
    start = 0
    end = maxId
    print(end)
    end = (end + roundTo - 1) // roundTo * roundTo
    print(end)

    if len(sys.argv) == 2:
        start = int(sys.argv[1])
    if len(sys.argv) == 3:
        partCount = int(sys.argv[1])
        partIdx = int(sys.argv[2])
        per = int(math.floor(end / partCount))
        start = per * partIdx - overshoot
        if partIdx == partCount - 1:
            end += overshoot
        else:
            end = per * partIdx + per + overshoot

    print(f"from {start} to {end}")
    blockSize = 100

    fidx = start - blockSize
    dumpedBlockCount = 0
    while fidx < end:
        fidx += blockSize
        eidx = min(fidx + blockSize, end)
        print(f"  doing ids [{fidx}, {eidx})")

        some = Web.fetchIdRange(db,
                                fidx,
                                eidx,
                                ulike='https://www.fanfiction.net/s/%/%')
        for s in some:
            if s.response is None or len(s.response) < 1:
                continue
            assert (s.url is not None and s.created is not None)
            #print(f"{s.url} {len(s.response)}")
            url = s.url
            ts = int(s.created / 1000)
            data = s.response

            fid = url[len(urlPrefix):].split('/')[0]
            cid = url[len(urlPrefix):].split('/')[1]
            fidz = fid.zfill(9)
            spath = '/'.join([fidz[i * 3:i * 3 + 3] for i in range(3)] + [cid])
            #print(f"{url} => {fid} => {fidz} => {spath}")
            fpath = baseDir + spath + f"/{ts}.html.gz"
            #print(fpath)
            os.makedirs(baseDir + spath, exist_ok=True)
            with gzip.open(fpath, 'wb') as f:
                f.write(data)

        if len(some) > 0:
            dumpedBlockCount += 1
            time.sleep(.1)
        if dumpedBlockCount % 100 == 0:
            time.sleep(.4)
Example #15
def chunkList(l, cnt):
    # yield successive cnt-sized chunks of l
    for i in range(0, len(l), cnt):
        yield l[i:i + cnt]


urls = [line.strip() for line in sys.stdin]
print(len(urls))
totalLen = 0
totalChunks = 0

xzfname = 'min_bulk_dump.tar.xz'
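# dump the latest cached response for each url read from stdin into a tar.xz,
# one ./<id>.html member per response, each prefixed with a ts/url comment
# header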

with oil.open() as db:
    with tarfile.open(xzfname, 'w:xz') as xzf:
        for chunk in chunkList(urls, 1000):
            totalChunks += 1
            print(totalChunks)
            for w in Web.latestMany(db, chunk):
                assert (w.url is not None and w.created is not None)
                dec = enc.decode(w.response, w.url)
                if dec is None:
                    continue
                html = dec[1]
                totalLen += len(html)

                ts = int(w.created / 1000)
                html = f"<!--\t{ts}\t{w.url}\t-->\n" + html

                s = io.BytesIO(html.encode('utf-8'))
                ti = tarfile.TarInfo(name=f"./{w.id}.html")
                ti.size = len(html.encode('utf-8'))
                xzf.addfile(tarinfo=ti, fileobj=s)
Example #16
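# worker loop: pull the next WebQueue entry for this stripe, prescrape it,
# dequeue it once its url is cached, and sleep loopDelay after each fresh (or
# failed) fetch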
with oil.open() as db:
    scraper = WebScraper(db)
    plog('==========')
    plog(f"source: {scraper.source.__dict__}")
    if baseDelay:
        scraper.baseDelay = baseDelay

    # we handle sleeping in our loop
    loopDelay = scraper.baseDelay
    scraper.baseDelay = 0.01

    while True:
        wq = WebQueue.next(db,
                           workerId,
                           stripeCount=stripeCount,
                           stripe=stripe)
        if wq is None:
            time.sleep(.05)
            continue
        assert (wq.url is not None)
        w = prescrape(scraper, wq)
        if len(Web.wcache(db, [wq.url])) == 1:
            wq.dequeue(db)
        if w is not None:
            assert (w.created is not None)
            if w.created > int((time.time() - 30) * 1000):
                time.sleep(loopDelay)
        else:
            time.sleep(loopDelay)