def download(
    url_entry,
    scraper=args.scraper,
    save_uncompressed=args.save_uncompressed,
    memoize=args.scraper_memoize,
):
    uid, url = url_entry
    url = url.strip()
    fid = "{:07d}-{}".format(uid, md5(url.encode()).hexdigest())

    # is_good_link, link_type = vet_link(url)
    # if not is_good_link:
    #     return

    if scraper == "bs4":
        scrape = bs4_scraper
    elif scraper == "newspaper":
        scrape = newspaper_scraper
    elif scraper == "raw":
        scrape = raw_scraper

    text, meta = scrape(url, memoize)
    if text is None or text.strip() == "":
        return ("", "", fid, uid)

    if save_uncompressed:
        month = extract_month(args.url_file)
        data_dir = mkdir(op.join(args.output_dir, "data", month))
        meta_dir = mkdir(op.join(args.output_dir, "meta", month))
        text_fp = op.join(data_dir, "{}.txt".format(fid))
        meta_fp = op.join(meta_dir, "{}.json".format(fid))

        with open(text_fp, "w") as out:
            out.write(text)
        with open(meta_fp, "w") as out:
            json.dump(meta, out)

    return (text, meta, fid, uid)
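# The download() function above assumes every scraper follows the same
# contract: scrape(url, memoize) -> (text, meta). A minimal sketch of that
# contract is shown below for reference; raw_scraper_sketch and its meta
# fields are illustrative assumptions, not the repo's actual bs4_scraper /
# newspaper_scraper / raw_scraper implementations.
import requests


def raw_scraper_sketch(url, memoize=False):
    # memoize is accepted for signature compatibility only in this sketch
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
    except requests.RequestException:
        return None, {"url": url, "status": "failed"}
    meta = {
        "url": url,
        "status_code": resp.status_code,
        "elapsed": resp.elapsed.total_seconds(),
    }
    return resp.text, meta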
    latest_cid = max([int(a.split("-")[-1].split("_")[0]) for a in archives])

    with open(state_fp, "r") as fh:
        completed_uids = set(int(i.strip()) for i in list(fh))

    return completed_uids, state_fp, latest_cid


def set_state(state_fp, cdata):
    _, _, _, uids = zip(*cdata)
    with open(state_fp, "a+") as handle:
        for uid in uids:
            handle.write("{}\n".format(uid))


if __name__ == "__main__":
    month = extract_month(args.url_file)

    # in case we are resuming from a previous run
    completed_uids, state_fp, prev_cid = get_state(month, args.output_dir)

    # URLs we haven't scraped yet (if first run, all URLs in file)
    url_entries = load_urls(args.url_file, completed_uids, args.max_urls)

    pool = mpl.Pool(args.n_procs)

    # process one "chunk" of args.chunk_size URLs at a time
    for i, chunk in enumerate(chunks(url_entries, args.chunk_size)):
        cid = prev_cid + i + 1

        print("Downloading chunk {}".format(cid))
        t1 = time.time()
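# The chunk loop above (and the ones later in these scripts) relies on a
# chunks() helper defined elsewhere in the repo. A plausible sketch, assuming
# it simply yields successive fixed-size slices of a list, is:
def chunks_sketch(lst, n):
    """Yield successive n-sized slices from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]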
def main(url_file, args):

    def download(
        url_entry,
        scraper=args.scraper,
        save_uncompressed=args.save_uncompressed,
        memoize=args.scraper_memoize,
    ):
        uid, url = url_entry
        url = url.strip()
        fid = "{:07d}-{}".format(uid, md5(url.encode()).hexdigest())

        # is_good_link, link_type = vet_link(url)
        # if not is_good_link:
        #     return

        if scraper == "bs4":
            scrape = bs4_scraper
        elif scraper == "newspaper":
            scrape = newspaper_scraper
        elif scraper == "raw":
            scrape = raw_scraper

        text, meta = scrape(url, memoize)
        if text is None or text.strip() == "":
            return ("", "", fid, uid)

        if save_uncompressed:
            month = extract_month(url_file)
            data_dir = mkdir(op.join(args.output_dir, "data", month))
            meta_dir = mkdir(op.join(args.output_dir, "meta", month))
            text_fp = op.join(data_dir, "{}.txt".format(fid))
            meta_fp = op.join(meta_dir, "{}.json".format(fid))

            with open(text_fp, "w") as out:
                out.write(text)
            with open(meta_fp, "w") as out:
                json.dump(meta, out)

        return (text, meta, fid, uid)

    # url_file is a separate argument for batch_download later on:
    # the rest of the args stay constant, but batch_download calls main()
    # on many url_file(s)
    if not args.show_warnings:
        # avoid lots of datetime warnings
        warnings.filterwarnings("ignore")

    month = extract_month(url_file)

    # in case we are resuming from a previous run
    completed_uids, state_fp, prev_cid = get_state(month, args.output_dir)

    # URLs we haven't scraped yet (if first run, all URLs in file)
    url_entries = load_urls(url_file, completed_uids, args.max_urls)

    pool = mpl.Pool(args.n_procs)

    # process one "chunk" of args.chunk_size URLs at a time
    for i, chunk in enumerate(chunks(url_entries, args.chunk_size)):
        cid = prev_cid + i + 1

        print("Downloading chunk {}".format(cid))
        t1 = time.time()

        if args.timeout > 0:
            # imap as iterator allows .next() w/ timeout.
            # ordered version doesn't seem to work correctly.
            # for some reason, you CANNOT track j or chunk[j] in the loop,
            # so don't add anything else to the loop below!
            # confusingly, chunksize below is unrelated to our chunk_size
            chunk_iter = pool.imap_unordered(download, chunk, chunksize=1)
            cdata = []
            for j in range(len(chunk)):
                try:
                    result = chunk_iter.next(timeout=args.timeout)
                    cdata.append(result)
                except mpl.TimeoutError:
                    print(" --- Timeout Error --- ")
        else:
            cdata = list(pool.imap(download, chunk, chunksize=1))

        set_state(state_fp, cdata)
        print("{} / {} downloads timed out".format(
            len(chunk) - len(cdata), len(chunk)))
        print("Chunk time: {} seconds".format(time.time() - t1))

        # archive and save this chunk to file
        if args.compress:
            print("Compressing...")
            t2 = time.time()
            count = archive_chunk(month, cid, cdata,
                                  args.output_dir, args.compress_fmt)
            print("Archive created in {} seconds".format(time.time() - t2))
            print("{} out of {} URLs yielded content\n".format(
                count, len(chunk)))

    print("Done!")
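# Self-contained illustration of the timeout pattern used in main() above,
# assuming mpl is the multiprocessing module (so mpl.TimeoutError is
# multiprocessing.TimeoutError). imap_unordered yields results in completion
# order, and a .next() call that waits longer than the timeout raises
# TimeoutError, which is how a hung download gets dropped from cdata.
import multiprocessing as mpl
import time


def _slow_square(x):
    time.sleep(x)
    return x * x


def _timeout_pattern_demo(timeout=1):
    # call this under an `if __name__ == "__main__":` guard on platforms
    # that spawn worker processes (Windows, macOS)
    with mpl.Pool(2) as pool:
        chunk_iter = pool.imap_unordered(_slow_square, [0.1, 5, 0.2], chunksize=1)
        results = []
        for _ in range(3):
            try:
                results.append(chunk_iter.next(timeout=timeout))
            except mpl.TimeoutError:
                print(" --- Timeout Error --- ")
    # the 5-second task times out; the two fast results survive
    return results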
    if len(files) == 0:
        return

    if len(processed) > 0:
        print("{} files already processed.".format(len(processed)))

    pool = mpl.Pool(n_procs)
    for ci, chunk in enumerate(chunks(files, chunk_size)):
        file_entries = [(fn, tf.extractfile(fn).read()) for fn in chunk]

        t1 = time.time()
        parsed = list(pool.imap(parse_file, file_entries, chunksize=1))

        # remove empty strings from output
        parsed = [p for p in parsed if len(p[1]) != 0]
        hit_rate = len(parsed) / len(chunk) * 100

        print("Parsing chunk {} took {} seconds".format(ci + 1, time.time() - t1))
        print("  -- {}% of chunk {}'s docs yielded text.".format(hit_rate, ci + 1))

        t1 = time.time()
        save_parsed_text(parsed, out_dir)
        print("Saving chunk {} took {} seconds".format(ci + 1, time.time() - t1))


if __name__ == "__main__":
    month = extract_month(args.html_archive)
    out_dir = mkdir(op.join(args.output_dir, month))
    parse_archive(args.html_archive, out_dir, args.n_procs, args.chunk_size)
    print("Done!")
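# Hedged sketch of the save_parsed_text() helper called above. It assumes
# each entry in parsed is a (member_name, text) pair, as implied by the
# p[1] filter, and writes the text under out_dir using the archive member's
# basename; the repo's real helper may name or group files differently.
import os.path as op


def save_parsed_text_sketch(parsed, out_dir):
    for fn, txt in parsed:
        base = op.splitext(op.basename(fn))[0]
        with open(op.join(out_dir, base + ".txt"), "w") as handle:
            handle.write(txt)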
    return completed_uids, state_fp, latest_cid


def set_state(state_fp, cdata):
    _, _, _, uids = zip(*cdata)
    with open(state_fp, "a+") as handle:
        for uid in uids:
            handle.write("{}\n".format(uid))


if __name__ == "__main__":
    print("downloading from urls in: {}".format(args.urls_dir))
    print("saving downloaded chunks to: {}".format(args.output_dir))

    for url_file in tqdm(os.listdir(args.urls_dir)):
        fullpath = os.path.join(args.urls_dir, url_file)
        month = extract_month(fullpath)

        def download(
            url_entry,
            scraper=args.scraper,
            save_uncompressed=args.save_uncompressed,
            memoize=args.scraper_memoize,
        ):
            uid, url = url_entry
            url = url.strip()
            fid = "{:07d}-{}".format(uid, md5(url.encode()).hexdigest())

            # is_good_link, link_type = vet_link(url)
            # if not is_good_link:
            #     return
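# Hedged sketch of the extract_month() helper used throughout these scripts.
# It assumes the URL/archive file names embed a YYYY-MM month token (e.g.
# "RS_2017-05..."); the repo's real helper may parse the name differently.
import re


def extract_month_sketch(file_name):
    match = re.search(r"\d{4}-\d{2}", file_name)
    return match.group(0) if match else "unknown"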