Example #1
def download(
    url_entry,
    scraper=args.scraper,
    save_uncompressed=args.save_uncompressed,
    memoize=args.scraper_memoize,
):
    uid, url = url_entry
    url = url.strip()
    fid = "{:07d}-{}".format(uid, md5(url.encode()).hexdigest())

    # is_good_link, link_type = vet_link(url)
    # if not is_good_link:
    #     return

    if scraper == "bs4":
        scrape = bs4_scraper
    elif scraper == "newspaper":
        scrape = newspaper_scraper
    elif scraper == "raw":
        scrape = raw_scraper

    text, meta = scrape(url, memoize)
    if text is None or text.strip() == "":
        return ("", "", fid, uid)

    if save_uncompressed:
        month = extract_month(args.url_file)
        data_dir = mkdir(op.join(args.output_dir, "data", month))
        meta_dir = mkdir(op.join(args.output_dir, "meta", month))
        text_fp = op.join(data_dir, "{}.txt".format(fid))
        meta_fp = op.join(meta_dir, "{}.json".format(fid))

        with open(text_fp, "w") as out:
            out.write(text)
        with open(meta_fp, "w") as out:
            json.dump(meta, out)

    return (text, meta, fid, uid)
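Example #1 builds a per-URL file ID from the row index and an MD5 digest of the URL, then dispatches to one of three scrapers. The naming scheme can be checked in isolation; the snippet below is a minimal, self-contained sketch of it (the uid and url values are made up for illustration):

from hashlib import md5

uid, url = 42, "https://example.com/article\n"
url = url.strip()
fid = "{:07d}-{}".format(uid, md5(url.encode()).hexdigest())
print(fid)  # zero-padded uid, a dash, then 32 hex characters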
Example #2
        latest_cid = max(
            [int(a.split("-")[-1].split("_")[0]) for a in archives])
        with open(state_fp, "r") as fh:
            completed_uids = set(int(i.strip()) for i in list(fh))
    return completed_uids, state_fp, latest_cid


def set_state(state_fp, cdata):
    _, _, _, uids = zip(*cdata)
    with open(state_fp, "a+") as handle:
        for uid in uids:
            handle.write("{}\n".format(uid))


if __name__ == "__main__":
    month = extract_month(args.url_file)

    # in case we are resuming from a previous run
    completed_uids, state_fp, prev_cid = get_state(month, args.output_dir)

    # URLs we haven't scraped yet (if first run, all URLs in file)
    url_entries = load_urls(args.url_file, completed_uids, args.max_urls)

    pool = mpl.Pool(args.n_procs)

    # process one "chunk" of args.chunk_size URLs at a time
    for i, chunk in enumerate(chunks(url_entries, args.chunk_size)):
        cid = prev_cid + i + 1

        print("Downloading chunk {}".format(cid))
        t1 = time.time()
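Example #2 resumes from a state file of completed UIDs and then walks the remaining URLs in fixed-size chunks via a chunks() helper that the excerpt does not show. A plausible, purely illustrative stand-in for such a helper (an assumption, not necessarily the repo's actual implementation):

def chunks(items, chunk_size):
    # Yield successive chunk_size-sized slices of a list.
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]

print(list(chunks(list(range(7)), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]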
Example #3
def main(url_file, args):
    def download(
        url_entry,
        scraper=args.scraper,
        save_uncompressed=args.save_uncompressed,
        memoize=args.scraper_memoize,
    ):
        uid, url = url_entry
        url = url.strip()
        fid = "{:07d}-{}".format(uid, md5(url.encode()).hexdigest())

        # is_good_link, link_type = vet_link(url)
        # if not is_good_link:
        #     return

        if scraper == "bs4":
            scrape = bs4_scraper
        elif scraper == "newspaper":
            scrape = newspaper_scraper
        elif scraper == "raw":
            scrape = raw_scraper

        text, meta = scrape(url, memoize)
        if text is None or text.strip() == "":
            return ("", "", fid, uid)

        if save_uncompressed:
            month = extract_month(args.url_file)
            data_dir = mkdir(op.join(args.output_dir, "data", month))
            meta_dir = mkdir(op.join(args.output_dir, "meta", month))
            text_fp = op.join(data_dir, "{}.txt".format(fid))
            meta_fp = op.join(meta_dir, "{}.json".format(fid))

            with open(text_fp, "w") as out:
                out.write(text)
            with open(meta_fp, "w") as out:
                json.dump(meta, out)

        return (text, meta, fid, uid)

    # url_file is a separate argument so batch_download can call main() on many
    # url_file(s) later on, while the rest of the args stay constant
    if not args.show_warnings:
        # avoid lots of datetime warnings
        warnings.filterwarnings("ignore")

    month = extract_month(url_file)

    # in case we are resuming from a previous run
    completed_uids, state_fp, prev_cid = get_state(month, args.output_dir)

    # URLs we haven't scraped yet (if first run, all URLs in file)
    url_entries = load_urls(url_file, completed_uids, args.max_urls)

    pool = mpl.Pool(args.n_procs)

    # process one "chunk" of args.chunk_size URLs at a time
    for i, chunk in enumerate(chunks(url_entries, args.chunk_size)):
        cid = prev_cid + i + 1

        print("Downloading chunk {}".format(cid))
        t1 = time.time()

        if args.timeout > 0:
            # imap as iterator allows .next() w/ timeout.
            # ordered version doesn't seem to work correctly.
            # for some reason, you CANNOT track j or chunk[j] in the loop,
            # so don't add anything else to the loop below!
            # confusingly, chunksize below is unrelated to our chunk_size
            chunk_iter = pool.imap_unordered(download, chunk, chunksize=1)
            cdata = []
            for j in range(len(chunk)):
                try:
                    result = chunk_iter.next(timeout=args.timeout)
                    cdata.append(result)
                except mpl.TimeoutError:
                    print("   --- Timeout Error ---   ")
        else:
            cdata = list(pool.imap(download, chunk, chunksize=1))

        set_state(state_fp, cdata)
        print("{} / {} downloads timed out".format(
            len(chunk) - len(cdata), len(chunk)))
        print("Chunk time: {} seconds".format(time.time() - t1))

        # archive and save this chunk to file
        if args.compress:
            print("Compressing...")
            t2 = time.time()
            count = archive_chunk(month, cid, cdata, args.output_dir,
                                  args.compress_fmt)
            print("Archive created in {} seconds".format(time.time() - t2))
            print("{} out of {} URLs yielded content\n".format(
                count, len(chunk)))

    print("Done!")
Example #4
        if len(files) == 0:
            return

        if len(processed) > 0:
            print("{} files already processed.".format(len(processed)))

        pool = mpl.Pool(n_procs)
        for ci, chunk in enumerate(chunks(files, chunk_size)):
            file_entries = [(fn, tf.extractfile(fn).read()) for fn in chunk]

            t1 = time.time()
            parsed = list(pool.imap(parse_file, file_entries, chunksize=1))

            # remove empty strings from output
            parsed = [p for p in parsed if len(p[1]) != 0]

            hit_rate = len(parsed) / len(chunk) * 100
            print("Parsing chunk {} took {} seconds".format(ci + 1, time.time() - t1))
            print(" -- {}% of chunk {}'s docs yielded text.".format(hit_rate, ci + 1))

            t1 = time.time()
            save_parsed_text(parsed, out_dir)
            print("Saving chunk {} took {} seconds".format(ci + 1, time.time() - t1))


if __name__ == "__main__":
    month = extract_month(args.html_archive)
    out_dir = mkdir(op.join(args.output_dir, month))
    parse_archive(args.html_archive, out_dir, args.n_procs, args.chunk_size)
    print("Done!")


def set_state(state_fp, cdata):
    _, _, _, uids = zip(*cdata)
    with open(state_fp, "a+") as handle:
        for uid in uids:
            handle.write("{}\n".format(uid))


if __name__ == "__main__":
    print("downloading from urls in: {}".format(args.urls_dir))
    print("saving downloaded chunks to: {}".format(args.output_dir))
    for url_file in tqdm(os.listdir(args.urls_dir)):
        fullpath = os.path.join(args.urls_dir, url_file)
        month = extract_month(fullpath)

        def download(
            url_entry,
            scraper=args.scraper,
            save_uncompressed=args.save_uncompressed,
            memoize=args.scraper_memoize,
        ):
            uid, url = url_entry
            url = url.strip()
            fid = "{:07d}-{}".format(uid, md5(url.encode()).hexdigest())

            # is_good_link, link_type = vet_link(url)
            # if not is_good_link:
            #     return
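Every example calls extract_month() on a URL-file path, but the helper itself never appears in these excerpts. The sketch below is only a guess at its behavior, assuming the URL files carry a YYYY-MM token in their names:

import os.path as op
import re


def extract_month(url_file):
    # Hypothetical implementation: pull a YYYY-MM token out of the file name,
    # falling back to the bare file name if no such token exists.
    name = op.splitext(op.basename(url_file))[0]
    match = re.search(r"\d{4}-\d{2}", name)
    return match.group(0) if match else name


print(extract_month("urls/2018-05.txt"))  # "2018-05"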