# Example 1
def main(_):
    """Validate the generated data files and report any that must be redone."""
    # Pick the problem variant matching the configured data source.
    problem = (wikisum.WikisumCommoncrawl() if FLAGS.for_commoncrawl
               else wikisum.WikisumWeb())

    pattern = os.path.join(FLAGS.out_dir, "%s*" % problem.dataset_filename())
    shard_files = tf.gfile.Glob(pattern)

    # CommonCrawl shards are expected to be smaller, hence the lower threshold.
    size_floor = (60 if FLAGS.for_commoncrawl else 120) * 1e6
    missing_files = validate_data_files(problem, shard_files, min_size=size_floor)

    ids_for_flag = ",".join(
        str(filename_to_task_id(fname)) for fname in missing_files)
    tf.logging.error(
        "You should (re)generate %d of the data files. "
        "Rerun produce_examples with --instance_ids='%s'.", len(missing_files),
        ids_for_flag)

    # Fold the per-shard stats files into one aggregate summary.
    stats_files = tf.gfile.Glob(os.path.join(FLAGS.out_dir, "stats*"))
    agg_stats = aggregate_stats(stats_files)
    if not FLAGS.for_commoncrawl:
        coverage = agg_stats["overall_ref_coverage"] * 100
        if not coverage > 80:
            tf.logging.error(
                "Overall reference coverage is expected to be > 80%. "
                "It is %0.1f. You may want to rerun get_references_web.",
                coverage)
    with tf.gfile.Open(os.path.join(FLAGS.out_dir, "stats.json"), "w") as f:
        f.write(json.dumps(agg_stats))
    # Per-shard stats are redundant once aggregated — but only delete them
    # when no data files are missing (a rerun would need them otherwise).
    if FLAGS.rm_per_shard_stats and not missing_files:
        for fname in stats_files:
            tf.gfile.Remove(fname)
def main(_):
    """Run example generation for this task's slice of the output shards."""
    # Pick the problem variant matching the configured data source.
    problem = (wikisum.WikisumCommoncrawl() if FLAGS.for_commoncrawl
               else wikisum.WikisumWeb())

    # Each task handles only its own slice of output files and shard ids.
    my_filepaths = utils.shard(problem.out_filepaths(FLAGS.out_dir),
                               FLAGS.num_tasks)[FLAGS.task_id]

    # The vocab defaults to living alongside the output data.
    if not FLAGS.vocab_dir:
        FLAGS.vocab_dir = FLAGS.out_dir

    my_shard_ids = utils.shard(list(range(utils.NUM_SHARDS)),
                               FLAGS.num_tasks)[FLAGS.task_id]

    with utils.timing("produce_examples"):
        wikisum.produce_examples(
            shard_ids=my_shard_ids,
            wikis_dir=FLAGS.wikis_dir,
            refs_dir=FLAGS.refs_dir,
            urls_dir=FLAGS.urls_dir,
            vocab_path=os.path.join(FLAGS.vocab_dir, problem.vocab_filename),
            out_filepaths=my_filepaths)
def main():
    """CLI entry point: optionally shard the input data files, then process each.

    Fixes over the original:
      * ``--shard`` / ``--total_shards`` are now parsed as ints. They were
        plain strings, so ``total_num_files // args.total_shards`` raised
        TypeError and ``args.shard * shard_len`` would have been string
        repetition.
      * When the file count is not divisible by ``--total_shards``, the last
        shard now also covers the remainder files instead of dropping them.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_dir', help='')
    parser.add_argument('--out_dir', help='')
    parser.add_argument('--for_commoncrawl', help='', default=False, action='store_true')
    parser.add_argument('--shard', help='if passed it will only process certain shard (file ids in that range)', default=None, type=int)
    parser.add_argument('--total_shards', help='if passed it will only process certain shard (total files in the shard)', default=None, type=int)
    parser.add_argument('--workers', default=1, type=int, help='set to -1 to use all cpus')
    args = parser.parse_args()

    # Pick the problem variant matching the configured data source.
    if args.for_commoncrawl:
        problem = wikisum.WikisumCommoncrawl()
    else:
        problem = wikisum.WikisumWeb()

    prefix = problem.dataset_filename()
    # Sorted so the shard slicing below is deterministic across runs.
    data_files = sorted(tf.gfile.Glob(os.path.join(args.in_dir, "%s*" % prefix)))

    if args.shard is not None and args.total_shards is not None:
        total_num_files = len(data_files)
        shard_len = total_num_files // args.total_shards
        start_offset = args.shard * shard_len
        # The last shard also takes the remainder so no file is skipped.
        if args.shard == args.total_shards - 1:
            end_offset = total_num_files
        else:
            end_offset = (args.shard + 1) * shard_len
        current_files = data_files[start_offset:end_offset]
    else:
        current_files = data_files

    if args.workers > 1 or args.workers == -1:
        # Fan the files out over a process pool; tqdm tracks completions.
        function_args = [{'file': e, 'args': args} for e in current_files]
        cpus = args.workers if args.workers != -1 else mp.cpu_count()
        with mp.Pool(cpus) as p:
            # list() drains the lazy imap iterator so all work finishes
            # before the pool is torn down; the results are not used.
            list(tqdm(p.imap(process_file, function_args),
                      total=len(function_args),
                      desc=f'processing shards with {cpus} workers'))
    else:
        for tfdataset_file in tqdm(current_files, desc='processing shards'):
            process_file({'file': tfdataset_file, 'args': args})

    print('done')
# Example 4
def main(_):
  """Generate the vocabulary for the selected wikisum problem."""
  # Choose the problem class for the configured data source, then build vocab.
  problem_cls = (wikisum.WikisumCommoncrawl if FLAGS.for_commoncrawl
                 else wikisum.WikisumWeb)
  problem_cls().generate_vocab(FLAGS.out_dir, FLAGS.wikis_dir, FLAGS.refs_dir)