# Shared imports for the entry points below. Helpers such as
# validate_data_files, filename_to_task_id, aggregate_stats, and
# process_file are defined elsewhere in these scripts.
import argparse
import json
import multiprocessing as mp
import os

import tensorflow as tf
from tqdm import tqdm

from tensor2tensor.data_generators.wikisum import utils
from tensor2tensor.data_generators.wikisum import wikisum

# Flag definitions (for_commoncrawl, out_dir, etc.) live at the top of each
# original script; this module-level handle is the usual TF 1.x pattern.
FLAGS = tf.flags.FLAGS


def main(_):
  if FLAGS.for_commoncrawl:
    problem = wikisum.WikisumCommoncrawl()
  else:
    problem = wikisum.WikisumWeb()

  # Find the generated data files and flag any that are missing or too small.
  prefix = problem.dataset_filename()
  data_files = tf.gfile.Glob(os.path.join(FLAGS.out_dir, "%s*" % prefix))
  missing_files = validate_data_files(
      problem, data_files,
      min_size=(60 if FLAGS.for_commoncrawl else 120) * 1e6)
  if missing_files:
    task_ids = [filename_to_task_id(fname) for fname in missing_files]
    ids_for_flag = ",".join([str(i) for i in task_ids])
    tf.logging.error(
        "You should (re)generate %d of the data files. "
        "Rerun produce_examples with --instance_ids='%s'.",
        len(missing_files), ids_for_flag)

  # Compute and write out aggregated stats.
  stats_files = tf.gfile.Glob(os.path.join(FLAGS.out_dir, "stats*"))
  agg_stats = aggregate_stats(stats_files)
  if not FLAGS.for_commoncrawl:
    coverage = agg_stats["overall_ref_coverage"] * 100
    if not coverage > 80:
      # %% escapes the literal percent sign in the log format string.
      tf.logging.error(
          "Overall reference coverage is expected to be > 80%%. "
          "It is %0.1f. You may want to rerun get_references_web.",
          coverage)
  with tf.gfile.Open(os.path.join(FLAGS.out_dir, "stats.json"), "w") as f:
    f.write(json.dumps(agg_stats))

  # Per-shard stats are only safe to delete once all data files check out.
  if FLAGS.rm_per_shard_stats and not missing_files:
    for fname in stats_files:
      tf.gfile.Remove(fname)
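
# The validation helpers used above are defined elsewhere in this script.
# Below is a minimal sketch of their likely shape, assuming the task id is
# encoded in the standard TFRecord filename suffix ("...-00042-of-01000");
# the _sketch_* names are illustrations, not the actual implementations.
import re


def _sketch_filename_to_task_id(fname):
  # Pull the shard index out of a name like "wikisum_web-train-00042-of-01000".
  match = re.search(r"-(\d{5})-of-\d{5}", fname)
  return int(match.group(1))


def _sketch_validate_data_files(problem, data_files, min_size):
  # Flag expected output files that are absent or smaller than min_size bytes.
  data_dir = os.path.dirname(data_files[0])
  expected = set(problem.out_filepaths(data_dir))
  missing = expected - set(data_files)
  too_small = [f for f in data_files if tf.gfile.Stat(f).length < min_size]
  return sorted(missing) + too_small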
def main(_):
  if FLAGS.for_commoncrawl:
    problem = wikisum.WikisumCommoncrawl()
  else:
    problem = wikisum.WikisumWeb()

  # Each task handles a contiguous slice of the output files and shard ids.
  out_filepaths = problem.out_filepaths(FLAGS.out_dir)
  out_filepaths = utils.shard(out_filepaths, FLAGS.num_tasks)[FLAGS.task_id]

  if not FLAGS.vocab_dir:
    FLAGS.vocab_dir = FLAGS.out_dir

  shard_ids = utils.shard(
      list(range(utils.NUM_SHARDS)), FLAGS.num_tasks)[FLAGS.task_id]

  with utils.timing("produce_examples"):
    wikisum.produce_examples(
        shard_ids=shard_ids,
        wikis_dir=FLAGS.wikis_dir,
        refs_dir=FLAGS.refs_dir,
        urls_dir=FLAGS.urls_dir,
        vocab_path=os.path.join(FLAGS.vocab_dir, problem.vocab_filename),
        out_filepaths=out_filepaths)
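
# utils.shard splits work across parallel tasks. A minimal sketch of the
# semantics assumed above (an even, contiguous partition of a list into
# num_shards pieces); the real tensor2tensor utility may differ in detail.
def _sketch_shard(items, num_shards):
  # Partition items into num_shards near-equal contiguous slices, spreading
  # the remainder across the first len(items) % num_shards slices.
  sharded = []
  num_per_shard = len(items) // num_shards
  start = 0
  for i in range(num_shards):
    end = start + num_per_shard + (1 if i < len(items) % num_shards else 0)
    sharded.append(items[start:end])
    start = end
  return sharded


# For example, _sketch_shard(list(range(10)), 3) yields
# [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]].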
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('--in_dir', help='directory containing the generated data files')
  parser.add_argument('--out_dir', help='directory to write processed output to')
  parser.add_argument('--for_commoncrawl', default=False, action='store_true',
                      help='process WikisumCommoncrawl instead of WikisumWeb')
  # Note: these two must be ints, since they are used in slice arithmetic below.
  parser.add_argument('--shard', type=int, default=None,
                      help='if passed, only process this shard (file ids in that range)')
  parser.add_argument('--total_shards', type=int, default=None,
                      help='if passed, only process one shard (total files per shard)')
  parser.add_argument('--workers', default=1, type=int,
                      help='set to -1 to use all cpus')
  args = parser.parse_args()

  if args.for_commoncrawl:
    problem = wikisum.WikisumCommoncrawl()
  else:
    problem = wikisum.WikisumWeb()

  prefix = problem.dataset_filename()
  data_files = sorted(tf.gfile.Glob(os.path.join(args.in_dir, "%s*" % prefix)))

  # Optionally restrict this invocation to a contiguous slice of the files.
  if args.shard is not None and args.total_shards is not None:
    total_num_files = len(data_files)
    shard_len = total_num_files // args.total_shards
    start_offset = args.shard * shard_len
    end_offset = (args.shard + 1) * shard_len
    current_files = data_files[start_offset:end_offset]
  else:
    current_files = data_files

  if args.workers > 1 or args.workers == -1:
    # Fan the per-file work out over a process pool.
    function_args = [{'file': e, 'args': args} for e in current_files]
    cpus = args.workers if args.workers != -1 else mp.cpu_count()
    with mp.Pool(cpus) as p:
      res = list(tqdm(p.imap(process_file, function_args),
                      total=len(function_args),
                      desc=f'processing shards with {cpus} workers'))
  else:
    for tfdataset_file in tqdm(current_files, desc='processing shards'):
      process_file({'file': tfdataset_file, 'args': args})
  print('done')
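
# process_file is defined elsewhere in this script and its exact output
# format is not shown here. A minimal sketch of the expected shape, assuming
# the goal is to iterate a TFRecord shard and dump one JSON record per line
# to out_dir; _sketch_convert_record is a hypothetical stand-in for whatever
# per-example transformation the real script performs.
def _sketch_convert_record(example):
  # Turn an Example's int64 feature lists back into plain Python lists.
  return {name: list(feat.int64_list.value)
          for name, feat in example.features.feature.items()}


def _sketch_process_file(payload):
  fname, args = payload['file'], payload['args']
  out_path = os.path.join(args.out_dir, os.path.basename(fname) + '.jsonl')
  with tf.gfile.Open(out_path, 'w') as out:
    for record in tf.python_io.tf_record_iterator(fname):
      example = tf.train.Example()
      example.ParseFromString(record)
      out.write(json.dumps(_sketch_convert_record(example)) + '\n')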
def main(_):
  if FLAGS.for_commoncrawl:
    problem = wikisum.WikisumCommoncrawl()
  else:
    problem = wikisum.WikisumWeb()
  problem.generate_vocab(FLAGS.out_dir, FLAGS.wikis_dir, FLAGS.refs_dir)
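
# The FLAGS-based entry points above are typically launched through
# tf.app.run (TF 1.x), with the flag definitions living at the top of each
# script; a sketch of the usual boilerplate:
if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.app.run()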