def main(_):
  assert FLAGS.out_dir
  assert FLAGS.metadata_dir
  out_dir = os.path.join(FLAGS.out_dir, "process_%d" % FLAGS.task_id)
  tf.gfile.MakeDirs(out_dir)
  with utils.timing("get_refs_commoncrawl"):
    # Get all WET files
    if FLAGS.commoncrawl_wet_dir:
      wet_files = tf.gfile.Glob(
          os.path.join(FLAGS.commoncrawl_wet_dir, "*.wet.gz"))
    else:
      tmp_dir = tempfile.gettempdir()
      wet_files = list(
          utils.wet_download_urls(utils.WET_PATHS_BY_DATE["0917"], tmp_dir))

    # Shard and select this task's work
    wet_files.sort()
    wet_files = utils.shard(wet_files, FLAGS.num_tasks)[FLAGS.task_id]
    tf.logging.info("Sharded out WET files. Processing %d files",
                    len(wet_files))

    wikisum.extract_references_from_wets(wet_files, FLAGS.metadata_dir,
                                         out_dir)
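

# A minimal sketch of the contiguous-sharding helper assumed above (the real
# utils.shard is not shown here and may differ): split a list into num_shards
# roughly equal contiguous slices so each task can index its own work.
def shard(items, num_shards):
  """Splits `items` into `num_shards` contiguous, roughly equal lists."""
  base, remainder = divmod(len(items), num_shards)
  shards = []
  start = 0
  for i in range(num_shards):
    size = base + (1 if i < remainder else 0)
    shards.append(items[start:start + size])
    start += size
  return shards

# Example: shard(list(range(10)), 3) -> [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]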


def main(_):
  urls = get_urls_for_shard_group(
      FLAGS.urls_dir, FLAGS.shard_id, FLAGS.group_id)
  tf.logging.info("Fetching %d URLs for shard %d, group %d",
                  len(urls), FLAGS.shard_id, FLAGS.group_id)

  tf.gfile.MakeDirs(FLAGS.out_dir)
  out_fname = tfrecord_fname(FLAGS.out_dir, FLAGS.shard_id)

  with utils.timing("group_fetch"):
    logging_fnames = {}
    if FLAGS.log_samples:
      logging_fnames["samples"] = os.path.join(
          FLAGS.out_dir, "samples.%d.txt" % FLAGS.shard_id)
    loop = asyncio.get_event_loop()
    num_written = loop.run_until_complete(asyncio.ensure_future(
        fetch_urls(urls, out_fname, logging_fnames)))

  tf.logging.info("Total URLs: %d", len(urls))
  tf.logging.info("Num written: %d", num_written)
  tf.logging.info("Coverage: %.1f", (num_written / len(urls)) * 100)
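

# A minimal sketch of the bounded-concurrency fetch pattern behind fetch_urls,
# assuming aiohttp is the HTTP client. The real fetch_urls also writes fetched
# pages to the TFRecord file and optional sample logs; all names below are
# illustrative, not the actual implementation.
import asyncio

import aiohttp


async def _fetch_one(session, semaphore, url):
  # The semaphore caps in-flight requests so one group doesn't open
  # thousands of sockets at once.
  async with semaphore:
    try:
      async with session.get(url) as response:
        return await response.text()
    except Exception:  # pylint: disable=broad-except
      return None  # Dead links are expected; callers count successes.


async def fetch_all(urls, max_concurrent=100):
  semaphore = asyncio.Semaphore(max_concurrent)
  async with aiohttp.ClientSession(
      timeout=aiohttp.ClientTimeout(total=30)) as session:
    return await asyncio.gather(
        *[_fetch_one(session, semaphore, url) for url in urls])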


def main(_):
  shard_urls = fetch.get_urls_for_shard(FLAGS.urls_dir, FLAGS.shard_id)
  num_groups = int(math.ceil(len(shard_urls) / fetch.URLS_PER_CLIENT))
  tf.logging.info("Launching get_references_web_single_group sequentially for "
                  "%d groups in shard %d. Total URLs: %d",
                  num_groups, FLAGS.shard_id, len(shard_urls))
  command_prefix = FLAGS.command.split() + [
      "--urls_dir=%s" % FLAGS.urls_dir,
      "--shard_id=%d" % FLAGS.shard_id,
      "--debug_num_urls=%d" % FLAGS.debug_num_urls,
  ]
  with utils.timing("all_groups_fetch"):
    for i in range(num_groups):
      command = list(command_prefix)
      out_dir = os.path.join(FLAGS.out_dir, "process_%d" % i)
      command.append("--out_dir=%s" % out_dir)
      command.append("--group_id=%d" % i)
      try:
        # Even on 1 CPU, each group should finish within an hour.
        sp.check_call(command, timeout=60 * 60)
      except sp.TimeoutExpired:
        tf.logging.error("Group %d timed out", i)
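

# A minimal sketch of the utils.timing context manager used throughout these
# scripts, assuming it simply logs wall-clock duration for the wrapped block;
# the real helper may record or report timings differently.
import contextlib
import time

import tensorflow as tf


@contextlib.contextmanager
def timing(name):
  tf.logging.info("Starting: %s", name)
  start = time.time()
  try:
    yield
  finally:
    tf.logging.info("Finished: %s in %.2f seconds",
                    name, time.time() - start)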


def main(_):
  if FLAGS.for_commoncrawl:
    problem = wikisum.WikisumCommoncrawl()
  else:
    problem = wikisum.WikisumWeb()
  out_filepaths = problem.out_filepaths(FLAGS.out_dir)
  out_filepaths = utils.shard(out_filepaths, FLAGS.num_tasks)[FLAGS.task_id]

  if not FLAGS.vocab_dir:
    FLAGS.vocab_dir = FLAGS.out_dir

  shard_ids = utils.shard(
      list(range(utils.NUM_SHARDS)), FLAGS.num_tasks)[FLAGS.task_id]

  with utils.timing("produce_examples"):
    wikisum.produce_examples(
        shard_ids=shard_ids,
        wikis_dir=FLAGS.wikis_dir,
        refs_dir=FLAGS.refs_dir,
        urls_dir=FLAGS.urls_dir,
        vocab_path=os.path.join(FLAGS.vocab_dir, problem.vocab_filename),
        out_filepaths=out_filepaths)
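

# A minimal sketch of how serialized examples could land in the sharded output
# files. The real wikisum.produce_examples first assembles inputs and targets
# from the wiki, reference, and URL shards, so the feature names and the
# (inputs, targets) shape below are assumptions for illustration only.
import tensorflow as tf


def write_examples(examples, out_filepath):
  """Writes (inputs, targets) int-id lists as tf.Examples to one shard file."""
  with tf.python_io.TFRecordWriter(out_filepath) as writer:
    for inputs, targets in examples:
      feature = {
          "inputs": tf.train.Feature(
              int64_list=tf.train.Int64List(value=inputs)),
          "targets": tf.train.Feature(
              int64_list=tf.train.Int64List(value=targets)),
      }
      example = tf.train.Example(features=tf.train.Features(feature=feature))
      writer.write(example.SerializeToString())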