def main(_):
    """Run reference extraction over this task's shard of WET files.

    Creates the ``process_<task_id>`` subdirectory under ``FLAGS.out_dir``,
    collects the list of WET files, keeps only the shard selected by
    ``FLAGS.task_id`` out of ``FLAGS.num_tasks``, and passes that shard to
    ``wikisum.extract_references_from_wets`` together with
    ``FLAGS.metadata_dir`` and the per-task output directory.

    Args:
        _: Positional argument that this function does not read.

    Raises:
        ValueError: If ``FLAGS.out_dir`` or ``FLAGS.metadata_dir`` is empty.
    """
    # Explicit checks instead of `assert`: assert statements are removed when
    # Python runs with -O, which would let an empty flag slip through.
    if not FLAGS.out_dir:
        raise ValueError("FLAGS.out_dir must be set")
    if not FLAGS.metadata_dir:
        raise ValueError("FLAGS.metadata_dir must be set")

    out_dir = os.path.join(FLAGS.out_dir, "process_%d" % FLAGS.task_id)
    tf.gfile.MakeDirs(out_dir)

    with utils.timing("get_refs_commoncrawl"):
        # Get all WET files: either glob a provided directory, or build the
        # list from the "0917" entry of utils.WET_PATHS_BY_DATE using the
        # system temp directory.
        # NOTE(review): presumably wet_download_urls fetches into tmp_dir --
        # confirm against utils.
        if FLAGS.commoncrawl_wet_dir:
            wet_files = tf.gfile.Glob(
                os.path.join(FLAGS.commoncrawl_wet_dir, "*.wet.gz"))
        else:
            tmp_dir = tempfile.gettempdir()
            wet_files = list(
                utils.wet_download_urls(utils.WET_PATHS_BY_DATE["0917"],
                                        tmp_dir))

        # Shard and select this task's work. Sorting first makes the shard
        # assignment independent of the order the files were listed in.
        wet_files.sort()
        wet_files = utils.shard(wet_files, FLAGS.num_tasks)[FLAGS.task_id]
        tf.logging.info("Sharded out WET files. Processing %d files",
                        len(wet_files))

        wikisum.extract_references_from_wets(wet_files, FLAGS.metadata_dir,
                                             out_dir)
def main(_):
    """Produce the Wikisum examples assigned to this task.

    Picks the problem class from ``FLAGS.for_commoncrawl``, selects this
    task's slice of both the output file paths and the shard ids, defaults
    ``FLAGS.vocab_dir`` to ``FLAGS.out_dir`` when it is unset, and calls
    ``wikisum.produce_examples`` inside a ``produce_examples`` timing scope.

    Args:
        _: Positional argument that this function does not read.
    """
    problem_cls = (wikisum.WikisumCommoncrawl
                   if FLAGS.for_commoncrawl else wikisum.WikisumWeb)
    problem = problem_cls()

    # Output files for every task, then just the ones owned by this task.
    all_out_filepaths = problem.out_filepaths(FLAGS.out_dir)
    task_out_filepaths = utils.shard(all_out_filepaths,
                                     FLAGS.num_tasks)[FLAGS.task_id]

    # Fall back to the output directory when no vocab directory was given.
    if not FLAGS.vocab_dir:
        FLAGS.vocab_dir = FLAGS.out_dir

    all_shard_ids = list(range(utils.NUM_SHARDS))
    task_shard_ids = utils.shard(all_shard_ids,
                                 FLAGS.num_tasks)[FLAGS.task_id]

    with utils.timing("produce_examples"):
        vocab_path = os.path.join(FLAGS.vocab_dir, problem.vocab_filename)
        wikisum.produce_examples(
            shard_ids=task_shard_ids,
            wikis_dir=FLAGS.wikis_dir,
            refs_dir=FLAGS.refs_dir,
            urls_dir=FLAGS.urls_dir,
            vocab_path=vocab_path,
            out_filepaths=task_out_filepaths,
        )
# Example #3
# 0
def main(_):
  """Generate the Wikisum examples that belong to this task.

  The problem class is chosen by ``FLAGS.for_commoncrawl``. Both the output
  file paths and the shard ids are split with ``utils.shard`` and indexed by
  ``FLAGS.task_id``; ``FLAGS.vocab_dir`` is set to ``FLAGS.out_dir`` when it
  is empty. The call to ``wikisum.produce_examples`` runs inside a
  ``produce_examples`` timing scope.

  Args:
    _: Positional argument that this function does not read.
  """

  def _this_tasks_share(items):
    # Split `items` across all tasks and keep the piece for this task.
    return utils.shard(items, FLAGS.num_tasks)[FLAGS.task_id]

  if FLAGS.for_commoncrawl:
    problem_cls = wikisum.WikisumCommoncrawl
  else:
    problem_cls = wikisum.WikisumWeb
  problem = problem_cls()

  my_out_filepaths = _this_tasks_share(problem.out_filepaths(FLAGS.out_dir))

  # An unset vocab directory defaults to the output directory.
  if not FLAGS.vocab_dir:
    FLAGS.vocab_dir = FLAGS.out_dir

  my_shard_ids = _this_tasks_share(list(range(utils.NUM_SHARDS)))

  with utils.timing("produce_examples"):
    example_kwargs = dict(
        shard_ids=my_shard_ids,
        wikis_dir=FLAGS.wikis_dir,
        refs_dir=FLAGS.refs_dir,
        urls_dir=FLAGS.urls_dir,
        vocab_path=os.path.join(FLAGS.vocab_dir, problem.vocab_filename),
        out_filepaths=my_out_filepaths)
    wikisum.produce_examples(**example_kwargs)
def main(_):
  """Run reference extraction over this task's shard of WET files.

  Creates the ``process_<task_id>`` subdirectory under ``FLAGS.out_dir``,
  collects the list of WET files, keeps only the shard selected by
  ``FLAGS.task_id`` out of ``FLAGS.num_tasks``, and passes that shard to
  ``wikisum.extract_references_from_wets`` together with
  ``FLAGS.metadata_dir`` and the per-task output directory.

  Args:
    _: Positional argument that this function does not read.

  Raises:
    ValueError: If ``FLAGS.out_dir`` or ``FLAGS.metadata_dir`` is empty.
  """
  # Explicit checks instead of `assert`: assert statements are removed when
  # Python runs with -O, which would let an empty flag slip through.
  if not FLAGS.out_dir:
    raise ValueError("FLAGS.out_dir must be set")
  if not FLAGS.metadata_dir:
    raise ValueError("FLAGS.metadata_dir must be set")

  out_dir = os.path.join(FLAGS.out_dir, "process_%d" % FLAGS.task_id)
  tf.gfile.MakeDirs(out_dir)

  with utils.timing("get_refs_commoncrawl"):
    # Get all WET files: either glob a provided directory, or build the list
    # from the "0917" entry of utils.WET_PATHS_BY_DATE using the system temp
    # directory.
    # NOTE(review): presumably wet_download_urls fetches into tmp_dir --
    # confirm against utils.
    if FLAGS.commoncrawl_wet_dir:
      wet_files = tf.gfile.Glob(
          os.path.join(FLAGS.commoncrawl_wet_dir, "*.wet.gz"))
    else:
      tmp_dir = tempfile.gettempdir()
      wet_files = list(
          utils.wet_download_urls(utils.WET_PATHS_BY_DATE["0917"], tmp_dir))

    # Shard and select this task's work. Sorting first makes the shard
    # assignment independent of the order the files were listed in.
    wet_files.sort()
    wet_files = utils.shard(wet_files, FLAGS.num_tasks)[FLAGS.task_id]
    tf.logging.info("Sharded out WET files. Processing %d files",
                    len(wet_files))

    wikisum.extract_references_from_wets(wet_files, FLAGS.metadata_dir, out_dir)