Пример #1
0
def _wiki_articles(shard_id, wikis_dir=None):
  """Generates WikipediaArticles from GCS that are part of shard shard_id."""
  if not wikis_dir:
    wikis_dir = WIKI_CONTENT_DIR
  with tf.Graph().as_default():
    dataset = tf.data.TFRecordDataset(
        cc_utils.readahead(
            os.path.join(wikis_dir, WIKI_CONTENT_FILE % shard_id)),
        buffer_size=16 * 1000 * 1000)

    def _parse_example(ex_ser):
      """Parse serialized Example containing Wikipedia article content."""
      features = {
          "url": tf.VarLenFeature(tf.string),
          "title": tf.VarLenFeature(tf.string),
          "section_titles": tf.VarLenFeature(tf.string),
          "section_texts": tf.VarLenFeature(tf.string),
      }
      ex = tf.parse_single_example(ex_ser, features)
      for k in ex.keys():
        ex[k] = ex[k].values
      ex["url"] = ex["url"][0]
      ex["title"] = ex["title"][0]
      return ex

    dataset = dataset.map(_parse_example, num_parallel_calls=32)
    dataset = dataset.prefetch(100)
    record_it = dataset.make_one_shot_iterator().get_next()

    with tf.Session() as sess:
      while True:
        try:
          ex = sess.run(record_it)
        except tf.errors.OutOfRangeError:
          break

        sections = [
            WikipediaSection(title=text_encoder.to_unicode(title),
                             text=text_encoder.to_unicode(text))
            for title, text in zip(ex["section_titles"], ex["section_texts"])
        ]
        yield WikipediaArticle(
            url=text_encoder.to_unicode(ex["url"]),
            title=text_encoder.to_unicode(ex["title"]),
            sections=sections)
Пример #2
0
def _references_content(ref_files):
  """Returns dict<str ref_url, str ref_content>."""
  example_spec = {
      "url": tf.FixedLenFeature([], tf.string),
      "content": tf.FixedLenFeature([], tf.string),
  }
  data = {}
  for ex in generator_utils.tfrecord_iterator(
      ref_files, gzipped=True, example_spec=example_spec):
    data[ex["url"]] = text_encoder.to_unicode(ex["content"])
  return data
Пример #3
0
def shell_output(cmd_, **kwargs):
    return text_encoder.to_unicode(sp.check_output(format_cmd(cmd_, **kwargs)))
Пример #4
0
def shell_output(cmd_, **kwargs):
  return text_encoder.to_unicode(sp.check_output(format_cmd(cmd_, **kwargs)))