def _wiki_articles(shard_id, wikis_dir=None):
  """Generates WikipediaArticles from GCS that are part of shard shard_id."""
  if not wikis_dir:
    wikis_dir = WIKI_CONTENT_DIR
  with tf.Graph().as_default():
    dataset = tf.data.TFRecordDataset(
        cc_utils.readahead(
            os.path.join(wikis_dir, WIKI_CONTENT_FILE % shard_id)),
        buffer_size=16 * 1000 * 1000)

    def _parse_example(ex_ser):
      """Parse serialized Example containing Wikipedia article content."""
      features = {
          "url": tf.VarLenFeature(tf.string),
          "title": tf.VarLenFeature(tf.string),
          "section_titles": tf.VarLenFeature(tf.string),
          "section_texts": tf.VarLenFeature(tf.string),
      }
      ex = tf.parse_single_example(ex_ser, features)
      for k in ex.keys():
        ex[k] = ex[k].values
      ex["url"] = ex["url"][0]
      ex["title"] = ex["title"][0]
      return ex

    dataset = dataset.map(_parse_example, num_parallel_calls=32)
    dataset = dataset.prefetch(100)
    record_it = dataset.make_one_shot_iterator().get_next()

    with tf.Session() as sess:
      while True:
        try:
          ex = sess.run(record_it)
        except tf.errors.OutOfRangeError:
          break

        sections = [
            WikipediaSection(title=text_encoder.to_unicode(title),
                             text=text_encoder.to_unicode(text))
            for title, text in zip(ex["section_titles"], ex["section_texts"])
        ]
        yield WikipediaArticle(
            url=text_encoder.to_unicode(ex["url"]),
            title=text_encoder.to_unicode(ex["title"]),
            sections=sections)
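
# A minimal usage sketch, not part of the original pipeline: iterate one
# shard's articles from _wiki_articles and tally section counts. The helper
# name and the default shard id are illustrative assumptions.
def _example_count_sections(shard_id=0, wikis_dir=None):
  """Hypothetical helper: counts sections across all articles in one shard."""
  total_sections = 0
  for article in _wiki_articles(shard_id, wikis_dir=wikis_dir):
    # WikipediaArticle.sections is the list of WikipediaSection tuples built
    # above from section_titles/section_texts.
    total_sections += len(article.sections)
  return total_sections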

def _references_content(ref_files):
  """Returns dict<str ref_url, str ref_content>."""
  example_spec = {
      "url": tf.FixedLenFeature([], tf.string),
      "content": tf.FixedLenFeature([], tf.string),
  }
  data = {}
  for ex in generator_utils.tfrecord_iterator(
      ref_files, gzipped=True, example_spec=example_spec):
    data[ex["url"]] = text_encoder.to_unicode(ex["content"])
  return data
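
# A minimal usage sketch, assumed rather than taken from this module: load
# reference contents for a set of gzipped TFRecord files and look up a single
# URL. The helper name is hypothetical; ref_files is whatever list of shard
# files the caller already has.
def _example_lookup_reference(ref_files, ref_url):
  """Hypothetical helper: returns one reference's text, or None if absent."""
  refs_content = _references_content(ref_files)
  return refs_content.get(ref_url)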

def shell_output(cmd_, **kwargs):
  return text_encoder.to_unicode(sp.check_output(format_cmd(cmd_, **kwargs)))
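
# A minimal usage sketch (an assumption, not original code): run a formatted
# shell command and capture its stdout as unicode. This assumes format_cmd
# substitutes {name} placeholders from the keyword arguments; the helper name
# and the gsutil command are illustrative only.
def _example_list_gcs_dir(gcs_dir):
  """Hypothetical helper: lists a GCS directory via gsutil."""
  return shell_output("gsutil ls {dir}", dir=gcs_dir)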