# Assumed imports for the names these functions reference; the exact
# module paths may differ in the source repository (ELECTRA ships
# build_pretraining_dataset and utils as local modules).
import os
import random
import tarfile
import time
from distutils.dir_util import copy_tree

import tensorflow as tf

import build_pretraining_dataset
import utils


def write_examples(job_id, args):
    """A single process creating and writing out pre-processed examples."""
    job_tmp_dir = os.path.join(args.data_dir, "tmp", "job_" + str(job_id))
    owt_dir = os.path.join(args.data_dir, "openwebtext")

    def log(*args):
        msg = " ".join(map(str, args))
        print("Job {}:".format(job_id), msg)

    log("Creating example writer")
    example_writer = build_pretraining_dataset.ExampleWriter(
        job_id=job_id,
        vocab_file=os.path.join(args.data_dir, "vocab.txt"),
        output_dir=os.path.join(args.data_dir, "pretrain_small_tfrecords"),
        max_seq_length=args.max_seq_length,
        num_jobs=args.num_processes,
        blanks_separate_docs=False,
        do_lower_case=args.do_lower_case)

    log("Writing tf examples")
    # Deterministic sharding: job i takes every num_processes-th archive.
    fnames = sorted(tf.io.gfile.listdir(owt_dir))
    fnames = [f for (i, f) in enumerate(fnames)
              if i % args.num_processes == job_id]
    random.shuffle(fnames)

    start_time = time.time()
    count = 0
    for file_no, fname in enumerate(fnames):
        # MAX_DATA_ROW is a module-level cap (not shown in this excerpt)
        # on how many archives each job processes.
        if count >= MAX_DATA_ROW:
            break
        count += 1
        if file_no > 0 and file_no % 10 == 0:
            elapsed = time.time() - start_time
            log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, "
                "{:} examples written".format(
                    file_no, len(fnames), 100.0 * file_no / len(fnames),
                    int(elapsed),
                    int((len(fnames) - file_no) / (file_no / elapsed)),
                    example_writer.n_written))
        # Extract one tar archive of plain-text files into a fresh temp dir,
        # then feed every extracted file to the example writer.
        utils.rmkdir(job_tmp_dir)
        with tarfile.open(os.path.join(owt_dir, fname)) as f:
            f.extractall(job_tmp_dir)
        extracted_files = tf.io.gfile.listdir(job_tmp_dir)
        random.shuffle(extracted_files)
        for txt_fname in extracted_files:
            example_writer.write_examples(os.path.join(job_tmp_dir, txt_fname))
    example_writer.finish()
    log("Done!")
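# Expected on-disk layout for the OpenWebText variant above, inferred
# from the paths it builds (a sketch, not authoritative):
#
#   <data_dir>/vocab.txt                    WordPiece vocabulary
#   <data_dir>/openwebtext/*.tar            one tar of plain-text files
#                                           per shard
#   <data_dir>/tmp/job_<id>/                per-job scratch dir, recreated
#                                           for every archive
#   <data_dir>/pretrain_small_tfrecords/    tfrecord output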
# Variant of the same function adapted to Japanese Wikipedia data: it
# reads shard directories instead of tar archives and passes a tokenizer
# model file (wiki-ja.model) to the example writer.
def write_examples(job_id, args):
    """A single process creating and writing out pre-processed examples."""
    job_tmp_dir = os.path.join(args.data_dir, "tmp", "job_" + str(job_id))
    owt_dir = os.path.join(args.data_dir, "wiki")

    def log(*args):
        msg = " ".join(map(str, args))
        print("Job {}:".format(job_id), msg)

    log("Creating example writer")
    example_writer = build_pretraining_dataset.ExampleWriter(
        job_id=job_id,
        model_file=os.path.join(args.model_dir, "wiki-ja.model"),
        vocab_file=os.path.join(args.model_dir, "wiki-ja.vocab"),
        output_dir=os.path.join(args.model_dir, "pretrain_tfrecords"),
        max_seq_length=args.max_seq_length,
        num_jobs=args.num_processes,
        blanks_separate_docs=False,
        do_lower_case=args.do_lower_case)

    log("Writing tf examples")
    # Keep only extension-less entries: the extracted shard directories.
    fnames = [f for f in tf.io.gfile.listdir(owt_dir) if '.' not in f]
    fnames = sorted(fnames)
    fnames = [f for (i, f) in enumerate(fnames)
              if i % args.num_processes == job_id]
    random.shuffle(fnames)

    for file_no, fname in enumerate(fnames):
        log("file number: {}".format(file_no))
        # Copy one shard directory into a fresh temp dir and feed every
        # text file except the concatenated all.txt to the example writer.
        utils.rmkdir(job_tmp_dir)
        copy_tree(os.path.join(owt_dir, fname), job_tmp_dir)
        list_files = [fi for fi in tf.io.gfile.listdir(job_tmp_dir)
                      if fi != 'all.txt']
        for file_name in list_files:
            example_writer.write_examples(os.path.join(job_tmp_dir, file_name))
    example_writer.finish()
    log("Done!")
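# A minimal sketch of a driver for the wiki variant above, assuming an
# argparse namespace with the fields the function reads (data_dir,
# model_dir, num_processes, max_seq_length, do_lower_case). The
# multiprocessing fan-out and the flag defaults are illustrative
# assumptions, not the original script's confirmed entry point.
import argparse
import multiprocessing


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", required=True,
                        help="Directory holding the extracted wiki/ shards.")
    parser.add_argument("--model-dir", required=True,
                        help="Directory with wiki-ja.model / wiki-ja.vocab.")
    parser.add_argument("--num-processes", default=4, type=int)
    parser.add_argument("--max-seq-length", default=512, type=int)
    parser.add_argument("--do-lower-case", action="store_true")
    args = parser.parse_args()

    if args.num_processes == 1:
        write_examples(0, args)
        return
    # Each worker takes the shard directories where
    # index % num_processes == job_id, so the jobs never overlap.
    jobs = [multiprocessing.Process(target=write_examples, args=(i, args))
            for i in range(args.num_processes)]
    for job in jobs:
        job.start()
    for job in jobs:
        job.join()


if __name__ == "__main__":
    main()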