Example #1
import json
import os

def article_iterator(encoder, final_desired_size=1025):
    """Iterate over the articles in the input file, tokenizing each one.

    Assumes `args` (with input_fn, num_folds, fold) plus the helpers
    `tokenize_for_grover_training` and `detokenize` are defined in the
    surrounding script.
    """
    assert os.path.exists(args.input_fn)
    with open(args.input_fn, 'r') as f:
        for l_no, line in enumerate(f):
            # Shard the input: this worker only processes lines in its fold.
            if l_no % args.num_folds == args.fold:
                article = json.loads(line)
                article['input_ids'] = tokenize_for_grover_training(encoder, article, desired_size=final_desired_size,
                                                                    unconditional_prob=.35)
                article['inst_index'] = (l_no // args.num_folds)
                # Log the first 100 articles for sanity checking.
                if article['inst_index'] < 100:
                    print('---\nINPUT{}. {}\n---\nTokens: {}\n'.format(article['inst_index'],
                                                                       detokenize(encoder, article['input_ids']),
                                                                       article['input_ids']
                                                                       ), flush=True)
                # Skip articles that produced no tokens.
                if len(article['input_ids']) == 0:
                    continue
                yield article
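The iterator reads its sharding parameters from a module-level `args` object that is not shown. A minimal argparse sketch that would supply those fields (the names `input_fn`, `num_folds`, and `fold` come from the code above; the defaults and help strings are assumptions) might look like:

import argparse

# Hypothetical argparse setup matching the globals the snippet reads.
parser = argparse.ArgumentParser(description='Tokenize articles for LM training')
parser.add_argument('--input_fn', type=str, required=True,
                    help='JSONL file with one article per line')
parser.add_argument('--num_folds', type=int, default=1,
                    help='Total number of shards to split the input into')
parser.add_argument('--fold', type=int, default=0,
                    help='Which shard this process handles')
args = parser.parse_args()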
Example #2
import collections

import tensorflow as tf

# S3TFRecordWriter, buffered_and_sliding_window_article_iterator,
# create_int_feature, detokenize, encoder, train_file, val_file, and
# args are defined elsewhere in the script.
total_written = 0

with S3TFRecordWriter(train_file) as train_writer, S3TFRecordWriter(
        val_file) as val_writer:
    for article in buffered_and_sliding_window_article_iterator(
            encoder,
            current_desired_size=args.max_seq_length + 1,
            final_desired_size=max(args.max_seq_length + 1, 1025)):
        # Route each example to the train or validation writer.
        writer2use = train_writer if article['split'] == 'train' else val_writer
        assert len(article['input_ids']) == (args.max_seq_length + 1)

        # Serialize the token ids as a tf.train.Example.
        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(article['input_ids'])
        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))

        writer2use.write(tf_example.SerializeToString())
        total_written += 1

        # DEBUG: log the first few examples for sanity checking.
        if article['inst_index'] < 5:
            print("~~~\nSubindex{}. Index {}. ARTICLE: {}\n---\nTokens: {}\n\n".format(
                article['sub_index'], article['inst_index'],
                detokenize(encoder, article['input_ids']),
                article['input_ids']), flush=True)
        # Progress report every 1000 articles.
        if article['inst_index'] % 1000 == 0:
            print("{} articles, {} written".format(article['inst_index'],
                                                   total_written),
                  flush=True)
print("DONE UPLOADING", flush=True)