# Example 1 (Exemplo n.º 1)
def tune(n_hidden_in, dropout_in):
    """Train a Model with the given hidden size and dropout rate, evaluate
    it on (up to) 1000 MNIST test examples, and save a smoothed training
    loss curve to a PNG whose name encodes the hyperparameters and accuracy.

    Args:
        n_hidden_in: hidden-layer size forwarded to Model.
        dropout_in: dropout rate forwarded to Model.
    """
    print("Tuning with Hidden Size = {}, Dropout = {}".format(
        n_hidden_in, dropout_in))
    model = Model(n_hidden=n_hidden_in, dropout=dropout_in)

    # Training configuration.
    data = get_mnist_dataset()
    batch_size = 32
    epochs = 15
    examples = 60000
    steps_per_epoch = int(examples / batch_size)

    train_data = data['train'].shuffle(examples,
                                       reshuffle_each_iteration=False)

    losses = []

    # Main training loop: one pass over the shuffled data per epoch.
    for epoch in range(epochs):
        num_correct = 0

        batches = train_data.batch(batch_size).take(steps_per_epoch)
        for step, batch in enumerate(batches):
            x, y = process_batch(batch)

            loss, accuracy = model.train_step(x, y)

            num_correct += accuracy * batch_size
            losses.append(loss)

            seen = (step + 1) * batch_size
            running_acc = num_correct / (batch_size * (step + 1))
            print("Epoch {}, {}/{}".format(epoch, seen, examples) +
                  " " * 10 + "Loss: {}, Accuracy {}".format(
                      loss, running_acc),
                  end='\r',
                  flush=True)
        print()  # Terminate the carriage-return progress line.

    # Evaluate on the test split: 1 per correct prediction, 0 otherwise.
    results = []
    for example in data['test'].take(1000):
        x, _ = process_example(example)
        results.append(
            1 if example['label'] == model.predict_class(x) else 0)

    # 12-tap moving average to smooth the loss curve before plotting.
    losses = np.convolve(np.array(losses),
                         np.ones((12, )) / 12,
                         mode='valid')

    plt.plot(losses)
    plt.savefig("figure-hidden{}-dropout{}-accuracy{}.png".format(
        n_hidden_in, dropout_in,
        sum(results) / len(results)),
                dpi=400)
    plt.clf()
# Example 2 (Exemplo n.º 2)
def main(_):
    """Run BERT over every input document and dump the selected layers'
    hidden states to an HDF5 file, one dataset per (document, sentence).

    Reads flags: layers, bert_config_file, vocab_file, do_lower_case,
    master, num_tpu_cores, input_file, init_checkpoint, use_tpu,
    use_one_hot_embeddings, batch_size, window_size, stride, output_file.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Comma-separated layer indices to export (e.g. "-1,-2,-3,-4").
    layer_indexes = [int(x) for x in FLAGS.layers.split(",")]

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        master=FLAGS.master,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    # FLAGS.input_file is a ';'-separated list of jsonlines files,
    # one JSON document per line.
    json_examples = []
    for file in FLAGS.input_file.split(';'):
        with open(file) as f:
            json_examples.extend(
                (json.loads(jsonline) for jsonline in f.readlines()))

    # Keep both the original example and its BERT-tokenized counterpart;
    # the two lists are index-aligned by document.
    orig_examples = []
    bert_examples = []
    for i, json_e in enumerate(json_examples):
        e = process_example(json_e, i)
        orig_examples.append(e)
        bert_examples.append(e.bertify(tokenizer))

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        layer_indexes=layer_indexes,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_one_hot_embeddings,
    )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=FLAGS.batch_size)

    input_fn = input_fn_builder(examples=bert_examples,
                                window_size=FLAGS.window_size,
                                stride=FLAGS.stride,
                                tokenizer=tokenizer)

    writer = h5py.File(FLAGS.output_file, 'w')
    # Progress is counted in original (pre-BERT-tokenization) tokens.
    with tqdm(total=sum(len(e.tokens) for e in orig_examples)) as t:
        for result in estimator.predict(input_fn, yield_single_examples=True):
            # unique_ids identifies which document this window came from.
            document_index = int(result["unique_ids"])
            bert_example = bert_examples[document_index]
            orig_example = orig_examples[document_index]
            # '/' is the HDF5 group separator, so it cannot appear in a key.
            file_key = bert_example.doc_key.replace('/', ':')

            # Only non-negative extract indices correspond to real tokens.
            t.update(n=(result['extract_indices'] >= 0).sum())

            for output_index, bert_token_index in enumerate(
                    result['extract_indices']):
                # Negative indices mark positions that are not extracted
                # (presumably padding/special tokens — confirm against
                # input_fn_builder).
                if bert_token_index < 0:
                    continue

                # Map BERT sub-token -> original token -> (sentence, token).
                token_index = bert_example.bert_to_orig_map[bert_token_index]
                sentence_index, token_index = orig_example.unravel_token_index(
                    token_index)

                # One dataset per (document, sentence), created lazily with
                # shape (num_tokens, hidden_size, num_layers).
                dataset_key = "{}/{}".format(file_key, sentence_index)
                if dataset_key not in writer:
                    writer.create_dataset(
                        dataset_key,
                        (len(orig_example.sentence_tokens[sentence_index]),
                         bert_config.hidden_size, len(layer_indexes)),
                        dtype=np.float32)

                dset = writer[dataset_key]
                for j, layer_index in enumerate(layer_indexes):
                    layer_output = result["layer_output_%d" % j]
                    dset[token_index, :, j] = layer_output[output_index]
    writer.close()
# Example 3 (Exemplo n.º 3)
        # NOTE(review): this snippet begins inside nested training loops
        # whose `for` headers are not shown here; from the usage below,
        # i is the epoch index and j the batch index — confirm upstream.
        num_correct += accuracy * batch_size
        loss_logs.append(loss)

        # In-place progress line (carriage return, no newline).
        print("Epoch {}, {}/{}".format(i, (j + 1) * batch_size, examples) +
              " " * 10 + "Loss: {}, Accuracy {}".format(
                  loss, num_correct / (batch_size * (j + 1))),
              end='\r',
              flush=True)
    # Save the model after every epoch
    m.save("epoch-{}.pkl".format(i))
    print()  # Print empty newline

# Evaluate the model on the test set
results = []
for ex in data['test'].take(1000):
    x, _ = process_example(ex)
    true = ex['label']
    pred = m.predict_class(x)
    # 1 for a correct prediction, 0 otherwise.
    results.append(1 if true == pred else 0)

print("Final accuracy on test set: {}".format(sum(results) / len(results)))

# Display a few images with predictions because it's fun
print("\n" * 10)
for ex in data['test'].shuffle(100).take(10):
    x, _ = process_example(ex)
    pred = m.predict_class(x)
    print("Predicted: {}".format(pred))
    # Reshape the flat input vector back to a 28x28 MNIST image.
    X = x.reshape((28, 28))
    plt.gray()
    plt.imshow(X)
# Example 4 (Exemplo n.º 4)
import sentencepiece as spm
import json
import os
from data import process_example

input_file = ''

# Load the XLNet SentencePiece vocabulary.
s = spm.SentencePieceProcessor()
s.Load('./xlnet_cased_L-12_H-768_A-12/spiece.model')

# Sanity check: vocabulary size and a sample encoding.
print(s.get_piece_size())
print(s.encode('this is a test'))

# Collect every example from the three English jsonlines splits.
json_examples = []
for x in ['test', 'train', 'dev']:
    path = os.path.join(input_file, x + '.english.jsonlines')
    with open(path) as f:
        json_examples.extend(json.loads(jsonline) for jsonline in f)

# Process each example and print its SentencePiece encoding.
orig_examples = []
bert_examples = []
for i, json_e in enumerate(json_examples):
    e = process_example(json_e, i, should_filter_embedded_mentions=True)
    orig_examples.append(e)
    print(s.encode(' '.join(e.tokens)))
# Example 5 (Exemplo n.º 5)
def main(_):
    """Convert CoNLL-style jsonlines documents into windowed BERT features
    and write them to an HDF5 file, one group of datasets per document.

    Reads flags: vocab_file, input_file, output_file, window_size, stride.
    For each document the datasets input_ids, input_mask, segment_ids,
    extract_mask and extract_sentences are written under the document key
    (with '/' replaced by ':' since '/' is the HDF5 group separator).
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Cased tokenizer: do_lower_case is deliberately False here.
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=False)

    # Gather every example from the three English CoNLL splits.
    json_examples = []
    for x in ['test', 'train', 'dev']:
        with open(os.path.join(FLAGS.input_file,
                               x + '.english.jsonlines')) as f:
            json_examples.extend(
                (json.loads(jsonline) for jsonline in f.readlines()))

    # Original examples and their BERT-tokenized counterparts, index-aligned.
    orig_examples = []
    bert_examples = []
    for i, json_e in enumerate(json_examples):
        e = process_example(json_e, i, should_filter_embedded_mentions=True)
        orig_examples.append(e)
        bert_examples.append(e.bertify(tokenizer))

    # Context manager guarantees the HDF5 file is closed even if an assert
    # fires below (the original only closed it on the success path).
    with h5py.File(FLAGS.output_file, 'w') as writer:
        for data in tqdm(convert_examples_to_features(bert_examples,
                                                      orig_examples,
                                                      FLAGS.window_size,
                                                      FLAGS.stride,
                                                      tokenizer),
                         total=len(json_examples)):
            document_index = int(data["doc_index"][0])
            bert_example = bert_examples[document_index]
            dataset_key = bert_example.doc_key.replace('/', ':')

            # PERF FIX: this flattened token list is invariant across the
            # sentence/token loops below; the original rebuilt it once per
            # token index, which was accidentally quadratic.
            tokens_flattened = [t for ts in data['tokens'] for t in ts]

            # extract_sentences holds 1-based indices into the flattened
            # token list; entries <= 0 are padding and are skipped.
            sentences = []
            for sentence_indices in data['extract_sentences']:
                sentences.append([
                    tokens_flattened[i - 1] for i in sentence_indices if i > 0
                ])

            # Sanity checks: reconstructed sentences must line up with the
            # original tokenization, and their concatenation must equal the
            # BERT tokens that map back to original tokens.
            assert [len(s) for s in sentences] == [
                len(s) for s in orig_examples[document_index].sentence_tokens
            ]
            sentences_flattened = sum(sentences, [])
            expected = [
                t for i, t in enumerate(bert_example.tokens)
                if bert_example.bert_to_orig_map[i] >= 0
            ]
            assert sentences_flattened == expected

            writer.create_dataset('{}/input_ids'.format(dataset_key),
                                  data=data['input_ids'])
            writer.create_dataset('{}/input_mask'.format(dataset_key),
                                  data=data['input_mask'])
            writer.create_dataset('{}/segment_ids'.format(dataset_key),
                                  data=data['segment_ids'])
            writer.create_dataset('{}/extract_mask'.format(dataset_key),
                                  data=data['extract_mask'])
            writer.create_dataset('{}/extract_sentences'.format(dataset_key),
                                  data=data['extract_sentences'])