# Example #1 (예제 #1): BiDAF on SQuAD v1.1 with GloVe 6B.100d
# End-to-end pipeline: train a BiDAF model on SQuAD v1.1.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Empty folder prefixes: data and embedding files are expected in the
# current working directory.
data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

# Parse both SQuAD splits; the evaluator is built from the raw dev JSON
# (presumably it needs the gold answers, not the parsed examples — confirm).
reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Vocabulary over train + dev; words seen fewer than 3 times and chars seen
# fewer than 10 times are pruned (presumably mapped to an UNK token — TODO
# confirm against Vocabulary). Embedding matrix is built from GloVe 100d.
vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.6B.100d.txt")

# training=True presumably enables shuffling/epoch looping — verify in
# BatchGenerator.
train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       batch_size=60,
                                       training=True)

eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

# Compile with TF1-style Adam (lr=0.001) and run train/eval for 15 epochs.
# NOTE(review): "eposides" is the toolkit's own (misspelled) keyword
# argument name — do not "fix" the spelling here or the call breaks.
model = BiDAF(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
                         epochs=15,
                         eposides=2)
# Example #2 (예제 #2): case-sensitive SQuAD v1.1 preprocessing + vocab save
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
data_folder = '/root/ZX/SMRCToolkit/data_folder/'
embedding_folder = '/root/ZX/SMRCToolkit/embedding_folder/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader(fine_grained=True)
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
vocab.save("/root/ZX/SMRCToolkit/vocab_save_folder/testvocab.json")
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.840B.300d.txt")

# save vocab
vocab_save_path = '/root/ZX/SMRCToolkit/vocab_save_folder/vocab.json'
vocab.save(vocab_save_path)

train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       batch_size=50,
                                       training=True)

eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
train_batch_generator.init()

content = ""
question = ""
# Example #3 (예제 #3): BiDAF on CMRC 2018 (Chinese MRC)
# End-to-end pipeline: train a BiDAF model on CMRC 2018 (Chinese MRC).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Empty prefixes: files are expected in the current working directory.
data_folder = ''
embedding_folder = ''
train_file = data_folder + "cmrc2018_train.json"
dev_file = data_folder + "cmrc2018_dev.json"

# CMRC-specific reader/evaluator; evaluator is built from the raw dev JSON.
reader = CMRCReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = CMRCEvaluator(dev_file)

# do_lowercase=False: keep case (and Chinese text) untouched.
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
# NOTE(review): the bare folder path (no filename) is passed here, unlike the
# other examples which append a GloVe file — confirm make_word_embedding
# accepts a directory, or this looks like a missing filename.
word_embedding = vocab.make_word_embedding(embedding_folder)
train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       batch_size=32,
                                       training=True)
# NOTE(review): eval batch_size (60) differs from train (32) — possibly
# intentional (eval needs no gradients), but worth confirming.
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=60)

# 300-d word embeddings; train 50 epochs with Adam (lr=0.001).
# NOTE(review): "eposides" is the toolkit's own (misspelled) keyword
# argument name — do not rename it here.
model = BiDAF(vocab,
              pretrained_word_embedding=word_embedding,
              word_embedding_size=300)
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator,
                         eval_batch_generator,
                         evaluator,
                         epochs=50,
                         eposides=2)
# Example #4 (예제 #4): SQuAD v2.0 preprocessing with timing
# Data/vocabulary preparation for SQuAD v2.0, with wall-clock timing of the
# read and vocab/embedding stages logged via the logging module.
train_file = "train-v2.0.json"
dev_file = "dev-v2.0.json"

# Time the read/segmentation stage.
t0 = time.time()
reader = SquadV2Reader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadV2Evaluator(dev_file)
cost = time.time() - t0
logging.info("seg cost=%.3f" % cost)

# Time vocabulary construction + embedding-matrix build.
t0 = time.time()
vocab = Vocabulary(do_lowercase=True)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
# init_scale presumably controls the random-init range for words missing
# from GloVe — TODO confirm against make_word_embedding.
word_embedding = vocab.make_word_embedding("glove.840B.300d.txt",
                                           init_scale=0.05)
cost = time.time() - t0
logging.info("make vocab cost=%.3f" % cost)

# SQuAD v2 has unanswerable questions; "abstractive_answer_mask" is the
# extra per-example field carried through batching for that case —
# presumably consumed by a v2-aware model (confirm downstream).
train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    batch_size=16,
    training=True,
    additional_fields=["abstractive_answer_mask"])
eval_batch_generator = BatchGenerator(
    vocab,
    eval_data,
    batch_size=16,
    training=False,
    additional_fields=["abstractive_answer_mask"])