Example #1
# Import paths follow the SMRC Toolkit (sogou_mrc) package layout; adjust if your installation differs.
from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator
from sogou_mrc.utils.feature_extractor import FeatureExtractor

# Paths to the SQuAD data and the pretrained GloVe embeddings (fill in locally)
data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"
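
# Read the SQuAD v1.1 train/dev files and create the official evaluator for the dev set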
reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)
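
# Build the vocabulary over both splits and load the pretrained GloVe word vectors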
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.840B.300d.txt")
feature_transformer = FeatureExtractor(
    features=['match_lemma', 'match_lower', 'pos', 'ner', 'context_tf'],
    build_vocab_feature_names=set(['pos', 'ner']),
    word_counter=vocab.get_word_counter())
train_data = feature_transformer.fit_transform(dataset=train_data)
eval_data = feature_transformer.transform(dataset=eval_data)
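
# Batch generators for training and evaluation; additional_fields and feature_vocab
# are only required when linguistic features are used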
train_batch_generator = BatchGenerator(
    vocab,
    train_data,
    training=True,
    batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)
eval_batch_generator = BatchGenerator(
    vocab,
    eval_data,
    batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)
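
The feature extractor and the feature-related BatchGenerator arguments are only needed when the model consumes linguistic features. As a minimal sketch of the plain setup, reusing vocab, train_data, and eval_data from above and assuming additional_fields and feature_vocab can simply be omitted when no features are used:

# Without linguistic features there is no FeatureExtractor step, and the batch
# generators only need the vocabulary and the raw datasets (sketch; see note above).
train_batch_generator = BatchGenerator(vocab, train_data, training=True, batch_size=32)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=32)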
Example #2
# Import paths follow the SMRC Toolkit (sogou_mrc) package layout; adjust if your installation differs.
from sogou_mrc.dataset.squad import SquadReader, SquadEvaluator
from sogou_mrc.data.vocabulary import Vocabulary
from sogou_mrc.data.batch_generator import BatchGenerator
from sogou_mrc.utils.feature_extractor import FeatureExtractor
from sogou_mrc.model.drqa import DrQA

# Paths to the SQuAD data and the pretrained GloVe embeddings (fill in locally)
data_folder = ''
embedding_folder = ''
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

# Read the SQuAD train/dev data and create the official evaluator for the dev set
print("step 1: reading the SQuAD train/dev data...")
reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Build the vocabulary and load the pretrained embedding
print("step 2: building the vocabulary and loading the pretrained embedding...")
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

# Run the feature extractor, which is only necessary when using linguistic features
print("step 3: running the feature extractor (only needed when linguistic features are used)...")
feature_transformer = FeatureExtractor(
    features=['match_lemma', 'match_lower', 'pos', 'ner', 'context_tf'],
    build_vocab_feature_names=set(['pos', 'ner']),
    word_counter=vocab.get_word_counter())
train_data = feature_transformer.fit_transform(dataset=train_data)
eval_data = feature_transformer.transform(dataset=eval_data)

# Build the batch generators for training and evaluation; the additional fields and
# the feature vocabulary are only needed when linguistic features are used
print("step 4: building the batch generators for training and evaluation...")
train_batch_generator = BatchGenerator(
    vocab, train_data, training=True, batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)
eval_batch_generator = BatchGenerator(
    vocab, eval_data, batch_size=32,
    additional_fields=feature_transformer.features,
    feature_vocab=feature_transformer.vocab)

# The original DrQA paper trains with the Adamax optimizer (see the sketch after this example)
# Build the built-in model, compile the training ops, and run training and evaluation
print("step 5: building the built-in model, compiling it, and running training and evaluation...")
model = DrQA(vocab, word_embedding, features=feature_transformer.features,
             feature_vocab=feature_transformer.vocab)
model.compile()
# Note: 'eposides' is the parameter spelling used by the toolkit
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator,
                         epochs=40, eposides=2)
model.evaluate(eval_batch_generator, eval_data, evaluator)
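
The comment above notes that the original DrQA paper trains with Adamax, while this example compiles with the toolkit's default optimizer. A hedged sketch of how that could be wired in, assuming model.compile accepts a TensorFlow 1.x optimizer class and an initial learning rate (an assumption about the compile signature, not the toolkit's documented API):

import tensorflow as tf

# Assumption: compile(optimizer_class, initial_learning_rate); check the toolkit's
# BaseModel.compile signature before relying on this. In TF 1.x, Adamax is available
# as tf.contrib.opt.AdaMaxOptimizer; 0.002 is the commonly used Adamax step size.
model.compile(tf.contrib.opt.AdaMaxOptimizer, 0.002)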