Example #1
# data_folder and embedding_folder are assumed to be defined earlier and to point
# to the SQuAD v1.1 files and the GloVe embedding directory, respectively
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Build a vocabulary and load the pretrained embedding
vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.6B.100d.txt")

# save vocab
vocab_save_path = 'H:/result/bidaf/vocab.json'
vocab.save(vocab_save_path)

# Use the feature extractor, which is only necessary when using linguistic features.
# feature_transformer = FeatureExtractor(features=['match_lemma', 'match_lower', 'pos', 'ner', 'context_tf'],
#                                        build_vocab_feature_names=set(['pos', 'ner']),
#                                        word_counter=vocab.get_word_counter())
# train_data = feature_transformer.fit_transform(dataset=train_data)
# eval_data = feature_transformer.transform(dataset=eval_data)

# Build a batch generator for training and evaluation, where additional features and a feature
# vocabulary are necessary when a linguistic feature is used.
train_batch_generator = BatchGenerator(vocab, train_data, batch_size=64, training=True)

eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=64)
# Train and save checkpoints in save_dir
save_dir = 'H:/result/bidaf'
# Import the built-in model, compile the training operation, and call functions such as
# train_and_evaluate for training and evaluation, as sketched below.
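# A minimal sketch of the step described above, assuming the toolkit's built-in
# BiDAF model; the import path, optimizer, learning rate, and epoch count are
# illustrative assumptions rather than values from the original snippet.
import tensorflow as tf
from sogou_mrc.model.bidaf import BiDAF  # assumed import path

model = BiDAF(vocab, pretrained_word_embedding=word_embedding)
model.compile(tf.train.AdamOptimizer, 0.001)
model.train_and_evaluate(train_batch_generator, eval_batch_generator,
                         evaluator, epochs=15, save_dir=save_dir)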
Example #2
import logging
import os

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
data_folder = '/root/ZX/SMRCToolkit/data_folder/'
embedding_folder = '/root/ZX/SMRCToolkit/embedding_folder/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

reader = SquadReader(fine_grained=True)
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
vocab.save("/root/ZX/SMRCToolkit/vocab_save_folder/testvocab.json")
word_embedding = vocab.make_word_embedding(embedding_folder +
                                           "glove.840B.300d.txt")

# save vocab
vocab_save_path = '/root/ZX/SMRCToolkit/vocab_save_folder/vocab.json'
vocab.save(vocab_save_path)

train_batch_generator = BatchGenerator(vocab,
                                       train_data,
                                       batch_size=50,
                                       training=True)

eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
train_batch_generator.init()

# CoQA data and vocabulary setup
data_folder = os.path.join("/", "home", "baheti", "QADialogueSystem", "Data",
                           "QA_datasets", "coqa/")
train_filename = "coqa-train-v1.0.json"
eval_filename = "coqa-dev-v1.0.json"
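# The load-or-build logic below calls coqa_reader.read(...), which this snippet
# never constructs; a minimal sketch of the assumed setup, where the import path
# and the dialog-history window argument (-1) are assumptions, not part of the original.
from sogou_mrc.dataset.coqa import CoQAReader  # assumed import path
coqa_reader = CoQAReader(-1)  # assumed constructor argument
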
vocab = Vocabulary(do_lowercase=True)
vocab_filepath = os.path.join("models", "vocab.txt")
if os.path.exists(vocab_filepath):
    print("loading from filepath")
    # load from the filepath
    vocab.load(vocab_filepath)
else:
    print("creating vocab as new")
    train_data = coqa_reader.read(data_folder + train_filename, 'train')
    eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
    vocab.build_vocab(train_data + eval_data)
    vocab.save(vocab_filepath)

# Squad seq2seq_train_moses_tokenized
# DATA_DIR = os.path.join("/", "home", "baheti", "QADialogueSystem", "RuleBasedQuestionsToAnswer", "squad_seq2seq_train_moses_tokenized")
# coqa_format_test_save_file = os.path.join(DATA_DIR, "squad_seq2seq_predicted_responses_test_coqa_format.json")
# src_squad_seq2seq_predicted_responses_file = os.path.join(DATA_DIR, "src_squad_seq2seq_predicted_responses_test.txt")
# predictions_save_file = "coqa_predictions_on_squad_seq2seq_predicted_responses_test.txt"

# SQUAD seq2seq dev moses tokenized
DATA_DIR = os.path.join("..", "RuleBasedQuestionsToAnswer",
                        "squad_seq2seq_dev_moses_tokenized")
coqa_format_test_save_file = os.path.join(
    DATA_DIR, "squad_seq2seq_dev_moses_test_coqa_format.json")
src_squad_seq2seq_predicted_responses_file = os.path.join(
    DATA_DIR, "src_squad_seq2seq_dev_moses_test.txt")
predictions_save_file = "coqa_predictions_on_squad_seq2seq_dev_moses_test.txt"
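
# A hedged sketch of how the paths above are typically consumed: read the
# coqa-format test file with the reader and wrap it in a BatchGenerator,
# mirroring the evaluation setup earlier in this example; the prediction call
# itself depends on the chosen model and is shown only as a hypothetical comment.
test_data = coqa_reader.read(coqa_format_test_save_file, 'dev')
test_batch_generator = BatchGenerator(vocab, test_data, batch_size=50)
# predictions = model.evaluate_and_predict(test_batch_generator)  # hypothetical API
# with open(predictions_save_file, "w") as f:
#     f.write("\n".join(predictions))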