Example #1
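Train and evaluate a BiDAF model on SQuAD v1.1, restoring serialized batch generators from disk.
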
import sys
sys.path.append('../..')  # make the locally checked-out pytorch_mrc package importable
import logging
import torch
from pytorch_mrc.dataset.squad import SquadReader, SquadEvaluator
from pytorch_mrc.model.bidaf import BiDAF
from pytorch_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

bg_folder = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/bg_data/'
train_bg_file = bg_folder + "bg_train_32b_100d.pkl"
eval_bg_file = bg_folder + "bg_eval_32b_100d.pkl"
dev_file = "/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/dev-v1.1.json"

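# read the SQuAD dev set and set up the official evaluator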
reader = SquadReader()
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

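# restore batch generators serialized earlier (the file names suggest batch size 32, 100d embeddings)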
train_batch_generator = BatchGenerator()
eval_batch_generator = BatchGenerator()
train_batch_generator.load(train_bg_file)
eval_batch_generator.load(eval_bg_file)
vocab = train_batch_generator.get_vocab()

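# build the model on GPU if available, then train with in-training evaluation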
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BiDAF(vocab, device, pretrained_word_embedding=vocab.get_word_embedding())
model.compile()
model.train_and_evaluate(train_batch_generator, eval_batch_generator, evaluator, epochs=20, episodes=2)
Example #2
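Build a vocabulary, GloVe embeddings, and batch generators on a tiny SQuAD sample, as preparation for an R-NET model.
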
import logging
import torch
from pytorch_mrc.data.vocabulary import Vocabulary
from pytorch_mrc.dataset.squad import SquadReader, SquadEvaluator
from pytorch_mrc.model.rnet_hkust import RNET
from pytorch_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
data_folder = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/'
embedding_folder = '/home/len/yingzq/nlp/mrc_dataset/word_embeddings/'
tiny_file = data_folder + "tiny-v1.1.json"
embedding_file = embedding_folder + 'glove.6B.100d.txt'

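# read the tiny dataset (fine_grained mode) and set up the evaluator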
reader = SquadReader(fine_grained=True)
tiny_data = reader.read(tiny_file)
evaluator = SquadEvaluator(tiny_file)

logging.info('building vocab and making embedding...')
vocab = Vocabulary()
vocab.build_vocab(tiny_data, min_word_count=3, min_char_count=10)
vocab.make_word_embedding(embedding_file)
word_embedding = vocab.get_word_embedding()
logging.info('word vocab size: {}, word embedding shape: {}'.format(
    len(vocab.get_word_vocab()), word_embedding.shape))

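# build train and eval batch generators directly from the in-memory data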
train_batch_generator = BatchGenerator()
train_batch_generator.build(vocab, tiny_data, batch_size=32, shuffle=True)
eval_batch_generator = BatchGenerator()
eval_batch_generator.build(vocab, tiny_data, batch_size=32)

Example #3

Build and serialize the vocabulary and batch generators for the full SQuAD v1.1 dataset.

import logging
from pytorch_mrc.data.vocabulary import Vocabulary
from pytorch_mrc.dataset.squad import SquadReader
from pytorch_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# config constants (left undefined in the original snippet; the values below are assumptions)
EMB_DIM = 300         # matches glove.840B.300d.txt used below
BATCH_SIZE = 32       # assumed, same as the other examples
DO_LOWERCASE = False  # assumed; glove.840B.300d is a cased embedding
FINE_GRAINED = False  # assumed

# define data paths
train_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/train-v1.1.json'
dev_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/dev-v1.1.json'
embedding_file = '/home/len/yingzq/nlp/mrc_dataset/word_embeddings/glove.840B.300d.txt'

# paths for the saved artifacts
vocab_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/vocab_data/vocab_{}d_{}.pkl'.format(
    EMB_DIM, 'uncased' if DO_LOWERCASE else 'cased')  # lowercased text yields an uncased vocab
bg_train_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/bg_data/bg_train_{}b_{}d_{}.pkl'.format(
    BATCH_SIZE, EMB_DIM, 'uncased' if DO_LOWERCASE else 'cased')
bg_eval_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/bg_data/bg_eval_{}b_{}d_{}.pkl'.format(
    BATCH_SIZE, EMB_DIM, 'uncased' if DO_LOWERCASE else 'cased')

# read data
reader = SquadReader(fine_grained=FINE_GRAINED)
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)

# build vocab and embedding
vocab = Vocabulary(do_lowercase=DO_LOWERCASE)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
vocab.make_word_embedding(embedding_file)
vocab.save(vocab_file)

logging.info("building train batch generator...")
train_batch_generator = BatchGenerator()
train_batch_generator.build(vocab,
                            train_data,
                            batch_size=BATCH_SIZE,
                            shuffle=True)
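# NOTE: the original snippet ends here. The unused bg_eval_file above suggests it
# went on to build the eval generator and serialize both; assuming BatchGenerator
# exposes a save() mirroring the load() used in Example #1, that could look like:
# eval_batch_generator = BatchGenerator()
# eval_batch_generator.build(vocab, eval_data, batch_size=BATCH_SIZE)
# train_batch_generator.save(bg_train_file)
# eval_batch_generator.save(bg_eval_file)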