import os
import pickle

import spacy

# set_args, create_logger, set_environment, load_glove_vocab, load_data,
# Vocabulary, build_vocab, build_embedding and build_data are provided by
# the repo's own modules, imported elsewhere in this file.


def main():
    args = set_args()
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.info('~Processing SQuAD dataset~')
    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.glove_dim, args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)
    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)

    # load data
    logger.info('Loading data vocab.')
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)
    vocab_tag = Vocabulary.build(nlp.tagger.tag_names, neat=True)
    vocab_ner = Vocabulary.build([''] + nlp.entity.cfg[u'actions']['1'], neat=True)

    logger.info('Build vocabulary')
    vocab = build_vocab(train_data + valid_data, glove_vocab,
                        sort_all=args.sort_all, clean_on=True)
    meta_path = os.path.join(args.data_dir, args.meta)

    logger.info('building embedding')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    # To check vocab token IDs etc., load the meta file written below (squad_meta.pick).
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training and dev data')
    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               thread=args.threads)
    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               thread=args.threads)
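# --- Hedged inspection sketch (not part of the repo) -------------------------
# The comment above suggests loading the pickled meta file to check vocab
# token IDs. A minimal sketch, assuming the file lands at data/squad_meta.pick
# and that Vocabulary supports len() and token-to-id indexing (both assumptions):
import pickle

with open('data/squad_meta.pick', 'rb') as f:
    meta = pickle.load(f)

print(len(meta['vocab']))       # vocabulary size, assuming Vocabulary defines __len__
print(meta['vocab']['the'])     # token -> id lookup, assuming __getitem__
print(meta['embedding'].shape)  # (vocab_size, glove_dim), assuming a numpy array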
import os
from datetime import datetime
from collections import Counter, defaultdict

from src.model import DocReaderModel
from src.batcher import load_meta, BatchGen
from config import set_args
from my_utils.utils import set_environment
from my_utils.log_wrapper import create_logger

args = set_args()

# set model dir
model_dir = args.model_dir
os.makedirs(model_dir, exist_ok=True)
model_dir = os.path.abspath(model_dir)

# set environment
set_environment(args.seed, args.cuda)

# setup logger
logger = create_logger(__name__, to_disk=True, log_file=args.log_file)


def main():
    logger.info('Launching the SAN')
    opt = vars(args)
    logger.info(opt)
    embedding, opt, vocab = load_meta(opt, args.meta)
    max_doc = opt['max_doc']
    smooth = opt['smooth']
    is_rep = opt['is_rep']
    eval_step = opt['eval_step']
    curve_file = opt['curve_file']
import os
import json

from src.batcher import load_meta, BatchGen
from config import set_args
from my_utils.utils import set_environment
from my_utils.log_wrapper import create_logger
from my_utils.squad_eval import evaluate
from my_utils.data_utils import predict_squad, gen_name, gen_gold_name, \
    load_squad_v2_label, compute_acc
from my_utils.squad_eval_v2 import my_evaluation as evaluate_v2

args = set_args()

# set model dir
model_dir = args.model_dir  # default='checkpoint'
os.makedirs(model_dir, exist_ok=True)
model_dir = os.path.abspath(model_dir)  # acquire the absolute path

# set environment
set_environment(args.seed, args.cuda)  # seed default=2018

# setup logger
logger = create_logger(__name__, to_disk=True, log_file=args.log_file)


def load_squad(data_path):
    with open(data_path) as dataset_file:
        dataset_json = json.load(dataset_file)
    dataset = dataset_json['data']
    return dataset


def main():
    logger.info('Launching the SAN')
    opt = vars(args)
    logger.info('Loading data')
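# --- Hedged usage sketch (not part of the repo) ------------------------------
# load_squad returns the raw 'data' list that the standard SQuAD v1.1
# evaluator consumes. Assuming my_utils.squad_eval.evaluate mirrors the
# official script's evaluate(dataset, predictions) signature and that
# predictions is a {question_id: answer_text} dict (both assumptions):
dev_set = load_squad(os.path.join(args.data_dir, 'dev-v1.1.json'))
predictions = {'some-question-id': 'some answer span'}  # illustrative only
metrics = evaluate(dev_set, predictions)  # expected: {'exact_match': ..., 'f1': ...}
print(metrics)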
import os
import pickle
import time

# DEBUG_ON and NLP are module-level globals; set_args, create_logger,
# set_environment, load_data, load_emb_vocab, build_vocab, gen_name,
# build_embedding and build_data come from the repo's own modules.


def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)

    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        # train_path = 'train-v2.0.json'
        # dev_path = 'dev-v2.0.json'
        train_path = 'msmarco_squad_train.json'
        dev_path = 'msmarco_squad_dev.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')

    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on, limit=20000)
    dev_data = load_data(valid_path, False, v2_on=v2_on, limit=500)
    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)

    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True, cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    # free memory before tokenization
    del meta
    del embedding
    logger.info('deleted meta and embedding')

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               NLP=NLP, v2_on=v2_on)
    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               NLP=NLP, v2_on=v2_on)

    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
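# --- Hedged sketch of gen_name (assumption, not the repo's code) -------------
# gen_name's implementation isn't shown here; inferred from the call sites
# above, it plausibly joins data_dir with '<name>_<version>.<suffix>'.
import os

def gen_name(data_dir, name, version, suffix='json'):
    # Assumed filename pattern and default suffix; the real helper may differ.
    return os.path.join(data_dir, '{}_{}.{}'.format(name, version, suffix))

# e.g. gen_name('data/', 'squad_meta', 'v2', suffix='pick') -> 'data/squad_meta_v2.pick'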
import os
import pickle
import time

# set_args, create_logger, set_environment, load_emb_vocab, load_data,
# build_vocab, build_embedding and build_data come from the repo's own
# modules, imported elsewhere in this file.


def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.warning('~Processing SQuAD dataset~')
    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)

    # load data
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)

    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + valid_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True, cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = os.path.join(args.data_dir, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True)
    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, vocab_tag, vocab_ner, dev_fout, False)

    end_time = time.time()
    logger.info('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
import os
import pickle

import spacy

# test_mode is a module-level flag defined elsewhere in this file; set_args,
# create_logger, set_environment, load_glove_vocab, load_data, build_vocab,
# build_embedding and build_data come from the repo's own modules.


def main():
    args = set_args()
    args.datasets = args.datasets.split(',')
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)

    all_data = []
    all_datasets = []
    for dataset_name in args.datasets:
        test_file_prefix = 'test'
        if test_mode:
            if 'marco' in dataset_name:
                train_file_prefix = 'train'
                dev_file_prefix = 'dev'
            else:
                train_file_prefix = 'dev'
                dev_file_prefix = 'dev'
        else:
            train_file_prefix = 'train'
            dev_file_prefix = 'dev'
        logger.info('Processing %s dataset' % dataset_name)
        this_data_dir = args.data_dir + dataset_name + '/'

        train_path = os.path.join(this_data_dir, '%s.json' % train_file_prefix)
        logger.info('The path of training data: {}'.format(train_path))
        train_data = load_data(train_path)
        all_data += train_data

        valid_path = os.path.join(this_data_dir, '%s.json' % dev_file_prefix)
        logger.info('The path of validation data: {}'.format(valid_path))
        valid_data = load_data(valid_path, False)
        all_data += valid_data

        if args.include_test_set and 'squad' not in dataset_name \
                and 'marco2.0' not in dataset_name:
            test_path = os.path.join(this_data_dir, '%s.json' % test_file_prefix)
            logger.info('The path of test data: {}'.format(test_path))
            test_data = load_data(test_path, False)
            all_data += test_data
            all_datasets.append((train_data, valid_data, test_data))
        else:
            all_datasets.append((train_data, valid_data))

    logger.info('{}-dim word vector path: {}'.format(args.glove_dim, args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)
    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)

    multitask_base_path = '../data/mtmrc/'
    with open(multitask_base_path + 'vocab_tag.pick', 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(multitask_base_path + 'vocab_ner.pick', 'rb') as f:
        vocab_ner = pickle.load(f)

    logger.info('Build vocabulary')
    vocab = build_vocab(all_data, glove_vocab, sort_all=args.sort_all,
                        clean_on=True, args=args)
    meta_path = os.path.join(args.output_path, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    for i, item in enumerate(all_datasets):
        dataset_name = args.datasets[i]
        if args.include_test_set and 'squad' not in dataset_name \
                and 'marco2.0' not in dataset_name:
            train_data, valid_data, test_data = item
        else:
            train_data, valid_data = item
        logger.info('building output file for %s' % dataset_name)
        train_fout = os.path.join(args.output_path, dataset_name + '_train.json')
        build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
                   dataset_name=dataset_name)
        dev_fout = os.path.join(args.output_path, dataset_name + '_dev.json')
        build_data(valid_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
                   dataset_name=dataset_name)
        # guard matches the unpacking above ('marco2.0' datasets have no test split here)
        if args.include_test_set and 'squad' not in dataset_name \
                and 'marco2.0' not in dataset_name:
            test_fout = os.path.join(args.output_path, dataset_name + '_test.json')
            build_data(test_data, vocab, vocab_tag, vocab_ner, test_fout, False,
                       dataset_name=dataset_name)
# Fragment: the first line below closes a call whose opening is not shown;
# n_threads, test_data, batch_size, have_gpu, model_root, mtlstm_path,
# max_len, test_embedding and test_file are defined earlier in the file.
                           n_threads=n_threads)
dev_data = BatchGen(test_data, batch_size, have_gpu,
                    is_train=False, with_label=True)
# batches.reset()
# batches = list(batches)

model_path = model_root + 'best_checkpoint.pt'
checkpoint = torch.load(model_path)
opt = checkpoint['config']
set_environment(opt['seed'], have_gpu)
opt['covec_path'] = mtlstm_path
opt['cuda'] = have_gpu
opt['multi_gpu'] = False
opt['max_len'] = max_len
state_dict = checkpoint['state_dict']

model = DocReaderModel(opt, state_dict=state_dict)
model.setup_eval_embed(torch.Tensor(test_embedding))
logger.info('Loaded model!')
if have_gpu:
    model.cuda()

results, score_list = evaluate_squad_v2(model, dev_data)
dev_gold = load_squad_v2(test_file)
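# --- Hedged sketch (assumption): the checkpoint layout expected above --------
# torch.load(model_path) must return a dict with 'config' and 'state_dict'
# keys, so the training side would save something like this (whether the
# repo saves exactly this way is an assumption).
import torch

torch.save({'config': opt, 'state_dict': state_dict},
           model_root + 'best_checkpoint.pt')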
import os
import pickle
import time

# DEBUG_ON and NLP are module-level globals; set_args, create_logger,
# set_environment, load_data, load_emb_vocab, build_vocab, gen_name,
# build_embedding and build_data come from the repo's own modules.


def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)  # ./san.log

    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')

    # args.data_dir='data/', e.g. data/train-v2.0.json
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)  # data/dev-v2.0.json
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(
        args.embedding_dim, args.glove))  # embedding_dim=300

    # could be fasttext embedding
    emb_path = args.glove  # data/glove.840B.300d.txt
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:  # store_true
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)
    # Collect the token set of the GloVe vocabulary.
    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)

    logger.info('Build vocabulary')
    # '--sort_all' (store_true): sort the vocabulary by the frequencies of
    # all words; otherwise consider question words first.
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True, cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               NLP=NLP, v2_on=v2_on)
    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               NLP=NLP, v2_on=v2_on)

    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
import os
import pickle
import time

# DEBUG_ON, NLP and BERT_TOKENIZER are module-level globals; set_args,
# create_logger, set_environment, load_data, load_emb_vocab, build_vocab,
# gen_name, build_embedding and build_data come from the repo's own modules.


def main():
    # Create an argument parser and read arguments from the command line.
    args = set_args()
    # logger will be a global variable
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)

    v2_on = args.v2_on
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD v1.1 dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
        version = 'v1'
    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')

    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)
    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)

    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True, cl_on=False)
    logger.info('Done with vocabulary collection')

    # load the POS-tag and NER label vocabularies shipped in resource/
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               NLP=NLP, v2_on=v2_on, bert_tokenizer=BERT_TOKENIZER)
    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               NLP=NLP, v2_on=v2_on, bert_tokenizer=BERT_TOKENIZER)

    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))