def main():
    """Preprocess a SQuAD dataset (v1.1 or v2.0) end to end.

    Steps: pick dataset files, load raw data, build the word vocabulary
    restricted to the embedding vocab, load prebuilt POS/NER vocabularies,
    build the embedding matrix, dump a meta pickle, and binarize the
    train/dev sets.
    """
    args = set_args()
    global logger  # module-level logger shared with helpers
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)  # e.g. ./san.log
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
    logger.warning(msg)
    if DEBUG_ON:
        # fixed typo: was '***DEBUGING MODE***'
        logger.error('***DEBUGGING MODE***')
    # args.data_dir is e.g. data/, giving data/train-v2.0.json etc.
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)  # data/dev-v2.0.json
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    # embedding_dim is typically 300; the vector file could also be fasttext
    logger.info('{}-dim word vector path: {}'.format(
        args.embedding_dim, args.glove))
    emb_path = args.glove  # e.g. data/glove.840B.300d.txt
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:  # store_true flag
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load raw data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)
    # Collect the set() of tokens present in the GloVe/fasttext vector file.
    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    # --sort_all (store_true): sort the vocabulary by frequencies of all
    # words; otherwise consider question words first.
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')
    # load prebuilt ner/pos tagging vocabularies
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)
    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               NLP=NLP, v2_on=v2_on)
    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               NLP=NLP, v2_on=v2_on)
    end_time = time.time()
    # fixed typo: was 'processe the data'
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
def main():
    """Preprocess MS MARCO (SQuAD-v2-format) or SQuAD v1.1 data.

    Same pipeline as the SQuAD-only variant, but when --v2_on is set the
    MS MARCO conversions replace the SQuAD v2.0 files, the raw data is
    capped (20000 train / 500 dev examples), and the large embedding/meta
    objects are freed before binarization to reduce peak memory.
    """
    args = set_args()
    global logger  # module-level logger shared with helpers
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        # MS MARCO data converted to SQuAD v2.0 format stands in for the
        # original train-v2.0.json / dev-v2.0.json files.
        train_path = 'msmarco_squad_train.json'
        dev_path = 'msmarco_squad_dev.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
    logger.warning(msg)
    if DEBUG_ON:
        # fixed typo: was '***DEBUGING MODE***'
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)
    # (removed a redundant 'Train path is: ...' log line that duplicated the
    # message below)
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim,
                                                     args.glove))
    emb_path = args.glove  # could be a fasttext embedding as well
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load raw data
    # NOTE(review): the limits are hard-coded, presumably to bound memory and
    # preprocessing time on MS MARCO — consider promoting them to CLI args.
    train_data = load_data(train_path, v2_on=v2_on, limit=20000)
    dev_data = load_data(valid_path, False, v2_on=v2_on, limit=500)
    # tokens present in the GloVe/fasttext vector file
    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')
    # load prebuilt ner/pos tagging vocabularies
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)
    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    # Drop the large embedding matrix (and the meta dict holding it) before
    # tokenizing/binarizing the data to keep peak memory down.
    del meta
    del embedding
    logger.info('deleted meta and embedding')
    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               NLP=NLP, v2_on=v2_on)
    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               NLP=NLP, v2_on=v2_on)
    end_time = time.time()
    # fixed typo: was 'processe the data'
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
def main():
    """Run the SQuAD preprocessing pipeline (BERT-tokenizer variant).

    Selects v1.1 or v2.0 files, builds word/tag/NER vocabularies and the
    embedding matrix, pickles them as a meta file, then binarizes the
    train and dev sets with both spaCy and the BERT tokenizer.
    """
    # Command-line options; the logger is a module-level global so helper
    # functions can use it too.
    args = set_args()
    global logger
    t0 = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_file, dev_file, version = 'train-v2.0.json', 'dev-v2.0.json', 'v2'
    else:
        msg = '~Processing SQuAD v1.1 dataset~'
        train_file, dev_file, version = 'train-v1.1.json', 'dev-v1.1.json', 'v1'
    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_file)
    valid_path = os.path.join(args.data_dir, dev_file)
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim,
                                                     args.glove))
    emb_path = args.glove  # glove-format file; may hold fasttext vectors
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if not args.fasttext_on:
        logger.info('Loading glove vocab.')
    else:
        logger.info('Loading fasttext vocab.')
    # Raw datasets, then the token set available in the vector file.
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)
    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')
    # Prebuilt POS-tag and NER-tag vocabularies are pickled under resource/.
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as fin:
        vocab_tag = pickle.load(fin)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as fin:
        vocab_ner = pickle.load(fin)
    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding,
    }
    with open(meta_path, 'wb') as fout:
        pickle.dump(meta, fout)
    # Binarize train and dev splits (second boolean flag marks training data).
    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               NLP=NLP, v2_on=v2_on, bert_tokenizer=BERT_TOKENIZER)
    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               NLP=NLP, v2_on=v2_on, bert_tokenizer=BERT_TOKENIZER)
    t1 = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (t1 - t0) / 60.))