def func(gpu_id):
    config.set_args(args.gpu_ids.split(',')[gpu_id])
    tester = Tester(Model(), cfg)
    tester.load_weights(test_model)
    # slice of the detection indices handled by this GPU
    test_range = [ranges[gpu_id], ranges[gpu_id + 1]]
    if cfg.MODEL.occluded_detection:
        return test_net_occ(tester, dets, test_range, gpu_id, d.sigmas)
    return test_net(tester, dets, test_range, gpu_id, d.sigmas)
def main():
    args = set_args()

    # loading EmoContext data
    print("loading data")
    data = EMO(args)

    setattr(args, 'word_vocab_size', len(data.TEXT.vocab))
    setattr(args, 'model_time', strftime('%H:%M:%S', gmtime()))
    setattr(args, 'class_size', len(data.LABEL.vocab))
    setattr(args, 'max_word_len', data.max_word_len)
    setattr(args, 'char_vocab_size', len(data.char_vocab))
    setattr(args, 'FILTER_SIZES', [1, 3, 5])
    print(args.char_vocab_size)
    print('Vocab Size: ' + str(len(data.TEXT.vocab)))

    if args.ss_emb:
        build_sswe_vectors()

    best_model, max_test_acc, max_test_f1 = train(args, data)

    model_name_str = 'NN4EMO_' + args.name_tag
    if not os.path.exists('saved_models'):
        os.makedirs('saved_models')
    model_name = f'{model_name_str}_{args.model_time}_{max_test_acc:.4f}_{max_test_f1:.4f}.pt'
    torch.save(best_model, 'saved_models/' + model_name)

    print('training finished!')

    submission(args, model_name)
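# Nearly every entry point in this file calls a repository-specific set_args()
# helper. Purely as a hypothetical sketch of that pattern (the flag names below
# are placeholders, not the real options of any of these projects), it is an
# argparse wrapper along these lines:
import argparse


def set_args_sketch():
    """Hypothetical stand-in for the per-project set_args() helpers."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42)              # placeholder flag
    parser.add_argument('--data_dir', type=str, default='data/')     # placeholder flag
    parser.add_argument('--log_file', type=str, default='run.log')   # placeholder flag
    return parser.parse_args()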
def main():
    args = set_args()
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.info('~Processing SQuAD dataset~')

    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.glove_dim, args.glove))

    glove_path = args.glove
    glove_dim = args.glove_dim
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)

    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)

    # load data
    logger.info('Loading data vocab.')
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)

    vocab_tag = Vocabulary.build(nlp.tagger.tag_names, neat=True)
    vocab_ner = Vocabulary.build([''] + nlp.entity.cfg[u'actions']['1'], neat=True)

    logger.info('Build vocabulary')
    vocab = build_vocab(train_data + valid_data, glove_vocab,
                        sort_all=args.sort_all, clean_on=True)

    meta_path = os.path.join(args.data_dir, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    # If you want to check vocab token IDs, etc., load the meta file below (squad_meta.pick).
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('started the function build_data')
    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               thread=args.threads)
    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               thread=args.threads)
def __init__(self, embedding_pre=None):
    super(BiLSTM_ATT, self).__init__()
    args = set_args()
    self.hidden_size = args.hidden_size
    self.tag_size = args.tag_size

    # 1. word embedding: trained from scratch, or initialized from a
    #    pre-trained matrix and kept trainable (freeze=False)
    if args.is_train_embedding:
        self.word_embeds = nn.Embedding(args.vocab_size, args.embed_dim)
    else:
        self.word_embeds = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_pre), freeze=False)

    # two position embeddings
    self.pos1_embeds = nn.Embedding(args.pos_size, args.pos_dim)
    self.pos2_embeds = nn.Embedding(args.pos_size, args.pos_dim)

    self.lstm = nn.LSTM(input_size=args.embed_dim + args.pos_dim * 2,
                        hidden_size=args.hidden_size // 2,
                        num_layers=1,
                        bidirectional=True,
                        batch_first=True)
    self.dropout_lstm = nn.Dropout(p=0.5)
    self.dropout_att = nn.Dropout(p=0.5)
    self.relation_embeds = nn.Embedding(args.tag_size, self.hidden_size)
def main():
    args = set_args()
    setattr(args, 'model_time', strftime('%H:%M:%S', gmtime()))
    setattr(args, 'class_size', 4)

    # loading EmoContext data
    print("loading data")
    train_dataloader, valid_dataloader, num_train_examples = getDataLoaders(args)

    best_model, max_dev_f1 = train(args, train_dataloader, valid_dataloader,
                                   num_train_examples)

    if not os.path.exists('saved_models'):
        os.makedirs('saved_models')
    torch.save(best_model,
               f'saved_models/BERT_{args.model_time}_{max_dev_f1}.pt')

    print('training finished!')
def main():
    # set the training arguments
    args = set_args()
    # set the random seeds so the run is reproducible
    if args.seed:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
    # load the model config
    model_config = GPT2Config.from_json_file(args.config_path)
    # instantiate GPT2LMHeadModel: load a pretrained checkpoint if one is given,
    # otherwise train from scratch
    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # no pretrained model specified, so initialize the model from the config
        model = GPT2LMHeadModel(config=model_config)
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True)
    # Treat "[Space]" as a single token. For example, for "我爱[Space]中国。" the stock
    # tokenizer produces ['我', '爱', '[', 'Space', ']', '中', '国', '。'];
    # after adding the special token it produces ['我', '爱', '[Space]', '中', '国', '。'].
    tokenizer.add_tokens("[Space]", special_tokens=True)
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # load the training and test data
    train_data = GPT2NewsTitleDataSet(tokenizer, args.max_len, args.title_max_len,
                                      args.data_dir, "train", args.train_file_path)
    test_data = GPT2NewsTitleDataSet(tokenizer, args.max_len, args.title_max_len,
                                     args.data_dir, "test", args.test_file_path)
    # start training
    train(model, train_data, test_data, args)
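# The effect of registering "[Space]" as a special token (see the comment in the
# function above) can be checked in isolation. A minimal sketch, assuming a
# Chinese BERT vocabulary such as "bert-base-chinese"; the original script loads
# its vocabulary from args.vocab_path instead.
from transformers import BertTokenizer

_tok = BertTokenizer.from_pretrained("bert-base-chinese", do_lower_case=True)
print(_tok.tokenize("我爱[Space]中国。"))
# before: "[Space]" is split into several pieces (the exact split depends on the vocab)
_tok.add_tokens("[Space]", special_tokens=True)
print(_tok.tokenize("我爱[Space]中国。"))
# after: ['我', '爱', '[Space]', '中', '国', '。']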
def forward(self, batch):
    doc_input, query_input, \
        doc_emb, query_emb, \
        doc_cove_low, doc_cove_high, \
        query_cove_low, query_cove_high, \
        doc_mask, query_mask, \
        doc_elmo, query_elmo = self.lexicon_encoder(batch)

    query_list, doc_list = [], []
    query_list.append(query_input)
    doc_list.append(doc_input)

    # doc encode
    if self.opt['elmo_on']:
        doc_low = self.doc_encoder_low(
            torch.cat([doc_input, doc_cove_low, doc_elmo[0]], 2), doc_mask)
    else:
        doc_low = self.doc_encoder_low(
            torch.cat([doc_input, doc_cove_low], 2), doc_mask)
    doc_low = self.dropout(doc_low)
    if self.opt['elmo_on']:
        doc_high = self.doc_encoder_high(
            torch.cat([doc_low, doc_cove_high, doc_elmo[1]], 2), doc_mask)
    else:
        doc_high = self.doc_encoder_high(
            torch.cat([doc_low, doc_cove_high], 2), doc_mask)
    doc_high = self.dropout(doc_high)

    # query encode
    if self.opt['elmo_on']:
        query_low = self.query_encoder_low(
            torch.cat([query_input, query_cove_low, query_elmo[0]], 2), query_mask)
    else:
        query_low = self.query_encoder_low(
            torch.cat([query_input, query_cove_low], 2), query_mask)
    query_low = self.dropout(query_low)
    if self.opt['elmo_on']:
        query_high = self.query_encoder_high(
            torch.cat([query_low, query_cove_high, query_elmo[1]], 2), query_mask)
    else:
        query_high = self.query_encoder_high(
            torch.cat([query_low, query_cove_high], 2), query_mask)
    query_high = self.dropout(query_high)

    query_mem_hiddens = self.query_understand(
        torch.cat([query_low, query_high], 2), query_mask)
    query_mem_hiddens = self.dropout(query_mem_hiddens)
    query_list = [query_low, query_high, query_mem_hiddens]
    doc_list = [doc_low, doc_high]

    query_att_input = torch.cat(
        [query_emb, query_cove_high, query_low, query_high], 2)
    doc_att_input = torch.cat([doc_emb, doc_cove_high] + doc_list, 2)
    if self.opt['elmo_on'] and self.opt['elmo_att_on']:
        idx = -2 if self.opt['elmo_self_att_on'] else -1
        doc_att_input = torch.cat([doc_att_input, doc_elmo[idx]], 2)
        query_att_input = torch.cat([query_att_input, query_elmo[idx]], 2)

    # setup logger
    args = set_args()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    # logger.warning('doc_self_hiddens {}{}{}'.format(doc_self_hiddens.output_size, doc_mem_gen.output_size, query_sum_attn.output_size))
    # logger.warning('before att {}{}{}'.format(doc_att_input.shape, query_att_input.shape, query_mask.shape, query_low.shape, query_mem_hiddens.shape))
    # before att torch.Size([64, 246, 1412]) torch.Size([64, 37, 1412]) torch.Size([64, 37])
    # s = ConvAtt(doc_att_input, query_att_input, 0.5)
    # s = s.cuda()
    # a = s()
    doc_attn_hiddens = self.deep_attn(doc_att_input, query_att_input,
                                      query_list, query_mask)
    # logger.warning('before att {}'.format(doc_attn_hiddens.shape))
    # before att torch.Size([64, 246, 768])
    doc_attn_hiddens = self.dropout(doc_attn_hiddens)
    # doc_attn_hiddens = self.dropout(a)
    doc_mem_hiddens = self.doc_understand(
        torch.cat([doc_attn_hiddens] + doc_list, 2), doc_mask)
    doc_mem_hiddens = self.dropout(doc_mem_hiddens)
    doc_mem_inputs = torch.cat([doc_attn_hiddens] + doc_list, 2)
    if self.opt['self_attention_on']:
        doc_att = torch.cat(
            [doc_mem_inputs, doc_mem_hiddens, doc_cove_high, doc_emb], 2)
        if self.opt['elmo_on'] and self.opt['elmo_self_att_on']:
            doc_att = torch.cat([doc_att, doc_elmo[-1]], 2)
        doc_self_hiddens = self.doc_self_attn(doc_att, doc_att, doc_mask,
                                              x3=doc_mem_hiddens)
        doc_mem = self.doc_mem_gen(
            torch.cat([doc_mem_hiddens, doc_self_hiddens], 2), doc_mask)
    else:
        doc_mem = doc_mem_hiddens

    query_mem = self.query_sum_attn(query_mem_hiddens, query_mask)
    start_scores, end_scores = self.decoder(doc_mem, query_mem, doc_mask)
    # logger.warning('query_mem {}'.format(query_mem.shape))
    # logger.warning('hiddens {}'.format(query_mem_hiddens.shape))
    pred_score = None
    if self.classifier is not None:
        doc_sum = self.doc_sum_attn(doc_mem, doc_mask)
        pred_score = torch.sigmoid(
            self.classifier(doc_sum, query_mem, doc_mask))
    return start_scores, end_scores, pred_score
        # (snippet begins mid-statement; the computation of `rewards` above is truncated in the source)
        scores).contiguous(), device)
        batch_critic_loss = critic_criterion(scores, rewards)
        critic_loss += batch_critic_loss.item()
        batch_rl_loss = a2c_loss(points, logits, rewards, scores.detach())
        rl_loss += batch_rl_loss.item()
        batch_loss = batch_critic_loss + batch_rl_loss
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if (step + 1) % opt['print_every'] == 0:
            print('step ' + str(step + 1) + '/' + str(len(data.train_loader))
                  + ': critic loss ' + str(critic_loss)
                  + ' rl loss ' + str(rl_loss))
            critic_loss = 0
            rl_loss = 0
        if (step + 1) % opt['validate_every'] == 0:
            validate(step, extractor, abstractor, data.valid_loader, device)


if __name__ == '__main__':
    opt = set_args()
    opt['mode'] = 'r'
    opt['model_time'] = strftime('%H:%M:%S', gmtime())
    data = CnnDm(opt)
    opt['vocab_size'] = len(data.vocab)
    train(opt, data)
def __init__(self):
    self.args = config.set_args()
    self.filename = self.args["yolo_comm_txt"]
    self.image_name = self.args["yolo_comm_img"]
    self.slave = CommSlave(filename=self.filename)
    self.detector = Detector()
from tfflat.utils import mem_info  # used by parse_args() below


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=str, dest='gpu_ids')
    parser.add_argument('--continue', dest='continue_train', action='store_true')
    parser.add_argument('--cfg', type=str, dest="cfg")
    args = parser.parse_args()

    if not args.gpu_ids:
        args.gpu_ids = str(np.argmin(mem_info()))

    if '-' in args.gpu_ids:
        gpus = args.gpu_ids.split('-')
        gpus[0] = 0 if not gpus[0].isdigit() else int(gpus[0])
        gpus[1] = len(mem_info()) if not gpus[1].isdigit() else int(gpus[1]) + 1
        args.gpu_ids = ','.join(map(lambda x: str(x), list(range(*gpus))))

    return args


args = parse_args()
cfg.set_config(args.cfg, train=True)
cfg.set_args(args.gpu_ids, args.continue_train)

random.seed(2233)

# import the model and trainer only after the GPU settings have been applied
from model import Model
from tfflat.base import Trainer

trainer = Trainer(Model(), cfg.cfg)
trainer.train()
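# For reference, a standalone illustration of the '--gpu a-b' expansion inside
# parse_args() above, with the mem_info() fallbacks omitted:
gpu_spec = '1-3'
bounds = gpu_spec.split('-')                    # ['1', '3']
bounds = [int(bounds[0]), int(bounds[1]) + 1]   # [1, 4] -> half-open range
print(','.join(str(x) for x in range(*bounds)))  # prints '1,2,3'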
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True,
                           log_file=args.log_file)  # ./san.log
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(
        args.data_dir, train_path)  # args.data_dir=data/, data/train-v2.0.json
    valid_path = os.path.join(args.data_dir, dev_path)  # data/dev-v2.0.json

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(
        args.embedding_dim, args.glove))  # embedding_dim=300

    # could be fasttext embedding
    emb_path = args.glove  # data/glove.840B.300d.txt
    embedding_dim = args.embedding_dim
    set_environment(args.seed)

    if args.fasttext_on:  # store_true
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)

    # read the embedding file and collect its tokens into a set
    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    # '--sort_all' (store_true): sort the vocabulary by the frequencies of all
    # words; otherwise consider question words first.
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               NLP=NLP, v2_on=v2_on)
    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               NLP=NLP, v2_on=v2_on)
    end_time = time.time()
    logger.warning('It took {} minutes in total to process the data!'.format(
        (end_time - start_time) / 60.))
from torch.autograd import Variable
from utils.util import *
import time
import argparse

"""
Description:
    This is a script of the Gradient-based Foreground Adjustment Algorithm.
    The (x, y, scale) of each foreground object is adjusted guided by the model's gradient.
"""

# ========================== Constants =====================
parser = argparse.ArgumentParser(description='Inference Phase')

# note: rebinds the name `time` from the module to a timestamp string
time = time.gmtime()
time = "-".join([str(p) for p in list(time)[:5]])

config = set_args()
test_fg = []
SAMPLE_NUM = config['sample_num']
ROUND = config['update_rd']
TOPK = config['top_k']
start_x = 0
start_y = 0
# candidate moves: (dx, dy, scale factor)
fx = [[-1, 0, 1], [1, 0, 1], [0, -1, 1], [0, 1, 1],
      [-1, 0, 0.95], [1, 0, 0.95], [0, -1, 0.95], [0, 1, 0.95],
      [-1, 0, 1.05], [1, 0, 1.05], [0, -1, 1.05], [0, 1, 1.05]]

# ======================== loading ckpt ================== #
ckpt = os.path.join(
    "checkpoints",
def main():
    args = set_args()
    # load the training set
    with gzip.open(args.train_data_path, 'rb') as f:
        train_features = pickle.load(f)
    # load the validation set
    with gzip.open(args.dev_data_path, 'rb') as f:
        eval_features = pickle.load(f)

    # total number of optimization steps
    num_train_steps = int(
        len(train_features) / args.train_batch_size /
        args.gradient_accumulation_steps * args.num_train_epochs)

    # build the model
    model = Model()

    # run on (multiple) GPUs if available
    if torch.cuda.is_available():
        model.cuda()
        if torch.cuda.device_count() > 1:
            args.n_gpu = torch.cuda.device_count()
            print("Let's use", torch.cuda.device_count(), "GPUs!")
            # this single line enables data-parallel training
            model = nn.DataParallel(model)

    tokenizer = BertTokenizer.from_pretrained(args.vocab_file)

    # no weight decay for biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    warmup_steps = 0.05 * num_train_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps,
        num_training_steps=num_train_steps)
    best_loss = None
    global_step = 0

    # start training
    print("***** Running training *****")
    print(" Num examples = {}".format(len(train_features)))
    print(" Batch size = {}".format(args.train_batch_size))
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in train_features], dtype=torch.float32)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    model.train()
    for epoch in range(args.num_train_epochs):
        train_dataloader = DataLoader(train_data, shuffle=True,
                                      batch_size=args.train_batch_size)
        for step, batch in enumerate(train_dataloader):
            start_time = time.time()
            if torch.cuda.is_available():
                batch = tuple(t.cuda() for t in batch)
            input_ids, input_mask, segment_ids, label = batch
            logits = model(input_ids=input_ids, attention_mask=input_mask,
                           segment_ids=segment_ids, labels=label)
            loss = loss_fct(logits, label)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            s = '****Epoch: {}, step: {}, loss: {:10f}, time_cost: {:10f}'.format(
                epoch, step, loss, time.time() - start_time)
            rainbow(s)
            loss.backward()
            # nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)  # optional gradient clipping
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
            # test_loss, test_acc = evaluate(epoch, eval_features, args, model)

        # evaluate after each epoch finishes
        test_loss, test_acc = evaluate(epoch, eval_features, args, model)
        model.train()
        if best_loss is None or best_loss > test_loss:
            best_loss = test_loss
            # only save the model itself (unwrap DataParallel if needed)
            model_to_save = model.module if hasattr(model, 'module') else model
            os.makedirs(args.save_model, exist_ok=True)
            output_model_file = os.path.join(args.save_model, "best_pytorch_model.bin")
            torch.save(model_to_save.state_dict(), output_model_file)

        # save a checkpoint for this epoch; only the model itself
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(args.save_model, "epoch{}_ckpt.bin".format(epoch))
        torch.save(model_to_save.state_dict(), output_model_file)
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.warning('~Processing SQuAD dataset~')

    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)

    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)

    # load data
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)

    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + valid_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = os.path.join(args.data_dir, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True)
    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, vocab_tag, vocab_ner, dev_fout, False)

    end_time = time.time()
    logger.info('It took {} minutes in total to process the data!'.format(
        (end_time - start_time) / 60.))
def main():
    args = set_args()
    args.datasets = args.datasets.split(',')
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)

    all_data = []
    all_datasets = []
    for dataset_name in args.datasets:
        test_file_prefix = 'test'
        if test_mode:
            if 'marco' in dataset_name:
                train_file_prefix = 'train'
                dev_file_prefix = 'dev'
            else:
                train_file_prefix = 'dev'
                dev_file_prefix = 'dev'
        else:
            train_file_prefix = 'train'
            dev_file_prefix = 'dev'
        logger.info('Processing %s dataset' % dataset_name)
        this_data_dir = args.data_dir + dataset_name + '/'

        train_data = None
        train_path = os.path.join(this_data_dir, '%s.json' % train_file_prefix)
        logger.info('The path of training data: {}'.format(train_path))
        train_data = load_data(train_path)
        all_data += train_data

        valid_path = os.path.join(this_data_dir, '%s.json' % dev_file_prefix)
        logger.info('The path of validation data: {}'.format(valid_path))
        valid_data = load_data(valid_path, False)
        all_data += valid_data

        if args.include_test_set and 'squad' not in dataset_name \
                and 'marco2.0' not in dataset_name:
            test_path = os.path.join(this_data_dir, '%s.json' % test_file_prefix)
            logger.info('The path of test data: {}'.format(test_path))
            test_data = load_data(test_path, False)
            all_data += test_data
            all_datasets.append((train_data, valid_data, test_data))
        else:
            all_datasets.append((train_data, valid_data))

    logger.info('{}-dim word vector path: {}'.format(args.glove_dim, args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)
    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)

    multitask_base_path = '../data/mtmrc/'
    with open(multitask_base_path + 'vocab_tag.pick', 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(multitask_base_path + 'vocab_ner.pick', 'rb') as f:
        vocab_ner = pickle.load(f)

    logger.info('Build vocabulary')
    vocab = build_vocab(all_data, glove_vocab, sort_all=args.sort_all,
                        clean_on=True, args=args)
    meta_path = os.path.join(args.output_path, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    for i, item in enumerate(all_datasets):
        dataset_name = args.datasets[i]
        if args.include_test_set and 'squad' not in dataset_name \
                and 'marco2.0' not in dataset_name:
            train_data, valid_data, test_data = item
        else:
            train_data, valid_data = item
        print('building output file for ', dataset_name)

        train_fout = os.path.join(args.output_path, dataset_name + '_train.json')
        build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
                   dataset_name=dataset_name)

        dev_fout = os.path.join(args.output_path, dataset_name + '_dev.json')
        build_data(valid_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
                   dataset_name=dataset_name)

        if args.include_test_set and 'squad' not in dataset_name:
            test_fout = os.path.join(args.output_path, dataset_name + '_test.json')
            build_data(test_data, vocab, vocab_tag, vocab_ner, test_fout, False,
                       dataset_name=dataset_name)
def main():
    args = set_args()
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.info('Processing dataset')

    train_path = os.path.join(args.raw_data_dir, 'train')
    valid_path = os.path.join(args.raw_data_dir, 'dev')
    test_path = os.path.join(args.raw_data_dir, 'test')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('The path of test data: {}'.format(test_path))
    logger.info('{}-dim word vector path: {}'.format(args.glove_dim, args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    # set_environment(args.seed)

    # load data
    train_data = load_reddit_data(train_path, anc_type='section', fact_len=12,
                                  just_anc=False, is_train=True)
    valid_data = load_reddit_data(valid_path, anc_type='section', fact_len=12,
                                  just_anc=False, is_train=False)
    test_data = load_reddit_data(test_path, anc_type='section', fact_len=12,
                                 just_anc=False, is_train=False)
    logger.info('#train data: {}'.format(len(train_data)))
    logger.info('#valid data: {}'.format(len(valid_data)))
    logger.info('#test data: {}'.format(len(test_data)))

    meta_path = args.meta
    if not os.path.exists(meta_path):
        logger.info('Build vocabulary')
        vocab = build_vocab(train_data + valid_data)
        logger.info('building embedding')
        embedding = build_embedding(glove_path, vocab, glove_dim)
        logger.info('emb done')
        meta = {'vocab': vocab, 'embedding': embedding}
        with open(meta_path, 'wb') as f:
            pickle.dump(meta, f)
    else:
        with open(meta_path, 'rb') as f:
            meta = pickle.load(f)
        vocab = meta['vocab']

    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, train_fout)
    logger.info('train data done')
    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, dev_fout)
    logger.info('valid data done')
    test_fout = os.path.join(args.data_dir, args.test_data)
    build_data(test_data, vocab, test_fout)
    logger.info('test data done')

    write_files(args.data_dir + '/train', train_data)
    write_files(args.data_dir + '/dev', valid_data)
    write_files(args.data_dir + '/test', test_data)
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        # train_path = 'train-v2.0.json'
        # dev_path = 'dev-v2.0.json'
        train_path = 'msmarco_squad_train.json'
        dev_path = 'msmarco_squad_dev.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)
    logger.info('Train path is: {}'.format(train_path))
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)

    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on, limit=20000)
    dev_data = load_data(valid_path, False, v2_on=v2_on, limit=500)

    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    # free memory before building the (large) feature files
    del meta
    del embedding
    logger.info('deleted meta and embedding')

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               NLP=NLP, v2_on=v2_on)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               NLP=NLP, v2_on=v2_on)

    end_time = time.time()
    logger.warning('It took {} minutes in total to process the data!'.format(
        (end_time - start_time) / 60.))
import time

import numpy as np
import torch
import utils
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import scipy
import statistics
import math
import scipy.stats as ss

tstart = time.time()

from config import set_args

########################################################################################################################

args = set_args()
args.output = './res/' + args.experiment + '_' + args.approach + '_' + str(args.note) + '.txt'

print('=' * 100)
print('Arguments =')
for arg in vars(args):
    print('\t' + arg + ':', getattr(args, arg))
print('=' * 100)

########################################################################################################################

# Seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
def main():
    # create an argument parser and read arguments from the command line
    args = set_args()

    # logger will be a global variable
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)

    v2_on = args.v2_on
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD v1.1 dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
        version = 'v1'
    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)

    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)

    wemb_vocab = load_emb_vocab(emb_path, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab,
                              sort_all=args.sort_all, clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')
    # what do these vocab tags and vocab ners do?
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {'vocab': vocab, 'vocab_tag': vocab_tag, 'vocab_ner': vocab_ner,
            'embedding': embedding}
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True,
               NLP=NLP, v2_on=v2_on, bert_tokenizer=BERT_TOKENIZER)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False,
               NLP=NLP, v2_on=v2_on, bert_tokenizer=BERT_TOKENIZER)

    end_time = time.time()
    logger.warning('It took {} minutes in total to process the data!'.format(
        (end_time - start_time) / 60.))