import math
import numpy as np
import numpy.linalg as la
# ut.eta, bg_gen, and lista_run are project helpers; see the sketches below.


def greedyUntied(st):
    """Greedily append untied LISTA layers, one at a time, until depth st.T."""
    A = st.Psi
    M, N = A.shape
    mul = np.matmul
    ls = st.loaded_state
    We = ls['We']
    S = ls['S']
    theta = ls['theta']
    L = 5000  # number of training columns drawn per new layer
    while len(S) < st.T:
        Tm1 = len(S)
        T = Tm1 + 1  # depth after this layer is appended
        x, y = bg_gen(A, L)
        B = mul(We, y)
        xhat = lista_run(y, We, S, theta, Tm1)
        # initialize the new layer at the classical ISTA operator I - We*A
        S_ = np.identity(N) - mul(We, A)
        theta_ = theta[Tm1 - 1]
        stepsize = 1.0
        nmsePrev = 999
        for steps in range(10):
            rhat_ = B + mul(S_, xhat)
            xhat_ = ut.eta(rhat_, theta_)
            nmse = 20 * math.log10(la.norm(x - xhat_, 'fro') / la.norm(x, 'fro'))
            print(' %d nmse=%.4fdB ' % (steps, nmse))
            # crude adaptive step size: back off when the error grows
            if nmse > nmsePrev:
                stepsize = stepsize * .5
            else:
                stepsize = stepsize * 1.1
            nmsePrev = nmse
            # gradient steps on the new layer's matrix and threshold
            S_ = S_ - stepsize * mul((xhat_ - x) * abs(np.sign(xhat_)), rhat_.T) / L
            theta_ = theta_ - stepsize * np.mean((xhat_ - x) * (-np.sign(xhat_)))
        xhat = ut.eta(B + mul(S_, xhat), theta_)
        S = np.concatenate((S, np.reshape(S_, (1, N, N))))
        theta = np.concatenate((theta, np.reshape(theta_, (1,))))
        print('t=%d nmse=%.2fdB ' % (
            Tm1, 20 * math.log10(la.norm(xhat - x) / la.norm(x))))
    # evaluate the full network on fresh data
    x, y = bg_gen(A, L)
    xhat = lista_run(y, We, S, theta, st.T)
    print('fresh nmse=%.2fdB' % (20 * math.log10(la.norm(xhat - x) / la.norm(x))))
    return S, theta
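# greedyUntied and lista_run (below) call project helpers that are not shown
# in this section: ut.eta, the soft-threshold operator, and bg_gen, a training
# data generator. A minimal sketch of plausible implementations, assuming a
# Bernoulli-Gaussian signal model; pnz and snr_db are illustrative defaults,
# not values taken from the original code.
import numpy as np


def eta(r, lam):
    # soft-threshold (shrinkage): eta(r; lam) = sign(r) * max(|r| - lam, 0)
    return np.sign(r) * np.maximum(np.abs(r) - lam, 0.0)


def bg_gen(A, L, pnz=0.1, snr_db=40.0):
    # draw L sparse columns x (each entry nonzero with probability pnz)
    # and the corresponding noisy measurements y = A x + w
    M, N = A.shape
    x = np.random.randn(N, L) * (np.random.rand(N, L) < pnz)
    y = A @ x
    noise_std = np.sqrt(np.mean(y ** 2) * 10.0 ** (-snr_db / 10.0))
    return x, y + noise_std * np.random.randn(M, L)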
def printTracks(self, eventVars=None, params=None, coords=None,
                nMax=None, tracks=None, color=r.kBlack):
    self.prepareText(params, coords)
    self.printText(tracks)
    headers = " name pdgId pT eta phi"
    self.printText(headers)
    self.printText("-" * len(headers))
    nTracks = utils.size(eventVars, tracks)
    for iTrack in range(nTracks):
        if nMax <= iTrack:
            self.printText("[%d more not listed]" % (nTracks - nMax))
            break
        track = eventVars[tracks][iTrack]
        name = pdgLookup.pdgid_to_name(track.PID) if pdgLookupExists else ""
        self.printText("%6s %6d%5.0f %s %4.1f" % (
            name[-6:], track.PID, track.PT, utils.eta(track), track.Phi,
        ), color=color)
    return
def printGenParticles(self, eventVars=None, params=None, coords=None,
                      nMax=None, particles=None, color=r.kBlack):
    self.prepareText(params, coords)
    self.printText(particles)
    headers = " name pdgId pT eta phi st PU"
    self.printText(headers)
    self.printText("-" * len(headers))
    nParticles = utils.size(eventVars, particles)
    for iParticle in range(nParticles):
        if nMax <= iParticle:
            self.printText("[%d more not listed]" % (nParticles - nMax))
            break
        particle = eventVars[particles].At(iParticle)
        name = pdgLookup.pdgid_to_name(particle.PID) if pdgLookupExists else ""
        self.printText("%6s %6d%5.0f %s %4.1f %1d %1d" % (
            name[-6:], particle.PID, particle.PT, utils.eta(particle),
            particle.Phi, particle.Status, particle.IsPU,
        ), color=color)
    return
def printLeptons(self, eventVars=None, params=None, coords=None,
                 nMax=None, leptons=None, color=r.kBlack, ptMin=None):
    self.prepareText(params, coords)
    self.printText(leptons)
    headers = " pT eta phi iso"
    self.printText(headers)
    self.printText("-" * len(headers))
    nLeptons = utils.size(eventVars, leptons)
    for iLepton in range(nLeptons):
        if nMax <= iLepton:
            self.printText("[%d more not listed]" % (nLeptons - nMax))
            break
        lepton = eventVars[leptons][iLepton]
        iso = "%4.1f" % lepton.IsolationVar if hasattr(lepton, "IsolationVar") else " "
        self.printText("%5.0f %s %4.1f %s" % (
            lepton.PT, utils.eta(lepton), lepton.Phi, iso,
        ), color=color)
    return
def printTowers(self, eventVars=None, params=None, coords=None,
                nMax=None, towers=None, color=r.kBlack):
    self.prepareText(params, coords)
    self.printText(towers)
    headers = " ET eta phi"
    self.printText(headers)
    self.printText("-" * len(headers))
    nTowers = utils.size(eventVars, towers)
    for iTower in range(nTowers):
        if nMax <= iTower:
            self.printText("[%d more not listed]" % (nTowers - nMax))
            break
        tower = eventVars[towers][iTower]
        self.printText("%5.0f %s %4.1f" % (
            tower.ET, utils.eta(tower), tower.Phi,
        ), color=color)
    return
def eta_opt_lambda(rhat, x, **kwargs):
    """Find the MSE-optimal soft-threshold lambda by iterated grid refinement."""
    minlam = 0
    maxlam = np.abs(rhat).max()
    for k in range(5):
        lamvec = np.linspace(minlam, maxlam, 11)
        dlam = lamvec[1] - lamvec[0]
        err = [la.norm(x - ut.eta(rhat, lam)) for lam in lamvec]
        minidx = np.argmin(err)
        bestlam = lamvec[minidx]
        # shrink the search window around the current best lambda
        minlam = max(0, bestlam - dlam)
        maxlam = bestlam + dlam
    xhat = ut.eta(rhat, bestlam)
    return (xhat, bestlam)
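# A small usage sketch for eta_opt_lambda, assuming the eta function sketched
# above stands in for ut.eta; the sparse signal model here is illustrative.
import numpy as np
import numpy.linalg as la

rng = np.random.RandomState(0)
x = rng.randn(200, 1) * (rng.rand(200, 1) < 0.1)   # sparse ground truth
rhat = x + 0.05 * rng.randn(200, 1)                # noisy estimate of x
xhat, bestlam = eta_opt_lambda(rhat, x)
print('lambda = %.3f, residual = %.3f' % (bestlam, la.norm(x - xhat)))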
def train(model, generated_image, initial_image):
    """ Train your model."""
    with tf.Session() as sess:
        saver = tf.train.Saver()
        # Initialize variables
        sess.run(tf.global_variables_initializer())
        sess.run(generated_image.assign(initial_image))
        ckpt = tf.train.get_checkpoint_state(
            os.path.dirname('checkpointsTransfer/checkpoint/'))
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        initial_step = model['global_step'].eval()
        start_time = time.time()
        skip_step = 1
        for index in range(initial_step, ITERS):
            if index >= 5 and index < 20:
                skip_step = 10
            elif index >= 20:
                skip_step = 20
            sess.run(model['optimizer'])
            if (index + 1) % skip_step == 0:
                # Generated image and loss
                gen_image, total_loss, summary = sess.run([
                    generated_image, model['total_loss'], model['summary_op']
                ])
                elapsed_time = time.time() - start_time
                gen_image = gen_image + MEAN_PIXELS
                print('Step {}\n Sum: {:5.1f}'.format(index + 1, np.sum(gen_image)))
                print(' Loss: {:5.1f}'.format(total_loss))
                print(' Time: {}'.format(elapsed_time))
                utils.eta(index, skip_step, elapsed_time, ITERS)
                start_time = time.time()
                filename = 'outputs/%d.png' % (index)
                utils.save_image(filename, gen_image)
                if (index + 1) % 20 == 0:
                    saver.save(sess, 'checkpointsTransfer/style_transfer/', index)
def Setup(st, **kwargs):
    A = st.Psi
    M, N = A.shape
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        (y, x) = sess.run(st.generators)
    mul = np.matmul
    xhat = np.zeros_like(x)
    ls = st.loaded_state
    for key in ('We', 'S', 'theta'):
        print('%s.shape = %s' % (key, repr(ls[key].shape)))
    We = ls['We']
    if st.untieS:
        S, theta = greedyUntied(st)
    else:
        S = ls['S']
        theta = ls['theta']
    Tprev = len(theta)
    theta = np.concatenate((theta, np.zeros(st.T - Tprev)))
    B = mul(We, y)
    for t in range(st.T):
        # basic recursion:
        # xhat = eta(We*y + S*xhat;theta_t)
        St = S[t] if S.ndim == 3 else S  # per-layer S when untied, shared S otherwise
        rhat = B + mul(St, xhat)
        if t < Tprev:
            xhat = ut.eta(rhat, theta[t])
        else:
            # beyond the loaded depth, pick the MSE-optimal threshold
            (xhat, theta[t]) = eta_opt_lambda(rhat, x)
        print('t=%d lambda=%.3f nmse=%.3fdB' % (
            t, theta[t], 20 * math.log10(la.norm(xhat - x) / la.norm(x))))
    ls['theta'] = np.float32(theta)
    ls['S'] = np.float32(S)
    return theta, S
def test(self, e=1):
    correct = torch.zeros((6), dtype=torch.float)
    tp = torch.zeros((6), dtype=torch.float)
    fp = torch.zeros((6), dtype=torch.float)
    fn = torch.zeros((6), dtype=torch.float)
    correct_total = 0
    step = 1
    data_len = 0
    iter_lst = [self.get_iter(self.test_features_lst, self.args)]
    num_batches = sum([len(iterator[0]) for iterator in iter_lst])
    start = time.time()
    with torch.no_grad():
        for data_loader, sampler in iter_lst:
            for i, batch in enumerate(data_loader, start=1):
                input_ids, input_mask, seg_ids, start_positions, end_positions, labels = batch
                # remove unnecessary pad token
                seq_len = torch.sum(torch.sign(input_ids), 1)
                max_len = torch.max(seq_len)
                input_ids = input_ids[:, :max_len].clone()
                input_mask = input_mask[:, :max_len].clone()
                seg_ids = seg_ids[:, :max_len].clone()
                start_positions = start_positions.clone()
                end_positions = end_positions.clone()
                if self.args.use_cuda:
                    input_ids = input_ids.cuda(self.args.gpu, non_blocking=True)
                    input_mask = input_mask.cuda(self.args.gpu, non_blocking=True)
                    seg_ids = seg_ids.cuda(self.args.gpu, non_blocking=True)
                    start_positions = start_positions.cuda(self.args.gpu, non_blocking=True)
                    end_positions = end_positions.cuda(self.args.gpu, non_blocking=True)
                dis_loss, log_prob = self.model(input_ids, seg_ids, input_mask,
                                                start_positions, end_positions,
                                                labels, dtype="dis",
                                                global_step=step)
                data_len += labels.shape[0]
                onehot_labels = torch.nn.functional.one_hot(labels, num_classes=6).float()
                onehot_pred = torch.nn.functional.one_hot(
                    log_prob.argmax(dim=1).detach().cpu(), num_classes=6).float()
                correct_total += (log_prob.argmax(dim=1).detach().cpu()
                                  == labels.detach().cpu()).float().sum()
                correct += (onehot_pred == onehot_labels).sum(dim=0).float()
                tp += ((onehot_pred.float() == 1) & (onehot_labels.float() == 1)).sum(dim=0).float()
                fp += ((onehot_pred.float() == 1) & (onehot_labels.float() == 0)).sum(dim=0).float()
                fn += ((onehot_pred.float() == 0) & (onehot_labels.float() == 1)).sum(dim=0).float()
                msg = "Test {}/{} {} - ETA : {}".format(
                    i, num_batches, progress_bar(i, num_batches),
                    eta(start, i, num_batches))
                print(msg, end="\r")
    writer.add_scalar("Test/Total_accuracy", correct_total / data_len, e)
    writer.add_scalars("Test/By_class_accuracy",
                       summary_map(self.num_to_name, correct / data_len), e)
    writer.add_scalars("Test/By_class_true_positives",
                       summary_map(self.num_to_name, tp / data_len), e)
    writer.add_scalars("Test/By_class_false_negatives",
                       summary_map(self.num_to_name, fn / data_len), e)
    writer.add_scalars("Test/By_class_false_positives",
                       summary_map(self.num_to_name, fp / data_len), e)
    print("Test accuracy total {}, by class {}, tp {}, fp {}, fn {}".format(
        correct_total / data_len, correct / data_len, tp / data_len,
        fp / data_len, fn / data_len), end="\n")
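# The status lines in these loops are built from progress_bar and eta helpers
# whose definitions are not included in this section. A compatible sketch,
# assuming eta(start, i, total) returns a human-readable time-remaining string;
# the exact formatting in the original code may differ.
import time


def progress_bar(i, total, width=20):
    # fixed-width ASCII bar, e.g. [=====>              ]
    done = int(width * i / max(total, 1))
    return "[" + "=" * done + ">" + " " * (width - done) + "]"


def eta(start, i, total):
    # extrapolate elapsed time over the remaining batches, as HH:MM:SS
    elapsed = time.time() - start
    remaining = elapsed / max(i, 1) * (total - i)
    return time.strftime("%H:%M:%S", time.gmtime(max(remaining, 0)))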
def get_embeddings(self, e=1):
    step = 1
    data_len = 0
    iter_lst = [self.get_iter(self.test_features_lst, self.args)]
    num_batches = sum([len(iterator[0]) for iterator in iter_lst])
    start = time.time()
    ds = []
    cls = []
    st_emb = []
    end_emb = []
    ent_emb = []
    labels_ent = []
    with torch.no_grad():
        for data_loader, sampler in iter_lst:
            for i, batch in enumerate(data_loader, start=1):
                input_ids, input_mask, seg_ids, start_positions, end_positions, entity_mask, labels = batch
                # remove unnecessary pad token
                seq_len = torch.sum(torch.sign(input_ids), 1)
                max_len = torch.max(seq_len)
                input_ids = input_ids[:, :max_len].clone()
                input_mask = input_mask[:, :max_len].clone()
                seg_ids = seg_ids[:, :max_len].clone()
                start_positions = start_positions.clone()
                end_positions = end_positions.clone()
                entity_mask = entity_mask.clone()
                if self.args.use_cuda:
                    input_ids = input_ids.cuda(self.args.gpu, non_blocking=True)
                    input_mask = input_mask.cuda(self.args.gpu, non_blocking=True)
                    seg_ids = seg_ids.cuda(self.args.gpu, non_blocking=True)
                    start_positions = start_positions.cuda(self.args.gpu, non_blocking=True)
                    end_positions = end_positions.cuda(self.args.gpu, non_blocking=True)
                    entity_mask = entity_mask.cuda(self.args.gpu, non_blocking=True)
                sequence_output, _ = self.model(input_ids, seg_ids, input_mask, dtype="bert")
                cls.append(sequence_output[:, 0].detach().cpu().numpy())
                st_emb.append(
                    sequence_output[:, start_positions[start_positions.nonzero()]]
                    .detach().cpu().numpy().mean(1).squeeze(1))
                end_emb.append(
                    sequence_output[:, end_positions[end_positions.nonzero()]]
                    .detach().cpu().numpy().mean(1).squeeze(1))
                ent_emb.append(sequence_output[entity_mask > 0].detach().cpu().numpy())
                ds.append(labels.detach().cpu().numpy())
                labels_ent.append(
                    torch.unsqueeze(labels, 1).repeat(1, 384)[entity_mask > 0]
                    .detach().cpu().numpy())
                data_len += labels.shape[0]
                if data_len > 10000:
                    break
                msg = "Test {}/{} {} - ETA : {}".format(
                    i, num_batches, progress_bar(i, num_batches),
                    eta(start, i, num_batches))
                print(msg, end="\r")
    print(np.concatenate(ds).shape, np.concatenate(cls).shape,
          np.concatenate(st_emb).shape)
    np.savetxt('labels.out', np.concatenate(ds), delimiter=',')
    np.savetxt('cls.out', np.concatenate(cls), delimiter=',')
    np.savetxt('start_emb.out', np.concatenate(st_emb), delimiter=',')
    np.savetxt('end_emb.out', np.concatenate(end_emb), delimiter=',')
    np.savetxt('ent_emb.out', np.concatenate(ent_emb), delimiter=',')
    np.savetxt('labels_ent.out', np.concatenate(labels_ent), delimiter=',')
def train(self):
    step = 1
    avg_qa_loss = 0
    avg_dis_loss = 0
    iter_lst = [self.get_iter(self.train_features_lst, self.args)]
    num_batches = sum([len(iterator[0]) for iterator in iter_lst])
    correct = torch.zeros((6), dtype=torch.float)
    tp = torch.zeros((6), dtype=torch.float)
    fp = torch.zeros((6), dtype=torch.float)
    fn = torch.zeros((6), dtype=torch.float)
    correct_total = 0
    for epoch in range(self.args.start_epoch,
                       self.args.start_epoch + self.args.epochs):
        start = time.time()
        self.model.train()
        batch_step = 1
        data_len = 0
        for data_loader, sampler in iter_lst:
            if self.args.distributed:
                sampler.set_epoch(epoch)
            for i, batch in enumerate(data_loader, start=1):
                input_ids, input_mask, seg_ids, start_positions, end_positions, labels = batch
                # remove unnecessary pad token
                seq_len = torch.sum(torch.sign(input_ids), 1)
                max_len = torch.max(seq_len)
                input_ids = input_ids[:, :max_len].clone()
                input_mask = input_mask[:, :max_len].clone()
                seg_ids = seg_ids[:, :max_len].clone()
                start_positions = start_positions.clone()
                end_positions = end_positions.clone()
                if self.args.use_cuda:
                    input_ids = input_ids.cuda(self.args.gpu, non_blocking=True)
                    input_mask = input_mask.cuda(self.args.gpu, non_blocking=True)
                    seg_ids = seg_ids.cuda(self.args.gpu, non_blocking=True)
                    start_positions = start_positions.cuda(self.args.gpu, non_blocking=True)
                    end_positions = end_positions.cuda(self.args.gpu, non_blocking=True)
                qa_loss = self.model(input_ids, seg_ids, input_mask,
                                     start_positions, end_positions, labels,
                                     dtype="qa", global_step=step)
                qa_loss = qa_loss.mean()
                qa_loss.backward()
                if self.args.train_qa:
                    # update qa model
                    avg_qa_loss = self.cal_running_avg_loss(qa_loss.item(), avg_qa_loss)
                    self.qa_optimizer.step()
                    self.qa_optimizer.zero_grad()
                # update discriminator
                dis_loss, log_prob = self.model(input_ids, seg_ids, input_mask,
                                                start_positions, end_positions,
                                                labels, dtype="dis",
                                                global_step=step)
                dis_loss = dis_loss.mean()
                dis_loss.backward()
                avg_dis_loss = self.cal_running_avg_loss(dis_loss.item(), avg_dis_loss)
                self.dis_optimizer.step()
                self.dis_optimizer.zero_grad()
                step += 1
                if self.do_test_every > 0 and i % self.do_test_every == 0:
                    self.test(step)
                if i % 2000 == 0:
                    result_dict = self.evaluate_model(i)
                    for dev_file, f1 in result_dict.items():
                        print("GPU/CPU {} evaluated {}: {:.2f}".format(
                            self.args.gpu, dev_file, f1), end="\n")
                    writer.add_scalar("Train/Total_accuracy",
                                      correct_total / data_len, i)
                    writer.add_scalars("Train/By_class_accuracy",
                                       summary_map(self.num_to_name, correct / data_len), i)
                    writer.add_scalars("Train/By_class_true_positives",
                                       summary_map(self.num_to_name, tp / data_len), i)
                    writer.add_scalars("Train/By_class_false_negatives",
                                       summary_map(self.num_to_name, fn / data_len), i)
                    writer.add_scalars("Train/By_class_false_positives",
                                       summary_map(self.num_to_name, fp / data_len), i)
                    correct = torch.zeros((6), dtype=torch.float)
                    tp = torch.zeros((6), dtype=torch.float)
                    fp = torch.zeros((6), dtype=torch.float)
                    fn = torch.zeros((6), dtype=torch.float)
                    correct_total = 0
                batch_step += 1
                msg = ""
                if self.args.train_qa:
                    msg = "Train {}/{} {} - ETA : {} - QA loss: {:.4f}, DIS loss: {:.4f}" \
                        .format(batch_step, num_batches,
                                progress_bar(batch_step, num_batches),
                                eta(start, batch_step, num_batches),
                                avg_qa_loss, avg_dis_loss)
                else:
                    msg = "Train {}/{} {} - ETA : {} - DIS loss: {:.4f}" \
                        .format(batch_step, num_batches,
                                progress_bar(batch_step, num_batches),
                                eta(start, batch_step, num_batches),
                                avg_dis_loss)
                writer.add_scalar("Loss/QA", avg_qa_loss, i)
                writer.add_scalar("Loss/Discriminator", avg_dis_loss, i)
                print(msg, end="\r")
                data_len += labels.shape[0]
                onehot_labels = torch.nn.functional.one_hot(labels, num_classes=6).float()
                onehot_pred = torch.nn.functional.one_hot(
                    log_prob.argmax(dim=1).detach().cpu(), num_classes=6).float()
                correct_total += (log_prob.argmax(dim=1).detach().cpu()
                                  == labels.detach().cpu()).float().sum()
                correct += (onehot_pred == onehot_labels).sum(dim=0).float()
                tp += ((onehot_pred.float() == 1) & (onehot_labels.float() == 1)).sum(dim=0).float()
                fp += ((onehot_pred.float() == 1) & (onehot_labels.float() == 0)).sum(dim=0).float()
                fn += ((onehot_pred.float() == 0) & (onehot_labels.float() == 1)).sum(dim=0).float()
                if i % 1000 == 0:
                    print("Accuracy total {}, by class {}, tp {}, fp {}, fn {}"
                          .format(correct_total / data_len, correct / data_len,
                                  tp / data_len, fp / data_len, fn / data_len),
                          end="\n")
        print("[GPU Num: {}, Epoch: {}, Final QA loss: {:.4f}, Final DIS loss: {:.4f}]"
              .format(self.args.gpu, epoch, avg_qa_loss, avg_dis_loss))
        print("Train accuracy total {}, by class {}, tp {}, fp {}, fn {}".format(
            correct_total / data_len, correct / data_len, tp / data_len,
            fp / data_len, fn / data_len), end="\n")
        # save model
        if not self.args.distributed or self.args.rank == 0:
            self.save_model(epoch, avg_qa_loss)
        if self.args.do_valid:
            result_dict = self.evaluate_model(epoch)
            for dev_file, f1 in result_dict.items():
                print("GPU/CPU {} evaluated {}: {:.2f}".format(
                    self.args.gpu, dev_file, f1), end="\n")
def main(args):
    save_dir = os.path.join("./save", time.strftime("%m%d%H%M%S"))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    if args.all_data:
        data_loader = get_ext_data_loader(tokenizer, "./data/train/",
                                          shuffle=True, args=args)
    else:
        data_loader, _, _ = get_data_loader(tokenizer, "./data/train-v1.1.json",
                                            shuffle=True, args=args)
    vocab_size = len(tokenizer.vocab)
    if args.bidaf:
        print("train bidaf")
        model = BiDAF(embedding_size=args.embedding_size,
                      vocab_size=vocab_size,
                      hidden_size=args.hidden_size,
                      drop_prob=args.dropout)
    else:
        ntokens = len(tokenizer.vocab)
        model = QANet(ntokens,
                      embedding=args.embedding,
                      embedding_size=args.embedding_size,
                      hidden_size=args.hidden_size,
                      num_head=args.num_head)
    if args.load_model:
        state_dict = torch.load(args.model_path, map_location="cpu")
        model.load_state_dict(state_dict)
        print("load pre-trained model")
    device = torch.device("cuda")
    model = model.to(device)
    model.train()
    ema = EMA(model, args.decay)
    base_lr = 1
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr, betas=(0.9, 0.999), eps=1e-7,
                           weight_decay=5e-8, params=parameters)
    # warm the learning rate up logarithmically, then hold it at args.lr
    cr = args.lr / math.log2(args.lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1)
        if ee < args.lr_warm_up_num else args.lr)
    step = 0
    num_batches = len(data_loader)
    avg_loss = 0
    best_f1 = 0
    for epoch in range(1, args.num_epochs + 1):
        step += 1
        start = time.time()
        model.train()
        for i, batch in enumerate(data_loader, start=1):
            c_ids, q_ids, start_positions, end_positions = batch
            c_len = torch.sum(torch.sign(c_ids), 1)
            max_c_len = torch.max(c_len)
            c_ids = c_ids[:, :max_c_len].to(device)
            q_len = torch.sum(torch.sign(q_ids), 1)
            max_q_len = torch.max(q_len)
            q_ids = q_ids[:, :max_q_len].to(device)
            start_positions = start_positions.to(device)
            end_positions = end_positions.to(device)
            optimizer.zero_grad()
            loss = model(c_ids, q_ids, start_positions=start_positions,
                         end_positions=end_positions)
            loss.backward()
            avg_loss = cal_running_avg_loss(loss.item(), avg_loss)
            nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step(step)
            ema(model, step // args.batch_size)
            batch_size = c_ids.size(0)
            step += batch_size
            msg = "{}/{} {} - ETA : {} - qa_loss: {:.4f}" \
                .format(i, num_batches, progress_bar(i, num_batches),
                        eta(start, i, num_batches), avg_loss)
            print(msg, end="\r")
        if not args.debug:
            metric_dict = eval_qa(args, model)
            f1 = metric_dict["f1"]
            em = metric_dict["exact_match"]
            print("epoch: {}, final loss: {:.4f}, F1:{:.2f}, EM:{:.2f}".format(
                epoch, avg_loss, f1, em))
            if args.bidaf:
                model_name = "bidaf"
            else:
                model_name = "qanet"
            if f1 > best_f1:
                best_f1 = f1
                state_dict = model.state_dict()
                save_file = "{}_{:.2f}_{:.2f}".format(model_name, f1, em)
                path = os.path.join(save_dir, save_file)
                torch.save(state_dict, path)
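# Several of these loops smooth the reported loss with cal_running_avg_loss,
# which is not defined in this section. A plausible sketch as an exponential
# moving average; the 0.99 decay constant is an assumption, not taken from
# the original code.
def cal_running_avg_loss(loss, running_avg_loss, decay=0.99):
    # exponential moving average of the loss, seeded with the first value
    if running_avg_loss == 0:
        return loss
    return decay * running_avg_loss + (1 - decay) * loss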
def train(self):
    step = 1
    avg_loss = 0
    global_step = 1
    iter_lst = [self.get_iter(self.features_lst, self.args)]
    num_batches = sum([len(iterator) for iterator in iter_lst])
    for epoch in range(self.args.start_epoch,
                       self.args.start_epoch + self.args.epochs):
        self.model.train()
        start = time.time()
        batch_step = 1
        for data_loader in iter_lst:
            for i, batch in enumerate(data_loader, start=1):
                input_ids, input_mask, seg_ids, start_positions, end_positions, _ = batch
                # remove unnecessary pad token
                seq_len = torch.sum(torch.sign(input_ids), 1)
                max_len = torch.max(seq_len)
                input_ids = input_ids[:, :max_len].clone()
                input_mask = input_mask[:, :max_len].clone()
                seg_ids = seg_ids[:, :max_len].clone()
                start_positions = start_positions.clone()
                end_positions = end_positions.clone()
                if self.args.use_cuda:
                    input_ids = input_ids.cuda(self.args.gpu, non_blocking=True)
                    input_mask = input_mask.cuda(self.args.gpu, non_blocking=True)
                    seg_ids = seg_ids.cuda(self.args.gpu, non_blocking=True)
                    start_positions = start_positions.cuda(self.args.gpu, non_blocking=True)
                    end_positions = end_positions.cuda(self.args.gpu, non_blocking=True)
                loss = self.model(input_ids, seg_ids, input_mask,
                                  start_positions, end_positions)
                loss = loss.mean()
                loss = loss / self.args.gradient_accumulation_steps
                loss.backward()
                avg_loss = self.cal_running_avg_loss(
                    loss.item() * self.args.gradient_accumulation_steps, avg_loss)
                step += 1  # count micro-batches for gradient accumulation
                if step % self.args.gradient_accumulation_steps == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                if epoch != 0 and i % 2000 == 0:
                    result_dict = self.evaluate_model(i)
                    for dev_file, f1 in result_dict.items():
                        print("GPU/CPU {} evaluated {}: {:.2f}".format(
                            self.args.gpu, dev_file, f1), end="\n")
                global_step += 1
                batch_step += 1
                msg = "{}/{} {} - ETA : {} - loss: {:.4f}" \
                    .format(batch_step, num_batches,
                            progress_bar(batch_step, num_batches),
                            eta(start, batch_step, num_batches), avg_loss)
                print(msg, end="\r")
        print("[GPU Num: {}, epoch: {}, Final loss: {:.4f}]".format(
            self.args.gpu, epoch, avg_loss))
        # save model
        if self.args.rank == 0:
            self.save_model(epoch, avg_loss)
        if self.args.do_valid:
            result_dict = self.evaluate_model(epoch)
            for dev_file, f1 in result_dict.items():
                print("GPU/CPU {} evaluated {}: {:.2f}".format(
                    self.args.gpu, dev_file, f1), end="\n")
def train(self):
    self.model.zero_grad()
    for epoch in range(0, self.args.pretrain_epochs):
        num_batches = len(self.pretrain_loader)
        self.pretrain_sampler.set_epoch(epoch)
        start = time.time()
        # pretrain with unsupervised dataset
        for step, batch in enumerate(self.pretrain_loader, start=1):
            self.model.train()
            input_ids, input_mask, seg_ids, start_positions, end_positions = batch
            seq_len = torch.sum(torch.sign(input_ids), 1)
            max_len = torch.max(seq_len)
            input_ids = input_ids[:, :max_len].clone().cuda(self.args.gpu, non_blocking=True)
            input_mask = input_mask[:, :max_len].clone().cuda(self.args.gpu, non_blocking=True)
            seg_ids = seg_ids[:, :max_len].clone().cuda(self.args.gpu, non_blocking=True)
            start_positions = start_positions.clone().cuda(self.args.gpu, non_blocking=True)
            end_positions = end_positions.clone().cuda(self.args.gpu, non_blocking=True)
            inputs = {
                "input_ids": input_ids,
                "attention_mask": input_mask,
                "token_type_ids": seg_ids,
                "start_positions": start_positions,
                "end_positions": end_positions,
            }
            loss = self.model(**inputs)[0]
            loss.backward()
            clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)
            self.optimizer.step()
            self.scheduler.step()
            self.model.zero_grad()
            if self.args.rank == 0:
                msg = "PRETRAIN {}/{} {} - ETA : {} - LOSS : {:.4f}".format(
                    step, num_batches, progress_bar(step, num_batches),
                    eta(start, step, num_batches), float(loss.item()))
                print(msg, end="\r")
            if self.args.debug:
                break
        # save model
        if self.args.rank == 0:
            result_dict = self.evaluate_model(msg)
            em = result_dict["exact_match"]
            f1 = result_dict["f1"]
            print("\nPRETRAIN took {} DEV - F1: {:.4f}, EM: {:.4f}\n".format(
                user_friendly_time(time_since(start)), f1, em))
    if self.args.rank == 0:
        result_dict = self.evaluate_model("TEST", False)
        em = result_dict["exact_match"]
        f1 = result_dict["f1"]
        print("\nFINAL TEST - F1: {:.4f}, EM: {:.4f}\n".format(f1, em))
def train(self, consolidate=True, fisher_estimation_sample_size=1024):
    step = 1
    avg_loss = 0
    global_step = 1
    iter_lst = [self.get_iter(self.features_lst, self.args)]
    num_batches = sum([len(iterator[0]) for iterator in iter_lst])
    for epoch in range(self.args.start_epoch,
                       self.args.start_epoch + self.args.epochs):
        self.model.train()
        start = time.time()
        batch_step = 1
        for data_loader, sampler in iter_lst:
            if self.args.distributed:
                sampler.set_epoch(epoch)
            for i, batch in enumerate(data_loader, start=1):
                input_ids, input_mask, seg_ids, start_positions, end_positions, _ = batch
                # remove unnecessary pad token
                seq_len = torch.sum(torch.sign(input_ids), 1)
                max_len = torch.max(seq_len)
                input_ids = input_ids[:, :max_len].clone()
                input_mask = input_mask[:, :max_len].clone()
                seg_ids = seg_ids[:, :max_len].clone()
                start_positions = start_positions.clone()
                end_positions = end_positions.clone()
                if self.args.use_cuda:
                    input_ids = input_ids.cuda(self.args.gpu, non_blocking=True)
                    input_mask = input_mask.cuda(self.args.gpu, non_blocking=True)
                    seg_ids = seg_ids.cuda(self.args.gpu, non_blocking=True)
                    start_positions = start_positions.cuda(self.args.gpu, non_blocking=True)
                    end_positions = end_positions.cuda(self.args.gpu, non_blocking=True)
                loss = self.model(input_ids, seg_ids, input_mask,
                                  start_positions, end_positions)
                loss = loss.mean()
                loss = loss / self.args.gradient_accumulation_steps
                # add the EWC penalty that anchors parameters of previous tasks
                ewc_loss = self.ewc_loss(cuda=True)
                loss = loss + ewc_loss
                loss.backward()
                avg_loss = self.cal_running_avg_loss(
                    loss.item() * self.args.gradient_accumulation_steps, avg_loss)
                step += 1  # count micro-batches for gradient accumulation
                if step % self.args.gradient_accumulation_steps == 0:
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                if epoch != 0 and i % 2000 == 0:
                    result_dict = self.evaluate_model(i)
                    for dev_file, f1 in result_dict.items():
                        print("GPU/CPU {} evaluated {}: {:.2f}".format(
                            self.args.gpu, dev_file, f1), end="\n")
                global_step += 1
                batch_step += 1
                msg = "{}/{} {} - ETA : {} - loss: {:.4f}" \
                    .format(batch_step, num_batches,
                            progress_bar(batch_step, num_batches),
                            eta(start, batch_step, num_batches), avg_loss)
                print(msg, end="\r")
        print("[GPU Num: {}, epoch: {}, Final loss: {:.4f}]".format(
            self.args.gpu, epoch, avg_loss))
        # save model
        if self.args.rank == 0:
            self.save_model(epoch, avg_loss)
        if self.args.do_valid:
            result_dict = self.evaluate_model(epoch)
            for dev_file, f1 in result_dict.items():
                print("GPU/CPU {} evaluated {}: {:.2f}".format(
                    self.args.gpu, dev_file, f1), end="\n")
    if consolidate:
        # estimate the fisher information of the parameters and consolidate
        # them in the network.
        print('=> Estimating diagonals of the fisher information matrix...',
              flush=True, end='')
        # ATTENTION!!! the data_loader should be the entire training set!!!!
        self.consolidate(self.estimate_fisher(
            self.get_data_loader(self.features_lst, self.args),
            fisher_estimation_sample_size))
        print('EWC Loaded!')
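# The EWC variant above adds self.ewc_loss() to the task loss and, after
# training, consolidates Fisher estimates. A minimal sketch of the standard
# elastic-weight-consolidation penalty, assuming consolidate() registered
# per-parameter mean and Fisher buffers on the model; the buffer names and
# the lam weight are illustrative, not the original implementation.
import torch


def ewc_loss(self, cuda=False, lam=1.0):
    # standard EWC penalty: (lam / 2) * sum_i F_i * (theta_i - theta*_i)^2,
    # where theta* and F come from the previously consolidated task
    try:
        losses = []
        for name, param in self.model.named_parameters():
            key = name.replace('.', '__')
            mean = getattr(self.model, '{}_mean'.format(key))
            fisher = getattr(self.model, '{}_fisher'.format(key))
            losses.append((fisher * (param - mean) ** 2).sum())
        return (lam / 2.0) * sum(losses)
    except AttributeError:
        # no task has been consolidated yet, so there is nothing to anchor
        zero = torch.zeros(1)
        return zero.cuda() if cuda else zero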
def train(self):
    step = 1
    avg_qa_loss = 0
    avg_dis_loss = 0
    iter_lst = [self.get_iter(self.features_lst, self.args)]
    num_batches = sum([len(iterator[0]) for iterator in iter_lst])
    for epoch in range(self.args.start_epoch,
                       self.args.start_epoch + self.args.epochs):
        start = time.time()
        self.model.train()
        batch_step = 1
        for data_loader, sampler in iter_lst:
            if self.args.distributed:
                sampler.set_epoch(epoch)
            for i, batch in enumerate(data_loader, start=1):
                input_ids, input_mask, seg_ids, start_positions, end_positions, labels = batch
                # remove unnecessary pad token
                seq_len = torch.sum(torch.sign(input_ids), 1)
                max_len = torch.max(seq_len)
                input_ids = input_ids[:, :max_len].clone()
                input_mask = input_mask[:, :max_len].clone()
                seg_ids = seg_ids[:, :max_len].clone()
                start_positions = start_positions.clone()
                end_positions = end_positions.clone()
                if self.args.use_cuda:
                    input_ids = input_ids.cuda(self.args.gpu, non_blocking=True)
                    input_mask = input_mask.cuda(self.args.gpu, non_blocking=True)
                    seg_ids = seg_ids.cuda(self.args.gpu, non_blocking=True)
                    start_positions = start_positions.cuda(self.args.gpu, non_blocking=True)
                    end_positions = end_positions.cuda(self.args.gpu, non_blocking=True)
                qa_loss = self.model(input_ids, seg_ids, input_mask,
                                     start_positions, end_positions, labels,
                                     dtype="qa", global_step=step)
                qa_loss = qa_loss.mean()
                qa_loss.backward()
                # update qa model
                avg_qa_loss = self.cal_running_avg_loss(qa_loss.item(), avg_qa_loss)
                self.qa_optimizer.step()
                self.qa_optimizer.zero_grad()
                # update discriminator
                dis_loss = self.model(input_ids, seg_ids, input_mask,
                                      start_positions, end_positions, labels,
                                      dtype="dis", global_step=step)
                dis_loss = dis_loss.mean()
                dis_loss.backward()
                avg_dis_loss = self.cal_running_avg_loss(dis_loss.item(), avg_dis_loss)
                self.dis_optimizer.step()
                self.dis_optimizer.zero_grad()
                step += 1
                if epoch != 0 and i % 2000 == 0:
                    result_dict = self.evaluate_model(i)
                    for dev_file, f1 in result_dict.items():
                        print("GPU/CPU {} evaluated {}: {:.2f}".format(
                            self.args.gpu, dev_file, f1), end="\n")
                batch_step += 1
                msg = "{}/{} {} - ETA : {} - QA loss: {:.4f}, DIS loss: {:.4f}" \
                    .format(batch_step, num_batches,
                            progress_bar(batch_step, num_batches),
                            eta(start, batch_step, num_batches),
                            avg_qa_loss, avg_dis_loss)
                print(msg, end="\r")
        print("[GPU Num: {}, Epoch: {}, Final QA loss: {:.4f}, Final DIS loss: {:.4f}]"
              .format(self.args.gpu, epoch, avg_qa_loss, avg_dis_loss))
        # save model
        if not self.args.distributed or self.args.rank == 0:
            self.save_model(epoch, avg_qa_loss)
        if self.args.do_valid:
            result_dict = self.evaluate_model(epoch)
            for dev_file, f1 in result_dict.items():
                print("GPU/CPU {} evaluated {}: {:.2f}".format(
                    self.args.gpu, dev_file, f1), end="\n")
def lista_run(y, We, S, theta, T, **kwargs):
    """Run a T-layer LISTA forward pass: xhat <- eta(We*y + S[t]*xhat; theta[t])."""
    B = np.matmul(We, y)
    xhat = ut.eta(B, theta[0])
    for t in range(1, T):
        xhat = ut.eta(B + np.matmul(S[t], xhat), theta[t])
    return xhat
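# An end-to-end smoke test for lista_run, assuming the eta/bg_gen sketches
# above and an already-loaded (A, We, S, theta) with S of shape (T, N, N);
# it mirrors the 'fresh nmse' diagnostic printed by greedyUntied.
import math
import numpy.linalg as la

x, y = bg_gen(A, 1000)                        # fresh test data through dictionary A
xhat = lista_run(y, We, S, theta, len(theta))
nmse_db = 20 * math.log10(la.norm(xhat - x) / la.norm(x))
print('fresh nmse = %.2f dB' % nmse_db)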