def main(_):
    """
    Load the SQuAD splits and embeddings, build the requested model and run
    the requested mode.

    Typical usage

    For <model_name> see your folder name in ../checkpoints.

    Training
    ``` sh
    $ python main.py --mode train --model <model> (if restoring or naming a model: --model_name <model_name>)
    ```

    Evaluation
    ``` sh
    $ python main.py --mode eval --model <model> --model_name <model_name>
    ```

    Shell
    ``` sh
    $ python main.py --mode shell --model <model> --model_name <model_name>
    ```

    Raises:
        ValueError: if ``FLAGS.model`` or ``FLAGS.mode`` is not recognised.
    """
    # Load data: both splits share the same question/paragraph length limits.
    length_limits = {
        'max_question_length': FLAGS.max_question_length,
        'max_paragraph_length': FLAGS.max_paragraph_length,
    }
    train = SquadDataset(*get_data_paths(FLAGS.data_dir, name='train'),
                         **length_limits)
    dev = SquadDataset(*get_data_paths(FLAGS.data_dir, name='val'),
                       **length_limits)  # change to eval to zero if too long
    logging.info(f'Train/Dev size {train.length}/{dev.length}')

    # Load embeddings: an explicit --embed_path wins over the default file
    # inside the data directory.
    embed_path = FLAGS.embed_path
    if not embed_path:
        default_name = "glove.trimmed.{}.npz".format(FLAGS.embedding_size)
        embed_path = pjoin(FLAGS.data_dir, default_name)
    embeddings = np.load(embed_path)['glove']  # 115373

    # Build model
    requested_model = FLAGS.model
    if requested_model == 'cat':
        from networks.cat import Graph
        model = Graph(embeddings)
    elif requested_model in ('baseline', 'mixed', 'dcnplus', 'dcn'):
        model = DCN(embeddings, FLAGS.__flags)
    else:
        raise ValueError(f'{FLAGS.model} is not a supported model')

    # Run mode
    requested_mode = FLAGS.mode
    if requested_mode not in ('train', 'eval', 'overfit', 'shell'):
        raise ValueError(f'Incorrect mode entered, {FLAGS.mode}')

    if requested_mode == 'train':
        save_flags()
        do_train(model, train, dev)
    elif requested_mode == 'eval':
        do_eval(model, train, dev)
    elif requested_mode == 'overfit':
        test_overfit(model, train)
    else:
        do_shell(model, dev)
def train():
    """ Training function for Squad QA BERT model

    Skeleton of the Squad QA trainer: it prepares the dataset, the model and
    the optimizer, leaves the training loop to be filled in, and finally
    saves the model to ./checkpoint.

    Note: There are useful tools for your implementation below.

    Memory tip 1: If you delete the output tensors explicitly after every
    loss calculation like "del out, loss", tensors are garbage-collected
    before the next loss calculation so you can cut memory usage.

    Memory tip 2: If you want to keep batch_size while reducing memory usage,
    creating a virtual batch is a good solution.
    Explanation: https://medium.com/@davidlmorton/increasing-mini-batch-size-without-increasing-memory-6794e10db672

    Useful readings: https://blog.paperspace.com/pytorch-memory-multi-gpu-debugging/
    """
    # Below options are just our recommendation. You can choose your own options if you want.
    epochs = 3
    learning_rate = 5e-5
    batch_size = 8
    bert_type = 'bert-base-uncased'

    # Change the lazy option if you want fast debugging.
    raw_squad = SquadDataset()
    dataset = SquadFeatureDataset(raw_squad, bert_type=bert_type, lazy=False)

    model = BertForSquad.from_pretrained(bert_type)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    ### YOUR CODE HERE

    ### END YOUR CODE

    # Save the model in the checkpoint folder
    model.save_pretrained('./checkpoint')
def quantative_analysis(tokenizer, model):
    """
    Answer every sample of data/dev-v1.1-TA.json with ``model``, dump the
    answers to dev-v1.1-TA-answers.json and print the exact-match / F1
    scores returned by ``evaluate``.

    Args:
        tokenizer: tokenizer forwarded to ``inference_model``.
        model: model forwarded to ``inference_model``.
    """
    print("======Quantitative Analysis======")

    features = SquadFeatureDataset(SquadDataset('data/dev-v1.1-TA.json'),
                                   bert_type=bert_type,
                                   lazy=True,
                                   return_sample=True,
                                   eval=True)

    # Map each sample id to the answer produced for it.
    answers = dict()
    for index in trange(len(features), desc="Answering"):
        (input_ids, token_type_ids, _, _), sample = features[index]
        prediction = inference_model(model, tokenizer, sample['context'],
                                     sample['question'], input_ids,
                                     token_type_ids)
        answers[sample['id']] = prediction

    with open('dev-v1.1-TA-answers.json', mode='w') as answers_file:
        json.dump(answers, answers_file)

    # The scorer works on the raw JSON articles, not on the feature dataset.
    with open('data/dev-v1.1-TA.json', mode='r') as reference_file:
        reference_articles = json.load(reference_file)['data']

    results = evaluate(reference_articles, answers)
    exact_match = results['exact_match']
    f1_score = results['f1']
    print(
        f"Exact Match: {exact_match}. This should be upper than 60.0. TA score: 75.2"
    )
    print(
        f"F1 score: {f1_score}. This should be upper than 70.0. TA score: 83.9"
    )
def test_overfit():
    """
    Tests that model can overfit on small datasets.

    Builds a Baseline model with regularisation switched off
    (``keep_prob`` 1.0, frozen embeddings), shrinks the training set to its
    first ``test_size`` examples and trains repeatedly on that single batch,
    printing the loss and the evaluation result once per epoch.

    The first printed loss is compared against ``2 * log(max_paragraph_length)``
    computed from the length limit the dataset was actually built with.
    """
    data_hparams = {'max_paragraph_length': 300, 'max_question_length': 25}
    train = SquadDataset(*get_data_paths(FLAGS.data_dir, name='train'),
                         **data_hparams)

    embed_path = FLAGS.embed_path or pjoin(
        FLAGS.data_dir, "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)['glove']  # 115373

    test_hparams = {
        'learning_rate': 0.01,
        'keep_prob': 1.0,
        'trainable_embeddings': False,
        'clip_gradients': True,
        'max_gradient_norm': 5.0,
    }
    model = Baseline(embeddings, test_hparams)

    epochs = 100
    test_size = 32
    steps_per_epoch = 10

    # Use the paragraph limit of *this* dataset, not FLAGS.max_paragraph_length,
    # which may differ from the hard-coded hyper-parameters above.
    expected_initial_loss = 2 * np.log(data_hparams['max_paragraph_length'])

    # Keep only the first `test_size` examples in the dataset.
    (train.question, train.paragraph, train.question_length,
     train.paragraph_length, train.answer) = train[:test_size]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            epoch_start = timer()
            for step in range(steps_per_epoch):
                loss, _ = model.training_step(sess, *train[:test_size])
                if step == 0 and epoch == 0:
                    print(
                        f'Entropy - Result: {loss:.2f}, Expected (approx.): {expected_initial_loss:.2f}'
                    )
                if step == steps_per_epoch - 1:
                    print(f'Cross entropy: {loss}')
                    # Keep the reported length in sync with the truncation.
                    train.length = test_size
                    print(evaluate(sess, model, train, size=test_size))
            global_step = tf.train.get_global_step().eval()
            print(
                f'Epoch took {timer() - epoch_start:.2f} s (step: {global_step})'
            )
def main(_):
    """
    Load the SQuAD splits and embeddings, build the model named by
    ``FLAGS.model`` and run the mode named by ``FLAGS.mode``.

    Raises:
        ValueError: if ``FLAGS.model`` or ``FLAGS.mode`` is not recognised.
    """
    # Load data
    train_paths = get_data_paths(FLAGS.data_dir, name='train')
    train = SquadDataset(*train_paths,
                         max_question_length=FLAGS.max_question_length,
                         max_paragraph_length=FLAGS.max_paragraph_length)
    dev_paths = get_data_paths(FLAGS.data_dir, name='val')
    dev = SquadDataset(*dev_paths,
                       max_question_length=FLAGS.max_question_length,
                       max_paragraph_length=FLAGS.max_paragraph_length)  # probably not cut
    # TODO convert to TF Dataset API

    # Load embeddings
    embed_path = FLAGS.embed_path
    if not embed_path:
        embed_path = pjoin(
            FLAGS.data_dir,
            "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)['glove']  # 115373

    # 'overfit' trains the model as well, so it counts as a training mode.
    is_training = FLAGS.mode in ('train', 'overfit')

    # Build model
    model_kind = FLAGS.model
    if model_kind == 'dcnplus':
        model = DCNPlus(embeddings, FLAGS.__flags, is_training=is_training)
    elif model_kind == 'baseline':
        model = Baseline(embeddings, FLAGS.__flags)
    elif model_kind == 'cat':
        model = Graph(embeddings, is_training=is_training)
    else:
        raise ValueError(f'{FLAGS.model} is not a supported model')

    # Run mode
    run_mode = FLAGS.mode
    if run_mode == 'train':
        save_flags()
        do_train(model, train)
    elif run_mode == 'eval':
        do_eval(model, train, dev, evaluate)
    elif run_mode == 'overfit':
        test_overfit(model, train, evaluate)
    elif run_mode == 'shell':
        do_shell(model, dev)
    else:
        raise ValueError(f'Incorrect mode entered, {FLAGS.mode}')
def main(_):
    """
    Load the SQuAD splits and embeddings, build the model named by
    ``FLAGS.model`` and either train or evaluate it.

    In 'train' mode the flags are first dumped to ``flags.json`` inside
    ``FLAGS.train_dir``.

    Raises:
        ValueError: if ``FLAGS.model`` or ``FLAGS.mode`` is not recognised.
    """
    # Load data
    def load_split(split_name):
        # Both splits use the same length limits taken from the flags.
        return SquadDataset(
            *get_data_paths(FLAGS.data_dir, name=split_name),
            max_question_length=FLAGS.max_question_length,
            max_paragraph_length=FLAGS.max_paragraph_length)

    train = load_split('train')
    dev = load_split('val')  # probably not cut
    # TODO convert to TF Dataset API

    logging.info(f'Train/Dev size {train.length}/{dev.length}')

    # Load embeddings
    default_embed_name = "glove.trimmed.{}.npz"
    embed_path = FLAGS.embed_path or pjoin(
        FLAGS.data_dir, default_embed_name.format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)['glove']  # 115373

    is_training = FLAGS.mode == 'train'

    # Build model
    if FLAGS.model == 'dcnplus':
        model = DCNPlus(embeddings, FLAGS.__flags, is_training=is_training)
    elif FLAGS.model == 'baseline':
        model = Baseline(embeddings, FLAGS.__flags)
    elif FLAGS.model == 'cat':
        model = Graph(embeddings, is_training=is_training)
    else:
        raise ValueError(f'{FLAGS.model} is not a supported model')

    # Run mode
    if FLAGS.mode == 'eval':
        do_eval(model, train, dev, evaluate)
        return

    if FLAGS.mode != 'train':
        raise ValueError(f'Incorrect mode entered, {FLAGS.mode}')

    # Persist the flags next to the checkpoints before training starts.
    flags_path = os.path.join(FLAGS.train_dir, "flags.json")
    with open(flags_path, 'w') as flags_file:
        json.dump(FLAGS.__flags, flags_file, indent=4)
    do_train(model, train, dev, evaluate)
def main(_):
    """
    Load the SQuAD splits and embeddings, optionally load a pre-trained
    siamese model, build the requested model and run the requested mode.

    Typical usage

    For <model_name> see your folder name in ../checkpoints.

    Training
    ``` sh
    $ python main.py --mode train --model <model> (if restoring or naming a model: --model_name <model_name>)
    ```

    Evaluation
    ``` sh
    $ python main.py --mode eval --model <model> --model_name <model_name>
    ```

    Shell
    ``` sh
    $ python main.py --mode shell --model <model> --model_name <model_name>
    ```

    When ``FLAGS.use_siamese`` is false, no siamese-specific keyword
    arguments are passed on (previously this path failed with a NameError
    because the siamese variables were never bound).

    Raises:
        ValueError: if ``FLAGS.model`` or ``FLAGS.mode`` is not recognised.
    """
    # Load data
    train = SquadDataset(*get_data_paths(FLAGS.data_dir, name='train'),
                         max_question_length=FLAGS.max_question_length,
                         max_paragraph_length=FLAGS.max_paragraph_length)
    dev = SquadDataset(*get_data_paths(FLAGS.data_dir, name='val'),
                       max_question_length=FLAGS.max_question_length,
                       max_paragraph_length=FLAGS.max_paragraph_length
                       )  # change to eval to zero if too long

    logging.info(f'Train/Dev size {train.length}/{dev.length}')

    # Load embeddings
    embed_path = FLAGS.embed_path or pjoin(
        FLAGS.data_dir, "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)['glove']  # 115373

    # Siamese state stays None/empty unless the siamese model is requested,
    # so the code below never reads an unbound name.
    siamese_config = None
    run_kwargs = {}
    if FLAGS.use_siamese:
        # get config file for siamese model
        siamese_config_path = '../../paraphrase-id-tensorflow-master/logs/baseline_siamese/{}/trainparams.json'.format(
            FLAGS.siamese_model_num)
        with open(siamese_config_path, 'r') as f:
            siamese_config = json.load(f)
        siamese_config['mode'] = 'test'
        checkpoint_dir = '../../paraphrase-id-tensorflow-master/models/baseline_siamese/{}/'.format(
            FLAGS.siamese_model_num)
        siamese_graph = ImportModel(checkpoint_dir, siamese_config, embeddings)
        run_kwargs['input_model'] = siamese_graph

    # Build model
    if FLAGS.model in ('baseline', 'mixed', 'dcnplus', 'dcn'):
        dcn_kwargs = {}
        if siamese_config is not None:
            dcn_kwargs['siamese_output_dim'] = siamese_config['rnn_hidden_size']
        model = DCN(embeddings, FLAGS.__flags, **dcn_kwargs)
    elif FLAGS.model == 'cat':
        from networks.cat import Graph
        model = Graph(embeddings)
    else:
        raise ValueError(f'{FLAGS.model} is not a supported model')

    # Run mode
    if FLAGS.mode == 'train':
        save_flags()
        do_train(model, train, dev, **run_kwargs)
    elif FLAGS.mode == 'eval':
        do_eval(model, train, dev, **run_kwargs)
    elif FLAGS.mode == 'overfit':
        test_overfit(model, train, **run_kwargs)
    elif FLAGS.mode == 'shell':
        do_shell(model, dev, **run_kwargs)
    else:
        raise ValueError(f'Incorrect mode entered, {FLAGS.mode}')
def train():
    """ Training function for Squad QA BERT model

    Trains ``BertForSquad`` on the Squad feature dataset for a fixed number
    of epochs and saves the result to ./checkpoint. The loss is the mean of
    the NLL losses of the start and end positions.

    Relies on a module-level ``device`` being defined.

    Memory tip 1: If you delete the output tensors explicitly after every
    loss calculation like "del out, loss", tensors are garbage-collected
    before the next loss calculation so you can cut memory usage.

    Memory tip 2: If you want to keep batch_size while reducing memory usage,
    creating a virtual batch is a good solution.
    Explanation: https://medium.com/@davidlmorton/increasing-mini-batch-size-without-increasing-memory-6794e10db672

    Useful readings: https://blog.paperspace.com/pytorch-memory-multi-gpu-debugging/
    """
    # Below options are just our recommendation. You can choose your own options if you want.
    epochs = 3
    learning_rate = 5e-5
    batch_size = 8
    bert_type = 'bert-base-uncased'

    # Change the lazy option if you want fast debugging.
    dataset = SquadFeatureDataset(SquadDataset(), bert_type=bert_type, lazy=True)

    model = BertForSquad.from_pretrained(bert_type).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, collate_fn=squad_feature_collate_fn)

    # Make sure dropout is active while training: Hugging Face
    # `from_pretrained` hands models back in eval mode (assuming BertForSquad
    # is a PreTrainedModel subclass; the call is harmless otherwise).
    model.train()

    for epoch in range(epochs):
        print('epoch:{}'.format(epoch))
        # A fresh progress bar per epoch; a tqdm object that has been
        # exhausted once no longer displays progress when iterated again.
        for data in tqdm(train_loader, leave=False):
            optimizer.zero_grad()

            input_ids, attention_mask, token_type_ids, start_token_pos, end_token_pos = data
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            start_token_pos = start_token_pos.to(device)
            end_token_pos = end_token_pos.to(device)

            start_logits, end_logits = model(input_ids,
                                             attention_mask=attention_mask,
                                             token_type_ids=token_type_ids)
            # NOTE(review): taking .log() assumes the model returns
            # probabilities rather than raw scores - confirm against
            # BertForSquad.forward; raw scores would need log_softmax.
            start_logits = start_logits.log()
            end_logits = end_logits.log()

            # Targets are clamped into [0, seq_len]; the value seq_len is then
            # excluded from the loss via ignore_index. The loss object depends
            # on the per-batch sequence length, so it is built per batch.
            ignored_index = start_logits.size(1)
            start_token_pos.clamp_(0, ignored_index)
            end_token_pos.clamp_(0, ignored_index)

            loss_fct = NLLLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_token_pos)
            end_loss = loss_fct(end_logits, end_token_pos)
            loss = (start_loss + end_loss) / 2

            loss.backward()
            optimizer.step()
            print(loss.item())

    # Save the model in the checkpoint folder
    model.save_pretrained('./checkpoint')
val_questions, truncation=True, padding=True) ''' last step preparing model inputs ''' add_token_positions(train_encodings, train_answers) add_token_positions(val_encodings, val_answers) device = 'cuda' if torch.cuda.is_available() else 'cpu' if device == 'cuda': torch.cuda.set_device(DEVICE_ID) # use an unoccupied GPU ''' Torch dataset object ''' train_dataset = SquadDataset(train_encodings, device) val_dataset = SquadDataset(val_encodings, device) model.to(device) model.train() train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True) optim = AdamW(model.parameters(), lr=5e-5) for epoch in range(NUM_EPOCH): for batch in tqdm(train_loader): optim.zero_grad() input_ids = batch['input_ids'].to(device) attention_mask = batch['attention_mask'].to(device) start_positions = batch['start_positions'].to(device)
jieba_sgns_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'), os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl')) jieba_flag_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'), os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl')) trainset_roots = [ os.path.join(data_root_folder, 'val.txt') ] embed_lists = { 'jieba': [jieba_base_v.embeddings, jieba_sgns_v.embeddings, jieba_flag_v.embeddings], 'pyltp': [] } transform = SquadTransform(jieba_base_v, jieba_sgns_v, jieba_flag_v) train_dataset = SquadDataset(train_file, transform, use_rouge=True, max_size=None) dev_dataset = SquadDataset(val_file, transform, use_rouge=True, max_size=None) num_workers = 0 train_loader = DataLoader( dataset=train_dataset, batch_size=BATCH_SIZE, num_workers=num_workers, collate_fn=transform.batchify, ) dev_loader = DataLoader( dataset=dev_dataset, batch_size=BATCH_SIZE, num_workers=num_workers,
def train():
    """ Training function for Squad QA BERT model

    Trains ``BertForSquad`` on bucketed Squad batches for a fixed number of
    epochs, summing the start- and end-position cross-entropy losses, and
    saves the model to ./checkpoint.

    Memory tip 1: If you delete the output tensors explicitly after every
    loss calculation like "del out, loss", tensors are garbage-collected
    before the next loss calculation so you can cut memory usage.

    Memory tip 2: If you want to keep batch_size while reducing memory usage,
    creating a virtual batch is a good solution.
    Explanation: https://medium.com/@davidlmorton/increasing-mini-batch-size-without-increasing-memory-6794e10db672

    Useful readings: https://blog.paperspace.com/pytorch-memory-multi-gpu-debugging/
    """
    # Below options are just our recommendation. You can choose your own options if you want.
    epochs = 3
    learning_rate = 5e-5
    batch_size = 6
    bert_type = 'bert-base-uncased'
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Change the lazy option if you want fast debugging.
    dataset = SquadFeatureDataset(SquadDataset(), bert_type=bert_type, lazy=False)

    model = BertForSquad.from_pretrained(bert_type)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    ### YOUR CODE HERE
    bucket_sampler = SquadBucketSampler(dataset, batch_size, shuffle=True)
    data_loader = DataLoader(dataset,
                             batch_sampler=bucket_sampler,
                             collate_fn=squad_feature_collate_fn)
    criterion = torch.nn.CrossEntropyLoss()

    model.train()
    losses = []  # summed loss of each epoch
    for _ in range(epochs):
        running_loss = 0.0
        for batch in tqdm(data_loader):
            # Each batch is (input_ids, attention_mask, token_type_ids,
            # start_pos, end_pos); move all of them to the training device.
            input_ids, attention_mask, token_type_ids, start_pos, end_pos = batch
            input_ids, attention_mask, token_type_ids, start_pos, end_pos = (
                input_ids.to(device),
                attention_mask.to(device),
                token_type_ids.to(device),
                start_pos.to(device),
                end_pos.to(device),
            )

            optimizer.zero_grad()
            start_logits, end_logits = model(input_ids, attention_mask,
                                             token_type_ids)
            loss = criterion(start_logits, start_pos) + criterion(end_logits, end_pos)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Drop the references early to keep memory usage down.
            del start_logits, end_logits, loss  # MemoryError
        losses.append(running_loss)
    ### END YOUR CODE

    # Save the model in the checkpoint folder
    model.save_pretrained('./checkpoint')
print(f"Using {device}.") print(f"Reading {sys.argv[1]}...") df = extract_data(sys.argv[1], contain_answers=False).set_index(['id']) print(f"DataFrame created.") print("Tokenizing the DataFrame...") model = DistilBertKnowledge(alpha=0.5) DistilBertTokenizer.from_pretrained( model.info.pretrained_model).save_pretrained('slow_tokenizer/') tokenizer = BertWordPieceTokenizer('slow_tokenizer/vocab.txt', lowercase=True) df = process_dataframe(df, tokenizer, contain_answers=False) print("Tokenization complete.") dataset = SquadDataset(df, model.info, contain_answers=False) loader = DataLoader(dataset, batch_size=16, num_workers=4, pin_memory=True) print("Loading model weights...") model.load_state_dict(torch.load('model.pt')) model = model.to(device) print("Model loaded.") model.eval() print("Starting evaluation...") starts, ends = [], [] num_batches = len(loader) for idx, input in enumerate(loader): if (idx + 1) % 100 == 0: print(f'Batch {idx + 1:{len(str(num_batches))}}/{num_batches}') with torch.no_grad():
def train_model(preprocessor,
                base_model,
                frac_train_data,
                frac_val_data,
                batch_size=8,
                n_epoch=10,
                log_every=1,
                eval_every=10,
                save_every=300,
                checkpoint_fn=None,
                force_cpu=False,
                save_model_prefix=""
                ) -> None:
    """
    Fine-tunes transformer model with custom head on custom data.

    Training runs until ``n_epoch`` epochs are done; the training loss is
    printed every ``log_every`` steps, one evaluation batch is scored every
    ``eval_every`` steps and a checkpoint is written every ``save_every``
    steps.

    Parameters
    ----------
    preprocessor (SquadPreprocessor, SquadPlausibleAnswersPreprocessor) - pre-processor class.
    base_model (nn.Module) - model class, sub-class of nn.Module.
    frac_train_data (float) - fraction of training data to sample randomly. Useful with limited memory.
    frac_val_data (float) - fraction of validation data to sample randomly.
    batch_size (int) - batch size for training.
    n_epoch (int) - number of epochs for training.
    log_every (int) - steps frequency to print training loss.
    eval_every (int) - steps frequency to print eval loss.
    save_every (int) - steps frequency to save checkpoint.
    checkpoint_fn (None or str) - if str, uses as filename to load a checkpoint model, to continue training.
    force_cpu - forces CPU, even on systems with detectable CUDA.
        Useful for old CUDA architectures, which aren't supported anymore.
    save_model_prefix (str) - prefix to save the model checkpoint.
    """
    # Data: encode both splits, then wrap them in datasets and loaders.
    train_enc, val_enc = preprocessor().get_encodings(
        random_sample_train=frac_train_data,
        random_sample_val=frac_val_data,
        return_tensors="pt")
    train_ds = SquadDataset(train_enc)
    val_ds = SquadDataset(val_enc)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    eval_dl = DataLoader(val_ds, batch_size=64, shuffle=True)

    dbm = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                          return_dict=True)

    # CUDA is used only when it is available and not explicitly disabled.
    use_cpu = force_cpu or not torch.cuda.is_available()
    device = torch.device('cpu') if use_cpu else torch.device('cuda')

    # Counters start from scratch unless a checkpoint is being resumed.
    checkpoint = None
    epoch, train_iter = 0, 0
    loss_eval = 1000
    if checkpoint_fn is not None:
        checkpoint = torch.load(checkpoint_fn, map_location=device)
        # The loop below increments before use, hence the step back by one.
        epoch = checkpoint['epoch'] - 1.0
        train_iter = checkpoint['train_iter']

    model = base_model(transformer_model=dbm, device=device)
    if checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])

    logging.info(f"Using device: {device}")
    model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=5e-5)
    if checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    while epoch < n_epoch:
        epoch += 1
        for train_data in train_dl:
            train_iter += 1

            optimizer.zero_grad()
            loss = model.compute_loss(*model(train_data))
            loss.backward()
            optimizer.step()

            if train_iter % log_every == 0:
                print('Train: Epoch: %d, iter: %d, avg. loss: %.2f'
                      % (epoch, train_iter, loss))

            if train_iter % eval_every == 0:
                # Score a single evaluation batch without tracking gradients.
                with torch.no_grad():
                    model.eval()
                    eval_batch = next(iter(eval_dl))
                    loss_eval = model.compute_loss(*model(eval_batch))
                    print('\nEval: Epoch: %d, iter: %d, avg. loss: %.2f\n'
                          % (epoch, train_iter, loss_eval))
                model.train()

            if train_iter % save_every == 0:
                checkpoint_path = f"model_checkpoint/{save_model_prefix}_model_{train_iter}.pt"
                model.save(checkpoint_path,
                           train_iter=train_iter,
                           epoch=epoch,
                           optimizer=optimizer,
                           train_loss=loss,
                           eval_loss=loss_eval)
def fit(self, config, device):
    """
    Train the model selected by ``config`` and validate it after every
    iteration, saving a checkpoint whenever exact match exceeds 65.

    Args:
        config: mapping of settings; the keys read here are
            "char_embeddings", "embedding_size", "char_maxsize_vocab",
            "batch_size", "modelname", "optimizer", "learning_rate",
            "max_iterations" and the optional "ema".
        device: device the iterators and the model are placed on.

    Raises:
        NotImplementedError: if "modelname" or "optimizer" holds an
            unsupported value.

    A KeyboardInterrupt stops training early without propagating; the
    elapsed time is logged in every case.
    """
    logging.info(json.dumps(config, indent=4, sort_keys=True))

    if config["char_embeddings"]:
        fields = SquadDataset.prepare_fields_char()
    else:
        fields = SquadDataset.prepare_fields()

    train, val = SquadDataset.splits(fields)
    fields = dict(fields)

    fields["question"].build_vocab(train, val, vectors=GloVe(name='6B', dim=config["embedding_size"]))

    # Exact type comparison on purpose: isinstance() would also match
    # subclasses of RawField.
    if type(fields["question_char"]) is not torchtext.data.field.RawField:
        fields["question_char"].build_vocab(train, val, max_size=config["char_maxsize_vocab"])

    def by_descending_length(example):
        # Longest question+document pairs first.
        return -(len(example.question) + len(example.document))

    train_iter = BucketIterator(train, sort_key=by_descending_length,
                                shuffle=True, sort=False, sort_within_batch=True,
                                batch_size=config["batch_size"], train=True,
                                repeat=False,
                                device=device)

    val_iter = BucketIterator(val, sort_key=by_descending_length, sort=True,
                              batch_size=config["batch_size"],
                              repeat=False,
                              device=device)

    if config["modelname"] == "baseline":
        model = Baseline(config, fields["question"].vocab).to(device)
    elif config["modelname"] == "bidaf_simplified":
        model = BidafSimplified(config, fields["question"].vocab).to(device)
    elif config["modelname"] == "bidaf":
        model = BidAF(config, fields['question'].vocab, fields["question_char"].vocab).to(device)
    else:
        # Fail explicitly instead of hitting an unbound `model` below.
        raise NotImplementedError(f"Option {config['modelname']} for \"modelname\" setting is undefined.")

    logging.info(f"Models has {count_parameters(model)} parameters")
    param_sizes, param_shapes = report_parameters(model)
    param_sizes = "\n'".join(str(param_sizes).split(", '"))
    param_shapes = "\n'".join(str(param_shapes).split(", '"))
    logging.debug(f"Model structure:\n{param_sizes}\n{param_shapes}\n")

    if config["optimizer"] == "adam":
        optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()),
                         lr=config["learning_rate"])
    else:
        raise NotImplementedError(f"Option {config['optimizer']} for \"optimizer\" setting is undefined.")

    use_ema = bool("ema" in config and config["ema"])
    ema = None

    start_time = time.time()
    try:
        best_val_loss = math.inf
        best_val_f1 = 0
        best_em = 0
        for it in range(config["max_iterations"]):
            logging.info(f"Iteration {it}")
            if use_ema:
                # NOTE(review): the EMA state is re-registered on every
                # iteration, as in the original code - confirm this is
                # intended rather than registering once before the loop.
                ema = EMA.ema_register(config, model)

            self.train_epoch(model, CrossEntropyLoss(), optimizer, train_iter)

            if use_ema:
                EMA.ema_update(ema, model)

            validation_loss, em, f1 = self.validate(model, CrossEntropyLoss(reduction='none'), val_iter,
                                                    ema=ema)
            if validation_loss < best_val_loss:
                best_val_loss = validation_loss
            if f1 > best_val_f1:
                # Track the F1 score itself (was mistakenly the loss).
                best_val_f1 = f1
            if em > best_em:
                best_em = em
            logging.info(f"BEST L/F1/EM = {best_val_loss:.2f}/{best_val_f1:.2f}/{best_em:.2f}")

            if em > 65:
                # Do all this on CPU, this is memory exhaustive!
                model.to(torch.device("cpu"))

                # Fixed-point formatting in both branches; a bare ".2" would
                # render two significant digits (e.g. "6.6e+01").
                checkpoint_path = (f"saved/checkpoint"
                                   f"_{str(self.__class__)}"
                                   f"_EM_{em:.2f}_F1_{f1:.2f}_L_{validation_loss:.2f}_{get_timestamp()}"
                                   f"_{socket.gethostname()}.pt")

                if use_ema:
                    # backup current params and load ema params
                    backup_params = EMA.ema_backup_and_loadavg(ema, model)
                    torch.save(model, checkpoint_path)
                    # load back backed up params
                    EMA.ema_restore_backed_params(backup_params, model)
                else:
                    torch.save(model, checkpoint_path)

                model.to(device)
            logging.info(f"Validation loss: {validation_loss}")

    except KeyboardInterrupt:
        logging.info('-' * 120)
        logging.info('Exit from training early.')
    finally:
        logging.info(f'Finished after {(time.time() - start_time) / 60} minutes.')