config.__dict__.update(corpus_config_dict)

# Set random seeds
torch.manual_seed(config.seed)
torch.cuda.manual_seed(config.seed)
random.seed(config.seed)
np.random.seed(config.seed)

# Tokenizers
special_token_dict = {
    "speaker1_token": "<speaker1>",
    "speaker2_token": "<speaker2>"
}
if config.tokenizer == "ws":
    tokenizer = WhiteSpaceTokenizer(
        word_count_path=config.word_count_path,
        vocab_size=config.vocab_size,
        special_token_dict=special_token_dict
    )
elif config.tokenizer == "roberta":
    tokenizer = ModRobertaTokenizer(
        model_size=config.model_size,
        special_token_dict=special_token_dict
    )

# Data loaders
with open(config.dataset_path, encoding="utf-8") as f:
    dataset = json.load(f)
data_source = DataSourceSupervised(
    data=dataset["test"],
    config=config,
    tokenizer=tokenizer,
)
print(data_source.statistics)
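# Minimal consumption sketch for the data source built above. The
# epoch_init(shuffle=...) / next(batch_size) interface is inferred from the
# training and evaluation loops later in this file; next() returning None
# marks the end of the epoch.
data_source.epoch_init(shuffle=False)
while True:
    batch_data = data_source.next(config.eval_batch_size)
    if batch_data is None:
        break
    # ... pass batch_data to model.evaluate_step() / model.test_step() ...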
    print(s)  # tail of a logging helper, presumably mlog()

# set random seeds
torch.manual_seed(config.seed)
torch.cuda.manual_seed(config.seed)
random.seed(config.seed)
np.random.seed(config.seed)

# tokenizers
special_token_dict = {
    "speaker1_token": "<speaker1>",
    "speaker2_token": "<speaker2>"
}
if config.tokenizer == "ws":
    tokenizer = WhiteSpaceTokenizer(
        word_count_path=config.word_count_path,
        vocab_size=config.vocab_size,
        special_token_dict=special_token_dict
    )
eval_tokenizer = WhiteSpaceTokenizer(config.word_count_path, 100000)

# data loaders
intrinsic_stat_reporter = StatisticsReporter()
with open(config.dataset_path, encoding="utf-8") as f:
    dataset = json.load(f)
mlog("----- Loading test data -----")
test_data_source = DataSource(
    data=dataset["test"],
    config=config,
    tokenizer=tokenizer
)
mlog(str(test_data_source.statistics))

# metrics calculator
metrics = SentenceMetrics(config.eval_word_embedding_path, eval_tokenizer)
        log_f.write(s + "\n")
    print(s)  # tail of a logging helper, presumably mlog()

# set random seeds
torch.manual_seed(config.seed)
torch.cuda.manual_seed(config.seed)
random.seed(config.seed)
np.random.seed(config.seed)

# tokenizers
special_token_dict = {}
if config.tokenizer == "ws":
    tokenizer = WhiteSpaceTokenizer(
        word_count_path=config.word_count_path,
        vocab_size=config.vocab_size,
        special_token_dict=special_token_dict
    )

# data loaders & number reporters
trn_reporter = StatisticsReporter()
dev_reporter = StatisticsReporter()
with open(config.dataset_path, encoding="utf-8") as f:
    dataset = json.load(f)
mlog("----- Loading training data -----")
train_data_source = DataSource(
    data=dataset["train"],
    config=config,
    tokenizer=tokenizer
)
mlog(str(train_data_source.statistics))
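# A minimal sketch of the mlog() helper whose tail appears above. The signature,
# default log path, and enable_log flag are assumptions, not taken from the
# source; the run_train/run_test functions further down use a three-argument
# variant (message, config, log file name) with the same write-and-print behavior.
def mlog(s, log_path="run.log", enable_log=True):
    # append the message to the log file and echo it to stdout
    if enable_log:
        with open(log_path, "a+", encoding="utf-8") as log_f:
            log_f.write(s + "\n")
    print(s)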
    print(s)  # tail of a logging helper, presumably mlog()

# set random seeds
torch.manual_seed(config.seed)
torch.cuda.manual_seed(config.seed)
random.seed(config.seed)
np.random.seed(config.seed)

# tokenizers
special_token_dict = {
    "speaker1_token": "<speaker1>",
    "speaker2_token": "<speaker2>"
}
if config.tokenizer == "ws":
    tokenizer = WhiteSpaceTokenizer(
        word_count_path=config.word_count_path,
        vocab_size=config.vocab_size,
        special_token_dict=special_token_dict
    )
elif config.tokenizer == "gpt2":
    tokenizer = ModGPT2Tokenizer(
        model_size=config.model_size,
        special_token_dict=special_token_dict
    )

# data loaders & number reporters
trn_reporter = StatisticsReporter()
dev_reporter = StatisticsReporter()
with open(f"{config.dataset_path}.aggregate", encoding="utf-8") as f:
    dataset = json.load(f)
mlog("----- Loading training data -----")
train_data_source = DataSource(
    data=dataset["train"],
    config=config,
    tokenizer=tokenizer,
)
    print(s)  # tail of a logging helper, presumably mlog()

# set random seeds
torch.manual_seed(config.seed)
torch.cuda.manual_seed(config.seed)
random.seed(config.seed)
np.random.seed(config.seed)

# tokenizers
special_token_dict = {
    "speaker1_token": "<speaker1>",
    "speaker2_token": "<speaker2>"
}
if config.tokenizer == "ws":
    tokenizer = WhiteSpaceTokenizer(
        word_count_path=config.word_count_path,
        vocab_size=config.vocab_size,
        special_token_dict=special_token_dict
    )
elif config.tokenizer == "roberta":
    tokenizer = ModRobertaTokenizer(
        model_size=config.model_size,
        special_token_dict=special_token_dict
    )

# data loaders & number reporters
trn_reporter = StatisticsReporter()
dev_reporter = StatisticsReporter()
with open(config.dataset_path, encoding="utf-8") as f:
    dataset = json.load(f)
mlog("----- Loading unsupervised training data -----")
train_data_source = DataSourceUnsupervised(
    data=dataset["train"],
    config=config,
    tokenizer=tokenizer
)
mlog(str(train_data_source.statistics))
def run_train(config):
    # tokenizers
    special_token_dict = {
        "speaker1_token": "<speaker1>",
        "speaker2_token": "<speaker2>"
    }
    tokenizer = WhiteSpaceTokenizer(
        word_count_path=config.word_count_path,
        vocab_size=config.vocab_size,
        special_token_dict=special_token_dict
    )
    label_token_dict = {
        f"label_{label_idx}_token": label
        for label_idx, label in enumerate(config.joint_da_seg_recog_labels)
    }
    label_token_dict.update({
        "pad_token": "<pad>",
        "bos_token": "<t>",
        "eos_token": "</t>"
    })
    label_tokenizer = CustomizedTokenizer(token_dict=label_token_dict)

    # metrics calculator
    metrics = DAMetrics()

    # define logger
    MODEL_NAME = config.model
    LOG_FILE_NAME = "{}.seed_{}.{}".format(
        MODEL_NAME,
        config.seed,
        time.strftime("%Y%m%d-%H%M%S", time.localtime())
    )
    if config.filename_note:
        LOG_FILE_NAME += f".{config.filename_note}"

    # data loaders & number reporters
    trn_reporter = StatisticsReporter()
    dev_reporter = StatisticsReporter()
    with open(config.dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)
    mlog("----- Loading training data -----", config, LOG_FILE_NAME)
    train_data_source = DataSource(
        data=dataset["train"],
        config=config,
        tokenizer=tokenizer,
        label_tokenizer=label_tokenizer
    )
    mlog(str(train_data_source.statistics), config, LOG_FILE_NAME)
    mlog("----- Loading dev data -----", config, LOG_FILE_NAME)
    dev_data_source = DataSource(
        data=dataset["dev"],
        config=config,
        tokenizer=tokenizer,
        label_tokenizer=label_tokenizer
    )
    mlog(str(dev_data_source.statistics), config, LOG_FILE_NAME)

    # build model
    if config.model == "ed":
        Model = EDSeqLabeler
    elif config.model == "attn_ed":
        Model = AttnEDSeqLabeler
    model = Model(config, tokenizer, label_tokenizer)

    # model adaptation
    if torch.cuda.is_available():
        mlog("----- Using GPU -----", config, LOG_FILE_NAME)
        model = model.cuda()
    if config.model_path:
        model.load_model(config.model_path)
        mlog("----- Model loaded -----", config, LOG_FILE_NAME)
        mlog(f"model path: {config.model_path}", config, LOG_FILE_NAME)

    # Build optimizer
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config.init_lr,
        weight_decay=config.l2_penalty
    )

    # Build lr scheduler
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer=optimizer,
        mode="min",
        factor=config.lr_decay_rate,
        patience=2,
    )

    # log hyper-parameters
    start_time = time.time()
    mlog("----- Hyper-parameters -----", config, LOG_FILE_NAME)
    for k, v in sorted(dict(config.__dict__).items()):
        mlog("{}: {}".format(k, v), config, LOG_FILE_NAME)

    # here we go
    n_step = 0
    for epoch in range(1, config.n_epochs + 1):
        lr = list(lr_scheduler.optimizer.param_groups)[0]["lr"]
        if lr <= config.min_lr:
            break

        # Train
        n_batch = 0
        train_data_source.epoch_init(shuffle=True)
        while True:
            batch_data = train_data_source.next(config.batch_size)
            if batch_data is None:
                break

            # Forward
            model.train()
            ret_data, ret_stat = model.train_step(batch_data)
            trn_reporter.update_data(ret_stat)

            # Backward
            loss = ret_data["loss"]
            loss.backward()
            if config.gradient_clip > 0.0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
            optimizer.step()
            optimizer.zero_grad()

            # Check loss
            if n_step > 0 and n_step % config.check_loss_after_n_step == 0:
                log_s = f"{time.time()-start_time:.2f}s Epoch {epoch} batch {n_batch} - "
                log_s += trn_reporter.to_string()
                mlog(log_s, config, LOG_FILE_NAME)
                trn_reporter.clear()

            # Evaluate on dev dataset
            if n_step > 0 and n_step % config.validate_after_n_step == 0:
                model.eval()
                log_s = f"<Dev> learning rate: {lr}\n"
                mlog(log_s, config, LOG_FILE_NAME)
                pred_labels, true_labels = [], []
                dev_data_source.epoch_init(shuffle=False)
                while True:
                    batch_data = dev_data_source.next(config.eval_batch_size)
                    if batch_data is None:
                        break

                    ret_data, ret_stat = model.evaluate_step(batch_data)
                    dev_reporter.update_data(ret_stat)

                    ret_data, ret_stat = model.test_step(batch_data)
                    refs = batch_data["Y"][:, 1:].tolist()
                    hyps = ret_data["symbols"].tolist()
                    for true_label_ids, pred_label_ids in zip(refs, hyps):
                        end_idx = true_label_ids.index(label_tokenizer.eos_token_id)
                        true_labels.append([
                            label_tokenizer.id2word[label_id]
                            for label_id in true_label_ids[:end_idx]
                        ])
                        pred_labels.append([
                            label_tokenizer.id2word[label_id]
                            for label_id in pred_label_ids[:end_idx]
                        ])

                log_s = f"\n<Dev> - {time.time()-start_time:.3f}s - "
                log_s += dev_reporter.to_string()
                mlog(log_s, config, LOG_FILE_NAME)

                metrics_results = metrics.batch_metrics(true_labels, pred_labels)
                # `experiment` (an external experiment tracker) is assumed to be set up elsewhere
                experiment.log_metrics(metrics_results)
                log_s = \
                    f"\tDSER: {100*metrics_results['DSER']:.2f}\n" \
                    f"\tseg WER: {100*metrics_results['strict segmentation error']:.2f}\n" \
                    f"\tDER: {100*metrics_results['DER']:.2f}\n" \
                    f"\tjoint WER: {100*metrics_results['strict joint error']:.2f}\n" \
                    f"\tMacro F1: {100*metrics_results['Macro F1']:.2f}\n" \
                    f"\tMicro F1: {100*metrics_results['Micro F1']:.2f}\n" \
                    f"\tMacro LWER: {100*metrics_results['Macro LWER']:.2f}\n" \
                    f"\tMicro LWER: {100*metrics_results['Micro LWER']:.2f}\n"
                mlog(log_s, config, LOG_FILE_NAME)

                # Save model if it has better monitor measurement
                if config.save_model:
                    if not os.path.exists(f"{config.task_data_dir}/model/"):
                        os.makedirs(f"{config.task_data_dir}/model/")
                    torch.save(
                        model.state_dict(),
                        f"{config.task_data_dir}/model/{LOG_FILE_NAME}.model.pt"
                    )
                    mlog(
                        f"model saved to {config.task_data_dir}/model/{LOG_FILE_NAME}.model.pt",
                        config, LOG_FILE_NAME
                    )
                    if torch.cuda.is_available():
                        model = model.cuda()

                # Decay learning rate
                lr_scheduler.step(dev_reporter.get_value("monitor"))
                dev_reporter.clear()

            # Finished a step
            n_batch += 1
            n_step += 1

    # Evaluate on test dataset at the end of training
    mlog("----- EVALUATING at end of training -----", config, LOG_FILE_NAME)
    mlog("----- Loading test data -----", config, LOG_FILE_NAME)
    test_data_source = DataSource(
        data=dataset["test"],
        config=config,
        tokenizer=tokenizer,
        label_tokenizer=label_tokenizer
    )
    mlog(str(test_data_source.statistics), config, LOG_FILE_NAME)

    model.eval()
    for set_name, data_source in [("DEV", dev_data_source), ("TEST", test_data_source)]:
        pred_labels, true_labels = [], []
        data_source.epoch_init(shuffle=False)
        RES_FILE_NAME = set_name + "_" + LOG_FILE_NAME
        s = "LABELS\tPREDS"
        reslog(s, RES_FILE_NAME)
        while True:
            batch_data = data_source.next(config.eval_batch_size)
            if batch_data is None:
                break
            ret_data, ret_stat = model.test_step(batch_data)
            refs = batch_data["Y"][:, 1:].tolist()
            hyps = ret_data["symbols"].tolist()
            for true_label_ids, pred_label_ids in zip(refs, hyps):
                end_idx = true_label_ids.index(label_tokenizer.eos_token_id)
                true_syms = [
                    label_tokenizer.id2word[label_id]
                    for label_id in true_label_ids[:end_idx]
                ]
                pred_syms = [
                    label_tokenizer.id2word[label_id]
                    for label_id in pred_label_ids[:end_idx]
                ]
                s = " ".join(true_syms) + "\t" + " ".join(pred_syms)
                reslog(s, RES_FILE_NAME)
                true_labels.append(true_syms)
                pred_labels.append(pred_syms)

        log_s = f"\n<{set_name}> - {time.time()-start_time:.3f}s - "
        mlog(log_s, config, LOG_FILE_NAME)
        metrics_results = metrics.batch_metrics(true_labels, pred_labels)
        log_s = \
            f"\tDSER: {100*metrics_results['DSER']:.2f}\n" \
            f"\tseg WER: {100*metrics_results['strict segmentation error']:.2f}\n" \
            f"\tDER: {100*metrics_results['DER']:.2f}\n" \
            f"\tjoint WER: {100*metrics_results['strict joint error']:.2f}\n" \
            f"\tMacro F1: {100*metrics_results['Macro F1']:.2f}\n" \
            f"\tMicro F1: {100*metrics_results['Micro F1']:.2f}\n" \
            f"\tMacro LWER: {100*metrics_results['Macro LWER']:.2f}\n" \
            f"\tMicro LWER: {100*metrics_results['Micro LWER']:.2f}\n"
        mlog(log_s, config, LOG_FILE_NAME)
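# run_train/run_test rely on a StatisticsReporter with roughly the interface
# below. This is an illustrative stub under assumptions (simple running means),
# not the project's implementation.
class StatisticsReporterStub:
    def __init__(self):
        self._data = {}

    def update_data(self, stat_dict):
        # accumulate per-batch scalar statistics keyed by name
        for k, v in stat_dict.items():
            self._data.setdefault(k, []).append(v)

    def get_value(self, key):
        # running mean of one statistic, e.g. the "monitor" value fed to the lr scheduler
        values = self._data[key]
        return sum(values) / len(values)

    def to_string(self):
        # one-line summary of all tracked statistics, suitable for mlog()
        return ", ".join(f"{k}: {self.get_value(k):.4f}" for k in sorted(self._data))

    def clear(self):
        self._data = {}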
def run_test(config):
    # tokenizers
    special_token_dict = {
        "speaker1_token": "<speaker1>",
        "speaker2_token": "<speaker2>"
    }
    tokenizer = WhiteSpaceTokenizer(
        word_count_path=config.word_count_path,
        vocab_size=config.vocab_size,
        special_token_dict=special_token_dict
    )
    label_token_dict = {
        f"label_{label_idx}_token": label
        for label_idx, label in enumerate(config.joint_da_seg_recog_labels)
    }
    label_token_dict.update({
        "pad_token": "<pad>",
        "bos_token": "<t>",
        "eos_token": "</t>"
    })
    label_tokenizer = CustomizedTokenizer(token_dict=label_token_dict)

    # metrics calculator
    metrics = DAMetrics()

    # data loaders
    with open(config.dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)

    mlog("----- Loading dev data -----", config, config.LOG_FILE_NAME)
    dev_data_source = DataSource(
        data=dataset["dev"],
        config=config,
        tokenizer=tokenizer,
        label_tokenizer=label_tokenizer
    )
    mlog(str(dev_data_source.statistics), config, config.LOG_FILE_NAME)

    # build model
    if config.model == "ed":
        Model = EDSeqLabeler
    elif config.model == "attn_ed":
        Model = AttnEDSeqLabeler
    model = Model(config, tokenizer, label_tokenizer)

    # model adaptation
    if torch.cuda.is_available():
        mlog("----- Using GPU -----", config, config.LOG_FILE_NAME)
        model = model.cuda()
    if not config.model_path:
        print("NEED TO PROVIDE PATH")
        exit(1)
    model.load_model(config.model_path)
    mlog("----- Model loaded -----", config, config.LOG_FILE_NAME)
    mlog(f"model path: {config.model_path}", config, config.LOG_FILE_NAME)

    mlog("----- Loading test data -----", config, config.LOG_FILE_NAME)
    test_data_source = DataSource(
        data=dataset["test"],
        config=config,
        tokenizer=tokenizer,
        label_tokenizer=label_tokenizer
    )
    mlog(str(test_data_source.statistics), config, config.LOG_FILE_NAME)

    start_time = time.time()  # reference point for the elapsed-time logs below
    model.eval()
    for set_name, data_source in [("DEV", dev_data_source), ("TEST", test_data_source)]:
        pred_labels, true_labels = [], []
        data_source.epoch_init(shuffle=False)
        RES_FILE_NAME = set_name + "_" + config.LOG_FILE_NAME
        s = "LABELS\tPREDS"
        reslog(s, RES_FILE_NAME)
        while True:
            batch_data = data_source.next(config.eval_batch_size)
            if batch_data is None:
                break
            ret_data, ret_stat = model.test_step(batch_data)
            refs = batch_data["Y"][:, 1:].tolist()
            hyps = ret_data["symbols"].tolist()
            for true_label_ids, pred_label_ids in zip(refs, hyps):
                end_idx = true_label_ids.index(label_tokenizer.eos_token_id)
                true_syms = [
                    label_tokenizer.id2word[label_id]
                    for label_id in true_label_ids[:end_idx]
                ]
                pred_syms = [
                    label_tokenizer.id2word[label_id]
                    for label_id in pred_label_ids[:end_idx]
                ]
                s = " ".join(true_syms) + "\t" + " ".join(pred_syms)
                reslog(s, RES_FILE_NAME)
                true_labels.append(true_syms)
                pred_labels.append(pred_syms)

        log_s = f"\n<{set_name}> - {time.time()-start_time:.3f}s - "
        mlog(log_s, config, config.LOG_FILE_NAME)
        metrics_results = metrics.batch_metrics(true_labels, pred_labels)
        log_s = \
            f"\tDSER: {100*metrics_results['DSER']:.2f}\n" \
            f"\tseg WER: {100*metrics_results['strict segmentation error']:.2f}\n" \
            f"\tDER: {100*metrics_results['DER']:.2f}\n" \
            f"\tjoint WER: {100*metrics_results['strict joint error']:.2f}\n" \
            f"\tMacro F1: {100*metrics_results['Macro F1']:.2f}\n" \
            f"\tMicro F1: {100*metrics_results['Micro F1']:.2f}\n"
        mlog(log_s, config, config.LOG_FILE_NAME)
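# reslog() is assumed to mirror mlog() for the per-sequence results files
# written above (one "LABELS\tPREDS" row per dialogue). A minimal sketch with
# an assumed signature and results directory; neither is from the source.
def reslog(s, res_file_name, res_dir="results"):
    # append one result row to the per-split results file
    os.makedirs(res_dir, exist_ok=True)
    with open(os.path.join(res_dir, f"{res_file_name}.txt"), "a+", encoding="utf-8") as f:
        f.write(s + "\n")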