def train_and_validate(args):
    set_seed(args.seed)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build model.
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model = load_model(model, args.pretrained_model_path)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if "gamma" not in n and "beta" not in n:
                p.data.normal_(0, 0.02)

    if args.dist_train:
        # Multiprocessing distributed mode.
        mp.spawn(worker, nprocs=args.ranks_num, args=(args.gpu_ranks, args, model), daemon=False)
    elif args.single_gpu:
        # Single GPU mode.
        worker(args.gpu_id, None, args, model)
    else:
        # CPU mode.
        worker(None, None, args, model)
def train_and_validate(args):
    set_seed(args.seed)

    # Load vocabulary.
    if args.spm_model_path:
        try:
            import sentencepiece as spm
        except ImportError:
            raise ImportError(
                "You need to install SentencePiece to use XLNetTokenizer: "
                "https://github.com/google/sentencepiece "
                "pip install sentencepiece"
            )
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(args.spm_model_path)
        args.vocab = {sp_model.IdToPiece(i): i for i in range(sp_model.GetPieceSize())}
        if args.target == "mt":
            tgt_sp_model = spm.SentencePieceProcessor()
            tgt_sp_model.Load(args.tgt_spm_model_path)
            args.tgt_vocab = {tgt_sp_model.IdToPiece(i): i for i in range(tgt_sp_model.GetPieceSize())}
    else:
        vocab = Vocab()
        vocab.load(args.vocab_path)
        args.vocab = vocab.w2i
        if args.target == "mt":
            tgt_vocab = Vocab()
            tgt_vocab.load(args.tgt_vocab_path)
            args.tgt_vocab = tgt_vocab.w2i

    # Build model.
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model = load_model(model, args.pretrained_model_path)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if "gamma" not in n and "beta" not in n:
                p.data.normal_(0, 0.02)

    if args.dist_train:
        # Multiprocessing distributed mode.
        mp.spawn(worker, nprocs=args.ranks_num, args=(args.gpu_ranks, args, model), daemon=False)
    elif args.single_gpu:
        # Single GPU mode.
        worker(args.gpu_id, None, args, model)
    else:
        # CPU mode.
        worker(None, None, args, model)
def train_and_validate(args):
    set_seed(args.seed)

    # Load vocabulary.
    if args.data_processor == "mt":
        args.tgt_tokenizer = str2tokenizer[args.tgt_tokenizer](args, is_src=False)
        args.tgt_vocab = args.tgt_tokenizer.vocab
    args.tokenizer = str2tokenizer[args.tokenizer](args)
    args.vocab = args.tokenizer.vocab

    # Build model.
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model = load_model(model, args.pretrained_model_path)
    else:
        # Initialize with normal distribution.
        if args.deep_init:
            scaled_factor = 1 / math.sqrt(2.0 * args.layers_num)
            for n, p in list(model.named_parameters()):
                if "gamma" not in n and "beta" not in n:
                    if "linear_2.weight" in n or "final_linear.weight" in n:
                        p.data.normal_(0, 0.02 * scaled_factor)
                    elif "linear_2.bias" in n or "final_linear.bias" in n:
                        p.data.zero_()
                    else:
                        p.data.normal_(0, 0.02)
        else:
            for n, p in list(model.named_parameters()):
                if "gamma" not in n and "beta" not in n:
                    p.data.normal_(0, 0.02)

    if args.deepspeed:
        worker(args.local_rank, None, args, model)
    elif args.dist_train:
        # Multiprocessing distributed mode.
        mp.spawn(worker, nprocs=args.ranks_num, args=(args.gpu_ranks, args, model), daemon=False)
    elif args.single_gpu:
        # Single GPU mode.
        worker(args.gpu_id, None, args, model)
    else:
        # CPU mode.
        worker(None, None, args, model)
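# Illustrative sketch only (not part of the function above): the "deep_init" branch scales
# the initialization of output projections by 1 / sqrt(2 * layers_num), so deeper models
# get smaller standard deviations. The 12-layer depth below is an assumed example value.
import math

layers_num = 12                                   # assumed depth for illustration
scaled_factor = 1 / math.sqrt(2.0 * layers_num)   # ~0.204 for 12 layers
print(0.02 * scaled_factor)                       # std used for linear_2 / final_linear weights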
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path", default=None, type=str, help="Path of the classifier model.")
    parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path", default=None, type=str, help="Path of the prediction file.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=512, help="Sequence length.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Embedding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru",
                                              "cnn", "gatedcnn", "attn", "synt",
                                              "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build model and load parameters.
    model = MachineReadingComprehension(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset, examples = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
    start_position = torch.LongTensor([sample[2] for sample in dataset])
    end_position = torch.LongTensor([sample[3] for sample in dataset])

    batch_size = args.batch_size
    instances_num = len(dataset)

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        start_prob_all, end_prob_all = [], []
        for i, (src_batch, seg_batch, start_position_batch, end_position_batch) in enumerate(
                batch_loader(batch_size, src, seg, start_position, end_position)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            start_position_batch = start_position_batch.to(device)
            end_position_batch = end_position_batch.to(device)
            with torch.no_grad():
                loss, start_logits, end_logits = model(src_batch, seg_batch, start_position_batch, end_position_batch)
            start_prob = nn.Softmax(dim=1)(start_logits)
            end_prob = nn.Softmax(dim=1)(end_logits)
            for j in range(start_prob.size()[0]):
                start_prob_all.append(start_prob[j])
                end_prob_all.append(end_prob[j])

        pred_answers = get_answers(dataset, start_prob_all, end_prob_all)

        output = {}
        for i in range(len(examples)):
            question_id = examples[i][2]
            start_pred_pos = pred_answers[i][1]
            end_pred_pos = pred_answers[i][2]
            # The predicted end position is inclusive.
            prediction = examples[i][0][start_pred_pos:end_pred_pos + 1]
            output[question_id] = prediction
        f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.")
    parser.add_argument("--labels_num", type=int, required=True, help="Number of prediction labels.")
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space.")
    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    model = Classifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()
            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build model and load parameters.
    model = MachineReadingComprehension(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset, examples = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
    start_position = torch.LongTensor([sample[2] for sample in dataset])
    end_position = torch.LongTensor([sample[3] for sample in dataset])

    batch_size = args.batch_size
    instances_num = len(dataset)

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        start_prob_all, end_prob_all = [], []
        for i, (src_batch, seg_batch, start_position_batch, end_position_batch) in enumerate(
                batch_loader(batch_size, src, seg, start_position, end_position)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            start_position_batch = start_position_batch.to(device)
            end_position_batch = end_position_batch.to(device)
            with torch.no_grad():
                loss, start_logits, end_logits = model(src_batch, seg_batch, start_position_batch, end_position_batch)
            start_prob = nn.Softmax(dim=1)(start_logits)
            end_prob = nn.Softmax(dim=1)(end_logits)
            for j in range(start_prob.size()[0]):
                start_prob_all.append(start_prob[j])
                end_prob_all.append(end_prob[j])

        pred_answers = get_answers(dataset, start_prob_all, end_prob_all)

        output = {}
        for i in range(len(examples)):
            question_id = examples[i][2]
            start_pred_pos = pred_answers[i][1]
            end_pred_pos = pred_answers[i][2]
            prediction = examples[i][0][start_pred_pos:end_pred_pos + 1]
            output[question_id] = prediction
        f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"],
                        default="first", help="Pooling type.")
    parser.add_argument("--whitening_size", type=int, default=None, help="Output vector size after whitening.")

    tokenizer_opts(parser)

    args = parser.parse_args()
    args = load_hyperparam(args)

    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build feature extractor model.
    model = FeatureExtractor(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    model.eval()

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
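# Illustrative sketch only (not taken from the feature extractor above): one common way a
# whitening transform with a reduced output size is computed from extracted sentence
# vectors (mean removal plus an SVD-based kernel). The function name is hypothetical.
import numpy as np

def compute_whitening(vectors, whitening_size):
    # vectors: (n, hidden_size) array of sentence embeddings.
    mu = vectors.mean(axis=0, keepdims=True)
    cov = np.cov((vectors - mu).T)
    u, s, _ = np.linalg.svd(cov)
    # Keep the leading dimensions and rescale them to unit variance.
    kernel = (u @ np.diag(1.0 / np.sqrt(s)))[:, :whitening_size]
    return (vectors - mu) @ kernel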
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path", default=None, type=str, help="Path of the pretrained model.")
    parser.add_argument("--output_model_path", default="./models/classifier_model.bin", type=str, help="Path of the output model.")
    parser.add_argument("--vocab_path", default="./models/google_vocab.txt", type=str, help="Path of the vocabulary file.")
    parser.add_argument("--train_path", type=str, required=True, help="Path of the trainset.")
    parser.add_argument("--dev_path", type=str, required=True, help="Path of the devset.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=128, help="Sequence length.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Embedding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru",
                                              "cnn", "gatedcnn", "attn",
                                              "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.")

    # Subword options.
    parser.add_argument("--subword_type", choices=["none", "char"], default="none", help="Subword feature type.")
    parser.add_argument("--sub_vocab_path", type=str, default="models/sub_vocab.txt", help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder", choices=["avg", "lstm", "gru", "cnn"], default="avg", help="Subencoder type.")
    parser.add_argument("--sub_layers_num", type=int, default=2, help="The number of subencoder layers.")

    # Tokenizer options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space.")

    # Optimizer options.
    parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate.")
    parser.add_argument("--warmup", type=float, default=0.1, help="Warm up value.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.")
    parser.add_argument("--epochs_num", type=int, default=3, help="Number of epochs.")
    parser.add_argument("--report_steps", type=int, default=100, help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    # Evaluation options.
    parser.add_argument("--mean_reciprocal_rank", action="store_true", help="Evaluation metrics for DBQA dataset.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    labels_set = set()
    columns = {}
    with open(args.train_path, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            try:
                line = line.strip().split("\t")
                if line_id == 0:
                    for i, column_name in enumerate(line):
                        columns[column_name] = i
                    continue
                label = int(line[columns["label"]])
                labels_set.add(label)
            except:
                pass
    args.labels_num = len(labels_set)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build bert model.
    # A pseudo target is added.
    args.target = "bert"
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model.load_state_dict(torch.load(args.pretrained_model_path), strict=False)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if "gamma" not in n and "beta" not in n:
                p.data.normal_(0, 0.02)

    # Build classification model.
    model = BertClassifier(args, model)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    model = model.to(device)

    # Dataset loader.
    def batch_loader(batch_size, input_ids, label_ids, mask_ids):
        instances_num = input_ids.size()[0]
        for i in range(instances_num // batch_size):
            input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :]
            label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size]
            mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch
        if instances_num > instances_num // batch_size * batch_size:
            input_ids_batch = input_ids[instances_num // batch_size * batch_size:, :]
            label_ids_batch = label_ids[instances_num // batch_size * batch_size:]
            mask_ids_batch = mask_ids[instances_num // batch_size * batch_size:, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch

    # Build tokenizer.
    tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Read dataset.
    def read_dataset(path):
        dataset = []
        with open(path, mode="r", encoding="utf-8") as f:
            for line_id, line in enumerate(f):
                if line_id == 0:
                    continue
                try:
                    line = line.strip().split("\t")
                    if len(line) == 2:
                        label = int(line[columns["label"]])
                        text = line[columns["text_a"]]
                        tokens = [vocab.get(t) for t in tokenizer.tokenize(text)]
                        tokens = [CLS_ID] + tokens
                        mask = [1] * len(tokens)
                        if len(tokens) > args.seq_length:
                            tokens = tokens[:args.seq_length]
                            mask = mask[:args.seq_length]
                        while len(tokens) < args.seq_length:
                            tokens.append(0)
                            mask.append(0)
                        dataset.append((tokens, label, mask))
                    elif len(line) == 3:  # For sentence pair input.
                        label = int(line[columns["label"]])
                        text_a, text_b = line[columns["text_a"]], line[columns["text_b"]]
                        tokens_a = [vocab.get(t) for t in tokenizer.tokenize(text_a)]
                        tokens_a = [CLS_ID] + tokens_a + [SEP_ID]
                        tokens_b = [vocab.get(t) for t in tokenizer.tokenize(text_b)]
                        tokens_b = tokens_b + [SEP_ID]
                        tokens = tokens_a + tokens_b
                        mask = [1] * len(tokens_a) + [2] * len(tokens_b)
                        if len(tokens) > args.seq_length:
                            tokens = tokens[:args.seq_length]
                            mask = mask[:args.seq_length]
                        while len(tokens) < args.seq_length:
                            tokens.append(0)
                            mask.append(0)
                        dataset.append((tokens, label, mask))
                    elif len(line) == 4:  # For dbqa input.
                        qid = int(line[columns["qid"]])
                        label = int(line[columns["label"]])
                        text_a, text_b = line[columns["text_a"]], line[columns["text_b"]]
                        tokens_a = [vocab.get(t) for t in tokenizer.tokenize(text_a)]
                        tokens_a = [CLS_ID] + tokens_a + [SEP_ID]
                        tokens_b = [vocab.get(t) for t in tokenizer.tokenize(text_b)]
                        tokens_b = tokens_b + [SEP_ID]
                        tokens = tokens_a + tokens_b
                        mask = [1] * len(tokens_a) + [2] * len(tokens_b)
                        if len(tokens) > args.seq_length:
                            tokens = tokens[:args.seq_length]
                            mask = mask[:args.seq_length]
                        while len(tokens) < args.seq_length:
                            tokens.append(0)
                            mask.append(0)
                        dataset.append((tokens, label, mask, qid))
                    else:
                        pass
                except:
                    pass
        return dataset

    # Evaluation function.
    def evaluate(args, is_test):
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        input_ids = torch.LongTensor([sample[0] for sample in dataset])
        label_ids = torch.LongTensor([sample[1] for sample in dataset])
        mask_ids = torch.LongTensor([sample[2] for sample in dataset])

        batch_size = args.batch_size
        instances_num = input_ids.size()[0]
        if is_test:
            print("The number of evaluation instances: ", instances_num)

        correct = 0
        # Confusion matrix.
        confusion = torch.zeros(args.labels_num, args.labels_num, dtype=torch.long)

        model.eval()

        if not args.mean_reciprocal_rank:
            for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                    batch_loader(batch_size, input_ids, label_ids, mask_ids)):
                input_ids_batch = input_ids_batch.to(device)
                label_ids_batch = label_ids_batch.to(device)
                mask_ids_batch = mask_ids_batch.to(device)
                with torch.no_grad():
                    loss, logits = model(input_ids_batch, label_ids_batch, mask_ids_batch)
                logits = nn.Softmax(dim=1)(logits)
                pred = torch.argmax(logits, dim=1)
                gold = label_ids_batch
                for j in range(pred.size()[0]):
                    confusion[pred[j], gold[j]] += 1
                correct += torch.sum(pred == gold).item()

            if is_test:
                print("Confusion matrix:")
                print(confusion)
                print("Report precision, recall, and f1:")
            for i in range(confusion.size()[0]):
                p = confusion[i, i].item() / confusion[i, :].sum().item()
                r = confusion[i, i].item() / confusion[:, i].sum().item()
                f1 = 2 * p * r / (p + r)
                if is_test:
                    print("Label {}: {:.3f}, {:.3f}, {:.3f}".format(i, p, r, f1))
            print("Acc. (Correct/Total): {:.4f} ({}/{}) ".format(correct / len(dataset), correct, len(dataset)))
            return correct / len(dataset)
        else:
            for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                    batch_loader(batch_size, input_ids, label_ids, mask_ids)):
                input_ids_batch = input_ids_batch.to(device)
                label_ids_batch = label_ids_batch.to(device)
                mask_ids_batch = mask_ids_batch.to(device)
                with torch.no_grad():
                    loss, logits = model(input_ids_batch, label_ids_batch, mask_ids_batch)
                logits = nn.Softmax(dim=1)(logits)
                if i == 0:
                    logits_all = logits
                if i >= 1:
                    logits_all = torch.cat((logits_all, logits), 0)

            order = -1
            gold = []
            for i in range(len(dataset)):
                qid = dataset[i][3]
                label = dataset[i][1]
                if qid == order:
                    j += 1
                    if label == 1:
                        gold.append((qid, j))
                else:
                    order = qid
                    j = 0
                    if label == 1:
                        gold.append((qid, j))

            label_order = []
            order = -1
            for i in range(len(gold)):
                if gold[i][0] == order:
                    templist.append(gold[i][1])
                elif gold[i][0] != order:
                    order = gold[i][0]
                    if i > 0:
                        label_order.append(templist)
                    templist = []
                    templist.append(gold[i][1])
            label_order.append(templist)

            order = -1
            score_list = []
            for i in range(len(logits_all)):
                score = float(logits_all[i][1])
                qid = int(dataset[i][3])
                if qid == order:
                    templist.append(score)
                else:
                    order = qid
                    if i > 0:
                        score_list.append(templist)
                    templist = []
                    templist.append(score)
            score_list.append(templist)

            rank = []
            pred = []
            for i in range(len(score_list)):
                if len(label_order[i]) == 1:
                    if label_order[i][0] < len(score_list[i]):
                        true_score = score_list[i][label_order[i][0]]
                        score_list[i].sort(reverse=True)
                        for j in range(len(score_list[i])):
                            if score_list[i][j] == true_score:
                                rank.append(1 / (j + 1))
                    else:
                        rank.append(0)
                else:
                    true_rank = len(score_list[i])
                    for k in range(len(label_order[i])):
                        if label_order[i][k] < len(score_list[i]):
                            true_score = score_list[i][label_order[i][k]]
                            temp = sorted(score_list[i], reverse=True)
                            for j in range(len(temp)):
                                if temp[j] == true_score:
                                    if j < true_rank:
                                        true_rank = j
                    if true_rank < len(score_list[i]):
                        rank.append(1 / (true_rank + 1))
                    else:
                        rank.append(0)
            MRR = sum(rank) / len(rank)
            print(MRR)
            return MRR

    # Training phase.
    print("Start training.")
    trainset = read_dataset(args.train_path)
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size

    input_ids = torch.LongTensor([example[0] for example in trainset])
    label_ids = torch.LongTensor([example[1] for example in trainset])
    mask_ids = torch.LongTensor([example[2] for example in trainset])

    train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay_rate": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay_rate": 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup, t_total=train_steps)

    total_loss = 0.
    result = 0.0
    best_result = 0.0

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            model.zero_grad()
            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            loss, _ = model(input_ids_batch, label_ids_batch, mask_ids_batch)
            if torch.cuda.device_count() > 1:
                loss = torch.mean(loss)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.
            loss.backward()
            optimizer.step()
        result = evaluate(args, False)
        if result > best_result:
            best_result = result
            save_model(model, args.output_model_path)
        else:
            continue

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        model = load_model(model, args.output_model_path)
        evaluate(args, True)
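# Quick illustrative check (not part of the script above) of the mean reciprocal rank
# metric computed by the DBQA branch: each question contributes 1/rank of its first
# correct candidate, and questions with no correct candidate contribute 0. The toy
# numbers and helper name below are made up.
def toy_mrr(first_correct_ranks):
    # first_correct_ranks: 1-based rank of the first correct answer per question,
    # or None when no candidate is correct.
    reciprocal = [0.0 if r is None else 1.0 / r for r in first_correct_ranks]
    return sum(reciprocal) / len(reciprocal)

print(toy_mrr([1, 2, None, 4]))  # (1 + 0.5 + 0 + 0.25) / 4 = 0.4375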
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path", default=None, type=str, help="Path of the classifier model.")
    parser.add_argument("--vocab_path", type=str, required=True, help="Path of the vocabulary file.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path", default=None, type=str, help="Path of the prediction file.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=128, help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=128, help="Sequence length.")
    parser.add_argument("--labels_num", type=int, required=True, help="Number of prediction labels.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Embedding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru",
                                              "cnn", "gatedcnn", "attn", "synt",
                                              "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.")
    parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space.")

    # Output options.
    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build classification model and load parameters.
    args.soft_targets = False
    model = Classifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        if args.output_logits:
            f.write("label" + "\t" + "logits" + "\n")
        else:
            f.write("label" + "\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            logits = logits.cpu().numpy().tolist()
            if args.output_logits:
                for j in range(len(pred)):
                    f.write(str(pred[j]) + "\t" + " ".join([str(v) for v in logits[j]]) + "\n")
            else:
                for j in range(len(pred)):
                    f.write(str(pred[j]) + "\n")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--labels_num", type=int, required=True, help="Number of prediction labels.")

    tokenizer_opts(parser)

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    model = SiameseClassifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src_a = torch.LongTensor([example[0][0] for example in dataset])
    src_b = torch.LongTensor([example[0][1] for example in dataset])
    seg_a = torch.LongTensor([example[1][0] for example in dataset])
    seg_b = torch.LongTensor([example[1][1] for example in dataset])

    batch_size = args.batch_size
    instances_num = src_a.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, (src_a, src_b), (seg_a, seg_b))):
            src_a_batch, src_b_batch = src_batch
            seg_a_batch, seg_b_batch = seg_batch
            src_a_batch = src_a_batch.to(device)
            src_b_batch = src_b_batch.to(device)
            seg_a_batch = seg_a_batch.to(device)
            seg_b_batch = seg_b_batch.to(device)
            with torch.no_grad():
                _, logits = model((src_a_batch, src_b_batch), None, (seg_a_batch, seg_b_batch))
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()
            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path", default=None, type=str, help="Path of the NER model.")
    parser.add_argument("--vocab_path", type=str, required=True, help="Path of the vocabulary file.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path", default=None, type=str, help="Path of the prediction file.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.")
    parser.add_argument("--label2id_path", type=str, required=True, help="Path of the label2id file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=128, help="Batch size.")
    parser.add_argument("--seq_length", default=128, type=int, help="Sequence length.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Embedding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru",
                                              "cnn", "gatedcnn", "attn", "synt",
                                              "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

    args = parser.parse_args()

    # Load the hyperparameters of the config file.
    args = load_hyperparam(args)

    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        print("Labels: ", l2i)
        l2i["[PAD]"] = len(l2i)

    i2l = {}
    for key, value in l2i.items():
        i2l[value] = key

    args.l2i = l2i
    args.labels_num = len(l2i)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build sequence labeling model.
    model = NerTagger(args)
    model = load_model(model, args.load_model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    instances = read_dataset(args, args.test_path)

    src = torch.LongTensor([ins[0] for ins in instances])
    seg = torch.LongTensor([ins[1] for ins in instances])

    instances_num = src.size(0)
    batch_size = args.batch_size

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("pred_label" + "\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            pred = logits.argmax(dim=-1)

            # Storing sequence length of instances in a batch.
            seq_length_batch = []
            for seg in seg_batch.cpu().numpy().tolist():
                for j in range(len(seg) - 1, -1, -1):
                    if seg[j] != 0:
                        break
                seq_length_batch.append(j + 1)

            pred = pred.cpu().numpy().tolist()
            for j in range(0, len(pred), args.seq_length):
                for label_id in pred[j:j + seq_length_batch[j // args.seq_length]]:
                    f.write(i2l[label_id] + " ")
                f.write("\n")
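# Illustrative only (not part of the script above): how the per-instance sequence length
# is recovered from a segment vector, where real tokens have nonzero segment ids and
# padding positions are zero. The toy vector below is made up.
seg = [1, 1, 1, 2, 2, 0, 0, 0]
for j in range(len(seg) - 1, -1, -1):
    if seg[j] != 0:
        break
print(j + 1)  # 5 real tokens before the padding starts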
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--tgt_seq_length", type=int, default=32, help="Output sequence length.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build text2text model and load parameters.
    model = Text2text(args)
    model = load_model(model, args.load_model_path)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)
            tgt_in_batch = torch.zeros(src_batch.size()[0], 1, dtype=torch.long, device=args.device)
            for j in range(tgt_in_batch.size()[0]):
                tgt_in_batch[j][-1] = args.tokenizer.vocab.get(CLS_TOKEN)
            with torch.no_grad():
                memory_bank = model(src_batch, None, seg_batch, only_use_encoder=True)
            for _ in range(args.tgt_seq_length):
                with torch.no_grad():
                    outputs = model(src_batch, (tgt_in_batch, None, src_batch), None, memory_bank=memory_bank)
                next_token_logits = outputs[:, -1]
                next_tokens = torch.argmax(next_token_logits, dim=1).unsqueeze(1)
                tgt_in_batch = torch.cat([tgt_in_batch, next_tokens], dim=1)

            for j in range(len(outputs)):
                f.write("".join([args.tokenizer.inv_vocab[token_id.item()] for token_id in tgt_in_batch[j][1:]])
                        .split(SEP_TOKEN)[0])
                f.write("\n")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")
    parser.add_argument("--prompt_id", type=str, default="chnsenticorp_char")
    parser.add_argument("--prompt_path", type=str, default="models/prompts.json")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    process_prompt_template(args)

    answer_position = [0] * len(args.tokenizer.vocab)
    for answer in args.answer_word_dict_inv:
        answer_position[int(args.tokenizer.vocab[answer])] = 1
    args.answer_position = torch.LongTensor(answer_position)
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build classification model and load parameters.
    model = ClozeTest(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    model = model.to(args.device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    tgt = torch.LongTensor([sample[1] for sample in dataset])
    seg = torch.LongTensor([sample[2] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for _, (src_batch, tgt_batch, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(args.device)
            tgt_batch = tgt_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)
            with torch.no_grad():
                _, pred, logits = model(src_batch, tgt_batch, seg_batch)
            logits = logits[:, args.answer_position > 0]
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()
            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path", default=None, type=str, help="Path of the multiple choice model.")
    parser.add_argument("--vocab_path", type=str, required=True, help="Path of the vocabulary file.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path", default=None, type=str, help="Path of the prediction file.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=512, help="Sequence length.")
    parser.add_argument("--max_choices_num", default=4, type=int,
                        help="The maximum number of candidate answers; instances with fewer choices will be padded.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Embedding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru",
                                              "cnn", "gatedcnn", "attn", "synt",
                                              "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.")
    parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build classification model and load parameters.
    args.soft_targets = False
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([example[0] for example in dataset])
    tgt = torch.LongTensor([example[1] for example in dataset])
    seg = torch.LongTensor([example[2] for example in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.test_path) as f:
        data = json.load(f)

    question_ids = []
    for i in range(len(data)):
        questions = data[i][1]
        for question in questions:
            question_ids.append(question["id"])

    index = 0
    with open(args.prediction_path, "w") as f:
        for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            for j in range(len(pred)):
                output = {}
                output["id"] = question_ids[index]
                index += 1
                output["label"] = int(pred[j])
                f.write(json.dumps(output))
                f.write("\n")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path", default=None, type=str, help="Path of the classifier model.")
    parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--test_features_path", default=None, type=str, help="Path of the test features for stacking.")
    parser.add_argument("--config_path", default="models/bert/base_config.json", type=str, help="Path of the config file.")

    # Model options.
    model_opts(parser)
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first", help="Pooling type.")

    # Inference options.
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=128, help="Sequence length.")
    parser.add_argument("--labels_num", type=int, required=True, help="Number of prediction labels.")

    # Tokenizer options.
    tokenizer_opts(parser)

    # Output options.
    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")

    # Cross validation options.
    parser.add_argument("--folds_num", type=int, default=5, help="The number of folds for cross validation.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    test_features = [[] for _ in range(args.folds_num)]
    for fold_id in range(args.folds_num):
        load_model_name = ".".join(args.load_model_path.split(".")[:-1])
        load_model_suffix = args.load_model_path.split(".")[-1]

        model = Classifier(args)
        model = load_model(model, load_model_name + "-fold_" + str(fold_id) + "." + load_model_suffix)

        # For simplicity, we use DataParallel wrapper to use multiple GPUs.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        if torch.cuda.device_count() > 1:
            print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
            model = torch.nn.DataParallel(model)

        model.eval()
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            prob = nn.Softmax(dim=1)(logits)
            prob = prob.cpu().numpy().tolist()
            test_features[fold_id].extend(prob)

    test_features = np.array(test_features)
    test_features = np.mean(test_features, axis=0)
    np.save(args.test_features_path, test_features)
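# Illustrative follow-up only (not part of the script above): the fold-averaged
# probabilities saved to --test_features_path are typically consumed by a second-level
# model in a stacking ensemble. The file paths and the choice of scikit-learn
# LogisticRegression as the meta-model are hypothetical.
import numpy as np
from sklearn.linear_model import LogisticRegression

train_features = np.load("train_features.npy")  # hypothetical out-of-fold features
train_labels = np.load("train_labels.npy")      # hypothetical labels
test_features = np.load("test_features.npy")    # file written by the script above

meta_model = LogisticRegression(max_iter=1000)
meta_model.fit(train_features, train_labels)
print(meta_model.predict(test_features)[:10])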
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path", default=None, type=str, help="Path of the multiple choice model.")
    parser.add_argument("--vocab_path", type=str, required=True, help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path", default=None, type=str, help="Path of the prediction file.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str, help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=64, help="Sequence length.")
    parser.add_argument("--max_choices_num", default=10, type=int,
                        help="The maximum number of candidate answers; instances with fewer choices will be padded.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert", help="Embedding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru",
                                              "cnn", "gatedcnn", "attn", "synt",
                                              "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="char",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size

    results_final = []
    dataset_by_group = {}

    print("The number of prediction instances: ", len(dataset))

    for example in dataset:
        if example[-1] not in dataset_by_group:
            dataset_by_group[example[-1]] = [example]
        else:
            dataset_by_group[example[-1]].append(example)

    for group_index, examples in dataset_by_group.items():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])
        index = 0
        results = []
        for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            for j in range(len(pred)):
                results.append((examples[index][-2], logits[index].cpu().numpy()))
                index += 1
        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, "w") as f:
        json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--max_choices_num", default=10, type=int,
                        help="The maximum number of candidate answers; instances with fewer choices will be padded.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size

    results_final = []
    dataset_by_group = {}

    print("The number of prediction instances: ", len(dataset))

    for example in dataset:
        if example[-1] not in dataset_by_group:
            dataset_by_group[example[-1]] = [example]
        else:
            dataset_by_group[example[-1]].append(example)

    for group_index, examples in dataset_by_group.items():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])
        index = 0
        results = []
        for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            for j in range(len(pred)):
                results.append((examples[index][-2], logits[index].cpu().numpy()))
                index += 1
        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, "w") as f:
        json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--max_choices_num", default=4, type=int,
                        help="The maximum number of candidate answers; instances with fewer choices will be padded.")
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([example[0] for example in dataset])
    tgt = torch.LongTensor([example[1] for example in dataset])
    seg = torch.LongTensor([example[2] for example in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.test_path) as f:
        data = json.load(f)

    question_ids = []
    for i in range(len(data)):
        questions = data[i][1]
        for question in questions:
            question_ids.append(question["id"])

    index = 0
    with open(args.prediction_path, "w") as f:
        for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            for j in range(len(pred)):
                output = {}
                output["id"] = question_ids[index]
                index += 1
                output["label"] = int(pred[j])
                f.write(json.dumps(output))
                f.write("\n")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path", default=None, type=str, help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str, help="Path of the sentence piece model.")
    parser.add_argument("--label2id_path", type=str, required=True, help="Path of the label2id file.")
    parser.add_argument("--crf_target", action="store_true",
                        help="Use CRF loss as the target function or not, default False.")

    args = parser.parse_args()

    # Load the hyperparameters of the config file.
    args = load_hyperparam(args)

    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        print("Labels: ", l2i)
        l2i["[PAD]"] = len(l2i)

    i2l = {}
    for key, value in l2i.items():
        i2l[value] = key

    args.l2i = l2i
    args.labels_num = len(l2i)

    # Load tokenizer.
    args.tokenizer = SpaceTokenizer(args)

    # Build sequence labeling model.
    model = NerTagger(args)
    model = load_model(model, args.load_model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    instances = read_dataset(args, args.test_path)

    src = torch.LongTensor([ins[0] for ins in instances])
    seg = torch.LongTensor([ins[1] for ins in instances])

    instances_num = src.size(0)
    batch_size = args.batch_size

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("pred_label" + "\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, pred = model(src_batch, None, seg_batch)

            # Storing sequence length of instances in a batch.
            seq_length_batch = []
            for seg in seg_batch.cpu().numpy().tolist():
                for j in range(len(seg) - 1, -1, -1):
                    if seg[j] != 0:
                        break
                seq_length_batch.append(j + 1)

            pred = pred.cpu().numpy().tolist()
            for j in range(0, len(pred), args.seq_length):
                for label_id in pred[j:j + seq_length_batch[j // args.seq_length]]:
                    f.write(i2l[label_id] + " ")
                f.write("\n")
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path", default=None, type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path", default="./models/ner_model.bin", type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path", type=str, required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--train_path", type=str, required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path", type=str, required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path", type=str,
                        help="Path of the testset.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size.")
    parser.add_argument("--seq_length", default=128, type=int,
                        help="Sequence length.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
                        help="Embedding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru",
                                              "cnn", "gatedcnn", "attn",
                                              "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true",
                        help="Specific to recurrent model.")

    # Subword options.
    parser.add_argument("--subword_type", choices=["none", "char"], default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path", type=str, default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder", choices=["avg", "lstm", "gru", "cnn"], default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num", type=int, default=2,
                        help="The number of subencoder layers.")

    # Optimizer options.
    parser.add_argument("--learning_rate", type=float, default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup", type=float, default=0.1,
                        help="Warm up value.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.1,
                        help="Dropout.")
    parser.add_argument("--epochs_num", type=int, default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps", type=int, default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7,
                        help="Random seed.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    labels_map = {"[PAD]": 0}
    begin_ids = []

    # Find tagging labels.
    with open(args.train_path, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            if line_id == 0:
                continue
            labels = line.strip().split("\t")[1].split()
            for l in labels:
                if l not in labels_map:
                    if l.startswith("B") or l.startswith("S"):
                        begin_ids.append(len(labels_map))
                    labels_map[l] = len(labels_map)

    print("Labels: ", labels_map)
    args.labels_num = len(labels_map)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build BERT model.
    # A pseudo target is added.
    args.target = "bert"
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model.load_state_dict(torch.load(args.pretrained_model_path), strict=False)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if "gamma" not in n and "beta" not in n:
                p.data.normal_(0, 0.02)

    # Build sequence labeling model.
    model = BertTagger(args, model)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    model = model.to(device)

    # Dataset loader.
    def batch_loader(batch_size, input_ids, label_ids, mask_ids):
        instances_num = input_ids.size()[0]
        for i in range(instances_num // batch_size):
            input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :]
            label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size, :]
            mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch
        if instances_num > instances_num // batch_size * batch_size:
            input_ids_batch = input_ids[instances_num // batch_size * batch_size:, :]
            label_ids_batch = label_ids[instances_num // batch_size * batch_size:, :]
            mask_ids_batch = mask_ids[instances_num // batch_size * batch_size:, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch

    # Read dataset.
    def read_dataset(path):
        dataset = []
        with open(path, mode="r", encoding="utf-8") as f:
            f.readline()
            tokens, labels = [], []
            for line_id, line in enumerate(f):
                tokens, labels = line.strip().split("\t")
                tokens = [vocab.get(t) for t in tokens.split(" ")]
                labels = [labels_map[l] for l in labels.split(" ")]
                mask = [1] * len(tokens)
                if len(tokens) > args.seq_length:
                    tokens = tokens[:args.seq_length]
                    labels = labels[:args.seq_length]
                    mask = mask[:args.seq_length]
                while len(tokens) < args.seq_length:
                    tokens.append(0)
                    labels.append(0)
                    mask.append(0)
                dataset.append([tokens, labels, mask])
        return dataset

    # Evaluation function.
    def evaluate(args, is_test):
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        input_ids = torch.LongTensor([sample[0] for sample in dataset])
        label_ids = torch.LongTensor([sample[1] for sample in dataset])
        mask_ids = torch.LongTensor([sample[2] for sample in dataset])

        instances_num = input_ids.size(0)
        batch_size = args.batch_size

        if is_test:
            print("Batch size: ", batch_size)
            print("The number of test instances:", instances_num)

        correct = 0
        gold_entities_num = 0
        pred_entities_num = 0

        confusion = torch.zeros(len(labels_map), len(labels_map), dtype=torch.long)

        model.eval()

        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            loss, _, pred, gold = model(input_ids_batch, label_ids_batch, mask_ids_batch)

            for j in range(gold.size()[0]):
                if gold[j].item() in begin_ids:
                    gold_entities_num += 1

            for j in range(pred.size()[0]):
                if pred[j].item() in begin_ids and gold[j].item() != labels_map["[PAD]"]:
                    pred_entities_num += 1

            pred_entities_pos = []
            gold_entities_pos = []
            start, end = 0, 0

            for j in range(gold.size()[0]):
                if gold[j].item() in begin_ids:
                    start = j
                    for k in range(j + 1, gold.size()[0]):
                        if gold[k].item() == labels_map["[PAD]"] or gold[k].item() == labels_map["O"] or gold[k].item() in begin_ids:
                            end = k - 1
                            break
                    else:
                        end = gold.size()[0] - 1
                    gold_entities_pos.append((start, end))

            for j in range(pred.size()[0]):
                if pred[j].item() in begin_ids and gold[j].item() != labels_map["[PAD]"]:
                    start = j
                    for k in range(j + 1, pred.size()[0]):
                        if pred[k].item() == labels_map["[PAD]"] or pred[k].item() == labels_map["O"] or pred[k].item() in begin_ids:
                            end = k - 1
                            break
                    else:
                        end = pred.size()[0] - 1
                    pred_entities_pos.append((start, end))

            for entity in pred_entities_pos:
                if entity not in gold_entities_pos:
                    continue
                for j in range(entity[0], entity[1] + 1):
                    if gold[j].item() != pred[j].item():
                        break
                else:
                    correct += 1

        print("Report precision, recall, and f1:")
        p = correct / pred_entities_num
        r = correct / gold_entities_num
        f1 = 2 * p * r / (p + r)
        print("{:.3f}, {:.3f}, {:.3f}".format(p, r, f1))

        return f1

    # Training phase.
    print("Start training.")
    instances = read_dataset(args.train_path)

    input_ids = torch.LongTensor([ins[0] for ins in instances])
    label_ids = torch.LongTensor([ins[1] for ins in instances])
    mask_ids = torch.LongTensor([ins[2] for ins in instances])

    instances_num = input_ids.size(0)
    batch_size = args.batch_size
    train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "gamma", "beta"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay_rate": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay_rate": 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * args.warmup, t_total=train_steps)

    total_loss = 0.
    f1 = 0.0
    best_f1 = 0.0

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            model.zero_grad()
            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            loss, _, _, _ = model(input_ids_batch, label_ids_batch, mask_ids_batch)
            if torch.cuda.device_count() > 1:
                loss = torch.mean(loss)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.
            loss.backward()
            optimizer.step()
            scheduler.step()

        f1 = evaluate(args, False)
        if f1 > best_f1:
            best_f1 = f1
            save_model(model, args.output_model_path)
        else:
            continue

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        model = load_model(model, args.output_model_path)
        evaluate(args, True)
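# Note (not part of the original script): AdamW(..., correct_bias=False) and WarmupLinearSchedule
# come from the older pytorch-transformers API. A rough present-day equivalent, sketched under the
# assumption that a recent `transformers` release is installed, is shown below. Current optimizers
# read the "weight_decay" key from parameter groups, not "weight_decay_rate".
import torch
from transformers import get_linear_schedule_with_warmup


def build_optimizer_and_scheduler(model, learning_rate, warmup, train_steps):
    # Exclude biases and LayerNorm parameters from weight decay, as above.
    no_decay = ["bias", "gamma", "beta"]
    grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": 0.01},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    # torch.optim.AdamW always applies bias correction; there is no correct_bias flag.
    optimizer = torch.optim.AdamW(grouped_parameters, lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(train_steps * warmup),
        num_training_steps=train_steps)
    return optimizer, scheduler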
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path of the classifier model.")
    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str,
                        help="Path of the testset.")
    parser.add_argument("--test_features_path", default=None, type=str,
                        help="Path of the test features for stacking.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=128,
                        help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=128,
                        help="Sequence length.")
    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
                        help="Embedding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru",
                                              "cnn", "gatedcnn", "attn", "synt",
                                              "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")
    parser.add_argument("--factorized_embedding_parameterization", action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true",
                        help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space.")

    # Output options.
    parser.add_argument("--output_logits", action="store_true",
                        help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true",
                        help="Write probabilities to output file.")

    # Cross validation options.
    parser.add_argument("--folds_num", type=int, default=5,
                        help="The number of folds for cross validation.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False

    # model = Classifier(args)
    # model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # model = model.to(device)
    # if torch.cuda.device_count() > 1:
    #     print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
    #     model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    test_features = [[] for _ in range(args.folds_num)]
    for fold_id in range(args.folds_num):
        load_model_name = ".".join(args.load_model_path.split(".")[:-1])
        load_model_suffix = args.load_model_path.split(".")[-1]

        model = Classifier(args)
        model = load_model(model, load_model_name + "-fold_" + str(fold_id) + "." + load_model_suffix)

        # For simplicity, we use DataParallel wrapper to use multiple GPUs.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        if torch.cuda.device_count() > 1:
            print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
            model = torch.nn.DataParallel(model)

        model.eval()
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            prob = nn.Softmax(dim=1)(logits)
            prob = prob.cpu().numpy().tolist()
            test_features[fold_id].extend(prob)

    test_features = np.array(test_features)
    test_features = np.mean(test_features, axis=0)
    print(test_features.shape)

    np.save(args.test_features_path, test_features)
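# Illustration (not part of the original script): the fold-averaged probabilities saved above are
# typically combined with out-of-fold features from the training set and fed to a lightweight
# meta-learner. The file names and the scikit-learn stacker below are assumptions for this sketch,
# not part of this codebase.
import numpy as np
from sklearn.linear_model import LogisticRegression

train_features = np.load("train_features.npy")  # hypothetical out-of-fold probabilities
train_labels = np.load("train_labels.npy")      # hypothetical gold labels
test_features = np.load("test_features.npy")    # the file written by the script above

stacker = LogisticRegression(max_iter=1000)
stacker.fit(train_features, train_labels)
test_pred = stacker.predict(test_features)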