def main():
    """Predict labels for a multiple-choice test set and write them as JSON lines.

    Parses the command-line options, loads the hyperparameters from the config
    file, restores the model weights from ``args.load_model_path`` and runs the
    model batch by batch over the examples returned by ``read_dataset``.
    One ``{"id": ..., "label": ...}`` object per line is written to
    ``args.prediction_path``.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(arg_parser)

    arg_parser.add_argument(
        "--max_choices_num", type=int, default=4,
        help="The maximum number of cadicate answer, shorter than this will be padded.")
    arg_parser.add_argument(
        "--tokenizer", default="bert", choices=["bert", "char", "space"],
        help="Specify the tokenizer."
             "Original Google BERT uses bert tokenizer on Chinese corpus."
             "Char tokenizer segments sentences into characters."
             "Space tokenizer segments sentences into words according to space.")

    # Load the hyperparameters from the config file.
    args = load_hyperparam(arg_parser.parse_args())

    # Swap the tokenizer name for the tokenizer object built from it.
    tokenizer_cls = str2tokenizer[args.tokenizer]
    args.tokenizer = tokenizer_cls(args)

    # Build the classification model and load its parameters.
    model = load_model(MultipleChoice(args), args.load_model_path)

    # For simplicity, the DataParallel wrapper is used for multiple GPUs.
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(run_device)
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print("{} GPUs are available. Let's use them.".format(gpu_count))
        model = torch.nn.DataParallel(model)

    examples = read_dataset(args, args.test_path)
    # Columns 0, 1 and 2 of every example become the src, tgt and seg tensors.
    src, tgt, seg = (
        torch.LongTensor([example[column] for example in examples])
        for column in range(3)
    )

    batch_size = args.batch_size
    print("The number of prediction instances: ", src.size()[0])

    model.eval()

    # Question ids come straight from the raw test file: each top-level entry
    # holds its list of questions at position 1.
    # NOTE(review): assumes read_dataset yields examples in this same order.
    with open(args.test_path) as test_file:
        raw_entries = json.load(test_file)
    question_ids = [
        question["id"] for entry in raw_entries for question in entry[1]
    ]

    cursor = 0  # Position in question_ids of the first example of the batch.
    with open(args.prediction_path, "w") as out_file:
        for src_batch, _, seg_batch, _ in batch_loader(batch_size, src, tgt, seg):
            src_batch = src_batch.to(run_device)
            seg_batch = seg_batch.to(run_device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            labels = torch.argmax(logits, dim=1).cpu().numpy().tolist()
            for offset, label in enumerate(labels):
                record = {
                    "id": question_ids[cursor + offset],
                    "label": int(label),
                }
                out_file.write(json.dumps(record))
                out_file.write("\n")
            cursor += len(labels)
def main():
    """Fine-tune a multiple-choice model and keep the best-scoring checkpoint.

    Parses the command-line options, loads the hyperparameters from the config
    file, builds a ``CharTokenizer`` and a ``MultipleChoice`` model, then
    trains for ``args.epochs_num`` epochs on the shuffled training set.
    After every epoch the dev set is evaluated and the model is saved to
    ``args.output_model_path`` whenever ``result[0]`` beats the best value
    seen so far.

    Raises:
        ImportError: if ``--fp16`` is requested but apex is not installed.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    finetune_opts(parser)

    parser.add_argument("--train_answer_path", type=str, required=True,
                        help="Path of the answers for trainset.")
    parser.add_argument("--dev_answer_path", type=str, required=True,
                        help="Path of the answers for devset.")
    parser.add_argument(
        "--max_choices_num", default=10, type=int,
        help="The maximum number of cadicate answer, shorter than this will be padded.")

    args = parser.parse_args()

    # One label per candidate answer.
    args.labels_num = args.max_choices_num
    # Identity check: `== None` would go through a custom __eq__ if present.
    if args.output_model_path is None:
        args.output_model_path = "./models/chid_model.bin"

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build multiple choice model.
    model = MultipleChoice(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_path, args.train_answer_path)
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size

    src = torch.LongTensor([example[0] for example in trainset])
    tgt = torch.LongTensor([example[1] for example in trainset])
    seg = torch.LongTensor([example[2] for example in trainset])

    # Total number of training steps over all epochs, stored on args before
    # the optimizer and scheduler are built.
    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError as err:
            # Chain explicitly so the original import failure stays visible.
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            ) from err
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, result, best_result = 0., 0., 0.

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (src_batch, tgt_batch, seg_batch, _) in enumerate(
                batch_loader(batch_size, src, tgt, seg)):
            loss = train_model(args, model, optimizer, scheduler,
                               src_batch, tgt_batch, seg_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.

        # Evaluate on the dev set once per epoch; only the first element of
        # the returned result is compared against the best value so far.
        result = evaluate(
            args, read_dataset(args, args.dev_path, args.dev_answer_path))
        if result[0] > best_result:
            best_result = result[0]
            save_model(model, args.output_model_path)
def main():
    """Run grouped multiple-choice inference and write predictions as JSON.

    Parses the command-line options, loads the hyperparameters from the config
    file, restores the model from ``args.load_model_path`` and reads the test
    set. Examples are grouped by their last field; each group is scored batch
    by batch and the collected ``(tag, logits)`` pairs of the group are passed
    to ``postprocess_chid_predictions``. The resulting ``(tag, prediction)``
    pairs of all groups are written to ``args.prediction_path`` as a single
    JSON object.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path of the multiple choice model.")
    parser.add_argument("--vocab_path", type=str, required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str,
                        help="Path of the testset.")
    parser.add_argument("--prediction_path", default=None, type=str,
                        help="Path of the prediction file.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json", type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=64,
                        help="Sequence length.")
    parser.add_argument(
        "--max_choices_num", default=10, type=int,
        help="The maximum number of cadicate answer, shorter than this will be padded.")
    parser.add_argument("--embedding", choices=["bert", "word"],
                        default="bert", help="Emebdding type.")
    parser.add_argument("--encoder",
                        choices=["bert", "lstm", "gru",
                                 "cnn", "gatedcnn", "attn", "synt",
                                 "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true",
                        help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer", choices=["bert", "char", "space"], default="char",
        help="Specify the tokenizer."
             "Original Google BERT uses bert tokenizer on Chinese corpus."
             "Char tokenizer segments sentences into characters."
             "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer: "bert" -> BertTokenizer, "char" -> CharTokenizer, ...
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size
    results_final = []
    dataset_by_group = {}
    print("The number of prediction instances: ", len(dataset))

    # Group the examples by their last field, keeping first-seen group order.
    for example in dataset:
        dataset_by_group.setdefault(example[-1], []).append(example)

    for examples in dataset_by_group.values():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])

        index = 0  # Running position within this group's examples.
        results = []
        for src_batch, _, seg_batch, _ in batch_loader(batch_size, src, tgt, seg):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            # `logits` only holds the rows of the current batch, so it must be
            # walked with a batch-local position. Indexing it with the
            # group-wide `index` overruns the tensor as soon as a group spans
            # more than one batch.
            batch_logits = logits.cpu().numpy()
            for row_logits in batch_logits:
                results.append((examples[index][-2], row_logits))
                index += 1

        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, 'w') as f:
        json.dump(dict(results_final), f, indent=2)
def main():
    """Fine-tune a multiple-choice model and keep the best-scoring checkpoint.

    Parses the command-line options, loads the hyperparameters from the config
    file, builds the tokenizer selected by ``--tokenizer`` and a
    ``MultipleChoice`` model, then trains for ``args.epochs_num`` epochs on
    the shuffled training set. After every epoch the dev set is evaluated and
    the model is saved to ``args.output_model_path`` whenever the result
    beats the best value seen so far.

    Raises:
        ImportError: if ``--fp16`` is requested but apex is not installed.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path", default=None, type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/multichoice_model.bin", type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--train_data_path", type=str, required=True,
                        help="Path of the trainset.")
    # The label-path help texts used to repeat the data-path descriptions.
    parser.add_argument("--train_label_path", type=str, required=True,
                        help="Path of the labels for the trainset.")
    parser.add_argument("--dev_data_path", type=str, required=True,
                        help="Path of the devset.")
    parser.add_argument("--dev_label_path", type=str, required=True,
                        help="Path of the labels for the devset.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json", type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=512,
                        help="Sequence length.")
    parser.add_argument("--embedding", choices=["bert", "word"],
                        default="bert", help="Emebdding type.")
    parser.add_argument("--encoder",
                        choices=["bert", "lstm", "gru",
                                 "cnn", "gatedcnn", "attn", "synt",
                                 "rcnn", "crnn", "gpt", "bilstm"],
                        default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true",
                        help="Parameter sharing.")
    parser.add_argument(
        "--max_choices_num", default=10, type=int,
        help="The maximum number of cadicate answer, shorter than this will be padded.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer", choices=["bert", "char", "space"], default="bert",
        help="Specify the tokenizer."
             "Original Google BERT uses bert tokenizer on Chinese corpus."
             "Char tokenizer segments sentences into characters."
             "Space tokenizer segments sentences into words according to space.")

    # Optimizer options.
    parser.add_argument("--learning_rate", type=float, default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup", type=float, default=0.1,
                        help="Warm up value.")
    parser.add_argument(
        "--fp16", action='store_true',
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit.")
    parser.add_argument(
        "--fp16_opt_level", choices=["O0", "O1", "O2", "O3"], default='O1',
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.2,
                        help="Dropout.")
    parser.add_argument("--epochs_num", type=int, default=8,
                        help="Number of epochs.")
    parser.add_argument("--report_steps", type=int, default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7,
                        help="Random seed.")

    args = parser.parse_args()

    # One label per candidate answer.
    args.labels_num = args.max_choices_num

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Build tokenizer: "bert" -> BertTokenizer, "char" -> CharTokenizer, ...
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build multiple choice model.
    model = MultipleChoice(args)

    # Load or initialize parameters.
    load_or_initialize_parameters(args, model)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    # Training phase.
    trainset = read_dataset(args, args.train_data_path, args.train_label_path)
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size

    src = torch.LongTensor([example[0] for example in trainset])
    tgt = torch.LongTensor([example[1] for example in trainset])
    seg = torch.LongTensor([example[2] for example in trainset])

    # Total number of training steps over all epochs, stored on args before
    # the optimizer and scheduler are built.
    args.train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    optimizer, scheduler = build_optimizer(args, model)

    if args.fp16:
        try:
            from apex import amp
        except ImportError as err:
            # Chain explicitly so the original import failure stays visible.
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            ) from err
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)
        args.amp = amp

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)
    args.model = model

    total_loss, result, best_result = 0., 0., 0.

    print("Start training.")

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (src_batch, tgt_batch, seg_batch, _) in enumerate(
                batch_loader(batch_size, src, tgt, seg)):
            loss = train_model(args, model, optimizer, scheduler,
                               src_batch, tgt_batch, seg_batch)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.

        # Evaluate on the dev set once per epoch and keep the best model.
        result = evaluate(
            args,
            read_dataset(args, args.dev_data_path, args.dev_label_path))
        if result > best_result:
            best_result = result
            save_model(model, args.output_model_path)
def main():
    """Run grouped multiple-choice inference and write predictions as JSON.

    Parses the command-line options (``infer_opts`` plus
    ``--max_choices_num``), loads the hyperparameters from the config file,
    builds a ``CharTokenizer`` and restores the model from
    ``args.load_model_path``. Examples are grouped by their last field; each
    group is scored batch by batch and the collected ``(tag, logits)`` pairs
    of the group are passed to ``postprocess_chid_predictions``. The resulting
    ``(tag, prediction)`` pairs of all groups are written to
    ``args.prediction_path`` as a single JSON object.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument(
        "--max_choices_num", default=10, type=int,
        help="The maximum number of cadicate answer, shorter than this will be padded.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size
    results_final = []
    dataset_by_group = {}
    print("The number of prediction instances: ", len(dataset))

    # Group the examples by their last field, keeping first-seen group order.
    for example in dataset:
        dataset_by_group.setdefault(example[-1], []).append(example)

    for examples in dataset_by_group.values():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])

        index = 0  # Running position within this group's examples.
        results = []
        for src_batch, _, seg_batch, _ in batch_loader(batch_size, src, tgt, seg):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            # `logits` only holds the rows of the current batch, so it must be
            # walked with a batch-local position. Indexing it with the
            # group-wide `index` overruns the tensor as soon as a group spans
            # more than one batch.
            batch_logits = logits.cpu().numpy()
            for row_logits in batch_logits:
                results.append((examples[index][-2], row_logits))
                index += 1

        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, 'w') as f:
        json.dump(dict(results_final), f, indent=2)
def main():
    """Predict labels for a multiple-choice test set and write them as JSON lines.

    Declares all command-line options inline, loads the hyperparameters from
    the config file and the vocabulary from ``args.vocab_path``, restores the
    model weights from ``args.load_model_path`` and runs the model batch by
    batch over the examples returned by ``read_dataset``. One
    ``{"id": ..., "label": ...}`` object per line is written to
    ``args.prediction_path``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_option = cli.add_argument

    # Path options.
    add_option("--load_model_path", type=str, default=None,
               help="Path of the multiple choice model.")
    add_option("--vocab_path", type=str, required=True,
               help="Path of the vocabulary file.")
    add_option("--test_path", type=str,
               help="Path of the testset.")
    add_option("--prediction_path", type=str, default=None,
               help="Path of the prediction file.")
    add_option("--config_path", type=str,
               default="./models/bert_base_config.json",
               help="Path of the config file.")

    # Model options.
    add_option("--batch_size", type=int, default=32,
               help="Batch size.")
    add_option("--seq_length", type=int, default=512,
               help="Sequence length.")
    add_option("--max_choices_num", type=int, default=4,
               help="The maximum number of cadicate answer, shorter than this will be padded.")
    add_option("--embedding", default="bert", choices=["bert", "word"],
               help="Emebdding type.")
    add_option("--encoder", default="bert",
               choices=["bert", "lstm", "gru",
                        "cnn", "gatedcnn", "attn", "synt",
                        "rcnn", "crnn", "gpt", "bilstm"],
               help="Encoder type.")
    add_option("--bidirectional", action="store_true",
               help="Specific to recurrent model.")
    add_option("--pooling", default="first",
               choices=["mean", "max", "first", "last"],
               help="Pooling type.")
    add_option("--factorized_embedding_parameterization", action="store_true",
               help="Factorized embedding parameterization.")
    add_option("--parameter_sharing", action="store_true",
               help="Parameter sharing.")

    # Tokenizer options.
    add_option("--tokenizer", default="bert",
               choices=["bert", "char", "space"],
               help="Specify the tokenizer."
                    "Original Google BERT uses bert tokenizer on Chinese corpus."
                    "Char tokenizer segments sentences into characters."
                    "Space tokenizer segments sentences into words according to space.")

    # Load the hyperparameters from the config file.
    args = load_hyperparam(cli.parse_args())

    # Load the vocabulary and attach it to the options.
    vocabulary = Vocab()
    vocabulary.load(args.vocab_path)
    args.vocab = vocabulary

    # Build the classification model and load its parameters.
    args.soft_targets = False
    model = load_model(MultipleChoice(args), args.load_model_path)

    # For simplicity, the DataParallel wrapper is used for multiple GPUs.
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(run_device)
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print("{} GPUs are available. Let's use them.".format(gpu_count))
        model = torch.nn.DataParallel(model)

    # Build the tokenizer: "bert" -> BertTokenizer, "char" -> CharTokenizer, ...
    tokenizer_name = args.tokenizer.capitalize() + "Tokenizer"
    args.tokenizer = globals()[tokenizer_name](args)

    examples = read_dataset(args, args.test_path)
    # Columns 0, 1 and 2 of every example become the src, tgt and seg tensors.
    src, tgt, seg = (
        torch.LongTensor([example[column] for example in examples])
        for column in range(3)
    )

    batch_size = args.batch_size
    print("The number of prediction instances: ", src.size()[0])

    model.eval()

    # Question ids come straight from the raw test file: each top-level entry
    # holds its list of questions at position 1.
    # NOTE(review): assumes read_dataset yields examples in this same order.
    with open(args.test_path) as test_file:
        raw_entries = json.load(test_file)
    question_ids = [
        question['id'] for entry in raw_entries for question in entry[1]
    ]

    cursor = 0  # Position in question_ids of the first example of the batch.
    with open(args.prediction_path, 'w') as out_file:
        for src_batch, _, seg_batch, _ in batch_loader(batch_size, src, tgt, seg):
            src_batch = src_batch.to(run_device)
            seg_batch = seg_batch.to(run_device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            labels = torch.argmax(logits, dim=1).cpu().numpy().tolist()
            for offset, label in enumerate(labels):
                record = {
                    'id': question_ids[cursor + offset],
                    'label': int(label),
                }
                out_file.write(json.dumps(record))
                out_file.write('\n')
            cursor += len(labels)