def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--max_choices_num", default=4, type=int,
                        help="The maximum number of candidate answers; instances with fewer answers are padded.")

    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([example[0] for example in dataset])
    tgt = torch.LongTensor([example[1] for example in dataset])
    seg = torch.LongTensor([example[2] for example in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.test_path) as f:
        data = json.load(f)

    question_ids = []
    for i in range(len(data)):
        questions = data[i][1]
        for question in questions:
            question_ids.append(question["id"])

    index = 0
    with open(args.prediction_path, "w") as f:
        for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()

            for j in range(len(pred)):
                output = {}
                output["id"] = question_ids[index]
                index += 1
                output["label"] = int(pred[j])
                f.write(json.dumps(output))
                f.write("\n")
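# A hypothetical invocation of the script above (the script name and data paths
# are illustrative; --load_model_path, --test_path, --prediction_path, and
# --batch_size are assumed to be registered by infer_opts):
#
#   python3 run_c3_infer.py --load_model_path models/multichoice_model.bin \
#                           --test_path datasets/c3/test.json \
#                           --prediction_path prediction.json \
#                           --max_choices_num 4 --tokenizer bert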
def vecs_to_np(vecs):
    # NOTE: the beginning of this helper is truncated in the source. The function
    # name and the first branch (handling plain Python lists) are reconstructions.
    vecs_np = []
    for vec in vecs:
        if isinstance(vec, list):  # reconstructed branch (assumption)
            vec = np.array(vec)
        elif torch.is_tensor(vec):
            vec = vec.detach().numpy()
        elif isinstance(vec, np.ndarray):
            vec = vec
        else:
            raise Exception("Unknown vec type.")
        vecs_np.append(vec)
    vecs_np = np.array(vecs_np)
    return vecs_np


if __name__ == '__main__':
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--pooling", choices=["first", "last", "max", "mean"],
                        default="first", help="Pooling type.")
    parser.add_argument("--whitening_size", type=int, default=None,
                        help="Output vector size after whitening.")

    tokenizer_opts(parser)

    args = parser.parse_args()

    args = load_hyperparam(args)

    args.tokenizer = str2tokenizer[args.tokenizer](args)
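# A minimal sketch of how the helper above might be exercised, assuming the
# reconstructed name vecs_to_np (the original name is truncated in the source):
#
#   import numpy as np
#   import torch
#
#   vecs = [torch.randn(768), np.zeros(768), [0.0] * 768]
#   print(vecs_to_np(vecs).shape)  # (3, 768)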
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build model and load parameters.
    model = MachineReadingComprehension(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset, examples = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
    start_position = torch.LongTensor([sample[2] for sample in dataset])
    end_position = torch.LongTensor([sample[3] for sample in dataset])

    batch_size = args.batch_size
    instances_num = len(dataset)

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        start_prob_all, end_prob_all = [], []

        for i, (src_batch, seg_batch, start_position_batch, end_position_batch) in enumerate(
                batch_loader(batch_size, src, seg, start_position, end_position)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            start_position_batch = start_position_batch.to(device)
            end_position_batch = end_position_batch.to(device)

            with torch.no_grad():
                loss, start_logits, end_logits = model(src_batch, seg_batch, start_position_batch, end_position_batch)

            start_prob = nn.Softmax(dim=1)(start_logits)
            end_prob = nn.Softmax(dim=1)(end_logits)

            for j in range(start_prob.size()[0]):
                start_prob_all.append(start_prob[j])
                end_prob_all.append(end_prob[j])

        pred_answers = get_answers(dataset, start_prob_all, end_prob_all)

        output = {}
        for i in range(len(examples)):
            question_id = examples[i][2]
            start_pred_pos = pred_answers[i][1]
            end_pred_pos = pred_answers[i][2]
            prediction = examples[i][0][start_pred_pos: end_pred_pos + 1]
            output[question_id] = prediction

        f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
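# Illustrative usage of the reading-comprehension script (script name and paths
# are hypothetical; the shared inference flags are assumed from infer_opts):
#
#   python3 run_cmrc_infer.py --load_model_path models/mrc_model.bin \
#                             --vocab_path models/google_zh_vocab.txt \
#                             --test_path datasets/cmrc/test.json \
#                             --prediction_path prediction.json \
#                             --doc_stride 128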
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")

    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")

    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer. "
                             "Original Google BERT uses bert tokenizer on Chinese corpus. "
                             "Char tokenizer segments sentences into characters. "
                             "Space tokenizer segments sentences into words according to space.")

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    model = Classifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()

            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
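# With both --output_logits and --output_prob set, each line of the prediction
# file produced above is tab-separated: the predicted label, the space-joined
# logits, and the space-joined probabilities. Values below are illustrative:
#
#   label<TAB>logits<TAB>prob
#   1<TAB>-0.83 1.92<TAB>0.06 0.94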
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--label2id_path", type=str, required=True,
                        help="Path of the label2id file.")
    parser.add_argument("--crf_target", action="store_true",
                        help="Use CRF loss as the target function or not, default False.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        print("Labels: ", l2i)
        l2i["[PAD]"] = len(l2i)

    i2l = {}
    for key, value in l2i.items():
        i2l[value] = key

    args.l2i = l2i
    args.labels_num = len(l2i)

    # Load tokenizer.
    args.tokenizer = SpaceTokenizer(args)

    # Build sequence labeling model.
    model = NerTagger(args)
    model = load_model(model, args.load_model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    instances = read_dataset(args, args.test_path)

    src = torch.LongTensor([ins[0] for ins in instances])
    seg = torch.LongTensor([ins[1] for ins in instances])

    instances_num = src.size(0)
    batch_size = args.batch_size

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("pred_label" + "\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, pred = model(src_batch, None, seg_batch)

            # Store the sequence length of each instance in the batch.
            seq_length_batch = []
            for seg in seg_batch.cpu().numpy().tolist():
                for j in range(len(seg) - 1, -1, -1):
                    if seg[j] != 0:
                        break
                seq_length_batch.append(j + 1)
            pred = pred.cpu().numpy().tolist()
            for j in range(0, len(pred), args.seq_length):
                for label_id in pred[j: j + seq_length_batch[j // args.seq_length]]:
                    f.write(i2l[label_id] + " ")
                f.write("\n")
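# The --label2id_path file is expected to be a JSON object mapping each label
# string to an integer id; a hypothetical example for BIO-style NER tags:
#
#   {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4}
#
# "[PAD]" is appended automatically above, so it should not appear in the file.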
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--max_choices_num", default=10, type=int,
                        help="The maximum number of candidate answers; instances with fewer answers are padded.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size

    results_final = []
    dataset_by_group = {}

    print("The number of prediction instances: ", len(dataset))

    for example in dataset:
        if example[-1] not in dataset_by_group:
            dataset_by_group[example[-1]] = [example]
        else:
            dataset_by_group[example[-1]].append(example)

    for group_index, examples in dataset_by_group.items():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])

        index = 0
        results = []
        for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()

            for j in range(len(pred)):
                # logits is batch-sized, so it must be indexed with the in-batch
                # position j rather than the group-wide running index.
                results.append((examples[index][-2], logits[j].cpu().numpy()))
                index += 1

        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, "w") as f:
        json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
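# Hypothetical invocation of the CHID-style script above (script name and paths
# are illustrative; the shared inference flags are assumed from infer_opts):
#
#   python3 run_chid_infer.py --load_model_path models/multichoice_model.bin \
#                             --test_path datasets/chid/test.json \
#                             --prediction_path prediction.json \
#                             --max_choices_num 10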
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")

    tokenizer_opts(parser)

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    model = SiameseClassifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src_a = torch.LongTensor([example[0][0] for example in dataset])
    src_b = torch.LongTensor([example[0][1] for example in dataset])
    seg_a = torch.LongTensor([example[1][0] for example in dataset])
    seg_b = torch.LongTensor([example[1][1] for example in dataset])

    batch_size = args.batch_size
    instances_num = src_a.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, (src_a, src_b), (seg_a, seg_b))):
            src_a_batch, src_b_batch = src_batch
            seg_a_batch, seg_b_batch = seg_batch

            src_a_batch = src_a_batch.to(device)
            src_b_batch = src_b_batch.to(device)
            seg_a_batch = seg_a_batch.to(device)
            seg_b_batch = seg_b_batch.to(device)

            with torch.no_grad():
                _, logits = model((src_a_batch, src_b_batch), None, (seg_a_batch, seg_b_batch))

            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()

            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
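# Hypothetical invocation; read_dataset is expected to return paired inputs
# (two token sequences per example), matching the SiameseClassifier's
# two-tower forward above. Script name and paths are illustrative:
#
#   python3 run_classifier_siamese_infer.py --load_model_path models/siamese_model.bin \
#                                           --test_path datasets/test.tsv \
#                                           --prediction_path prediction.tsv \
#                                           --labels_num 2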
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--tgt_seq_length", type=int, default=32,
                        help="Output sequence length.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build text-to-text model and load parameters.
    model = Text2text(args)
    model = load_model(model, args.load_model_path)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)
            # Each target sequence starts from a single CLS token.
            tgt_in_batch = torch.zeros(src_batch.size()[0], 1, dtype=torch.long, device=args.device)
            for j in range(tgt_in_batch.size()[0]):
                tgt_in_batch[j][-1] = args.tokenizer.vocab.get(CLS_TOKEN)

            with torch.no_grad():
                memory_bank = model(src_batch, None, seg_batch, only_use_encoder=True)

            # Greedy decoding: append the argmax token at each step.
            for _ in range(args.tgt_seq_length):
                with torch.no_grad():
                    outputs = model(src_batch, (tgt_in_batch, None, src_batch), None, memory_bank=memory_bank)
                next_token_logits = outputs[:, -1]
                next_tokens = torch.argmax(next_token_logits, dim=1).unsqueeze(1)
                tgt_in_batch = torch.cat([tgt_in_batch, next_tokens], dim=1)

            for j in range(len(outputs)):
                f.write("".join([args.tokenizer.inv_vocab[token_id.item()] for token_id in tgt_in_batch[j][1:]])
                        .split(SEP_TOKEN)[0])
                f.write("\n")
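# Decoding above is plain greedy search: generation starts from CLS_TOKEN, the
# argmax token is appended for --tgt_seq_length steps, and only the text before
# the first SEP_TOKEN is written out. A hypothetical invocation:
#
#   python3 run_text2text_infer.py --load_model_path models/text2text_model.bin \
#                                  --test_path datasets/test.tsv \
#                                  --prediction_path prediction.tsv \
#                                  --tgt_seq_length 32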
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")
    parser.add_argument("--prompt_id", type=str, default="chnsenticorp_char")
    parser.add_argument("--prompt_path", type=str, default="models/prompts.json")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    process_prompt_template(args)

    # Mark the vocabulary positions that correspond to answer words.
    answer_position = [0] * len(args.tokenizer.vocab)
    for answer in args.answer_word_dict_inv:
        answer_position[int(args.tokenizer.vocab[answer])] = 1
    args.answer_position = torch.LongTensor(answer_position)
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build classification model and load parameters.
    model = ClozeTest(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    model = model.to(args.device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    tgt = torch.LongTensor([sample[1] for sample in dataset])
    seg = torch.LongTensor([sample[2] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for _, (src_batch, tgt_batch, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(args.device)
            tgt_batch = tgt_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)

            with torch.no_grad():
                _, pred, logits = model(src_batch, tgt_batch, seg_batch)

            # Keep only the logits of candidate answer words; the boolean mask is
            # moved to the logits' device to avoid a CPU/GPU indexing mismatch.
            logits = logits[:, (args.answer_position > 0).to(args.device)]
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()

            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
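# Hypothetical invocation of the prompt-based cloze script above (the prompt
# file format is defined by process_prompt_template and is not shown here;
# script name and paths are illustrative):
#
#   python3 run_cloze_infer.py --load_model_path models/cloze_model.bin \
#                              --test_path datasets/chnsenticorp/test.tsv \
#                              --prediction_path prediction.tsv \
#                              --prompt_id chnsenticorp_char \
#                              --prompt_path models/prompts.json \
#                              --output_prob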