Example #1
# Standard imports shared by these examples; project-specific helpers
# (infer_opts, tokenizer_opts, load_hyperparam, str2tokenizer, load_model,
# read_dataset, batch_loader, the model classes, ...) come from the
# surrounding repository. The remaining examples assume the same imports.
import argparse
import json

import numpy as np
import torch
import torch.nn as nn


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument(
        "--max_choices_num",
        default=4,
        type=int,
        help="The maximum number of candidate answers; instances with fewer candidates are padded."
    )

    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([example[0] for example in dataset])
    tgt = torch.LongTensor([example[1] for example in dataset])
    seg = torch.LongTensor([example[2] for example in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.test_path, mode="r", encoding="utf-8") as f:
        data = json.load(f)

    question_ids = []
    for i in range(len(data)):
        questions = data[i][1]
        for question in questions:
            question_ids.append(question["id"])

    index = 0
    with open(args.prediction_path, "w") as f:
        for i, (src_batch, _, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                for j in range(len(pred)):
                    output = {}
                    output["id"] = question_ids[index]
                    index += 1
                    output["label"] = int(pred[j])
                    f.write(json.dumps(output))
                    f.write("\n")
Example #2
            elif torch.is_tensor(vec):
                # Move to CPU first so .numpy() also works for GPU tensors.
                vec = vec.detach().cpu().numpy()
            elif isinstance(vec, np.ndarray):
                pass  # already a numpy array
            else:
                raise TypeError("Unknown vec type: {}".format(type(vec)))
            vecs_np.append(vec)
        vecs_np = np.array(vecs_np)
        return vecs_np


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--pooling", choices=["first", "last", "max", "mean"], \
                                              default="first", help="Pooling Type.")
    parser.add_argument("--whitening_size",
                        type=int,
                        default=None,
                        help="Output vector size after whitening.")

    tokenizer_opts(parser)

    args = parser.parse_args()
    args = load_hyperparam(args)

    args.tokenizer = str2tokenizer[args.tokenizer](args)
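
The --whitening_size option above implies a whitening step that is not shown in this excerpt. A minimal sketch of the usual transform (center the embeddings, then map them through W = U diag(s^-1/2) from the SVD of their covariance so the result has identity covariance; the function names are illustrative, not the repo's):

import numpy as np

def compute_kernel_bias(vecs, whitening_size=None):
    # vecs: (n, d) numpy matrix of sentence embeddings.
    mu = vecs.mean(axis=0, keepdims=True)
    u, s, _ = np.linalg.svd(np.cov((vecs - mu).T))
    w = np.dot(u, np.diag(1.0 / np.sqrt(s)))
    if whitening_size is not None:
        # Truncating W reduces the output dimension, matching --whitening_size.
        w = w[:, :whitening_size]
    return w, -mu

def apply_whitening(vecs, w, bias):
    return np.dot(vecs + bias, w)
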
Example #3
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks."
    )

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build model and load parameters.
    model = MachineReadingComprehension(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset, examples = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
    start_position = torch.LongTensor([sample[2] for sample in dataset])
    end_position = torch.LongTensor([sample[3] for sample in dataset])

    batch_size = args.batch_size
    instances_num = len(dataset)

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:

        start_prob_all, end_prob_all = [], []

        for i, (src_batch, seg_batch, start_position_batch,
                end_position_batch) in enumerate(
                    batch_loader(batch_size, src, seg, start_position,
                                 end_position)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            start_position_batch = start_position_batch.to(device)
            end_position_batch = end_position_batch.to(device)

            with torch.no_grad():
                loss, start_logits, end_logits = model(src_batch, seg_batch,
                                                       start_position_batch,
                                                       end_position_batch)

            start_prob = nn.Softmax(dim=1)(start_logits)
            end_prob = nn.Softmax(dim=1)(end_logits)

            for j in range(start_prob.size()[0]):
                start_prob_all.append(start_prob[j])
                end_prob_all.append(end_prob[j])

        pred_answers = get_answers(dataset, start_prob_all, end_prob_all)

        output = {}
        for i in range(len(examples)):
            question_id = examples[i][2]
            start_pred_pos = pred_answers[i][1]
            end_pred_pos = pred_answers[i][2]

            prediction = examples[i][0][start_pred_pos:end_pred_pos + 1]
            output[question_id] = prediction

        f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
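
get_answers comes from the surrounding repository. The core of span extraction can be sketched as follows (a hypothetical helper, not the repo's implementation): for each instance, choose the start/end pair with the highest joint probability, subject to start <= end and a length cap.

def best_span(start_prob, end_prob, max_answer_length=64):
    # start_prob, end_prob: 1-D tensors of per-position probabilities.
    best = (0.0, 0, 0)
    for s in range(start_prob.size(0)):
        for e in range(s, min(s + max_answer_length, end_prob.size(0))):
            score = start_prob[s].item() * end_prob[e].item()
            if score > best[0]:
                best = (score, s, e)
    return best  # (score, start_position, end_position)
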
Example #4
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")

    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")

    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer." 
                             "Original Google BERT uses bert tokenizer on Chinese corpus."
                             "Char tokenizer segments sentences into characters."
                             "Space tokenizer segments sentences into words according to space."
                             )

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")
    
    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    model = Classifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()
            
            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
Example #5
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--label2id_path",
                        type=str,
                        required=True,
                        help="Path of the label2id file.")
    parser.add_argument(
        "--crf_target",
        action="store_true",
        help="Use CRF loss as the target function or not, default False.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        print("Labels: ", l2i)
        l2i["[PAD]"] = len(l2i)

    i2l = {}
    for key, value in l2i.items():
        i2l[value] = key

    args.l2i = l2i

    args.labels_num = len(l2i)

    # Load tokenizer.
    args.tokenizer = SpaceTokenizer(args)

    # Build sequence labeling model.
    model = NerTagger(args)
    model = load_model(model, args.load_model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    instances = read_dataset(args, args.test_path)

    src = torch.LongTensor([ins[0] for ins in instances])
    seg = torch.LongTensor([ins[1] for ins in instances])

    instances_num = src.size(0)
    batch_size = args.batch_size

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("pred_label" + "\n")
        for i, (src_batch,
                seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, pred = model(src_batch, None, seg_batch)

            # Store the sequence length of each instance in the batch
            # (the index of the last non-zero segment id, plus one).
            seq_length_batch = []
            for seg_ids in seg_batch.cpu().numpy().tolist():
                for j in range(len(seg_ids) - 1, -1, -1):
                    if seg_ids[j] != 0:
                        break
                seq_length_batch.append(j + 1)
            pred = pred.cpu().numpy().tolist()
            for j in range(0, len(pred), args.seq_length):
                for label_id in pred[j:j +
                                     seq_length_batch[j // args.seq_length]]:
                    f.write(i2l[label_id] + " ")
                f.write("\n")
Example #6
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--max_choices_num", default=10, type=int,
                        help="The maximum number of cadicate answer, shorter than this will be padded.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size
    results_final = []
    dataset_by_group = {}
    print("The number of prediction instances: ", len(dataset))

    for example in dataset:
        if example[-1] not in dataset_by_group:
            dataset_by_group[example[-1]] = [example]
        else:
            dataset_by_group[example[-1]].append(example)

    for group_index, examples in dataset_by_group.items():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])
        index = 0
        results = []
        for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                for j in range(len(pred)):
                    # logits is indexed with the in-batch position j; index
                    # tracks the absolute position within the group.
                    results.append((examples[index][-2], logits[j].cpu().numpy()))
                    index += 1
        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
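
postprocess_chid_predictions is not shown in this excerpt. In ChID-style tasks the blanks of one group share a single candidate set, so a common postprocessing step assigns candidates to blanks greedily by descending score without reusing a candidate; a hedged sketch of that idea (the repo's version may differ):

import numpy as np

def postprocess_chid_predictions(results):
    # results: list of (tag, logits) pairs for the blanks of one group.
    # Greedily take the highest-scoring (blank, candidate) pairs first,
    # using each candidate at most once.
    scored = []
    for blank_idx, (tag, logits) in enumerate(results):
        for cand_idx, score in enumerate(np.asarray(logits)):
            scored.append((float(score), blank_idx, cand_idx))
    scored.sort(reverse=True)
    used_blanks, used_cands, answers = set(), set(), {}
    for score, blank_idx, cand_idx in scored:
        if blank_idx in used_blanks or cand_idx in used_cands:
            continue
        used_blanks.add(blank_idx)
        used_cands.add(cand_idx)
        answers[results[blank_idx][0]] = cand_idx
    return list(answers.items())
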
Example #7
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--labels_num",
                        type=int,
                        required=True,
                        help="Number of prediction labels.")
    tokenizer_opts(parser)

    parser.add_argument("--output_logits",
                        action="store_true",
                        help="Write logits to output file.")
    parser.add_argument("--output_prob",
                        action="store_true",
                        help="Write probabilities to output file.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    model = SiameseClassifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src_a = torch.LongTensor([example[0][0] for example in dataset])
    src_b = torch.LongTensor([example[0][1] for example in dataset])
    seg_a = torch.LongTensor([example[1][0] for example in dataset])
    seg_b = torch.LongTensor([example[1][1] for example in dataset])

    batch_size = args.batch_size
    instances_num = src_a.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(
                batch_loader(batch_size, (src_a, src_b), (seg_a, seg_b))):

            src_a_batch, src_b_batch = src_batch
            seg_a_batch, seg_b_batch = seg_batch

            src_a_batch = src_a_batch.to(device)
            src_b_batch = src_b_batch.to(device)

            seg_a_batch = seg_a_batch.to(device)
            seg_b_batch = seg_b_batch.to(device)

            with torch.no_grad():
                _, logits = model((src_a_batch, src_b_batch), None,
                                  (seg_a_batch, seg_b_batch))

            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()

            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
Example #8
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--tgt_seq_length", type=int, default=32,
                        help="Output sequence length.")
    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build text-to-text model and load parameters.
    model = Text2text(args)
    model = load_model(model, args.load_model_path)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)

    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)
            tgt_in_batch = torch.zeros(src_batch.size()[0], 1, dtype=torch.long, device=args.device)
            for j in range(tgt_in_batch.size()[0]):
                tgt_in_batch[j][-1] = args.tokenizer.vocab.get(CLS_TOKEN)

            with torch.no_grad():
                memory_bank = model(src_batch, None, seg_batch, only_use_encoder=True)

            for _ in range(args.tgt_seq_length):
                with torch.no_grad():
                    outputs = model(src_batch, (tgt_in_batch, None, src_batch), None, memory_bank=memory_bank)

                next_token_logits = outputs[:, -1]
                next_tokens = torch.argmax(next_token_logits, dim=1).unsqueeze(1)
                tgt_in_batch = torch.cat([tgt_in_batch, next_tokens], dim=1)

            for j in range(len(outputs)):
                f.write("".join([args.tokenizer.inv_vocab[token_id.item()] for token_id in tgt_in_batch[j][1:]])
                        .split(SEP_TOKEN)[0])
                f.write("\n")
Example #9
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--output_logits",
                        action="store_true",
                        help="Write logits to output file.")
    parser.add_argument("--output_prob",
                        action="store_true",
                        help="Write probabilities to output file.")

    parser.add_argument("--prompt_id", type=str, default="chnsenticorp_char")
    parser.add_argument("--prompt_path",
                        type=str,
                        default="models/prompts.json")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    process_prompt_template(args)

    answer_position = [0] * len(args.tokenizer.vocab)
    for answer in args.answer_word_dict_inv:
        answer_position[int(args.tokenizer.vocab[answer])] = 1
    args.answer_position = torch.LongTensor(answer_position)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Build classification model and load parameters.
    model = ClozeTest(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    model = model.to(args.device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    tgt = torch.LongTensor([sample[1] for sample in dataset])
    seg = torch.LongTensor([sample[2] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for _, (src_batch, tgt_batch, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(args.device)
            tgt_batch = tgt_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)
            with torch.no_grad():
                _, pred, logits = model(src_batch, tgt_batch, seg_batch)

            # Keep only the answer-word columns; move the boolean mask to the
            # logits' device so the indexing also works on GPU.
            logits = logits[:, (args.answer_position > 0).to(logits.device)]
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()

            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")