Example #1
def train_and_validate(args):
    set_seed(args.seed)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build model.
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model = load_model(model, args.pretrained_model_path)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if 'gamma' not in n and 'beta' not in n:
                p.data.normal_(0, 0.02)

    if args.dist_train:
        # Multiprocessing distributed mode.
        mp.spawn(worker,
                 nprocs=args.ranks_num,
                 args=(args.gpu_ranks, args, model),
                 daemon=False)
    elif args.single_gpu:
        # Single GPU mode.
        worker(args.gpu_id, None, args, model)
    else:
        # CPU mode.
        worker(None, None, args, model)
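
A minimal driver sketch for this variant, assuming the hosting script exposes the attributes referenced above (seed, vocab_path, pretrained_model_path, dist_train, single_gpu, gpu_id, ...) as command-line options; the option names below are only illustrative, and the real pretraining script defines many more.

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    # Hypothetical subset of options mirroring the attributes used by train_and_validate.
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--vocab_path", type=str, required=True)
    parser.add_argument("--pretrained_model_path", type=str, default=None)
    parser.add_argument("--dist_train", action="store_true")
    parser.add_argument("--single_gpu", action="store_true")
    parser.add_argument("--gpu_id", type=int, default=0)
    args = parser.parse_args()

    train_and_validate(args)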
Example #2
def train_and_validate(args):
    set_seed(args.seed)

    # Load vocabulary.
    if args.spm_model_path:
        try:
            import sentencepiece as spm
        except ImportError:
            raise ImportError(
                "You need to install SentencePiece to use XLNetTokenizer: "
                "https://github.com/google/sentencepiece "
                "pip install sentencepiece")
        sp_model = spm.SentencePieceProcessor()
        sp_model.Load(args.spm_model_path)
        args.vocab = {
            sp_model.IdToPiece(i): i
            for i in range(sp_model.GetPieceSize())
        }
        if args.target == "mt":
            tgt_sp_model = spm.SentencePieceProcessor()
            tgt_sp_model.Load(args.tgt_spm_model_path)
            args.tgt_vocab = {
                tgt_sp_model.IdToPiece(i): i
                for i in range(tgt_sp_model.GetPieceSize())
            }
    else:
        vocab = Vocab()
        vocab.load(args.vocab_path)
        args.vocab = vocab.w2i
        if args.target == "mt":
            tgt_vocab = Vocab()
            tgt_vocab.load(args.tgt_vocab_path)
            args.tgt_vocab = tgt_vocab.w2i

    # Build model.
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model = load_model(model, args.pretrained_model_path)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if 'gamma' not in n and 'beta' not in n:
                p.data.normal_(0, 0.02)

    if args.dist_train:
        # Multiprocessing distributed mode.
        mp.spawn(worker,
                 nprocs=args.ranks_num,
                 args=(args.gpu_ranks, args, model),
                 daemon=False)
    elif args.single_gpu:
        # Single GPU mode.
        worker(args.gpu_id, None, args, model)
    else:
        # CPU mode.
        worker(None, None, args, model)
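
A quick, hedged sanity check of the SentencePiece vocabulary built above: the dict maps every piece to its id, so PieceToId should invert IdToPiece (the model path below is a placeholder).

import sentencepiece as spm

sp_model = spm.SentencePieceProcessor()
sp_model.Load("spiece.model")  # placeholder path

vocab = {sp_model.IdToPiece(i): i for i in range(sp_model.GetPieceSize())}
# Every piece should map back to its own id.
assert all(sp_model.PieceToId(piece) == idx for piece, idx in vocab.items())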
Example #3
def train_and_validate(args):
    set_seed(args.seed)

    # Load vocabulary.
    if args.data_processor == "mt":
        args.tgt_tokenizer = str2tokenizer[args.tgt_tokenizer](args,
                                                               is_src=False)
        args.tgt_vocab = args.tgt_tokenizer.vocab

    args.tokenizer = str2tokenizer[args.tokenizer](args)
    args.vocab = args.tokenizer.vocab

    # Build model.
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model = load_model(model, args.pretrained_model_path)
    else:
        # Initialize with normal distribution.
        if args.deep_init:
            scaled_factor = 1 / math.sqrt(2.0 * args.layers_num)
            for n, p in list(model.named_parameters()):
                if "gamma" not in n and "beta" not in n:
                    if "linear_2.weight" in n or "final_linear.weight" in n:
                        p.data.normal_(0, 0.02 * scaled_factor)
                    elif "linear_2.bias" in n or "final_linear.bias" in n:
                        p.data.zero_()
                    else:
                        p.data.normal_(0, 0.02)
        else:
            for n, p in list(model.named_parameters()):
                if "gamma" not in n and "beta" not in n:
                    p.data.normal_(0, 0.02)

    if args.deepspeed:
        worker(args.local_rank, None, args, model)
    elif args.dist_train:
        # Multiprocessing distributed mode.
        mp.spawn(worker,
                 nprocs=args.ranks_num,
                 args=(args.gpu_ranks, args, model),
                 daemon=False)
    elif args.single_gpu:
        # Single GPU mode.
        worker(args.gpu_id, None, args, model)
    else:
        # CPU mode.
        worker(None, None, args, model)
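
For a sense of scale of the deep_init branch above: with a 12-layer encoder the standard deviation used for the second feed-forward linear layer and the final projection shrinks from 0.02 to roughly 0.004 (the layer count is illustrative).

import math

layers_num = 12                                   # illustrative value
scaled_factor = 1 / math.sqrt(2.0 * layers_num)   # ~ 0.2041
print(0.02 * scaled_factor)                       # ~ 0.0041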
Example #4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path",
                        default=None,
                        type=str,
                        help="Path of the classfier model.")
    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path",
                        default=None,
                        type=str,
                        help="Path of the prediction file.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=512,
                        help="Sequence length.")
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build model and load parameters.
    model = MachineReadingComprehension(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset, examples = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
    start_position = torch.LongTensor([sample[2] for sample in dataset])
    end_position = torch.LongTensor([sample[3] for sample in dataset])

    batch_size = args.batch_size
    instances_num = len(dataset)

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:

        start_prob_all, end_prob_all = [], []

        for i, (src_batch, seg_batch, start_position_batch,
                end_position_batch) in enumerate(
                    batch_loader(batch_size, src, seg, start_position,
                                 end_position)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            start_position_batch = start_position_batch.to(device)
            end_position_batch = end_position_batch.to(device)

            with torch.no_grad():
                loss, start_logits, end_logits = model(src_batch, seg_batch,
                                                       start_position_batch,
                                                       end_position_batch)

            start_prob = nn.Softmax(dim=1)(start_logits)
            end_prob = nn.Softmax(dim=1)(end_logits)

            for j in range(start_prob.size()[0]):
                start_prob_all.append(start_prob[j])
                end_prob_all.append(end_prob[j])

        pred_answers = get_answers(dataset, start_prob_all, end_prob_all)

        output = {}
        for i in range(len(examples)):
            question_id = examples[i][2]
            start_pred_pos = pred_answers[i][1]
            end_pred_pos = pred_answers[i][2]

            prediction = examples[i][0][start_pred_pos:end_pred_pos]
            output[question_id] = prediction

        f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
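
batch_loader is imported from the hosting project; a minimal sketch consistent with how it is called above (yielding aligned slices of every tensor, with a smaller final batch for the remainder) could look like the following; the real implementation may differ.

def batch_loader_sketch(batch_size, *tensors):
    # Yield aligned batch_size slices of each tensor; the last batch may be smaller.
    instances_num = tensors[0].size()[0]
    for i in range(0, instances_num, batch_size):
        yield tuple(t[i:i + batch_size] for t in tensors)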
Example #5
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")

    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")

    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer." 
                             "Original Google BERT uses bert tokenizer on Chinese corpus."
                             "Char tokenizer segments sentences into characters."
                             "Space tokenizer segments sentences into words according to space."
                             )

    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")
    
    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    model = Classifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            
            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()
            
            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
Example #6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build model and load parameters.
    model = MachineReadingComprehension(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset, examples = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
    start_position = torch.LongTensor([sample[2] for sample in dataset])
    end_position = torch.LongTensor([sample[3] for sample in dataset])

    batch_size = args.batch_size
    instances_num = len(dataset)

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:

        start_prob_all, end_prob_all = [], []

        for i, (src_batch, seg_batch, start_position_batch,
                end_position_batch) in enumerate(
                    batch_loader(batch_size, src, seg, start_position,
                                 end_position)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            start_position_batch = start_position_batch.to(device)
            end_position_batch = end_position_batch.to(device)

            with torch.no_grad():
                loss, start_logits, end_logits = model(src_batch, seg_batch,
                                                       start_position_batch,
                                                       end_position_batch)

            start_prob = nn.Softmax(dim=1)(start_logits)
            end_prob = nn.Softmax(dim=1)(end_logits)

            for j in range(start_prob.size()[0]):
                start_prob_all.append(start_prob[j])
                end_prob_all.append(end_prob[j])

        pred_answers = get_answers(dataset, start_prob_all, end_prob_all)

        output = {}
        for i in range(len(examples)):
            question_id = examples[i][2]
            start_pred_pos = pred_answers[i][1]
            end_pred_pos = pred_answers[i][2]

            prediction = examples[i][0][start_pred_pos:end_pred_pos + 1]
            output[question_id] = prediction

        f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")
Example #7
                                              default="first", help="Pooling Type.")
    parser.add_argument("--whitening_size",
                        type=int,
                        default=None,
                        help="Output vector size after whitening.")

    tokenizer_opts(parser)

    args = parser.parse_args()
    args = load_hyperparam(args)

    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build feature extractor model.
    model = FeatureExtractor(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)
    model.eval()

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])
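
The snippet above is cut off before the extraction loop, but the --whitening_size option points to a whitening post-processing of the extracted sentence vectors. A common recipe for such a transform (not necessarily the one used by the hosting project) computes a mean vector and a truncated whitening kernel from the pooled features:

import numpy as np

def whitening_kernel(vecs, k):
    # vecs: [n, d] array of sentence vectors; returns (bias, kernel) such that
    # (vecs - bias) @ kernel has roughly identity covariance, truncated to k dims.
    bias = vecs.mean(axis=0, keepdims=True)
    cov = np.cov((vecs - bias).T)
    u, s, _ = np.linalg.svd(cov)
    kernel = u @ np.diag(1.0 / np.sqrt(s))
    return bias, kernel[:, :k]

# Usage sketch: whitened = (features - bias) @ kernel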
Example #8
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/classifier_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path",
                        default="./models/google_vocab.txt",
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=128,
                        help="Sequence length.")
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                                   "cnn", "gatedcnn", "attn", \
                                                   "rcnn", "crnn", "gpt", "bilstm"], \
                                                   default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")

    # Subword options.
    parser.add_argument("--subword_type",
                        choices=["none", "char"],
                        default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path",
                        type=str,
                        default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder",
                        choices=["avg", "lstm", "gru", "cnn"],
                        default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num",
                        type=int,
                        default=2,
                        help="The number of subencoder layers.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    # Evaluation options.
    parser.add_argument("--mean_reciprocal_rank",
                        action="store_true",
                        help="Evaluation metrics for DBQA dataset.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    # Count the number of labels.
    labels_set = set()
    columns = {}
    with open(args.train_path, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            try:
                line = line.strip().split("\t")
                if line_id == 0:
                    for i, column_name in enumerate(line):
                        columns[column_name] = i
                    continue
                label = int(line[columns["label"]])
                labels_set.add(label)
            except:
                pass
    args.labels_num = len(labels_set)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build bert model.
    # A pseudo target is added.
    args.target = "bert"
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model.load_state_dict(torch.load(args.pretrained_model_path),
                              strict=False)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if 'gamma' not in n and 'beta' not in n:
                p.data.normal_(0, 0.02)

    # Build classification model.
    model = BertClassifier(args, model)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)

    # Dataset loader.
    def batch_loader(batch_size, input_ids, label_ids, mask_ids):
        instances_num = input_ids.size()[0]
        for i in range(instances_num // batch_size):
            input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :]
            label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size]
            mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch
        if instances_num > instances_num // batch_size * batch_size:
            input_ids_batch = input_ids[instances_num // batch_size *
                                        batch_size:, :]
            label_ids_batch = label_ids[instances_num // batch_size *
                                        batch_size:]
            mask_ids_batch = mask_ids[instances_num // batch_size *
                                      batch_size:, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch

    # Build tokenizer.
    tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Read dataset.
    def read_dataset(path):
        dataset = []
        with open(path, mode="r", encoding="utf-8") as f:
            for line_id, line in enumerate(f):
                if line_id == 0:
                    continue
                try:
                    line = line.strip().split('\t')
                    if len(line) == 2:
                        label = int(line[columns["label"]])
                        text = line[columns["text_a"]]
                        tokens = [
                            vocab.get(t) for t in tokenizer.tokenize(text)
                        ]
                        tokens = [CLS_ID] + tokens
                        mask = [1] * len(tokens)
                        if len(tokens) > args.seq_length:
                            tokens = tokens[:args.seq_length]
                            mask = mask[:args.seq_length]
                        while len(tokens) < args.seq_length:
                            tokens.append(0)
                            mask.append(0)
                        dataset.append((tokens, label, mask))
                    elif len(line) == 3:  # For sentence pair input.
                        label = int(line[columns["label"]])
                        text_a, text_b = line[columns["text_a"]], line[
                            columns["text_b"]]

                        tokens_a = [
                            vocab.get(t) for t in tokenizer.tokenize(text_a)
                        ]
                        tokens_a = [CLS_ID] + tokens_a + [SEP_ID]
                        tokens_b = [
                            vocab.get(t) for t in tokenizer.tokenize(text_b)
                        ]
                        tokens_b = tokens_b + [SEP_ID]

                        tokens = tokens_a + tokens_b
                        mask = [1] * len(tokens_a) + [2] * len(tokens_b)

                        if len(tokens) > args.seq_length:
                            tokens = tokens[:args.seq_length]
                            mask = mask[:args.seq_length]
                        while len(tokens) < args.seq_length:
                            tokens.append(0)
                            mask.append(0)
                        dataset.append((tokens, label, mask))
                    elif len(line) == 4:  # For dbqa input.
                        qid = int(line[columns["qid"]])
                        label = int(line[columns["label"]])
                        text_a, text_b = line[columns["text_a"]], line[
                            columns["text_b"]]

                        tokens_a = [
                            vocab.get(t) for t in tokenizer.tokenize(text_a)
                        ]
                        tokens_a = [CLS_ID] + tokens_a + [SEP_ID]
                        tokens_b = [
                            vocab.get(t) for t in tokenizer.tokenize(text_b)
                        ]
                        tokens_b = tokens_b + [SEP_ID]

                        tokens = tokens_a + tokens_b
                        mask = [1] * len(tokens_a) + [2] * len(tokens_b)

                        if len(tokens) > args.seq_length:
                            tokens = tokens[:args.seq_length]
                            mask = mask[:args.seq_length]
                        while len(tokens) < args.seq_length:
                            tokens.append(0)
                            mask.append(0)
                        dataset.append((tokens, label, mask, qid))
                    else:
                        pass

                except:
                    pass
        return dataset

    # Evaluation function.
    def evaluate(args, is_test):
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        input_ids = torch.LongTensor([sample[0] for sample in dataset])
        label_ids = torch.LongTensor([sample[1] for sample in dataset])
        mask_ids = torch.LongTensor([sample[2] for sample in dataset])

        batch_size = args.batch_size
        instances_num = input_ids.size()[0]
        if is_test:
            print("The number of evaluation instances: ", instances_num)

        correct = 0
        # Confusion matrix.
        confusion = torch.zeros(args.labels_num,
                                args.labels_num,
                                dtype=torch.long)

        model.eval()

        if not args.mean_reciprocal_rank:
            for i, (input_ids_batch, label_ids_batch,
                    mask_ids_batch) in enumerate(
                        batch_loader(batch_size, input_ids, label_ids,
                                     mask_ids)):
                input_ids_batch = input_ids_batch.to(device)
                label_ids_batch = label_ids_batch.to(device)
                mask_ids_batch = mask_ids_batch.to(device)
                with torch.no_grad():
                    loss, logits = model(input_ids_batch, label_ids_batch,
                                         mask_ids_batch)
                logits = nn.Softmax(dim=1)(logits)
                pred = torch.argmax(logits, dim=1)
                gold = label_ids_batch
                for j in range(pred.size()[0]):
                    confusion[pred[j], gold[j]] += 1
                correct += torch.sum(pred == gold).item()

            if is_test:
                print("Confusion matrix:")
                print(confusion)
                print("Report precision, recall, and f1:")
            for i in range(confusion.size()[0]):
                p = confusion[i, i].item() / confusion[i, :].sum().item()
                r = confusion[i, i].item() / confusion[:, i].sum().item()
                f1 = 2 * p * r / (p + r)
                if is_test:
                    print("Label {}: {:.3f}, {:.3f}, {:.3f}".format(
                        i, p, r, f1))
            print("Acc. (Correct/Total): {:.4f} ({}/{}) ".format(
                correct / len(dataset), correct, len(dataset)))
            return correct / len(dataset)
        else:
            for i, (input_ids_batch, label_ids_batch,
                    mask_ids_batch) in enumerate(
                        batch_loader(batch_size, input_ids, label_ids,
                                     mask_ids)):
                input_ids_batch = input_ids_batch.to(device)
                label_ids_batch = label_ids_batch.to(device)
                mask_ids_batch = mask_ids_batch.to(device)
                with torch.no_grad():
                    loss, logits = model(input_ids_batch, label_ids_batch,
                                         mask_ids_batch)
                logits = nn.Softmax(dim=1)(logits)
                if i == 0:
                    logits_all = logits
                if i >= 1:
                    logits_all = torch.cat((logits_all, logits), 0)

            order = -1
            gold = []
            for i in range(len(dataset)):
                qid = dataset[i][3]
                label = dataset[i][1]
                if qid == order:
                    j += 1
                    if label == 1:
                        gold.append((qid, j))
                else:
                    order = qid
                    j = 0
                    if label == 1:
                        gold.append((qid, j))

            label_order = []
            order = -1
            for i in range(len(gold)):
                if gold[i][0] == order:
                    templist.append(gold[i][1])
                elif gold[i][0] != order:
                    order = gold[i][0]
                    if i > 0:
                        label_order.append(templist)
                    templist = []
                    templist.append(gold[i][1])
            label_order.append(templist)

            order = -1
            score_list = []
            for i in range(len(logits_all)):
                score = float(logits_all[i][1])
                qid = int(dataset[i][3])
                if qid == order:
                    templist.append(score)
                else:
                    order = qid
                    if i > 0:
                        score_list.append(templist)
                    templist = []
                    templist.append(score)
            score_list.append(templist)

            rank = []
            pred = []
            for i in range(len(score_list)):
                if len(label_order[i]) == 1:
                    if label_order[i][0] < len(score_list[i]):
                        true_score = score_list[i][label_order[i][0]]
                        score_list[i].sort(reverse=True)
                        for j in range(len(score_list[i])):
                            if score_list[i][j] == true_score:
                                rank.append(1 / (j + 1))
                    else:
                        rank.append(0)

                else:
                    true_rank = len(score_list[i])
                    for k in range(len(label_order[i])):
                        if label_order[i][k] < len(score_list[i]):
                            true_score = score_list[i][label_order[i][k]]
                            temp = sorted(score_list[i], reverse=True)
                            for j in range(len(temp)):
                                if temp[j] == true_score:
                                    if j < true_rank:
                                        true_rank = j
                    if true_rank < len(score_list[i]):
                        rank.append(1 / (true_rank + 1))
                    else:
                        rank.append(0)
            MRR = sum(rank) / len(rank)
            print(MRR)
            return MRR

    # Training phase.
    print("Start training.")
    trainset = read_dataset(args.train_path)
    random.shuffle(trainset)
    instances_num = len(trainset)
    batch_size = args.batch_size

    input_ids = torch.LongTensor([example[0] for example in trainset])
    label_ids = torch.LongTensor([example[1] for example in trainset])
    mask_ids = torch.LongTensor([example[2] for example in trainset])

    train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup,
                         t_total=train_steps)

    total_loss = 0.
    result = 0.0
    best_result = 0.0

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            model.zero_grad()

            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)

            loss, _ = model(input_ids_batch, label_ids_batch, mask_ids_batch)
            if torch.cuda.device_count() > 1:
                loss = torch.mean(loss)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.
            loss.backward()
            optimizer.step()
        result = evaluate(args, False)
        if result > best_result:
            best_result = result
            save_model(model, args.output_model_path)
        else:
            continue

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        model = load_model(model, args.output_model_path)
        evaluate(args, True)
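
read_dataset above expects a tab-separated file with a header row naming the columns it looks up (label, text_a, and optionally text_b and qid). An illustrative, made-up single-sentence trainset:

label	text_a
1	the plot is gripping and the acting superb
0	two hours I will never get back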
Example #9
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path",
                        default=None,
                        type=str,
                        help="Path of the classfier model.")
    parser.add_argument("--vocab_path",
                        type=str,
                        required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path",
                        default=None,
                        type=str,
                        help="Path of the prediction file.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=128,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=128,
                        help="Sequence length.")
    parser.add_argument("--labels_num",
                        type=int,
                        required=True,
                        help="Number of prediction labels.")
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    # Output options.
    parser.add_argument("--output_logits",
                        action="store_true",
                        help="Write logits to output file.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build classification model and load parameters.
    args.soft_targets = False
    model = Classifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        if args.output_logits:
            f.write("label" + "\t" + "logits" + "\n")
        else:
            f.write("label" + "\n")
        for i, (src_batch,
                seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            logits = logits.cpu().numpy().tolist()

            if args.output_logits:
                for j in range(len(pred)):
                    f.write(
                        str(pred[j]) + "\t" +
                        " ".join([str(v) for v in logits[j]]) + "\n")
            else:
                for j in range(len(pred)):
                    f.write(str(pred[j]) + "\n")
Example #10
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--labels_num",
                        type=int,
                        required=True,
                        help="Number of prediction labels.")
    tokenizer_opts(parser)

    parser.add_argument("--output_logits",
                        action="store_true",
                        help="Write logits to output file.")
    parser.add_argument("--output_prob",
                        action="store_true",
                        help="Write probabilities to output file.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    model = SiameseClassifier(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src_a = torch.LongTensor([example[0][0] for example in dataset])
    src_b = torch.LongTensor([example[0][1] for example in dataset])
    seg_a = torch.LongTensor([example[1][0] for example in dataset])
    seg_b = torch.LongTensor([example[1][1] for example in dataset])

    batch_size = args.batch_size
    instances_num = src_a.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(
                batch_loader(batch_size, (src_a, src_b), (seg_a, seg_b))):

            src_a_batch, src_b_batch = src_batch
            seg_a_batch, seg_b_batch = seg_batch

            src_a_batch = src_a_batch.to(device)
            src_b_batch = src_b_batch.to(device)

            seg_a_batch = seg_a_batch.to(device)
            seg_b_batch = seg_b_batch.to(device)

            with torch.no_grad():
                _, logits = model((src_a_batch, src_b_batch), None,
                                  (seg_a_batch, seg_b_batch))

            pred = torch.argmax(logits, dim=1)
            pred = pred.cpu().numpy().tolist()
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()

            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
Example #11
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path",
                        default=None,
                        type=str,
                        help="Path of the NER model.")
    parser.add_argument("--vocab_path",
                        type=str,
                        required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path",
                        default=None,
                        type=str,
                        help="Path of the prediction file.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")
    parser.add_argument("--label2id_path",
                        type=str,
                        required=True,
                        help="Path of the label2id file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=128,
                        help="Batch_size.")
    parser.add_argument("--seq_length",
                        default=128,
                        type=int,
                        help="Sequence length.")
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    args = parser.parse_args()

    # Load the hyperparameters of the config file.
    args = load_hyperparam(args)

    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        print("Labels: ", l2i)
        l2i["[PAD]"] = len(l2i)

    i2l = {}
    for key, value in l2i.items():
        i2l[value] = key

    args.l2i = l2i

    args.labels_num = len(l2i)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build sequence labeling model.
    model = NerTagger(args)
    model = load_model(model, args.load_model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    instances = read_dataset(args, args.test_path)

    src = torch.LongTensor([ins[0] for ins in instances])
    seg = torch.LongTensor([ins[1] for ins in instances])

    instances_num = src.size(0)
    batch_size = args.batch_size

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("pred_label" + "\n")
        for i, (src_batch,
                seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            pred = logits.argmax(dim=-1)
            # Storing sequence length of instances in a batch.
            seq_length_batch = []
            for seg in seg_batch.cpu().numpy().tolist():
                for j in range(len(seg) - 1, -1, -1):
                    if seg[j] != 0:
                        break
                seq_length_batch.append(j + 1)
            pred = pred.cpu().numpy().tolist()
            for j in range(0, len(pred), args.seq_length):
                for label_id in pred[j:j +
                                     seq_length_batch[j // args.seq_length]]:
                    f.write(i2l[label_id] + " ")
                f.write("\n")
Example #12
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--tgt_seq_length", type=int, default=32,
                        help="Output sequence length.")
    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model.
    model = Text2text(args)
    model = load_model(model, args.load_model_path)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(args.device)


    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        f.write("\n")
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)
            tgt_in_batch = torch.zeros(src_batch.size()[0], 1, dtype=torch.long, device=args.device)
            for j in range(tgt_in_batch.size()[0]):
                tgt_in_batch[j][-1] = args.tokenizer.vocab.get(CLS_TOKEN)

            with torch.no_grad():
                memory_bank = model(src_batch, None, seg_batch, only_use_encoder=True)

            for _ in range(args.tgt_seq_length):
                with torch.no_grad():
                    outputs = model(src_batch, (tgt_in_batch, None, src_batch), None, memory_bank=memory_bank)

                next_token_logits = outputs[:, -1]
                next_tokens = torch.argmax(next_token_logits, dim=1).unsqueeze(1)
                tgt_in_batch = torch.cat([tgt_in_batch, next_tokens], dim=1)

            for j in range(len(outputs)):
                f.write("".join([args.tokenizer.inv_vocab[token_id.item()] for token_id in tgt_in_batch[j][1:]])
                        .split(SEP_TOKEN)[0])
                f.write("\n")
Example #13
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    tokenizer_opts(parser)

    parser.add_argument("--output_logits",
                        action="store_true",
                        help="Write logits to output file.")
    parser.add_argument("--output_prob",
                        action="store_true",
                        help="Write probabilities to output file.")

    parser.add_argument("--prompt_id", type=str, default="chnsenticorp_char")
    parser.add_argument("--prompt_path",
                        type=str,
                        default="models/prompts.json")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    process_prompt_template(args)

    answer_position = [0] * len(args.tokenizer.vocab)
    for answer in args.answer_word_dict_inv:
        answer_position[int(args.tokenizer.vocab[answer])] = 1
    args.answer_position = torch.LongTensor(answer_position)

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Build classification model and load parameters.
    model = ClozeTest(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    model = model.to(args.device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    tgt = torch.LongTensor([sample[1] for sample in dataset])
    seg = torch.LongTensor([sample[2] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("label")
        if args.output_logits:
            f.write("\t" + "logits")
        if args.output_prob:
            f.write("\t" + "prob")
        f.write("\n")
        for _, (src_batch, tgt_batch, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):
            src_batch = src_batch.to(args.device)
            tgt_batch = tgt_batch.to(args.device)
            seg_batch = seg_batch.to(args.device)
            with torch.no_grad():
                _, pred, logits = model(src_batch, tgt_batch, seg_batch)

            logits = logits[:, args.answer_position > 0]
            prob = nn.Softmax(dim=1)(logits)
            logits = logits.cpu().numpy().tolist()
            prob = prob.cpu().numpy().tolist()

            for j in range(len(pred)):
                f.write(str(pred[j]))
                if args.output_logits:
                    f.write("\t" + " ".join([str(v) for v in logits[j]]))
                if args.output_prob:
                    f.write("\t" + " ".join([str(v) for v in prob[j]]))
                f.write("\n")
Example #14
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path",
                        default=None,
                        type=str,
                        help="Path of the multiple choice model.")
    parser.add_argument("--vocab_path",
                        type=str,
                        required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path",
                        default=None,
                        type=str,
                        help="Path of the prediction file.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=512,
                        help="Sequence length.")
    parser.add_argument(
        "--max_choices_num",
        default=4,
        type=int,
        help=
        "The maximum number of cadicate answer, shorter than this will be padded."
    )
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build classification model and load parameters.
    args.soft_targets = False
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([example[0] for example in dataset])
    tgt = torch.LongTensor([example[1] for example in dataset])
    seg = torch.LongTensor([example[2] for example in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.test_path) as f:
        data = json.load(f)

    question_ids = []
    for i in range(len(data)):
        questions = data[i][1]
        for question in questions:
            question_ids.append(question['id'])

    index = 0
    with open(args.prediction_path, 'w') as f:
        for i, (src_batch, _, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                for j in range(len(pred)):
                    output = {}
                    output['id'] = question_ids[index]
                    index += 1
                    output['label'] = int(pred[j])
                    f.write(json.dumps(output))
                    f.write('\n')
Example #15
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path",
                        default=None,
                        type=str,
                        help="Path of the classfier model.")
    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--test_features_path",
                        default=None,
                        type=str,
                        help="Path of the test features for stacking.")
    parser.add_argument("--config_path",
                        default="models/bert/base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    model_opts(parser)
    parser.add_argument("--pooling",
                        choices=["mean", "max", "first", "last"],
                        default="first",
                        help="Pooling type.")

    # Inference options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=64,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=128,
                        help="Sequence length.")
    parser.add_argument("--labels_num",
                        type=int,
                        required=True,
                        help="Number of prediction labels.")

    # Tokenizer options.
    tokenizer_opts(parser)

    # Output options.
    parser.add_argument("--output_logits",
                        action="store_true",
                        help="Write logits to output file.")
    parser.add_argument("--output_prob",
                        action="store_true",
                        help="Write probabilities to output file.")

    # Cross validation options.
    parser.add_argument("--folds_num",
                        type=int,
                        default=5,
                        help="The number of folds for cross validation.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

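    # For each cross-validation fold, load the checkpoint named <model>-fold_<k>.<suffix> and collect its softmax
    # probabilities; the per-fold outputs are averaged below and saved as stacking features.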
    test_features = [[] for _ in range(args.folds_num)]
    for fold_id in range(args.folds_num):
        load_model_name = ".".join(args.load_model_path.split(".")[:-1])
        load_model_suffix = args.load_model_path.split(".")[-1]

        model = Classifier(args)
        model = load_model(
            model, load_model_name + "-fold_" + str(fold_id) + "." +
            load_model_suffix)

        # For simplicity, we use DataParallel wrapper to use multiple GPUs.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        if torch.cuda.device_count() > 1:
            print("{} GPUs are available. Let's use them.".format(
                torch.cuda.device_count()))
            model = torch.nn.DataParallel(model)

        model.eval()
        for i, (src_batch,
                seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

            prob = nn.Softmax(dim=1)(logits)
            prob = prob.cpu().numpy().tolist()
            test_features[fold_id].extend(prob)

    test_features = np.array(test_features)
    test_features = np.mean(test_features, axis=0)
    np.save(args.test_features_path, test_features)
Example #16
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path",
                        default=None,
                        type=str,
                        help="Path of the multiple choice model.")
    parser.add_argument("--vocab_path",
                        type=str,
                        required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--prediction_path",
                        default=None,
                        type=str,
                        help="Path of the prediction file.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch size.")
    parser.add_argument("--seq_length",
                        type=int,
                        default=64,
                        help="Sequence length.")
    parser.add_argument(
        "--max_choices_num",
        default=10,
        type=int,
        help=
        "The maximum number of cadicate answer, shorter than this will be padded."
    )
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")
    parser.add_argument("--factorized_embedding_parameterization",
                        action="store_true",
                        help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing",
                        action="store_true",
                        help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="char",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size
    results_final = []
    dataset_by_group = {}
    print("The number of prediction instances: ", len(dataset))

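    # Group examples by their group index (the last field) so that blanks from the same passage are post-processed together.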
    for example in dataset:
        if example[-1] not in dataset_by_group:
            dataset_by_group[example[-1]] = [example]
        else:
            dataset_by_group[example[-1]].append(example)

    for group_index, examples in dataset_by_group.items():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])
        index = 0
        results = []
        for i, (src_batch, _, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                for j in range(len(pred)):
                    results.append(
                        (examples[index][-2], logits[j].cpu().numpy()))
                    index += 1
        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, 'w') as f:
        json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
Example #17
0
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--max_choices_num", default=10, type=int,
                        help="The maximum number of cadicate answer, shorter than this will be padded.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = CharTokenizer(args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path, None)

    model.eval()
    batch_size = args.batch_size
    results_final = []
    dataset_by_group = {}
    print("The number of prediction instances: ", len(dataset))

    for example in dataset:
        if example[-1] not in dataset_by_group:
            dataset_by_group[example[-1]] = [example]
        else:
            dataset_by_group[example[-1]].append(example)

    for group_index, examples in dataset_by_group.items():
        src = torch.LongTensor([example[0] for example in examples])
        tgt = torch.LongTensor([example[1] for example in examples])
        seg = torch.LongTensor([example[2] for example in examples])
        index = 0
        results = []
        for i, (src_batch, _, seg_batch, _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                for j in range(len(pred)):
                    results.append((examples[index][-2], logits[j].cpu().numpy()))
                    index += 1
        results_final.extend(postprocess_chid_predictions(results))

    with open(args.prediction_path, 'w') as f:
        json.dump({tag: pred for tag, pred in results_final}, f, indent=2)
Example #18
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument(
        "--max_choices_num",
        default=4,
        type=int,
        help=
        "The maximum number of cadicate answer, shorter than this will be padded."
    )

    parser.add_argument(
        "--tokenizer",
        choices=["bert", "char", "space"],
        default="bert",
        help="Specify the tokenizer."
        "Original Google BERT uses bert tokenizer on Chinese corpus."
        "Char tokenizer segments sentences into characters."
        "Space tokenizer segments sentences into words according to space.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    model = MultipleChoice(args)
    model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([example[0] for example in dataset])
    tgt = torch.LongTensor([example[1] for example in dataset])
    seg = torch.LongTensor([example[2] for example in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.test_path) as f:
        data = json.load(f)

    question_ids = []
    for i in range(len(data)):
        questions = data[i][1]
        for question in questions:
            question_ids.append(question["id"])

    index = 0
    with open(args.prediction_path, "w") as f:
        for i, (src_batch, _, seg_batch,
                _) in enumerate(batch_loader(batch_size, src, tgt, seg)):

            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)

            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)

                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                for j in range(len(pred)):
                    output = {}
                    output["id"] = question_ids[index]
                    index += 1
                    output["label"] = int(pred[j])
                    f.write(json.dumps(output))
                    f.write("\n")
Example #19
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--vocab_path",
                        default=None,
                        type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path",
                        default=None,
                        type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--label2id_path",
                        type=str,
                        required=True,
                        help="Path of the label2id file.")
    parser.add_argument(
        "--crf_target",
        action="store_true",
        help="Use CRF loss as the target function or not, default False.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

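    # Load the label-to-id mapping, add a [PAD] label for padded positions, and build the inverse mapping for decoding predictions.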
    with open(args.label2id_path, mode="r", encoding="utf-8") as f:
        l2i = json.load(f)
        print("Labels: ", l2i)
        l2i["[PAD]"] = len(l2i)

    i2l = {}
    for key, value in l2i.items():
        i2l[value] = key

    args.l2i = l2i

    args.labels_num = len(l2i)

    # Load tokenizer.
    args.tokenizer = SpaceTokenizer(args)

    # Build sequence labeling model.
    model = NerTagger(args)
    model = load_model(model, args.load_model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = torch.nn.DataParallel(model)

    instances = read_dataset(args, args.test_path)

    src = torch.LongTensor([ins[0] for ins in instances])
    seg = torch.LongTensor([ins[1] for ins in instances])

    instances_num = src.size(0)
    batch_size = args.batch_size

    print("The number of prediction instances: ", instances_num)

    model.eval()

    with open(args.prediction_path, mode="w", encoding="utf-8") as f:
        f.write("pred_label" + "\n")
        for i, (src_batch,
                seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, pred = model(src_batch, None, seg_batch)

            # Record the true (unpadded) length of each instance in the batch, i.e. the position of its last non-zero segment id plus one.
            seq_length_batch = []
            for seg in seg_batch.cpu().numpy().tolist():
                for j in range(len(seg) - 1, -1, -1):
                    if seg[j] != 0:
                        break
                seq_length_batch.append(j + 1)
            pred = pred.cpu().numpy().tolist()
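            # Predictions are flattened across the batch; slice them back per instance and keep only the non-padded positions.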
            for j in range(0, len(pred), args.seq_length):
                for label_id in pred[j:j +
                                     seq_length_batch[j // args.seq_length]]:
                    f.write(i2l[label_id] + " ")
                f.write("\n")
Example #20
0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--pretrained_model_path",
                        default=None,
                        type=str,
                        help="Path of the pretrained model.")
    parser.add_argument("--output_model_path",
                        default="./models/ner_model.bin",
                        type=str,
                        help="Path of the output model.")
    parser.add_argument("--vocab_path",
                        type=str,
                        required=True,
                        help="Path of the vocabulary file.")
    parser.add_argument("--train_path",
                        type=str,
                        required=True,
                        help="Path of the trainset.")
    parser.add_argument("--dev_path",
                        type=str,
                        required=True,
                        help="Path of the devset.")
    parser.add_argument("--test_path", type=str, help="Path of the testset.")
    parser.add_argument("--config_path",
                        default="./models/bert_base_config.json",
                        type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch_size.")
    parser.add_argument("--seq_length",
                        default=128,
                        type=int,
                        help="Sequence length.")
    parser.add_argument("--embedding",
                        choices=["bert", "word"],
                        default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                                   "cnn", "gatedcnn", "attn", \
                                                   "rcnn", "crnn", "gpt", "bilstm"], \
                                                   default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional",
                        action="store_true",
                        help="Specific to recurrent model.")

    # Subword options.
    parser.add_argument("--subword_type",
                        choices=["none", "char"],
                        default="none",
                        help="Subword feature type.")
    parser.add_argument("--sub_vocab_path",
                        type=str,
                        default="models/sub_vocab.txt",
                        help="Path of the subword vocabulary file.")
    parser.add_argument("--subencoder",
                        choices=["avg", "lstm", "gru", "cnn"],
                        default="avg",
                        help="Subencoder type.")
    parser.add_argument("--sub_layers_num",
                        type=int,
                        default=2,
                        help="The number of subencoder layers.")

    # Optimizer options.
    parser.add_argument("--learning_rate",
                        type=float,
                        default=2e-5,
                        help="Learning rate.")
    parser.add_argument("--warmup",
                        type=float,
                        default=0.1,
                        help="Warm up value.")

    # Training options.
    parser.add_argument("--dropout", type=float, default=0.1, help="Dropout.")
    parser.add_argument("--epochs_num",
                        type=int,
                        default=3,
                        help="Number of epochs.")
    parser.add_argument("--report_steps",
                        type=int,
                        default=100,
                        help="Specific steps to print prompt.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    set_seed(args.seed)

    labels_map = {"[PAD]": 0}
    begin_ids = []

    # Collect the set of tagging labels from the training set and record which label ids begin an entity (B-/S- prefixes).
    with open(args.train_path, mode="r", encoding="utf-8") as f:
        for line_id, line in enumerate(f):
            if line_id == 0:
                continue
            labels = line.strip().split("\t")[1].split()
            for l in labels:
                if l not in labels_map:
                    if l.startswith("B") or l.startswith("S"):
                        begin_ids.append(len(labels_map))
                    labels_map[l] = len(labels_map)

    print("Labels: ", labels_map)
    args.labels_num = len(labels_map)

    # Load vocabulary.
    vocab = Vocab()
    vocab.load(args.vocab_path)
    args.vocab = vocab

    # Build bert model.
    # A pseudo target is added.
    args.target = "bert"
    model = build_model(args)

    # Load or initialize parameters.
    if args.pretrained_model_path is not None:
        # Initialize with pretrained model.
        model.load_state_dict(torch.load(args.pretrained_model_path),
                              strict=False)
    else:
        # Initialize with normal distribution.
        for n, p in list(model.named_parameters()):
            if 'gamma' not in n and 'beta' not in n:
                p.data.normal_(0, 0.02)

    # Build sequence labeling model.
    model = BertTagger(args, model)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.device_count() > 1:
        print("{} GPUs are available. Let's use them.".format(
            torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)

    # Dataset loader: yields full batches, then a final partial batch if the data does not divide evenly.
    def batch_loader(batch_size, input_ids, label_ids, mask_ids):
        instances_num = input_ids.size()[0]
        for i in range(instances_num // batch_size):
            input_ids_batch = input_ids[i * batch_size:(i + 1) * batch_size, :]
            label_ids_batch = label_ids[i * batch_size:(i + 1) * batch_size, :]
            mask_ids_batch = mask_ids[i * batch_size:(i + 1) * batch_size, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch
        if instances_num > instances_num // batch_size * batch_size:
            input_ids_batch = input_ids[instances_num // batch_size *
                                        batch_size:, :]
            label_ids_batch = label_ids[instances_num // batch_size *
                                        batch_size:, :]
            mask_ids_batch = mask_ids[instances_num // batch_size *
                                      batch_size:, :]
            yield input_ids_batch, label_ids_batch, mask_ids_batch

    # Read dataset.
    def read_dataset(path):
        dataset = []
        with open(path, mode="r", encoding="utf-8") as f:
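            # Skip the header line of the dataset file.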
            f.readline()
            tokens, labels = [], []
            for line_id, line in enumerate(f):
                tokens, labels = line.strip().split("\t")
                tokens = [vocab.get(t) for t in tokens.split(" ")]
                labels = [labels_map[l] for l in labels.split(" ")]
                mask = [1] * len(tokens)
                if len(tokens) > args.seq_length:
                    tokens = tokens[:args.seq_length]
                    labels = labels[:args.seq_length]
                    mask = mask[:args.seq_length]
                while len(tokens) < args.seq_length:
                    tokens.append(0)
                    labels.append(0)
                    mask.append(0)
                dataset.append([tokens, labels, mask])

        return dataset

    # Evaluation function.
    def evaluate(args, is_test):
        if is_test:
            dataset = read_dataset(args.test_path)
        else:
            dataset = read_dataset(args.dev_path)

        input_ids = torch.LongTensor([sample[0] for sample in dataset])
        label_ids = torch.LongTensor([sample[1] for sample in dataset])
        mask_ids = torch.LongTensor([sample[2] for sample in dataset])

        instances_num = input_ids.size(0)
        batch_size = args.batch_size

        if is_test:
            print("Batch size: ", batch_size)
            print("The number of test instances:", instances_num)

        correct = 0
        gold_entities_num = 0
        pred_entities_num = 0

        confusion = torch.zeros(len(labels_map),
                                len(labels_map),
                                dtype=torch.long)

        model.eval()

        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)
            loss, _, pred, gold = model(input_ids_batch, label_ids_batch,
                                        mask_ids_batch)

            for j in range(gold.size()[0]):
                if gold[j].item() in begin_ids:
                    gold_entities_num += 1

            for j in range(pred.size()[0]):
                if pred[j].item(
                ) in begin_ids and gold[j].item() != labels_map["[PAD]"]:
                    pred_entities_num += 1

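            # Collect gold and predicted entity spans as (start, end) pairs; a span starts at a begin label
            # (B-/S-) and ends right before [PAD], O, or the next begin label.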
            pred_entities_pos = []
            gold_entities_pos = []
            start, end = 0, 0

            for j in range(gold.size()[0]):
                if gold[j].item() in begin_ids:
                    start = j
                    for k in range(j + 1, gold.size()[0]):
                        if gold[k].item(
                        ) == labels_map["[PAD]"] or gold[k].item(
                        ) == labels_map["O"] or gold[k].item() in begin_ids:
                            end = k - 1
                            break
                    else:
                        end = gold.size()[0] - 1
                    gold_entities_pos.append((start, end))

            for j in range(pred.size()[0]):
                if pred[j].item(
                ) in begin_ids and gold[j].item() != labels_map["[PAD]"]:
                    start = j
                    for k in range(j + 1, pred.size()[0]):
                        if pred[k].item(
                        ) == labels_map["[PAD]"] or pred[k].item(
                        ) == labels_map["O"] or pred[k].item() in begin_ids:
                            end = k - 1
                            break
                    else:
                        end = pred.size()[0] - 1
                    pred_entities_pos.append((start, end))

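            # A predicted span counts as correct only if a gold span with the same boundaries exists and every label inside it matches.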
            for entity in pred_entities_pos:
                if entity not in gold_entities_pos:
                    continue
                for j in range(entity[0], entity[1] + 1):
                    if gold[j].item() != pred[j].item():
                        break
                else:
                    correct += 1

        print("Report precision, recall, and f1:")
        p = correct / pred_entities_num
        r = correct / gold_entities_num
        f1 = 2 * p * r / (p + r)
        print("{:.3f}, {:.3f}, {:.3f}".format(p, r, f1))

        return f1

    # Training phase.
    print("Start training.")
    instances = read_dataset(args.train_path)

    input_ids = torch.LongTensor([ins[0] for ins in instances])
    label_ids = torch.LongTensor([ins[1] for ins in instances])
    mask_ids = torch.LongTensor([ins[2] for ins in instances])

    instances_num = input_ids.size(0)
    batch_size = args.batch_size
    train_steps = int(instances_num * args.epochs_num / batch_size) + 1

    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

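    # Group parameters so that biases and LayerNorm weights (gamma/beta) are excluded from weight decay.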
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=train_steps * args.warmup,
                                     t_total=train_steps)

    total_loss = 0.
    f1 = 0.0
    best_f1 = 0.0

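    # Training loop: report the average loss every report_steps batches and keep the checkpoint with the best dev-set F1.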
    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for i, (input_ids_batch, label_ids_batch, mask_ids_batch) in enumerate(
                batch_loader(batch_size, input_ids, label_ids, mask_ids)):
            model.zero_grad()

            input_ids_batch = input_ids_batch.to(device)
            label_ids_batch = label_ids_batch.to(device)
            mask_ids_batch = mask_ids_batch.to(device)

            loss, _, _, _ = model(input_ids_batch, label_ids_batch,
                                  mask_ids_batch)
            if torch.cuda.device_count() > 1:
                loss = torch.mean(loss)
            total_loss += loss.item()
            if (i + 1) % args.report_steps == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".
                      format(epoch, i + 1, total_loss / args.report_steps))
                total_loss = 0.

            loss.backward()
            optimizer.step()
            scheduler.step()

        f1 = evaluate(args, False)
        if f1 > best_f1:
            best_f1 = f1
            save_model(model, args.output_model_path)
        else:
            continue

    # Evaluation phase.
    if args.test_path is not None:
        print("Test set evaluation.")
        model = load_model(model, args.output_model_path)
        evaluate(args, True)
Example #21
0
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Path options.
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path of the classfier model.")
    parser.add_argument("--vocab_path", default=None, type=str,
                        help="Path of the vocabulary file.")
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")
    parser.add_argument("--test_path", type=str,
                        help="Path of the testset.")
    parser.add_argument("--test_features_path", default=None, type=str,
                        help="Path of the test features for stacking.")
    parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
                        help="Path of the config file.")

    # Model options.
    parser.add_argument("--batch_size", type=int, default=128,
                        help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=128,
                        help="Sequence length.")
    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")
    parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
                        help="Emebdding type.")
    parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
                                              "cnn", "gatedcnn", "attn", "synt", \
                                              "rcnn", "crnn", "gpt", "bilstm"], \
                                              default="bert", help="Encoder type.")
    parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
    parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
                        help="Pooling type.")
    parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
    parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

    # Tokenizer options.
    parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
                        help="Specify the tokenizer." 
                             "Original Google BERT uses bert tokenizer on Chinese corpus."
                             "Char tokenizer segments sentences into characters."
                             "Space tokenizer segments sentences into words according to space."
                             )

    # Output options.
    parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")

    # Cross validation options.
    parser.add_argument("--folds_num", type=int, default=5,
                        help="The number of folds for cross validation.")
    
    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    #model = Classifier(args)
    #model = load_model(model, args.load_model_path)

    # For simplicity, we use DataParallel wrapper to use multiple GPUs.
    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #model = model.to(device)
    #if torch.cuda.device_count() > 1:
    #    print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
    #    model = torch.nn.DataParallel(model)

    dataset = read_dataset(args, args.test_path)

    src = torch.LongTensor([sample[0] for sample in dataset])
    seg = torch.LongTensor([sample[1] for sample in dataset])

    batch_size = args.batch_size
    instances_num = src.size()[0]

    print("The number of prediction instances: ", instances_num)

    test_features = [[] for _ in range(args.folds_num)]
    for fold_id in range(args.folds_num):
        load_model_name = ".".join(args.load_model_path.split(".")[:-1])
        load_model_suffix = args.load_model_path.split(".")[-1]

        model = Classifier(args)
        model = load_model(model, load_model_name+"-fold_"+str(fold_id)+"."+load_model_suffix)

        # For simplicity, we use DataParallel wrapper to use multiple GPUs.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        if torch.cuda.device_count() > 1:
            print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
            model = torch.nn.DataParallel(model)

        model.eval()
        for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
            src_batch = src_batch.to(device)
            seg_batch = seg_batch.to(device)
            with torch.no_grad():
                _, logits = model(src_batch, None, seg_batch)
            
            prob = nn.Softmax(dim=1)(logits)
            prob = prob.cpu().numpy().tolist()
            test_features[fold_id].extend(prob)

    test_features = np.array(test_features)
    test_features = np.mean(test_features, axis=0)
    print(test_features.shape)
    np.save(args.test_features_path, test_features)