# Imports assumed by the snippets below. Project-internal helpers
# (json_load, MODEL_CLASSES, load_model, predict, _output_bio,
# format_converter, TransformerNerDataProcessor,
# transformer_convert_data_to_features, ner_data_loader,
# batch_to_model_inputs, train, set_seed, set_up_eval_tool,
# Transformer_CRF, NEXT_TOKEN, Args, AlbertNerModel, RobertaNerModel)
# come from the surrounding repository.
import os
import traceback
from pathlib import Path

import torch
from transformers import AlbertTokenizer, RobertaTokenizer


def main(args):
    label2idx = json_load(os.path.join(args.pretrained_model,
                                       "label2idx.json"))
    num_labels = len(label2idx)
    idx2label = {v: k for k, v in label2idx.items()}
    args.label2idx = label2idx
    args.idx2label = idx2label
    # get config, model and tokenizer
    model_config, _, model_tokenizer = MODEL_CLASSES[args.model_type]
    tokenizer = model_tokenizer.from_pretrained(
        args.pretrained_model, do_lower_case=args.do_lower_case)
    args.tokenizer = tokenizer
    config = model_config.from_pretrained(args.pretrained_model,
                                          do_lower_case=args.do_lower_case)
    args.config = config
    args.use_crf = config.use_crf
    model = load_model(args, args.pretrained_model)
    model.to(args.device)

    ner_data_processor = TransformerNerDataProcessor()
    ner_data_processor.set_logger(args.logger)
    ner_data_processor.set_data_dir(args.preprocessed_text_dir)
    if args.data_has_offset_information:
        ner_data_processor.offset_info_available()

    # fids = [each.stem.split(".")[0] for each in Path(args.preprocessed_text_dir).glob("*.txt")]
    for each_file in Path(args.preprocessed_text_dir).glob("*.txt"):
        try:
            test_example = ner_data_processor.get_test_examples(
                file_name=each_file.name)
            test_features = transformer_convert_data_to_features(
                args=args,
                input_examples=test_example,
                label2idx=label2idx,
                tokenizer=tokenizer,
                max_seq_len=args.max_seq_length)
            predictions = predict(args, model, test_features)
            Path(args.output_dir).mkdir(parents=True, exist_ok=True)
            ofn = each_file.stem.split(".")[0] + ".bio.txt"
            args.predict_output_file = os.path.join(args.output_dir, ofn)
            _output_bio(args, test_example, predictions)
        except Exception:
            args.logger.error(
                f"Encountered an error when processing predictions for file: {each_file.name}"
            )
            args.logger.error(traceback.format_exc())

    if args.do_format:
        base_path = Path(args.output_dir)
        output_formatted_dir = base_path.parent / f"{base_path.stem}_formatted_output"
        output_formatted_dir.mkdir(parents=True, exist_ok=True)
        format_converter(text_dir=args.raw_text_dir,
                         input_bio_dir=args.output_dir,
                         output_dir=output_formatted_dir,
                         formatter=args.do_format,
                         do_copy_text=args.do_copy)
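

# A hypothetical invocation of main(): every attribute name below is one that
# main() actually reads, but the values and the Namespace packaging are made
# up for illustration.
def demo_main_invocation():
    from argparse import Namespace
    import logging

    demo_args = Namespace(
        pretrained_model="path/to/trained_ner_model",  # must contain label2idx.json
        model_type="bert",                             # key into MODEL_CLASSES
        do_lower_case=False,
        device="cpu",                                  # or "cuda"
        logger=logging.getLogger("ner"),
        preprocessed_text_dir="path/to/bio_inputs",    # *.txt files to predict on
        data_has_offset_information=False,
        max_seq_length=128,
        output_dir="path/to/predictions",
        do_format=0,                                   # truthy value triggers format_converter
        raw_text_dir="path/to/raw_texts",
        do_copy=False,
    )
    main(demo_args)

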
def test2():
    args = Args()
    ner_data_processor = TransformerNerDataProcessor()
    conll_2003 = Path(
        __file__).resolve().parent.parent.parent / "test_data/conll-2003"
    ner_data_processor.set_data_dir(conll_2003)
    labels, label2idx = ner_data_processor.get_labels(default='roberta')
    # train_examples = roberta_ner_data_processor.get_train_examples()
    train_examples = ner_data_processor.get_test_examples()
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    # tokenizer = XLNetTokenizer.from_pretrained("xlnet-base_uncased")
    features = transformer_convert_data_to_features(args,
                                                    train_examples[:5],
                                                    label2idx,
                                                    tokenizer,
                                                    max_seq_len=10)

    model = AlbertNerModel.from_pretrained("albert-base-v2",
                                           num_labels=len(label2idx))

    for idx, each_batch in enumerate(
            ner_data_loader(features, batch_size=5, task='test', auto=True)):
        original_mask = each_batch[1].numpy()
        print(original_mask, original_mask.shape)
        inputs = batch_to_model_inputs(each_batch)
        with torch.no_grad():
            logits, flatted_logits, loss = model(**inputs)
        logits = logits.numpy()
        print(logits)
        print(logits.shape)
        break
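

# test2()/test() index batches positionally; from that usage the loader's
# tuple layout appears to be (input_ids, attention_mask, token_type_ids,
# labels, guards): indices 0, 1, 3, and 4 are read above and in test() below,
# index 2 is inferred. batch_to_model_inputs presumably performs a mapping
# like this hypothetical sketch (field order not confirmed by the source):
def batch_to_model_inputs_sketch(batch):
    """Map a positional batch tuple to keyword arguments for the NER model."""
    return {
        "input_ids": batch[0],       # token ids      (each_batch[0])
        "attention_mask": batch[1],  # padding mask   (each_batch[1])
        "token_type_ids": batch[2],  # segment ids    (inferred)
        "labels": batch[3],          # gold label ids (each_batch[3])
    }

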
def test():
    from pprint import pprint
    roberta_ner_data_processor = TransformerNerDataProcessor()
    conll_2003 = Path(
        __file__).resolve().parent.parent.parent / "test_data/conll-2003"
    roberta_ner_data_processor.set_data_dir(conll_2003)
    labels, label2idx = roberta_ner_data_processor.get_labels(
        default='roberta')
    print(labels, label2idx)

    # train_examples = roberta_ner_data_processor.get_train_examples()
    train_examples = roberta_ner_data_processor.get_test_examples()
    pprint(train_examples[:5], indent=1)
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    # tokenizer = XLNetTokenizer.from_pretrained("xlnet-base_uncased")
    args = Args()  # the other call sites pass args first; test2() builds it the same way
    features = transformer_convert_data_to_features(args,
                                                    train_examples[:5],
                                                    label2idx,
                                                    tokenizer,
                                                    max_seq_len=10)

    model = RobertaNerModel.from_pretrained("roberta-base",
                                            num_labels=len(label2idx))
    # model = XLNetNerModel.from_pretrained("xlnet-base_uncased", num_labels=len(label2idx))

    y_trues, y_preds = [], []
    y_pred, y_true = [], []
    prev_gd = 0
    for idx, each_batch in enumerate(
            ner_data_loader(features, batch_size=5, task='test', auto=True)):
        # features[idx * batch_size:(idx + 1) * batch_size] with batch_size=5
        print([(fea.input_tokens, fea.guards)
               for fea in features[idx * 5:(idx + 1) * 5]])
        print(each_batch)

        original_tkid = each_batch[0].numpy()
        original_mask = each_batch[1].numpy()
        original_labels = each_batch[3].numpy()
        guards = each_batch[4].numpy()
        print(guards)

        inputs = batch_to_model_inputs(each_batch)

        with torch.no_grad():
            logits, flatted_logits, loss = model(**inputs)
            # argmax over the log-softmax of the logits gives the predicted
            # label ids (batch and sequence dimensions are kept)
            raw_logits = torch.argmax(torch.nn.functional.log_softmax(logits,
                                                                      dim=2),
                                      dim=2)
            raw_logits = raw_logits.detach().cpu().numpy()

        logits = logits.numpy()
        loss = loss.numpy()

        print(logits.shape)
        # print(loss)

        # mk=mask, lb=label, lgt=predicted label id, gd=guard
        for mks, lbs, lgts, gds in zip(original_mask, original_labels,
                                       raw_logits, guards):
            connect_sent_flag = False
            for mk, lb, lgt, gd in zip(mks, lbs, lgts, gds):
                if mk == 0:  # first padded position: the rest is padding, so stop
                    break
                if gd == 0 or prev_gd == gd:
                    continue  # special token or continuation of the same word
                if gd == -2:
                    connect_sent_flag = True  # sentence continues in the next chunk
                    break
                # reaching here means gd is a new, valid guard id (prev_gd != gd)
                y_true.append(lb)
                y_pred.append(lgt)
                prev_gd = gd
            if connect_sent_flag:
                continue
            y_trues.append(y_true)
            y_preds.append(y_pred)
            y_pred, y_true = [], []
            prev_gd = 0
        print(y_trues)
        print(y_preds)
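

# A self-contained toy illustration of the guard-based realignment in test()
# above: guard ids mark sub-tokens, a repeated id means a continuation of the
# same word, 0 marks special/untracked positions, and -2 appears to flag a
# sentence that continues into the next chunk (omitted here for brevity).
# The arrays are made up; only the alignment rule mirrors the loop above.
def align_by_guards(mask, labels, preds, guards):
    """Keep one (label, prediction) pair per word, following the guard ids."""
    y_true, y_pred = [], []
    prev_gd = 0
    for mk, lb, lgt, gd in zip(mask, labels, preds, guards):
        if mk == 0:  # padding starts; stop scanning
            break
        if gd == 0 or prev_gd == gd:  # special token or same-word sub-token
            continue
        y_true.append(lb)
        y_pred.append(lgt)
        prev_gd = gd
    return y_true, y_pred


# A word split into two sub-tokens shares guard id 2, so only its first
# sub-token contributes a (label, prediction) pair.
assert align_by_guards(mask=[1, 1, 1, 1, 0],
                       labels=[9, 1, 1, 9, 9],
                       preds=[9, 1, 2, 9, 9],
                       guards=[0, 1, 2, 2, 0]) == ([1, 1], [1, 2])
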

# Example #4

def run_task(args):
    set_seed(args.seed)

    if os.path.exists(args.new_model_dir) and os.listdir(
            args.new_model_dir
    ) and args.do_train and not args.overwrite_model_dir:
        raise ValueError(
            'new model directory {} exists; use --overwrite_model_dir to overwrite it, '
            'or create another directory for the new model'.format(args.new_model_dir))

    # init data processor
    ner_data_processor = TransformerNerDataProcessor()
    ner_data_processor.set_data_dir(args.data_dir)
    ner_data_processor.set_logger(args.logger)
    if args.data_has_offset_information:
        ner_data_processor.offset_info_available()

    if args.do_train:
        labels, label2idx = ner_data_processor.get_labels(
            default=args.model_type)
    else:
        label2idx = json_load(
            os.path.join(args.new_model_dir, "label2idx.json"))

    num_labels = len(label2idx)
    idx2label = {v: k for k, v in label2idx.items()}
    args.label2idx = label2idx
    args.idx2label = idx2label

    # get config, model and tokenizer
    model_config, model_model, model_tokenizer = MODEL_CLASSES[args.model_type]
    args.logger.info("Training/evaluation parameters: {}".format(
        {k: v
         for k, v in vars(args).items()}))

    # training
    if args.do_train:
        tokenizer = model_tokenizer.from_pretrained(
            args.tokenizer_name, do_lower_case=args.do_lower_case)
        tokenizer.add_tokens([NEXT_TOKEN])
        config = model_config.from_pretrained(args.config_name,
                                              num_labels=num_labels)
        tf_ckpts = list(Path(args.pretrained_model).glob("*.ckpt.index"))
        from_tf_flag = True if tf_ckpts else False
        if args.use_crf:
            crf_layer = Transformer_CRF(num_labels=num_labels,
                                        start_label_id=label2idx['CLS'])
            model = model_model.from_pretrained(args.pretrained_model,
                                                from_tf=from_tf_flag,
                                                config=config,
                                                crf=crf_layer)
            model.active_using_crf()
        else:
            model = model_model.from_pretrained(args.pretrained_model,
                                                from_tf=from_tf_flag,
                                                config=config)

        # resize embeddings to cover the control token added for joining over-length sentences
        model.resize_token_embeddings(len(tokenizer))
        args.tokenizer = tokenizer
        args.config = model.config
        model.to(args.device)

        train_examples = ner_data_processor.get_train_examples()
        train_features = transformer_convert_data_to_features(
            args,
            input_examples=train_examples,
            label2idx=label2idx,
            tokenizer=tokenizer,
            max_seq_len=args.max_seq_length)

        dev_examples = ner_data_processor.get_dev_examples()
        dev_features = transformer_convert_data_to_features(
            args,
            input_examples=dev_examples,
            label2idx=label2idx,
            tokenizer=tokenizer,
            max_seq_len=args.max_seq_length)

        # set up evaluation metrics
        args.eval_tool = set_up_eval_tool(args)
        # start training
        train(args, model, train_features, dev_features)
        # save config and tokenizer with new model
        args.tokenizer.save_pretrained(args.new_model_dir)
        args.config.save_pretrained(args.new_model_dir)

    # predict on the test.txt file (to predict many files, use 'run_transformer_batch_prediction')
    if args.do_predict:
        args.config = model_config.from_pretrained(args.new_model_dir,
                                                   num_labels=num_labels)
        args.tokenizer = model_tokenizer.from_pretrained(
            args.new_model_dir, do_lower_case=args.do_lower_case)
        model = load_model(args)
        model.to(args.device)

        test_example = ner_data_processor.get_test_examples()
        test_features = transformer_convert_data_to_features(
            args,
            input_examples=test_example,
            label2idx=label2idx,
            tokenizer=args.tokenizer,
            max_seq_len=args.max_seq_length)

        predictions = predict(args, model, test_features)
        _output_bio(args, test_example, predictions)
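

# run_task() above globs for "*.ckpt.index" to detect TensorFlow checkpoints
# and forwards the result to Hugging Face's from_pretrained via its from_tf
# flag. A standalone sketch of the same pattern (directory name hypothetical;
# from_tf=True additionally requires TensorFlow to be installed):
def load_with_tf_detection(pretrained_dir, config):
    from transformers import BertForTokenClassification

    has_tf_ckpt = bool(list(Path(pretrained_dir).glob("*.ckpt.index")))
    return BertForTokenClassification.from_pretrained(
        pretrained_dir, from_tf=has_tf_ckpt, config=config)
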

# Example #5

def run_task(args):
    set_seed(args.seed)

    if os.path.exists(args.new_model_dir) and os.listdir(
            args.new_model_dir
    ) and args.do_train and not args.overwrite_model_dir:
        raise ValueError("""new model directory: {} exists. 
            Use --overwrite_model_dir to overwrite the previous model. 
            Or create another directory for the new model""".format(
            args.new_model_dir))

    # init data processor
    ner_data_processor = TransformerNerDataProcessor()
    ner_data_processor.set_data_dir(args.data_dir)
    ner_data_processor.set_logger(args.logger)
    if args.data_has_offset_information:
        ner_data_processor.offset_info_available()

    if args.do_train:
        labels, label2idx = ner_data_processor.get_labels(
            default=args.model_type)
    else:
        label2idx = json_load(
            os.path.join(args.new_model_dir, "label2idx.json"))

    num_labels = len(label2idx)
    idx2label = {v: k for k, v in label2idx.items()}
    args.num_labels = num_labels
    args.label2idx = label2idx
    args.idx2label = idx2label

    # get config, model and tokenizer
    model_config, model_model, model_tokenizer = MODEL_CLASSES[args.model_type]
    args.logger.info("Training/evaluation parameters: {}".format(
        {k: v
         for k, v in vars(args).items()}))

    # training
    if args.do_train:
        if args.model_type in {"roberta", "bart", "longformer", "deberta"}:
            # roberta, bart, longformer, and deberta use byte-level BPE tokenizers,
            # which need add_prefix_space=True for pre-tokenized input
            tokenizer = model_tokenizer.from_pretrained(
                args.tokenizer_name,
                do_lower_case=args.do_lower_case,
                add_prefix_space=True)
        else:
            tokenizer = model_tokenizer.from_pretrained(
                args.tokenizer_name, do_lower_case=args.do_lower_case)
        tokenizer.add_tokens(NEXT_TOKEN)
        config = model_config.from_pretrained(args.config_name,
                                              num_labels=num_labels)
        config.use_crf = args.use_crf
        config.label2idx = args.label2idx
        args.logger.info("New Model Config:\n{}".format(config))

        if args.pretrained_model == "microsoft/deberta-xlarge-v2":
            raise NotImplementedError(
                """the deberta-xlarge-v2 tokenizer differs from the other deberta models,
            so deberta-xlarge-v2 is not supported.
            You can try other deberta models: microsoft/deberta-base,
            microsoft/deberta-large, microsoft/deberta-xlarge""")

        model = model_model.from_pretrained(args.pretrained_model,
                                            config=config)

        # resize embeddings to cover the control token added for joining over-length sentences
        model.resize_token_embeddings(len(tokenizer))
        config.vocab_size = len(tokenizer)
        args.tokenizer = tokenizer
        args.config = model.config
        model.to(args.device)

        train_examples = ner_data_processor.get_train_examples()
        train_features = transformer_convert_data_to_features(
            args,
            input_examples=train_examples,
            label2idx=label2idx,
            tokenizer=tokenizer,
            max_seq_len=args.max_seq_length)

        dev_examples = ner_data_processor.get_dev_examples()
        dev_features = transformer_convert_data_to_features(
            args,
            input_examples=dev_examples,
            label2idx=label2idx,
            tokenizer=tokenizer,
            max_seq_len=args.max_seq_length)

        # set up evaluation metrics
        args.eval_tool = set_up_eval_tool(args)
        # start training
        train(args, model, train_features, dev_features)
        # save config and tokenizer with new model
        args.tokenizer.save_pretrained(args.new_model_dir)
        args.config.save_pretrained(args.new_model_dir)

    # predict on the test.txt file (to predict many files, use 'run_transformer_batch_prediction')
    if args.do_predict:
        args.config = model_config.from_pretrained(args.new_model_dir,
                                                   num_labels=num_labels)
        args.use_crf = args.config.use_crf
        # args.model_type = args.config.model_type
        if args.model_type in {"roberta", "bart", "longformer"}:
            # we need to set add_prefix_space to True for roberta, longformer, and Bart (any tokenizer from GPT-2)
            tokenizer = model_tokenizer.from_pretrained(
                args.tokenizer_name,
                do_lower_case=args.do_lower_case,
                add_prefix_space=True)
        else:
            args.tokenizer = model_tokenizer.from_pretrained(
                args.new_model_dir, do_lower_case=args.do_lower_case)
        model = load_model(args)
        model.to(args.device)

        test_example = ner_data_processor.get_test_examples()
        test_features = transformer_convert_data_to_features(
            args,
            input_examples=test_example,
            label2idx=label2idx,
            tokenizer=args.tokenizer,
            max_seq_len=args.max_seq_length)

        predictions = predict(args, model, test_features)
        _output_bio(args, test_example, predictions)
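

# Example #5 stores extra attributes on the Hugging Face config
# (config.use_crf, config.label2idx) before save_pretrained; PretrainedConfig
# serializes custom instance attributes into config.json, which is how the
# predict branch (and main() at the top) can read config.use_crf back. A
# minimal round-trip sketch of that behavior (paths are made up):
def config_roundtrip_demo(tmp_dir="/tmp/ner_config_demo"):
    from transformers import BertConfig

    config = BertConfig.from_pretrained("bert-base-uncased", num_labels=5)
    config.use_crf = True            # custom attribute, not a standard field
    config.save_pretrained(tmp_dir)  # written into tmp_dir/config.json

    reloaded = BertConfig.from_pretrained(tmp_dir)
    assert reloaded.use_crf is True  # the attribute survives the round trip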