Exemplo n.º 1
0
def main(_params):
    """Train an NER model and report metrics on the train, dev and test splits.

    ``_params`` is published as the module-level ``params`` so that the helper
    functions called from here can read it. Nothing is returned.
    """
    global params
    params = _params

    (train_df, test_df, dev_df, labels, num_labels, label_map,
     data_dir) = prepare_data()

    data_args, model_args, config, tokenizer = prepare_config_and_tokenizer(
        data_dir, labels, num_labels, label_map)

    def build_dataset(mode, overwrite_cache):
        # The three splits differ only in mode and cache policy.
        return NerDataset(data_dir=data_args['data_dir'],
                          tokenizer=tokenizer,
                          labels=labels,
                          model_type=config.model_type,
                          max_seq_length=data_args['max_seq_length'],
                          overwrite_cache=overwrite_cache,
                          mode=mode,
                          data_size=params["data_size"])

    # ## Create Dataset Objects
    train_dataset = build_dataset(Split.train, data_args['overwrite_cache'])
    eval_dataset = build_dataset(Split.dev, data_args['overwrite_cache'])

    print(len(train_dataset), len(eval_dataset))

    # Train top-model using the Trainer API
    trainer, model = run_train(train_dataset, eval_dataset, config, model_args,
                               labels, num_labels, label_map)

    # Release Python garbage and cached CUDA memory before the test pass.
    gc.collect()
    torch.cuda.empty_cache()

    # ## Prepare test data, run trainer over test data and print metrics

    # The test cache is always rebuilt so that new predictions can be made by
    # just changing test.txt.
    test_dataset = build_dataset(Split.test, True)

    evaluation_runs = (
        (train_dataset, train_df),
        (eval_dataset, dev_df),
        (test_dataset, test_df),
    )
    for dataset, frame in evaluation_runs:
        run_test(trainer, model, dataset, frame, label_map)
Exemplo n.º 2
0
    def set_data(self, tok_sents: List[List[str]]):
        """Build the prediction dataset from an already-tokenized document.

        ``tok_sents`` is a list of sentences, each given as a list of token
        strings. Every token gets a trailing newline appended and the
        placeholder label "O". The resulting ``NerDataset`` is stored on
        ``self.data``.
        """
        examples = [
            InputExample(
                guid=f"pred-{index}",
                words=[token + "\n" for token in sentence],
                labels=["O"] * len(sentence),
            )
            for index, sentence in enumerate(tok_sents)
        ]

        self.data = NerDataset(
            tokenizer=self.tokenizer,
            examples=examples,
            labels=["B", "O"],
            model_type="BertForTokenClassification",
            max_seq_length=256,
            mode=Split.pred
        )
Exemplo n.º 3
0
def main():
    """Fine-tune, evaluate and/or predict with a token-classification model.

    Arguments are read either from a single ``.json`` file given on the
    command line or from regular command-line flags. Depending on the
    ``do_train`` / ``do_eval`` / ``do_predict`` training arguments the function
    trains the model, writes ``eval_results.txt``, and writes
    ``test_results.txt`` plus ``test_predictions.txt`` into the output
    directory.

    Returns:
        A dict of evaluation metrics. It stays empty unless evaluation ran and
        this process is the world master.

    Raises:
        ValueError: if training is requested, the output directory already
            exists and is non-empty, and overwriting was not requested.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Optional extra input built from data_args.med_document; it is passed to
    # every dataset below and stays None unless data_args.umls is set.
    tui_ids = None
    if data_args.umls:
        tui_ids = create_cui_dict(voc_updated=data_args.med_document, tokenizer=tokenizer)

    # Get datasets
    train_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            tui_ids=tui_ids,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            tui_ids=tui_ids,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def align_predictions(
        predictions: np.ndarray, label_ids: np.ndarray
    ) -> Tuple[List[List[str]], List[List[str]]]:
        """Turn logits and gold ids into per-sequence lists of label strings.

        Positions whose gold id equals the loss's ``ignore_index`` are dropped
        from both outputs. Returns ``(preds_list, out_label_list)``.
        """
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape
        # Looked up once instead of building a loss object for every token.
        ignore_index = nn.CrossEntropyLoss().ignore_index

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        """Compute precision, recall and F1 over the aligned label sequences."""
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
        return {
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            tui_ids=tui_ids,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = align_predictions(predictions, label_ids)

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
                    example_id = 0
                    for line in f:
                        try:
                            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                                writer.write(line)
                                # A separator line ends the current example once
                                # all of its predictions have been consumed.
                                if not preds_list[example_id]:
                                    example_id += 1
                            elif preds_list[example_id]:
                                output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
                                writer.write(output_line)
                            else:
                                logger.warning(
                                    "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                                )
                        except IndexError:
                            # test.txt and the predictions are out of step;
                            # keep the best-effort stop, but only for that
                            # case and without hiding it.
                            logger.warning(
                                "Stopped writing predictions early at example index %d (IndexError).",
                                example_id,
                            )
                            break

    return results
Exemplo n.º 4
0
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Print/save training arguments
    os.makedirs(args.output_dir, exist_ok=True)
    torch.save(args, os.path.join(args.output_dir, "run_args.bin"))
    logger.info("Training/evaluation parameters %s", args)

    # Get datasets
    train_dataset = NerDataset(
            data_dir=args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=args.max_seq_length,
            overwrite_cache=False,
            mode=Split.train,
        )
    split = int(len(train_dataset) * 0.9)
    train_sampler = SequentialSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.batch_size, collate_fn=default_data_collator
    )
    val_dataset = Subset(train_dataset, list(range(split, len(train_dataset))))
    val_sampler = SequentialSampler(val_dataset) if args.local_rank == -1 else DistributedSampler(val_dataset)
    val_dataloader = DataLoader(
        val_dataset, sampler=val_sampler, batch_size=args.batch_size, collate_fn=default_data_collator
    )
    eval_dataset = NerDataset(
Exemplo n.º 5
0
def main():
    """Set up a token-classification run: arguments, logging, model and data.

    Arguments are read either from a single ``.json`` file given on the
    command line or from regular command-line flags. The function then
    configures logging, seeds the RNGs, loads the config, tokenizer and model,
    builds the train/dev datasets that were requested, and defines the
    ``align_predictions`` helper.

    Raises:
        ValueError: if training is requested, the output directory already
            exists and is non-empty, and overwriting was not requested.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def align_predictions(
        predictions: np.ndarray, label_ids: np.ndarray
    ) -> Tuple[List[List[str]], List[List[str]]]:
        """Turn logits and gold ids into per-sequence lists of label strings.

        Positions whose gold id equals the loss's ``ignore_index`` are dropped
        from both outputs. Returns ``(preds_list, out_label_list)``.
        """
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape
        # Looked up once instead of building a loss object for every token.
        ignore_index = nn.CrossEntropyLoss().ignore_index

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    # NOTE(review): nothing after this helper is visible in this excerpt; the
    # Trainer setup presumably follows - confirm against the full script.
Exemplo n.º 6
0
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = align_predictions(predictions, label_ids)

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
Exemplo n.º 7
0
def main(_params):
    """Train (or hyper-parameter search) an NER model and log reports to W&B.

    ``_params`` is published as the module-level ``params`` so that the helper
    functions called from here can read it. Keys read directly in this
    function: ``exp_name``, ``set_seed``, ``seed_value``, ``data_size`` and
    ``model_type``, plus the optional ``add_vocab``, ``xargs``, ``top_model``
    and ``hyp``.

    When ``params["hyp"]`` is truthy only ``run_hyperp`` is executed.
    Otherwise the model is trained and the train/dev/test reports are stored
    in the W&B run summary. Nothing is returned.
    """
    global params
    params = _params
    # 'seed_value' and 'set_seed' are expected to be present in params.
    wb_run = wandb.init(project="NER", name=params['exp_name'] + "_init")
    if params['set_seed']:
        random_seed_set(params['seed_value'])

    train_df, test_df, dev_df, labels, num_labels, label_map, data_dir, wt = prepare_data(
    )

    data_args, model_args, config, tokenizer = prepare_config_and_tokenizer(
        data_dir, labels, num_labels, label_map)

    # Only the presence of the key matters here, not its value.
    if 'add_vocab' in params:
        process_entity(tokenizer, train_df)
        process_entity(tokenizer, dev_df)
        process_entity(tokenizer, test_df)

    # ## Create Dataset Objects

    # NOTE(review): when params['xargs'] is a non-empty dict it is reused (not
    # copied), so the two assignments below also modify the caller's dict.
    xargs = params.get('xargs') or {}
    xargs['wt'] = wt
    print('Got class weights')
    xargs["top_model"] = params.get("top_model")

    train_dataset = NerDataset(
        data_dir=data_args['data_dir'],
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args['max_seq_length'],
        overwrite_cache=data_args['overwrite_cache'],  # True
        mode=Split.train,
        data_size=params["data_size"],
        xargs=xargs)

    # Dev and test are built with a fixed data_size of 100 and without xargs.
    eval_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=data_args['overwrite_cache'],
                              mode=Split.dev,
                              data_size=100)

    # ## Prepare test data, run trainer over test data and print metrics

    # we can pass overwrite_cache as True since we might like to make new predictions by just changing test.txt
    test_dataset = NerDataset(data_dir=data_args['data_dir'],
                              tokenizer=tokenizer,
                              labels=labels,
                              model_type=config.model_type,
                              max_seq_length=data_args['max_seq_length'],
                              overwrite_cache=True,
                              mode=Split.test,
                              data_size=100)

    print(len(train_dataset), len(eval_dataset), len(test_dataset))
    wb_run.finish()

    # Train top-model using the Trainer API
    if params.get("hyp"):
        run_hyperp(train_dataset, eval_dataset, config, model_args, labels,
                   num_labels, label_map, tokenizer, xargs)
        return

    trainer, model = run_train(train_dataset, eval_dataset, config, model_args,
                               labels, num_labels, label_map, tokenizer, xargs)

    # Release Python garbage and cached CUDA memory before the test passes.
    gc.collect()
    torch.cuda.empty_cache()

    # A second W&B run collects the final reports.
    wb_run = wandb.init(project="NER", name=params['exp_name'] + "summary")
    report = run_test(trainer, model, train_dataset, train_df, label_map)
    wandb.run.summary["train_report"] = report
    report = run_test(trainer, model, eval_dataset, dev_df, label_map)
    wandb.run.summary["val_report"] = report
    report = run_test(trainer, model, test_dataset, test_df, label_map)
    wandb.run.summary["test_report"] = report
    wandb.run.summary["model"] = repr(model)
    wandb.run.summary["data"] = {
        "train": len(train_dataset),
        "val": len(eval_dataset),
        "test": len(test_dataset),
        "wt": wt
    }
    # NOTE(review): overwrites model_type in the shared params dict with its
    # .name before logging (presumably so it can be serialised - confirm).
    # A second call with the same dict would fail on `.name`.
    params["model_type"] = params["model_type"].name
    wandb.run.summary["params"] = params
    wb_run.finish()
Exemplo n.º 8
0
def main():
    """Fine-tune, evaluate and/or predict with a multi-label token classifier.

    Arguments are read either from a single ``.json`` file given on the
    command line or from regular command-line flags. Depending on the
    ``do_train`` / ``do_eval`` / ``do_predict`` training arguments the function
    trains the model, writes ``eval_results.txt``, and writes
    ``test_results.txt`` plus ``test_predictions.txt`` into the output
    directory.

    Returns:
        A dict of evaluation metrics. It stays empty unless evaluation ran and
        this process is the world master.

    Raises:
        ValueError: if training is requested, the output directory already
            exists and is non-empty, and overwriting was not requested.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenMultiLabelClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (NerDataset(
        data_dir=data_args.data_dir,
        data_format=data_args.data_format,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
        multilabeling=True,
    ) if training_args.do_train else None)
    eval_dataset = (NerDataset(
        data_dir=data_args.data_dir,
        data_format=data_args.data_format,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
        multilabeling=True,
    ) if training_args.do_eval else None)

    # Corresponds to a probability of 0.5 if fed through a sigmoid.
    logit_threshold = 0.0

    def active_labels(indicator: np.ndarray) -> List[str]:
        """Return the label names at the positions where `indicator` is 1/True."""
        return [label_map[x] for x in np.where(indicator == 1)[0]]

    def get_label_preds_refs(
            predictions: np.ndarray,
            label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
        """ Returns a list of labels for each token in each sequence in the dataset.

        Every position is kept, so both outputs have one (possibly empty)
        label list per token. Returns ``(preds_list, refs_list)``.
        """
        preds = predictions > logit_threshold

        batch_size, seq_len, _ = preds.shape

        refs_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                preds_list[i].append(active_labels(preds[i][j]))
                refs_list[i].append(active_labels(label_ids[i][j]))

        return preds_list, refs_list

    def align_predictions(
            predictions: np.ndarray,
            label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
        """Like ``get_label_preds_refs`` but keeps only the tagged tokens.

        Returns ``(preds_list, out_label_list)``.
        """
        preds = predictions > logit_threshold
        batch_size, seq_len, _ = preds.shape

        # is_tagged indicates for each token whether it has an associated tag (i.e. a
        # label, including the O label) and should be assessed, otherwise it's
        # a padding or special token.
        is_tagged = label_ids.sum(axis=2) > 0

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if is_tagged[i, j]:
                    out_label_list[i].append(active_labels(label_ids[i][j]))
                    preds_list[i].append(active_labels(preds[i][j]))

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        """Compute chunk-level and token-level precision, recall and F1."""
        preds_list, out_label_list = align_predictions(p.predictions,
                                                       p.label_ids)
        (chunk_prec, chunk_rec, chunk_f1, tok_prec, tok_rec,
         tok_f1) = fsn4nlp.utils.conlleval.evaluate_multilabel(
             out_label_list, preds_list)
        return {
            "chunk_precision": chunk_prec,
            "chunk_recall": chunk_rec,
            "chunk_f1": chunk_f1,
            "tok_precision": tok_prec,
            "tok_recall": tok_rec,
            "tok_f1": tok_f1,
        }

    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      compute_metrics=compute_metrics,
                      labels=labels)

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = NerDataset(
            data_dir=data_args.data_dir,
            data_format=data_args.data_format,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
            multilabeling=True,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, refs_list = get_label_preds_refs(predictions, label_ids)

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                for i, example in enumerate(test_dataset):
                    # Walk tokens, references and predictions in lockstep so
                    # that each prediction is written next to its own token,
                    # including after tokens that carry no reference labels.
                    for tok_id, refs, preds in zip(example.input_ids,
                                                   refs_list[i],
                                                   preds_list[i]):
                        tok = tokenizer.convert_ids_to_tokens(tok_id)
                        if not refs:
                            # No reference labels: only the token is written.
                            output_line = f"{tok}\n"
                        else:
                            output_line = f"{tok} {refs} {preds}\n"
                        writer.write(output_line)

    return results