Example #1
def doc_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification_glove")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    # load from a local path:
    lang_model = Path("../saved_models/glove-german-uncased")
    # or through s3
    #lang_model = "glove-german-uncased"
    do_lower_case = True

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv datasets - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        dev_split=0,
        test_filename="test.tsv",
        train_filename="train.tsv",
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)

    # 4. Create an AdaptiveModel
    # a) which consists of an embedding model as a basis.
    # Word embedding models only convert words they have seen during training to embedding vectors.
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, len(label_list)],
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()
Example #2
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_text_pair_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    train_filename = "train.tsv"
    dev_filename = "dev_200k.tsv"

    # The source data can be found here https://github.com/microsoft/MSMARCO-Passage-Ranking
    generate_data = False
    data_dir = Path("../data/msmarco_passage")
    predictions_raw_filename = "predictions_raw.txt"
    predictions_filename = "predictions.txt"
    train_source_filename = "triples.train.1m.tsv"
    qrels_filename = "qrels.dev.tsv"
    queries_filename = "queries.dev.tsv"
    passages_filename = "collection.tsv"
    top1000_filename = "top1000.dev"

    # 0. Preprocess and save MSMarco data in a format that can be ingested by FARM models. Only needs to be done once!
    # The final format is a tsv file with 3 columns (text, text_b and label)
    if generate_data:
        reformat_msmarco_train(data_dir / train_source_filename,
                               data_dir / train_filename)
        reformat_msmarco_dev(data_dir / queries_filename,
                             data_dir / passages_filename,
                             data_dir / qrels_filename,
                             data_dir / top1000_filename,
                             data_dir / dev_filename)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    Evaluation during training will be performed on a slice of the train set
    #    We will be using the msmarco dev set as our final evaluation set
    processor = TextPairClassificationProcessor(tokenizer=tokenizer,
                                                label_list=label_list,
                                                metric="f1_macro",
                                                train_filename=train_filename,
                                                test_filename=None,
                                                dev_split=0.001,
                                                max_seq_len=128,
                                                data_dir=data_dir,
                                                delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(num_labels=len(label_list),
                                             class_weights=data_silo.calculate_class_weights(
                                                 task_name="text_classification"),
                                             )

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/passage_ranking_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    model = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=128)
    result = model.inference_from_file(data_dir / dev_filename)

    write_msmarco_results(result, save_dir / predictions_raw_filename)

    msmarco_evaluation(preds_file=save_dir / predictions_raw_filename,
                       dev_file=data_dir / dev_filename,
                       qrels_file=data_dir / qrels_filename,
                       output_file=save_dir / predictions_filename)

    model.close_multiprocessing_pool()
Example #3
def xlmr_qa_demo():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="run_xmlr_qa")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 3
    grad_acc_steps = 8
    n_epochs = 2
    evaluate_every = 200
    base_LM_model = "xlm-roberta-large"

    data_dir = Path("../data/squad20")
    train_filename = Path("train-v2.0.json")
    dev_filename = Path("dev-v2.0.json")

    save_dir = Path("../saved_models/xlmr-large-qa")

    inference_file = Path("../data/MLQA_V1/dev/dev-context-de-question-de.json")
    predictions_file = save_dir / "predictions.json"
    full_predictions_file = save_dir / "full_predictions.json"
    max_processes_for_inference = 8
    train = True
    inference = False

    if train:
        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)
        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=384,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=None,
            data_dir=data_dir,
            dev_split=0.0
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False, max_processes=1)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(base_LM_model, n_added_tokens=3)
        # b) and a prediction head on top that is suited for our task => Question Answering
        prediction_head = QuestionAnsweringHead()

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=3e-5,
            schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=device
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )
        # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
        model = trainer.train(model)

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)


    if inference:
        model = Inferencer.load(save_dir, batch_size=32, gpu=True)
        full_result = model.inference_from_file(
            file=inference_file,
            max_processes=max_processes_for_inference,
        )

        for x in full_result:
            print(x)
            print()

        result = {r["id"]: r["preds"][0][0] for r in full_result}
        full_result = {r["id"]: r["preds"] for r in full_result}

        json.dump(result,
                  open(predictions_file, "w"),
                  indent=4,
                  ensure_ascii=False)
        json.dump(full_result,
                  open(full_predictions_file, "w"),
                  indent=4,
                  ensure_ascii=False)
Example #4
def test_ner(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'LinearWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Paris is a town in France."
        },
    ]
    model = Inferencer.load(
        model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english",
        num_processes=0,
        task_type="ner")
    # labels aren't correctly inserted from transformers
    # They are converted to LABEL_1 ... LABEL_N
    # For the inference result to contain predictions we need them in IOB NER format
    model.processor.tasks["ner"]["label_list"][-1] = "B-LOC"
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["context"] == "Paris"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
Example #5
def run_experiment(args):
    validate_args(args)
    distributed = bool(args.general.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(
        use_cuda=args.general.cuda,
        local_rank=args.general.local_rank,
        fp16=args.general.fp16,
    )

    args.parameter.batch_size = int(args.parameter.batch_size //
                                    args.parameter.gradient_accumulation_steps)
    if n_gpu > 1:
        args.parameter.batch_size = args.parameter.batch_size * n_gpu
    set_all_seeds(args.general.seed)

    # Prepare Data
    tokenizer = BertTokenizer.from_pretrained(
        args.parameter.model, do_lower_case=args.parameter.lower_case)
    # processor = Processor.load(
    #     tokenizer=tokenizer,
    #     max_seq_len=args.parameter.max_seq_len,
    #     data_dir=args.general.data_dir,
    #     train_filename=args.task.train_filename,
    #     dev_filename=args.task.dev_filename,
    #     test_filename=args.task.test_filename,
    #     dev_split=args.task.dev_split,
    #     metrics=args.task.metrics,
    #     **args.task.toDict(),  # args is of type DotMap and needs conversion to std python dicts
    # )
    processor = Processor.load(
        tokenizer=tokenizer,
        max_seq_len=args.parameter.max_seq_len,
        data_dir=args.general.data_dir,
        **args.task.toDict(),  # args is of type DotMap and needs conversion to std python dicts
    )

    data_silo = DataSilo(
        processor=processor,
        batch_size=args.parameter.batch_size,
        distributed=distributed,
    )

    class_weights = None
    if args.parameter.balance_classes:
        task_names = list(processor.tasks.keys())
        if len(task_names) > 1:
            raise NotImplementedError(
                f"Balancing classes is currently not supported for multitask experiments. Got tasks:  {task_names} "
            )
        class_weights = data_silo.calculate_class_weights(
            task_name=task_names[0])

    model = get_adaptive_model(
        lm_output_type=args.parameter.lm_output_type,
        prediction_heads=args.parameter.prediction_head,
        layer_dims=args.parameter.layer_dims,
        model=args.parameter.model,
        device=device,
        class_weights=class_weights,
        embeds_dropout_prob=args.parameter.embeds_dropout_prob,
    )

    # Init optimizer

    # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this?
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=args.parameter.learning_rate,
        warmup_proportion=args.parameter.warmup_proportion,
        loss_scale=args.general.loss_scale,
        fp16=args.general.fp16,
        n_batches=len(data_silo.loaders["train"]),
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        n_epochs=args.parameter.epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.parameter.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        fp16=args.general.fp16,
        local_rank=args.general.local_rank,
        warmup_linear=warmup_linear,
        evaluate_every=args.logging.eval_every,
        device=device,
    )

    model = trainer.train(model)

    model_name = (
        f"{model.language_model.name}-{model.language_model.language}-{args.task.name}"
    )
    processor.save(f"{args.general.output_dir}/{model_name}")
    model.save(f"{args.general.output_dir}/{model_name}")
Example #6
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv datasets - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metric values. The function must be registered under a string name, and that name is then
    # passed as the metric.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
Example #7
    def train(
        self,
        data_dir: str,
        train_filename: str,
        dev_filename: Optional[str] = None,
        test_filename: Optional[str] = None,
        use_gpu: Optional[bool] = None,
        batch_size: int = 10,
        n_epochs: int = 2,
        learning_rate: float = 1e-5,
        max_seq_len: Optional[int] = None,
        warmup_proportion: float = 0.2,
        dev_split: float = 0,
        evaluate_every: int = 300,
        save_dir: Optional[str] = None,
        num_processes: Optional[int] = None,
        use_amp: str = None,
    ):
        """
        Fine-tune a model on a QA dataset. Options:

        - Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data)
        - Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain (e.g. using your labels collected via the haystack annotation tool)

        :param data_dir: Path to directory containing your training data in SQuAD style
        :param train_filename: Filename of training data
        :param dev_filename: Filename of dev / eval data
        :param test_filename: Filename of test data
        :param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
                          that gets split off from training data for eval.
        :param use_gpu: Whether to use GPU (if available)
        :param batch_size: Number of samples the model receives in one batch for training
        :param n_epochs: Number of iterations on the whole training data set
        :param learning_rate: Learning rate of the optimizer
        :param max_seq_len: Maximum text length (in tokens). Everything longer gets cut down.
        :param warmup_proportion: Proportion of training steps until maximum learning rate is reached.
                                  Until that point LR is increasing linearly. After that it's decreasing again linearly.
                                  Options for different schedules are available in FARM.
        :param evaluate_every: Evaluate the model every X steps on the hold-out eval dataset
        :param save_dir: Path to store the final model
        :param num_processes: The number of processes for `multiprocessing.Pool` during preprocessing.
                              Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set.
                              Set to None to use all CPU cores minus one.
        :param use_amp: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model.
                        Available options:
                        None (Don't use AMP)
                        "O0" (Normal FP32 training)
                        "O1" (Mixed Precision => Recommended)
                        "O2" (Almost FP16)
                        "O3" (Pure FP16).
                        See details on: https://nvidia.github.io/apex/amp.html
        :return: None
        """

        if dev_filename:
            dev_split = 0

        if num_processes is None:
            num_processes = multiprocessing.cpu_count() - 1 or 1

        set_all_seeds(seed=42)

        # For these variables, by default, we use the value set when initializing the FARMReader.
        # These can also be manually set when train() is called if you want a different value at train vs inference
        if use_gpu is None:
            use_gpu = self.use_gpu
        if max_seq_len is None:
            max_seq_len = self.max_seq_len

        device, n_gpu = initialize_device_settings(use_cuda=use_gpu,
                                                   use_amp=use_amp)

        if not save_dir:
            save_dir = f"../../saved_models/{self.inferencer.model.language_model.name}"

        # 1. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=self.inferencer.processor.tokenizer,
            max_seq_len=max_seq_len,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            dev_split=dev_split,
            test_filename=test_filename,
            data_dir=Path(data_dir),
        )

        # 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
        # and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor,
                             batch_size=batch_size,
                             distributed=False,
                             max_processes=num_processes)

        # 3. Create an optimizer and pass the already initialized model
        model, optimizer, lr_schedule = initialize_optimizer(
            model=self.inferencer.model,
            # model=self.inferencer.model,
            learning_rate=learning_rate,
            schedule_opts={
                "name": "LinearWarmup",
                "warmup_proportion": warmup_proportion
            },
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device,
            use_amp=use_amp,
        )
        # 4. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=data_silo,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          use_amp=use_amp,
                          disable_tqdm=not self.progress_bar)

        # 5. Let it grow!
        self.inferencer.model = trainer.train()
        self.save(Path(save_dir))
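A minimal usage sketch for the train() method above, assuming it belongs to haystack's FARMReader (the import path, model name and file paths are illustrative assumptions, not taken from the example):

# Hypothetical usage sketch - the import path may differ depending on the haystack version
from haystack.reader.farm import FARMReader

reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2", use_gpu=True)
reader.train(
    data_dir="../data/squad20",        # directory containing SQuAD-style training data
    train_filename="train-v2.0.json",  # training file inside data_dir
    dev_filename="dev-v2.0.json",      # optional hold-out set evaluated during training
    n_epochs=2,
    batch_size=10,
    learning_rate=1e-5,
    save_dir="../saved_models/my_qa_model",
)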
Example #8
def train_from_scratch():
    args = parse_arguments()
    use_amp = "O2"  # using "O2" here allows roughly 30% larger batch_sizes and 45% speed up

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    # Only the main process should log here
    if args.local_rank in [-1, 0]:
        ml_logger = MLFlowLogger(
            tracking_uri="https://public-mlflow.deepset.ai/")
        ml_logger.init_experiment(experiment_name="train_from_scratch",
                                  run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True,
                                               local_rank=args.local_rank,
                                               use_amp=use_amp)

    save_dir = Path("saved_models/train_from_scratch")
    data_dir = Path("data/test")

    # Option A) just using a single file
    # train_filename = "train.txt"

    # Option B) (recommended when using StreamingDataSilo):
    # split and shuffle that file to have random order within and across epochs
    randomize_and_split_file(data_dir / "train.txt",
                             output_dir=Path("data/split_files"),
                             docs_per_file=1000)
    train_filename = Path("data/split_files")

    dev_filename = "dev.txt"

    distributed = args.local_rank != -1
    max_seq_len = 128
    batch_size = 8  # if distributed, this is the per-GPU batch size
    grad_acc = 1
    learning_rate = 1e-4
    warmup_proportion = 0.05
    n_epochs = 2
    evaluate_every = 15000
    log_loss_every = 2
    checkpoint_every = 500
    checkpoint_root_dir = Path("checkpoints")
    checkpoints_to_keep = 4
    next_sent_pred_style = "bert-style"  # or "sentence"
    max_docs = None

    # Choose enough workers to queue sufficient batches during training.
    # Optimal number depends on your GPU speed, CPU speed and number of cores
    # 16 works well on a 4x V100 machine with 16 cores (AWS: p3.8xlarge). For a single GPU you will need less.
    data_loader_workers = 1

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load("bert-base-uncased", do_lower_case=True)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(data_dir=data_dir,
                                     tokenizer=tokenizer,
                                     max_seq_len=max_seq_len,
                                     train_filename=train_filename,
                                     dev_filename=dev_filename,
                                     test_filename=None,
                                     next_sent_pred_style=next_sent_pred_style,
                                     max_docs=max_docs)
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    #    calculates a few descriptive statistics of our datasets
    # stream_data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=distributed)
    stream_data_silo = StreamingDataSilo(
        processor=processor,
        batch_size=batch_size,
        distributed=distributed,
        dataloader_workers=data_loader_workers)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)

    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    next_sentence_head = NextSentenceHead(num_labels=2,
                                          task_name="nextsentence")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": warmup_proportion
        },
        n_batches=len(stream_data_silo.get_data_loader("train")),
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=grad_acc,
        distributed=distributed,
        use_amp=use_amp,
        local_rank=args.local_rank)

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        log_loss_every=log_loss_every,
        device=device,
        grad_acc_steps=grad_acc,
        local_rank=args.local_rank,
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        checkpoints_to_keep=checkpoints_to_keep,
        use_amp=use_amp)
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
    if args.local_rank != -1:
        torch.distributed.destroy_process_group()
Example #9
def test_doc_classification(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 5
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Martin Müller spielt Handball in Berlin."},
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."},
        {"text": "Franzosen verteidigen 2:1-Führung – Kritische Stimmen zu Schwedens Superstar"},
        {"text": "Neues Video von Designern macht im Netz die Runde"},
        {"text": "23-jähriger Brasilianer muss vier Spiele pausieren – Entscheidung kann noch angefochten werden"},
        {"text": "Aufständische verwendeten Chemikalie bei Gefechten im August."},
        {"text": "Bewährungs- und Geldstrafe für 26-Jährigen wegen ausländerfeindlicher Äußerung"},
        {"text": "ÖFB-Teamspieler nur sechs Minuten nach seinem Tor beim 1:1 gegen Sunderland verletzt ausgewechselt"},
        {"text": "Ein 31-jähriger Polizist soll einer 42-Jährigen den Knöchel gebrochen haben"},
        {"text": "18 Menschen verschleppt. Kabul – Nach einem Hubschrauber-Absturz im Norden Afghanistans haben Sicherheitskräfte am Mittwoch versucht"},
    ]
    #TODO enable loading here again after we have finished migration towards "processor.tasks"
    #inf = Inferencer.load(save_dir)
    inf = Inferencer(model=model, processor=processor)
    result = inf.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["label"] == "OTHER"
    assert abs(result[0]["predictions"][0]["probability"] - 0.7) <= 0.1

    loaded_processor = TextClassificationProcessor.load_from_dir(save_dir)
    inf2 = Inferencer(model=model, processor=loaded_processor)
    result_2 = inf2.run_inference(dicts=basic_texts)
    pprint(list(zip(result, result_2)))
    for r1, r2 in list(zip(result, result_2)):
        assert r1 == r2


# if(__name__=="__main__"):
#     test_doc_classification()
Example #10
def test_lm_finetuning_no_next_sentence(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=12,
        next_sent_pred=False
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    language_model = LanguageModel.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={'name': 'CosineWarmup', 'warmup_proportion': 0.1}
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    # LM embeddings and weight of decoder in head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight, model.prediction_heads[0].decoder.weight))

    save_dir = "testsave/lm_finetuning_no_nsp"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
    assert result[0]["vec"].shape == (768,)
    # TODO check why results vary across runs with same seed
    assert isinstance(result[0]["vec"][0], np.float32)
Example #11
def test_ner_amp(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "1980 kam der Crown von Toyota"
        },
    ]
    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
Example #12
    def __init__(self,
                 model,
                 processor,
                 task_type,
                 batch_size=4,
                 gpu=False,
                 name=None,
                 return_class_probs=False,
                 extraction_strategy=None,
                 extraction_layer=None,
                 s3e_stats=None,
                 num_processes=None,
                 disable_tqdm=False,
                 benchmarking=False,
                 dummy_ph=False):
        """
        Initializes Inferencer from an AdaptiveModel and a Processor instance.

        :param model: AdaptiveModel to run in inference mode
        :type model: AdaptiveModel
        :param processor: A dataset specific Processor object which will turn input (file or dict) into a Pytorch Dataset.
        :type processor: Processor
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param batch_size: Number of samples computed once per batch
        :type batch_size: int
        :param gpu: If GPU shall be used
        :type gpu: bool
        :param name: Name for the current Inferencer model, displayed in the REST API
        :type name: string
        :param return_class_probs: either return probability distribution over all labels or the prob of the associated label
        :type return_class_probs: bool
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), reduce_max (sentence vector), 'per_token' (individual token vectors),
                               's3e' (sentence vector via S3E pooling, see https://arxiv.org/abs/2002.09620)
        :type extraction_strategy: str
        :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
        :type extraction_layer: int
        :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                          (only needed for task_type="embeddings" and extraction_strategy = "s3e")
        :type s3e_stats: dict
        :param num_processes: the number of processes for `multiprocessing.Pool`.
                              Set to value of 1 (or 0) to disable multiprocessing.
                              Set to None to let Inferencer use all CPU cores minus one.
                              If you want to debug the Language Model, you might need to disable multiprocessing!
                              **Warning!** If you use multiprocessing you have to close the
                              `multiprocessing.Pool` again! To do so call
                              :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are
                              done using this class. The garbage collector will not do this for you!
        :type num_processes: int
        :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
        :type disable_tqdm: bool
        :param dummy_ph: If True, methods of the prediction head will be replaced
                     with a dummy method. This is used to isolate lm run time from ph run time.
        :type dummy_ph: bool
        :param benchmarking: If True, a benchmarking object will be initialised within the class and
                             certain parts of the code will be timed for benchmarking. Should be kept
                             False if not benchmarking since these timing checkpoints require synchronization
                             of the asynchronous Pytorch operations and may slow down the model.
        :type benchmarking: bool
        :return: An instance of the Inferencer.

        """
        MLFlowLogger.disable()

        # For benchmarking
        if dummy_ph:
            model.bypass_ph()

        self.benchmarking = benchmarking
        if self.benchmarking:
            self.benchmarker = Benchmarker()

        # Init device and distributed settings
        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)

        self.processor = processor
        self.model = model
        self.model.eval()
        self.batch_size = batch_size
        self.device = device
        self.language = self.model.get_language()
        self.task_type = task_type
        self.disable_tqdm = disable_tqdm
        self.problematic_sample_ids = set()

        if task_type == "embeddings":
            if not extraction_layer or not extraction_strategy:
                logger.warning(
                    "Using task_type='embeddings', but couldn't find one of the args `extraction_layer` and `extraction_strategy`. "
                    "Since FARM 0.4.2, you set both when initializing the Inferencer and then call inferencer.inference_from_dicts() instead of inferencer.extract_vectors()"
                )
            self.model.prediction_heads = torch.nn.ModuleList([])
            self.model.language_model.extraction_layer = extraction_layer
            self.model.language_model.extraction_strategy = extraction_strategy
            self.model.language_model.s3e_stats = s3e_stats

        # TODO add support for multiple prediction heads

        self.name = name if name is not None else f"anonymous-{self.task_type}"
        self.return_class_probs = return_class_probs

        model.connect_heads_with_processor(processor.tasks,
                                           require_labels=False)
        set_all_seeds(42)

        self._set_multiprocessing_pool(num_processes)
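A minimal sketch of constructing an Inferencer directly from an already trained model and processor, as the docstring above describes (the model/processor objects are assumed to come from one of the training examples; parameter values are illustrative):

# Hypothetical sketch: `model` is an AdaptiveModel and `processor` the matching Processor
# produced by one of the training examples above.
inferencer = Inferencer(model=model,
                        processor=processor,
                        task_type="text_classification",
                        batch_size=4,
                        gpu=False,
                        num_processes=0)  # 0 disables multiprocessing, so no pool needs closing
result = inferencer.inference_from_dicts(dicts=[{"text": "Martin Müller spielt Handball in Berlin"}])
print(result)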
Example #13
    def load(
        cls,
        model_name_or_path,
        revision=None,
        batch_size=4,
        gpu=False,
        task_type=None,
        return_class_probs=False,
        strict=True,
        max_seq_len=256,
        doc_stride=128,
        extraction_layer=None,
        extraction_strategy=None,
        s3e_stats=None,
        num_processes=None,
        disable_tqdm=False,
        tokenizer_class=None,
        use_fast=True,
        tokenizer_args=None,
        multithreading_rust=True,
        dummy_ph=False,
        benchmarking=False,
    ):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param batch_size: Number of samples computed once per batch
        :type batch_size: int
        :param gpu: If GPU shall be used
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                       the PredictionHead (see torch.nn.module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older version of FARM.
        :type strict: bool
        :param max_seq_len: maximum length of one text sample
        :type max_seq_len: int
        :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
        :type doc_stride: int
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), reduce_max (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
        :type extraction_layer: int
        :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                          (only needed for task_type="embeddings" and extraction_strategy = "s3e")
        :type s3e_stats: dict
        :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                              multiprocessing. Set to None to let Inferencer use all CPU cores minus one. If you want to
                              debug the Language Model, you might need to disable multiprocessing!
                              **Warning!** If you use multiprocessing you have to close the
                              `multiprocessing.Pool` again! To do so call
                              :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are
                              done using this class. The garbage collector will not do this for you!
        :type num_processes: int
        :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
        :type disable_tqdm: bool
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
        :type use_fast: bool
        :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method.
            See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed tokenizer documentation
            on `Hugging Face Transformers <https://huggingface.co/transformers/>`_.
        :type tokenizer_args: dict
        :param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers.
                                    Note: Enabling multithreading in Rust AND multiprocessing in python might cause
                                    deadlocks.
        :type multithreading_rust: bool
        :param dummy_ph: If True, methods of the prediction head will be replaced
                         with a dummy method. This is used to isolate the language model run time from the prediction head run time.
        :type dummy_ph: bool
        :param benchmarking: If True, a benchmarking object will be initialised within the class and
                             certain parts of the code will be timed for benchmarking. Should be kept
                             False if not benchmarking since these timing checkpoints require synchronization
                             of the asynchronous Pytorch operations and may slow down the model.
        :type benchmarking: bool
        :return: An instance of the Inferencer.

        """
        if tokenizer_args is None:
            tokenizer_args = {}

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = BaseAdaptiveModel.load(load_dir=model_name_or_path,
                                           device=device,
                                           strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

        # b) or from remote transformers model hub
        else:
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification', 'ner'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path,
                revision=revision,
                device=device,
                task_type=task_type)
            processor = Processor.convert_from_transformers(
                model_name_or_path,
                revision=revision,
                task_type=task_type,
                max_seq_len=max_seq_len,
                doc_stride=doc_stride,
                tokenizer_class=tokenizer_class,
                tokenizer_args=tokenizer_args,
                use_fast=use_fast)

        # override processor attributes loaded from config or HF with inferencer params
        processor.max_seq_len = max_seq_len
        processor.multithreading_rust = multithreading_rust
        if hasattr(processor, "doc_stride"):
            assert doc_stride < max_seq_len, "doc_stride is longer than max_seq_len. This means that there will be gaps " \
                                             "as the passage windows slide, causing the model to skip over parts of the document. " \
                                             "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384) "
            processor.doc_stride = doc_stride

        return cls(model,
                   processor,
                   task_type=task_type,
                   batch_size=batch_size,
                   gpu=gpu,
                   name=name,
                   return_class_probs=return_class_probs,
                   extraction_strategy=extraction_strategy,
                   extraction_layer=extraction_layer,
                   s3e_stats=s3e_stats,
                   num_processes=num_processes,
                   disable_tqdm=disable_tqdm,
                   benchmarking=benchmarking,
                   dummy_ph=dummy_ph)
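
# A minimal usage sketch (not part of the original examples) for the load() method
# documented above. The model name, inputs and parameter values are illustrative
# assumptions; as the docstring warns, the multiprocessing pool must be closed manually.
qa_inferencer = Inferencer.load("deepset/roberta-base-squad2",
                                task_type="question_answering",
                                batch_size=16,
                                gpu=True,
                                num_processes=2)
qa_input = [{"questions": ["Who counted the game among the best ever made?"],
             "text": "Twilight Princess was released to universal critical acclaim."}]
answers = qa_inferencer.inference_from_dicts(dicts=qa_input)
qa_inferencer.close_multiprocessing_pool()  # required whenever multiprocessing is enabled
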
    def perform_fine_tuning(current_info_need,
                            bert_model,
                            label_list,
                            num_epochs,
                            condition,
                            folds=10,
                            stratified=True,
                            learning_rate=2e-5,
                            batch_size=32,
                            embeds_dropout_prob=.1):

        ## Define evaluation metrics ##
        def evaluation_metrics(preds, labels):
            acc = simple_accuracy(preds, labels).get("acc")
            f1other = f1_score(y_true=labels, y_pred=preds, pos_label="Other")
            f1infoneed = f1_score(y_true=labels,
                                  y_pred=preds,
                                  pos_label=current_info_need)
            recall_infoneed = recall_score(y_true=labels,
                                           y_pred=preds,
                                           pos_label=current_info_need)
            precision_infoneed = precision_score(y_true=labels,
                                                 y_pred=preds,
                                                 pos_label=current_info_need)
            recall_other = recall_score(y_true=labels,
                                        y_pred=preds,
                                        pos_label="Other")
            precision_other = precision_score(y_true=labels,
                                              y_pred=preds,
                                              pos_label="Other")
            recall_macro = recall_score(y_true=labels,
                                        y_pred=preds,
                                        average="macro")
            precision_macro = precision_score(y_true=labels,
                                              y_pred=preds,
                                              average="macro")
            recall_micro = recall_score(y_true=labels,
                                        y_pred=preds,
                                        average="micro")
            precision_micro = precision_score(y_true=labels,
                                              y_pred=preds,
                                              average="micro")
            recall_weighted = recall_score(y_true=labels,
                                           y_pred=preds,
                                           average="weighted")
            precision_weighted = precision_score(y_true=labels,
                                                 y_pred=preds,
                                                 average="weighted")
            f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
            f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
            mcc = matthews_corrcoef(labels, preds)
            f1weighted = f1_score(y_true=labels,
                                  y_pred=preds,
                                  average="weighted")

            return {
                "info_need": current_info_need,
                "model": bert_model,
                "num_epochs": num_epochs,
                "condition": condition,
                "acc": acc,
                "f1_other": f1other,
                "f1_infoneed": f1infoneed,
                "precision_infoneed": precision_infoneed,
                "recall_infoneed": recall_infoneed,
                "recall_other": recall_other,
                "precision_other": precision_other,
                "recall_macro": recall_macro,
                "precision_macro": precision_macro,
                "recall_micro": recall_micro,
                "precision_micro": precision_micro,
                "recall_weighted": recall_weighted,
                "precision_weighted": precision_weighted,
                "f1_weighted": f1weighted,
                "f1_macro": f1macro,
                "f1_micro": f1micro,
                "f1_weighted": f1weighted,
                "mcc": mcc
            }

        register_metrics(
            f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs',
            evaluation_metrics)
        metric = f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs'
        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=True)
        logger, ml_logger = init_logging()
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=bert_model,
                                   do_lower_case=False)

        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            max_seq_len=256,
            train_filename=
            f"{current_info_need}_{condition}_{num_epochs}_epochs_train.csv",
            test_filename=
            f"{current_info_need}_{condition}_{num_epochs}_epochs_test.csv",
            data_dir="data/",
            label_list=label_list,
            metric=metric,
            text_column_name="utterance",
            label_column_name=level,  # NOTE: "level" is not defined in this snippet; it presumably comes from the enclosing scope
            delimiter=";")

        data_silo = DataSilo(processor=processor, batch_size=batch_size)

        silos = DataSiloForCrossVal.make(data_silo,
                                         n_splits=folds,
                                         sets=['train', 'test'])

        # the following steps should be run for each of the folds of the cross validation, so we put them
        # into a function
        def train_on_split(silo_to_use, n_fold, save_dir):
            logger.info(
                f"############ Crossvalidation: Fold {n_fold} ############")
            # Create an AdaptiveModel
            # a) which consists of a pretrained language model as a basis
            language_model = LanguageModel.load(bert_model)
            # b) and a prediction head on top that is suited for our task => Text classification
            prediction_head = TextClassificationHead(
                class_weights=data_silo.calculate_class_weights(
                    task_name="text_classification"),
                num_labels=len(label_list))

            model = AdaptiveModel(language_model=language_model,
                                  prediction_heads=[prediction_head],
                                  embeds_dropout_prob=embeds_dropout_prob,
                                  lm_output_types=["per_sequence"],
                                  device=device)

            # Create an optimizer
            model, optimizer, lr_schedule = initialize_optimizer(
                model=model,
                learning_rate=learning_rate,
                device=device,
                n_batches=len(silo_to_use.loaders["train"]),
                n_epochs=num_epochs,
                use_amp=None)

            # Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
            # Also create an EarlyStopping instance and pass it on to the trainer

            # An early stopping instance can be used to save the model that performs best on the dev set
            # according to some metric and stop training when no improvement is happening for some iterations.
            # NOTE: Using a different save directory for each fold allows us to use the
            # best model of each fold in an ensemble afterwards!
            save_dir = Path(str(save_dir) + f"-{n_fold}")
            earlystopping = EarlyStopping(
                metric="f1_infoneed",  # use the metric from our own metrics function instead of loss
                mode="max",
                save_dir=save_dir,  # where to save the best model
                patience=5  # number of evaluations to wait for improvement before terminating training
            )

            trainer = Trainer(model=model,
                              optimizer=optimizer,
                              data_silo=silo_to_use,
                              epochs=num_epochs,
                              n_gpu=n_gpu,
                              lr_schedule=lr_schedule,
                              evaluate_every=100,
                              device=device,
                              early_stopping=earlystopping,
                              evaluator_test=False)

            # train it
            trainer.train()

            return trainer.model

        # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
        # on the test set of each fold
        # Remember all the results for overall metrics over all predictions of all folds and for averaging
        allresults = []
        all_preds = []
        all_labels = []
        bestfold = None
        bestf1_info_need = -1
        language_model_name = bert_model
        if language_model_name.find("/") != -1:
            language_model_name = language_model_name.replace("/", "_")
        save_dir = Path(
            f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}"
        )
        for num_fold, silo in enumerate(silos):
            model = train_on_split(silo, num_fold, save_dir)

            # do eval on test set here (and not in Trainer),
            #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
            evaluator_test = Evaluator(
                data_loader=silo.get_data_loader("test"),
                tasks=silo.processor.tasks,
                device=device)
            result = evaluator_test.eval(model, return_preds_and_labels=True)
            evaluator_test.log_results(result,
                                       "Test",
                                       steps=len(silo.get_data_loader("test")),
                                       num_fold=num_fold)

            allresults.append(result)
            all_preds.extend(result[0].get("preds"))
            all_labels.extend(result[0].get("labels"))

            # keep track of best fold
            f1_info_need = result[0]["f1_infoneed"]
            if f1_info_need > bestf1_info_need:
                bestf1_info_need = f1_info_need
                bestfold = num_fold

            # empty cache to avoid memory leaks and CUDA OOM across multiple folds
            model.cpu()
            torch.cuda.empty_cache()

        # Save the per-fold results to json for a separate, more detailed analysis
        with open(
                f"classification_results/test/{current_info_need}-{language_model_name}-{condition}-{num_epochs}_epochs-{folds}-fold-cv.results.json",
                "wt") as fp:
            json.dump(allresults, fp)

        # calculate overall metrics across all folds
        xval_f1_other = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 pos_label="Other")
        xval_f1_info_need = f1_score(all_labels,
                                     all_preds,
                                     labels=label_list,
                                     pos_label=current_info_need)
        xval_f1_micro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="micro")
        xval_f1_macro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="macro")
        xval_mcc = matthews_corrcoef(all_labels, all_preds)

        xval_overall_results = {
            "xval_f1_other": xval_f1_other,
            f"xval_f1_infoneed": xval_f1_info_need,
            "xval_f1_micro": xval_f1_micro,
            "xval_f1_macro": xval_f1_macro,
            "xval_f1_mcc": xval_mcc
        }

        logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
        logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
        logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
        logger.info(
            f"XVAL F1 {current_info_need} {condition} {num_epochs} epochs:   {xval_f1_info_need}"
        )
        logger.info(f"XVAL MCC: {xval_mcc}")

        # -----------------------------------------------------
        # Just for illustration, use the best model from the best cross-validation fold for evaluation on
        # the original (still unseen) test set.
        logger.info(
            "###### Final Eval on hold out test set using best model #####")
        evaluator_origtest = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)
        # restore model from the best fold
        lm_name = model.language_model.name
        save_dir = Path(
            f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}-{bestfold}"
        )
        model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        result = evaluator_origtest.eval(model)
        logger.info("TEST F1 MICRO: {}".format(result[0]["f1_micro"]))
        logger.info("TEST F1 MACRO: {}".format(result[0]["f1_macro"]))
        logger.info("TEST F1 OTHER: {}".format(result[0]["f1_other"]))
        logger.info("TEST F1 {0}: {1}".format(current_info_need,
                                              result[0]["f1_infoneed"]))
        logger.info("TEST MCC:  {}".format(result[0]["mcc"]))

        test_set_results = {
            "test_f1_other": result[0]["f1_other"],
            "test_f1_infoneed": result[0][f"f1_infoneed"],
            "test_f1_micro": result[0]["f1_micro"],
            "test_f1_macro": result[0]["f1_macro"],
            "test_f1_mcc": result[0]["mcc"]
        }
Example #15
0
def doc_classifcation():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    dev_split = 0.1
    dev_stratification = True
    max_processes = 1    # 128 is default
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the GermEval 2018 data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("../data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            dev_split=dev_split,
                                            dev_stratification=dev_stratification,
                                            label_column_name="coarse_label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        max_processes=max_processes,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()
Example #16
0
def test_dpr_modules(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1.Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=
        "facebook/dpr-question_encoder-single-nq-base",
        do_lower_case=True,
        use_fast=True)
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base",
        do_lower_case=True,
        use_fast=True)

    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=256,
        max_seq_len_passage=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        dev_filename="nq-dev.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1)

    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRQuestionEncoder",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0)
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRContextEncoder",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0)

    prediction_head = TextSimilarityHead(similarity_function="dot_product")

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.0,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    model.connect_heads_with_processor(processor.tasks)

    assert type(model) == BiAdaptiveModel
    assert type(processor) == TextSimilarityProcessor
    assert type(question_language_model) == DPRQuestionEncoder
    assert type(passage_language_model) == DPRContextEncoder

    # check embedding layer weights
    assert list(model.named_parameters())[0][1][
        0, 0].item() - -0.010200000368058681 < 0.0001

    d = {
        'query':
        'big little lies season 2 how many episodes',
        'passages': [{
            'title': 'Big Little Lies (TV series)',
            'text':
            'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley',
            'label': 'positive',
            'external_id': '18768923'
        }, {
            'title': 'Little People, Big World',
            'text':
            'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
            'label': 'hard_negative',
            'external_id': '7459116'
        }, {
            'title': 'Cormac McCarthy',
            'text':
            'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
            'label': 'negative',
            'passage_id': '2145653'
        }]
    }

    dataset, tensor_names, _ = processor.dataset_from_dicts(
        dicts=[d], return_baskets=False)
    features = {
        key: val.unsqueeze(0).to(device)
        for key, val in zip(tensor_names, dataset[0])
    }

    # test features
    assert torch.all(
        torch.eq(
            features["query_input_ids"][0][:10].cpu(),
            torch.tensor(
                [101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102])))
    assert torch.all(
        torch.eq(
            features["passage_input_ids"][0][0][:10].cpu(),
            torch.tensor(
                [101, 2502, 2210, 3658, 1006, 2694, 2186, 1007, 102, 2186])))
    assert len(features["query_segment_ids"][0].nonzero()) == 0
    assert len(features["passage_segment_ids"][0].nonzero()) == 0
    assert torch.all(
        torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(),
                 torch.tensor(list(range(10)))))
    assert torch.all(
        torch.eq(
            features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(),
            torch.tensor(list(range(127)))))
    assert torch.all(
        torch.eq(
            features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(),
            torch.tensor(list(range(143)))))

    # test model encodings
    query_vector = model.language_model1(**features)[0]
    passage_vector = model.language_model2(**features)[0]
    assert torch.all(
        torch.le(
            query_vector[0, :10].cpu() - torch.tensor([
                -0.2135, -0.4748, 0.0501, -0.0430, -0.1747, -0.0441, 0.5638,
                0.1405, 0.2285, 0.0893
            ]),
            torch.ones((1, 10)) * 0.0001))
    assert torch.all(
        torch.le(
            passage_vector[0, :10].cpu() - torch.tensor([
                0.0557, -0.6836, -0.3645, -0.5566, 0.2034, -0.3656, 0.2969,
                -0.0555, 0.3405, -0.8691
            ]),
            torch.ones((1, 10)) * 0.0001))
    assert torch.all(
        torch.le(
            passage_vector[1, :10].cpu() - torch.tensor([
                -0.2006, -1.5002, -0.1897, -0.3421, -0.0405, -0.0471, -0.0306,
                0.1156, 0.3350, -0.3412
            ]),
            torch.ones((1, 10)) * 0.0001))

    # test logits and loss
    embeddings = model(**features)
    query_emb, passage_emb = embeddings[0]
    assert torch.all(torch.eq(query_emb.cpu(), query_vector.cpu()))
    assert torch.all(torch.eq(passage_emb.cpu(), passage_vector.cpu()))

    loss = model.logits_to_loss_per_head(embeddings, **features)
    similarity_scores = model.prediction_heads[0]._embeddings_to_scores(
        query_emb, passage_emb).cpu()
    assert torch.all(
        torch.le(
            similarity_scores - torch.tensor([[-1.8311e-03, -6.3016e+00]]),
            torch.ones((1, 2)) * 0.0001))
    assert (loss[0].item() - 0.0018) <= 0.0001
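
# A minimal sketch (an assumption, not the exact FARM implementation) of what the
# "dot_product" similarity used by TextSimilarityHead above boils down to: every
# query embedding is scored against every passage embedding with a dot product.
import torch

def dot_product_scores(query_emb: torch.Tensor, passage_emb: torch.Tensor) -> torch.Tensor:
    # query_emb: [n_queries, emb_dim], passage_emb: [n_passages, emb_dim]
    # returns a [n_queries, n_passages] matrix of similarity scores
    return torch.matmul(query_emb, passage_emb.transpose(0, 1))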
Example #17
0
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'LinearWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Albrecht Lehman ist eine Person"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    #print(result)
    #assert result[0]["predictions"][0]["context"] == "sagte"
    #assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
    result2 = model.inference_from_dicts(dicts=basic_texts,
                                         rest_api_schema=True)
    assert result == result2
Example #18
0
def train_evaluation_single(seed=42):
    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    # GPU utilization on 4x V100
    # 40*4, 14.3/16GB on master, 12.6/16 on others
    batch_size = 40 * 4
    n_epochs = 2
    evaluate_every = 2000000  # disabling dev eval
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # Load model and train
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("testsave/data/squad20"),
    )
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)
    language_model = LanguageModel.load(lang_model)
    prediction_head = QuestionAnsweringHead(n_best=5)
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    starttime = time()
    trainer.train()
    elapsed = time() - starttime

    save_dir = Path("testsave/roberta-qa-dev")
    model.save(save_dir)
    processor.save(save_dir)

    # Create Evaluator
    evaluator = Evaluator(data_loader=data_silo.get_data_loader("dev"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100

    print(results)
    print(elapsed)

    gold_f1 = 82.155
    gold_EM = 77.714
    gold_tnrecall = 97.3721  #
    gold_elapsed = 1135
    np.testing.assert_allclose(
        f1_score,
        gold_f1,
        rtol=0.01,
        err_msg=f"FARM Training changed for f1 score by: {f1_score - gold_f1}")
    np.testing.assert_allclose(
        em_score,
        gold_EM,
        rtol=0.01,
        err_msg=f"FARM Training changed for EM by: {em_score - gold_EM}")
    np.testing.assert_allclose(
        tnacc,
        gold_tnrecall,
        rtol=0.01,
        err_msg=
        f"FARM Training changed for top 5 accuracy by: {tnacc - gold_tnrecall}")
    np.testing.assert_allclose(
        elapsed,
        gold_elapsed,
        rtol=0.1,
        err_msg=
        f"FARM Training speed changed significantly by: {elapsed - gold_elapsed} seconds"
    )
Example #19
0
def test_doc_regression(data_dir_path, text_column_name, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    rp_params = dict(tokenizer=tokenizer,
                     max_seq_len=8,
                     data_dir=Path(data_dir_path),
                     train_filename="train-sample.tsv",
                     dev_filename="test-sample.tsv",
                     test_filename=None,
                     label_column_name="label")

    if text_column_name is not None:
        rp_params["text_column_name"] = text_column_name

    processor = RegressionProcessor(**rp_params)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead()
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'CosineWarmup',
            'warmup_proportion': 0.1
        })

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/doc_regr")
    model.save(save_dir)
    processor.save(save_dir)

    del model
    del processor
    del optimizer
    del data_silo
    del trainer

    basic_texts = [
        {
            "text":
            "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."
        },
        {
            "text":
            "it just did not fit right. The top is very thin showing everything."
        },
    ]

    model = Inferencer.load(save_dir, num_processes=0)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["pred"], np.float32)
    del model
Example #20
0
def test_evaluation():
    ##########################
    ########## Settings
    ##########################
    lang_model = "deepset/roberta-base-squad2"
    do_lower_case = False

    test_assertions = True

    data_dir = Path("testsave/data/squad20")
    evaluation_filename = "dev-v2.0.json"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # loading models and evals
    model = AdaptiveModel.convert_from_transformers(
        lang_model, device=device, task_type="question_answering")
    model.prediction_heads[0].no_ans_boost = 0
    model.prediction_heads[0].n_best = 1

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=256,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        test_filename=evaluation_filename,
        data_dir=data_dir,
        doc_stride=128,
    )

    starttime = time()

    data_silo = DataSilo(processor=processor, batch_size=40 * 4)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)
    model, _ = optimize_model(model=model,
                              device=device,
                              local_rank=-1,
                              optimizer=None,
                              distributed=False,
                              use_amp=None)

    evaluator = Evaluator(data_loader=data_silo.get_data_loader("test"),
                          tasks=data_silo.processor.tasks,
                          device=device)

    # 1. Test FARM internal evaluation
    results = evaluator.eval(model)
    f1_score = results[0]["f1"] * 100
    em_score = results[0]["EM"] * 100
    tnacc = results[0]["top_n_accuracy"] * 100
    elapsed = time() - starttime
    print(results)
    print(elapsed)

    gold_EM = 77.7478
    gold_f1 = 82.1557
    gold_tnacc = 84.0646  # top 1 recall
    gold_elapsed = 40  # 4x V100
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=f"FARM Eval changed for EM by: {em_score-gold_EM}")
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=f"FARM Eval changed for f1 score by: {f1_score-gold_f1}")
        np.testing.assert_allclose(
            tnacc,
            gold_tnacc,
            rtol=0.001,
            err_msg=
            f"FARM Eval changed for top 1 accuracy by: {tnacc - gold_tnacc}")
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"FARM Eval speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )

    # # 2. Test FARM predictions with outside eval script
    starttime = time()
    model = Inferencer(model=model,
                       processor=processor,
                       task_type="question_answering",
                       batch_size=40 * 4,
                       gpu=device.type == "cuda")
    filename = data_dir / evaluation_filename
    result = model.inference_from_file(file=filename,
                                       return_json=False,
                                       multiprocessing_chunksize=80)
    results_squad = [x.to_squad_eval() for x in result]

    elapsed = time() - starttime

    os.makedirs("../testsave", exist_ok=True)
    write_squad_predictions(predictions=results_squad,
                            predictions_filename=filename,
                            out_filename="testsave/predictions.json")
    script_params = {
        "data_file": filename,
        "pred_file": "testsave/predictions.json",
        "na_prob_thresh": 1,
        "na_prob_file": False,
        "out_file": False
    }
    results_official = squad_evaluation.main(OPTS=DotMap(script_params))
    f1_score = results_official["f1"]
    em_score = results_official["exact"]

    gold_EM = 78.4890
    gold_f1 = 81.7104
    gold_elapsed = 27  # 4x V100
    print(elapsed)
    if test_assertions:
        np.testing.assert_allclose(
            em_score,
            gold_EM,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for EM by: {em_score - gold_EM}"
        )
        np.testing.assert_allclose(
            f1_score,
            gold_f1,
            rtol=0.001,
            err_msg=
            f"Eval with official script changed for f1 score by: {f1_score - gold_f1}"
        )
        np.testing.assert_allclose(
            elapsed,
            gold_elapsed,
            rtol=0.1,
            err_msg=
            f"Inference speed changed significantly by: {elapsed - gold_elapsed} seconds"
        )
Example #21
0
    def __init__(self,
                 model_name_or_path: Union[str, Path],
                 model_version: Optional[str] = None,
                 context_window_size: int = 150,
                 batch_size: int = 50,
                 use_gpu: bool = True,
                 no_ans_boost: float = 0.0,
                 return_no_answer: bool = False,
                 top_k: int = 10,
                 top_k_per_candidate: int = 3,
                 top_k_per_sample: int = 1,
                 num_processes: Optional[int] = None,
                 max_seq_len: int = 256,
                 doc_stride: int = 128,
                 progress_bar: bool = True,
                 duplicate_filtering: int = 0,
                 use_confidence_scores: bool = True):
        """
        :param model_name_or_path: Directory of a saved model or the name of a public model e.g. 'bert-base-cased',
        'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'.
        See https://huggingface.co/models for full list of available models.
        :param model_version: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :param context_window_size: The size, in characters, of the window around the answer span that is used when
                                    displaying the context around the answer.
        :param batch_size: Number of samples the model receives in one batch for inference.
                           Memory consumption is much lower in inference mode. Recommendation: Increase the batch size
                           to a value so only a single batch is used.
        :param use_gpu: Whether to use GPU (if available)
        :param no_ans_boost: How much the no_answer logit is boosted/increased.
        If set to 0 (default), the no_answer logit is not changed.
        If a negative number, there is a lower chance of "no_answer" being predicted.
        If a positive number, there is an increased chance of "no_answer" being predicted.
        :param return_no_answer: Whether to include no_answer predictions in the results.
        :param top_k: The maximum number of answers to return
        :param top_k_per_candidate: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text).
        Note that this is not the number of "final answers" you will receive
        (see `top_k` in FARMReader.predict() or Finder.get_answers() for that)
        and that FARM includes no_answer in the sorted list of predictions.
        :param top_k_per_sample: How many answers to extract from each small text passage that the model can process at once
        (one "candidate doc" is usually split into many smaller "passages").
        You usually want a very small value here, as it slows down inference
        and you don't gain much of quality by having multiple answers from one passage.
        Note that this is not the number of "final answers" you will receive
        (see `top_k` in FARMReader.predict() or Finder.get_answers() for that)
        and that FARM includes no_answer in the sorted list of predictions.
        :param num_processes: The number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                              multiprocessing. Set to None to let Inferencer determine optimum number. If you
                              want to debug the Language Model, you might need to disable multiprocessing!
        :param max_seq_len: Max sequence length of one input text for the model
        :param doc_stride: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``)
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        :param duplicate_filtering: Answers are filtered based on their position. Both start and end position of the answers are considered.
                                    The higher the value, the further apart answers may lie and still be filtered out as duplicates. 0 corresponds to exact duplicates. -1 turns off duplicate removal.
        """

        # save init parameters to enable export of component config as YAML
        self.set_config(model_name_or_path=model_name_or_path,
                        model_version=model_version,
                        context_window_size=context_window_size,
                        batch_size=batch_size,
                        use_gpu=use_gpu,
                        no_ans_boost=no_ans_boost,
                        return_no_answer=return_no_answer,
                        top_k=top_k,
                        top_k_per_candidate=top_k_per_candidate,
                        top_k_per_sample=top_k_per_sample,
                        num_processes=num_processes,
                        max_seq_len=max_seq_len,
                        doc_stride=doc_stride,
                        progress_bar=progress_bar,
                        duplicate_filtering=duplicate_filtering,
                        use_confidence_scores=use_confidence_scores)

        self.return_no_answers = return_no_answer
        self.top_k = top_k
        self.top_k_per_candidate = top_k_per_candidate
        self.inferencer = QAInferencer.load(model_name_or_path,
                                            batch_size=batch_size,
                                            gpu=use_gpu,
                                            task_type="question_answering",
                                            max_seq_len=max_seq_len,
                                            doc_stride=doc_stride,
                                            num_processes=num_processes,
                                            revision=model_version,
                                            disable_tqdm=not progress_bar,
                                            strict=False)
        self.inferencer.model.prediction_heads[
            0].context_window_size = context_window_size
        self.inferencer.model.prediction_heads[0].no_ans_boost = no_ans_boost
        self.inferencer.model.prediction_heads[
            0].n_best = top_k_per_candidate + 1  # including possible no_answer
        try:
            self.inferencer.model.prediction_heads[
                0].n_best_per_sample = top_k_per_sample
        except:
            logger.warning(
                "Could not set `top_k_per_sample` in FARM. Please update FARM version."
            )
        try:
            self.inferencer.model.prediction_heads[
                0].duplicate_filtering = duplicate_filtering
        except:
            logger.warning(
                "Could not set `duplicate_filtering` in FARM. Please update FARM version."
            )
        self.max_seq_len = max_seq_len
        self.use_gpu = use_gpu
        self.progress_bar = progress_bar
        self.device, _ = initialize_device_settings(use_cuda=self.use_gpu)
        self.use_confidence_scores = use_confidence_scores
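
# A hypothetical construction sketch (not from the original example) for the reader whose
# __init__ is documented above. The class name FARMReader is taken from the docstring
# references and all parameter values are illustrative, not recommendations.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",
                    context_window_size=150,
                    batch_size=50,
                    use_gpu=True,
                    no_ans_boost=0.0,        # 0 leaves the no_answer logit unchanged
                    return_no_answer=False,
                    top_k_per_candidate=3,
                    doc_stride=128)
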
def test_doc_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path("samples/doc_class"),
        train_filename=Path("train-sample.tsv"),
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        label_column_name="coarse_label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = DistilBert.load(lang_model)
    prediction_head = TextClassificationHead()
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts=None)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = Path("testsave/doc_class")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [{
        "text": "Malte liebt Berlin."
    }, {
        "text":
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."
    }]

    inf = Inferencer.load(save_dir, batch_size=2)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
Example #23
0
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="SQuAD", run_name="qa_albert")

#########################
######## Settings
########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=False)
batch_size = 32
n_epochs = 2
evaluate_every = 1
base_LM_model = "albert"
train_filename = "train-v2.0.json"
dev_filename = "dev-v2.0.json"
save_dir = "../saved_models/qa_medium_albert"
inference_file = "../data/squad20/subsets/5ad3ff1b604f3c001a3ffc74.json"
predictions_file = save_dir + "/predictions.json"
full_predictions_file = save_dir + "/full_predictions.json"
inference_multiprocessing = False
train = False
inference = True

if train:
Example #24
0
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # The TextPairClassificationProcessor expects a tab-separated file with columns called "text", "text_b" and "label" (a sample layout is sketched after this example)
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        max_seq_len=128,
        dev_filename="dev.tsv",
        test_filename=None,
        data_dir=Path("../data/asnq_binary"),
        delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-6,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    basic_texts = [
        {
            "text":
            "how many times have real madrid won the champions league in a row",
            "text_b":
            "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
        },
        {
            "text": "how many seasons of the blacklist are there on netflix",
            "text_b": "Retrieved March 27 , 2018 ."
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
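    # Optional follow-up: a minimal sketch of consuming the inference output.
    # The output schema varies across FARM versions, so the "predictions",
    # "label" and "probability" keys below are assumptions to verify against
    # your installed release.
    for batch in result:
        for pred in batch.get("predictions", []):
            print(f"label={pred.get('label')}  probability={pred.get('probability')}")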
Example #25
def test_qa(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "distilbert-base-uncased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model,
                               do_lower_case=True)
    label_list = ["start_token", "end_token"]
    processor = SquadProcessor(tokenizer=tokenizer,
                               max_seq_len=20,
                               doc_stride=10,
                               max_query_length=6,
                               train_filename="train-sample.json",
                               dev_filename="dev-sample.json",
                               test_filename=None,
                               data_dir=Path("samples/qa"),
                               label_list=label_list,
                               metric="squad")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(base_LM_model)
    prediction_head = QuestionAnsweringHead()
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)
    trainer.train()
    save_dir = Path("testsave/qa")
    model.save(save_dir)
    processor.save(save_dir)

    inferencer = Inferencer.load(save_dir,
                                 batch_size=2,
                                 gpu=False,
                                 num_processes=0)

    qa_format_1 = [{
        "questions": ["Who counted the game among the best ever made?"],
        "text":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]
    qa_format_2 = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
    }]

    result1 = inferencer.inference_from_dicts(dicts=qa_format_1)
    result2 = inferencer.inference_from_dicts(dicts=qa_format_2)
    assert result1 == result2
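    # Optional follow-up: a minimal sketch that reads the top answer out of result1,
    # assuming a "predictions" -> "answers" -> "answer" output schema (this path may
    # differ between FARM versions).
    top_answer = result1[0]["predictions"][0]["answers"][0].get("answer")
    print(f"Top answer: {top_answer}")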
Example #26
    def train(
        self,
        data_dir: str,
        train_filename: str,
        dev_filename: Optional[str] = None,
        test_file_name: Optional[str] = None,
        use_gpu: Optional[bool] = None,
        batch_size: int = 10,
        n_epochs: int = 2,
        learning_rate: float = 1e-5,
        max_seq_len: Optional[int] = None,
        warmup_proportion: float = 0.2,
        dev_split: Optional[float] = 0.1,
        evaluate_every: int = 300,
        save_dir: Optional[str] = None,
    ):
        """
        Fine-tune a model on a QA dataset. Options:
        - Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data)
        - Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain (e.g. using your labels collected via the haystack annotation tool)

        :param data_dir: Path to directory containing your training data in SQuAD style
        :param train_filename: filename of training data
        :param dev_filename: filename of dev / eval data
        :param test_file_name: filename of test data
        :param dev_split: Instead of specifying a dev_filename you can also specify a ratio (e.g. 0.1) here
                          that gets split off from the training data for evaluation.
        :param use_gpu: Whether to use GPU (if available)
        :param batch_size: Number of samples the model receives in one batch for training
        :param n_epochs: number of iterations on the whole training data set
        :param learning_rate: learning rate of the optimizer
        :param max_seq_len: maximum text length (in tokens). Everything longer gets cut down.
        :param warmup_proportion: Proportion of training steps until the maximum learning rate is reached.
                                  Until that point the LR increases linearly; after that it decreases linearly again.
                                  Options for different schedules are available in FARM.
        :param evaluate_every: Evaluate the model every X steps on the hold-out eval dataset
        :param save_dir: Path to store the final model
        :return: None
        """

        if dev_filename:
            dev_split = None

        set_all_seeds(seed=42)

        # For these variables, by default, we use the value set when initializing the FARMReader.
        # These can also be set manually when train() is called if you want different values for training vs. inference.
        if use_gpu is None:
            use_gpu = self.use_gpu
        if max_seq_len is None:
            max_seq_len = self.max_seq_len

        device, n_gpu = initialize_device_settings(use_cuda=use_gpu)

        if not save_dir:
            save_dir = f"../../saved_models/{self.inferencer.model.language_model.name}"

        # 1. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=self.inferencer.processor.tokenizer,
            max_seq_len=max_seq_len,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            dev_split=dev_split,
            test_filename=test_file_name,
            data_dir=Path(data_dir),
        )

        # 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
        # and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor,
                             batch_size=batch_size,
                             distributed=False)

        # 3. Create an optimizer and pass the already initialized model
        model, optimizer, lr_schedule = initialize_optimizer(
            model=self.inferencer.model,
            learning_rate=learning_rate,
            schedule_opts={
                "name": "LinearWarmup",
                "warmup_proportion": warmup_proportion
            },
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device)
        # 4. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )
        # 5. Let it grow!
        self.inferencer.model = trainer.train()
        self.save(Path(save_dir))
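A hypothetical usage sketch for the train() method above, assuming it is exposed on haystack's FARMReader; the import path, model name, directory and filenames below are placeholders rather than values taken from this example.

from haystack.reader.farm import FARMReader  # import path may differ between haystack versions

# Load an existing SQuAD2 reader and fine-tune it on your own SQuAD-style annotations.
reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2", use_gpu=True)
reader.train(
    data_dir="data/my_annotations",               # placeholder: directory with SQuAD-style JSON
    train_filename="answers.json",                # placeholder filename
    dev_split=0.1,
    n_epochs=2,
    batch_size=10,
    learning_rate=1e-5,
    save_dir="saved_models/my_finetuned_reader",  # placeholder save path
)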
Example #27
def train_from_scratch():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="from_scratch", run_name="debug")

    #########################
    ######## Settings
    #########################
    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    evaluate_every = 5000
    vocab_size = 30522
    # dev_filename = None
    save_dir = Path("saved_models/train_from_scratch")

    n_epochs = 10
    learning_rate = 1e-4
    warmup_proportion = 0.05
    batch_size = 16  # (probably only possible via gradient accumulation steps)
    max_seq_len = 64

    # 1. Create a tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=Path("data/lm_finetune_nips"),
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename="train.txt",
        dev_split=2000 / 8_000_000,
        dev_filename=None,
        test_filename=None,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    #    calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", vocab_size)

    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, vocab_size)
    next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": warmup_proportion
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=8,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        grad_acc_steps=8,
        checkpoint_root_dir=Path(
            "saved_models/train_from_scratch/checkpoints"),
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
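A small, self-contained illustration of the gradient accumulation used above: with batch_size = 16 and grad_acc_steps = 8, the optimizer only steps after eight forward/backward passes, so the effective batch size is 16 * 8 = 128. The toy model and loop below are illustrative only and are not FARM's internal training loop.

import torch

# Toy gradient accumulation: accumulate gradients over 8 micro-batches of size 16,
# then take a single optimizer step (effective batch size 128).
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
grad_acc_steps = 8
micro_batches = [torch.randn(16, 4) for _ in range(grad_acc_steps)]

optimizer.zero_grad()
for step, x in enumerate(micro_batches, start=1):
    loss = model(x).pow(2).mean() / grad_acc_steps  # scale so summed gradients match the full-batch mean
    loss.backward()
    if step % grad_acc_steps == 0:
        optimizer.step()
        optimizer.zero_grad()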
Example #28
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM",
                          run_name="Run_doc_regression")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 5
batch_size = 32
evaluate_every = 30
lang_model = "bert-base-cased"

# 1. Create a tokenizer
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
#    We do not have a sample dataset for regression yet; add your own dataset to run the example
processor = RegressionProcessor(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir="../data/<YOUR-DATASET>",
Example #29
def test_doc_regression(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = RegressionProcessor(tokenizer=tokenizer,
                                    max_seq_len=8,
                                    data_dir="samples/doc_regr",
                                    train_filename="train-sample.tsv",
                                    dev_filename="test-sample.tsv",
                                    test_filename=None,
                                    label_column_name="label")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead(layer_dims=[768, 1])
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1)

    trainer = Trainer(optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      warmup_linear=warmup_linear,
                      evaluate_every=evaluate_every,
                      device=device)

    model = trainer.train(model)

    save_dir = "testsave/doc_regr"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text":
            "The dress is just fabulous and it totally fits my size. The fabric is of great quality and the seams are really well hidden. I am super happy with this purchase and I am looking forward to trying some more from the same brand."
        },
        {
            "text":
            "it just did not fit right. The top is very thin showing everything."
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["pred"], np.float32)
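    # Optional follow-up: a minimal sketch that collects the raw regression values,
    # using the same output path as the assert above.
    predicted_values = [r["predictions"][0]["pred"] for r in result]
    print(predicted_values)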
Example #30
def test_s3e_fit():
    # small test data
    language_model = Path("samples/s3e/tiny_fasttext_model")
    corpus_path = Path("samples/s3e/tiny_corpus.txt")
    save_dir = Path("testsave/fitted_s3e/")
    do_lower_case = False
    batch_size = 2
    use_gpu = False

    # Fit S3E on a corpus
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=use_gpu, use_amp=False)

    # Create an InferenceProcessor
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=language_model,
                               do_lower_case=do_lower_case)
    processor = InferenceProcessor(tokenizer=tokenizer, max_seq_len=128)

    # Create an AdaptiveModel
    language_model = LanguageModel.load(language_model)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[],
                          embeds_dropout_prob=0.1,
                          lm_output_types=[],
                          device=device)

    model, processor, s3e_stats = fit_s3e_on_corpus(processor=processor,
                                                    model=model,
                                                    corpus=corpus_path,
                                                    n_clusters=3,
                                                    pca_n_components=30,
                                                    svd_postprocessing=True,
                                                    min_token_occurrences=1)

    # save everything to allow inference without fitting everything again
    model.save(save_dir)
    processor.save(save_dir)
    with open(save_dir / "s3e_stats.pkl", "wb") as f:
        pickle.dump(s3e_stats, f)

    # Load model, tokenizer and processor directly into Inferencer
    inferencer = Inferencer(model=model,
                            processor=processor,
                            task_type="embeddings",
                            gpu=use_gpu,
                            batch_size=batch_size,
                            extraction_strategy="s3e",
                            extraction_layer=-1,
                            s3e_stats=s3e_stats,
                            num_processes=0)

    # Input
    basic_texts = [
        {
            "text": "a man is walking on the street."
        },
        {
            "text": "a woman is walking on the street."
        },
    ]

    # Get embeddings for input text (you can vary the strategy and layer)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    assert result[0]["context"] == [
        'a', 'man', 'is', 'walking', 'on', 'the', 'street', '.'
    ]
    assert result[0]["vec"][0] - 0.00527727306941057 < 1e-6
    assert result[0]["vec"][-2] - 0.06285100416478565 < 1e-6