예제 #1
0
def test_processor_saving_loading(caplog):
    """Check that a saved and re-loaded processor yields an identical dataset.

    A TextClassificationProcessor is created, used to convert the sample file
    into a dataset, saved to disk, loaded back from that directory and used to
    convert the same file again. Both conversions must produce the same tensor
    names, the same number of tensors and element-wise equal tensors.

    Args:
        caplog: log-capture fixture (presumably pytest's ``caplog``); used to
            raise the captured log level to CRITICAL for the test.
    """
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    lang_model = "bert-base-cased"
    sample_file = "samples/doc_class/train-sample.tsv"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        dev_filename=None,
        test_filename=None,
        dev_split=0.1,
        columns=["text", "label", "unused"],
        label_list=["OTHER", "OFFENSE"],
        metrics=["f1_macro"])
    dicts = processor.file_to_dicts(file=sample_file)
    data, tensor_names = processor.dataset_from_dicts(dicts)

    save_dir = "testsave/processor"
    processor.save(save_dir)

    # Round trip: rebuild the processor from disk and featurize the same file.
    processor = processor.load_from_dir(save_dir)
    dicts = processor.file_to_dicts(file=sample_file)
    data_loaded, tensor_names_loaded = processor.dataset_from_dicts(dicts)

    assert tensor_names == tensor_names_loaded
    # Without this check, extra tensors in the re-loaded dataset would never
    # be looked at by the pairwise comparison below.
    assert len(data.tensors) == len(data_loaded.tensors)
    for original, reloaded in zip(data.tensors, data_loaded.tensors):
        assert torch.all(torch.eq(original, reloaded))
예제 #2
0
def evaluate_classification():
    """Evaluate a published sentiment model on a GermEval 2017 test file.

    The transformers checkpoint is converted into an AdaptiveModel, evaluated
    on the "test" data loader of a DataSilo, and the "f1_macro" value of the
    first result entry is printed.
    """
    # ---------------- Settings ----------------
    device, n_gpu = initialize_device_settings(use_cuda=True)
    model_name = "deepset/bert-base-german-cased-sentiment-Germeval17"
    lower_case = False
    eval_batch_size = 100

    corpus_dir = Path("../data/germeval17")
    test_file = "test_TIMESTAMP1.tsv"
    labels = ["negative", "neutral", "positive"]
    metric_name = "f1_macro"

    # Step 1: tokenizer for the chosen model.
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name,
                               do_lower_case=lower_case)

    # Step 2: processor that converts raw text into a pytorch Dataset.
    # Only a test file is configured; train/dev are switched off.
    # (Per the original example, the GermEval 2017 data is loaded
    # automatically if it is not available.)
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        data_dir=corpus_dir,
        test_filename=test_file,
        train_filename=None,
        dev_filename=None,
        dev_split=0,
        max_seq_len=384,
        label_list=labels,
        metric=metric_name,
    )

    # Step 3: DataSilo that loads the dataset and provides its DataLoaders.
    data_silo = DataSilo(processor=processor, batch_size=eval_batch_size)

    # Step 4: Evaluator working on the test loader.
    test_loader = data_silo.get_data_loader("test")
    evaluator = Evaluator(data_loader=test_loader,
                          tasks=data_silo.processor.tasks,
                          device=device)

    # Step 5: load the model from its transformers checkpoint.
    # For a local model that was trained with FARM use instead:
    #   AdaptiveModel.load(model_name, device=device)
    model = AdaptiveModel.convert_from_transformers(
        model_name, device=device, task_type="text_classification")
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    # Step 6: run the evaluation and report the score.
    results = evaluator.eval(model)
    print("Macro-averaged F1-Score:", results[0]["f1_macro"])
예제 #3
0
def test_prediction_head_load_save_class_weights(tmp_path, caplog=None):
    """Regression test for #428 and #422.

    Builds an AdaptiveModel whose text classification head carries class
    weights computed by the DataSilo, saves it to ``tmp_path`` and checks that
    loading it back on the CPU returns a model.

    Args:
        tmp_path: directory the model is saved to and loaded from.
        caplog: optional log-capture fixture; when given, its level is set to
            CRITICAL.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    model_name = "bert-base-german-cased"
    sample_dir = "samples/doc_class"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name,
                               do_lower_case=False)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir=Path(sample_dir),
        train_filename="train-sample.tsv",
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        label_column_name="coarse_label",
    )

    data_silo = DataSilo(processor=processor, batch_size=1)

    language_model = LanguageModel.load(model_name)
    # The class weights are the part of the head this regression test covers.
    weights = data_silo.calculate_class_weights(
        task_name="text_classification")
    head = TextClassificationHead(num_labels=2, class_weights=weights)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # Save / load round trip.
    model.save(tmp_path)
    reloaded = AdaptiveModel.load(tmp_path, device='cpu')
    assert reloaded is not None
예제 #4
0
def test_data_silo_for_holdout1():
    """Check the splits produced by DataSiloForHoldout.make on the train set.

    One silo per split is expected. For every silo the test indices must not
    overlap the indices of the original train subset, and the train indices
    must not overlap the dev indices.
    """
    n_splits = 5
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="distilbert-base-german-cased")
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("data/germeval18"),
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_split=0.2,
        dev_stratification=False,
        label_column_name="coarse_label",
    )
    data_silo = DataSilo(processor=processor, batch_size=32, max_processes=128)
    silos = DataSiloForHoldout.make(data_silo,
                                    sets=['train'],
                                    n_splits=n_splits,
                                    train_split=0.8)

    assert len(silos) == n_splits
    for silo in silos:
        train_ds = silo.get_data_loader("train").dataset
        dev_ds = silo.get_data_loader("dev").dataset
        test_ds = silo.get_data_loader("test").dataset

        # First member of the train dataset's `datasets`; it exposes both its
        # own indices and the dataset it was taken from.
        train_part = train_ds.datasets[0]
        train_ids = set(train_part.indices)
        dev_ids = set(dev_ds.indices)
        orig_train_ids = set(train_part.dataset.indices)
        test_ids = set(test_ds.indices)

        assert orig_train_ids.isdisjoint(test_ids)
        assert train_ids.isdisjoint(dev_ids)

    # Drop the references explicitly, as the original test does.
    del tokenizer, processor, data_silo, silos
예제 #5
0
def test_doc_classification(caplog):
    """Train a document classifier for one epoch and check its predictions.

    After training on the sample data, the model and processor are saved.
    Inference on a fixed list of texts must label the first text "OTHER" with
    a probability within 0.1 of 0.7, and inference with a processor loaded
    from the save directory must give the same results.

    Args:
        caplog: log-capture fixture; its level is set to CRITICAL.
    """
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 5
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        label_column_name="coarse_label",
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    # The head maps the 768-dim input to one output per configured label.
    n_labels = len(processor.tasks["text_classification"]["label_list"])
    prediction_head = TextClassificationHead(layer_dims=[768, n_labels])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/doc_class"
    model.save(save_dir)
    processor.save(save_dir)

    sentences = [
        "Martin Müller spielt Handball in Berlin.",
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei.",
        "Franzosen verteidigen 2:1-Führung – Kritische Stimmen zu Schwedens Superstar",
        "Neues Video von Designern macht im Netz die Runde",
        "23-jähriger Brasilianer muss vier Spiele pausieren – Entscheidung kann noch angefochten werden",
        "Aufständische verwendeten Chemikalie bei Gefechten im August.",
        "Bewährungs- und Geldstrafe für 26-Jährigen wegen ausländerfeindlicher Äußerung",
        "ÖFB-Teamspieler nur sechs Minuten nach seinem Tor beim 1:1 gegen Sunderland verletzt ausgewechselt",
        "Ein 31-jähriger Polizist soll einer 42-Jährigen den Knöchel gebrochen haben",
        "18 Menschen verschleppt. Kabul – Nach einem Hubschrauber-Absturz im Norden Afghanistans haben Sicherheitskräfte am Mittwoch versucht",
    ]
    basic_texts = [{"text": sentence} for sentence in sentences]

    # TODO: enable loading here again (Inferencer.load(save_dir)) after the
    # migration towards "processor.tasks" is finished.
    inf = Inferencer(model=model, processor=processor)
    result = inf.run_inference(dicts=basic_texts)
    top_prediction = result[0]["predictions"][0]
    assert top_prediction["label"] == "OTHER"
    assert abs(top_prediction["probability"] - 0.7) <= 0.1

    # Same model, but with the processor restored from disk.
    loaded_processor = TextClassificationProcessor.load_from_dir(save_dir)
    inf2 = Inferencer(model=model, processor=loaded_processor)
    result_2 = inf2.run_inference(dicts=basic_texts)
    paired = list(zip(result, result_2))
    pprint(paired)
    for first, second in paired:
        assert first == second


# if(__name__=="__main__"):
#     test_doc_classification()
예제 #6
0
def test_data_silo_for_cross_val_nested():
    """Check the silos created by nested cross validation.

    The number of silos must equal outer splits times inner splits. Silo 0 and
    silo 1 must share the same test indices, have different dev indices, and
    the total number of train + dev + test indices must match between them.
    """
    n_outer_splits = 3
    n_inner_splits = 3

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="distilbert-base-german-cased")

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("data/germeval18"),
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        label_column_name="coarse_label",
    )

    data_silo = DataSilo(processor=processor, batch_size=32)

    silos = DataSiloForCrossVal.make(data_silo,
                                     sets=['test', 'train'],
                                     n_splits=n_outer_splits,
                                     n_inner_splits=n_inner_splits)

    def checked_indices(all_silos, split):
        """Return the `split` indices of silo 0 and silo 1 after sanity checks."""
        first = all_silos[0].get_data_loader(split).dataset.indices
        second = all_silos[1].get_data_loader(split).dataset.indices
        for candidate in (first, second):
            assert candidate.size > 0
        for candidate in (first, second):
            assert candidate.ndim == 1
        return first, second

    # One silo per (outer fold, inner fold) combination.
    assert len(silos) == (n_outer_splits * n_inner_splits)

    # The outer cross validation creates the test set, so it must be the same
    # in silo 0 and silo 1.
    test_0, test_1 = checked_indices(silos, 'test')
    assert np.array_equal(test_0, test_1)

    # The inner cross validation creates the dev set, so it must differ
    # between silo 0 and silo 1.
    dev_0, dev_1 = checked_indices(silos, 'dev')
    assert not np.array_equal(dev_0, dev_1)

    # Train sets only get the basic sanity checks.
    train_0, train_1 = checked_indices(silos, 'train')

    # Size of dev + train + test must be the same on all folds.
    total_0 = train_0.size + dev_0.size + test_0.size
    total_1 = train_1.size + dev_1.size + test_1.size
    assert total_0 == total_1

    del tokenizer, processor, data_silo, silos
def doc_classifcation():
    """Train a German document classifier with a custom optimizer and schedule.

    The example fine-tunes "bert-base-german-cased" on the data in
    ../data/germeval18 using plain SGD with a linear warmup schedule, saves
    model and processor, and prints the inference result for two sample texts.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    # ---------------- Settings ----------------
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    # A local path works as well, e.g.
    #   lang_model = Path("../saved_models/farm-bert-base-cased")
    lang_model = "bert-base-german-cased"
    use_amp = None

    # ------- Custom optimizer & LR schedule -------
    # Learning rate schedules from transformers. Alternatives:
    #   {"name": "Constant"}
    #   {"name": "CosineWarmup", "warmup_proportion": 0.4}
    #   {"name": "CosineWarmupWithRestarts", "warmup_proportion": 0.4}
    # or from native pytorch (all options:
    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html):
    #   {"name": "StepLR", "step_size": 30, "gamma": 0.1}
    #   {"name": "ReduceLROnPlateau", "mode": 'min', "factor": 0.1, "patience":10}
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.4}

    # Optimizers from pytorch (all options:
    # https://pytorch.org/docs/stable/optim.html). Alternatives:
    #   from apex (https://github.com/NVIDIA/apex/tree/master/apex/optimizers):
    #     {"name": "FusedLAMB", "bias_correction": True}
    #   from transformers (default in FARM):
    #     {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    optimizer_opts = {"name": "SGD", "momentum": 0.0}

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # Step 1: tokenizer.
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # Step 2: processor that converts raw text into a pytorch Dataset
    # (GermEval 2018 data).
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        data_dir=Path("../data/germeval18"),
        max_seq_len=128,
        label_list=label_list,
        label_column_name="coarse_label",
        metric=metric,
    )

    # Step 3: DataSilo that loads the datasets (train/dev/test) and provides
    # DataLoaders for them.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: AdaptiveModel = pretrained language model + a text
    # classification head whose class weights come from the data silo.
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(
        task_name="text_classification")
    prediction_head = TextClassificationHead(class_weights=class_weights,
                                             num_labels=len(label_list))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 5: optimizer and schedule built from the options chosen above.
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-3,
        optimizer_opts=optimizer_opts,
        schedule_opts=schedule_opts,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp,
    )

    # Step 6: the Trainer runs training and evaluates every
    # `evaluate_every` steps.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # Step 7: train.
    trainer.train()

    # Step 8: store model and processor side by side.
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # Step 9: load the saved model for inference and print the predictions.
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    inferencer = Inferencer.load(save_dir)
    print(inferencer.inference_from_dicts(dicts=basic_texts))
예제 #8
0
def doc_classifcation():
    """Train a German document classifier with the default optimizer setup.

    The example fine-tunes "bert-base-german-cased" on the data in
    ../data/germeval18, saves model and processor, and prints the inference
    result for two sample texts.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    # ---------------- Settings ----------------
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    # A local path works as well, e.g.
    #   lang_model = Path("../saved_models/farm-bert-base-cased")
    lang_model = "bert-base-german-cased"
    use_amp = None

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # Step 1: tokenizer.
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # Step 2: processor that converts raw text into a pytorch Dataset
    # (GermEval 2018 data).
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        data_dir=Path("../data/germeval18"),
        max_seq_len=128,
        label_list=label_list,
        label_column_name="coarse_label",
        metric=metric,
    )

    # Step 3: DataSilo that loads the datasets (train/dev/test) and provides
    # DataLoaders for them.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: AdaptiveModel = pretrained language model + a text
    # classification head whose class weights come from the data silo.
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(
        task_name="text_classification")
    prediction_head = TextClassificationHead(class_weights=class_weights,
                                             num_labels=len(label_list))
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # Step 5: optimizer and learning rate schedule.
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp,
    )

    # Step 6: the Trainer runs training and evaluates every
    # `evaluate_every` steps.
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # Step 7: train.
    trainer.train()

    # Step 8: store model and processor side by side.
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # Step 9: load the saved model for inference and print the predictions.
    sample_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    inferencer = Inferencer.load(save_dir)
    print(inferencer.inference_from_dicts(dicts=sample_texts))
예제 #9
0
########## Settings
##########################
# Fix the random seeds and pick the device before anything is loaded.
set_all_seeds(seed=42)
batch_size = 32
use_gpu = True
device, n_gpu = initialize_device_settings(use_cuda=use_gpu)
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=lang_model, do_lower_case=False
)

# 2. Create a DataProcessor used only for inference. The
#    TextClassificationProcessor is reused here and data_dir can stay empty.
processor = TextClassificationProcessor(
    data_dir="", tokenizer=tokenizer, max_seq_len=128
)

# 4. Create an AdaptiveModel with a pretrained language model as a basis
language_model = Bert.load(lang_model)

# No prediction heads are attached and embedding dropout is set to 0; both the
# "per_token" and the "per_sequence" language model outputs are requested.
adaptive_model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[],
    embeds_dropout_prob=0,
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)

# 5. Extract embeddings with model in inference mode
basic_texts = [
예제 #10
0
def doc_classification_multilabel():
    """Train a multilabel text classifier on the toxic-comments data.

    The example fine-tunes "bert-base-uncased" with a multilabel head over six
    toxicity labels, saves model and processor, and prints the inference
    result for two sample texts.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    # ---------------- Settings ----------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 500
    lang_model = "bert-base-uncased"
    do_lower_case = True

    # Step 1: tokenizer.
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # Step 2: processor that converts raw text into a pytorch Dataset.
    # The data is read from ../data/toxic-comments (train.tsv / val.tsv).
    label_list = [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]
    metric = "acc"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        data_dir=Path("../data/toxic-comments"),
        train_filename="train.tsv",
        dev_filename="val.tsv",
        test_filename=None,
        dev_split=0,
        max_seq_len=128,
        label_list=label_list,
        label_column_name="label",
        metric=metric,
        quote_char='"',
        multilabel=True,
    )

    # Step 3: DataSilo that loads the datasets and provides DataLoaders.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: AdaptiveModel = pretrained language model + a multilabel
    # classification head with one output per label.
    language_model = LanguageModel.load(lang_model)
    prediction_head = MultiLabelTextClassificationHead(
        num_labels=len(label_list))
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # Step 5: optimizer and learning rate schedule.
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    # Step 6: the Trainer runs training and evaluates every
    # `evaluate_every` steps.
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # Step 7: train.
    trainer.train()

    # Step 8: store model and processor side by side.
    save_dir = Path("../saved_models/bert-german-multi-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # Step 9: load the saved model for inference and print the predictions.
    sample_texts = [
        {"text": "You f*****g bastards"},
        {"text": "What a lovely world"},
    ]
    inferencer = Inferencer.load(save_dir)
    print(inferencer.inference_from_dicts(dicts=sample_texts))
예제 #11
0
    pretrained_model_name_or_path=lang_model, do_lower_case=True)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load the multilabel data stored in ../data/toxic-comments.

# One label per toxicity category; multilabel=True is passed to the processor below.
label_list = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
metric = "acc"

# train.tsv and val.tsv are used as train and dev files; no test file is
# configured and dev_split is 0.
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="../data/toxic-comments",
                                        label_list=label_list,
                                        label_column_name="label",
                                        metric=metric,
                                        quote_char='"',
                                        multilabel=True,
                                        train_filename="train.tsv",
                                        dev_filename="val.tsv",
                                        test_filename=None,
                                        dev_split=0)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = MultiLabelTextClassificationHead(layer_dims=[
    768, len(processor.tasks["text_classification"]["label_list"])
예제 #12
0
# NOTE(review): label_list and batch_size are used below but are not defined
# in this fragment; they presumably come from earlier in the script.
lang_model = "xlnet-large-cased"
do_lower_case = False

tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                           do_lower_case=do_lower_case)

metric = "f1_macro"  # desired metric for evaluation

processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=20,  # maximum sequence length handed to the processor
    label_list=label_list,
    data_dir='new/',
    label_column_name="class",  # labels are read from the "class" column
    metric=metric,
    quote_char='"',
    multilabel=True,
    train_filename="train.tsv",
    dev_filename=None,
    test_filename="test.tsv",
    dev_split=0.1  # this will extract 10% of the train set to create a dev set
)

data_silo = DataSilo(processor=processor, batch_size=batch_size)

# load the pretrained language model named by lang_model ("xlnet-large-cased")
language_model = LanguageModel.load(lang_model)
# multilabel prediction head with one output per entry of label_list
prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))

model = AdaptiveModel(language_model=language_model,
예제 #13
0
def main(args):
    """Fine-tune a pretrained language model for one of two FARM tasks.

    Depending on ``args.task_name`` ("text_classification" or
    "question_answering") the matching processor and prediction head are
    built, combined into an AdaptiveModel and trained with early stopping.
    The directory ``Path(args.ckpt_path) / args.run_name`` receives the saved
    processor and is also used as data-silo cache, early-stopping save
    directory and checkpoint root.

    :param args: Object exposing the run configuration as attributes (for
                 example an ``argparse.Namespace``); the attributes that are
                 read are visible in the body below.
    :raises ValueError: If ``args.task_name`` is not one of the two supported
                        task names.
    """
    print(f"[INFO] PyTorch Version: {torch.__version__}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] Devices available: {device}")
    checkpoint_path = Path(args.ckpt_path) / args.run_name
    ml_logger = MLFlowLogger(tracking_uri=args.tracking_uri)
    ml_logger.init_experiment(experiment_name=args.experiment_name,
                              run_name=args.run_name)
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        do_lower_case=False)

    # Processor. This is the only place where the task name is validated;
    # everything below may rely on it being one of the two supported values.
    if args.task_name == "text_classification":
        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            train_filename=args.train_filename,
            dev_filename=None,
            test_filename=args.test_filename,
            header=0,
            max_seq_len=args.max_seq_len,
            data_dir=args.data_dir,
            label_list=args.label_list,
            metric=args.metric,
            label_column_name=args.label_column_name,
            text_column_name=args.text_column_name)
    elif args.task_name == "question_answering":
        # NOTE(review): the test file doubles as the dev set here - confirm
        # that this is intended.
        processor = SquadProcessor(tokenizer=tokenizer,
                                   train_filename=args.train_filename,
                                   dev_filename=args.test_filename,
                                   test_filename=args.test_filename,
                                   max_seq_len=args.max_seq_len,
                                   data_dir=args.data_dir,
                                   label_list=args.label_list,
                                   metric=args.metric,
                                   max_query_length=64,
                                   doc_stride=128,
                                   max_answers=1)
    else:
        raise ValueError("task name error")
    processor.save(checkpoint_path)

    # DataSilo
    data_silo = DataSilo(processor=processor,
                         batch_size=args.batch_size,
                         eval_batch_size=args.eval_batch_size,
                         caching=True,
                         cache_path=checkpoint_path)

    # LanguageModel: Build pretrained language model
    # NOTE(review): the language is hard-coded to "korean" regardless of the
    # checkpoint that is passed in.
    language_model = LanguageModel.load(args.pretrained_model_name_or_path,
                                        language="korean")

    # Task-specific pieces: prediction head, language-model output type and
    # the metric watched by early stopping.
    if args.task_name == "text_classification":
        # If you do classification on imbalanced classes, consider using class weights.
        # They change the loss function to down-weight frequent classes.
        prediction_head = TextClassificationHead(
            num_labels=len(args.label_list),
            class_weights=data_silo.calculate_class_weights(
                task_name=args.task_name))
        lm_output_types = ["per_sequence"]
        # NOTE(review): early stopping watches "acc" while the processor is
        # evaluated with args.metric - verify that "acc" is actually reported.
        earlymetric = "acc"
    else:  # "question_answering"; any other value was rejected above
        prediction_head = QuestionAnsweringHead(
            layer_dims=[768, 2],
            task_name=args.task_name,
        )
        lm_output_types = ["per_token"]
        earlymetric = "f1"

    # AdaptiveModel: Combine all
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=args.embeds_dropout_prob,
                          lm_output_types=lm_output_types,
                          device=device)

    # Initialize Optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        device=device,
        learning_rate=args.learning_rate,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=args.n_epochs)

    # EarlyStopping: both supported tasks track a score that should grow,
    # so the mode is always "max".
    earlystop = EarlyStopping(save_dir=checkpoint_path,
                              metric=earlymetric,
                              mode="max",
                              patience=5)

    # Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        lr_schedule=lr_schedule,
        data_silo=data_silo,
        early_stopping=earlystop,
        evaluate_every=args.evaluate_every,
        checkpoints_to_keep=args.checkpoints_to_keep,
        checkpoint_root_dir=checkpoint_path,
        checkpoint_every=args.checkpoint_every,
        epochs=args.n_epochs,
        n_gpu=args.n_gpu,
        device=device,
    )
    # now train!
    trainer.train()
def test_doc_classification():
    """Smoke-test document classification with RoBERTa on CPU.

    Trains on the sample data, saves model and processor to
    ``testsave/doc_class_roberta``, reloads them through the Inferencer and
    checks that the first predicted probability is a ``np.float32``.
    """
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)

    # Tiny run configuration so the test finishes quickly.
    lang_model = "roberta-base"
    n_epochs, batch_size, evaluate_every = 1, 1, 2

    processor = TextClassificationProcessor(
        tokenizer=RobertaTokenizer.from_pretrained(
            pretrained_model_name_or_path=lang_model),
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        max_seq_len=8,
        label_list=["OTHER", "OFFENSE"],
        label_column_name="coarse_label",
        metric="f1_macro",
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # The head's output dimension follows the number of labels known to the processor.
    label_count = len(processor.tasks["text_classification"]["label_list"])
    model = AdaptiveModel(
        language_model=Roberta.load(lang_model),
        prediction_heads=[TextClassificationHead(layer_dims=[768, label_count])],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        schedule_opts=None,
    )

    # This Trainer signature takes the model at train() time, not at construction.
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)

    # Persist both parts so the Inferencer can restore them from one directory.
    save_dir = "testsave/doc_class_roberta"
    model.save(save_dir)
    processor.save(save_dir)

    sentences = (
        "Martin Müller spielt Handball in Berlin.",
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei.",
    )
    basic_texts = [{"text": sentence} for sentence in sentences]

    inferencer = Inferencer.load(save_dir, batch_size=2)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    first_probability = result[0]["predictions"][0]["probability"]
    assert isinstance(first_probability, np.float32)
예제 #15
0
파일: infer.py 프로젝트: yon606/FARM
    def load(cls,
             model_name_or_path,
             batch_size=4,
             gpu=False,
             task_type=None,
             return_class_probs=False,
             strict=True,
             max_seq_len=256,
             doc_stride=128,
             extraction_layer=None,
             extraction_strategy=None,
             s3e_stats=None,
             num_processes=None,
             disable_tqdm=False):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param batch_size: Number of samples computed once per batch
        :type batch_size: int
        :param gpu: If GPU shall be used
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
                          Required when the model is fetched from the model hub.
        :type task_type: str
        :param return_class_probs: Passed through unchanged to the Inferencer constructor.
        :type return_class_probs: bool
        :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                       the PredictionHead (see torch.nn.module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older version of FARM.
        :type strict: bool
        :param max_seq_len: maximum length of one text sample. Also overrides the value stored with a
                            locally saved processor.
        :type max_seq_len: int
        :param doc_stride: Only QA: When input text is longer than max_seq_len it gets split into parts, strided by doc_stride
        :type doc_stride: int
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), reduce_max (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :param extraction_layer: number of layer from which the embeddings shall be extracted. The value
                                 (default ``None``) is forwarded as-is to the Inferencer constructor.
        :type extraction_layer: int
        :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                          (only needed for task_type="embeddings" and extraction_strategy = "s3e")
        :type s3e_stats: dict
        :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                              multiprocessing. Set to None to let Inferencer use all CPU cores. If you want to
                              debug the Language Model, you might need to disable multiprocessing!
        :type num_processes: int
        :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
        :type disable_tqdm: bool
        :return: An instance of the Inferencer.
        :raises ValueError: If the model has to be fetched from the model hub and `task_type` is missing
                            or not one of the supported values.

        """

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = BaseAdaptiveModel.load(load_dir=model_name_or_path,
                                           device=device,
                                           strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

            # override processor attributes loaded from config file with inferencer params
            processor.max_seq_len = max_seq_len
            if hasattr(processor, "doc_stride"):
                processor.doc_stride = doc_stride

        # b) or from remote transformers model hub
        else:
            logger.info(
                f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
            )
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification', 'ner'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path, device, task_type)
            config = AutoConfig.from_pretrained(model_name_or_path)
            tokenizer = Tokenizer.load(model_name_or_path)

            # TODO infer task_type automatically from config (if possible)
            if task_type == "question_answering":
                processor = SquadProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    label_list=["start_token", "end_token"],
                    metric="squad",
                    data_dir="data",
                    doc_stride=doc_stride)
            elif task_type == "embeddings":
                processor = InferenceProcessor(tokenizer=tokenizer,
                                               max_seq_len=max_seq_len)

            elif task_type == "text_classification":
                # Order the labels by class index 0..n-1 so that position i
                # in the list corresponds to the model's output i.
                label_list = [config.id2label[label_idx]
                              for label_idx in range(len(config.id2label))]
                processor = TextClassificationProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    data_dir="data",
                    label_list=label_list,
                    label_column_name="label",
                    metric="acc",
                    quote_char='"',
                )
            elif task_type == "ner":
                label_list = list(config.label2id.keys())
                processor = NERProcessor(tokenizer=tokenizer,
                                         max_seq_len=max_seq_len,
                                         data_dir="data",
                                         metric="seq_f1",
                                         label_list=label_list)
            else:
                raise ValueError(
                    f"`task_type` {task_type} is not supported yet. "
                    f"Valid options for arg `task_type`: 'question_answering', "
                    f"'embeddings', 'text_classification', 'ner'")

        return cls(model,
                   processor,
                   task_type=task_type,
                   batch_size=batch_size,
                   gpu=gpu,
                   name=name,
                   return_class_probs=return_class_probs,
                   extraction_strategy=extraction_strategy,
                   extraction_layer=extraction_layer,
                   s3e_stats=s3e_stats,
                   num_processes=num_processes,
                   disable_tqdm=disable_tqdm)
예제 #16
0
def doc_classification_cola():
    """Train a 7-class Chinese text classifier on top of a local BERT model.

    Sets up logging and MLflow tracking, builds tokenizer, processor, data
    silo and model, trains for three epochs on CPU and finally stores model
    and processor in the output directory.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    # ---------------------------- Settings ----------------------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 8
    evaluate_every = 450
    # Path of the Chinese BERT model. Download location:
    # https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz
    lang_model = "/bert-base-chinese"
    do_lower_case = False
    metric = "acc"
    label_list = [
        "城乡建设",
        "卫生计生",
        "商贸旅游",
        "劳动和社会保障",
        "教育文体",
        "交通运输",
        "环境保护",
    ]

    # 1. Tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. DataProcessor converting raw text into a pytorch Dataset.
    # data_dir is the folder holding the text classification data. Expected
    # format: first column is the character-separated text, second column the
    # label, separated by a tab; the first row must contain "text" and "label".
    # No dev file is supplied, a dev_split of 0.1 is requested instead.
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=507,
        data_dir=Path("/BERT留言分类数据集"),
        dev_filename=None,
        dev_split=0.1,
        test_filename="/BERT留言分类数据集/test.tsv",
        label_list=label_list,
        metric=metric,
        label_column_name="label",
    )

    # 3. DataSilo: loads the datasets and provides DataLoaders for them
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. AdaptiveModel = pretrained language model + text classification head
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(
        task_name="text_classification")
    prediction_head = TextClassificationHead(num_labels=len(label_list),
                                             class_weights=class_weights)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Optimizer and learning-rate schedule
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    # 6./7. Hand everything to the Trainer and run the training
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)
    trainer.train()

    # 8. Store the trained model together with its processor
    save_dir = Path("/BERT文本分类输出的模型")
    model.save(save_dir)
    processor.save(save_dir)
def test_doc_classification(caplog=None):
    """Smoke-test document classification with German DistilBERT.

    Trains on the sample data, saves model and processor to
    ``testsave/doc_class``, reloads them through the Inferencer and checks
    that the first predicted probability is a ``np.float32``.

    :param caplog: Optional log-capture fixture; when given, its level is
                   raised to CRITICAL to silence the run.
    """
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Tiny run configuration so the test finishes quickly.
    lang_model = "distilbert-base-german-cased"
    n_epochs, batch_size, evaluate_every = 1, 1, 2

    processor = TextClassificationProcessor(
        tokenizer=Tokenizer.load(pretrained_model_name_or_path=lang_model,
                                 do_lower_case=False),
        data_dir=Path("samples/doc_class"),
        train_filename=Path("train-sample.tsv"),
        dev_filename="test-sample.tsv",
        test_filename=None,
        dev_split=0.0,
        max_seq_len=8,
        label_list=["OTHER", "OFFENSE"],
        label_column_name="coarse_label",
        metric="f1_macro",
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # NOTE(review): the head is created without arguments, i.e. it relies
    # entirely on the defaults of TextClassificationHead.
    model = AdaptiveModel(
        language_model=DistilBert.load(lang_model),
        prediction_heads=[TextClassificationHead()],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        schedule_opts=None,
    )

    # This Trainer signature takes the model at train() time, not at construction.
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)

    # Persist both parts so the Inferencer can restore them from one directory.
    save_dir = Path("testsave/doc_class")
    model.save(save_dir)
    processor.save(save_dir)

    sentences = (
        "Malte liebt Berlin.",
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei.",
    )
    basic_texts = [{"text": sentence} for sentence in sentences]

    inferencer = Inferencer.load(save_dir, batch_size=2)
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    first_probability = result[0]["predictions"][0]["probability"]
    assert isinstance(first_probability, np.float32)
예제 #18
0
def doc_classification(task,
                       model_type,
                       n_epochs,
                       batch_size,
                       embeds_dropout,
                       evaluate_every,
                       use_cuda,
                       max_seq_len,
                       learning_rate,
                       do_lower_case,
                       register_model,
                       save_model=True,
                       early_stopping=False):
    """Train a FARM text classification model for a configured task.

    :param task: Task identifier; looked up in ``cu.tasks`` (as ``str(task)``)
                 and used to locate the task's data and model directories.
    :param model_type: Passed together with the configured language to
                       ``he.get_farm_model`` to resolve the language model.
    :param n_epochs: Number of training epochs.
    :param batch_size: Batch size handed to the DataSilo.
    :param embeds_dropout: Dropout probability applied to the embeddings.
    :param evaluate_every: Evaluation interval handed to the Trainer.
    :param use_cuda: Whether to request a CUDA device.
    :param max_seq_len: Maximum sequence length for the processor.
    :param learning_rate: Learning rate for the optimizer.
    :param do_lower_case: Whether the tokenizer lower-cases its input.
    :param register_model: If true (and ``save_model`` is true), upload the
                           model directory after saving.
    :param save_model: If true, save model and processor to the task's model
                       directory after training.
    :param early_stopping: If true, train with an EarlyStopping instance that
                           tracks ``f1_macro`` with a patience of 2.
    :raises Exception: If the configured task is not of type 'classification'.
    """

    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'classification':
        raise Exception('NOT A CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir',
                              header=None)[0].to_list()

    # AML log (best effort): `aml_run` is presumably only available inside an
    # Azure ML run - confirm. Logging problems must never abort the training,
    # but KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except Exception:
        pass

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        """Return accuracy, macro F1 and micro F1 as a flat dict of scores."""
        # simple_accuracy() hands back a dict keyed by 'acc'; unwrap it so the
        # result holds a plain score instead of a nested dict.
        acc = simple_accuracy(preds, labels).get('acc')
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log (best effort, see above)
        try:
            aml_run.log('acc', acc)
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except Exception:
            pass
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that converts the raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=dt_task.data_dir,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'))

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    ## Pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)

    ## Prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=embeds_dropout,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        learning_rate=learning_rate,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of training the model and evaluates it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    if early_stopping:
        earlystopping = EarlyStopping(
            metric="f1_macro",
            mode="max",  # use f1_macro from the dev evaluator of the trainer
            save_dir=save_dir,  # where to save the best model
            patience=2,  # number of evaluations to wait for improvement before terminating the training
        )
    else:
        earlystopping = None

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

        if register_model:
            dt_task.upload('model_dir', destination='model')
예제 #19
0
파일: infer.py 프로젝트: wwmmqq/FARM
    def load(cls,
             model_name_or_path,
             batch_size=4,
             gpu=False,
             task_type=None,
             return_class_probs=False,
             strict=True,
             max_seq_len=256):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param batch_size: Number of samples computed once per batch
        :type batch_size: int
        :param gpu: If GPU shall be used
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
                          Required when the model is fetched from the model hub.
        :type task_type: str
        :param return_class_probs: Passed through unchanged to the Inferencer constructor.
        :type return_class_probs: bool
        :param strict: whether to strictly enforce that the keys loaded from saved model match the ones in
                       the PredictionHead (see torch.nn.module.load_state_dict()).
                       Set to `False` for backwards compatibility with PHs saved with older version of FARM.
        :type strict: bool
        :param max_seq_len: maximum length of one text sample. Only used when the processor is created for
                            a model from the model hub; a processor loaded from a local directory keeps its
                            saved configuration.
        :type max_seq_len: int
        :return: An instance of the Inferencer.
        :raises ValueError: If the model has to be fetched from the model hub and `task_type` is missing
                            or not one of the supported values.

        """

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = AdaptiveModel.load(model_name_or_path,
                                       device,
                                       strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

        # b) or from remote transformers model hub
        else:
            logger.info(
                f"Could not find `{model_name_or_path}` locally. Try to download from model hub ..."
            )
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification', 'ner'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path, device, task_type)
            config = AutoConfig.from_pretrained(model_name_or_path)
            tokenizer = Tokenizer.load(model_name_or_path)

            # TODO infer task_type automatically from config (if possible)
            if task_type == "question_answering":
                processor = SquadProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    label_list=["start_token", "end_token"],
                    metric="squad",
                    data_dir=None,
                )
            elif task_type == "embeddings":
                processor = InferenceProcessor(tokenizer=tokenizer,
                                               max_seq_len=max_seq_len)

            elif task_type == "text_classification":
                # Order the labels by class index 0..n-1 so that position i
                # in the list corresponds to the model's output i.
                label_list = [config.id2label[label_idx]
                              for label_idx in range(len(config.id2label))]
                processor = TextClassificationProcessor(
                    tokenizer=tokenizer,
                    max_seq_len=max_seq_len,
                    data_dir=None,
                    label_list=label_list,
                    label_column_name="label",
                    metric="acc",
                    quote_char='"',
                )

            # NOTE: a "multilabel-classification" task type is not wired up yet. The
            # disabled draft built a TextClassificationProcessor like the one above with
            # label_list=list(config.label2id.keys()) and multilabel=True.

            elif task_type == "ner":
                label_list = list(config.label2id.keys())
                processor = NERProcessor(tokenizer=tokenizer,
                                         max_seq_len=max_seq_len,
                                         data_dir=None,
                                         metric="seq_f1",
                                         label_list=label_list)
            else:
                raise ValueError(
                    f"`task_type` {task_type} is not supported yet. "
                    f"Valid options for arg `task_type`: 'question_answering', 'embeddings', "
                    f"'text_classification', 'ner'"
                )

        return cls(
            model,
            processor,
            batch_size=batch_size,
            gpu=gpu,
            name=name,
            return_class_probs=return_class_probs,
        )
예제 #20
0
def doc_classification_cola():
    """Fine-tune ``bert-base-cased`` as a text classifier on the CoLA data and demo inference.

    The run is reported to the public MLflow server (experiment ``Public_FARM``,
    run ``Run_cola``). After training, model and processor are written to
    ``saved_models/bert-doc-tutorial``, reloaded through an ``Inferencer`` and
    applied to two sample sentences whose predictions are printed.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    # ------------------------------------------------------------------
    # Settings
    # ------------------------------------------------------------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "bert-base-cased"
    do_lower_case = False
    n_epochs, batch_size, evaluate_every = 5, 100, 20
    label_list = ["0", "1"]
    metric = "mcc"

    # Step 1: tokenizer matching the pretrained language model.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # Step 2: processor that converts the raw CoLA text files into a PyTorch dataset.
    # dev.tsv is used as dev set (no split from train) and no test file is configured.
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/cola"),
        dev_filename=Path("dev.tsv"),
        dev_split=None,
        test_filename=None,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
    )

    # Step 3: data silo that loads the datasets and exposes data loaders for them.
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Step 4: adaptive model = pretrained language model + text classification head.
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=class_weights,
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 5: optimizer and learning-rate schedule sized to the number of training batches.
    train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=train_batches,
        n_epochs=n_epochs,
    )

    # Steps 6 + 7: hand everything to the trainer and train.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    trainer.train()

    # Step 8: persist model and processor side by side.
    save_dir = Path("saved_models/bert-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # Step 9: reload from disk and run inference on two example sentences.
    basic_texts = [
        {"text": "The box contained the ball from the tree."},
        {"text": "I'll fix you a drink."},
    ]
    inferencer = Inferencer.load(save_dir)
    predictions = inferencer.inference_from_dicts(dicts=basic_texts)
    print(predictions)
예제 #21
0
def doc_classification_crossvalidation():
    """Cross-validate a German BERT document classifier on the GermEval 2018 data.

    For every fold a fresh model is trained with early stopping on ``f1_offense``;
    each fold uses its own save directory
    (``saved_models/bert-german-doc-tutorial-es-<fold>``). Each fold's model is
    evaluated on that fold's test loader, the per-fold results are dumped to
    ``doc_classification_xval.results.json`` and overall metrics across all folds
    are logged. Finally the checkpoint of the fold with the highest ``f1_offense``
    is reloaded and evaluated on the test loader of the original data silo.
    """
    ##########################
    ########## Logging
    ##########################
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    # reduce verbosity from transformers library
    logging.getLogger('transformers').setLevel(logging.WARNING)

    # ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    ml_logger = MLFlowLogger(tracking_uri="logs")
    # ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    xval_folds = 5
    # NOTE(review): declared but never forwarded to DataSiloForCrossVal.make below.
    xval_stratified = True

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    use_amp = None

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    # For xval, we also store the actual predictions and labels in each result so we can
    # calculate overall metrics over all folds later
    def mymetrics(preds, labels):
        """Return accuracy, per-class F1, macro/micro F1 and MCC as a dict."""
        acc = simple_accuracy(preds, labels).get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        # Fixed: this value was previously computed with average="macro",
        # so "f1_micro" silently duplicated "f1_macro".
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        mcc = matthews_corrcoef(labels, preds)
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro,
            "mcc": mcc
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Load one silo for each fold in our cross-validation
    silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

    # the following steps should be run for each of the folds of the cross validation, so we put them
    # into a function
    def train_on_split(silo_to_use, n_fold, save_dir):
        """Train a fresh model on one fold's silo and return the trainer's model.

        ``save_dir`` is the common base path; the fold number is appended to it so
        that every fold gets its own early-stopping directory.
        """
        logger.info(
            f"############ Crossvalidation: Fold {n_fold} ############")
        # Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => Text classification
        # Class weights are taken from the full data silo, not from the fold silo.
        prediction_head = TextClassificationHead(
            class_weights=data_silo.calculate_class_weights(
                task_name="text_classification"),
            num_labels=len(label_list))

        model = AdaptiveModel(language_model=language_model,
                              prediction_heads=[prediction_head],
                              embeds_dropout_prob=0.2,
                              lm_output_types=["per_sequence"],
                              device=device)

        # Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=0.5e-5,
            device=device,
            n_batches=len(silo_to_use.loaders["train"]),
            n_epochs=n_epochs,
            use_amp=use_amp)

        # An early stopping instance can be used to save the model that performs best on the dev set
        # according to some metric and stop training when no improvement is happening for some
        # iterations.
        # NOTE: Using a different save directory for each fold allows us afterwards to use the
        # nfolds best models in an ensemble!
        save_dir = Path(str(save_dir) + f"-{n_fold}")
        earlystopping = EarlyStopping(
            metric="f1_offense",  # use the metric from our own metrics function instead of loss
            mode="max",
            save_dir=save_dir,  # where to save the best model
            patience=5  # number of evaluations to wait for improvement before terminating
        )

        # The test evaluation is done by the caller, hence evaluator_test=False.
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=silo_to_use,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          early_stopping=earlystopping,
                          evaluator_test=False)

        # train it
        trainer.train()

        return trainer.model

    # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
    # on the test set of each fold
    # Remember all the results for overall metrics over all predictions of all folds and for averaging
    allresults = []
    all_preds = []
    all_labels = []
    bestfold = None
    bestf1_offense = -1
    save_dir = Path("saved_models/bert-german-doc-tutorial-es")
    for num_fold, silo in enumerate(silos):
        model = train_on_split(silo, num_fold, save_dir)

        # do eval on test set here (and not in Trainer),
        #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
        evaluator_test = Evaluator(data_loader=silo.get_data_loader("test"),
                                   tasks=silo.processor.tasks,
                                   device=device)
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result,
                                   "Test",
                                   steps=len(silo.get_data_loader("test")),
                                   num_fold=num_fold)

        allresults.append(result)
        all_preds.extend(result[0].get("preds"))
        all_labels.extend(result[0].get("labels"))

        # keep track of best fold
        f1_offense = result[0]["f1_offense"]
        if f1_offense > bestf1_offense:
            bestf1_offense = f1_offense
            bestfold = num_fold

    # Save the per-fold results to json for a separate, more detailed analysis
    with open("doc_classification_xval.results.json", "wt") as fp:
        json.dump(allresults, fp)

    # calculate overall metrics across all folds
    xval_f1_micro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="micro")
    xval_f1_macro = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             average="macro")
    xval_f1_offense = f1_score(all_labels,
                               all_preds,
                               labels=label_list,
                               pos_label="OFFENSE")
    xval_f1_other = f1_score(all_labels,
                             all_preds,
                             labels=label_list,
                             pos_label="OTHER")
    xval_mcc = matthews_corrcoef(all_labels, all_preds)

    # Fixed: the values were passed as logging args without a placeholder in the
    # message, which makes the logging module report a formatting error instead of
    # printing the value. Use lazy %-style placeholders.
    logger.info("XVAL F1 MICRO:   %s", xval_f1_micro)
    logger.info("XVAL F1 MACRO:   %s", xval_f1_macro)
    logger.info("XVAL F1 OFFENSE: %s", xval_f1_offense)
    logger.info("XVAL F1 OTHER:   %s", xval_f1_other)
    logger.info("XVAL MCC:        %s", xval_mcc)

    # -----------------------------------------------------
    # Just for illustration, use the best model from the best xval val for evaluation on
    # the original (still unseen) test set.
    logger.info(
        "###### Final Eval on hold out test set using best model #####")
    evaluator_origtest = Evaluator(
        data_loader=data_silo.get_data_loader("test"),
        tasks=data_silo.processor.tasks,
        device=device)
    # restore model from the best fold; the directory is derived from the same base
    # path that train_on_split used, so the two cannot drift apart.
    lm_name = model.language_model.name
    save_dir = Path(str(save_dir) + f"-{bestfold}")
    model = AdaptiveModel.load(save_dir, device, lm_name=lm_name)
    model.connect_heads_with_processor(data_silo.processor.tasks,
                                       require_labels=True)

    result = evaluator_origtest.eval(model)
    logger.info("TEST F1 MICRO:   %s", result[0]["f1_micro"])
    logger.info("TEST F1 MACRO:   %s", result[0]["f1_macro"])
    logger.info("TEST F1 OFFENSE: %s", result[0]["f1_offense"])
    logger.info("TEST F1 OTHER:   %s", result[0]["f1_other"])
    logger.info("TEST MCC:        %s", result[0]["mcc"])
예제 #22
0
def doc_classifcation():
    """Train a GermEval 2018 document classifier on top of converted fastText embeddings.

    The fastText model is first converted to a fixed vocabulary with associated
    word embeddings; tokenizer and language model are then loaded from the
    converter's output path. The run is reported to the public MLflow server
    (experiment ``Public_FARM``, run ``Run_doc_classification_fasttext``).
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification_fasttext",
    )

    # ------------------------------------------------------------------
    # Settings
    # ------------------------------------------------------------------
    set_all_seeds(seed=42)
    n_epochs, batch_size, evaluate_every = 3, 32, 100
    # Name resolved remotely (s3); a local path such as
    # "../saved_models/fasttext-german-uncased" can be used instead.
    fasttext_model = "fasttext-german-uncased"
    do_lower_case = True
    max_features = 10_000  # maximum number of unique words we will transform
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Step 1: fastText needs a fixed vocabulary and associated word embeddings to work
    # within FARM and with advanced aggregation strategies, so convert it on the data.
    ft_converter = Fasttext_converter(
        pretrained_model_name_or_path=fasttext_model,
        do_lower_case=do_lower_case,
        data_path=Path("../data/germeval18"),
        train_filename="train.tsv",
        output_path=Path("../saved_models/fasttext-german-uncased-converted"),
        language="German",
        max_features=max_features,
    )
    vocab_counts = ft_converter.convert_on_data()

    # Step 2: tokenizer loaded from the converted model.
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=ft_converter.output_path,
        do_lower_case=do_lower_case,
    )

    # Step 3: processor turning raw text into a PyTorch dataset.
    # GermEval 2018 only ships train.tsv and test.tsv - there is no dev.tsv.
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=ft_converter.data_path,
        label_list=label_list,
        train_filename=ft_converter.train_filename,
        dev_split=0,
        test_filename="test.tsv",
        metric=metric,
        label_column_name="coarse_label",
    )

    # Step 4: data silo. Multiprocessing with the word-embedding tokenizer is not
    # optimal, so it is limited to a single process.
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size,
        max_processes=1,
    )

    # Step 5: adaptive model built on the newly created embedding model.
    language_model = LanguageModel.load(ft_converter.output_path)
    # Without a powerful transformer underneath, the classification head gets a
    # slightly deeper feed-forward stack.
    n_labels = len(label_list)
    head_layer_dims = [300, 600, n_labels]
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        layer_dims=head_layer_dims,
        class_weights=class_weights,
        num_labels=len(label_list),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 6: optimizer; the batch count comes from get_data_loader("train")
    # rather than from data_silo.loaders["train"].
    train_batches = len(data_silo.get_data_loader("train"))
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-3,
        device=device,
        n_batches=train_batches,
        n_epochs=n_epochs,
    )

    # Steps 7 + 8: build the trainer and run the training.
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    trainer.train()
예제 #23
0
def doc_classifcation():
    """Train a GermEval 2018 text classifier with early stopping and return its best score.

    The settings ``use_amp``, ``lang_model``, ``label_list``, ``metric``,
    ``batch_size``, ``n_epochs`` and ``evaluate_every`` are not defined here; they
    are read from the enclosing (module) scope.

    Returns:
        ``best_so_far`` of the ``EarlyStopping`` instance after training
        (presumably the best observed value of ``metric`` - verify against
        the EarlyStopping implementation).
    """
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # The Hugging Face tokenizer is used directly here (instead of FARM's
    # Tokenizer.load) so that strip_accents can be switched off.
    tokenizer = AutoTokenizer.from_pretrained(lang_model, strip_accents=False)

    # test.tsv doubles as dev set because evaluation should run against test.
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("./data/germeval18"),
        label_list=label_list,
        metric=metric,
        dev_filename="test.tsv",
        label_column_name="coarse_label",
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # Pretrained language model plus a class-weighted classification head.
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        class_weights=class_weights,
        num_labels=len(label_list),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=train_batches,
        n_epochs=n_epochs,
        use_amp=use_amp,
    )

    # No save_dir is passed to EarlyStopping in this example.
    earlystopping = EarlyStopping(metric=metric, mode="max", patience=3)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        early_stopping=earlystopping,
        device=device,
    )
    trainer.train()

    return earlystopping.best_so_far
def doc_classification_with_earlystopping():
    """Fine-tune ``bert-base-german-cased`` on GermEval 2018 with early stopping.

    A custom metrics function is registered under the name ``mymetrics`` and the
    ``f1_offense`` value it reports drives early stopping. The best model is
    stored by the ``EarlyStopping`` instance in
    ``saved_models/bert-german-doc-tutorial-es``; the model from the last
    training step is stored separately in ``saved_models/bert-german-doc-tutorial``.
    Both are then reloaded through an ``Inferencer`` and applied to two sample
    sentences whose predictions are printed.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data automatically if it is not available.
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        """Return accuracy, per-class F1 and macro/micro F1 as a dict."""
        acc = simple_accuracy(preds, labels)
        # NOTE(review): simple_accuracy appears to return {"acc": value} in some
        # FARM versions; unwrap it so "acc" holds the scalar, not a nested dict.
        # A plain number is passed through unchanged.
        if isinstance(acc, dict):
            acc = acc.get("acc")
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        # Fixed: this value was previously computed with average="macro",
        # so "f1_micro" silently duplicated "f1_macro".
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    # them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of training the model and evaluates it
    # from time to time. Also create an EarlyStopping instance and pass it on to the trainer.

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    # Alternatives to the custom metric used here:
    #   metric="f1_macro", mode="max"  -> f1_macro from the dev evaluator of the trainer
    #   metric="loss", mode="min"      -> loss from the dev evaluator of the trainer
    earlystopping = EarlyStopping(
        metric="f1_offense",  # use the metric from our own metrics function instead of loss
        mode="max",
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
예제 #25
0
def doc_classifcation():
    """Train a two-class GermEval 2018 document classifier on GloVe embeddings.

    Sets up logging and MLflow tracking, then builds the tokenizer, data
    processor, data silo, model and optimizer, and finally runs the trainer.
    Every setting is hard-coded in the body and nothing is returned.
    """
    logging.basicConfig(
        level=logging.INFO,
        datefmt="%m/%d/%Y %H:%M:%S",
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    )

    # Experiment tracking
    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification_glove",
    )

    # ---------------------------------------------------------------
    # Settings
    # ---------------------------------------------------------------
    set_all_seeds(seed=42)
    num_epochs = 3
    samples_per_batch = 32
    eval_interval = 100
    lowercase = True
    # The embeddings are read from a local path; the original example notes
    # that the name "glove-german-uncased" can be used instead to fetch them
    # through s3.
    embeddings = Path("../saved_models/glove-german-uncased")
    # GermEval 2018 only ships train.tsv and test.tsv - there is no dev.tsv
    classes = ["OTHER", "OFFENSE"]
    eval_metric = "f1_macro"

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Step 1: tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=embeddings,
        do_lower_case=lowercase,
    )

    # Step 2: DataProcessor converting raw text into a pytorch Dataset.
    # dev_split=0 means no part of the training file is held out as dev set.
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        train_filename="train.tsv",
        test_filename="test.tsv",
        dev_split=0,
        label_list=classes,
        label_column_name="coarse_label",
        metric=eval_metric,
    )

    # Step 3: DataSilo that loads the datasets (train/dev/test), provides
    # DataLoaders for them and calculates a few descriptive statistics
    data_silo = DataSilo(
        processor=processor,
        batch_size=samples_per_batch,
        max_processes=1,
    )

    # Step 4: AdaptiveModel
    # a) embedding model as basis. Word embedding models only convert words
    #    seen during their training into embedding vectors.
    language_model = LanguageModel.load(embeddings)
    # b) prediction head for text classification on top
    num_classes = len(classes)
    class_weights = data_silo.calculate_class_weights(
        task_name="text_classification")
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, num_classes],
        class_weights=class_weights,
        num_labels=num_classes,
    )

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # Step 5: optimizer and learning-rate schedule
    train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=train_batches,
        n_epochs=num_epochs,
    )

    # Step 6: hand everything to the Trainer, which trains the model and
    # evaluates it every `eval_interval` steps
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=num_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=eval_interval,
        device=device,
    )

    # Step 7: run the training
    trainer.train()
예제 #26
0
# NOTE: this is a script fragment; `lang_model`, `do_lower_case` and
# `batch_size` are used below but defined outside the visible part.

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                           do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load the CoLA data from ../data/cola.

# The two class labels are given as strings.
label_list = ["0", "1"]
# Evaluation metric name; presumably the Matthews correlation coefficient - TODO confirm
metric = "mcc"

# The dev set is read from dev.tsv (dev_split=None, so nothing is split off
# the training data) and no test file is configured (test_filename=None).
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/cola",
                                        dev_filename="dev.tsv",
                                        dev_split=None,
                                        test_filename=None,
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
#    (loaded from the same `lang_model` identifier as the tokenizer)
language_model = LanguageModel.load(lang_model)

# language_model = Roberta.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = TextClassificationHead(
예제 #27
0
# NOTE: this is a script fragment; `use_amp`, `lang_model` and `batch_size`
# are used below but defined outside the visible part.
device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

# 1. Create a tokenizer (casing is preserved: do_lower_case=False)
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                           do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load the GermEval 2018 data from ../data/germeval18.

# Binary task: the labels are read from the "coarse_label" column.
label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="../data/germeval18",
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="coarse_label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
#    (loaded from the same `lang_model` identifier as the tokenizer)
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
prediction_head = TextClassificationHead(
    layer_dims=[
        768, len(processor.tasks["text_classification"]["label_list"])
    ],
    class_weights=data_silo.calculate_class_weights(
    def perform_fine_tuning(current_info_need,
                            bert_model,
                            label_list,
                            num_epochs,
                            condition,
                            folds=10,
                            stratified=True,
                            learning_rate=2e-5,
                            batch_size=32,
                            embeds_dropout_prob=.1):
        """Cross-validate a fine-tuned text classifier for one information need.

        The function registers a custom metrics function, builds a data silo
        from ``data/<info_need>_<condition>_<num_epochs>_epochs_{train,test}.csv``,
        trains one model per cross-validation fold (early stopping on
        ``f1_infoneed``), dumps the per-fold results to a JSON file, logs
        metrics pooled over the predictions of all folds and finally evaluates
        the saved checkpoint of the best fold on the original test set.

        Args:
            current_info_need: Label treated as the positive class of interest;
                the metrics additionally report scores for the label "Other".
            bert_model: Model identifier (string) handed to ``Tokenizer.load``
                and ``LanguageModel.load``; "/" is replaced by "_" when it is
                used in output paths.
            label_list: Labels passed to the processor and to the pooled
                ``f1_score`` calls.
            num_epochs: Training epochs per fold; also part of file names.
            condition: Tag that is part of the data file names, the metric
                name and the output paths.
            folds: Number of cross-validation splits.
            stratified: Currently not used by this function.
            learning_rate: Learning rate for ``initialize_optimizer``.
            batch_size: Batch size of the data silo.
            embeds_dropout_prob: Dropout probability passed to the model.

        Returns:
            None. Results are logged and written to disk.

        NOTE(review): ``level`` (used as ``label_column_name``) is not defined
        in this function and must be provided by an enclosing scope.
        """

        ## Define evaluation metrics ##
        def evaluation_metrics(preds, labels):
            """Return accuracy, precision/recall/F1 variants and MCC as a dict."""
            return {
                "info_need": current_info_need,
                "model": bert_model,
                "num_epochs": num_epochs,
                "condition": condition,
                "acc": simple_accuracy(preds, labels).get("acc"),
                "f1_other": f1_score(y_true=labels,
                                     y_pred=preds,
                                     pos_label="Other"),
                "f1_infoneed": f1_score(y_true=labels,
                                        y_pred=preds,
                                        pos_label=current_info_need),
                "precision_infoneed": precision_score(
                    y_true=labels, y_pred=preds, pos_label=current_info_need),
                "recall_infoneed": recall_score(y_true=labels,
                                                y_pred=preds,
                                                pos_label=current_info_need),
                "recall_other": recall_score(y_true=labels,
                                             y_pred=preds,
                                             pos_label="Other"),
                "precision_other": precision_score(y_true=labels,
                                                   y_pred=preds,
                                                   pos_label="Other"),
                "recall_macro": recall_score(y_true=labels,
                                             y_pred=preds,
                                             average="macro"),
                "precision_macro": precision_score(y_true=labels,
                                                   y_pred=preds,
                                                   average="macro"),
                "recall_micro": recall_score(y_true=labels,
                                             y_pred=preds,
                                             average="micro"),
                "precision_micro": precision_score(y_true=labels,
                                                   y_pred=preds,
                                                   average="micro"),
                "recall_weighted": recall_score(y_true=labels,
                                                y_pred=preds,
                                                average="weighted"),
                "precision_weighted": precision_score(y_true=labels,
                                                      y_pred=preds,
                                                      average="weighted"),
                # "f1_weighted" was listed twice in the original literal; one
                # entry is enough and yields the same dict.
                "f1_weighted": f1_score(y_true=labels,
                                        y_pred=preds,
                                        average="weighted"),
                "f1_macro": f1_score(y_true=labels,
                                     y_pred=preds,
                                     average="macro"),
                "f1_micro": f1_score(y_true=labels,
                                     y_pred=preds,
                                     average="micro"),
                "mcc": matthews_corrcoef(labels, preds)
            }

        # Build the metric name once and use it for registration and processor
        metric = f'eval_metrics_{current_info_need}_{bert_model}_{condition}__{num_epochs}_epochs'
        register_metrics(metric, evaluation_metrics)

        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=True)
        logger, ml_logger = init_logging()
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=bert_model,
                                   do_lower_case=False)

        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            max_seq_len=256,
            train_filename=
            f"{current_info_need}_{condition}_{num_epochs}_epochs_train.csv",
            test_filename=
            f"{current_info_need}_{condition}_{num_epochs}_epochs_test.csv",
            data_dir="data/",
            label_list=label_list,
            metric=metric,
            text_column_name="utterance",
            label_column_name=level,
            delimiter=";")

        data_silo = DataSilo(processor=processor, batch_size=batch_size)

        silos = DataSiloForCrossVal.make(data_silo,
                                         n_splits=folds,
                                         sets=['train', 'test'])

        # the following steps should be run for each of the folds of the cross validation, so we put them
        # into a function
        def train_on_split(silo_to_use, n_fold, save_dir):
            """Train one fold and return the trainer's model.

            The best checkpoint of the fold is stored by early stopping under
            ``<save_dir>-<n_fold>``.
            """
            logger.info(
                f"############ Crossvalidation: Fold {n_fold} ############")
            # Create an AdaptiveModel
            # a) which consists of a pretrained language model as a basis
            language_model = LanguageModel.load(bert_model)
            # b) and a prediction head on top that is suited for our task => Text classification
            # NOTE: class weights are computed on the full data silo, not on
            # the silo of the current fold.
            prediction_head = TextClassificationHead(
                class_weights=data_silo.calculate_class_weights(
                    task_name="text_classification"),
                num_labels=len(label_list))

            model = AdaptiveModel(language_model=language_model,
                                  prediction_heads=[prediction_head],
                                  embeds_dropout_prob=embeds_dropout_prob,
                                  lm_output_types=["per_sequence"],
                                  device=device)

            # Create an optimizer
            model, optimizer, lr_schedule = initialize_optimizer(
                model=model,
                learning_rate=learning_rate,
                device=device,
                n_batches=len(silo_to_use.loaders["train"]),
                n_epochs=num_epochs,
                use_amp=None)

            # An early stopping instance can be used to save the model that performs best on the dev set
            # according to some metric and stop training when no improvement is happening for some iterations.
            # NOTE: Using a different save directory for each fold, allows us afterwards to use the
            # nfolds best models in an ensemble!
            save_dir = Path(str(save_dir) + f"-{n_fold}")
            earlystopping = EarlyStopping(
                metric="f1_infoneed",
                mode=
                "max",  # use the metric from our own metrics function instead of loss
                save_dir=save_dir,  # where to save the best model
                patience=
                5  # number of evaluations to wait for improvement before terminating the training
            )

            # Feed everything to the Trainer; the test set is evaluated by the
            # caller instead (evaluator_test=False)
            trainer = Trainer(model=model,
                              optimizer=optimizer,
                              data_silo=silo_to_use,
                              epochs=num_epochs,
                              n_gpu=n_gpu,
                              lr_schedule=lr_schedule,
                              evaluate_every=100,
                              device=device,
                              early_stopping=earlystopping,
                              evaluator_test=False)

            # train it
            trainer.train()

            return trainer.model

        # for each fold, run the whole training, earlystopping to get a model, then evaluate the model
        # on the test set of each fold
        # Remember all the results for overall metrics over all predictions of all folds and for averaging
        allresults = []
        all_preds = []
        all_labels = []
        bestfold = None
        bestf1_info_need = -1
        # "/" in hub-style model names would otherwise create sub-directories
        language_model_name = bert_model.replace("/", "_")
        base_save_dir = Path(
            f"saved_models/{current_info_need}-{condition}-{num_epochs}_epochs-cook-{language_model_name}"
        )
        for num_fold, silo in enumerate(silos):
            model = train_on_split(silo, num_fold, base_save_dir)

            # do eval on test set here (and not in Trainer),
            #  so that we can easily store the actual preds and labels for a "global" eval across all folds.
            evaluator_test = Evaluator(
                data_loader=silo.get_data_loader("test"),
                tasks=silo.processor.tasks,
                device=device)
            result = evaluator_test.eval(model, return_preds_and_labels=True)
            evaluator_test.log_results(result,
                                       "Test",
                                       steps=len(silo.get_data_loader("test")),
                                       num_fold=num_fold)

            allresults.append(result)
            all_preds.extend(result[0].get("preds"))
            all_labels.extend(result[0].get("labels"))

            # keep track of best fold
            f1_info_need = result[0]["f1_infoneed"]
            if f1_info_need > bestf1_info_need:
                bestf1_info_need = f1_info_need
                bestfold = num_fold

            # empty cache to avoid memory leak and cuda OOM across multiple folds
            model.cpu()
            torch.cuda.empty_cache()

        # Save the per-fold results to json for a separate, more detailed analysis.
        # The target directory is created first so that a missing folder does
        # not discard the results after all folds have been trained.
        results_path = Path(
            f"classification_results/test/{current_info_need}-{language_model_name}-{condition}-{num_epochs}_epochs-{folds}-fold-cv.results.json"
        )
        results_path.parent.mkdir(parents=True, exist_ok=True)
        with results_path.open("wt") as fp:
            json.dump(allresults, fp)

        # calculate overall metrics across all folds
        xval_f1_other = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 pos_label="Other")
        xval_f1_info_need = f1_score(all_labels,
                                     all_preds,
                                     labels=label_list,
                                     pos_label=current_info_need)
        xval_f1_micro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="micro")
        xval_f1_macro = f1_score(all_labels,
                                 all_preds,
                                 labels=label_list,
                                 average="macro")
        xval_mcc = matthews_corrcoef(all_labels, all_preds)

        # NOTE(review): this dict is built but not used in the visible code.
        xval_overall_results = {
            "xval_f1_other": xval_f1_other,
            "xval_f1_infoneed": xval_f1_info_need,
            "xval_f1_micro": xval_f1_micro,
            "xval_f1_macro": xval_f1_macro,
            "xval_f1_mcc": xval_mcc
        }

        logger.info(f"XVAL F1 MICRO: {xval_f1_micro}")
        logger.info(f"XVAL F1 MACRO: {xval_f1_macro}")
        logger.info(f"XVAL F1 OTHER: {xval_f1_other}")
        logger.info(
            f"XVAL F1 {current_info_need} {condition} {num_epochs} epochs:   {xval_f1_info_need}"
        )
        logger.info(f"XVAL MCC: {xval_mcc}")

        # -----------------------------------------------------
        # Just for illustration, use the best model from the best xval val for evaluation on
        # the original (still unseen) test set.
        logger.info(
            "###### Final Eval on hold out test set using best model #####")
        evaluator_origtest = Evaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)
        # restore model from the best fold; the path is derived exactly like
        # the per-fold save directory in train_on_split
        lm_name = model.language_model.name
        best_save_dir = Path(f"{base_save_dir}-{bestfold}")
        model = AdaptiveModel.load(best_save_dir, device, lm_name=lm_name)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        result = evaluator_origtest.eval(model)
        logger.info("TEST F1 MICRO: {}".format(result[0]["f1_micro"]))
        logger.info("TEST F1 MACRO: {}".format(result[0]["f1_macro"]))
        logger.info("TEST F1 OTHER: {}".format(result[0]["f1_other"]))
        logger.info("TEST F1 {0}: {1}".format(current_info_need,
                                              result[0]["f1_infoneed"]))
        logger.info("TEST MCC:  {}".format(result[0]["mcc"]))

        # NOTE(review): this dict is built but not used in the visible code.
        test_set_results = {
            "test_f1_other": result[0]["f1_other"],
            "test_f1_infoneed": result[0]["f1_infoneed"],
            "test_f1_micro": result[0]["f1_micro"],
            "test_f1_macro": result[0]["f1_macro"],
            "test_f1_mcc": result[0]["mcc"]
        }
def doc_classification(
    task_config,
    model_name_or_path,
    cache_dir,
    data_dir,
    save_dir,
    model_dir,
    run_name="0",
    lr=1e-05,
    warmup_steps=5000,
    balance_classes=True,
    embeds_dropout=0.1,
    epochs=200,  # large because we use early stopping by default
    batch_size=20,
    grad_acc_steps=1,
    early_stopping_metric="roc_auc",
    early_stopping_mode="max",
    early_stopping_patience=10,
    model_class="Bert",
    tokenizer_class="BertTokenizer",
    do_lower_case=False,
    do_train=True,
    do_eval=True,
    do_hpo=False,
    print_preds=False,
    print_dev_preds=False,
    max_seq_len=512,
    seed=11,
    eval_every=500,
    use_amp=False,
    use_cuda=True,
):
    """Train and/or evaluate a document classifier described by a YAML task config.

    Args:
        task_config: Path of the YAML file with the task description. The keys
            read here are ``data`` (``label_list``, ``train_filename``,
            ``dev_filename``, optional ``dev_split``, ``test_filename``,
            ``parsing``), ``metric``, ``multilabel`` and, when training,
            ``log_dir``, ``experiment_name``, ``task_type``, ``output_type``.
        model_name_or_path: Model to load for training; with
            ``do_train=False`` it is the directory of the model to evaluate.
        cache_dir: Cache location handed to the data silo.
        data_dir: Directory with the data files (``str`` or ``Path``).
        save_dir: Not used by this function.
        model_dir: Directory for early-stopping checkpoints, the final model
            (``<model_dir>/final_model``) and the evaluation outputs.
        run_name: Suffix of the MLflow run name.
        lr, warmup_steps, epochs, batch_size, grad_acc_steps, use_amp:
            Optimizer/trainer settings passed through unchanged.
        balance_classes: Compute class weights from the data silo if true.
        embeds_dropout: Dropout probability passed to the model.
        early_stopping_metric, early_stopping_mode, early_stopping_patience:
            Early-stopping configuration; mode ``"none"`` disables it.
        model_class, tokenizer_class, do_lower_case, max_seq_len:
            Model/tokenizer loading options.
        do_train: Run training.
        do_eval: Evaluate on the test set (also enables the trainer's test
            evaluator while training).
        do_hpo: Report scores through ``tune.report`` during training.
        print_preds: Save test predictions.
        print_dev_preds: Additionally evaluate on and save dev predictions.
        seed: Seed for ``set_all_seeds``.
        eval_every: Evaluation interval of the trainer in steps.
        use_cuda: Passed to ``initialize_device_settings``.

    Returns:
        None.
    """
    # Load task config; the context manager makes sure the file gets closed
    with open(task_config) as config_file:
        task_config = yaml.safe_load(config_file)

    data_config = task_config["data"]

    # Create label list from args list or (for large label lists) create from file by splitting by space
    if isinstance(data_config["label_list"], list):
        label_list = data_config["label_list"]
    else:
        # Path(...) lets callers pass data_dir as plain string as well
        labels_file = Path(data_dir) / 'labels' / data_config["label_list"]
        with open(labels_file) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name_or_path,
        tokenizer_class=tokenizer_class,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=data_config["train_filename"],
        dev_filename=data_config["dev_filename"],
        dev_split=data_config.get("dev_split"),  # None when not configured
        test_filename=data_config["test_filename"],
        delimiter=data_config["parsing"]["delimiter"],
        quote_char=data_config["parsing"]["quote_char"],
        label_column_name=data_config["parsing"]["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         caching=True,
                         cache_path=Path(cache_dir),
                         batch_size=batch_size)

    if do_train:

        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path,
                                            language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task

        # Define class weights
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(
                task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create Multi- or Single-Label Classification Heads
        if task_config["multilabel"]:

            prediction_head = MultiLabelTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))

        else:
            prediction_head = ExtendedTextClassificationHead(
                class_weights=class_weights, num_labels=len(label_list))

        model = ExtendedAdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=[task_config["output_type"]],
            device=device)

        # 5. Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps
        }

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=model_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it
        # from time to time

        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            """Forward the dev score and training loss to the tuner."""
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping
        # The f-string yields the same string as `model_dir + "/final_model"`
        # for str input and additionally works when model_dir is a Path.
        final_model_dir = f"{model_dir}/final_model"
        model.save(final_model_dir)
        processor.save(final_model_dir)

    if do_eval:
        # Load newly trained model or existing model
        if not do_train:
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results. model_dir may be a Path here (do_train=False), so the
        # file name is appended with an f-string; `Path + str` raises TypeError.
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=f"{model_dir}/eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model,
                                             return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=f"{model_dir}/eval_dev_results.txt")

            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
예제 #30
0
# NOTE: this is a script fragment; `batch_size` is used below but defined
# outside the visible part, and `evaluate_every` is not used within it.
evaluate_every = 100
lang_model = "bert-base-german-cased"

# 1. Create a tokenizer (casing is preserved: do_lower_case=False)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
# Here we load the GermEval 2018 data from ../data/germeval18.

label_list = ["OTHER", "OFFENSE"]
metric = "f1_macro"

# NOTE(review): this call uses the keyword names `labels` and `source_field`,
# whereas the other examples in this file use `label_list` and
# `label_column_name` - presumably a different library version; verify.
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="../data/germeval18",
                                        labels=label_list,
                                        metric=metric,
                                        source_field="coarse_label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => Text classification
# The output size is the number of labels registered for the
# "text_classification" task; 768 is presumably the hidden size of the
# language model - TODO confirm.
prediction_head = TextClassificationHead(layer_dims=[
    768, len(processor.tasks["text_classification"]["label_list"])
])

model = AdaptiveModel(language_model=language_model,