Example #1
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_natural_questions")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 1
    evaluate_every = 500
    lang_model = "deepset/roberta-base-squad2"  # start with a model that can already extract answers
    do_lower_case = False  # roberta is a cased model
    train_filename = "train_medium.jsonl"
    dev_filename = "dev_medium.jsonl"
    keep_is_impossible = 0.15  # downsample negative examples after data conversion
    downsample_context_size = 300  # reduce length of wikipedia articles to relevant part around the answer

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
    html_tags = [
        "<Th>",
        "</Th>",
        "<Td>",
        "</Td>",
        "<Tr>",
        "</Tr>",
        "<Li>",
        "</Li>",
        "<P>",
        "</P>",
        "<Ul>",
        "</Ul>",
        "<H1>",
        "</H1>",
        "<H2>",
        "</H2>",
        "<H3>",
        "</H3>",
        "<H4>",
        "</H4>",
        "<H5>",
        "</H5>",
        "<Td_colspan=",
    ]
    tokenizer.add_tokens(html_tags)
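    # A quick illustrative check (exact word pieces depend on the vocabulary):
    # before add_tokens, tokenizer.tokenize("<Td>") splits the tag into pieces
    # such as ['<', 'T', 'd', '>']; afterwards it stays one '<Td>' token.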

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename=train_filename,
        dev_filename=dev_filename,
        keep_is_impossible=keep_is_impossible,
        downsample_context_size=downsample_context_size,
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         caching=True)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model,
                                        n_added_tokens=len(html_tags))
    # b) and in case of Natural Questions we need two Prediction Heads
    #    one for extractive Question Answering
    qa_head = QuestionAnsweringHead()
    #    another one for answering yes/no questions or deciding if the given text passage might contain an answer
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list)
    )  # answer_type_list = ["is_impossible", "span", "yes", "no"]
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )
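    # prediction_heads and lm_output_types are aligned by position: the QA head
    # consumes the "per_token" output (start/end logits per token), while the
    # answer-type classification head consumes the pooled "per_sequence" output.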

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluating it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Since training on the whole NQ corpus requires substantial compute resources, we trained a model and uploaded it to S3
    fetch_archive_from_http(
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip",
        output_dir="../saved_models/farm")
    QA_input = [{
        "qas": [
            "Did GameTrailers rated Twilight Princess as one of the best games ever created?"
        ],
        "context":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]

    model = Inferencer.load(
        model_name_or_path="../saved_models/farm/roberta-base-squad2-nq",
        batch_size=batch_size,
        gpu=True)
    result = model.inference_from_dicts(dicts=QA_input)

    pprint.pprint(result)

    print(
        f"\nQuestion: Did GameTrailers rated Twilight Princess as one of the best games ever created?"
        f"\nAnswer from model: {result[0]['predictions'][0]['answers'][0]['classification']}"
    )
Example #2
def test_doc_classification(data_dir_path, text_column_name, caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    tcp_params = dict(tokenizer=tokenizer,
                      max_seq_len=8,
                      data_dir=Path(data_dir_path),
                      train_filename="train-sample.tsv",
                      label_list=["OTHER", "OFFENSE"],
                      metric="f1_macro",
                      dev_filename="test-sample.tsv",
                      test_filename=None,
                      dev_split=0.0,
                      label_column_name="coarse_label")

    if text_column_name is not None:
        tcp_params["text_column_name"] = text_column_name

    processor = TextClassificationProcessor(**tcp_params)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(num_labels=2)
    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts=None)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/doc_class")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [{
        "text": "Martin Müller spielt Handball in Berlin."
    }, {
        "text":
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei."
    }]

    inf = Inferencer.load(save_dir, batch_size=2, num_processes=0)
    result = inf.inference_from_dicts(dicts=basic_texts)
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
    result2 = inf.inference_from_dicts(dicts=basic_texts, return_json=True)
    assert result == result2
Example #3
def test_lm_finetuning_custom_vocab(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)
    tokenizer.add_tokens([
        "aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbb", "ccccccccccccccccccccccc"
    ])

    processor = BertStyleLMProcessor(data_dir=Path("samples/lm_finetuning"),
                                     train_filename="train-sample.txt",
                                     test_filename="test-sample.txt",
                                     dev_filename=None,
                                     tokenizer=tokenizer,
                                     max_seq_len=12,
                                     next_sent_pred=True)
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)

    language_model = LanguageModel.load(lang_model,
                                        n_added_tokens=len(
                                            tokenizer.added_tokens_decoder))
    lm_prediction_head = BertLMHead.load(lang_model,
                                         n_added_tokens=len(
                                             tokenizer.added_tokens_decoder))
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'CosineWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    trainer.train()

    # LM embeddings and weight of decoder in head are shared and should therefore be equal
    assert torch.all(
        torch.eq(model.language_model.model.embeddings.word_embeddings.weight,
                 model.prediction_heads[0].decoder.weight))
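    # (The equality holds because BERT ties its input embedding matrix to the LM
    #  head's decoder; passing n_added_tokens to both keeps the resized shapes in sync.)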

    save_dir = Path("testsave/lm_finetuning")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Farmer's life is great."
        },
        {
            "text": "It's nothing for big city kids though."
        },
    ]
    model = Inferencer.load(save_dir, task_type="embeddings")
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == [
        'Farmer', "'", 's', 'life', 'is', 'great', '.'
    ]
    assert result[0]["vec"].shape == (768, )
    # TODO check why results vary across runs with the same seed
    assert isinstance(result[0]["vec"][0], np.float32)
Example #4
def doc_classifcation():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    #############################################
    # CUSTOM OPTIMIZER & LR SCHEDULE
    #############################################
    # learning rate schedules from transformers
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "Constant"}
    # schedule_opts = {"name": "CosineWarmup", "warmup_proportion": 0.4}
    # schedule_opts = {"name": "CosineWarmupWithRestarts", "warmup_proportion": 0.4}

    # or from native pytorch (see https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html for all options)
    # schedule_opts = {"name": "StepLR", "step_size": 30, "gamma": 0.1}
    # schedule_opts = {"name": "ReduceLROnPlateau", "mode": 'min', "factor": 0.1, "patience":10}

    # optimizers from pytorch (see https://pytorch.org/docs/stable/optim.html for all options)
    optimizer_opts = {"name": "SGD", "momentum": 0.0}

    # or from apex (see https://github.com/NVIDIA/apex/tree/master/apex/optimizers for all options)
    # optimizer_opts = {"name": "FusedLAMB", "bias_correction": True}

    # or from transformers (default in FARM)
    #optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    #############################################

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-3,
        optimizer_opts=optimizer_opts,
        schedule_opts=schedule_opts,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluating it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
Example #5
def test_qa(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 6
    n_epochs = 2
    evaluate_every = 100
    base_LM_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=False)
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        train_filename="train-sample.json",
        dev_filename="dev-sample.json",
        test_filename=None,
        data_dir="samples/qa",
    )

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)
    language_model = Bert.load(base_LM_model)
    prediction_head = QuestionAnsweringHead(
        layer_dims=[768, len(processor.label_list)])
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        warmup_proportion=0.2,
        n_examples=data_silo.n_samples("train"),
        batch_size=batch_size,
        n_epochs=n_epochs,
    )
    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    model = trainer.train(model)
    save_dir = "testsave/qa"
    model.save(save_dir)
    processor.save(save_dir)

    QA_input = [{
        "questions": ["In what country is Normandy located?"],
        "text":
        "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
    }]

    model = Inferencer(save_dir)
    result = model.run_inference(dicts=QA_input)
    assert result[0]["predictions"][0]["label"] == 'The'
Example #6
def run_experiment(args):
    validate_args(args)
    directory_setup(output_dir=args.output_dir, do_train=args.do_train)
    distributed = bool(args.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(use_cuda=args.cuda,
                                               local_rank=args.local_rank,
                                               fp16=args.fp16)

    args.batch_size = args.batch_size // args.gradient_accumulation_steps
    if n_gpu > 1:
        args.batch_size = args.batch_size * n_gpu
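    # e.g. batch_size=32, gradient_accumulation_steps=4, n_gpu=2:
    # 32 // 4 = 8 samples per forward pass, scaled to 8 * 2 = 16 so that each
    # GPU still sees 8 while gradients are accumulated over 4 steps per update.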
    set_all_seeds(args.seed)

    # Prepare Data
    tokenizer = BertTokenizer.from_pretrained(args.model,
                                              do_lower_case=args.lower_case)
    processor = Processor.load(
        processor_name=args.processor_name,
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_len,
        data_dir=args.data_dir,
    )

    data_silo = DataSilo(processor=processor,
                         batch_size=args.batch_size,
                         distributed=distributed)

    class_weights = None
    if args.balance_classes:
        class_weights = data_silo.class_weights

    model = get_adaptive_model(
        lm_output_type=args.lm_output_type,
        prediction_heads=args.prediction_head,
        layer_dims=args.layer_dims,
        model=args.model,
        device=device,
        class_weights=class_weights,
        fp16=args.fp16,
        embeds_dropout_prob=args.embeds_dropout_prob,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
    )

    # Init optimizer

    # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this?
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion,
        loss_scale=args.loss_scale,
        fp16=args.fp16,
        n_examples=data_silo.n_samples("train"),
        batch_size=args.batch_size,
        grad_acc_steps=args.gradient_accumulation_steps,
        n_epochs=args.epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.gradient_accumulation_steps,
        fp16=args.fp16,
        warmup_linear=warmup_linear,
        evaluate_every=args.eval_every,
        device=device,
    )

    model = trainer.train(model)

    model_name = (
        f"{model.language_model.name}-{model.language_model.language}-{args.name}"
    )
    processor.save(f"save/{model_name}")
    model.save(f"save/{model_name}")
Example #7
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM",
                          run_name="Run_question_answering")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
batch_size = 24
n_epochs = 2
evaluate_every = 500
base_LM_model = "bert-base-cased"
train_filename = "train-v2.0.json"
dev_filename = "dev-v2.0.json"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model,
                           do_lower_case=False)
# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
label_list = ["start_token", "end_token"]
metric = "squad"
processor = SquadProcessor(
    tokenizer=tokenizer,
    max_seq_len=256,
    label_list=label_list,
    metric=metric,
    train_filename=train_filename,
    dev_filename=dev_filename,
    test_filename=None,
    data_dir=Path("../data/squad20"),
)
Example #8
def doc_classification(task,
                       model_type,
                       n_epochs,
                       batch_size,
                       embeds_dropout,
                       evaluate_every,
                       use_cuda,
                       max_seq_len,
                       learning_rate,
                       do_lower_case,
                       register_model,
                       save_model=True,
                       early_stopping=False):

    language = cu.params.get('language')

    # Check task
    if cu.tasks.get(str(task)).get('type') != 'multi_classification':
        raise Exception('NOT A MULTI CLASSIFICATION TASK')

    # Data
    dt_task = dt.Data(task=task)
    ## Download training files
    if not os.path.isfile(dt_task.get_path('fn_train', dir='data_dir')):
        dt_task.download('data_dir', dir='data_dir', source='datastore')

    # Settings
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)
    lang_model = he.get_farm_model(model_type, language)
    save_dir = dt_task.get_path('model_dir')
    label_list = dt_task.load('fn_label', dir='data_dir',
                              header=None)[0].to_list()

    # AML log
    try:
        aml_run.log('task', task)
        aml_run.log('language', language)
        aml_run.log('n_epochs', n_epochs)
        aml_run.log('batch_size', batch_size)
        aml_run.log('learning_rate', learning_rate)
        aml_run.log('embeds_dropout', embeds_dropout)
        aml_run.log('max_seq_len', max_seq_len)
        aml_run.log('lang_model', lang_model)
        aml_run.log_list('label_list', label_list)
    except Exception:  # AML run context may not be available (e.g. running locally)
        pass

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metric values. The function must be registered under a string name, and that string
    # name must then be used as the metric.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        # AML log
        try:
            aml_run.log('acc', acc.get('acc'))
            aml_run.log('f1macro', f1macro)
            aml_run.log('f1micro', f1micro)
        except Exception:  # AML run context may not be available (e.g. running locally)
            pass
        return {"acc": acc, "f1_macro": f1macro, "f1_micro": f1micro}

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=dt_task.data_dir,
        label_list=label_list,
        label_column_name="label",
        metric=metric,
        quote_char='"',
        multilabel=True,
        train_filename=dt_task.get_path('fn_train', dir='data_dir'),
        test_filename=dt_task.get_path('fn_test', dir='data_dir'),
        dev_split=0.3)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = MultiLabelTextClassificationHead(
        num_labels=len(processor.tasks["text_classification"]["label_list"]))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=embeds_dropout,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluating it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Store it:
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    if save_model:
        model.save(save_dir)
        processor.save(save_dir)

        if register_model:
            dt_task.upload('model_dir', destination='model')
Example #9
def doc_classification_with_earlystopping():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metric values. The function must be registered under a string name, and that string
    # name must then be used as the metric.
    def mymetrics(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluating it from time to time
    # Also create an EarlyStopping instance and pass it on to the trainer

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense",  # use the metric from our own metrics function instead of loss
        mode="max",
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",      # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
Example #10
def test_dpr_modules(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Create question and passage tokenizers
    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base",
        do_lower_case=True,
        use_fast=True)
    context_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base",
        do_lower_case=True,
        use_fast=True)

    processor = TextSimilarityProcessor(
        tokenizer=query_tokenizer,
        passage_tokenizer=context_tokenizer,
        max_seq_len=256,
        label_list=["hard_negative", "positive"],
        metric="text_similarity_metric",
        data_dir="data/retriever",
        train_filename="nq-train.json",
        dev_filename="nq-dev.json",
        test_filename="nq-dev.json",
        embed_title=True,
        num_hard_negatives=1)

    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRQuestionEncoder",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0)
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRContextEncoder",
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0)

    prediction_head = TextSimilarityHead(similarity_function="dot_product")

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.0,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    model.connect_heads_with_processor(processor.tasks)

    assert type(model) == BiAdaptiveModel
    assert type(processor) == TextSimilarityProcessor
    assert type(question_language_model) == DPRQuestionEncoder
    assert type(passage_language_model) == DPRContextEncoder

    # check embedding layer weights
    assert abs(list(model.named_parameters())[0][1][0, 0].item() -
               (-0.010200000368058681)) < 0.0001

    d = {
        'query':
        'big little lies season 2 how many episodes',
        'passages': [{
            'title': 'Big Little Lies (TV series)',
            'text':
            'series garnered several accolades. It received 16 Emmy Award nominations and won eight, including Outstanding Limited Series and acting awards for Kidman, Skarsgård, and Dern. The trio also won Golden Globe Awards in addition to a Golden Globe Award for Best Miniseries or Television Film win for the series. Kidman and Skarsgård also received Screen Actors Guild Awards for their performances. Despite originally being billed as a miniseries, HBO renewed the series for a second season. Production on the second season began in March 2018 and is set to premiere in 2019. All seven episodes are being written by Kelley',
            'label': 'positive',
            'external_id': '18768923'
        }, {
            'title': 'Little People, Big World',
            'text':
            'final minutes of the season two-A finale, "Farm Overload". A crowd had gathered around Jacob, who was lying on the ground near the trebuchet. The first two episodes of season two-B focus on the accident, and how the local media reacted to it. The first season of "Little People, Big World" generated solid ratings for TLC (especially in the important 18–49 demographic), leading to the show\'s renewal for a second season. Critical reviews of the series have been generally positive, citing the show\'s positive portrayal of little people. Conversely, other reviews have claimed that the show has a voyeuristic bend',
            'label': 'hard_negative',
            'external_id': '7459116'
        }, {
            'title': 'Cormac McCarthy',
            'text':
            'chores of the house, Lee was asked by Cormac to also get a day job so he could focus on his novel writing. Dismayed with the situation, she moved to Wyoming, where she filed for divorce and landed her first job teaching. Cormac McCarthy is fluent in Spanish and lived in Ibiza, Spain, in the 1960s and later settled in El Paso, Texas, where he lived for nearly 20 years. In an interview with Richard B. Woodward from "The New York Times", "McCarthy doesn\'t drink anymore – he quit 16 years ago in El Paso, with one of his young',
            'label': 'negative',
            'passage_id': '2145653'
        }]
    }

    sample = processor._dict_to_samples(d)
    feats = processor._sample_to_features(sample[0])
    dataset, tensor_names = convert_features_to_dataset(feats)
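    # Re-pack the first sample as a batch of size one: unsqueeze(0) adds the
    # batch dimension and each tensor is moved to the target device.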
    features = {
        key: val.unsqueeze(0).to(device)
        for key, val in zip(tensor_names, dataset[0])
    }

    # test features
    assert torch.all(
        torch.eq(
            features["query_input_ids"][0][:10].cpu(),
            torch.tensor(
                [101, 2502, 2210, 3658, 2161, 1016, 2129, 2116, 4178, 102])))
    assert torch.all(
        torch.eq(
            features["passage_input_ids"][0][0][:10].cpu(),
            torch.tensor(
                [101, 2502, 2210, 3658, 1006, 2694, 2186, 1007, 102, 2186])))
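    # (Decoded, those passage ids read "[CLS] big little lies ( tv series ) [SEP] series ..." -
    #  with embed_title=True the passage title is prepended to the passage text.)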
    assert len(features["query_segment_ids"][0].nonzero()) == 0
    assert len(features["passage_segment_ids"][0].nonzero()) == 0
    assert torch.all(
        torch.eq(features["query_attention_mask"].nonzero()[:, 1].cpu(),
                 torch.tensor(list(range(10)))))
    assert torch.all(
        torch.eq(
            features["passage_attention_mask"][0][0].nonzero().cpu().squeeze(),
            torch.tensor(list(range(127)))))
    assert torch.all(
        torch.eq(
            features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(),
            torch.tensor(list(range(143)))))

    # test model encodings
    query_vector = model.language_model1(**features)[0]
    passage_vector = model.language_model2(**features)[0]
    assert torch.all(
        torch.le(
            query_vector[0, :10].cpu() - torch.tensor([
                -0.2135, -0.4748, 0.0501, -0.0430, -0.1747, -0.0441, 0.5638,
                0.1405, 0.2285, 0.0893
            ]),
            torch.ones((1, 10)) * 0.0001))
    assert torch.all(
        torch.le(
            passage_vector[0, :10].cpu() - torch.tensor([
                0.0557, -0.6836, -0.3645, -0.5566, 0.2034, -0.3656, 0.2969,
                -0.0555, 0.3405, -0.8691
            ]),
            torch.ones((1, 10)) * 0.0001))
    assert torch.all(
        torch.le(
            passage_vector[1, :10].cpu() - torch.tensor([
                -0.2006, -1.5002, -0.1897, -0.3421, -0.0405, -0.0471, -0.0306,
                0.1156, 0.3350, -0.3412
            ]),
            torch.ones((1, 10)) * 0.0001))

    # test logits and loss
    embeddings = model(**features)
    query_emb, passage_emb = embeddings[0]
    assert torch.all(torch.eq(query_emb.cpu(), query_vector.cpu()))
    assert torch.all(torch.eq(passage_emb.cpu(), passage_vector.cpu()))

    loss = model.logits_to_loss_per_head(embeddings, **features)
    similarity_scores = model.prediction_heads[0]._embeddings_to_scores(
        query_emb, passage_emb).cpu()
    assert torch.all(
        torch.le(
            similarity_scores - torch.tensor([[-1.8311e-03, -6.3016e+00]]),
            torch.ones((1, 2)) * 0.0001))
    assert abs(loss[0].item() - 0.0018) <= 0.0001
Example #11
def lm_finetuning():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    set_all_seeds(seed=42)
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_minimal_example_lm")
    ##########################
    ########## Settings
    ##########################
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 30
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = BertStyleLMProcessor(data_dir=Path("../data/lm_finetune_nips"),
                                     tokenizer=tokenizer,
                                     max_seq_len=128,
                                     max_docs=20)
    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_multiprocessing_chunksize=20)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluating it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-english-lm-tutorial")
    model.save(save_dir)
    processor.save(save_dir)
Example #12
def doc_classifcation():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False
    # or a local path:
    # lang_model = Path("../saved_models/farm-bert-base-cased")
    use_amp = None

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load GermEval 2018 Data.

    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir=Path("../data/germeval18"),
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="coarse_label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a
    #    few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"),
        num_labels=len(label_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluating it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
Example #13
File: infer.py  Project: fablos/FARM
    def __init__(
        self,
        model,
        processor,
        task_type,
        batch_size=4,
        gpu=False,
        name=None,
        return_class_probs=False,
        extraction_strategy=None,
        extraction_layer=None,
        num_processes=None,
    ):
        """
        Initializes Inferencer from an AdaptiveModel and a Processor instance.

        :param model: AdaptiveModel to run in inference mode
        :type model: AdaptiveModel
        :param processor: A dataset specific Processor object which will turn input (file or dict) into a Pytorch Dataset.
        :type processor: Processor
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param batch_size: Number of samples to compute per batch
        :type batch_size: int
        :param gpu: Whether to use a GPU
        :type gpu: bool
        :param name: Name for the current Inferencer model, displayed in the REST API
        :type name: str
        :param return_class_probs: Whether to return the probability distribution over all labels or only the
                                   probability of the predicted label
        :type return_class_probs: bool
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                                    (sentence vector), 'reduce_max' (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :param extraction_layer: Number of the layer from which the embeddings shall be extracted. Default: -1 (the last layer).
        :type extraction_layer: int
        :param num_processes: The number of processes for `multiprocessing.Pool`. Set to 0 to disable
                              multiprocessing. Set to None to let the Inferencer use all CPU cores. If you want to
                              debug the language model, you might need to disable multiprocessing!
        :type num_processes: int
        :return: An instance of the Inferencer.

        """
        # Init device and distributed settings
        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)

        self.processor = processor
        self.model = model
        self.model.eval()
        self.batch_size = batch_size
        self.device = device
        self.language = self.model.get_language()
        self.task_type = task_type

        if task_type == "embeddings":
            if not extraction_layer or not extraction_strategy:
                logger.warning(
                    "Using task_type='embeddings', but couldn't find one of the args `extraction_layer` and `extraction_strategy`. "
                    "Since FARM 0.4.2, you set both when initializing the Inferencer and then call inferencer.inference_from_dicts() instead of inferencer.extract_vectors()"
                )
            self.model.prediction_heads = torch.nn.ModuleList([])
            self.model.language_model.extraction_layer = extraction_layer
            self.model.language_model.extraction_strategy = extraction_strategy

        # TODO add support for multiple prediction heads

        self.name = name if name is not None else f"anonymous-{self.task_type}"
        self.return_class_probs = return_class_probs

        model.connect_heads_with_processor(processor.tasks,
                                           require_labels=False)
        set_all_seeds(42)

        self._set_multiprocessing_pool(num_processes)
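A minimal usage sketch for this constructor, under the assumption that a model trained and saved by one of the examples above exists at the given path (the path, input dict, and argument values here are illustrative, not from the source):

# Hypothetical usage - Inferencer.load restores the AdaptiveModel and Processor
# from the save directory and forwards these arguments to __init__.
inferencer = Inferencer.load("saved_models/bert-english-lm-tutorial",
                             task_type="embeddings",
                             extraction_strategy="cls_token",  # sentence vector from the [CLS] token
                             extraction_layer=-1,              # the very last layer
                             gpu=False,
                             num_processes=0)                  # 0 disables multiprocessing (easier debugging)
result = inferencer.inference_from_dicts(dicts=[{"text": "FARM makes transfer learning simple."}])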
Example #14
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    We do not have a sample dataset for text pair classification yet - add your own dataset to run the example
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="acc",
        label_column_name="label",
        max_seq_len=64,
        train_filename=training_filename,
        dev_filename=test_filename,
        test_filename=test_filename,
        data_dir=Path("../data"),
        tasks={"text_classification"},
        delimiter="\t")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-6,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    now = datetime.now()  # current date and time

    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        # metric="f1_weighted", mode="max",  # use f1_weighted from the dev evaluator of the trainer
        metric="loss",
        mode="min",  # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/earlystopping/" +
                      now.strftime("%m%d%Y%H%M%S")),  # where to save the best model
        patience=8  # number of evaluations to wait for improvement before terminating the training
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluating it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    # Comment this out if you are going to use a stored model instead
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    # When a new model has been trained and needs to be saved
    save_dir = Path("saved_models/text_pair_classification_model" +
                    now.strftime("%m%d%Y%H%M%S"))
    model.save(save_dir)
    processor.save(save_dir)

    # When only loading a stored model, adjust the path below to point at it
    # save_dir = Path("saved_models/text_pair_classification_model" + "01272021103548")

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    basic_texts = [
        {
            "text":
            "<claim-text>The method of claim 10, wherein the indium metal layer is 10 nm to 100 µm thick.</claim-text>",
            "text_b":
            "<p id="
            "p0001"
            " num="
            "0001"
            ">The present invention is directed to metal plating compositions and methods. More specifically, the present invention is directed to metal plating compositions and methods which provide improved leveling and throwing power.</p <p id="
            "p0039"
            " num="
            "0039"
            ">One or more conventional surfactants may be used. Typically, surfactants include, but are not limited to, nonionic surfactants such as alkyl phenoxy polyethoxyethanols. Other suitable surfactants containing multiple oxyethylene groups also may be used. Such surfactants include compounds of polyoxyethylene polymers having from as many as 20 to 150 repeating units. Such compounds also may perform as suppressors. Also included in the class of polymers are both block and random copolymers of polyoxyethylene (EO) and polyoxypropylene (PO). Surfactants may be added in conventional amounts, such as from 0.05 g/L to 20 g/L or such as from 0.5 g/L to 5 g/L.</p <p id="
            "p0040"
            " num="
            "0040"
            ">Conventional levelers include, but are not limited to, one or more of alkylated polyalkyleneimines and organic sulfo sulfonates. Examples of such compounds include, 4-mercaptopyridine, 2-mercaptothiazoline, ethylene thiourea, thiourea, 1-(2-hydroxyethyl)-2-imidazolidinethion (HIT) and alkylated polyalkyleneimines. Such levelers are included in conventional amounts. Typically, such levelers are included in amounts of 1ppb to 1 g/L, or such as from 10ppb to 500ppm.</p <p id="
            "p0042"
            " num="
            "0042"
            ">Alkali metal salts which may be included in the plating compositions include, but are not limited to, sodium and potassium salts of halogens, such as chloride, fluoride and bromide. Typically chloride is used. Such alkali metal salts are used in conventional amounts.</p <p id="
            "p0053"
            " num="
            "0053"
            ">The metal plating compositions may be used to plate a metal or metal alloy on a substrate by any method known in the art and literature. Typically, the metal or metal alloy is electroplated using conventional electroplating processes with conventional apparatus. A soluble or insoluble anode may be used with the electroplating compositions.</p <p id="
            "p0022"
            " num="
            "0022"
            ">One or more sources of metal ions are included in metal plating compositions to plate metals. The one or more sources of metal ions provide metal ions which include, but are not limited to, copper, tin, nickel, gold, silver, palladium, platinum and indium. Alloys include, but are not limited to, binary and ternary alloys of the foregoing metals. Typically, metals chosen from copper, tin, nickel, gold, silver or indium are plated with the metal plating compositions. More typically, metals chosen from copper, tin, silver or indium are plated. Most typically, copper is plated.</p <p id="
            "p0030"
            " num="
            "0030"
            ">Indium salts which may be used include, but are not limited to, one or more of indium salts of alkane sulfonic acids and aromatic sulfonic acids, such as methanesulfonic acid, ethanesulfonic acid, butane sulfonic acid, benzenesulfonic acid and toluenesulfonic acid, salts of sulfamic acid, sulfate salts, chloride and bromide salts of indium, nitrate salts, hydroxide salts, indium oxides, fluoroborate salts, indium salts of carboxylic acids, such as citric acid, acetoacetic acid, glyoxylic acid, pyruvic acid, glycolic acid, malonic acid, hydroxamic acid, iminodiacetic acid, salicylic acid, glyceric acid, succinic acid, malic acid, tartaric acid, hydroxybutyric acid, indium salts of amino acids, such as arginine, aspartic acid, asparagine, glutamic acid, glycine, glutamine, leucine, lysine, threonine, isoleucine, and valine.</p"
        },
        {
            "text":
            "<claim-text>A toner comprising: <claim-text>toner base particles; and</claim-text> <claim-text>an external additive,</claim-text> <claim-text>the toner base particles each comprising a binder resin and a colorant,</claim-text> <claim-text>wherein the external additive comprises coalesced particles,</claim-text> <claim-text>wherein the coalesced particles are each a non-spherical secondary particle in which primary particles are coalesced together, and</claim-text> <claim-text>wherein an index of a particle size distribution of the coalesced particles is expressed by the following Formula (1): <maths id="
            "math0004"
            " num="
            "(formula (1)"
            "><math display="
            "block"
            "><mfrac><msub><mi>Db</mi><mn>50</mn></msub><msub><mi>Db</mi><mn>10</mn></msub></mfrac><mo>≦</mo><mn>1.20</mn></math><img id="
            "ib0008"
            " file="
            "imgb0008.tif"
            " wi="
            "93"
            " he="
            "21"
            " img-content="
            "math"
            " img-format="
            "tif"
            "/></maths><br/> where, in a distribution diagram in which particle diameters in nm of the coalesced particles are on a horizontal axis and cumulative percentages in % by number of the coalesced particles are on a vertical axis and in which the coalesced particles are accumulated from the coalesced particles having smaller particle diameters to the coalesced particles having larger particle diameters, Db<sub>50</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 50% by number, and Db<sub>10</sub> denotes a particle diameter of the coalesced particle at which the cumulative percentage is 10% by number.</claim-text></claim-text>",
            "text_b":
            "<p id="
            "p0177"
            " num="
            "0177"
            ">For a similar reason, it is preferred that the electroconductive fine powder has a volume-average particle size of 0.5 - 5 µm, more preferably 0.8 - 5 µm, further preferably 1.1 - 5 µm and has a particle size distribution such that particles of 0.5 µm or smaller occupy at most 70 % by volume and particles of 5.0 µm or larger occupy at most 5 % by number.</p <p id="
            "p0189"
            " num="
            "0189"
            ">The volume-average particle size and particle size distribution of the electroconductive fine powder described herein are based on values measured in the following manner. A laser diffraction-type particle size distribution measurement apparatus ("
            "Model LS-230"
            ", available from Coulter Electronics Inc.) is equipped with a liquid module, and the measurement is performed in a particle size range of 0.04 - 2000 µm to obtain a volume-basis particle size distribution. For the measurement, a minor amount of surfactant is added to 10 cc of pure water and 10 mg of a sample electroconductive fine powder is added thereto, followed by 10 min. of dispersion by means of an ultrasonic disperser (ultrasonic homogenizer) to obtain a sample dispersion liquid, which is subjected to a single time of measurement for 90 sec.</p <p id="
            "p0191"
            " num="
            "0191"
            ">In the case where the electroconductive fine powder is composed of agglomerate particles, the particle size of the electroconductive fine powder is determined as the particle size of the agglomerate. The electroconductive fine powder in the form of agglomerated secondary particles can be used as well as that in the form of primary particles. Regardless of its agglomerated form, the electroconductive fine powder can exhibit its desired function of charging promotion by presence in the form of the agglomerate in the charging section at the contact position<!-- EPO <DP n="
            "85"
            "> --> between the charging member and the image-bearing member or in a region in proximity thereto.</p"
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
    model.close_multiprocessing_pool()
Example #15
0
    def train(
        self,
        data_dir: str,
        train_filename: str,
        dev_filename: Optional[str] = None,
        test_filename: Optional[str] = None,
        use_gpu: Optional[bool] = None,
        batch_size: int = 10,
        n_epochs: int = 2,
        learning_rate: float = 1e-5,
        max_seq_len: Optional[int] = None,
        warmup_proportion: float = 0.2,
        dev_split: float = 0,
        evaluate_every: int = 300,
        save_dir: Optional[str] = None,
        num_processes: Optional[int] = None,
        use_amp: Optional[str] = None,
    ):
        """
        Fine-tune a model on a TextPairClassification dataset. Options:

        - Take a plain language model (e.g. `bert-base-cased`) and train it for TextPairClassification
        - Take a TextPairClassification model and fine-tune it for your domain

        :param data_dir: Path to directory containing your training data (tab-separated text pair files)
        :param train_filename: Filename of training data
        :param dev_filename: Filename of dev / eval data
        :param test_filename: Filename of test data
        :param dev_split: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here
                          that gets split off from training data for eval.
        :param use_gpu: Whether to use GPU (if available)
        :param batch_size: Number of samples the model receives in one batch for training
        :param n_epochs: Number of iterations on the whole training data set
        :param learning_rate: Learning rate of the optimizer
        :param max_seq_len: Maximum text length (in tokens). Everything longer gets cut down.
        :param warmup_proportion: Proportion of training steps until maximum learning rate is reached.
                                  Until that point LR is increasing linearly. After that it's decreasing again linearly.
                                  Options for different schedules are available in FARM.
        :param evaluate_every: Evaluate the model every X steps on the hold-out eval dataset
        :param save_dir: Path to store the final model
        :param num_processes: The number of processes for `multiprocessing.Pool` during preprocessing.
                              Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set.
                              Set to None to use all CPU cores minus one.
        :param use_amp: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model.
                        Available options:
                        None (Don't use AMP)
                        "O0" (Normal FP32 training)
                        "O1" (Mixed Precision => Recommended)
                        "O2" (Almost FP16)
                        "O3" (Pure FP16).
                        See details on: https://nvidia.github.io/apex/amp.html
        :return: None
        """

        if dev_filename:
            dev_split = 0

        if num_processes is None:
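            # use all CPU cores minus one; the "or 1" guards single-core machines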
            num_processes = multiprocessing.cpu_count() - 1 or 1

        set_all_seeds(seed=42)

        # For these variables, by default, we use the value set when initializing the FARMRanker.
        # These can also be manually set when train() is called if you want a different value at train vs inference
        if use_gpu is None:
            use_gpu = self.use_gpu
        if max_seq_len is None:
            max_seq_len = self.max_seq_len

        device, n_gpu = initialize_device_settings(use_cuda=use_gpu,
                                                   use_amp=use_amp)

        if not save_dir:
            save_dir = f"saved_models/{self.inferencer.model.language_model.name}"

        # 1. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["0", "1"]
        metric = "f1_macro"
        processor = TextPairClassificationProcessor(
            tokenizer=self.inferencer.processor.tokenizer,
            max_seq_len=max_seq_len,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            dev_split=dev_split,
            test_filename=test_filename,
            data_dir=Path(data_dir),
            delimiter="\t")

        # 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
        # and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor,
                             batch_size=batch_size,
                             distributed=False,
                             max_processes=num_processes)

        # 3. Create an optimizer and pass the already initialized model
        model, optimizer, lr_schedule = initialize_optimizer(
            model=self.inferencer.model,
            learning_rate=learning_rate,
            schedule_opts={
                "name": "LinearWarmup",
                "warmup_proportion": warmup_proportion
            },
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device,
            use_amp=use_amp,
        )
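        # Illustrative numbers: with warmup_proportion=0.2 and 1,000 total
        # training steps, the LR rises linearly over the first 200 steps and
        # then decays linearly towards zero for the remaining 800.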
        # 4. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(model=model,
                          optimizer=optimizer,
                          data_silo=data_silo,
                          epochs=n_epochs,
                          n_gpu=n_gpu,
                          lr_schedule=lr_schedule,
                          evaluate_every=evaluate_every,
                          device=device,
                          use_amp=use_amp,
                          disable_tqdm=not self.progress_bar)

        # 5. Let it grow!
        self.inferencer.model = trainer.train()
        self.save(Path(save_dir))
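
A minimal usage sketch for the method above (hypothetical file names; `ranker` stands for the FARMRanker instance referenced in the comments, which exposes this `train()`):

    ranker.train(
        data_dir="data/text_pair",      # hypothetical directory with TSV files
        train_filename="train.tsv",     # hypothetical training file
        dev_filename="dev.tsv",         # hypothetical dev file
        n_epochs=2,
        batch_size=10,
        use_amp="O1",                   # mixed precision, as recommended in the docstring
    )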
Example #16
0
def test_text_pair_regression(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "microsoft/MiniLM-L12-H384-uncased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextPairRegressionProcessor(tokenizer=tokenizer,
                                            label_list=None,
                                            metric="f1_macro",
                                            max_seq_len=128,
                                            train_filename="sample.tsv",
                                            dev_filename="sample.tsv",
                                            test_filename=None,
                                            data_dir=Path("samples/text_pair"),
                                            delimiter="\t")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = RegressionHead()

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/text_pair_regression_model")
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text":
            ("how many times have real madrid won the champions league in a row",
             "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
             )
        },
        {
            "text": ("how many seasons of the blacklist are there on netflix",
                     "Retrieved March 27 , 2018 .")
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert np.isclose(result[0]["predictions"][0]["pred"], 0.7976, rtol=0.05)
    model.close_multiprocessing_pool()
Example #17
0
File: test_ner.py Project: skiran252/FARM
def test_ner(caplog, use_fast):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
        use_fast=use_fast,
    )

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1",
                             multithreading_rust=False)

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'LinearWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    del model
    del processor
    del optimizer
    del data_silo
    del trainer

    basic_texts = [
        {
            "text": "Paris is a town in France."
        },
    ]
    model = Inferencer.load(
        model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english",
        num_processes=0,
        task_type="ner",
        use_fast=use_fast)
    # Labels aren't correctly inserted when loading from transformers:
    # they are converted to LABEL_1 ... LABEL_N.
    # For the inference result to contain predictions we need them in IOB NER format.
    model.processor.tasks["ner"]["label_list"][-1] = "B-LOC"
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["context"] == "Paris"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
Example #18
0
def test_text_pair_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "microsoft/MiniLM-L12-H384-uncased"
    label_list = ["0", "1", "2"]

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model)

    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        max_seq_len=128,
        train_filename="sample.tsv",
        dev_filename="sample.tsv",
        test_filename=None,
        data_dir=Path("samples/text_pair"),
        delimiter="\t")

    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    trainer.train()

    save_dir = Path("testsave/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # For correct Text Pair Classification on raw dictionaries, we need to put both texts (text, text_b) into a tuple
    # See corresponding operation in the file_to_dicts method of TextPairClassificationProcessor here: https://github.com/deepset-ai/FARM/blob/5ab5b1620cb51ceb874d4b30c887e377ad1a6e9a/farm/data_handler/processor.py#L744
    basic_texts = [
        {
            "text":
            ("how many times have real madrid won the champions league in a row",
             "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
             )
        },
        {
            "text": ("how many seasons of the blacklist are there on netflix",
                     "Retrieved March 27 , 2018 .")
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["label"] == "1"
    assert np.isclose(result[0]["predictions"][0]["probability"],
                      0.3781,
                      rtol=0.05)
    model.close_multiprocessing_pool()
Example #19
0
    def train(self,
              data_dir,
              train_filename,
              dev_filename=None,
              test_file_name=None,
              use_gpu=True,
              batch_size=10,
              n_epochs=2,
              learning_rate=1e-5,
              max_seq_len=256,
              warmup_proportion=0.2,
              dev_split=0.1,
              evaluate_every=300,
              save_dir=None):
        """
        Fine-tune a model on a QA dataset. Options:
        - Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data)
        - Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain (e.g. using your labels collected via the haystack annotation tool)

        :param data_dir: Path to directory containing your training data in SQuAD style
        :param train_filename: filename of training data
        :param dev_filename: filename of dev / eval data
        :param test_file_name: filename of test data
        :param dev_split: Instead of specifying a dev_filename you can also specify a ratio (e.g. 0.1) here
                          that gets split off from training data for eval.
        :param use_gpu: Whether to use GPU (if available)
        :param batch_size: Number of samples the model receives in one batch for training
        :param n_epochs: number of iterations on the whole training data set
        :param learning_rate: learning rate of the optimizer
        :param max_seq_len: maximum text length (in tokens). Everything longer gets cut down.
        :param warmup_proportion: Proportion of training steps until maximum learning rate is reached.
                                  Until that point LR is increasing linearly. After that it's decreasing again linearly.
                                  Options for different schedules are available in FARM.
        :param evaluate_every: Evaluate the model every X steps on the hold-out eval dataset
        :param save_dir: Path to store the final model
        :return: None
        """

        if dev_filename:
            dev_split = None

        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=use_gpu)

        if not save_dir:
            save_dir = f"../../saved_models/{self.inferencer.model.language_model.name}"
        save_dir = Path(save_dir)

        # 1. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=self.inferencer.processor.tokenizer,
            max_seq_len=max_seq_len,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            dev_split=dev_split,
            test_filename=test_file_name,
            data_dir=Path(data_dir),
        )

        # 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
        # and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor,
                             batch_size=batch_size,
                             distributed=False)

        # 3. Create an optimizer and pass the already initialized model
        model, optimizer, lr_schedule = initialize_optimizer(
            model=self.inferencer.model,
            learning_rate=learning_rate,
            schedule_opts={
                "name": "LinearWarmup",
                "warmup_proportion": warmup_proportion
            },
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device)
        # 4. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )
        # 5. Let it grow!
        self.inferencer.model = trainer.train()
        self.save(save_dir)
Example #20
0
def train_from_scratch(args):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    #TODO prettify this loading of params from two sources (cmd + json)
    cmd_args = parse_arguments()
    args["local_rank"] = cmd_args.local_rank
    logging.info(f'local_rank: {args["local_rank"]}')

    next_sent_task = bool(int(args.get("next_sent_task", 1)))
    distributed = True
    use_amp = args.get("use_amp", None)
    use_amp = None if use_amp == "" else use_amp

    # Only the main process should log here
    if args["local_rank"] in [-1, 0]:
        ml_logger = StdoutLogger(tracking_uri=None)
        ml_logger.init_experiment(experiment_name="train_from_scratch",
                                  run_name="run")

    set_all_seeds(seed=39)

    device, n_gpu = initialize_device_settings(use_cuda=True,
                                               local_rank=args["local_rank"],
                                               use_amp=use_amp)
    effective_batch_size = int(args["per_gpu_batch_size"]) * int(
        args["gradient_accumulation_steps"]
    ) * torch.distributed.get_world_size()

    logging.info(
        f'Training with effective batch size of {effective_batch_size} '
        f'(per_gpu_batch_size = {int(args["per_gpu_batch_size"])}, gradient_accumulation_steps={int(args["gradient_accumulation_steps"])}, n_gpus = {torch.distributed.get_world_size()} )'
    )
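    # Illustrative numbers: per_gpu_batch_size=32, gradient_accumulation_steps=2
    # and a world size of 4 GPUs yield an effective batch size of 32 * 2 * 4 = 256.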

    save_dir = Path("/opt/ml/model")
    data_dir = Path("/opt/ml/input/data/input_channel")

    # Split and shuffle training data
    if args["local_rank"] in [-1, 0]:
        randomize_and_split_file(data_dir / args["train_file"],
                                 output_dir=data_dir / "split_files")
    # Let the other processes wait for the split files from rank 0
    torch.distributed.barrier()

    args["train_file"] = data_dir / "split_files"

    # 1. Create a tokenizer
    tokenizer = BertTokenizer(data_dir / args["vocab_file"],
                              do_lower_case=bool(int(args["do_lower_case"])))

    # 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(data_dir=data_dir,
                                     tokenizer=tokenizer,
                                     max_seq_len=int(args["max_seq_len"]),
                                     train_filename=args.get("train_file"),
                                     dev_filename=args.get("dev_file", None),
                                     test_filename=args.get("test_file", None),
                                     next_sent_pred_style=args.get(
                                         "next_sent_pred_style", "bert-style"),
                                     max_docs=args.get("max_docs", None),
                                     next_sent_pred=next_sent_task)

    # 3. Create a DataSilo that loads several datasets (train/dev/test) and provides DataLoaders for them
    data_silo = StreamingDataSilo(processor=processor,
                                  batch_size=int(args["per_gpu_batch_size"]),
                                  dataloader_workers=int(
                                      args.get("data_loader_workers", 8)),
                                  distributed=distributed)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", tokenizer.vocab_size)

    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, tokenizer.vocab_size)
    if next_sent_task:
        next_sentence_head = NextSentenceHead(num_labels=2,
                                              task_name="nextsentence")
        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[lm_prediction_head, next_sentence_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token", "per_sequence"],
            device=device,
        )
    else:
        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[lm_prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=float(args["learning_rate"]),
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": float(args["warmup_proportion"])
        },
        n_batches=len(data_silo.get_data_loader("train")),
        n_epochs=int(args["n_epochs"]),
        device=device,
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        distributed=distributed,
        use_amp=use_amp,
        local_rank=args["local_rank"])

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    if args.get("checkpoint_every"):
        checkpoint_every = int(args["checkpoint_every"])
        checkpoint_root_dir = Path("/opt/ml/checkpoints/training")
    else:
        checkpoint_every = None
        checkpoint_root_dir = None

    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=int(args["n_epochs"]),
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=int(args["evaluate_every"]),
        log_loss_every=int(args.get("log_loss_every", 500)),
        log_learning_rate=bool(int(args.get("log_learning_rate", 0))),
        device=device,
        local_rank=args["local_rank"],
        grad_acc_steps=int(args["gradient_accumulation_steps"]),
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        checkpoints_to_keep=int(args.get("checkpoints_to_keep", 10)),
        disable_tqdm=True,
        use_amp=use_amp,
    )

    # 7. Let it grow!
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
Example #21
0
File: farm.py Project: ruanjiyang/haystack
    def train(self,
              data_dir: str,
              train_filename: str,
              dev_filename: Optional[str] = None,
              test_file_name: Optional[str] = None,
              use_gpu: Optional[bool] = None,
              batch_size: int = 10,
              n_epochs: int = 2,
              learning_rate: float = 1e-5,
              max_seq_len: Optional[int] = None,
              warmup_proportion: float = 0.2,
              dev_split: float = 0,
              evaluate_every: int = 300,
              save_dir: Optional[str] = None,
              num_processes: Optional[int] = None):
        """
        Fine-tune a model on a QA dataset. Options:
        - Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data)
        - Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain (e.g. using your labels collected via the haystack annotation tool)

        :param data_dir: Path to directory containing your training data in SQuAD style
        :param train_filename: filename of training data
        :param dev_filename: filename of dev / eval data
        :param test_file_name: filename of test data
        :param dev_split: Instead of specifying a dev_filename you can also specify a ratio (e.g. 0.1) here
                          that gets split off from training data for eval. Might not work on machines with few cores.
                          The default value of 0 does not split away a dev set.
        :param use_gpu: Whether to use GPU (if available)
        :param batch_size: Number of samples the model receives in one batch for training
        :param n_epochs: number of iterations on the whole training data set
        :param learning_rate: learning rate of the optimizer
        :param max_seq_len: maximum text length (in tokens). Everything longer gets cut down.
        :param warmup_proportion: Proportion of training steps until maximum learning rate is reached.
                                  Until that point LR is increasing linearly. After that it's decreasing again linearly.
                                  Options for different schedules are available in FARM.
        :param evaluate_every: Evaluate the model every X steps on the hold-out eval dataset
        :param save_dir: Path to store the final model
        :param num_processes: The number of processes for `multiprocessing.Pool` during preprocessing.
                              Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from the train set.
                              Set to None to use all CPU cores minus one.
        :return: None
        """

        if dev_filename:
            dev_split = 0

        if num_processes is None:
            num_processes = multiprocessing.cpu_count() - 1 or 1

        set_all_seeds(seed=42)

        # For these variables, by default, we use the value set when initializing the FARMReader.
        # These can also be manually set when train() is called if you want a different value at train vs inference
        if use_gpu is None:
            use_gpu = self.use_gpu
        if max_seq_len is None:
            max_seq_len = self.max_seq_len

        device, n_gpu = initialize_device_settings(use_cuda=use_gpu)

        if not save_dir:
            save_dir = f"../../saved_models/{self.inferencer.model.language_model.name}"

        # 1. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=self.inferencer.processor.tokenizer,
            max_seq_len=max_seq_len,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            dev_split=dev_split,
            test_filename=test_file_name,
            data_dir=Path(data_dir),
        )

        # 2. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them
        # and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor,
                             batch_size=batch_size,
                             distributed=False,
                             max_processes=num_processes)

        # Quick-fix until this is fixed upstream in FARM:
        # We must avoid applying DataParallel twice (once when loading the inferencer,
        # once when calling initialize_optimizer)
        self.inferencer.model.save("tmp_model")
        model = BaseAdaptiveModel.load(load_dir="tmp_model",
                                       device=device,
                                       strict=True)
        shutil.rmtree('tmp_model')

        # 3. Create an optimizer and pass the already initialized model
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            # model=self.inferencer.model,
            learning_rate=learning_rate,
            schedule_opts={
                "name": "LinearWarmup",
                "warmup_proportion": warmup_proportion
            },
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device)
        # 4. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 5. Let it grow!
        self.inferencer.model = trainer.train()
        self.save(Path(save_dir))
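
A minimal usage sketch for the method above (assuming haystack's FARMReader, from which this code is taken; file names are hypothetical):

    reader = FARMReader(model_name_or_path="deepset/bert-base-cased-squad2")
    reader.train(
        data_dir="data/squad_style",    # hypothetical directory
        train_filename="answers.json",  # hypothetical SQuAD-style annotation file
        n_epochs=2,
        save_dir="my_qa_model",
    )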
Example #22
0
def test_lm_finetuning(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 5
    evaluate_every = 2
    lang_model = "bert-base-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False, never_split_chars=["-", "_"]
    )

    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        max_seq_len=64,
    )
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    language_model = Bert.load(lang_model)
    lm_prediction_head = BertLMHead.load(lang_model)
    next_sentence_head = NextSentenceHead.load(lang_model)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )
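    # Note: the call above exercises an older FARM API in which
    # initialize_optimizer returned (optimizer, warmup_linear) and the Trainer
    # below takes warmup_linear plus a model argument to train(); newer FARM
    # versions return (model, optimizer, lr_schedule) as in the other examples.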

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    model = trainer.train(model)

    save_dir = "testsave/lm_finetuning"
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Farmer's life is great."},
        {"text": "It's nothing for big city kids though."},
    ]
    model = Inferencer.load(save_dir, embedder_only=True)
    result = model.extract_vectors(dicts=basic_texts)
    assert result[0]["context"] == ['Farmer', "'", 's', 'life', 'is', 'great', '.']
    assert result[0]["vec"].shape == (768,)
    assert (result[0]["vec"][0] - 0.3826) < 0.01
Example #23
0
def question_answering():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_question_answering")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 2
    evaluate_every = 2000
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)
    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=False)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Question Answering
    prediction_head = QuestionAnsweringHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts={
            "name": "LinearWarmup",
            "warmup_proportion": 0.2
        },
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    QA_input = [{
        "questions": ["Who counted the game among the best ever made?"],
        "text":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]

    model = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input)[0]

    pprint.pprint(result)
    model.close_multiprocessing_pool()

    # 10. Do Inference on whole SQuAD Dataset & write the predictions file to disk
    filename = os.path.join(processor.data_dir, processor.dev_filename)
    result = model.inference_from_file(file=filename, return_json=False)
    result_squad = [x.to_squad_eval() for x in result]

    write_squad_predictions(predictions=result_squad,
                            predictions_filename=filename,
                            out_filename="predictions.json")
Example #24
0
def test_nq(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "distilbert-base-uncased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model,
                               do_lower_case=True)
    processor = NaturalQuestionsProcessor(tokenizer=tokenizer,
                                          max_seq_len=20,
                                          doc_stride=10,
                                          max_query_length=6,
                                          train_filename="train_sample.jsonl",
                                          dev_filename="dev_sample.jsonl",
                                          data_dir=Path("samples/nq"))

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(base_LM_model)
    qa_head = QuestionAnsweringHead()
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device)
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)
    trainer.train()
    save_dir = Path("testsave/nq")
    model.save(save_dir)
    processor.save(save_dir)

    inferencer = Inferencer.load(save_dir,
                                 batch_size=2,
                                 gpu=False,
                                 num_processes=0)
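
    # The inferencer accepts two equivalent input schemas for QA:
    # {"questions": [...], "text": ...} and {"qas": [...], "context": ...}.
    # The assertion at the end checks that both produce identical predictions.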

    qa_format_1 = [{
        "questions": ["Who counted the game among the best ever made?"],
        "text":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }]
    qa_format_2 = [{
        "qas": ["Who counted the game among the best ever made?"],
        "context":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created.",
    }]

    result1 = inferencer.inference_from_dicts(dicts=qa_format_1)
    result2 = inferencer.inference_from_dicts(dicts=qa_format_2)
    assert result1 == result2
Example #25
0
    def ner(self, task, model_type, n_epochs, batch_size, evaluate_every,
            use_cuda):
        aml_run = he.get_context()
        # Check task
        if cu.tasks.get(str(task)).get('type') != 'ner':
            raise Exception('NOT A NER TASK')
        language = cu.params.get('language')

        # Data
        dt_task = dt.Data(task=task)

        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=use_cuda)
        lang_model = he.get_farm_model(model_type, language)
        save_dir = dt_task.get_path('model_dir')
        # ner_labels = dt_task.load('fn_label', header=None)[0].to_list()
        ner_labels = [
            "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
            "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
        ]

        # n_epochs = 4
        # batch_size = 32
        # evaluate_every = 750
        # lang_model =  "xlm-roberta-large"

        # AML log
        try:
            aml_run.log('task', task)
            aml_run.log('language', language)
            aml_run.log('n_epochs', n_epochs)
            aml_run.log('batch_size', batch_size)
            aml_run.log('lang_model', lang_model)
            aml_run.log_list('label_list', ner_labels)
        except:
            pass

        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                                   do_lower_case=False)

        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        processor = NERProcessor(tokenizer=tokenizer,
                                 max_seq_len=128,
                                 data_dir=dt_task.data_dir,
                                 metric="seq_f1",
                                 label_list=ner_labels)

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => NER
        prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=1e-5,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device,
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 7. Let it grow
        trainer.train()

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)
Example #26
0
def test_ner_amp(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "1980 kam der Crown von Toyota"
        },
    ]
    model = Inferencer.load(save_dir, gpu=True)
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
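
The `use_amp='O1'` value is one of NVIDIA apex's optimization levels ('O0' = pure FP32, 'O1' = conservative mixed precision via patched ops, 'O2' = "almost FP16", 'O3' = pure FP16). A sketch of how the `AMP_AVAILABLE` guard is typically defined, assuming apex is an optional dependency:

    # Sketch: apex is optional, so probe for it at import time.
    try:
        from apex import amp  # noqa: F401
        AMP_AVAILABLE = True
    except ImportError:
        AMP_AVAILABLE = False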
Example #27
    def __init__(self,
                 model,
                 processor,
                 task_type,
                 batch_size=4,
                 gpu=False,
                 name=None,
                 return_class_probs=False,
                 extraction_strategy=None,
                 extraction_layer=None,
                 s3e_stats=None,
                 num_processes=None,
                 disable_tqdm=False):
        """
        Initializes Inferencer from an AdaptiveModel and a Processor instance.

        :param model: AdaptiveModel to run in inference mode
        :type model: AdaptiveModel
        :param processor: A dataset specific Processor object which will turn input (file or dict) into a Pytorch Dataset.
        :type processor: Processor
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param batch_size: Number of samples computed once per batch
        :type batch_size: int
        :param gpu: If GPU shall be used
        :type gpu: bool
        :param name: Name for the current Inferencer model, displayed in the REST API
        :type name: str
        :param return_class_probs: whether to return the probability distribution over all labels instead of only the probability of the predicted label
        :type return_class_probs: bool
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), 'reduce_max' (sentence vector), 'per_token' (individual token vectors),
                               's3e' (sentence vector via S3E pooling, see https://arxiv.org/abs/2002.09620)
        :type extraction_strategy: str
        :param extraction_layer: Number of the layer from which embeddings shall be extracted. Default: -1 (the last layer).
        :type extraction_layer: int
        :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                          (only needed for task_type="embeddings" and extraction_strategy = "s3e")
        :type s3e_stats: dict
        :param num_processes: The number of processes for `multiprocessing.Pool`.
                              Set to 1 (or 0) to disable multiprocessing.
                              Set to None to let the Inferencer use all CPU cores minus one.
                              If you want to debug the language model, you might need to disable multiprocessing!
                              **Warning!** If you use multiprocessing you have to close the
                              `multiprocessing.Pool` again! To do so call
                              :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are
                              done using this class. The garbage collector will not do this for you!
        :type num_processes: int
        :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
        :type disable_tqdm: bool
        :return: An instance of the Inferencer.

        """
        # Init device and distributed settings
        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)

        self.processor = processor
        self.model = model
        self.model.eval()
        self.batch_size = batch_size
        self.device = device
        self.language = self.model.get_language()
        self.task_type = task_type
        self.disable_tqdm = disable_tqdm

        if task_type == "embeddings":
            if extraction_layer is None or extraction_strategy is None:
                logger.warning(
                    "Using task_type='embeddings', but couldn't find one of the args `extraction_layer` and `extraction_strategy`. "
                    "Since FARM 0.4.2, you set both when initializing the Inferencer and then call inferencer.inference_from_dicts() instead of inferencer.extract_vectors()"
                )
            self.model.prediction_heads = torch.nn.ModuleList([])
            self.model.language_model.extraction_layer = extraction_layer
            self.model.language_model.extraction_strategy = extraction_strategy
            self.model.language_model.s3e_stats = s3e_stats

        # TODO add support for multiple prediction heads

        self.name = name if name is not None else f"anonymous-{self.task_type}"
        self.return_class_probs = return_class_probs

        model.connect_heads_with_processor(processor.tasks,
                                           require_labels=False)
        set_all_seeds(42)

        self._set_multiprocessing_pool(num_processes)
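
For illustration, a minimal usage sketch that follows the docstring's pool warning (the model name and extraction settings are assumptions, mirroring the embeddings branch above):

    # Sketch: build an Inferencer for embedding extraction, then close its pool.
    inferencer = Inferencer.load("bert-base-cased",
                                 task_type="embeddings",
                                 gpu=False,
                                 extraction_strategy="cls_token",
                                 extraction_layer=-1)
    vectors = inferencer.inference_from_dicts(dicts=[{"text": "FARM is fun"}])
    inferencer.close_multiprocessing_pool()  # the garbage collector won't do this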
Example #28
def train_from_scratch():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="")
    ml_logger.init_experiment(experiment_name="from_scratch", run_name="debug")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    evaluate_every = 5000
    vocab_size = 30522
    # dev_filename = None
    save_dir = Path("saved_models/train_from_scratch")

    n_epochs = 10
    learning_rate = 1e-4
    warmup_proportion = 0.05
    batch_size = 16  # (probably only possible via gradient accumulation steps)
    max_seq_len = 64

    data_dir = Path("data/lm_finetune_nips")
    train_filename = "train.txt"
    dev_filename = "dev.txt"

    # 1.Create a tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=data_dir,
        tokenizer=tokenizer, max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    #    calculates a few descriptive statistics of our datasets
    stream_data_silo = StreamingDataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.from_scratch("bert", vocab_size)

    # b) and *two* prediction heads on top that are suited for our task => Language Model finetuning
    lm_prediction_head = BertLMHead(768, vocab_size)
    next_sentence_head = NextSentenceHead([768, 2], task_name="nextsentence")
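    # Head dimensions: 768 is the hidden size of BERT-base; the LM head projects
    # to vocab_size logits for masked-token prediction, while [768, 2] feeds the
    # binary is-next / not-next decision of the next-sentence head.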

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[lm_prediction_head, next_sentence_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Create an optimizer

    # calculate approx. number of training batches for the streaming data silo
    total_lines, empty_lines = 0, 0
    with open(data_dir / train_filename) as f:  # single pass, file handle closed cleanly
        for line in f:
            total_lines += 1
            if line == "\n":
                empty_lines += 1
    n_batches = int((total_lines - (2 * empty_lines)) / batch_size)
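    # Worked instance of the estimate (hypothetical counts): with 1,000,000
    # total lines, 50,000 of them empty document separators, and batch_size=16:
    # n_batches = (1,000,000 - 2 * 50,000) / 16 = 56,250.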

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts={"name": "LinearWarmup", "warmup_proportion": warmup_proportion},
        n_batches=n_batches,
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=8,
    )

    # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        grad_acc_steps=8,
        checkpoint_root_dir=Path("saved_models/train_from_scratch/checkpoints"),
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)
Example #29
def doc_classification_cola():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 100
    evaluate_every = 20
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # Here we load the CoLA 2018 dataset (Corpus of Linguistic Acceptability).

    label_list = ["0", "1"]
    metric = "mcc"

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=64,
                                            data_dir=Path("../data/cola"),
                                            dev_filename=Path("dev.tsv"),
                                            dev_split=None,
                                            test_filename=None,
                                            label_list=label_list,
                                            metric=metric,
                                            label_column_name="label"
                                            )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)

    # language_model = Roberta.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])],
        class_weights=data_silo.calculate_class_weights(task_name="text_classification"))
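    # CoLA's labels are imbalanced, so per-class weights computed from the
    # training split rebalance the loss.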

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/bert-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {"text": "The box contained the ball from the tree."},
        {"text": "I'll fix you a drink."},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
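
`metric = "mcc"` refers to the Matthews correlation coefficient, the standard CoLA score. A toy illustration of what it measures (scikit-learn assumed available; the labels are made up):

    # Sketch: MCC is 0 for a constant predictor even on skewed data,
    # unlike plain accuracy.
    from sklearn.metrics import matthews_corrcoef
    y_true = [1, 1, 1, 0]
    y_pred = [1, 1, 1, 1]  # constant predictor: 75% accuracy ...
    print(matthews_corrcoef(y_true, y_pred))  # ... but MCC = 0.0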
Example #30
def text_pair_classification():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_text_pair_classification")

    ##########################
    ########## Settings ######
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset.
    # The TextPairClassificationProcessor expects a tab-separated file with columns called "text", "text_b" and "label"
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        max_seq_len=128,
        train_filename="train.tsv",
        dev_filename="dev.tsv",
        test_filename=None,
        data_dir=Path("../data/asnq_binary"),
        delimiter="\t")
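    # A hypothetical illustration of the expected train.tsv layout (tab-separated):
    #   text<TAB>text_b<TAB>label
    #   who won the cup<TAB>The cup was won by Spain .<TAB>1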

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task
    prediction_head = TextClassificationHead(num_labels=len(label_list))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_sequence_continuous"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    # 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = Path("saved_models/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    #    Add your own text adapted to the dataset you provide
    # For correct Text Pair Classification on raw dictionaries (inference mode), we need to put both
    # texts (text, text_b) into a tuple.
    # See corresponding conversion in the file_to_dicts() method of TextPairClassificationProcessor: https://github.com/deepset-ai/FARM/blob/5ab5b1620cb51ceb874d4b30c887e377ad1a6e9a/farm/data_handler/processor.py#L744
    basic_texts = [
        {
            "text":
            ("how many times have real madrid won the champions league in a row",
             "They have also won the competition the most times in a row, winning it five times from 1956 to 1960"
             )
        },
        {
            "text": ("how many seasons of the blacklist are there on netflix",
                     "Retrieved March 27 , 2018 .")
        },
    ]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)

    print(result)
    model.close_multiprocessing_pool()