예제 #1
0
    def convert_from_transformers(cls, model_name_or_path, device, task_type):
        """
        Load a (downstream) model from huggingface's transformers format. Use cases:
         - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
         - compare models without switching frameworks
         - use model directly for inference

        :param model_name_or_path: local path of a saved model or name of a public one.
                                              Exemplary public names:
                                              - distilbert-base-uncased-distilled-squad
                                              - deepset/bert-large-uncased-whole-word-masking-squad2

                                              See https://huggingface.co/models for full list
        :param device: "cpu" or "cuda"
        :param task_type: One of :
                          - 'question_answering'
                          - 'text_classification'
                          - 'embeddings'
                          More tasks coming soon ...
        :return: AdaptiveModel
        """
        lm = LanguageModel.load(model_name_or_path)
        #TODO Infer type of head automatically from config

        if task_type == "question_answering":
            ph = QuestionAnsweringHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[ph],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types="per_token",
                                 device=device)
        elif task_type == "text_classification":
            ph = TextClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[ph],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types="per_sequence",
                                 device=device)
        elif task_type == "ner":
            ph = TokenClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[ph],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types="per_token",
                                 device=device)
        elif task_type == "embeddings":
            adaptive_model = cls(language_model=lm,
                                 prediction_heads=[],
                                 embeds_dropout_prob=0.1,
                                 lm_output_types=["per_token", "per_sequence"],
                                 device=device)
        else:
            raise NotImplementedError(
                f"Huggingface's transformer models of type {task_type} are not supported yet"
            )

        return adaptive_model
예제 #2
0
    def convert_from_transformers(cls, model_name_or_path, device, task_type, processor=None):
        """
        Load a (downstream) model from huggingface's transformers format. Use cases:
         - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
         - compare models without switching frameworks
         - use model directly for inference

        :param model_name_or_path: local path of a saved model or name of a public one.
                                              Exemplary public names:
                                              - distilbert-base-uncased-distilled-squad
                                              - deepset/bert-large-uncased-whole-word-masking-squad2

                                              See https://huggingface.co/models for full list
        :param device: "cpu" or "cuda"
        :param task_type: One of :
                          - 'question_answering'
                          - 'text_classification'
                          - 'embeddings'
                          More tasks coming soon ...
        :param processor: populates prediction head with information coming from tasks
        :type processor: Processor
        :return: AdaptiveModel
        """
        lm = LanguageModel.load(model_name_or_path)
        # TODO Infer type of head automatically from config

        if task_type == "question_answering":
            ph = QuestionAnsweringHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                 lm_output_types="per_token", device=device)
        elif task_type == "text_classification":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationhead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment.")
                raise NotImplementedError
            ph = TextClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                 lm_output_types="per_sequence", device=device)
        elif task_type == "ner":
            ph = TokenClassificationHead.load(model_name_or_path)
            adaptive_model = cls(language_model=lm, prediction_heads=[ph], embeds_dropout_prob=0.1,
                                 lm_output_types="per_token", device=device)
        elif task_type == "embeddings":
            adaptive_model = cls(language_model=lm, prediction_heads=[], embeds_dropout_prob=0.1,
                                 lm_output_types=["per_token", "per_sequence"], device=device)
        else:
            raise NotImplementedError(f"Huggingface's transformer models of type {task_type} are not supported yet")

        if processor:
            adaptive_model.connect_heads_with_processor(processor.tasks)

        return adaptive_model
예제 #3
0
def test_multiple_prediction_heads():
    model = "bert-base-german-cased"
    lm = LanguageModel.load(model)
    ph1 = TextClassificationHead(num_labels=3, label_list=["negative", "neutral", "positive"])
    ph2 = TokenClassificationHead(num_labels=3, label_list=["PER", "LOC", "ORG"])
    adaptive_model = AdaptiveModel(language_model=lm, prediction_heads=[ph1, ph2], embeds_dropout_prob=0.1,
                                   lm_output_types="per_token", device="cpu")
    transformer_models = Converter.convert_to_transformers(adaptive_model)
    assert isinstance(transformer_models[0], BertForSequenceClassification)
    assert isinstance(transformer_models[1], BertForTokenClassification)
    del lm
    del transformer_models
    del adaptive_model
예제 #4
0
파일: ner.py 프로젝트: yizhiwan/FARM
processor = NERProcessor(tokenizer=tokenizer,
                         max_seq_len=128,
                         data_dir="../data/conll03-de",
                         metric="seq_f1",
                         label_list=ner_labels)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(
    task_name="ner",
    layer_dims=[768, len(processor.tasks["ner"]["label_list"])])

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],
    device=device,
)

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    warmup_proportion=0.1,
예제 #5
0
파일: ner.py 프로젝트: yon606/FARM
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM",
                              run_name="Run_ner")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=128,
                             data_dir=Path("../data/conll03-de"),
                             delimiter=" ",
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = "saved_models/bert-german-ner-tutorial"
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
예제 #6
0
파일: test_ner.py 프로젝트: theintz/FARM
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 8
    evaluate_every = 50
    lang_model = "bert-base-german-cased"

    tokenizer = BertTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model, do_lower_case=False)

    processor = GermEval14Processor(tokenizer=tokenizer,
                                    max_seq_len=64,
                                    data_dir="samples/ner",
                                    train_file="train-sample.txt",
                                    dev_file="dev-sample.txt",
                                    test_file=None)

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = Bert.load(lang_model)
    prediction_head = TokenClassificationHead(
        layer_dims=[768, len(processor.label_list)])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_examples=data_silo.n_samples("train"),
        batch_size=batch_size,
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = "testsave/ner"
    model = trainer.train(model)
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text":
            "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
    ]
    model = Inferencer(save_dir)
    result = model.run_inference(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "Tagesspiegel,"
    assert abs(result[0]["predictions"][0]["probability"] - 0.213869) <= 0.0001
예제 #7
0
파일: test_ner.py 프로젝트: jucho2725/FARM
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )

    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH",
                  "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/ner",
        train_filename="train-sample.txt",
        dev_filename="dev-sample.txt",
        test_filename=None,
        delimiter=" ",
        label_list=ner_labels,
        metric="seq_f1"
    )

    data_silo = DataSilo(processor=processor, batch_size=batch_size)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(layer_dims=[768, len(ner_labels)])

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        warmup_proportion=0.1,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = "testsave/ner"
    model = trainer.train(model)
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    assert result[0]["predictions"][0]["context"] == "sagte"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
 def execML(self, job):
     start_time = time.time()
     if job.task == 'analyse':
         basic_texts = []
         # Will donwload and store dataset...
         sample = self.downloadAndConvertText(job, job.data_sample)
         for text in sample.encode('utf-8').splitlines():
             basic_texts.append({'text': text.decode('utf-8')})
         # Will donwload and store model...
         self.downloadAndStoreZIPModel(job, job.model)
         self.updateJobStatus(job, 'analysing')
         save_dir = 'tmp/' + job.model['id']
         model = Inferencer.load(save_dir)
         result = model.inference_from_dicts(dicts=basic_texts)
         self.persistResult(job, result)
         model.close_multiprocessing_pool()
         self.updateJobStatus(job, 'completed')
     elif job.task == 'train':
         self.updateJobStatus(job, 'training')
         # Will donwload and store dataset...
         self.downloadAndStoreZIPDataset(job, job.data_source)
         # Will donwload and store model...
         self.downloadAndStoreZIPModel(job, job.model)
         set_all_seeds(seed=42)
         device, n_gpu = initialize_device_settings(use_cuda=True)
         n_epochs = 4
         evaluate_every = 400
         do_lower_case = False
         batch_size = 32
         lang_model = os.path.join(Path.cwd(), 'tmp', job.model['id'])
         ner_labels = [
             "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
             "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
         ]
         # 1. Create a tokenizer
         tokenizer = Tokenizer.load(
             pretrained_model_name_or_path=lang_model,
             do_lower_case=do_lower_case,
             tokenizer_class='BertTokenizer'
         )  #tokenizer_class='BertTokenizer'
         # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
         processor = NERProcessor(tokenizer=tokenizer,
                                  max_seq_len=128,
                                  data_dir=str(
                                      os.path.join(Path.cwd(), 'tmp',
                                                   job.data_source['id'])),
                                  delimiter=' ',
                                  metric='seq_f1',
                                  label_list=ner_labels)
         # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
         data_silo = DataSilo(processor=processor,
                              batch_size=batch_size,
                              max_processes=1)
         # 4. Create an AdaptiveModel
         # 4.1 which consists of a pretrained language model as a basis
         language_model = LanguageModel.load(lang_model)
         # 4.2 and a prediction head on top that is suited for our task => NER
         prediction_head = TokenClassificationHead(
             num_labels=len(ner_labels))
         model = AdaptiveModel(
             language_model=language_model,
             prediction_heads=[prediction_head],
             embeds_dropout_prob=0.1,
             lm_output_types=['per_token'],
             device=device,
         )
         # 5. Create an optimizer
         model, optimizer, lr_schedule = initialize_optimizer(
             model=model,
             learning_rate=1e-5,
             n_batches=len(data_silo.loaders["train"]),
             n_epochs=n_epochs,
             device=device,
         )
         # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
         trainer = Trainer(
             model=model,
             optimizer=optimizer,
             data_silo=data_silo,
             epochs=n_epochs,
             n_gpu=n_gpu,
             lr_schedule=lr_schedule,
             evaluate_every=evaluate_every,
             device=device,
         )
         # 7. Let it grow
         trainer.train()
         # 8. Hooray! You have a model. Store it:
         newModelId = str(uuid.uuid4())
         save_dir = 'tmp/' + newModelId
         model.save(save_dir)
         processor.save(save_dir)
         model.close_multiprocessing_pool()
         self.persistZIPModel(newModelId, job)
         self.updateJobStatus(job, 'completed')
     elapsed_time = time.time() - start_time
     print('Execution time max: ',
           elapsed_time,
           'for job.id:',
           job.id,
           flush=True)
     return {'status': True, 'code': 'ok', 'msg': 'success'}
예제 #9
0
파일: test_ner.py 프로젝트: yon606/FARM
def test_ner(caplog):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'LinearWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Paris is a town in France."
        },
    ]
    model = Inferencer.load(
        model_name_or_path="dbmdz/bert-base-cased-finetuned-conll03-english",
        num_processes=0,
        task_type="ner")
    # labels arent correctly inserted from transformers
    # They are converted to LABEL_1 ... LABEL_N
    # For the inference result to contain predictions we need them in IOB NER format
    model.processor.tasks["ner"]["label_list"][-1] = "B-LOC"
    result = model.inference_from_dicts(dicts=basic_texts)

    assert result[0]["predictions"][0]["context"] == "Paris"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
예제 #10
0
from farm.evaluation.metrics import register_metrics
register_metrics('f1_weighted', custom_f1_score)

metric = 'f1_weighted'
processor.add_task(name="document_level_task", label_list=LABEL_LIST, metric="acc", text_column_name="text", label_column_name="label", task_type="classification")
processor.add_task(name="token_level_task", label_list=TRIGGER_LABELS, metric=metric, text_column_name="text", label_column_name="tokens", task_type="ner")


data_silo = DataSilo(processor=processor,
                    batch_size=BATCH_SIZE
                    )

language_model = LanguageModel.load(LANG_MODEL)

document_level_task_head = TextClassificationHead(num_labels=len(LABEL_LIST), task_name="document_level_task")
token_level_task_head = TokenClassificationHead(num_labels=len(TRIGGER_LABELS), task_name="token_level_task")

model = AdaptiveModel(
  language_model=language_model,
  prediction_heads=[document_level_task_head, token_level_task_head],
  embeds_dropout_prob=EMBEDS_DROPOUT_PROB,
  lm_output_types=["per_sequence", "per_token"],
  device=DEVICE,
  loss_aggregation_fn=my_loss_agg)

model, optimizer, lr_schedule = initialize_optimizer(
  model=model,
  device=DEVICE,
  learning_rate=LEARNING_RATE,
  n_batches=len(data_silo.loaders["train"]),
  n_epochs=N_EPOCHS)
예제 #11
0
    def ner(self, task, model_type, n_epochs, batch_size, evaluate_every,
            use_cude):
        aml_run = he.get_context()
        # Check task
        if cu.tasks.get(str(task)).get('type') != 'ner':
            raise Exception('NOT A NER TASK')
        language = cu.params.get('language')

        # Data
        dt_task = dt.Data(task=task)

        set_all_seeds(seed=42)
        device, n_gpu = initialize_device_settings(use_cuda=True)
        lang_model = he.get_farm_model(model_type, language)
        save_dir = dt_task.get_path('model_dir')
        # ner_labels = dt_task.load('fn_label', header=None)[0].to_list()
        ner_labels = [
            "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
            "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
        ]

        # n_epochs = 4
        # batch_size = 32
        # evaluate_every = 750
        # lang_model =  "xlm-roberta-large"

        # AML log
        try:
            aml_run.log('task', task)
            aml_run.log('language', language)
            aml_run.log('n_epochs', n_epochs)
            aml_run.log('batch_size', batch_size)
            aml_run.log('lang_model', lang_model)
            aml_run.log_list('label_list', label_list)
        except:
            pass

        # 1.Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                                   do_lower_case=False)

        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        processor = NERProcessor(tokenizer=tokenizer,
                                 max_seq_len=128,
                                 data_dir=dt_task.data_dir,
                                 metric="seq_f1",
                                 label_list=ner_labels)

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(lang_model)
        # b) and a prediction head on top that is suited for our task => NER
        prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=1e-5,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            device=device,
        )

        # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
        trainer = Trainer(
            model=model,
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 7. Let it grow
        trainer.train()

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)
예제 #12
0
    def convert_from_transformers(model_name_or_path,
                                  device,
                                  revision=None,
                                  task_type=None,
                                  processor=None,
                                  **kwargs):
        """
        Load a (downstream) model from huggingface's transformers format. Use cases:
         - continue training in FARM (e.g. take a squad QA model and fine-tune on your own data)
         - compare models without switching frameworks
         - use model directly for inference

        :param model_name_or_path: local path of a saved model or name of a public one.
                                              Exemplary public names:
                                              - distilbert-base-uncased-distilled-squad
                                              - deepset/bert-large-uncased-whole-word-masking-squad2

                                              See https://huggingface.co/models for full list
        :param device: "cpu" or "cuda"
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param task_type: One of :
                          - 'question_answering'
                          - 'text_classification'
                          - 'embeddings'
                          More tasks coming soon ...
        :param processor: populates prediction head with information coming from tasks
        :type processor: Processor
        :return: AdaptiveModel
        """

        lm = LanguageModel.load(model_name_or_path,
                                revision=revision,
                                **kwargs)
        if task_type is None:
            # Infer task type from config
            architecture = lm.model.config.architectures[0]
            if "MaskedLM" in architecture:
                task_type = "lm"
            elif "QuestionAnswering" in architecture:
                task_type = "question_answering"
            elif "SequenceClassification" in architecture:
                if lm.model.config.num_labels == 1:
                    task_type = "regression"
                else:
                    task_type = "text_classification"
            elif "TokenClassification" in architecture:
                task_type = "ner"
            else:
                logger.error(
                    "Could not infer task type from model config. Please provide task type manually. "
                    "('lm', 'question_answering', 'regression', 'text_classification', 'ner' or 'embeddings')"
                )

        if task_type == "lm":
            ph = BertLMHead.load(model_name_or_path,
                                 revision=revision,
                                 **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "question_answering":
            ph = QuestionAnsweringHead.load(model_name_or_path,
                                            revision=revision,
                                            **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "regression":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Regression with Roberta or XLMRoberta not possible at the moment."
                )
                raise NotImplementedError
            ph = RegressionHead.load(model_name_or_path, **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_sequence",
                                              device=device)

        elif task_type == "text_classification":
            if "roberta" in model_name_or_path:
                # The RobertaClassificationHead has components: input2dense, dropout, tanh, dense2output
                # The tanh function cannot be mapped to current FARM style linear Feed Forward PredictionHeads.
                logger.error(
                    "Conversion for Text Classification with Roberta or XLMRoberta not possible at the moment."
                )
                raise NotImplementedError
            ph = TextClassificationHead.load(model_name_or_path,
                                             revision=revision,
                                             **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_sequence",
                                              device=device)

        elif task_type == "ner":
            ph = TokenClassificationHead.load(model_name_or_path,
                                              revision=revision,
                                              **kwargs)
            adaptive_model = am.AdaptiveModel(language_model=lm,
                                              prediction_heads=[ph],
                                              embeds_dropout_prob=0.1,
                                              lm_output_types="per_token",
                                              device=device)

        elif task_type == "embeddings":
            adaptive_model = am.AdaptiveModel(
                language_model=lm,
                prediction_heads=[],
                embeds_dropout_prob=0.1,
                lm_output_types=["per_token", "per_sequence"],
                device=device)

        if processor:
            adaptive_model.connect_heads_with_processor(processor.tasks)

        return adaptive_model
예제 #13
0
def ner(task: str, lm: str):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=use_cuda)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda,
                                               use_amp=use_amp)
    n_epochs = 10
    batch_size = 32
    evaluate_every = 1000
    model_dir = MODEL_DIR
    if lm == 'bert-hgcrw':
        lang_model = "redewiedergabe/bert-base-historical-german-rw-cased"
        model_dir += '_bert-hgcrw'
    elif lm == 'lmgot01':
        lang_model = Path(
            "/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_01")
        model_dir += '_lmgot01'
    elif lm == 'lmgot02':
        lang_model = Path(
            "/home/stud/wangsadirdja/pyfarmbert/models/lm/lmgot_02")
        model_dir += '_lmgot02'
    else:
        lang_model = "bert-base-german-cased"
    if task != 'all':
        model_dir += '_' + task
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    if task == 'direct':
        ner_labels = ["[PAD]", "X", "O", "B-DIR", "I-DIR"]
    elif task == 'indirect':
        ner_labels = ["[PAD]", "X", "O", "B-IND", "I-IND"]
    elif task == 'reported':
        ner_labels = ["[PAD]", "X", "O", "B-REP", "I-REP"]
    else:
        ner_labels = [
            "[PAD]", "X", "O", "B-DIR", "I-DIR", "B-IND", "I-IND", "B-REP",
            "I-REP"
        ]

    data_dir = DATA_DIR
    if task != 'all':
        data_dir += task + '/'
    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=64,
                             data_dir=Path(data_dir),
                             delimiter="\t",
                             metric="seq_f1",
                             label_list=ner_labels)

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_loader_worker = 1
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = model_dir
    model.save(save_dir)
    processor.save(save_dir)
예제 #14
0
파일: test_ner_amp.py 프로젝트: wwmmqq/FARM
def test_ner_amp(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    if AMP_AVAILABLE:
        use_amp = 'O1'
    else:
        use_amp = None

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename=Path("train-sample.txt"),
                             dev_filename=Path("dev-sample.txt"),
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.1,
                          lm_output_types=["per_token"],
                          device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-05,
        schedule_opts=None,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
        use_amp=use_amp)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "1980 kam der Crown von Toyota"
        },
    ]
    model = Inferencer.load(save_dir, gpu=True)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    #print(result)
    assert result[0]["predictions"][0]["context"] == "Crown"
    assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
예제 #15
0
파일: test_ner.py 프로젝트: wwmmqq/FARM
def test_ner(caplog):
    caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "distilbert-base-german-cased"

    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=False)

    ner_labels = [
        "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
        "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
    ]

    processor = NERProcessor(tokenizer=tokenizer,
                             max_seq_len=8,
                             data_dir=Path("samples/ner"),
                             train_filename="train-sample.txt",
                             dev_filename="dev-sample.txt",
                             test_filename=None,
                             delimiter=" ",
                             label_list=ner_labels,
                             metric="seq_f1")

    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         max_processes=1)
    language_model = LanguageModel.load(lang_model)
    prediction_head = TokenClassificationHead(num_labels=13)

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        #optimizer_opts={'name': 'AdamW', 'lr': 2E-05},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=1,
        device=device,
        schedule_opts={
            'name': 'LinearWarmup',
            'warmup_proportion': 0.1
        })
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    save_dir = Path("testsave/ner")
    model = trainer.train()
    model.save(save_dir)
    processor.save(save_dir)

    basic_texts = [
        {
            "text": "Albrecht Lehman ist eine Person"
        },
    ]
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts, max_processes=1)
    #print(result)
    #assert result[0]["predictions"][0]["context"] == "sagte"
    #assert isinstance(result[0]["predictions"][0]["probability"], np.float32)
    result2 = model.inference_from_dicts(dicts=basic_texts,
                                         rest_api_schema=True)
    assert result == result2
예제 #16
0
파일: ner.py 프로젝트: tholor/FARM
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=lang_model, do_lower_case=False)

# 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
processor = NERProcessor(tokenizer=tokenizer,
                         max_seq_len=128,
                         data_dir="../data/conll03-de")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = Bert.load(lang_model)
# b) and a prediction head on top that is suited for our task => NER
prediction_head = TokenClassificationHead(
    layer_dims=[768, len(processor.label_list)])

model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],
    device=device,
)

# 5. Create an optimizer
optimizer, warmup_linear = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    n_batches=len(data_silo.loaders["train"]),
예제 #17
0
def ner():
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42, deterministic_cudnn=True)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-cased"
    do_lower_case = False

    # 1.Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
    )

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    processor = NERProcessor(
        tokenizer=tokenizer, max_seq_len=128, data_dir=Path(DATA_DIR), delimiter=" ", metric="seq_f1", label_list=ner_labels
    )

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
    data_loader_worker = 15
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=data_loader_worker)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Feed everything to the Trainer, which keeps care of growing our model into powerful plant and evaluates it from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model. Store it:
    save_dir = MODEL_DIR
    model.save(save_dir)
    processor.save(save_dir)