Python AdaptiveModel.inference_from_file示例

编程语言: Python

命名空间/包名称: farm.modeling.adaptive_model

类/类型: AdaptiveModel

方法/功能: inference_from_file

hotexamples.com的示例: 5

Python AdaptiveModel.inference_from_file - 已找到5个示例。这些是从开源项目中提取的最受好评的farm.modeling.adaptive_model.AdaptiveModel.inference_from_file真实Python示例。您可以对示例进行评价，以帮助我们提高示例质量。

常用方法

显示隐藏

save(30)

AdaptiveModel(30)

inference_from_dicts(24)

load(22)

convert_from_transformers(18)

close_multiprocessing_pool(16)

run_inference(7)

inference_from_file(5)

convert_to_onnx(3)

extract_vectors(2)

close_multiprcessing_pool(1)

connect_heads_with_processor(1)

half(1)

示例#1

显示文件

def question_answering():
    """Fine-tune a RoBERTa question-answering model on SQuAD 2.0 files and run inference.

    The function configures logging and MLflow tracking, builds the tokenizer,
    processor and data silo, trains an AdaptiveModel carrying a
    QuestionAnsweringHead, stores model and processor, and finally reloads them
    through QAInferencer to answer one sample question and to predict on the
    whole dev file (written out as "predictions.json").
    """
    log_format = "%(asctime)s - %(levelname)s - %(name)s -   %(message)s"
    logging.basicConfig(format=log_format, datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO)

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering")

    # ------------------------------------------------------------------
    # Settings
    # ------------------------------------------------------------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    pretrained_name = "roberta-base"
    lower_case_input = False  # RoBERTa is a cased model
    squad_train_file = "train-v2.0.json"
    squad_dev_file = "dev-v2.0.json"
    num_epochs = 2
    train_batch_size = 24
    eval_interval = 2000

    # Step 1: tokenizer matching the pretrained language model.
    qa_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=pretrained_name,
        do_lower_case=lower_case_input,
    )

    # Step 2: processor that converts the raw SQuAD files into a PyTorch dataset.
    qa_processor = SquadProcessor(
        tokenizer=qa_tokenizer,
        max_seq_len=384,
        label_list=["start_token", "end_token"],
        metric="squad",
        train_filename=squad_train_file,
        dev_filename=squad_dev_file,
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # Step 3: data silo that loads the datasets and exposes DataLoaders for them.
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that
    # they are calculated on a token level instead of a word level.
    silo = DataSilo(processor=qa_processor, batch_size=train_batch_size, distributed=False)

    # Step 4: AdaptiveModel = pretrained language model + question-answering head.
    backbone = LanguageModel.load(pretrained_name)
    qa_head = QuestionAnsweringHead()
    adaptive_model = AdaptiveModel(
        language_model=backbone,
        prediction_heads=[qa_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # Step 5: optimizer with a linear warm-up learning-rate schedule.
    warmup_schedule = {"name": "LinearWarmup", "warmup_proportion": 0.2}
    adaptive_model, optimizer, lr_schedule = initialize_optimizer(
        model=adaptive_model,
        learning_rate=3e-5,
        schedule_opts=warmup_schedule,
        n_batches=len(silo.loaders["train"]),
        n_epochs=num_epochs,
        device=device,
    )

    # Step 6: the Trainer drives training and evaluates every `eval_interval` steps.
    qa_trainer = Trainer(
        model=adaptive_model,
        optimizer=optimizer,
        data_silo=silo,
        epochs=num_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=eval_interval,
        device=device,
    )

    # Step 7: train. Tracked metrics go to the public MLflow server:
    # https://public-mlflow.deepset.ai
    qa_trainer.train()

    # Step 8: persist model and processor side by side.
    output_dir = Path("../saved_models/bert-english-qa-tutorial")
    adaptive_model.save(output_dir)
    qa_processor.save(output_dir)

    # Step 9: reload for inference and answer a single sample question.
    sample_question = {
        "qas": ["Who counted the game among the best ever made?"],
        "context":
        "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    }
    inferencer = QAInferencer.load(output_dir, batch_size=40, gpu=True)
    first_answer = inferencer.inference_from_dicts(dicts=[sample_question])[0]
    pprint.pprint(first_answer)

    # Step 10: predict on the whole dev file and write the predictions to disk.
    dev_path = os.path.join(qa_processor.data_dir, qa_processor.dev_filename)
    raw_predictions = inferencer.inference_from_file(file=dev_path, return_json=False)
    squad_predictions = [prediction.to_squad_eval() for prediction in raw_predictions]
    write_squad_predictions(
        predictions=squad_predictions,
        predictions_filename=dev_path,
        out_filename="predictions.json",
    )

示例#2

显示文件

文件： xlmr_qa_demo.py 项目： antocapp/FARM

def xlmr_qa_demo():
    """Train an XLM-RoBERTa question-answering model and/or run file inference with it.

    Two local switches control what happens:

    * ``train``: fine-tune ``xlm-roberta-large`` on the SQuAD 2.0 files in
      ``data_dir`` and save model and processor to ``save_dir``.
    * ``inference``: load the saved model, predict on ``inference_file`` and
      write two JSON files into ``save_dir``: ``predictions.json`` (first
      element of the first prediction per id) and ``full_predictions.json``
      (all predictions per id).
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="run_xmlr_qa")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 3
    grad_acc_steps = 8
    n_epochs = 2
    evaluate_every = 200
    base_LM_model = "xlm-roberta-large"

    data_dir = Path("../data/squad20")
    train_filename = Path("train-v2.0.json")
    dev_filename = Path("dev-v2.0.json")

    save_dir = Path("../saved_models/xlmr-large-qa")

    inference_file = Path("../data/MLQA_V1/dev/dev-context-de-question-de.json")
    predictions_file = save_dir / "predictions.json"
    full_predictions_file = save_dir / "full_predictions.json"
    max_processes_for_inference = 8
    train = True
    inference = False

    if train:
        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)
        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=384,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=None,
            data_dir=data_dir,
            dev_split=0.0
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False, max_processes=1)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(base_LM_model, n_added_tokens=3)
        # b) and a prediction head on top that is suited for our task => Question Answering
        prediction_head = QuestionAnsweringHead()

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=3e-5,
            schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=device
        )

        # 6. Feed everything to the Trainer, which takes care of growing our model and evaluates it from time to time
        trainer = Trainer(
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )
        # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
        model = trainer.train(model)

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)


    if inference:
        model = Inferencer.load(save_dir, batch_size=32, gpu=True)
        full_result = model.inference_from_file(
            file=inference_file,
            max_processes=max_processes_for_inference,
        )

        for x in full_result:
            print(x)
            print()

        # Build the compact view first: `full_result` is rebound on the next line.
        result = {r["id"]: r["preds"][0][0] for r in full_result}
        full_result = {r["id"]: r["preds"] for r in full_result}

        # Context managers guarantee the files are flushed and closed. The
        # explicit UTF-8 encoding is needed because ensure_ascii=False writes
        # non-ASCII characters verbatim, which the platform default encoding
        # may not be able to represent.
        with open(predictions_file, "w", encoding="utf-8") as predictions_fh:
            json.dump(result,
                      predictions_fh,
                      indent=4,
                      ensure_ascii=False)
        with open(full_predictions_file, "w", encoding="utf-8") as full_predictions_fh:
            json.dump(full_result,
                      full_predictions_fh,
                      indent=4,
                      ensure_ascii=False)

示例#3

显示文件

文件： question_answering_debug.py 项目： yizhiwan/FARM

        epochs=n_epochs,
        n_gpu=n_gpu,
        warmup_linear=warmup_linear,
        evaluate_every=evaluate_every,
        device=device,
    )
    # 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
    model = trainer.train(model)

    # 8. Hooray! You have a model. Store it:
    model.save(save_dir)
    processor.save(save_dir)

if inference:
    model = Inferencer.load(save_dir, batch_size=32, gpu=True)
    full_result = model.inference_from_file(
        file=inference_file, use_multiprocessing=inference_multiprocessing)

    for x in full_result:
        print(x)
        print()

    result = {r["id"]: r["preds"][0][0] for r in full_result}
    full_result = {r["id"]: r["preds"] for r in full_result}

    json.dump(result,
              open(predictions_file, "w"),
              indent=4,
              ensure_ascii=False)
    json.dump(full_result,
              open(full_predictions_file, "w"),
              indent=4,

示例#4

显示文件

def text_pair_classification():
    """Train a BERT text-pair classifier on MSMARCO passage data and evaluate it.

    The function optionally reformats the raw MSMARCO files, trains an
    AdaptiveModel with a TextClassificationHead, saves model and processor,
    runs file inference on the dev set, writes the raw predictions and hands
    them to the MSMARCO evaluation helper. The inferencer's multiprocessing
    pool is closed at the end.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_text_pair_classification",
    )

    # ------------------------------------------------------------------
    # Settings
    # ------------------------------------------------------------------
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    pretrained_name = "bert-base-cased"
    class_labels = ["0", "1"]
    num_epochs = 2
    train_batch_size = 64
    eval_interval = 500
    train_file = "train.tsv"
    dev_file = "dev_200k.tsv"

    # The source data can be found here https://github.com/microsoft/MSMARCO-Passage-Ranking
    rebuild_data = False
    msmarco_dir = Path("../data/msmarco_passage")
    raw_predictions_name = "predictions_raw.txt"
    final_predictions_name = "predictions.txt"
    dev_path = msmarco_dir / dev_file
    qrels_path = msmarco_dir / "qrels.dev.tsv"

    # Step 0: convert the MSMARCO sources into FARM-readable tsv files with
    # 3 columns (text, text_b and label). Only needs to be done once!
    if rebuild_data:
        reformat_msmarco_train(
            msmarco_dir / "triples.train.1m.tsv",
            msmarco_dir / train_file,
        )
        reformat_msmarco_dev(
            msmarco_dir / "queries.dev.tsv",
            msmarco_dir / "collection.tsv",
            qrels_path,
            msmarco_dir / "top1000.dev",
            dev_path,
        )

    # Step 1: tokenizer for the cased BERT model.
    pair_tokenizer = Tokenizer.load(pretrained_model_name_or_path=pretrained_name, do_lower_case=False)

    # Step 2: processor that turns raw text pairs into a PyTorch dataset.
    # Evaluation during training runs on a slice of the train set (dev_split);
    # the MSMARCO dev set is kept for the final evaluation.
    pair_processor = TextPairClassificationProcessor(
        tokenizer=pair_tokenizer,
        label_list=class_labels,
        metric="f1_macro",
        train_filename=train_file,
        test_filename=None,
        dev_split=0.001,
        max_seq_len=128,
        data_dir=msmarco_dir,
        delimiter="\t",
    )

    # Step 3: data silo that loads the datasets and exposes DataLoaders for them.
    silo = DataSilo(processor=pair_processor, batch_size=train_batch_size)

    # Step 4: AdaptiveModel = pretrained language model + classification head
    # weighted by the class weights computed from the silo.
    backbone = LanguageModel.load(pretrained_name)
    label_count = len(class_labels)
    weights = silo.calculate_class_weights(task_name="text_classification")
    classifier_head = TextClassificationHead(num_labels=label_count, class_weights=weights)
    pair_model = AdaptiveModel(
        language_model=backbone,
        prediction_heads=[classifier_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device,
    )

    # Step 5: optimizer and learning-rate schedule.
    pair_model, optimizer, lr_schedule = initialize_optimizer(
        model=pair_model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(silo.loaders["train"]),
        n_epochs=num_epochs,
    )

    # Step 6: the Trainer drives training and evaluates every `eval_interval` steps.
    pair_trainer = Trainer(
        model=pair_model,
        optimizer=optimizer,
        data_silo=silo,
        epochs=num_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=eval_interval,
        device=device,
    )

    # Step 7: train.
    pair_trainer.train()

    # Step 8: persist model and processor side by side.
    output_dir = Path("saved_models/passage_ranking_model")
    pair_model.save(output_dir)
    pair_processor.save(output_dir)

    # Step 9: reload for inference and score the whole dev file.
    inferencer = Inferencer.load(output_dir, gpu=True, max_seq_len=128, batch_size=128)
    dev_predictions = inferencer.inference_from_file(dev_path)

    raw_predictions_path = output_dir / raw_predictions_name
    write_msmarco_results(dev_predictions, raw_predictions_path)

    msmarco_evaluation(
        preds_file=raw_predictions_path,
        dev_file=dev_path,
        qrels_file=qrels_path,
        output_file=output_dir / final_predictions_name,
    )

    inferencer.close_multiprocessing_pool()

示例#5

显示文件

# Script excerpt: `trainer`, `model` and `processor` are created earlier in the
# file, outside this excerpt.
# 7. Run training; `model` is rebound to whatever `trainer.train` returns.
model = trainer.train(model)

# 8. Hooray! You have a model. Store it:
# Model and processor are saved into the same directory.
save_dir = "../saved_models/bert-english-qa-tutorial"
model.save(save_dir)
processor.save(save_dir)

# 9. Load it & harvest your fruits (Inference)
# One input dict with a list of questions and the text to search.
# NOTE(review): the "questions"/"text" key names are presumably the schema the
# installed FARM version expects - confirm against the library version in use.
QA_input = [
        {
            "questions": ["Who counted the game among the best ever made?"],
            "text":  "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
        }]

# From here on `model` refers to the loaded Inferencer, not the trained model.
model = Inferencer.load(save_dir, batch_size=40, gpu=True)
result = model.inference_from_dicts(dicts=QA_input)

# Pretty-print every returned prediction.
for x in result:
    pprint.pprint(x)

# 10. Do Inference on whole SQuAD Dataset & write the predictions file to disk
# The dev file path is rebuilt from the processor's own data_dir/dev_filename.
filename = os.path.join(processor.data_dir,processor.dev_filename)
result = model.inference_from_file(file=filename)
# The dev file is passed as `predictions_filename`; output goes to "predictions.json".
write_squad_predictions(
    predictions=result,
    predictions_filename=filename,
    out_filename="predictions.json"
)