def build_task_data(params: Params, data_supplier) -> SeqTagTaskData:
    """Assemble the datasets and FARM components needed for a sequence-tagging run.

    Args:
        params: Run parameters; stored unchanged under ``task_data["params"]``.
        data_supplier: Zero-argument callable returning a mapping from split
            name to a list of tagged sequences. Each tagged sequence is
            iterated as ``(token, tag)`` pairs.

    Returns:
        A ``SeqTagTaskData`` carrying the raw datasets plus a ``task_data``
        dict with the label count, language-model name, MLflow logger,
        processor and ``params``.

    Raises:
        KeyError: If the ``HOME`` environment variable is not set.
    """
    dataset_dict: Dict[str, List[TaggedSequence]] = data_supplier()

    # Collect every tag that occurs anywhere in the supplied data.
    observed_tags = {
        tag
        for taggedseqs in dataset_dict.values()
        for taggedseq in taggedseqs
        for _tok, tag in taggedseq
    }
    # Sort the tags: set iteration order for strings is not stable between
    # interpreter runs, so an unsorted list would map tags to different label
    # ids on every run.
    ner_labels = ["[PAD]", NIT] + sorted(observed_tags)

    ml_logger = MLFlowLogger(tracking_uri=os.environ["HOME"] + "/data/mlflow_experiments/mlruns")
    ml_logger.init_experiment(experiment_name="Sequence_Tagging", run_name="Run_ner")

    lang_model = "bert-base-cased"
    do_lower_case = False
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=None,  # noqa
        metric="seq_f1",
        label_list=ner_labels,
    )

    task_data = {
        "num_labels": len(ner_labels),
        "lang_model": lang_model,
        "ml_logger": ml_logger,
        "processor": processor,
        "params": params,
    }
    return SeqTagTaskData(data=dataset_dict, task_data=task_data)
def eval_question_similarity(y_true, y_pred, lang, model_name, params, user=None, log_to_mlflow=True, run_name="default"):
    """Score question-similarity predictions and optionally log them to MLflow.

    Computes the mean absolute difference, ROC AUC, and F1 (on predictions
    rounded to 0 decimals), prints the metrics dict, and, when requested,
    logs parameters and metrics to the public MLflow server.

    Args:
        y_true: Gold labels. Must support elementwise subtraction with
            ``y_pred`` (presumably a numpy array — confirm with callers).
        y_pred: Predicted scores; must provide a ``round`` method.
        lang: Language identifier, logged as the ``lang`` parameter.
        model_name: Model identifier, logged as the ``model_name`` parameter.
        params: Mapping of additional parameters to log. It is not modified.
        user: Optional user name; logged as ``user`` when truthy.
        log_to_mlflow: Whether to send parameters and metrics to MLflow.
        run_name: Name of the MLflow run.
    """
    # basic metrics
    mean_diff = np.mean(np.abs(y_true - y_pred))
    roc_auc = roc_auc_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred.round(0))
    metrics = {"roc_auc": roc_auc, "mean_abs_diff": mean_diff, "f1_score": f1}
    print(metrics)

    # log experiment results to MLFlow (visit https://public-mlflow.deepset.ai/)
    if log_to_mlflow:
        # Work on a copy so the caller's dict is not changed as a side effect
        # of evaluation.
        logged_params = dict(params)
        logged_params["lang"] = lang
        logged_params["model_name"] = model_name
        if user:
            logged_params["user"] = user

        ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
        ml_logger.init_experiment(experiment_name="COVID-question-sim", run_name=run_name)
        ml_logger.log_params(logged_params)
        ml_logger.log_metrics(metrics, step=0)
def main():
    """Run every experiment described by the listed benchmark config files.

    Each config file is expanded via ``load_experiments``. For every resulting
    experiment a banner is logged, an MLflow run is opened from the
    experiment's ``logging`` settings, and ``run_experiment`` is called.
    """
    benchmark_configs = (
        "experiments/ner/conll2003_de_config.json",
        "experiments/ner/germEval14_config.json",
        "experiments/text_classification/germEval18Fine_config.json",
        "experiments/text_classification/germEval18Coarse_config.json",
        "experiments/text_classification/gnad_config.json",
        "experiments/qa/squad20_config.json",
    )

    for config_path in benchmark_configs:
        for experiment in load_experiments(config_path):
            banner = (
                "\n***********************************************"
                f"\n************* Experiment: {experiment.task.name} ************"
                "\n************************************************"
            )
            logger.info(banner)

            # One MLflow run per experiment, configured from the experiment itself.
            tracker = MLFlowLogger(tracking_uri=experiment.logging.mlflow_url)
            tracker.init_experiment(
                experiment_name=experiment.logging.mlflow_experiment,
                run_name=experiment.logging.mlflow_run_name,
                nested=experiment.logging.mlflow_nested,
            )
            run_experiment(experiment)
def doc_classifcation():
    """Set up INFO-level logging and open the MLflow run for document classification."""
    log_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
    date_format = "%m/%d/%Y %H:%M:%S"
    logging.basicConfig(format=log_format, datefmt=date_format, level=logging.INFO)

    # Metrics are tracked on the public deepset MLflow server.
    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification",
    )
def train_from_scratch():
    """Pretrain a BERT-style language model from scratch on streamed text files.

    Reads the local rank from the parsed command-line arguments, splits and
    shuffles ``data/test/train.txt`` into ``data/split_files``, trains an
    ``AdaptiveModel`` with a masked-LM head and a next-sentence head through a
    checkpointing ``Trainer``, and saves model and processor to
    ``saved_models/train_from_scratch``. When a local rank other than -1 is
    given, the torch process group is destroyed at the end.
    """
    args = parse_arguments()
    # using "O2" here allows roughly 30% larger batch_sizes and 45% speed up
    use_amp = "O2"

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    # Only the main process should log here
    if args.local_rank in [-1, 0]:
        tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
        tracker.init_experiment(experiment_name="train_from_scratch", run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(
        use_cuda=True,
        local_rank=args.local_rank,
        use_amp=use_amp,
    )

    # --- Paths ---------------------------------------------------------
    save_dir = Path("saved_models/train_from_scratch")
    data_dir = Path("data/test")
    checkpoint_root_dir = Path("checkpoints")
    split_dir = Path("data/split_files")

    # Option A) just using a single file
    # train_filename = "train.txt"
    # Option B) (recommended when using StreamingDataSilo):
    # split and shuffle that file to have random order within and across epochs
    randomize_and_split_file(data_dir / "train.txt", output_dir=split_dir, docs_per_file=1000)
    train_filename = split_dir
    dev_filename = "dev.txt"

    # --- Hyperparameters -----------------------------------------------
    distributed = args.local_rank != -1
    max_seq_len = 128
    batch_size = 8  # if distributed: this is per_gpu
    grad_acc = 1
    learning_rate = 1e-4
    warmup_proportion = 0.05
    n_epochs = 2
    evaluate_every = 15000
    log_loss_every = 2
    checkpoint_every = 500
    checkpoints_to_keep = 4
    next_sent_pred_style = "bert-style"  # or "sentence"
    max_docs = None

    # Choose enough workers to queue sufficient batches during training.
    # Optimal number depends on your GPU speed, CPU speed and number of cores
    # 16 works well on a 4x V100 machine with 16 cores (AWS: p3.8xlarge). For a single GPU you will need less.
    data_loader_workers = 1

    # 1. Tokenizer
    tokenizer = Tokenizer.load("bert-base-uncased", do_lower_case=True)

    # 2. DataProcessor converting raw text into a PyTorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=data_dir,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        next_sent_pred_style=next_sent_pred_style,
        max_docs=max_docs,
    )

    # 3. Streaming DataSilo providing the DataLoaders for the datasets
    # stream_data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=distributed)
    stream_data_silo = StreamingDataSilo(
        processor=processor,
        batch_size=batch_size,
        distributed=distributed,
        dataloader_workers=data_loader_workers,
    )

    # 4. AdaptiveModel:
    # a) a freshly initialised language model as the basis
    vocab_size = tokenizer.vocab_size
    language_model = LanguageModel.from_scratch("bert", vocab_size)
    # b) *two* prediction heads on top: masked LM and next sentence
    heads = [
        BertLMHead(768, vocab_size),
        NextSentenceHead(num_labels=2, task_name="nextsentence"),
    ]
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=heads,
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Optimizer and learning-rate schedule
    warmup_schedule = {"name": "LinearWarmup", "warmup_proportion": warmup_proportion}
    n_train_batches = len(stream_data_silo.get_data_loader("train"))
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts=warmup_schedule,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=grad_acc,
        distributed=distributed,
        use_amp=use_amp,
        local_rank=args.local_rank,
    )

    # 6. Trainer, created fresh or resumed from a checkpoint
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        log_loss_every=log_loss_every,
        device=device,
        grad_acc_steps=grad_acc,
        local_rank=args.local_rank,
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        checkpoints_to_keep=checkpoints_to_keep,
        use_amp=use_amp,
    )

    # 7. Train. Tracked metrics are visible on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Store the trained model and its processor
    model.save(save_dir)
    processor.save(save_dir)

    if args.local_rank != -1:
        torch.distributed.destroy_process_group()
def doc_classification_cola():
    """Fine-tune a Chinese BERT model for seven-class text classification.

    Loads tokenizer and language model from a local path, holds out 10% of
    the training data as the dev set, trains for three epochs with CUDA
    disabled, and saves model and processor to the output directory.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 3
    batch_size = 8
    evaluate_every = 450
    # Path to the Chinese BERT model.
    # Model download URL: https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz
    lang_model = "/bert-base-chinese"
    do_lower_case = False

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # 2. DataProcessor converting raw text into a pytorch Dataset
    label_list = ["城乡建设", "卫生计生", "商贸旅游", "劳动和社会保障", "教育文体", "交通运输", "环境保护"]
    metric = "acc"

    # Folder holding the classification data. Data format: first column is the
    # text separated character by character, second column is the label, with a
    # tab between them. The first row must contain "text" and "label".
    data_dir = Path("/BERT留言分类数据集")
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=507,
        data_dir=data_dir,
        dev_filename=None,  # Path("dev.tsv"),
        dev_split=0.1,
        test_filename="/BERT留言分类数据集/test.tsv",
        label_list=label_list,
        metric=metric,
        label_column_name="label",
    )

    # 3. DataSilo loading the datasets (train/dev/test) and providing DataLoaders
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. AdaptiveModel:
    # a) a pretrained language model as the basis
    language_model = LanguageModel.load(lang_model)
    # language_model = Roberta.load(lang_model)
    # b) a text classification head on top, weighted by class frequency
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=class_weights,
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # 5. Optimizer and learning-rate schedule
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
    )

    # 6. Trainer that runs training and periodic evaluation
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train
    trainer.train()

    # 8. Store the trained model and its processor
    save_dir = Path("/BERT文本分类输出的模型")
    model.save(save_dir)
    processor.save(save_dir)
def dense_passage_retrieval():
    """Train a dense passage retrieval bi-encoder and evaluate it on the test file.

    Builds separate query and passage tokenizers, a ``TextSimilarityProcessor``
    over the DPR-format files in ``../data/retriever``, and a
    ``BiAdaptiveModel`` with a dot-product similarity head. After training,
    the model and processor are saved to ``../saved_models/dpr-tutorial`` and,
    if a test loader exists, the model is evaluated on it.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="FARM-dense_passage_retrieval", run_name="Run_dpr")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)

    # Training schedule
    batch_size = 4
    n_epochs = 3
    evaluate_every = 1000
    distributed = False  # enable for multi GPU training via DDP

    # Encoders and tokenization
    question_lang_model = "facebook/dpr-question_encoder-single-nq-base"
    passage_lang_model = "facebook/dpr-ctx_encoder-single-nq-base"
    do_lower_case = True
    use_fast = True

    # Data
    train_filename = "nq-train.json"
    dev_filename = "nq-dev.json"
    test_filename = "nq-dev.json"
    embed_title = True
    num_hard_negatives = 1
    max_samples = None  # load a smaller dataset (e.g. for debugging)

    similarity_function = "dot_product"

    # For multi GPU Training via DDP we need to get the local rank
    args = parse_arguments()
    device, n_gpu = initialize_device_settings(use_cuda=True, local_rank=args.local_rank)

    # 1. Question and passage tokenizers
    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=question_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast,
    )
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast,
    )

    # 2. DataProcessor converting raw text into a pytorch Dataset
    # data_dir "data/retriever" should contain DPR training and dev files downloaded from https://github.com/facebookresearch/DPR
    # i.e., nq-train.json, nq-dev.json or trivia-train.json, trivia-dev.json
    label_list = ["hard_negative", "positive"]
    metric = "text_similarity_metric"
    processor = TextSimilarityProcessor(
        query_tokenizer=query_tokenizer,
        passage_tokenizer=passage_tokenizer,
        max_seq_len_query=64,
        max_seq_len_passage=256,
        label_list=label_list,
        metric=metric,
        data_dir="../data/retriever",
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        embed_title=embed_title,
        num_hard_negatives=num_hard_negatives,
        max_samples=max_samples,
    )

    # 3. DataSilo loading the datasets (train/dev/test) and providing DataLoaders
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=distributed)

    # 4. BiAdaptiveModel:
    # a) two pretrained language models as the basis
    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRQuestionEncoder",
    )
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path="bert-base-uncased",
        language_model_class="DPRContextEncoder",
    )
    # b) a text similarity head on top
    prediction_head = TextSimilarityHead(similarity_function=similarity_function)

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    # 5. Optimizer and learning-rate schedule
    adamw_opts = {
        "name": "TransformersAdamW",
        "correct_bias": True,
        "weight_decay": 0.0,
        "eps": 1e-08,
    }
    warmup_opts = {"name": "LinearWarmup", "num_warmup_steps": 100}
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        optimizer_opts=adamw_opts,
        schedule_opts=warmup_opts,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        grad_acc_steps=1,
        device=device,
        distributed=distributed,
    )

    # 6. Trainer that runs training and periodic evaluation
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train. Tracked metrics are visible on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Store the trained model and its processor
    save_dir = Path("../saved_models/dpr-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Evaluate on the test set, when one is available
    test_data_loader = data_silo.get_data_loader("test")
    if test_data_loader is None:
        return
    evaluator_test = Evaluator(
        data_loader=test_data_loader,
        tasks=data_silo.processor.tasks,
        device=device,
    )
    model.connect_heads_with_processor(processor.tasks)
    evaluator_test.eval(model)
from farm.experiment import initialize_optimizer
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import RegressionHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

# Script-level setup for a document regression run: configure INFO-level
# logging and open an MLflow run on the public deepset tracking server.
# NOTE(review): `logging` is not imported in the visible part of this script —
# confirm an `import logging` precedes this fragment.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_regression")

##########################
########## Settings
##########################
# Seed the random number generators before any device or model setup.
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 5
batch_size = 32
evaluate_every = 30
lang_model = "bert-base-cased"

# 1.Create a tokenizer
# NOTE(review): other functions in this file build tokenizers with
# `Tokenizer.load(...)` — confirm `from_pretrained` exists in the installed
# FARM version.
tokenizer = Tokenizer.from_pretrained(pretrained_model_name_or_path=lang_model, do_lower_case=False)
def finetune_sentence_level(args):
    """Fine-tune and evaluate a sentence-level model, then run a small inference demo.

    Sets up logging (optionally to a file) and MLflow, builds tokenizer,
    processor and data silo from ``args``, then either runs k-fold evaluation
    (``args.folds > 1``) or trains on a single split and evaluates on the test
    loader. Finally a ``MultitaskInferencer`` is loaded from ``args.save_dir``
    and applied to two example sentences.

    Args:
        args: Namespace-like configuration object. This function reads many
            attributes from it and also writes ``logger``, ``device``,
            ``n_gpu`` and (when ``do_feat_embeds`` is set) ``feat_size`` onto it.

    NOTE(review): the original indentation of this function was lost; the
    nesting of the per-metric logging loop and of the final inference section
    was reconstructed from context — confirm against the upstream file.
    """
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        datefmt="%d-%m-%y %H:%M:%S",
        level=logging.INFO)
    # The logger is stored on args so downstream helpers can use it.
    args.logger = logging.getLogger(__name__)
    if args.do_logfile:
        # Additionally write log records to <log_dir>/<run_name>.log
        filehandler = logging.FileHandler(
            os.path.join(args.log_dir, f"{args.run_name}.log"))
        args.logger.addHandler(filehandler)
    # Record the full configuration at the start of the run.
    args.logger.info(vars(args))

    # Setup MLFlow
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name=args.experiment_name,
                              run_name=args.run_name)

    set_all_seeds(seed=args.seed)
    args.device, args.n_gpu = initialize_device_settings(use_cuda=True)

    # Create a tokenizer
    # The tokenizer class name is derived from the model class name when one is given.
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    tokenizer = CustomTokenizer.load(
        pretrained_model_name_or_path=args.model_name,
        do_lower_case=args.do_lower_case,
        tokenizer_class=tok_class)

    # Create a processor for the dataset
    processor = load_processor(args, tokenizer)

    # Create a DataSilo that loads several datasets (train/dev/test)
    # provides DataLoaders and calculates descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=args.batch_size)

    if args.do_feat_embeds:
        args.feat_size = processor.feat_size

    # We do cross-validation
    if args.folds > 1:
        evaluate_kfold(args, data_silo, processor)
    else:
        # Single split: train once, then evaluate on the test loader.
        adapt_model = train_on_split(args, data_silo, processor)

        evaluator_test = MultitaskEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=args.device)
        result = evaluator_test.eval(adapt_model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test", steps=len(
            data_silo.get_data_loader("test")))

        # Collects per-task prediction/label columns when predictions are saved.
        pred_tsv = pd.DataFrame()

        args.logger.info("Test results:")
        # The first entry of `result` is skipped here.
        # NOTE(review): presumably an aggregate entry — confirm what result[0] holds.
        for res in result[1:]:
            args.logger.info(f"__{res['task_name']}__")
            if args.train_mode == "classification":
                metrics = classification_metrics(res.get("preds"), res.get("labels"))
                args.logger.info(metrics)
            else:
                metrics = regression_metrics(res.get("preds"), res.get("labels"))
                for metric in metrics.keys():
                    args.logger.info(f"{metric}: {metrics[metric]}")

            if args.save_predictions:
                pred_tsv[f"{res['task_name']}_preds"] = res.get("preds")[0]
                pred_tsv[f"{res['task_name']}_labels"] = res.get("labels")[0]

        if args.save_predictions:
            save_tsv(pred_tsv, os.path.join(args.out_dir, f"{args.run_name}.tsv"))

    # Load trained model and perform inference
    dicts = [
        {
            "text": "The intense interest aroused in the public has now somewhat subsided."
        },
        {
            "text": "The quick brown fox jumped over the lazy dog."
        },
    ]
    model = MultitaskInferencer.load(args.save_dir, gpu=True, level="sentence")
    result = model.inference_from_dicts(dicts=dicts)
    args.logger.info("Inference example:")
    args.logger.info(result)
def doc_classifcation():
    """Train a fasttext-based document classifier on the GermEval 2018 coarse labels.

    Converts a pretrained fasttext model into a fixed-vocabulary embedding
    model, then trains an ``AdaptiveModel`` with a text classification head
    on ``train.tsv`` (no dev split) and evaluates periodically via the Trainer.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification_fasttext",
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    # load fasttext from a local path:
    # fasttext_model = "../saved_models/fasttext-german-uncased"
    # or through s3
    fasttext_model = "fasttext-german-uncased"
    do_lower_case = True
    max_features = 10_000  # maximum number of unique words we will transform
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. To make Fasttext work within FARM and with advanced aggregation strategies,
    # we need a fixed vocabulary and associated Wordembeddings
    converted_model_dir = Path("../saved_models/fasttext-german-uncased-converted")
    ft_converter = Fasttext_converter(
        pretrained_model_name_or_path=fasttext_model,
        do_lower_case=do_lower_case,
        data_path=Path("../data/germeval18"),
        train_filename="train.tsv",
        output_path=converted_model_dir,
        language="German",
        max_features=max_features,
    )
    # Convert the data to have a fixed size vocab and embeddings
    ft_converter.convert_on_data()

    # 2. Tokenizer loaded from the converted embedding model
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=ft_converter.output_path,
        do_lower_case=do_lower_case,
    )

    # 3. DataProcessor converting raw text into a pytorch Dataset
    # GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=ft_converter.data_path,
        label_list=label_list,
        train_filename=ft_converter.train_filename,
        dev_split=0,
        test_filename="test.tsv",
        metric=metric,
        label_column_name="coarse_label",
    )

    # 4. DataSilo loading the datasets (train/dev/test) and providing DataLoaders.
    # multiprocessing with WordembeddingTokenizer is not optimal - so disable it
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    # 5. AdaptiveModel:
    # a) the newly created embedding model as the basis
    language_model = LanguageModel.load(ft_converter.output_path)
    # b) a text classification head on top. Since we do not have a powerful
    # Transformer based Language Model, we need a slightly deeper NN
    n_labels = len(label_list)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, n_labels],
        class_weights=class_weights,
        num_labels=n_labels,
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # 6. Optimizer and learning-rate schedule
    # len(data_silo.loaders["train"]), streaming: len(data_silo.get_data_loader("train"))
    n_train_batches = len(data_silo.get_data_loader("train"))
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-3,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
    )

    # 7. Trainer that runs training and periodic evaluation
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 8. Train
    trainer.train()
def question_answering():
    """Fine-tune ``roberta-base`` on SQuAD 2.0 and write dev-set predictions.

    Trains an ``AdaptiveModel`` with a question answering head, saves it to
    ``../saved_models/bert-english-qa-tutorial``, reloads it as a
    ``QAInferencer``, pretty-prints the answer for one example question, and
    finally writes predictions for the whole dev file to ``predictions.json``.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 24
    n_epochs = 2
    evaluate_every = 2000
    lang_model = "roberta-base"
    do_lower_case = False  # roberta is a cased model
    train_filename = "train-v2.0.json"
    dev_filename = "dev-v2.0.json"

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # 2. DataProcessor converting raw text into a pytorch Dataset
    label_list = ["start_token", "end_token"]
    metric = "squad"
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        label_list=label_list,
        metric=metric,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
        data_dir=Path("../data/squad20"),
    )

    # 3. DataSilo loading the datasets (train/dev/test) and providing DataLoaders
    # NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level
    data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)

    # 4. AdaptiveModel:
    # a) a pretrained language model as the basis
    language_model = LanguageModel.load(lang_model)
    # b) a question answering head on top
    prediction_head = QuestionAnsweringHead()

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Optimizer and learning-rate schedule
    warmup_opts = {"name": "LinearWarmup", "warmup_proportion": 0.2}
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts=warmup_opts,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Trainer that runs training and periodic evaluation
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train. Tracked metrics are visible on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Store the trained model and its processor
    save_dir = Path("../saved_models/bert-english-qa-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Reload the saved model for inference on a single example
    example_question = "Who counted the game among the best ever made?"
    example_context = "Twilight Princess was released to universal critical acclaim and commercial success. It received perfect scores from major publications such as 1UP.com, Computer and Video Games, Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of 95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. GameTrailers in their review called it one of the greatest games ever created."
    QA_input = [{"qas": [example_question], "context": example_context}]

    model = QAInferencer.load(save_dir, batch_size=40, gpu=True)
    result = model.inference_from_dicts(dicts=QA_input)[0]
    pprint.pprint(result)

    # 10. Do Inference on whole SQuAD Dataset & write the predictions file to disk
    filename = os.path.join(processor.data_dir, processor.dev_filename)
    result = model.inference_from_file(file=filename, return_json=False)
    result_squad = [prediction.to_squad_eval() for prediction in result]

    write_squad_predictions(
        predictions=result_squad,
        predictions_filename=filename,
        out_filename="predictions.json",
    )
def ner():
    """Fine-tune ``bert-base-german-cased`` for NER on CoNLL-03 German data.

    Trains an ``AdaptiveModel`` with a token classification head, saves model
    and processor to ``saved_models/bert-german-ner-tutorial``, reloads the
    result as an ``Inferencer`` and prints predictions for two sample sentences.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_ner")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 4
    batch_size = 32
    evaluate_every = 400
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # 2. DataProcessor converting raw text into a pytorch Dataset
    # See test/sample/ner/train-sample.txt for an example of the data format that is expected by the Processor
    ner_labels = [
        "[PAD]", "X", "O",
        "B-MISC", "I-MISC",
        "B-PER", "I-PER",
        "B-ORG", "I-ORG",
        "B-LOC", "I-LOC",
        "B-OTH", "I-OTH",
    ]
    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/conll03-de"),
        delimiter=" ",
        metric="seq_f1",
        label_list=ner_labels,
    )

    # 3. DataSilo loading the datasets (train/dev/test) and providing DataLoaders
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. AdaptiveModel:
    # a) a pretrained language model as the basis
    language_model = LanguageModel.load(lang_model)
    # b) a token classification head on top => NER
    prediction_head = TokenClassificationHead(num_labels=len(ner_labels))

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token"],
        device=device,
    )

    # 5. Optimizer and learning-rate schedule
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Trainer that runs training and periodic evaluation
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train
    trainer.train()

    # 8. Store the trained model and its processor
    save_dir = "saved_models/bert-german-ner-tutorial"
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Reload the saved model and run inference on sample sentences
    sample_sentences = (
        "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei",
        "Martin Müller spielt Handball in Berlin",
    )
    basic_texts = [{"text": sentence} for sentence in sample_sentences]

    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
def train_from_scratch():
    """Train a BERT language model from scratch in a single process.

    Streams ``train.txt``/``dev.txt`` from ``data/lm_finetune_nips``, trains
    an ``AdaptiveModel`` with masked-LM and next-sentence heads using 8
    gradient accumulation steps and checkpointing, and saves model and
    processor to ``saved_models/train_from_scratch``.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    tracker = MLFlowLogger(tracking_uri="")
    tracker.init_experiment(experiment_name="from_scratch", run_name="debug")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Model and optimisation
    vocab_size = 30522
    hidden_size = 768
    n_epochs = 10
    learning_rate = 1e-4
    warmup_proportion = 0.05
    batch_size = 16  # (probably only possible via gradient accumulation steps)
    grad_acc_steps = 8
    max_seq_len = 64
    evaluate_every = 5000

    # Data and output locations
    data_dir = Path("data/lm_finetune_nips")
    train_filename = "train.txt"
    dev_filename = "dev.txt"
    # dev_filename = None
    save_dir = Path("saved_models/train_from_scratch")
    checkpoint_dir = Path("saved_models/train_from_scratch/checkpoints")

    # 1. Tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # 2. DataProcessor converting raw text into a pytorch Dataset
    processor = BertStyleLMProcessor(
        data_dir=data_dir,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=None,
    )

    # 3. Streaming DataSilo providing the DataLoaders for the datasets
    stream_data_silo = StreamingDataSilo(processor=processor, batch_size=batch_size)

    # 4. AdaptiveModel:
    # a) a freshly initialised language model as the basis
    language_model = LanguageModel.from_scratch("bert", vocab_size)
    # b) *two* prediction heads on top: masked LM and next sentence
    heads = [
        BertLMHead(hidden_size, vocab_size),
        NextSentenceHead([hidden_size, 2], task_name="nextsentence"),
    ]
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=heads,
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Optimizer and learning-rate schedule
    warmup_opts = {"name": "LinearWarmup", "warmup_proportion": warmup_proportion}
    n_train_batches = len(stream_data_silo.get_data_loader("train"))
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts=warmup_opts,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=grad_acc_steps,
    )

    # 6. Trainer, created fresh or resumed from a checkpoint
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        grad_acc_steps=grad_acc_steps,
        checkpoint_root_dir=checkpoint_dir,
    )

    # 7. Train. Tracked metrics are visible on the public mlflow server: https://public-mlflow.deepset.ai
    trainer.train()

    # 8. Store the trained model and its processor
    model.save(save_dir)
    processor.save(save_dir)
def train_from_scratch():
    """Distributed variant: train a from-scratch BERT-style language model.

    Reads ``local_rank`` from ``parse_arguments()``, enables AMP level "O2",
    builds tokenizer, processor, a distributed ``StreamingDataSilo``, an
    ``AdaptiveModel`` with LM and next-sentence heads, and a distributed
    optimizer, then runs ``Trainer.train()``. Checkpointing is switched off
    (both checkpoint settings are ``None``) and nothing is saved afterwards.

    Takes no arguments and returns ``None``.
    """
    # The local rank is required for DDP.
    args = parse_arguments()
    local_rank = args.local_rank
    use_amp = "O2"

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    # Experiment tracking; note the tracking URI is an empty string here.
    tracker = MLFlowLogger(tracking_uri="")
    tracker.init_experiment(experiment_name="train_from_scratch", run_name="run")

    set_all_seeds(seed=39)
    device, n_gpu = initialize_device_settings(
        use_cuda=True,
        local_rank=local_rank,
        use_amp=use_amp,
    )

    # ---- Settings ----
    save_dir = Path("saved_models/train_from_scratch")  # not used further below
    data_dir = Path("data/lm_finetune_nips")
    train_filename = "train.txt"
    vocab_file = "bert-base-uncased-vocab.txt"
    max_seq_len = 128
    batch_size = 80
    grad_acc = 3
    n_epochs = 5
    learning_rate = 0.0001
    warmup_proportion = 0.01
    evaluate_every = 10000

    # 1. Tokenizer, built from the vocabulary file inside the data directory
    tokenizer = BertTokenizer(data_dir / vocab_file, do_lower_case=True)
    vocab_size = tokenizer.vocab_size

    # 2. Processor: train file only, no dev and no test file
    processor = BertStyleLMProcessor(
        data_dir=data_dir,
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=None,
        test_filename=None,
    )

    # 3. Streaming data silo in distributed mode
    stream_data_silo = StreamingDataSilo(
        processor=processor,
        batch_size=batch_size,
        distributed=True,
        dataloader_workers=16,
    )

    # 4. Model: language model built from scratch plus *two* prediction heads
    language_model = LanguageModel.from_scratch("bert", vocab_size)
    heads = [
        BertLMHead(768, vocab_size),
        NextSentenceHead([768, 2], task_name="nextsentence"),
    ]
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=heads,
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Optimizer and learning-rate schedule
    schedule_opts = {
        "name": "LinearWarmup",
        "warmup_proportion": warmup_proportion,
    }
    n_train_batches = len(stream_data_silo.get_data_loader("train"))
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=learning_rate,
        schedule_opts=schedule_opts,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        device=device,
        grad_acc_steps=grad_acc,
        distributed=True,
        use_amp=use_amp,
        local_rank=local_rank,
    )

    # 6. Trainer. Both checkpoint settings are passed as None.
    checkpoint_every = None
    checkpoint_root_dir = None
    trainer = Trainer.create_or_load_checkpoint(
        model=model,
        optimizer=optimizer,
        data_silo=stream_data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
        grad_acc_steps=grad_acc,
        checkpoint_every=checkpoint_every,
        checkpoint_root_dir=checkpoint_root_dir,
        use_amp=use_amp,
    )

    # 7. Train
    trainer.train()
def doc_classifcation():
    """Train a two-class document classifier on top of a locally stored embedding model.

    The language model and tokenizer are loaded from
    ``../saved_models/glove-german-uncased``; data comes from
    ``../data/germeval18`` (``train.tsv`` / ``test.tsv``, no dev split) using
    the ``coarse_label`` column. The model is trained but not saved.

    Takes no arguments and returns ``None``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification_glove",
    )

    # ---- Settings ----
    set_all_seeds(seed=42)
    n_epochs = 3
    batch_size = 32
    evaluate_every = 100
    do_lower_case = True
    # The model is read from a local path.
    lang_model = Path("../saved_models/glove-german-uncased")

    device, n_gpu = initialize_device_settings(use_cuda=True)

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # 2. Processor: only train.tsv and test.tsv are used, dev_split is 0
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        dev_split=0,
        test_filename="test.tsv",
        train_filename="train.tsv",
        metric=metric,
        label_column_name="coarse_label",
    )

    # 3. Data silo, restricted to a single process
    data_silo = DataSilo(processor=processor, batch_size=batch_size, max_processes=1)

    # 4. Model: embedding model as basis, classification head on top
    language_model = LanguageModel.load(lang_model)
    n_labels = len(label_list)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        layer_dims=[300, 600, n_labels],
        class_weights=class_weights,
        num_labels=n_labels,
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # 5. Optimizer
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
    )

    # 6. Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train
    trainer.train()
def doc_classification_cola():
    """Fine-tune ``bert-base-cased`` as a binary classifier on ``../data/cola``.

    Trains with the "mcc" metric, saves model and processor to
    ``saved_models/bert-doc-tutorial``, reloads them through ``Inferencer``
    and prints the predictions for two sample sentences.

    Takes no arguments and returns ``None``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(experiment_name="Public_FARM", run_name="Run_cola")

    # ---- Settings ----
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "bert-base-cased"
    do_lower_case = False
    n_epochs = 5
    batch_size = 100
    evaluate_every = 20

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # 2. Processor: explicit dev file, no dev split and no test file
    label_list = ["0", "1"]
    metric = "mcc"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/cola"),
        dev_filename=Path("dev.tsv"),
        dev_split=None,
        test_filename=None,
        label_list=label_list,
        metric=metric,
        label_column_name="label",
    )

    # 3. Data silo
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Model: pretrained language model plus a text-classification head
    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"
        ),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # 5. Optimizer
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
    )

    # 6. Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train
    trainer.train()

    # 8. Store model and processor side by side
    save_dir = Path("saved_models/bert-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Reload from disk and run inference on two sample sentences
    basic_texts = [
        {"text": "The box contained the ball from the tree."},
        {"text": "I'll fix you a drink."},
    ]
    inferencer = Inferencer.load(save_dir)
    print(inferencer.inference_from_dicts(dicts=basic_texts))
def doc_classification_multilabel():
    """Fine-tune ``bert-base-uncased`` as a multi-label classifier.

    Data is read from ``../data/toxic-comments`` (``train.tsv`` / ``val.tsv``)
    with six labels. After one epoch of training, model and processor are
    saved to ``../saved_models/bert-german-multi-doc-tutorial``, reloaded via
    ``Inferencer`` and applied to two sample texts whose predictions are
    printed.

    Takes no arguments and returns ``None``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification",
    )

    # ---- Settings ----
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "bert-base-uncased"
    do_lower_case = True
    n_epochs = 1
    batch_size = 32
    evaluate_every = 500

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # 2. Processor for the multi-label data in ../data/toxic-comments
    label_list = [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]
    metric = "acc"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/toxic-comments"),
        label_list=label_list,
        label_column_name="label",
        metric=metric,
        quote_char='"',
        multilabel=True,
        train_filename="train.tsv",
        dev_filename="val.tsv",
        test_filename=None,
        dev_split=0,
    )

    # 3. Data silo
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Model: pretrained language model plus a multi-label head
    language_model = LanguageModel.load(lang_model)
    prediction_head = MultiLabelTextClassificationHead(num_labels=len(label_list))
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # 5. Optimizer
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
    )

    # 6. Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train
    trainer.train()

    # 8. Store model and processor side by side
    save_dir = Path("../saved_models/bert-german-multi-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Reload from disk and run inference on two sample texts
    basic_texts = [
        {"text": "You f*****g bastards"},
        {"text": "What a lovely world"},
    ]
    inferencer = Inferencer.load(save_dir)
    print(inferencer.inference_from_dicts(dicts=basic_texts))
def main(args):
    """Train a text-classification or question-answering model described by ``args``.

    Args:
        args: Namespace-like object. ``args.task_name`` selects the pipeline
            and must be ``"text_classification"`` or ``"question_answering"``.
            Further attributes read here: ``ckpt_path``, ``run_name``,
            ``tracking_uri``, ``experiment_name``,
            ``pretrained_model_name_or_path``, ``train_filename``,
            ``test_filename``, ``max_seq_len``, ``data_dir``, ``label_list``,
            ``metric``, ``batch_size``, ``eval_batch_size``,
            ``embeds_dropout_prob``, ``learning_rate``, ``n_epochs``,
            ``evaluate_every``, ``checkpoints_to_keep``, ``checkpoint_every``,
            ``n_gpu`` and, for text classification only,
            ``label_column_name`` and ``text_column_name``.

    Raises:
        ValueError: If ``args.task_name`` is not one of the two supported
            tasks. The check runs before any MLflow run is opened or any
            model/tokenizer is loaded.
    """
    # Fail fast: reject unsupported tasks before touching MLflow, the
    # tokenizer or the file system. Every later branch can then rely on the
    # task being one of exactly two values.
    supported_tasks = ("text_classification", "question_answering")
    if args.task_name not in supported_tasks:
        raise ValueError("task name error")
    is_qa = args.task_name == "question_answering"

    print(f"[INFO] PyTorch Version: {torch.__version__}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] Devices available: {device}")

    # Processor, data cache, early-stopping state and trainer checkpoints
    # all share this directory.
    checkpoint_path = Path(args.ckpt_path) / args.run_name

    ml_logger = MLFlowLogger(tracking_uri=args.tracking_uri)
    ml_logger.init_experiment(
        experiment_name=args.experiment_name,
        run_name=args.run_name,
    )

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=args.pretrained_model_name_or_path,
        do_lower_case=False,
    )

    # Processor
    if is_qa:
        # NOTE(review): the test file is also passed as the dev file here.
        processor = SquadProcessor(
            tokenizer=tokenizer,
            train_filename=args.train_filename,
            dev_filename=args.test_filename,
            test_filename=args.test_filename,
            max_seq_len=args.max_seq_len,
            data_dir=args.data_dir,
            label_list=args.label_list,
            metric=args.metric,
            max_query_length=64,
            doc_stride=128,
            max_answers=1,
        )
    else:
        processor = TextClassificationProcessor(
            tokenizer=tokenizer,
            train_filename=args.train_filename,
            dev_filename=None,
            test_filename=args.test_filename,
            header=0,
            max_seq_len=args.max_seq_len,
            data_dir=args.data_dir,
            label_list=args.label_list,
            metric=args.metric,
            label_column_name=args.label_column_name,
            text_column_name=args.text_column_name,
        )
    processor.save(checkpoint_path)

    # DataSilo
    data_silo = DataSilo(
        processor=processor,
        batch_size=args.batch_size,
        eval_batch_size=args.eval_batch_size,
        caching=True,
        cache_path=checkpoint_path,
    )

    # LanguageModel: build the pretrained language model
    language_model = LanguageModel.load(
        args.pretrained_model_name_or_path,
        language="korean",
    )

    # Prediction head, LM output type and early-stopping metric per task
    if is_qa:
        prediction_head = QuestionAnsweringHead(
            layer_dims=[768, 2],
            task_name=args.task_name,
        )
        lm_output_types = ["per_token"]
        early_metric = "f1"
    else:
        # Class weights are computed from the data silo and passed to the head.
        prediction_head = TextClassificationHead(
            num_labels=len(args.label_list),
            class_weights=data_silo.calculate_class_weights(
                task_name=args.task_name
            ),
        )
        lm_output_types = ["per_sequence"]
        early_metric = "acc"

    # AdaptiveModel: combine language model and head
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=args.embeds_dropout_prob,
        lm_output_types=lm_output_types,
        device=device,
    )

    # Optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        device=device,
        learning_rate=args.learning_rate,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=args.n_epochs,
    )

    # Early stopping: both supported tasks maximise their metric.
    earlystop = EarlyStopping(
        save_dir=checkpoint_path,
        metric=early_metric,
        mode="max",
        patience=5,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        lr_schedule=lr_schedule,
        data_silo=data_silo,
        early_stopping=earlystop,
        evaluate_every=args.evaluate_every,
        checkpoints_to_keep=args.checkpoints_to_keep,
        checkpoint_root_dir=checkpoint_path,
        checkpoint_every=args.checkpoint_every,
        epochs=args.n_epochs,
        n_gpu=args.n_gpu,
        device=device,
    )

    # Train; the return value is intentionally not propagated so that main()
    # keeps returning None as before.
    trainer.train()
def doc_classifcation():
    """Fine-tune ``bert-base-german-cased`` as a two-class document classifier.

    Data is read from ``../data/germeval18`` using the ``coarse_label``
    column. After one epoch of training, model and processor are saved to
    ``saved_models/bert-german-doc-tutorial``, reloaded via ``Inferencer``
    and applied to two German sample sentences whose predictions are printed.

    Takes no arguments and returns ``None``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_doc_classification",
    )

    # ---- Settings ----
    set_all_seeds(seed=42)
    n_epochs = 1
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    use_amp = None  # passed to both device setup and optimizer setup

    device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp)

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    # 2. Processor
    label_list = ["OTHER", "OFFENSE"]
    metric = "f1_macro"
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label",
    )

    # 3. Data silo
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Model: pretrained language model plus a text-classification head
    language_model = LanguageModel.load(lang_model)
    class_weights = data_silo.calculate_class_weights(task_name="text_classification")
    prediction_head = TextClassificationHead(
        class_weights=class_weights,
        num_labels=len(label_list),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device,
    )

    # 5. Optimizer
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        use_amp=use_amp,
    )

    # 6. Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train
    trainer.train()

    # 8. Store model and processor side by side
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Reload from disk and run inference on two sample sentences
    basic_texts = [
        {"text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"},
        {"text": "Martin Müller spielt Handball in Berlin"},
    ]
    inferencer = Inferencer.load(save_dir)
    print(inferencer.inference_from_dicts(dicts=basic_texts))
from farm.modeling.adaptive_model import AdaptiveModel from farm.modeling.language_model import LanguageModel from farm.modeling.optimization import initialize_optimizer from farm.modeling.prediction_head import QuestionAnsweringHead from farm.modeling.tokenization import Tokenizer from farm.train import Trainer from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering") ########################## ########## Settings ########################## set_all_seeds(seed=42) device, n_gpu = initialize_device_settings(use_cuda=True) batch_size = 24 n_epochs = 2 evaluate_every = 500 base_LM_model = "bert-base-cased" train_filename="train-v2.0.json" dev_filename="dev-v2.0.json" # 1.Create a tokenizer tokenizer = Tokenizer.load(
def question_answering():
    """Fine-tune ``deepset/roberta-base-squad2`` on Natural Questions data and run a demo query.

    Steps: extend the tokenizer with HTML tag tokens, build a
    ``NaturalQuestionsProcessor`` and a cached ``DataSilo``, train an
    ``AdaptiveModel`` with a QA head and a classification head, and save it to
    ``../saved_models/roberta-base-squad2-nq``. Afterwards a model archive is
    fetched over HTTP into ``../saved_models/farm``, loaded with
    ``QAInferencer`` and queried with one sample question; question and
    predicted answer are printed.

    Takes no arguments and returns ``None``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_natural_questions",
    )

    # ---- Settings ----
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "deepset/roberta-base-squad2"  # a model that can already extract answers
    do_lower_case = False  # roberta is a cased model
    train_filename = "train_medium.jsonl"
    dev_filename = "dev_medium.jsonl"
    batch_size = 24
    n_epochs = 1
    evaluate_every = 500
    keep_is_impossible = 0.15  # forwarded to the processor as keep_no_answer
    downsample_context_size = 300  # forwarded to the processor unchanged

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=do_lower_case,
    )

    # Register HTML tag tokens with the tokenizer so they do not get split
    # apart: an opening and a closing form per tag, plus "<Td_colspan=".
    tag_names = ("Th", "Td", "Tr", "Li", "P", "Ul", "H1", "H2", "H3", "H4", "H5")
    html_tags = []
    for tag in tag_names:
        html_tags.append(f"<{tag}>")
        html_tags.append(f"</{tag}>")
    html_tags.append("<Td_colspan=")
    tokenizer.add_tokens(html_tags)

    # 2. Processor
    processor = NaturalQuestionsProcessor(
        tokenizer=tokenizer,
        max_seq_len=384,
        train_filename=train_filename,
        dev_filename=dev_filename,
        keep_no_answer=keep_is_impossible,
        downsample_context_size=downsample_context_size,
        data_dir=Path("../data/natural_questions"),
    )

    # 3. Data silo with caching enabled
    data_silo = DataSilo(processor=processor, batch_size=batch_size, caching=True)

    # 4. Model: the language model is told how many tokens were added; two
    #    heads sit on top - one for answer spans, one classifying over
    #    processor.answer_type_list.
    language_model = LanguageModel.load(lang_model, n_added_tokens=len(html_tags))
    qa_head = QuestionAnsweringHead()
    classification_head = TextClassificationHead(
        num_labels=len(processor.answer_type_list)
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[qa_head, classification_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_token", "per_sequence"],
        device=device,
    )

    # 5. Optimizer with linear warmup
    schedule_opts = {"name": "LinearWarmup", "warmup_proportion": 0.2}
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=3e-5,
        schedule_opts=schedule_opts,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
        device=device,
    )

    # 6. Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train
    trainer.train()

    # 8. Store model and processor side by side
    save_dir = Path("../saved_models/roberta-base-squad2-nq")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Inference uses a separately downloaded model archive rather than the
    #    model that was just saved.
    fetch_archive_from_http(
        "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/models/roberta-base-squad2-nq.zip",
        output_dir="../saved_models/farm",
    )

    question = "Did GameTrailers rated Twilight Princess as one of the best games ever created?"
    context = (
        "Twilight Princess was released to universal critical acclaim and commercial success. "
        "It received perfect scores from major publications such as 1UP.com, Computer and Video Games, "
        "Electronic Gaming Monthly, Game Informer, GamesRadar, and GameSpy. "
        "On the review aggregators GameRankings and Metacritic, Twilight Princess has average scores of "
        "95% and 95 for the Wii version and scores of 95% and 96 for the GameCube version. "
        "GameTrailers in their review called it one of the greatest games ever created."
    )
    QA_input = [{"qas": [question], "context": context}]

    inferencer = QAInferencer.load(
        model_name_or_path="../saved_models/farm/roberta-base-squad2-nq",
        batch_size=batch_size,
        gpu=True,
    )
    # With return_json=False the result is a list of QAPred objects.
    result = inferencer.inference_from_dicts(dicts=QA_input, return_json=False)
    answer = result[0].prediction[0].answer
    print(f"\nQuestion: {question}\nAnswer from model: {answer}")
    inferencer.close_multiprocessing_pool()
def text_pair_classification():
    """Fine-tune ``bert-base-cased`` as a binary text-pair classifier.

    Data is read from ``../data/asnq_binary`` (tab-delimited, ``dev.tsv`` as
    dev set, no test set). After training, model and processor are saved to
    ``saved_models/text_pair_classification_model``, reloaded via
    ``Inferencer`` and applied to two sample text pairs whose predictions are
    printed.

    Takes no arguments and returns ``None``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
    )

    tracker = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    tracker.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_text_pair_classification",
    )

    # ---- Settings ----
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500

    # 1. Tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    # 2. Processor. The sample inputs further down use the keys "text" and
    #    "text_b"; presumably the data files carry matching columns plus a
    #    "label" column - verify against the processor.
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        max_seq_len=128,
        dev_filename="dev.tsv",
        test_filename=None,
        data_dir=Path("../data/asnq_binary"),
        delimiter="\t",
    )

    # 3. Data silo
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Model: pretrained language model plus a classification head
    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"
        ),
    )
    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device,
    )

    # 5. Optimizer
    n_train_batches = len(data_silo.loaders["train"])
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=5e-6,
        device=device,
        n_batches=n_train_batches,
        n_epochs=n_epochs,
    )

    # 6. Trainer
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train
    trainer.train()

    # 8. Store model and processor side by side
    save_dir = Path("saved_models/text_pair_classification_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Reload from disk and run inference on two sample text pairs
    basic_texts = [
        {
            "text": "how many times have real madrid won the champions league in a row",
            "text_b": "They have also won the competition the most times in a row, winning it five times from 1956 to 1960",
        },
        {
            "text": "how many seasons of the blacklist are there on netflix",
            "text_b": "Retrieved March 27 , 2018 .",
        },
    ]
    inferencer = Inferencer.load(save_dir)
    print(inferencer.inference_from_dicts(dicts=basic_texts))
from farm.modeling.optimization import initialize_optimizer from farm.infer import Inferencer from farm.modeling.adaptive_model import AdaptiveModel from farm.modeling.language_model import LanguageModel from farm.modeling.prediction_head import TextClassificationHead from farm.modeling.tokenization import Tokenizer from farm.train import Trainer from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO) ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/") ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_doc_classification") ########################## ########## Settings ########################## set_all_seeds(seed=42) n_epochs = 1 batch_size = 32 evaluate_every = 100 lang_model = "bert-base-german-cased" use_amp = None device, n_gpu = initialize_device_settings(use_cuda=True, use_amp=use_amp) # 1.Create a tokenizer tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import QuestionAnsweringHead
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import (
    MLFlowLogger,
    initialize_device_settings,
    set_all_seeds,
)

# Root logger: INFO level with timestamp, level, logger name and message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
)

# Track this run as "qa_albert" inside the "SQuAD" experiment.
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="SQuAD", run_name="qa_albert")

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
set_all_seeds(seed=42)
# CUDA is explicitly switched off for this script.
device, n_gpu = initialize_device_settings(use_cuda=False)

# Training schedule
n_epochs = 2
batch_size = 32
evaluate_every = 1

# Model and data files
base_LM_model = "albert"
train_filename = "train-v2.0.json"
dev_filename = "dev-v2.0.json"

# Output and inference locations; predictions are written next to the model.
save_dir = "../saved_models/qa_medium_albert"
inference_file = "../data/squad20/subsets/5ad3ff1b604f3c001a3ffc74.json"
predictions_file = f"{save_dir}/predictions.json"
from farm.infer import Inferencer
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import Bert
from farm.modeling.prediction_head import TokenClassificationHead
from farm.modeling.tokenization import BertTokenizer
from farm.train import Trainer
from farm.utils import (
    MLFlowLogger,
    initialize_device_settings,
    set_all_seeds,
)

# Root logger: INFO level with timestamp, level, logger name and message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
)

# Track this run as "Run_minimal_example_ner" inside "Public_FARM".
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(
    experiment_name="Public_FARM",
    run_name="Run_minimal_example_ner",
)

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)

lang_model = "bert-base-german-cased"
n_epochs = 4
batch_size = 32
evaluate_every = 50

# 1. Tokenizer for the chosen language model, with lower-casing switched off
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False,
)
def text_pair_classification():
    """Train, store and evaluate a text-pair classifier on MSMARCO passage data.

    The function sets up logging and MLflow tracking, optionally converts the
    raw MSMARCO files into tsv files, trains an ``AdaptiveModel`` with a text
    classification head, saves model and processor, and finally runs the
    stored model over the dev file and hands the predictions to the MSMARCO
    evaluation helper.
    """
    logging.basicConfig(
        level=logging.INFO,
        datefmt="%m/%d/%Y %H:%M:%S",
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    )

    mlflow_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    mlflow_logger.init_experiment(
        experiment_name="Public_FARM",
        run_name="Run_text_pair_classification",
    )

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 2
    batch_size = 64
    evaluate_every = 500
    lang_model = "bert-base-cased"
    label_list = ["0", "1"]
    train_filename = "train.tsv"
    dev_filename = "dev_200k.tsv"

    # The source data can be found here https://github.com/microsoft/MSMARCO-Passage-Ranking
    generate_data = False
    data_dir = Path("../data/msmarco_passage")
    dev_path = data_dir / dev_filename
    qrels_path = data_dir / "qrels.dev.tsv"

    # 0. Convert the MSMARCO files into a format FARM can ingest: a tsv file
    #    with 3 columns (text, text_b and label). Only needs to be done once.
    if generate_data:
        reformat_msmarco_train(
            data_dir / "triples.train.1m.tsv",
            data_dir / train_filename,
        )
        reformat_msmarco_dev(
            data_dir / "queries.dev.tsv",
            data_dir / "collection.tsv",
            qrels_path,
            data_dir / "top1000.dev",
            dev_path,
        )

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False,
    )

    # 2. Create a DataProcessor that converts raw text into a pytorch Dataset.
    #    Evaluation during training uses a small slice of the train set
    #    (dev_split); the msmarco dev set serves as the final evaluation set.
    processor = TextPairClassificationProcessor(
        tokenizer=tokenizer,
        label_list=label_list,
        metric="f1_macro",
        train_filename=train_filename,
        test_filename=None,
        dev_split=0.001,
        max_seq_len=128,
        data_dir=data_dir,
        delimiter="\t",
    )

    # 3. Create a DataSilo that loads the datasets and provides DataLoaders
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) a pretrained language model as the basis
    language_model = LanguageModel.load(lang_model)
    # b) a prediction head on top, weighted by the class weights of the data
    class_weights = data_silo.calculate_class_weights(
        task_name="text_classification"
    )
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=class_weights,
    )

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence_continuous"],
        device=device,
    )

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
    )

    # 6. Feed everything to the Trainer, which runs training and evaluates
    #    the model from time to time
    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    # 7. Train
    trainer.train()

    # 8. Store the trained model together with its processor
    save_dir = Path("saved_models/passage_ranking_model")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load the stored model for inference and score the dev file
    raw_predictions_path = save_dir / "predictions_raw.txt"
    inferencer = Inferencer.load(save_dir, gpu=True, max_seq_len=128, batch_size=128)
    result = inferencer.inference_from_file(dev_path)

    write_msmarco_results(result, raw_predictions_path)

    msmarco_evaluation(
        preds_file=raw_predictions_path,
        dev_file=dev_path,
        qrels_file=qrels_path,
        output_file=save_dir / "predictions.txt",
    )

    inferencer.close_multiprocessing_pool()
def doc_classification_with_earlystopping():
    """Train a German document classifier with early stopping and run inference.

    Sets up logging and MLflow tracking, registers a custom metric function,
    trains an ``AdaptiveModel`` with a text classification head on the
    GermEval 2018 coarse labels, and uses ``EarlyStopping`` on the custom
    ``f1_offense`` metric. Afterwards both the model from the end of training
    and the best model stored by early stopping are loaded and applied to two
    sample texts; the results are printed.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    # for local logging instead:
    # ml_logger = MLFlowLogger(tracking_uri="logs")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")

    ##########################
    ########## Settings
    ##########################
    set_all_seeds(seed=42)
    use_amp = None
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 20
    batch_size = 32
    evaluate_every = 100
    lang_model = "bert-base-german-cased"
    do_lower_case = False

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model,
                               do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    #    Here we load GermEval 2018 Data automatically if it is not available.
    #    GermEval 2018 only has train.tsv and test.tsv dataset - no dev.tsv

    # The processor wants to know the possible labels ...
    label_list = ["OTHER", "OFFENSE"]

    # The evaluation on the dev-set can be done with one of the predefined metrics or with a
    # metric defined as a function from (preds, labels) to a dict that contains all the actual
    # metrics values. The function must get registered under a string name and the string name must
    # be used.
    def mymetrics(preds, labels):
        """Return accuracy, per-class F1 and macro/micro averaged F1 as a dict."""
        acc = simple_accuracy(preds, labels)
        f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
        f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
        f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
        # Micro averaging must be requested explicitly; using "macro" here
        # would silently report the macro score under the "f1_micro" key.
        f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
        return {
            "acc": acc,
            "f1_other": f1other,
            "f1_offense": f1offense,
            "f1_macro": f1macro,
            "f1_micro": f1micro
        }

    register_metrics('mymetrics', mymetrics)
    metric = 'mymetrics'

    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=64,
        data_dir=Path("../data/germeval18"),
        label_list=label_list,
        metric=metric,
        label_column_name="coarse_label")

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for
    #    them and calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor, batch_size=batch_size)

    # 4. Create an AdaptiveModel
    # a) which consists of a pretrained language model as a basis
    language_model = LanguageModel.load(lang_model)
    # b) and a prediction head on top that is suited for our task => Text classification
    prediction_head = TextClassificationHead(
        num_labels=len(label_list),
        class_weights=data_silo.calculate_class_weights(
            task_name="text_classification"))

    model = AdaptiveModel(language_model=language_model,
                          prediction_heads=[prediction_head],
                          embeds_dropout_prob=0.2,
                          lm_output_types=["per_sequence"],
                          device=device)

    # 5. Create an optimizer
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=0.5e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        use_amp=use_amp)

    # 6. Feed everything to the Trainer, which takes care of training the model and evaluates it
    #    from time to time.
    # Also create an EarlyStopping instance and pass it on to the trainer.
    # An early stopping instance can be used to save the model that performs best on the dev set
    # according to some metric and stop training when no improvement is happening for some iterations.
    earlystopping = EarlyStopping(
        metric="f1_offense", mode="max",  # use the metric from our own metrics function instead of loss
        # metric="f1_macro", mode="max",  # use f1_macro from the dev evaluator of the trainer
        # metric="loss", mode="min",  # use loss from the dev evaluator of the trainer
        save_dir=Path("saved_models/bert-german-doc-tutorial-es"),  # where to save the best model
        patience=5  # number of evaluations to wait for improvement before terminating the training
    )

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=n_epochs,
                      n_gpu=n_gpu,
                      lr_schedule=lr_schedule,
                      evaluate_every=evaluate_every,
                      device=device,
                      early_stopping=earlystopping)

    # 7. Let it grow
    trainer.train()

    # 8. Hooray! You have a model.
    # NOTE: if early stopping is used, the best model has been stored already in the directory
    # defined with the EarlyStopping instance
    # The model we have at this moment is the model from the last training epoch that was carried
    # out before early stopping terminated the training
    save_dir = Path("saved_models/bert-german-doc-tutorial")
    model.save(save_dir)
    processor.save(save_dir)

    # 9. Load it & harvest your fruits (Inference)
    basic_texts = [
        {
            "text": "Schartau sagte dem Tagesspiegel, dass Fischer ein Idiot sei"
        },
        {
            "text": "Martin Müller spielt Handball in Berlin"
        },
    ]

    # Load from the final epoch directory and apply
    print("LOADING INFERENCER FROM FINAL MODEL DURING TRAINING")
    model = Inferencer.load(save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print(result)
    model.close_multiprocessing_pool()

    # Load from saved best model
    print("LOADING INFERENCER FROM BEST MODEL DURING TRAINING")
    model = Inferencer.load(earlystopping.save_dir)
    result = model.inference_from_dicts(dicts=basic_texts)
    print("APPLICATION ON BEST MODEL")
    print(result)
    model.close_multiprocessing_pool()
def xlmr_qa_demo():
    """Fine-tune XLM-RoBERTa large for question answering and optionally predict.

    Sets up logging and MLflow tracking, then runs up to two phases controlled
    by the local ``train`` and ``inference`` flags (as written: training on,
    inference off):

    * train: build tokenizer, ``SquadProcessor``, ``DataSilo`` and an
      ``AdaptiveModel`` with a ``QuestionAnsweringHead``, train it and store
      model and processor in ``save_dir``.
    * inference: load the stored model, run it over ``inference_file``, print
      every result and write two JSON files (top prediction per id and all
      predictions per id) into ``save_dir``.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name="Public_FARM", run_name="run_xmlr_qa")

    #########################
    ######## Settings
    ########################
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 3
    grad_acc_steps = 8
    n_epochs = 2
    evaluate_every = 200
    base_LM_model = "xlm-roberta-large"

    data_dir = Path("../data/squad20")
    train_filename = Path("train-v2.0.json")
    dev_filename = Path("dev-v2.0.json")

    save_dir = Path("../saved_models/xlmr-large-qa")

    inference_file = Path("../data/MLQA_V1/dev/dev-context-de-question-de.json")
    predictions_file = save_dir / "predictions.json"
    full_predictions_file = save_dir / "full_predictions.json"
    max_processes_for_inference = 8
    train = True
    inference = False

    if train:
        # 1. Create a tokenizer
        tokenizer = Tokenizer.load(pretrained_model_name_or_path=base_LM_model)

        # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
        label_list = ["start_token", "end_token"]
        metric = "squad"
        processor = SquadProcessor(
            tokenizer=tokenizer,
            max_seq_len=384,
            label_list=label_list,
            metric=metric,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=None,
            data_dir=data_dir,
            dev_split=0.0
        )

        # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders
        #    for them and calculates a few descriptive statistics of our datasets
        data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False, max_processes=1)

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(base_LM_model, n_added_tokens=3)
        # b) and a prediction head on top that is suited for our task => Question Answering
        prediction_head = QuestionAnsweringHead()

        model = AdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=0.1,
            lm_output_types=["per_token"],
            device=device,
        )

        # 5. Create an optimizer
        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=3e-5,
            schedule_opts={"name": "LinearWarmup", "warmup_proportion": 0.2},
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=n_epochs,
            grad_acc_steps=grad_acc_steps,
            device=device
        )

        # 6. Feed everything to the Trainer, which takes care of training the model and evaluates
        #    it from time to time
        trainer = Trainer(
            optimizer=optimizer,
            data_silo=data_silo,
            epochs=n_epochs,
            n_gpu=n_gpu,
            lr_schedule=lr_schedule,
            evaluate_every=evaluate_every,
            device=device,
        )

        # 7. Let it grow! Watch the tracked metrics live on the public mlflow server:
        #    https://public-mlflow.deepset.ai
        model = trainer.train(model)

        # 8. Hooray! You have a model. Store it:
        model.save(save_dir)
        processor.save(save_dir)

    if inference:
        model = Inferencer.load(save_dir, batch_size=32, gpu=True)
        full_result = model.inference_from_file(
            file=inference_file,
            max_processes=max_processes_for_inference,
        )

        for x in full_result:
            print(x)
            print()

        # Top prediction per id, and the complete prediction list per id.
        result = {r["id"]: r["preds"][0][0] for r in full_result}
        full_result = {r["id"]: r["preds"] for r in full_result}

        # Context managers guarantee the files are flushed and closed.
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # is pinned to UTF-8 instead of relying on the platform default.
        with open(predictions_file, "w", encoding="utf-8") as predictions_out:
            json.dump(result, predictions_out, indent=4, ensure_ascii=False)
        with open(full_predictions_file, "w", encoding="utf-8") as full_predictions_out:
            json.dump(full_result, full_predictions_out, indent=4, ensure_ascii=False)
def doc_classification(
        task_config,
        model_name_or_path,
        cache_dir,
        data_dir,
        save_dir,
        model_dir,
        run_name="0",
        lr=1e-05,
        warmup_steps=5000,
        balance_classes=True,
        embeds_dropout=0.1,
        epochs=200,  # large because we use early stopping by default
        batch_size=20,
        grad_acc_steps=1,
        early_stopping_metric="roc_auc",
        early_stopping_mode="max",
        early_stopping_patience=10,
        model_class="Bert",
        tokenizer_class="BertTokenizer",
        do_lower_case=False,
        do_train=True,
        do_eval=True,
        do_hpo=False,
        print_preds=False,
        print_dev_preds=False,
        max_seq_len=512,
        seed=11,
        eval_every=500,
        use_amp=False,
        use_cuda=True,
):
    """Train and/or evaluate a document classifier described by a YAML task config.

    Args:
        task_config: Path of the YAML task configuration file. The keys read
            here are ``data`` (``label_list``, ``train_filename``,
            ``dev_filename``, optional ``dev_split``, ``test_filename``,
            ``parsing``), ``metric``, ``multilabel``, ``task_type``,
            ``output_type``, ``log_dir`` and ``experiment_name``.
        model_name_or_path: Model identifier or path used to load the
            tokenizer and language model; when ``do_train`` is false it is
            also the directory the evaluated model is loaded from.
        cache_dir: Directory handed to the ``DataSilo`` as cache path.
        data_dir: Data directory passed to the processor. When the configured
            label list is a file name, the file is read from
            ``data_dir / "labels"``.
        save_dir: Not used by this function; kept so existing callers keep
            working.
        model_dir: Directory for early-stopping checkpoints, the final model
            (``<model_dir>/final_model``) and the evaluation result files.
        run_name: Suffix appended to the experiment name for the MLflow run.
        lr: Learning rate.
        warmup_steps: ``num_warmup_steps`` of the ``LinearWarmup`` schedule.
        balance_classes: If true, class weights computed by the data silo are
            passed to the prediction head.
        embeds_dropout: Embedding dropout probability of the model.
        epochs: Number of training epochs.
        batch_size: Batch size of the data silo.
        grad_acc_steps: Gradient accumulation steps.
        early_stopping_metric: Metric watched by early stopping.
        early_stopping_mode: Early stopping mode; the string ``"none"``
            disables early stopping.
        early_stopping_patience: Early stopping patience.
        model_class: Language model class name.
        tokenizer_class: Tokenizer class name.
        do_lower_case: Forwarded to the tokenizer.
        do_train: Run the training phase.
        do_eval: Run the test-set evaluation phase.
        do_hpo: If true, a callback reporting scores via ``tune.report`` is
            passed to the trainer.
        print_preds: Save test-set predictions after evaluation.
        print_dev_preds: Additionally evaluate on the dev set and save those
            predictions.
        max_seq_len: Maximum sequence length of the processor.
        seed: Random seed.
        eval_every: Evaluation interval passed to the trainer.
        use_amp: Forwarded to device initialisation and the optimizer.
        use_cuda: Forwarded to device initialisation.
    """
    # Load task config; the context manager closes the file handle again.
    with open(task_config) as config_file:
        task_config = yaml.safe_load(config_file)

    # Create label list from args list or (for large label lists) create from file by splitting by space
    label_spec = task_config["data"]["label_list"]
    if isinstance(label_spec, list):
        label_list = label_spec
    else:
        with open(Path(data_dir) / 'labels' / label_spec) as code_file:
            label_list = code_file.read().split(" ")

    # Register Outcome Metrics
    register_task_metrics(label_list)

    # General Settings
    set_all_seeds(seed=seed)
    device, n_gpu = initialize_device_settings(use_cuda=use_cuda, use_amp=use_amp)

    # 1. Create a tokenizer
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=model_name_or_path,
        tokenizer_class=tokenizer_class,
        do_lower_case=do_lower_case)

    # 2. Create a DataProcessor that handles all the conversion from raw text into a pytorch Dataset
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        data_dir=data_dir,
        label_list=label_list,
        metric=task_config["metric"],
        multilabel=task_config["multilabel"],
        train_filename=task_config["data"]["train_filename"],
        dev_filename=task_config["data"]["dev_filename"],
        dev_split=task_config["data"]["dev_split"] if "dev_split" in task_config["data"] else None,
        test_filename=task_config["data"]["test_filename"],
        delimiter=task_config["data"]["parsing"]["delimiter"],
        quote_char=task_config["data"]["parsing"]["quote_char"],
        label_column_name=task_config["data"]["parsing"]["label_column"])

    # 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and
    #    calculates a few descriptive statistics of our datasets
    data_silo = DataSilo(processor=processor,
                         caching=True,
                         cache_path=Path(cache_dir),
                         batch_size=batch_size)

    if do_train:
        # Setup MLFlow logger
        ml_logger = MLFlowLogger(tracking_uri=task_config["log_dir"])
        ml_logger.init_experiment(
            experiment_name=task_config["experiment_name"],
            run_name=f'{task_config["experiment_name"]}_{run_name}')

        # 4. Create an AdaptiveModel
        # a) which consists of a pretrained language model as a basis
        language_model = LanguageModel.load(model_name_or_path,
                                            language_model_class=model_class)

        # b) and a prediction head on top that is suited for our task
        # Define class weights
        if balance_classes:
            class_weights = data_silo.calculate_class_weights(
                task_name=task_config["task_type"])
        else:
            class_weights = None

        # Create Multi- or Single-Label Classification Heads
        if task_config["multilabel"]:
            prediction_head = MultiLabelTextClassificationHead(
                class_weights=class_weights,
                num_labels=len(label_list))
        else:
            prediction_head = ExtendedTextClassificationHead(
                class_weights=class_weights,
                num_labels=len(label_list))

        model = ExtendedAdaptiveModel(
            language_model=language_model,
            prediction_heads=[prediction_head],
            embeds_dropout_prob=embeds_dropout,
            lm_output_types=[task_config["output_type"]],
            device=device)

        # 5. Create an optimizer
        schedule_opts = {
            "name": "LinearWarmup",
            "num_warmup_steps": warmup_steps
        }

        model, optimizer, lr_schedule = initialize_optimizer(
            model=model,
            learning_rate=lr,
            device=device,
            n_batches=len(data_silo.loaders["train"]),
            n_epochs=epochs,
            use_amp=use_amp,
            grad_acc_steps=grad_acc_steps,
            schedule_opts=schedule_opts)

        # 6. Create an early stopping instance
        early_stopping = None
        if early_stopping_mode != "none":
            early_stopping = EarlyStopping(mode=early_stopping_mode,
                                           min_delta=0.0001,
                                           save_dir=model_dir,
                                           metric=early_stopping_metric,
                                           patience=early_stopping_patience)

        # 7. Feed everything to the Trainer, which takes care of training the model and evaluates it
        #    from time to time
        trainer = ExtendedTrainer(model=model,
                                  optimizer=optimizer,
                                  data_silo=data_silo,
                                  epochs=epochs,
                                  n_gpu=n_gpu,
                                  lr_schedule=lr_schedule,
                                  evaluate_every=eval_every,
                                  early_stopping=early_stopping,
                                  device=device,
                                  grad_acc_steps=grad_acc_steps,
                                  evaluator_test=do_eval)

        def score_callback(eval_score, train_loss):
            """Report the dev score and training loss via ``tune.report``."""
            tune.report(roc_auc_dev=eval_score, train_loss=train_loss)

        # 8. Train the model
        trainer.train(score_callback=score_callback if do_hpo else None)

        # 9. Save model if not saved in early stopping.
        # An f-string works for str and Path values of model_dir alike,
        # whereas `model_dir + "/..."` raises TypeError for a Path.
        final_model_dir = f"{model_dir}/final_model"
        model.save(final_model_dir)
        processor.save(final_model_dir)

    if do_eval:
        # Evaluate the newly trained model, or an existing model when no training was run
        if not do_train:
            model_dir = Path(model_name_or_path)

        logger.info("###### Eval on TEST SET #####")

        evaluator_test = ExtendedEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=device)

        # Load trained model for evaluation
        model = ExtendedAdaptiveModel.load(model_dir, device)
        model.connect_heads_with_processor(data_silo.processor.tasks,
                                           require_labels=True)

        # Evaluate
        results = evaluator_test.eval(model, return_preds_and_labels=True)

        # Log results. model_dir may be a Path here (do_train=False), so the
        # target path is formatted instead of concatenated with `+`.
        utils.log_results(results,
                          dataset_name="test",
                          steps=len(evaluator_test.data_loader),
                          save_path=f"{model_dir}/eval_results.txt")

        if print_preds:
            # Print model test predictions
            utils.save_predictions(results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"])

        if print_dev_preds:
            # Evaluate on dev set, e.g. for threshold tuning
            evaluator_dev = Evaluator(
                data_loader=data_silo.get_data_loader("dev"),
                tasks=data_silo.processor.tasks,
                device=device)
            dev_results = evaluator_dev.eval(model, return_preds_and_labels=True)
            utils.log_results(dev_results,
                              dataset_name="dev",
                              steps=len(evaluator_dev.data_loader),
                              save_path=f"{model_dir}/eval_dev_results.txt")

            # Print model dev predictions
            utils.save_predictions(dev_results,
                                   save_dir=model_dir,
                                   multilabel=task_config["multilabel"],
                                   dataset_name="dev")
def run_experiment(args):
    """Run one configured experiment: prepare data, train, and store the result.

    ``args`` is the experiment configuration (a DotMap, per the note at the
    processor call). The function logs a banner, initialises the MLflow run,
    validates the arguments, rescales ``args.parameter.batch_size`` in place,
    builds processor, data silo and model, trains, and saves processor and
    model below ``args.general.output_dir``.

    Raises:
        NotImplementedError: if class balancing is requested while the
            processor defines more than one task.
    """
    banner = (
        "\n***********************************************"
        f"\n************* Experiment: {args.task.name} ************"
        "\n************************************************"
    )
    logger.info(banner)

    ml_logger = MlLogger(tracking_uri=args.logging.mlflow_url)
    ml_logger.init_experiment(
        experiment_name=args.logging.mlflow_experiment,
        run_name=args.logging.mlflow_run_name,
        nested=args.logging.mlflow_nested,
    )

    validate_args(args)
    distributed = bool(args.general.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(
        use_cuda=args.general.cuda,
        local_rank=args.general.local_rank,
        fp16=args.general.fp16,
    )

    # Divide the configured batch size by the accumulation steps, then scale
    # it up by the GPU count when more than one GPU is used.
    scaled_batch_size = int(
        args.parameter.batch_size // args.parameter.gradient_accumulation_steps
    )
    if n_gpu > 1:
        scaled_batch_size = scaled_batch_size * n_gpu
    args.parameter.batch_size = scaled_batch_size

    set_all_seeds(args.general.seed)

    # Prepare Data
    tokenizer = Tokenizer.load(
        args.parameter.model,
        do_lower_case=args.parameter.lower_case,
    )

    # args is of type DotMap and needs conversion to std python dicts
    task_kwargs = args.task.toDict()
    processor = Processor.load(
        tokenizer=tokenizer,
        max_seq_len=args.parameter.max_seq_len,
        data_dir=args.general.data_dir,
        **task_kwargs,
    )

    data_silo = DataSilo(
        processor=processor,
        batch_size=args.parameter.batch_size,
        distributed=distributed,
    )

    class_weights = None
    if args.parameter.balance_classes:
        task_names = list(processor.tasks.keys())
        if len(task_names) > 1:
            raise NotImplementedError(
                f"Balancing classes is currently not supported for multitask experiments. Got tasks: {task_names} "
            )
        only_task = task_names[0]
        class_weights = data_silo.calculate_class_weights(task_name=only_task)

    model = get_adaptive_model(
        lm_output_type=args.parameter.lm_output_type,
        prediction_heads=args.parameter.prediction_head,
        layer_dims=args.parameter.layer_dims,
        model=args.parameter.model,
        device=device,
        class_weights=class_weights,
        embeds_dropout_prob=args.parameter.embeds_dropout_prob,
    )

    # Init optimizer
    # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this?
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=args.parameter.learning_rate,
        warmup_proportion=args.parameter.warmup_proportion,
        loss_scale=args.general.loss_scale,
        fp16=args.general.fp16,
        n_batches=len(data_silo.loaders["train"]),
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        n_epochs=args.parameter.epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.parameter.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        fp16=args.general.fp16,
        local_rank=args.general.local_rank,
        warmup_linear=warmup_linear,
        evaluate_every=args.logging.eval_every,
        device=device,
    )

    model = trainer.train(model)

    # Processor and model go into the same directory, named after the
    # language model, its language and the task.
    lm = model.language_model
    model_name = f"{lm.name}-{lm.language}-{args.task.name}"
    output_path = f"{args.general.output_dir}/{model_name}"
    processor.save(output_path)
    model.save(output_path)