Example #1
def train(no_cache: bool, dataset_path: str, data_config_name: str,
          training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):

    print(f"tokenizer vocab size: {tokenizer.vocab_size}")

    print(f"\nLoading datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/biolang',
        data_config_name,
        data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE)

    if data_config_name != "MLM":
        # project-specific collator that applies targeted masking driven by pos_mask
        # (hence remove_unused_columns = False below)
        data_collator = DataCollatorForTargetedMasking(
            tokenizer=tokenizer, max_length=config.max_length)
    else:
        # standard random-token masked language modeling
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=True)

    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")

    if config.from_pretrained:
        model = RobertaForMaskedLM.from_pretrained(config.from_pretrained)
    else:
        model_config = RobertaConfig(
            vocab_size=config.vocab_size,
            max_position_embeddings=config.max_length + 2,  # max_length + 2 for start/end tokens
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
        model = RobertaForMaskedLM(config=model_config)

    training_args.remove_unused_columns = False  # we need pos_mask and special_tokens_mask in collator

    print("\nTraining arguments:")
    print(training_args)

    # MyTrainer is presumably a project-specific subclass of the Hugging Face Trainer
    trainer = MyTrainer(model=model,
                        args=training_args,
                        data_collator=data_collator,
                        train_dataset=train_dataset,
                        eval_dataset=eval_dataset,
                        compute_metrics=compute_metrics,
                        callbacks=[ShowExample(tokenizer)])

    print(f"CUDA available: {torch.cuda.is_available()}")
    trainer.train()
    trainer.save_model(training_args.output_dir)

    print(f"Testing on {len(test_dataset)}.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")
Example #2
def train(no_cache: bool, dataset_path: str, data_config_name: str,
          training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):
    print(f"tokenizer vocab size: {tokenizer.vocab_size}")

    print(f"\nLoading and tokenizing datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/sd-nlp',
        # './tokcl/loader.py',
        data_config_name,
        script_version="main",
        # data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE)
    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")

    if data_config_name in ["NER", "ROLES"]:
        # use our fancy data collator that randomly masks some of the inputs to enforce context learning
        training_args.remove_unused_columns = False  # we need tag_mask
        data_collator = DataCollatorForMaskedTokenClassification(
            tokenizer=tokenizer,
            max_length=config.max_length,
            masking_probability=training_args.masking_probability,
            replacement_probability=training_args.replacement_probability,
            select_labels=training_args.select_labels)
    else:
        # normal token classification
        data_collator = DataCollatorForTokenClassification(
            tokenizer=tokenizer, max_length=config.max_length)

    num_labels = train_dataset.info.features['labels'].feature.num_classes
    label_list = train_dataset.info.features['labels'].feature.names
    print(f"\nTraining on {num_labels} features:")
    print(", ".join(label_list))

    compute_metrics = MetricsComputer(label_list=label_list)

    model = RobertaForTokenClassification.from_pretrained(
        LM_MODEL_PATH,
        num_labels=num_labels,
        max_position_embeddings=config.max_length + 2)

    print("\nTraining arguments:")
    print(training_args)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      compute_metrics=compute_metrics,
                      callbacks=[ShowExample(tokenizer)])

    print(f"CUDA available: {torch.cuda.is_available()}")

    trainer.train()
    trainer.save_model(training_args.output_dir)

    print(f"Testing on {len(test_dataset)}.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")