Example #1
    def test_load_real_metric(self, metric_name):
        with tempfile.TemporaryDirectory() as temp_data_dir:
            download_config = DownloadConfig()
            download_config.download_mode = GenerateMode.FORCE_REDOWNLOAD
            load_metric(metric_name,
                        data_dir=temp_data_dir,
                        download_config=download_config)
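Example #8 below achieves a similar forced refresh by setting the `force_download` flag on the `DownloadConfig` directly; a minimal sketch:

```python
# Assumed alternative to GenerateMode.FORCE_REDOWNLOAD: force the raw
# files to be fetched again on every call (see Example #8 below).
download_config = DownloadConfig()
download_config.force_download = True
```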
Example #2
    def _eval_end(self, outputs, split="val"):
        val_loss_mean = torch.stack([x[f"{split}_loss"] for x in outputs]).mean().detach().cpu()
        preds = np.concatenate([x["pred"] for x in outputs], axis=0)

        if self.hparams.glue_output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif self.hparams.glue_output_mode == "regression":
            preds = np.squeeze(preds)

        out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)

        results = {f"{split}_loss": val_loss_mean}

        to_return = (results, preds)

        # For validation dataset, include metric results.
        if split != "test":
            # HACK - to avoid a pickle error, the metric is not stored as a class attribute
            metric = nlp.load_metric("glue", name=self.hparams.task)
            # HACK - the .tolist() call here prevents the error:
            # pyarrow.lib.ArrowInvalid: Floating point value truncated
            results.update(metric.compute(preds.tolist(), out_label_ids.tolist()))

        # Test dataset should include idxs for submission
        else:
            idxs = np.concatenate([x["idx"] for x in outputs], axis=0)
            to_return += (idxs,)

        return to_return
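The positional `metric.compute(preds, refs)` call above follows the old `nlp` API; in the successor `datasets` library the same computation is keyword-only. A hedged sketch of the equivalent drop-in, assuming `datasets` is installed:

```python
# Keyword-argument form of the same GLUE metric call (later `datasets` API);
# `preds` and `out_label_ids` are the arrays built in _eval_end above.
from datasets import load_metric

metric = load_metric("glue", self.hparams.task)  # second argument is the GLUE task name
results.update(metric.compute(predictions=preds.tolist(), references=out_label_ids.tolist()))
```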
Example #3
    def compute_rouge(
        self,
        dataset,
        document_column_name,
        summary_column_name,
        rouge_types=["rouge1", "rouge2", "rougeL"],
        **kwargs,
    ):
        """
        Generate hypotheses and compute ROUGE score between summaries and hypotheses
        Args:
            dataset (nlp.Dataset): dataset containing document to summarize
            document_column_name (str): name of the column of the dataset containing documents
            summary_column_name (str): name of the column of the dataset containing summaries
            rouge_types (list(str)): list of ROUGE types you want to compute
            **kwargs: arguments to pass to the run function
        Return:
            score (dict(Score)): dict of ROUGE types with the score (see nlp metrics for details)
        """

        dataset = self.get_summaries(dataset, document_column_name, **kwargs)

        rouge_metric = load_metric("rouge")

        def compute_rouge_batch(example):
            predictions = example[f"{self.name}_hypothesis"]
            references = example[summary_column_name]
            rouge_metric.add_batch(predictions, references)

        dataset.map(compute_rouge_batch, batched=True)
        return dataset, rouge_metric.compute(rouge_types=rouge_types)
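A hypothetical call site for `compute_rouge` (the `summarizer` object and the column names are illustrative, not from the source):

```python
# Hypothetical usage; assumes `summarizer` exposes compute_rouge as defined above.
dataset, rouge_scores = summarizer.compute_rouge(
    dataset,
    document_column_name="document",
    summary_column_name="summary",
    rouge_types=["rouge1", "rougeL"],
)
# Each entry is an AggregateScore; `.mid` holds (precision, recall, fmeasure).
print(rouge_scores["rougeL"].mid.fmeasure)
```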
Example #4
    def __init__(self, source_language: str, target_language: str):

        self.source_language = source_language
        self.target_language = target_language

        forward_model_name = f'Helsinki-NLP/opus-mt-{source_language}-{target_language}'
        self.forward_model = MarianMTModel.from_pretrained(forward_model_name)
        self.forward_tokenizer = MarianTokenizer.from_pretrained(
            forward_model_name)
        self.forward_model.to('cuda')

        backward_model_name = f'Helsinki-NLP/opus-mt-{target_language}-{source_language}'
        self.backward_model = MarianMTModel.from_pretrained(
            backward_model_name)
        self.backward_tokenizer = MarianTokenizer.from_pretrained(
            backward_model_name)
        self.backward_model.to('cuda')

        self.bleu_metric = load_metric("bleu")
        self.bert_score_metric = load_metric("bertscore", device="cuda")
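The two Marian models loaded here are typically chained into a back-translation round trip; a minimal sketch, assuming a recent `transformers` version (the `back_translate` method name is an assumption, not from the source):

```python
    def back_translate(self, sentences):
        # Forward pass: source -> target language
        batch = self.forward_tokenizer(sentences, return_tensors="pt", padding=True).to("cuda")
        translated = self.forward_model.generate(**batch)
        targets = self.forward_tokenizer.batch_decode(translated, skip_special_tokens=True)

        # Backward pass: target -> source language
        batch = self.backward_tokenizer(targets, return_tensors="pt", padding=True).to("cuda")
        restored = self.backward_model.generate(**batch)
        return self.backward_tokenizer.batch_decode(restored, skip_special_tokens=True)
```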
Example #5
    def _init_rouge(self, rouge_type, rouge_method):
        self.rouge_metric = load_metric("rouge")
        self.rouge_type = rouge_type
        if rouge_method == "precision":
            self.rouge_method = 0
        elif rouge_method == "recall":
            self.rouge_method = 1
        elif rouge_method == "fmeasure":
            self.rouge_method = 2
        else:
            raise ValueError(
                'rouge_method must be "precision", "recall" or "fmeasure"')
Example #6
    def __init__(self, name, rouge_type="rouge2", rouge_method="precision"):
        super().__init__(name)
        self.rouge_metric = load_metric("rouge")
        self.rouge_type = rouge_type
        if rouge_method == "precision":
            self.rouge_method = 0
        elif rouge_method == "recall":
            self.rouge_method = 1
        elif rouge_method == "fmeasure":
            self.rouge_method = 2
        else:
            raise ValueError(
                'rouge_method must be "precision", "recall" or "fmeasure"')
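The integer stored in `self.rouge_method` indexes into the `(precision, recall, fmeasure)` tuples returned by the `rouge` metric; a minimal sketch of how a score could be read back out (the `score` method is an assumption, not from the source):

```python
    def score(self, prediction, reference):
        # `rouge` returns a dict of AggregateScore objects; `.mid` is a
        # (precision, recall, fmeasure) named tuple, so the stored index
        # selects the requested field.
        results = self.rouge_metric.compute(
            predictions=[prediction],
            references=[reference],
            rouge_types=[self.rouge_type],
        )
        return results[self.rouge_type].mid[self.rouge_method]
```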
Example #7
    def test_something(self):
        # Print all the available datasets
        print([dataset.id for dataset in nlp.list_datasets()])

        # Load a dataset and print the first examples in the training set
        squad_dataset = nlp.load_dataset('squad')
        print(squad_dataset['train'][0])

        # List all the available metrics
        print([metric.id for metric in nlp.list_metrics()])

        # Load a metric
        squad_metric = nlp.load_metric('squad')
        self.assertEqual(True, True)
Example #8
    def test_load_real_metric(self, metric_name):
        with tempfile.TemporaryDirectory() as temp_data_dir:
            download_config = DownloadConfig()
            download_config.force_download = True
            name = None
            if metric_name == "glue":
                name = "sst2"
            metric = load_metric(metric_name,
                                 name=name,
                                 data_dir=temp_data_dir,
                                 download_config=download_config)

            parameters = inspect.signature(metric._compute).parameters
            self.assertTrue("predictions" in parameters)
            self.assertTrue("references" in parameters)
            self.assertTrue(
                all([p.kind != p.VAR_KEYWORD
                     for p in parameters.values()]))  # no **kwargs
Example #9
    def __init__(self, hparams) -> None:
        super(T5FineTuner, self).__init__()
        self.hparams = hparams
        self.model = T5ForConditionalGeneration.from_pretrained(
            hparams.model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name)
        self.rouge_metric = load_metric('rouge')

        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            self.freeze_params(self.model.get_encoder())
            self.assert_all_frozen(self.model.get_encoder())

        n_observations_per_split = {
            "train": self.hparams.n_train,
            "validation": self.hparams.n_val,
            "test": self.hparams.n_test,
        }
        self.n_obs = {
            k: v if v >= 0 else None
            for k, v in n_observations_per_split.items()
        }
Example #10
    def __init__(self, module, compute_args=None):
        self.scorer = nlp.load_metric(module)
        self.compute_args = compute_args or {}
Example #11
import sacrebleu
from nlp import load_metric

# target = open("./data/en-fi/test.trg")
# output = open("./outputs/en-fi.txt")

target = open("./data/en-fi/newstest2017-enfi.fi")
output = open("./outputs/newstest2017-en-fi.txt")

bert_score_metric = load_metric("bertscore", device="cuda")

targets = []
outputs = []

for target_sample, output_sample in zip(target, output):
    targets.append(target_sample.strip())
    outputs.append(output_sample.strip())

print(sacrebleu.corpus_bleu(outputs, [targets]).score)

print(bert_score_metric.compute(
    outputs,
    targets,
    lang='fi',
    model_type="roberta-base",
    device="cuda"
)['f1'].mean())
Example #12
    save(train_loader, save_path)


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-data_path', default='../datasets/', type=str)
    parser.add_argument('-data_name', default='debate', type=str)
    parser.add_argument('-mode', default='train', type=str)
    parser.add_argument('-batch_size', default=4, type=int)
    parser.add_argument('-random_seed', type=int, default=199744)
    parser.add_argument('-minor_data', action='store_true')
    parser.add_argument('-percentage', default=100, type=int)
    args = parser.parse_args()

    import nlp
    rouge = nlp.load_metric('rouge')
    # set random seed
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    torch.backends.cudnn.deterministic = True

    # if use minor data
    if args.minor_data:
        print('making dataset for {}% data'.format(str(args.percentage)))
        if not os.path.exists(args.data_path + args.data_name + '/minor_data'):
            os.makedirs(args.data_path + args.data_name + '/minor_data')
    multi_news_builder(args)
Example #13
    def __init__(self, config, word_emb, con_emb, word_vocab, concept_vocab,
                 relation_vocab):
        super(DualTransformer, self).__init__()

        self.config = config
        self.word_vocab = word_vocab
        self.concept_vocab = concept_vocab
        self.relation_vocab = relation_vocab

        self.enc_word_embedding = self.build_embedding(word_emb, word_vocab,
                                                       self.config.d_enc_sent)
        self.word_encoder = SentTransformer(config, self.enc_word_embedding,
                                            word_vocab)
        if config.dual_enc and self.concept_vocab is not None and relation_vocab is not None:
            if config.share_con_vocab:
                self.enc_concept_embedding = self.enc_word_embedding
            else:
                self.enc_concept_embedding = self.build_embedding(
                    con_emb, concept_vocab, self.config.d_enc_concept)

            self.graph_encoder = GraphTransformer(config,
                                                  self.enc_concept_embedding,
                                                  concept_vocab,
                                                  relation_vocab)
        else:
            self.graph_encoder = None
        self.dec_word_embedding = self.enc_word_embedding
        self.position_encoder = PositionalEncoding(config.d_dec)
        dual_mode = getattr(config, "dual_mode", "cat")
        if config.dual_enc:
            if dual_mode == "cat":
                decoder_layer = DoubleAttnTransformerDecoderLayer(
                    d_model=config.d_dec,
                    d_sent=config.d_enc_sent,
                    d_con=config.d_enc_concept,
                    heads=config.n_head,
                    d_ff=1024,
                    dropout=config.dropout,
                    att_drop=config.dropout,
                    dual_enc=config.dual_enc,  # dual_enc=False when using a single sentence encoder
                )
            elif dual_mode == "graph_first":
                decoder_layer = DoubleAttnTransformerDecoderLayerGraphFirst(
                    d_model=config.d_dec,
                    d_enc=config.d_model +
                    config.d_concept if config.dual_enc else config.d_model,
                    heads=config.n_head,
                    d_ff=1024,
                    dropout=config.dropout,
                    att_drop=config.dropout,
                    dual_enc=config.dual_enc,  # dual_enc=False when using a single sentence encoder
                )
            elif dual_mode == "sent_first":
                decoder_layer = DoubleAttnTransformerDecoderLayerSentFirst(
                    d_model=config.d_dec,
                    d_enc=config.d_model +
                    config.d_concept if config.dual_enc else config.d_model,
                    heads=config.n_head,
                    d_ff=1024,
                    dropout=config.dropout,
                    att_drop=config.dropout,
                    dual_enc=config.dual_enc,  # dual_enc=False when using a single sentence encoder
                )
            else:
                raise ValueError(
                    'Invalid dual_mode, should be one of (cat, graph_first, sent_first)'
                )
        else:
            decoder_layer = DoubleAttnTransformerDecoderLayer(
                d_model=config.d_dec,
                d_sent=config.d_enc_sent,
                d_con=config.d_enc_concept,
                heads=config.n_head,
                d_ff=1024,
                dropout=config.dropout,
                att_drop=config.dropout,
                dual_enc=config.dual_enc,  # dual_enc=False when using a single sentence encoder
            )
        decoder_norm = nn.LayerNorm(config.d_dec)
        self.decoder = DoubleAttnTransformerDecoder(decoder_layer,
                                                    config.num_layer,
                                                    decoder_norm)

        if word_vocab is not None:
            self.word_vocab_size = len(self.word_vocab)
            self.BOS = self.word_vocab["<bos>"]
            self.EOS = self.word_vocab["<eos>"]

        self.projector = nn.Linear(config.d_dec, self.word_vocab_size)
        if self.config.share_vocab:  # existing bugs to be fixed
            self.projector.weight = self.dec_word_embedding.weight
        if self.config.use_kl_loss:
            self.kl = nn.KLDivLoss(reduction="sum")  # size_average=False is deprecated; reduction="sum" is equivalent

        if self.config.rl_ratio > 0.0 and self.config.rl_type == "bertscore":
            self.rl_metric = nlp.load_metric("bertscore")
Example #14
```python
import nlp
# You need to give the total number of parallel python processes (num_process) and the id of each process (process_id)
bleu_metric = nlp.load_metric('bleu', process_id=torch.distributed.get_rank(), num_process=torch.distributed.get_world_size())

for batch in dataloader:
    model_input, targets = batch
    predictions = model(model_input)
    bleu_metric.add_batch(predictions, targets)
score = bleu_metric.compute()  # Compute the score on the first node by default (can be set to compute on each node as well)
```

Example with a NER metric: `seqeval`
"""

ner_metric = nlp.load_metric('seqeval')
references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
ner_metric.compute(predictions, references)
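
# The call above returns a dict of aggregate scores; a hedged sketch of reading
# them back (key names as in the later `datasets` implementation of seqeval;
# the old `nlp` version should be close):
scores = ner_metric.compute(predictions, references)
print(scores["overall_precision"], scores["overall_recall"], scores["overall_f1"])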

"""# Adding a new dataset or a new metric

There are two ways to add new datasets and metrics to `nlp`:

- datasets can be added with a Pull-Request adding a script in the `datasets` folder of the [`nlp` repository](https://github.com/huggingface/nlp)

=> once the PR is merged, the dataset can be instantiated by its folder name, e.g. `nlp.load_dataset('squad')`. If you want HuggingFace to host the data as well, you will need to ask the HuggingFace team to upload it.

- datasets can also be added with a direct upload using the `nlp` CLI as a user or organization (like for models in `transformers`). In this case the dataset will be accessible under the given user/organization name, e.g. `nlp.load_dataset('thomwolf/squad')`. In this case you can upload the data yourself at the same time and in the same folder.

We will add a full tutorial on how to add and upload datasets soon.
Example #15
def main():
    args = get_args()

    dataset_dict = {
        "stsb": nlp.load_dataset('glue', name="stsb"),
        "rte": nlp.load_dataset('glue', name="rte"),
        "commonsense_qa": nlp.load_dataset('commonsense_qa'),
    }

    for task_name, dataset in dataset_dict.items():
        print(task_name)
        print(dataset_dict[task_name]["train"][0])
        print()

    multitask_model = MultitaskModel.create(
        model_name=model_name,
        model_type_dict={
            "stsb": transformers.AutoModelForSequenceClassification,
            "rte": transformers.AutoModelForSequenceClassification,
            "commonsense_qa": transformers.AutoModelForMultipleChoice,
        },
        model_config_dict={
            "stsb":
            transformers.AutoConfig.from_pretrained(model_name, num_labels=1),
            "rte":
            transformers.AutoConfig.from_pretrained(model_name, num_labels=2),
            "commonsense_qa":
            transformers.AutoConfig.from_pretrained(model_name),
        })

    if model_name.startswith("roberta-"):
        print(multitask_model.encoder.embeddings.word_embeddings.weight.
              data_ptr())
        print(multitask_model.taskmodels_dict["stsb"].roberta.embeddings.
              word_embeddings.weight.data_ptr())
        print(multitask_model.taskmodels_dict["rte"].roberta.embeddings.
              word_embeddings.weight.data_ptr())
        print(multitask_model.taskmodels_dict["commonsense_qa"].roberta.
              embeddings.word_embeddings.weight.data_ptr())

    convert_func_dict = {
        "stsb": convert_to_stsb_features,
        "rte": convert_to_rte_features,
        "commonsense_qa": convert_to_commonsense_qa_features,
    }

    columns_dict = {
        "stsb": ['input_ids', 'attention_mask', 'labels'],
        "rte": ['input_ids', 'attention_mask', 'labels'],
        "commonsense_qa": ['input_ids', 'attention_mask', 'labels'],
    }

    features_dict = {}
    for task_name, dataset in dataset_dict.items():
        features_dict[task_name] = {}
        for phase, phase_dataset in dataset.items():
            features_dict[task_name][phase] = phase_dataset.map(
                convert_func_dict[task_name],
                batched=True,
                load_from_cache_file=False,
            )
            print(task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))
            features_dict[task_name][phase].set_format(
                type="torch",
                columns=columns_dict[task_name],
            )
            print(task_name, phase, len(phase_dataset),
                  len(features_dict[task_name][phase]))

    train_dataset = {
        task_name: dataset["train"]
        for task_name, dataset in features_dict.items()
    }
    trainer = MultitaskTrainer(
        model=multitask_model,
        args=transformers.TrainingArguments(
            output_dir=args.job_dir,
            overwrite_output_dir=True,
            learning_rate=1e-5,
            do_train=True,
            num_train_epochs=3,
            per_device_train_batch_size=args.batch_size,
            save_steps=3000,
        ),
        data_collator=NLPDataCollator(),
        train_dataset=train_dataset,
    )
    trainer.train()

    preds_dict = {}
    for task_name in ["rte", "stsb", "commonsense_qa"]:
        eval_dataloader = DataLoaderWithTaskname(
            task_name,
            trainer.get_eval_dataloader(
                eval_dataset=features_dict[task_name]["validation"]))
        print(eval_dataloader.data_loader.collate_fn)
        preds_dict[task_name] = trainer._prediction_loop(
            eval_dataloader,
            description=f"Validation: {task_name}",
        )

    # Evaluate RTE
    nlp.load_metric('glue', name="rte").compute(
        np.argmax(preds_dict["rte"].predictions, axis=1),
        preds_dict["rte"].label_ids,
    )

    # Evaluate STS-B
    nlp.load_metric('glue', name="stsb").compute(
        preds_dict["stsb"].predictions.flatten(),
        preds_dict["stsb"].label_ids,
    )

    # Evaluate Commonsense QA
    np.mean(
        np.argmax(preds_dict["commonsense_qa"].predictions, axis=1) ==
        preds_dict["commonsense_qa"].label_ids)
Example #16
import nlp

# List all the available datasets
res = nlp.list_datasets()

# Load a dataset and print the first examples in the training set
squad_dataset = nlp.load_dataset('squad')
print(squad_dataset['train'][0])

# List all the available metrics
print(nlp.list_metrics())

# Load a metric
squad_metric = nlp.load_metric('squad')
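The squad metric expects SQuAD-format dicts; a minimal toy sketch, with the prediction/reference schema as in the later `datasets` implementation (the old `nlp` version should match):

```python
# Toy data: the id is arbitrary, answers follow the SQuAD schema.
predictions = [{'id': '0', 'prediction_text': 'Denver Broncos'}]
references = [{'id': '0', 'answers': {'text': ['Denver Broncos'], 'answer_start': [0]}}]
print(squad_metric.compute(predictions=predictions, references=references))
# Expected output along the lines of: {'exact_match': 100.0, 'f1': 100.0}
```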
Example #17
def stats(dataset_script_path, dataset_cache_path, do_rouge):
    def words_counter(text):
        text = text.translate(str.maketrans(punctuation, " " * len(punctuation)))
        return len(text.split(" "))

    def sentences_counter(text):
        return len(sent_tokenize(text))

    rouge_metric = load_metric("rouge")
    num_sources = []
    sum_num_words = []
    sum_num_sentences = []
    doc_num_words = []
    doc_num_sentences = []

    def compute_stats(example):

        # Rouge score
        prediction = example["clean_document"]
        reference = example["clean_summary"]
        rouge_metric.add(prediction, reference)

        # Number of sources
        num_sources.append(example["document"].count("|||") + 1)

        # Summary length
        sum_num_words.append(words_counter(example["clean_summary"]))
        sum_num_sentences.append(sentences_counter(example["clean_summary"]))

        # Document length
        doc_num_words.append(words_counter(example["clean_document"]))
        doc_num_sentences.append(sentences_counter(example["clean_document"]))

    dataset = load_dataset(
        dataset_script_path, cache_dir=dataset_cache_path, split="train+test+validation"
    )

    dataset = dataset.map(compute_stats)

    if do_rouge:
        rouge_stats = rouge_metric.compute(rouge_types=["rouge1", "rouge2", "rougeL"])

    # Print number of examples
    print(f"The dataset contains {len(dataset)} examples.")

    # Print number of sources stats
    print(
        "number of article with:\n - 1 source: {}\n - 2 sources: {}\n - 3 sources: {}\n - 4 sources: {}\n - more sources: {}".format(
            num_sources.count(1),
            num_sources.count(2),
            num_sources.count(3),
            num_sources.count(4),
            len(num_sources)
            - num_sources.count(1)
            - num_sources.count(2)
            - num_sources.count(3)
            - num_sources.count(4),
        )
    )

    # Print length stats
    print(
        "number of words in document:\t{}\nnumber of sentences in document:\t{}\nnumber of words in summary:\t{}\nnumber of sentences in summary:\t{}\n".format(
            np.mean(doc_num_words),
            np.mean(doc_num_sentences),
            np.mean(sum_num_words),
            np.mean(sum_num_sentences),
        )
    )

    # Print ROUGE stats
    if do_rouge:
        print(
            "Rouge-1 R:\t{}\nRouge-2 R:\t{}\nRouge-L R:\t{}\n".format(
                rouge_stats["rouge1"].mid.recall,
                rouge_stats["rouge2"].mid.recall,
                rouge_stats["rougeL"].mid.recall,
            )
        )
    return None
Example #18
                        default=DEFAULT_MODEL_NAME)
    args = parser.parse_args()
    LOG.info("Parsed arguments %s", args)

    # Step 1: preprocess the dataset and load data
    lcsts = LCSTS(args.training_path,
                  args.val_path,
                  args.test_path,
                  output_path=args.preprocess_output_path)

    LOG.info("Train files saved to path {}".format(lcsts.train_merged_csv))
    LOG.info("Validation files saved to path {}".format(lcsts.val_merged_csv))
    LOG.info("Test files saved to path {}".format(lcsts.test_merged_csv))
    tokenizer = load_tokenizer(args.model_name)
    # load rouge for validation
    rouge = nlp.load_metric("rouge")

    # Log CUDA environment info
    if torch.cuda.device_count() > 0:
        with torch.cuda.device(0):
            import sys
            print('__Python VERSION:', sys.version)
            print('__pyTorch VERSION:', torch.__version__)
            print('__CUDA VERSION')
            from subprocess import call
            # call(["nvcc", "--version"]) does not work
            print('__CUDNN VERSION:', torch.backends.cudnn.version())
            print('__Number CUDA Devices:', torch.cuda.device_count())
            print('__Devices')
            call([
                "nvidia-smi", "--format=csv",