Example #1
    def _format_results(self,
                        results: list,
                        as_dict=False,
                        user_uploaded=False
                        ) -> Union[List[HFModelResult], Dict[str, HFModelResult]]:
        """
        Takes raw HuggingFace API results and makes them easier to read and work with
        """
        results = apply(HFModelResult, results)
        if not user_uploaded:
            results = [r for r in results if '/' not in r.name]
        if as_dict:
            dicts = apply(Self.to_dict(), results)
            results = {m['model_name']: m for m in dicts}
        return results
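Below is a minimal, self-contained sketch of the `apply`/`Self` pattern this method relies on, with a hypothetical `ModelStub` standing in for `HFModelResult` (assuming fastai's `apply` and fastcore's `Self`, which is what the snippet appears to use):

from fastai.torch_core import apply   # recursive map over (nested) collections
from fastcore.basics import Self      # Self.to_dict() -> lambda o: o.to_dict()

class ModelStub:
    "Hypothetical stand-in for HFModelResult: wraps a raw model name."
    def __init__(self, name): self.name = name
    def to_dict(self): return {'model_name': self.name}

raw = ['bert-base-uncased', 'google/electra-small-discriminator']
results = apply(ModelStub, raw)                        # wrap every raw result
official = [r for r in results if '/' not in r.name]   # drop user-uploaded models
dicts = apply(Self.to_dict(), results)                 # map .to_dict() over the list
as_dict = {m['model_name']: m for m in dicts}
print(list(as_dict))  # ['bert-base-uncased', 'google/electra-small-discriminator']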
Example #2
    def after_pred(self):
        "Generate SquadResults"
        for i, example_index in enumerate(self.example_indices):
            eval_feature = self.features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [self.pred[output][i] for output in self.pred]
            output = apply(Self.numpy(), to_detach(output))

            if isinstance(self.learn.model, self.xmodel_instances):
                # Some models like the ones in `self.xmodel_instances` use 5 arguments for their predictions
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                self.learn.pred = SquadResult(unique_id,
                                              start_logits,
                                              end_logits,
                                              start_top_index=start_top_index,
                                              end_top_index=end_top_index,
                                              cls_logits=cls_logits)
            else:
                start_logits, end_logits = output
                self.learn.pred = SquadResult(unique_id, start_logits,
                                              end_logits)
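The `apply(Self.numpy(), to_detach(output))` line is the fastai idiom for detaching a list of live tensors and converting them to NumPy; a small standalone illustration (tensor shapes are made up):

import torch
from fastai.torch_core import apply, to_detach
from fastcore.basics import Self

# Pretend this is one example's raw model output: tensors still on the graph
output = [torch.randn(1, 5, requires_grad=True) for _ in range(2)]

output = to_detach(output)            # detach from autograd and move to CPU
output = apply(Self.numpy(), output)  # map .numpy() over every tensor
print(type(output[0]))                # <class 'numpy.ndarray'>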
Example #3
    def predict(
        self,
        text: Union[List[str], str],
        mini_batch_size: int = 32,
        grouped_entities: bool = True,
        **kwargs,
    ) -> List[List[Dict]]:
        """Predict method for running inference using the pre-trained token tagger model.
        Returns a list of lists of tagged entities.

        * **text** - String, list of strings, sentences, or list of sentences to run inference on
        * **mini_batch_size** - Mini batch size
        * **grouped_entities** - Set True to get whole entity span strings (Default True)
        * ****kwargs**(Optional) - Optional arguments for the Transformers tagger
        """
        if isinstance(text, str):
            text = [text]
        results: List[List[Dict]] = []

        dataset = self._tokenize(text)
        dl = DataLoader(dataset, batch_size=mini_batch_size)

        logger.info(f'Running prediction on {len(dataset)} text sequences')
        logger.info(f'Batch size = {mini_batch_size}')

        outputs, _ = super().get_preds(dl=dl)

        inputs = apply(to_device, [b for b in dl], device='cpu')
        inputs = torch.cat([b[0] for b in inputs])  # flatten per-batch input_ids into one tensor
        inputs = apply(Self.numpy(), inputs)

        outputs = torch.cat([o['logits'] for o in outputs])
        outputs = apply(to_detach, outputs, cpu=True)
        outputs = apply(Self.numpy(), outputs)

        # Iterate through batch for tagged token predictions
        for idx, pred in enumerate(outputs):
            entities = pred
            input_ids = inputs[idx]
            tagged_entities = self._generate_tagged_entities(
                entities=entities,
                input_ids=input_ids,
                grouped_entities=grouped_entities)
            results += tagged_entities

        return results
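The post-processing above, concatenating per-batch logits, then detaching and converting to NumPy, can be reproduced in isolation; the fake `outputs` list below stands in for what `get_preds` returns, and all shapes are arbitrary:

import torch
from fastai.torch_core import apply, to_detach
from fastcore.basics import Self

# Two fake mini-batches of token-classification logits: (batch, seq_len, n_labels)
outputs = [{'logits': torch.randn(2, 7, 5)} for _ in range(2)]

logits = torch.cat([o['logits'] for o in outputs])  # -> (4, 7, 5), one row per sequence
logits = apply(to_detach, logits, cpu=True)         # detach and move to CPU
logits = apply(Self.numpy(), logits)                # tensor -> ndarray

for pred in logits:   # iterate per-sequence predictions
    print(pred.shape)  # (7, 5)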
Example #4
    def predict(
        self,
        text: Union[List[str], str],
        t5_prefix: str = 'translate English to German',
        mini_batch_size: int = 32,
        num_beams: int = 1,
        min_length: int = 0,
        max_length: int = 128,
        early_stopping: bool = True,
        **kwargs,
    ) -> List[str]:
        """Predict method for running inference using the pre-trained sequence classifier model.  Keyword arguments
        for parameters of the method `Transformers.PreTrainedModel.generate()` can be used as well.

        * **text** - String, list of strings, sentences, or list of sentences to run inference on
        * **t5_prefix**(Optional) - The pre-appended prefix for the specificied task. Only in use for T5-type models.
        * **mini_batch_size** - Mini batch size
        * **num_beams** - Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search.  Default to 1.
        * **min_length** -  The min length of the sequence to be generated. Default to 0
        * **max_length** - The max length of the sequence to be generated. Between min_length and infinity. Default to 128
        * **early_stopping** - if set to True beam search is stopped when at least num_beams sentences finished per batch.
        * ****kwargs**(Optional) - Optional arguments for the Transformers `PreTrainedModel.generate()` method
        """

        # Make all inputs lists
        if isinstance(text, str):
            text = [text]

        # T5 selects its task from a text prefix, e.g. 'translate English to German: '
        if isinstance(self.model, T5ForConditionalGeneration):
            text = [f'{t5_prefix}: {t}' for t in text]

        dataset = self._tokenize(text)
        dl = DataLoader(dataset, batch_size=mini_batch_size)
        translations = []

        logger.info(f'Running translator on {len(dataset)} text sequences')
        logger.info(f'Batch size = {mini_batch_size}')

        cb = GeneratorCallback(num_beams, min_length, max_length, early_stopping, **kwargs)

        preds, _ = super().get_preds(dl=dl, cbs=[cb])

        preds = apply(Self.squeeze(0), preds)

        for o in preds:
            translations.append(
                self.tokenizer.decode(
                    o,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False,
                )
            )

        return translations
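A sketch of what the T5 branch plus the generation parameters boil down to, using the stock Hugging Face API directly (model name, prompt, and beam settings are illustrative, not taken from the snippet):

from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

t5_prefix = 'translate English to German'
text = ['Hello, how are you?']
text = [f'{t5_prefix}: {t}' for t in text]   # T5 picks its task from this prefix

batch = tokenizer(text, return_tensors='pt', padding=True)
out = model.generate(input_ids=batch['input_ids'],
                     attention_mask=batch['attention_mask'],
                     num_beams=4, min_length=0, max_length=128,
                     early_stopping=True)
print(tokenizer.decode(out[0], skip_special_tokens=True))  # e.g. 'Hallo, wie geht es dir?'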
Example #5
    def predict(
        self,
        text: Union[List[str], str],
        mini_batch_size: int = 32,
        num_tokens_to_produce: int = 50,
        **kwargs,
    ) -> List[str]:
        """Predict method for running inference using the pre-trained sequence classifier model.  Keyword arguments
        for parameters of the method `Transformers.PreTrainedModel.generate()` can be used as well.

        * **text** - String, list of strings, sentences, or list of sentences to run inference on
        * **mini_batch_size** - Mini batch size
        * **num_tokens_to_produce** - Number of tokens you want to generate
        * ****kwargs**(Optional) - Optional arguments for the Transformers `PreTrainedModel.generate()` method
        """
        with torch.no_grad():

            # Make all inputs lists
            if isinstance(text, str):
                text = [text]

            dataset = self._tokenize(text)
            dataloader = DataLoader(dataset, batch_size=mini_batch_size)
            results = []

            logger.info(
                f'Running text generator on {len(dataset)} text sequences')
            logger.info(f'Batch size = {mini_batch_size}')
            for batch in progress_bar(dataloader):
                self.model.eval()
                batch = apply(to_device, batch)

                if len(batch) == 3:
                    inputs = {
                        'input_ids': batch[0],
                        'attention_masks': batch[1],
                        'token_type_ids': batch[2],
                    }
                else:
                    inputs = {
                        'input_ids': batch[0],
                        'attention_masks': batch[1],
                    }
                # model.generate() does not have batch inference implemented yet
                generated_text = self._batch_generate(
                    inputs=inputs,
                    seq_len=batch[0].shape[1],
                    num_tokens_to_produce=num_tokens_to_produce,
                )
                results += generated_text

        return results
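For comparison, the same generate-N-more-tokens behaviour expressed against the stock Hugging Face API (model choice and prompt are arbitrary; the snippet's `_batch_generate` is its own batching helper):

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

enc = tokenizer('The meaning of life is', return_tensors='pt')
num_tokens_to_produce = 50

with torch.no_grad():
    out = model.generate(
        input_ids=enc['input_ids'],
        attention_mask=enc['attention_mask'],
        max_length=enc['input_ids'].shape[1] + num_tokens_to_produce,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token of its own
    )
print(tokenizer.decode(out[0], skip_special_tokens=True))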