Example #1
File: eval.py  Project: stmnk/haystack
# Imports required by this excerpt:
from typing import List, Tuple

import numpy as np
from sentence_transformers import CrossEncoder, SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoConfig


def semantic_answer_similarity(
    predictions: List[List[str]],
    gold_labels: List[List[str]],
    sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
) -> Tuple[List[float], List[float]]:
    """
    Computes Transformer-based similarity of predicted answers to gold labels to derive a more meaningful metric than EM or F1.
    Returns per QA pair a) the similarity of the most likely prediction (top 1) to all available gold labels
                        b) the highest similarity of all predictions to gold labels

    :param predictions: Predicted answers as a list of multiple predictions per question
    :param gold_labels: Labels as a list of multiple possible answers per question
    :param sas_model_name_or_path: SentenceTransformers semantic textual similarity model; a path or a string
                                   pointing to a downloadable model.

    :return: top_1_sas, top_k_sas
    """
    assert len(predictions) == len(gold_labels)

    config = AutoConfig.from_pretrained(sas_model_name_or_path)
    cross_encoder_used = False
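    # Cross-encoder checkpoints expose a *ForSequenceClassification architecture,
    # which is how they are distinguished from bi-encoders here.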
    if config.architectures is not None:
        cross_encoder_used = any(
            arch.endswith("ForSequenceClassification") for arch in config.architectures
        )

    # Compute similarities
    top_1_sas = []
    top_k_sas = []

    # Based on the model string, we load either a bi-encoder or a cross-encoder.
    # The similarity computation differs between the two approaches.
    if cross_encoder_used:
        model = CrossEncoder(sas_model_name_or_path)
        for preds, labels in zip(predictions, gold_labels):
            # TODO add efficient batch mode: put all texts and labels into grid and extract scores afterwards
            grid = []
            for p in preds:
                for l in labels:
                    grid.append((p, l))
            scores = model.predict(grid)
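            # The grid is ordered prediction-major, so the first len(labels)
            # scores pair the top-1 prediction with every gold label.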
            top_1_sas.append(np.max(scores[:len(labels)]))
            top_k_sas.append(np.max(scores))
    else:
        # For Bi-encoders we can flatten predictions and labels into one list
        model = SentenceTransformer(sas_model_name_or_path)
        lengths: List[Tuple[int, int]] = []
        all_texts: List[str] = []
        for p, l in zip(predictions, gold_labels):  # type: ignore
            # TODO potentially exclude (near) exact matches from computations
            all_texts.extend(p)
            all_texts.extend(l)
            lengths.append((len(p), len(l)))
        # then compute embeddings
        embeddings = model.encode(all_texts)

        # then select which embeddings will be used for similarity computations
        current_position = 0
        for len_p, len_l in lengths:
            pred_embeddings = embeddings[current_position:current_position + len_p, :]
            current_position += len_p
            label_embeddings = embeddings[current_position:current_position + len_l, :]
            current_position += len_l
            sims = cosine_similarity(pred_embeddings, label_embeddings)
            top_1_sas.append(np.max(sims[0, :]))
            top_k_sas.append(np.max(sims))

    return top_1_sas, top_k_sas
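
A minimal usage sketch for the function above (illustrative inputs, not from the original source; the default bi-encoder model is downloaded on first use):

predictions = [["Berlin", "The capital of Germany"], ["Paris"]]
gold_labels = [["Berlin"], ["Paris", "France's capital"]]

top_1, top_k = semantic_answer_similarity(predictions, gold_labels)
# top_1[i]: similarity of the first (most likely) prediction for question i
# top_k[i]: best similarity over all predictions for question i
print(top_1, top_k)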
Example #2
# Context assumed by this excerpt (not shown in the source): `CrossEncoder`
# comes from sentence_transformers, while `MetricBase` and `resources` are
# project-local helpers, and `logger` is a module-level logger.
class CESemanticSimilarityMetric(MetricBase):
    """This metric computes the semantic similarity of two sentences using Cross Encoder model.

    By default the we use stsb-roberta-large model.

    see `https://github.com/UKPLab/sentence-transformers` for more information.
    """
    def __init__(self,
                 ce_pretrained_model="stsb-roberta-large",
                 ce_gpu_id=-1,
                 **kwargs):
        """Initialize ce model."""
        super(CESemanticSimilarityMetric, self).__init__()

        if ce_gpu_id == -1:
            logger.warning("CE metric is running on CPU.")
            device = "cpu"
        else:
            logger.info("CE metric is running on GPU %d.", ce_gpu_id)
            device = "cuda:%d" % ce_gpu_id

        logger.info("load ce model.")

        # TODO: use resources utils to manage model.

        self._model = CrossEncoder(
            resources.get_transformers(ce_pretrained_model), device=device)

    def _get_emb(self, sentences):
        """Compute the embedding of sentences."""
        return self._model.encode(sentences)

    def measure_batch(self,
                      origin,
                      paraphrase_list,
                      data_record=None,
                      paraphrase_field="text0"):
        """Measure the metric on a batch of paraphrase_list.

        Args:
            origin (str): the original text.
            paraphrase_list (list): a set of paraphrase_list.
            data_record (dict): the corresponding data record of original text.
            paraphrase_field (str): the field name to paraphrase.

        Returns:
            (list): a list containing the USE similarity metric for each paraphrase.
        """
        return [
            float(x)
            for x in self._model.predict([(origin, paraphrase)
                                          for paraphrase in paraphrase_list])
        ]

    def measure_example(self,
                        origin,
                        paraphrase,
                        data_record=None,
                        paraphrase_field="text0"):
        """Compute the perplexity ratio.

        Args:
            origin (str): original text.
            paraphrase (str): paraphrased text.
            data_record: ignored.
            paraphrase_field: ignored.
        """
        return float(self._model.predict([(origin, paraphrase)])[0])
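
A minimal usage sketch for the metric above (hypothetical call site; assumes `resources.get_transformers` resolves the default model name to a loadable checkpoint):

metric = CESemanticSimilarityMetric(ce_gpu_id=-1)  # runs on CPU

origin = "The quick brown fox jumps over the lazy dog."
paraphrases = ["A fast brown fox leaps over a lazy dog.",
               "It is sunny today."]

print(metric.measure_batch(origin, paraphrases))       # one score per paraphrase
print(metric.measure_example(origin, paraphrases[0]))  # single float score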