Example #1
# Imports needed by this snippet; Embedding (the abstract base class) and
# RequestIgnored (the "skip this input" exception) are project-specific.
import logging
import warnings
from typing import Any, Iterable, List, Optional

import numpy as np
import torch
from sentence_transformers import SentenceTransformer


class SentenceEmbedding(Embedding):
    def __init__(self):
        logging.debug(
            f'torch.__config__.parallel_info():\n {torch.__config__.parallel_info()}'
        )
        self.__model_name = 'distiluse-base-multilingual-cased'
        # self.__text_embed = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
        self.__text_embed = SentenceTransformer(self.__model_name)
        self.__dim = self.__text_embed[-1].out_features

    def get_dim(self) -> int:
        return self.__dim

    def get_version(self) -> str:
        return 'sentence_transformers/' + self.__model_name

    @property
    def batch_size(self) -> int:
        return 32

    def transform(self,
                  *,
                  url: Optional[str] = None,
                  data: Optional[bytes] = None) -> np.ndarray:
        x = self._transform_1(url=url)
        x = self._transform_2([x])
        return x[0]

    def _transform_1(self, *, url: Optional[str] = None) -> str:
        if url is None:
            raise NotImplementedError
        if url.startswith('data:,'):
            text = url.split(',', maxsplit=1)[1]
        else:
            raise RequestIgnored('Not a sentence')
        if len(text) > self.__text_embed.get_max_seq_length():
            warnings.warn('Text truncated while embedding')
        return text

    def _transform_2(self, texts: List[str]):
        vv = self.__text_embed.encode(texts,
                                      batch_size=min(32, len(texts)),
                                      show_progress_bar=False)

        assert len(vv.shape) == 2 and vv.shape[0] == len(
            texts) and vv.shape[1] == self.__dim

        vv /= np.sum(vv**2, axis=1, keepdims=True)**0.5  # L2 normalization
        return vv

    def batching_map(self,
                     *,
                     url: Optional[str] = None,
                     data: Optional[bytes] = None):
        return self._transform_1(url=url)

    def batching_transform(self, *, batch: Any) -> Iterable[np.ndarray]:
        return self._transform_2(batch)
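
A minimal usage sketch for the class above, assuming the project-specific Embedding base class and RequestIgnored exception are importable; the text is passed as a "data:," URL because that is what _transform_1 expects:

# Hypothetical usage of SentenceEmbedding (not part of the original snippet).
embedder = SentenceEmbedding()
print(embedder.get_version())   # sentence_transformers/distiluse-base-multilingual-cased
print(embedder.get_dim())       # e.g. 512 for this model

# Single request: the sentence travels as a "data:," URL.
vector = embedder.transform(url='data:,This is a test sentence.')
print(vector.shape)             # (dim,), L2-normalized

# Batched path: map each request to plain text, then embed all texts at once.
texts = [embedder.batching_map(url=u)
         for u in ('data:,first sentence', 'data:,second sentence')]
vectors = embedder.batching_transform(batch=texts)   # shape (2, dim)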
Example #2
    def convert_input_examples(self, examples: List[InputExampleSTS],
                               model: SentenceTransformer):
        """
        Converts input examples to a SmartBatchingDataset usable to train the model with
        SentenceTransformer.smart_batching_collate as the collate_fn for the DataLoader

        smart_batching_collate as collate_fn is required because it transforms the tokenized texts to the tensors.

        :param examples:
            the input examples for the training
        :param model
            the Sentence BERT model for the conversion
        :return: a SmartBatchingDataset usable to train the model with SentenceTransformer.smart_batching_collate as the collate_fn
            for the DataLoader
        """
        num_texts = len(examples[0].texts)
        inputs = [[] for _ in range(num_texts)]
        labels = []
        too_long = [0] * num_texts
        label_type = None
        iterator = examples
        max_seq_length = model.get_max_seq_length()

        if self.show_progress_bar:
            iterator = tqdm(iterator, desc="Convert dataset")

        for ex_index, example in enumerate(iterator):
            if label_type is None:
                if isinstance(example.label, int):
                    label_type = torch.long
                elif isinstance(example.label, float):
                    label_type = torch.float
            tokenized_texts = [model.tokenize(text) for text in example.texts]

            for i, tokens in enumerate(tokenized_texts):
                if max_seq_length is not None and max_seq_length > 0 and len(
                        tokens) >= max_seq_length:
                    too_long[i] += 1

            labels.append(example.label)
            for i in range(num_texts):
                inputs[i].append(tokenized_texts[i])

        tensor_labels = torch.tensor(labels, dtype=label_type)

        logging.info("Num sentences: %d" % (len(examples)))
        for i in range(num_texts):
            logging.info(
                "Sentences {} longer than max_seqence_length: {}".format(
                    i, too_long[i]))

        self.tokens = inputs
        self.labels = tensor_labels
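
A hedged sketch of how this conversion could be driven; the dataset wrapper name STSDataReader and the use of sentence_transformers.InputExample in place of InputExampleSTS are illustrative assumptions, not part of the original code:

from sentence_transformers import SentenceTransformer, InputExample

# Hypothetical input examples: each carries .texts (a sentence pair) and a
# float .label, the two attributes convert_input_examples reads.
examples = [
    InputExample(texts=['A man is eating food.', 'A man eats something.'],
                 label=0.9),
    InputExample(texts=['A woman is playing guitar.', 'A man drives a car.'],
                 label=0.1),
]

model = SentenceTransformer('distiluse-base-multilingual-cased')
dataset = STSDataReader()          # hypothetical owner of the method above
dataset.show_progress_bar = False
dataset.convert_input_examples(examples, model)

# After conversion, dataset.tokens holds the tokenized texts per column and
# dataset.labels is a torch.float tensor of the labels.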
Example #3
def get_embedding(model,
                  reviews,
                  batch_size=16,
                  unigram_df=None,
                  token_info=False):
    """
    This function embeds the reviews using a Sentence Transformer model. 
    When token_info is True, the individual tokens embeddings and a 
    corresponding token_df is also returned. The token_df has the form
    "token"   | "sentence_id" | "pos"
    where "pos" is obtained using the spacy tokens (by finding the closest 
    unigram for the each token, if no match is found, "pos" is set to None)    

    Note that the sentence_embeddings slightly differ (magnitude of around 
    1e-07) depending on token_info is True, because the pooling is implemented 
    differently. 

    :param model: a SentenceTransformer model or a string that can be loaded
    via SentenceTransformer(model)
    :param reviews: a pandas Series or list containing the reviews 
    :param batch_size: The batchsize 
    :param unigram_df: When "token_df" is contained in components than the 
    unigram dataframe has to be passed (as obtained by get_ngram_df with n = 1)
    :param token_info: It true token_embeddings and token_df are also returned 

    :return: sentence_embeddings, (token_embeddings,  token_df) 
    """
    # input checking------------------------------------------------------------
    assert isinstance(model, (sentence_transformers.SentenceTransformer, str))
    if isinstance(model, str):
        try:
            model = SentenceTransformer(model)
        except Exception as exc:
            raise ValueError("incorrect model string") from exc
    assert isinstance(reviews,
                      (list, pd.core.frame.DataFrame, pd.core.series.Series))
    if isinstance(reviews, (pd.core.frame.DataFrame, pd.core.series.Series)):
        reviews = list(reviews)
    assert isinstance(batch_size, (int, np.int32, np.int64))
    assert isinstance(token_info, bool)
    if token_info:
        assert isinstance(unigram_df, pd.core.frame.DataFrame)
        assert "unigram" in unigram_df.columns
        assert "pos" in unigram_df.columns

    # get the pooling method that is used by the model
    options = [
        "pooling_mode_cls_token", "pooling_mode_mean_tokens",
        "pooling_mode_max_tokens", "pooling_mode_mean_sqrt_len_tokens"
    ]

    chosen_option_logical = \
        [getattr(model._last_module(), option) for option in options]

    pooling_names = ["cls", "mean", "max", "mean_sqrt_len"]

    pooling = pooling_names[int(np.where(chosen_option_logical)[0])]

    # the token information can only be obtained for the mean and cls pooling
    # (could be extended though)
    if pooling not in ["cls", "mean"]:
        assert not token_info

    # early exit if only return sentence embeddings-----------------------------
    if not token_info:
        sentence_embeddings = model.encode(reviews,
                                           show_progress_bar=True,
                                           output_value="sentence_embedding",
                                           batch_size=batch_size)
        sentence_embeddings = np.vstack(sentence_embeddings)
        return sentence_embeddings

    # token_info is True--------------------------------------------------------

    # now we are in the case where not only the sentence_embeddings, but also
    # the information about the tokens is relevant. We first create the token
    # embeddings and then do the pooling manually
    token_embeddings = model.encode(reviews,
                                    show_progress_bar=True,
                                    output_value="token_embeddings",
                                    batch_size=batch_size)

    max_length = model.get_max_seq_length()

    # note that in python [0,1,2][0:100] gives back [0,1,2]
    bert_ids = [model.tokenize(text)[0:(max_length - 2)] for text in reviews]
    # (we have to add the tokens for [CLS] and [SEP] manually)
    bert_ids = [[101] + x + [102] for x in bert_ids]
    lengths = [len(x) for x in bert_ids]
    sentence_ids = np.repeat(np.arange(len(bert_ids)), lengths)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=True)

    # currently bert_ids is a list of lists that we have to flatten
    bert_ids = pd.Series([x for y in bert_ids for x in y])

    # decode ids to tokens
    tokens = pd.Series([tokenizer.decode([x]) for x in bert_ids])

    # now we have to obtain the POS-tags. the only tricky thing is that
    # the BERT tokens and the spacy tokens do not necessarily coincide,
    # therefore we use get_close_matches_indexes (a difflib-style matcher
    # that returns indices) to find the matches between the bert tokens and
    # the spacy unigrams. there are three possibilities:
    # 1. case: no close matches found -> pos set to None
    # 2. case: one close match found -> use that
    # 3. case: more than one close match found:
    #   e.g. indices [10, 45] (spacy tokens) while the bert token is 35
    #   we then want to obtain the spacy token that is closest to the bert
    #   token. However, we have to adjust for the different number of
    #   tokens / unigrams: assume that the review contains 102 bert tokens
    #   but 50 spacy tokens.
    #   when not adjusting for the length we would pick 45, because
    #   |45 - 35| = 10 and |10 - 35| = 25
    #   when adjusting for the length we calculate
    #   argmin(|10 * 100/50 - (35-1)|, |45 * 100/50 - (35-1)|) =
    #   argmin(14, 56) and we now pick the first one
    #   note that we use 100 instead of 102 because of the cls and sep tokens
    #   that don't appear in the spacy tokens.

    bert_pos_list = list()

    for sentence_id in set(sentence_ids):  # only loop over every review once
        bert_tokens = list(tokens[sentence_ids == sentence_id])
        spacy_tokens = list(
            unigram_df["unigram"][unigram_df["sentence_id"] == sentence_id])
        spacy_pos = \
            list(unigram_df["pos"][unigram_df["sentence_id"] == sentence_id])

        n_bert_tokens = len(bert_tokens)
        n_spacy_tokens = len(spacy_tokens)

        # remove the padded tokens from the token embeddings
        # note that token_embeddings is a list such that token_embeddings[i]
        # contains a numpy array with the (padded) token embeddings
        # the padding originates from the batching when calculating the
        # embeddings
        token_embeddings[sentence_id] = \
            token_embeddings[sentence_id][:n_bert_tokens]
        for i, bert_token in enumerate(bert_tokens):
            if bert_token in ["[CLS]", "[SEP]"]:
                bert_pos_list.append(None)
                continue
            # get the closest token matches between the bert token and the
            # spacy tokens from the current review
            matches = get_close_matches_indexes(bert_token, spacy_tokens)
            if len(matches) == 0:
                bert_pos_list.append(None)
            elif len(matches) == 1:
                bert_pos_list.append(spacy_pos[matches[0]])
            else:  # see explanation above
                # in case of multiple matches the first one is chosen
                # (this behaviour is inherited from np.argmin in adjusted_argmin)
                # note that the index of the current bert token is i - 1 and
                # not i (as we don't count the CLS and SEP tokens)
                closest_index = adjusted_argmin(index=i - 1,
                                                spacy_matches=matches,
                                                n_bert=n_bert_tokens - 2,
                                                n_spacy=n_spacy_tokens)
                bert_pos_list.append(spacy_pos[matches[closest_index]])

    # calculate the sentence embeddings from the token embeddings
    # note that the padded tokens were already removed in the previous loop

    if pooling == "mean":
        sentence_embeddings = [
            np.apply_along_axis(np.mean, 0, x) for x in token_embeddings
        ]
    else:  # "cls"
        sentence_embeddings = [x[0, :] for x in token_embeddings]

    sentence_embeddings = np.vstack(sentence_embeddings)

    token_embeddings = np.vstack(token_embeddings)

    bert_token_df = pd.DataFrame({
        "token": tokens,
        "sentence_id": sentence_ids,
        "pos": bert_pos_list
    })

    return sentence_embeddings, token_embeddings, bert_token_df
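
The helpers get_close_matches_indexes and adjusted_argmin are used above but not shown. A minimal sketch of adjusted_argmin that follows the length adjustment described in the comment (rescale each matched spacy index to the BERT-token scale, then take the argmin of the distances) could look like this; it is an assumption about the helper, not its original implementation:

import numpy as np

def adjusted_argmin(*, index, spacy_matches, n_bert, n_spacy):
    # Rescale every matched spacy index to the BERT-token scale (factor
    # n_bert / n_spacy) and pick the match closest to the current BERT token.
    # Ties resolve to the first match, as np.argmin does.
    distances = [abs(m * n_bert / n_spacy - index) for m in spacy_matches]
    return int(np.argmin(distances))

# Worked example from the comment: spacy matches [10, 45], BERT token 35
# (index 34 after dropping [CLS]), 100 real BERT tokens vs. 50 spacy tokens:
# |10 * 100/50 - 34| = 14, |45 * 100/50 - 34| = 56 -> the first match wins.
print(adjusted_argmin(index=34, spacy_matches=[10, 45], n_bert=100, n_spacy=50))  # 0

And a hedged call of get_embedding itself, assuming a unigram_df produced by get_ngram_df with n = 1 (columns "unigram", "pos", "sentence_id") is available:

reviews = ["The battery life is great.", "The screen broke after a week."]
sentence_embeddings, token_embeddings, token_df = get_embedding(
    "bert-base-nli-mean-tokens",   # a mean-pooling Sentence Transformer model
    reviews,
    batch_size=8,
    unigram_df=unigram_df,         # assumed to exist, see lead-in above
    token_info=True)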