예제 #1
0
    def __getitem__(self, query: Union[str, List[str]]):
        """
        Retrieve the embedding(s) for a single string or a list of strings.

        A single string yields an `Embedding`; a list of strings yields an
        `EmbeddingSet` with one `Embedding` per string, each named after its
        string. When `self.fitted_manual` is falsy, `self.cv` and `self.svd`
        are (re)fitted on the given query before transforming it; otherwise
        they are only used to transform.

        Arguments:
            query: single string or list of strings

        Raises:
            ValueError: if any of the given strings is empty

        **Usage**

        ```python
        from whatlies.language import CountVectorLanguage
        lang = CountVectorLanguage(n_components=2, ngram_range=(1, 2), analyzer="char")
        lang[['pizza', 'pizzas', 'firehouse', 'firehydrant']]
        ```
        """
        single = isinstance(query, str)
        texts = [query] if single else query

        # Empty strings are rejected up front, before anything is fitted.
        lengths = [len(text) for text in texts]
        if 0 in lengths:
            raise ValueError(
                "You've passed an empty string to the language model which is not allowed."
            )

        if not self.fitted_manual:
            # Nothing was fitted beforehand: fit on the query itself.
            counts = self.cv.fit_transform(texts)
            vectors = self.svd.fit_transform(counts)
        else:
            counts = self.cv.transform(texts)
            vectors = self.svd.transform(counts)

        if single:
            return Embedding(name=texts[0], vector=vectors[0])
        members = [
            Embedding(name=text, vector=vec) for text, vec in zip(texts, vectors)
        ]
        return EmbeddingSet(*members)
예제 #2
0
    def __getitem__(self, query: Union[str, List[str]]):
        """
        Retrieve a single embedding or a set of embeddings.

        A string containing spaces is split on `" "` and embedded as the sum
        of the embeddings of its parts. A string that is not a key of
        `self.kv` is embedded as a zero vector of length `self.kv.vector_size`.
        Any non-string query is iterated and each element is looked up in turn.

        Arguments:
            query: single string or list of strings

        **Usage**
        ```python
        > from whatlies.language import GensimLanguage
        > lang = GensimLanguage("wordvectors.kv")
        > lang['computer']
        > lang = GensimLanguage("wordvectors.kv")
        > lang[['computer', 'human', 'dog']]
        ```
        """
        if not isinstance(query, str):
            return EmbeddingSet(*[self[tok] for tok in query])

        if " " in query:
            # Each part goes back through __getitem__, so unknown parts
            # contribute a zero vector instead of raising.
            part_vectors = [self[part].vector for part in query.split(" ")]
            return Embedding(query, np.sum(part_vectors, axis=0))

        try:
            # No space in `query` here, so the split yields the query itself.
            looked_up = [self.kv[key] for key in query.split(" ")]
            vec = np.sum(looked_up, axis=0)
        except KeyError:
            vec = np.zeros(self.kv.vector_size)
        return Embedding(query, vec)
 def _get_embedding(self, query: str):
     """
     Build an `Embedding` for `query` by summing its non-special token features.

     The features come from the first element of `self.model(query, padding=False)`,
     and the special-token mask comes from `self.model.tokenizer`. Rows whose
     mask entry is truthy are dropped before summing over axis 0.

     Arguments:
         query: the text to embed

     Returns:
         an `Embedding` named `query`
     """
     # presumably one feature row per token — TODO confirm against the model's output
     token_features = np.array(self.model(query, padding=False)[0])
     tokenized = self.model.tokenizer(
         query, return_special_tokens_mask=True, return_tensors="np"
     )
     is_special = tokenized["special_tokens_mask"][0]
     keep = np.logical_not(is_special)
     return Embedding(query, token_features[keep].sum(axis=0))
예제 #4
0
    def __getitem__(self, item):
        """
        Retrieve a single embedding or a set of embeddings. We retrieve the sentence encoding that
        belongs to the entire utterance.

        A string is run through every component of `self.pipeline`; a list is
        embedded element by element and collected into an `EmbeddingSet`.

        Arguments:
            item: single string or list of strings

        Raises:
            ValueError: if `item` is neither a string nor a list

        **Usage**
        ```python
        from whatlies.language import DIETLanguage
        lang = DIETLanguage("path/to/model.tar.gz")
        lang[['hi', 'hello', 'greetings']]
        ```
        """
        if isinstance(item, list):
            return EmbeddingSet(*[self[i] for i in item])
        if not isinstance(item, str):
            raise ValueError(f"Item must be list of strings got {item}.")

        # RuntimeWarnings raised while the pipeline runs are silenced.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            message = Message({"text": item})
            for component in self.pipeline:
                component.process(message)
            diagnostics = message.as_dict_nlu()["diagnostic_data"]
            # The first diagnostic entry whose key mentions "DIET" is used.
            diet_keys = [key for key in diagnostics.keys() if "DIET" in key]
            # It's assumed that the final token in the array here represents the __CLS__ token.
            # These are also known as the "sentence embeddings"
            transformed = diagnostics[diet_keys[0]]["text_transformed"]
            return Embedding(item, transformed[-1][-1])
예제 #5
0
    def __getitem__(
        self, query: Union[str, List[str]]
    ) -> Union[Embedding, EmbeddingSet]:
        """
        Retrieve a single embedding or a set of embeddings.

        A string is wrapped in a one-element tensor and passed to `self.model`.
        With the `"encode_sequence"` signature the `"sequence_encoding"` output
        is summed over axis 1; otherwise the `"default"` output is used as is.
        Any non-string query is iterated and each element is embedded in turn.

        Arguments:
            query: single string or list of strings

        **Usage**

        ```python
        > from whatlies.language import ConveRTLanguage
        > lang = ConveRTLanguage()
        > lang['bank']
        > lang = ConveRTLanguage()
        > lang[['bank of the river', 'money on the bank', 'bank']]
        ```
        """
        if not isinstance(query, str):
            return EmbeddingSet(*[self[tok] for tok in query])

        batch = tf.convert_to_tensor([query])
        outputs = self.model(batch)
        if self.signature == "encode_sequence":
            summed = outputs["sequence_encoding"].numpy().sum(axis=1)
            vec = summed[0]
        else:
            vec = outputs["default"].numpy()[0]
        return Embedding(query, vec)
예제 #6
0
    def from_names_X(cls, names, X):
        """
        Constructs an `EmbeddingSet` instance from the given embedding names and vectors.

        The names are materialised into a list first, so any iterable
        (including a generator) is accepted, as the argument description states.

        Arguments:
            names: an iterable containing the names of embeddings
            X: an iterable of 1D vectors, or a 2D numpy array; it should have the same length as `names`

        Returns:
            the result of calling `cls` with a dict mapping each name to its `Embedding`

        Raises:
            ValueError: if the number of names differs from the number of vectors

        Usage:

        ```python
        from whatlies.embeddingset import EmbeddingSet

        names = ["foo", "bar", "buz"]
        vecs = [
            [0.1, 0.3],
            [0.7, 0.2],
            [0.1, 0.9],
        ]

        emb = EmbeddingSet.from_names_X(names, vecs)
        ```
        """
        # `len()` needs a sized container; a generator would raise TypeError
        # and would also be exhausted before the zip below.
        names = list(names)
        X = np.array(X)
        if len(X) != len(names):
            raise ValueError(
                f"The number of given names ({len(names)}) and vectors ({len(X)}) should be the same."
            )
        return cls({n: Embedding(n, v) for n, v in zip(names, X)})
예제 #7
0
def get_embedding(vec, text):
    """Wrap a vector and its text in a single whatlies embedding.

    Args:
        vec (numpy.array): embedding vector for text
        text (str): text the vector belongs to; used as the embedding's name

    Returns:
        whatlies.Embedding: embedding constructed with the given vector and text
    """
    embedding = Embedding(name=text, vector=vec)
    return embedding
예제 #8
0
 def __getitem__(self, string):
     """
     Embed `string`, optionally restricted to a span marked with square brackets.

     The string is split on single spaces. The last word starting with "["
     marks the start of the span and the last word ending with "]" marks its
     (inclusive) end. When both markers are present the vector of
     `doc[start:end]` is used; otherwise the vector of the whole document is.

     Arguments:
         string: the text to embed, e.g. "[bank] of the river"

     Returns:
         an `Embedding` named `string`
     """
     doc = self.nlp(string)
     # None (rather than 0 / -1) marks "no bracket seen", so a span that
     # opens on the very first word is honoured.
     start, end = None, None
     for idx, word in enumerate(string.split(" ")):
         # startswith/endswith are safe on the empty words produced by
         # consecutive spaces, where indexing word[0] would raise.
         if word.startswith("["):
             start = idx
         if word.endswith("]"):
             end = idx + 1
     if start is None or end is None:
         return Embedding(string, doc.vector)
     # NOTE(review): assumes whitespace word indices line up with the token
     # indices of `doc` — confirm against the tokenizer in use.
     return Embedding(string, doc[start:end].vector)
예제 #9
0
    def __getitem__(self, query):
        """
        Retrieve a single embedding or a set of embeddings.

        A string is looked up directly in `self.s2v`. Any non-string query is
        iterated and each element is looked up in turn.

        Arguments:
            query: single string or list of strings

        **Usage**
        ```python
        > lang = SpacyLanguage("en_core_web_md")
        > lang['duck|NOUN']
        > lang[['duck|NOUN', 'duck|VERB']]
        ```
        """
        if not isinstance(query, str):
            return EmbeddingSet(*[self[tok] for tok in query])
        return Embedding(query, self.s2v[query])
예제 #10
0
    def __getitem__(self, query: Union[str, List[str]]):
        """
        Retrieve a single embedding or a set of embeddings.

        A string is first passed to `self._input_str_legal` and then embedded
        with `self.model.get_word_vector`. Any non-string query is iterated and
        each element is embedded in turn.

        Arguments:
            query: single string or list of strings

        **Usage**
        ```python
        > lang = FasttextLanguage("cc.en.300.bin")
        > lang['python']
        > lang[['python', 'snake']]
        > lang[['nobody expects', 'the spanish inquisition']]
        ```
        """
        if not isinstance(query, str):
            return EmbeddingSet(*[self[tok] for tok in query])
        self._input_str_legal(query)
        return Embedding(query, self.model.get_word_vector(query))
예제 #11
0
    def average(self, name=None):
        """
        Takes the average over all the embedding vectors in the embeddingset. Turns it into
        a new `Embedding`.

        Arguments:
            name: manually specify the name of the average embedding; when
                omitted (or falsy) the name is `"<set name>.average()"`

        Usage:

        ```python
        from whatlies.embeddingset import EmbeddingSet

        foo = Embedding("foo", [1.0, 0.0])
        bar = Embedding("bar", [0.0, 1.0])
        emb = EmbeddingSet(foo, bar)

        emb.average().vector                   # [0.5, 0.5]
        emb.average(name="the-average").vector # [0.5, 0.5]
        ```
        """
        if not name:
            name = f"{self.name}.average()"
        vectors = self.to_X()
        mean_vector = np.mean(vectors, axis=0)
        return Embedding(name, mean_vector)
예제 #12
0
 def _get_embedding(self, query: str) -> Embedding:
     """Return an `Embedding` named `query` holding the `.vector` of `self.model(query)`."""
     processed = self.model(query)
     return Embedding(query, processed.vector)
예제 #13
0
 def _get_embedding(self, query: str) -> Embedding:
     """
     Return an `Embedding` named `query`.

     The query is passed to `self.model` as a one-element list; the first row
     of the output, converted with `.numpy()`, becomes the vector.
     """
     batch_output = self.model([query])
     return Embedding(query, batch_output.numpy()[0])
예제 #14
0
    def plot(
        self,
        kind: str = "arrow",
        x_axis: Union[int, str, Embedding] = 0,
        y_axis: Union[int, str, Embedding] = 1,
        axis_metric: Optional[Union[str, Callable, Sequence]] = None,
        x_label: Optional[str] = None,
        y_label: Optional[str] = None,
        title: Optional[str] = None,
        color: str = None,
        show_ops: bool = False,
        annot: bool = True,
        axis_option: Optional[str] = None,
    ):
        """
        Makes (perhaps inferior) matplotlib plot. Consider using `plot_interactive` instead.

        Every embedding in the set is reduced to an (x, y) pair and the resulting
        2D embeddings are handed to `handle_2d_plot`.

        Arguments:
            kind: what kind of plot to make, can be `scatter`, `arrow` or `text`
            x_axis: the x-axis to be used, must be given when dim > 2; if an integer, the corresponding
                dimension of embedding is used. A string is looked up in this set first.
            y_axis: the y-axis to be used, must be given when dim > 2; if an integer, the corresponding
                dimension of embedding is used. A string is looked up in this set first.
            axis_metric: the metric used to project each embedding on the axes; only used when the corresponding
                axis (i.e. `x_axis` or `y_axis`) is a string or an `Embedding` instance. It could be a string
                (`'cosine_similarity'`, `'cosine_distance'` or `'euclidean'`), or a callable that takes two vectors as input
                and returns a scalar value as output. To set different metrics for x- and y-axis, a list or a tuple of
                two elements could be given. By default (`None`), normalized scalar projection (i.e. `>` operator) is used.
            x_label: an optional label used for x-axis; if not given, it is set based on value of `x_axis`.
            y_label: an optional label used for y-axis; if not given, it is set based on value of `y_axis`.
            title: an optional title for the plot.
            color: the color of the dots
            show_ops: setting to also show the applied operations, only works for `text`
            annot: should the points be annotated
            axis_option: a string which is passed as `option` argument to `matplotlib.pyplot.axis` in order to control
                axis properties (e.g. using `'equal'` make circles shown circular in the plot). This might be useful
                for preserving geometric relationships (e.g. orthogonality) in the generated plot. See `matplotlib.pyplot.axis`
                [documentation](https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.axis.html#matplotlib-pyplot-axis)
                for possible values and their description.

        Returns:
            this same instance, so that calls can be chained
        """
        # A string axis names an embedding; swap it for the embedding itself.
        if isinstance(x_axis, str):
            x_axis = self[x_axis]
        if isinstance(y_axis, str):
            y_axis = self[y_axis]

        # A list/tuple carries one metric per axis; anything else is shared.
        per_axis = isinstance(axis_metric, (list, tuple))
        x_metric = axis_metric[0] if per_axis else axis_metric
        y_metric = axis_metric[1] if per_axis else axis_metric

        projected = []
        for emb in self.embeddings.values():
            x_val, x_auto = emb._get_plot_axis_value_and_label(
                x_axis, x_metric, dir="x"
            )
            y_val, y_auto = emb._get_plot_axis_value_and_label(
                y_axis, y_metric, dir="y"
            )
            projected.append(
                Embedding(name=emb.name, vector=[x_val, y_val], orig=emb.orig)
            )

        # The default labels are those computed for the last embedding in the loop.
        # NOTE(review): they are only bound inside the loop, so an empty set
        # without explicit labels raises here.
        if x_label is None:
            x_label = x_auto
        if y_label is None:
            y_label = y_auto

        handle_2d_plot(
            projected,
            kind=kind,
            color=color,
            xlabel=x_label,
            ylabel=y_label,
            title=title,
            show_operations=show_ops,
            annot=annot,
            axis_option=axis_option,
        )
        return self