def __getitem__(self, query: Union[str, List[str]]):
    """
    Retrieve a single embedding or a set of embeddings.

    A single string yields one `Embedding`; any other input is treated as a
    collection of strings and yields an `EmbeddingSet`.

    Arguments:
        query: single string or list of strings

    Raises:
        ValueError: if any of the given strings is empty

    **Usage**

    ```python
    from whatlies.language import CountVectorLanguage

    lang = CountVectorLanguage(n_components=2, ngram_range=(1, 2), analyzer="char")
    lang[['pizza', 'pizzas', 'firehouse', 'firehydrant']]
    ```
    """
    single = isinstance(query, str)
    texts = [query] if single else query

    # Measure every entry up front so that a non-sized entry still fails
    # before the emptiness check is reported.
    lengths = [len(text) for text in texts]
    if 0 in lengths:
        raise ValueError(
            "You've passed an empty string to the language model which is not allowed."
        )

    if not self.fitted_manual:
        # Nothing was fitted beforehand: fit both steps on the query itself.
        counts = self.cv.fit_transform(texts)
        vectors = self.svd.fit_transform(counts)
    else:
        counts = self.cv.transform(texts)
        vectors = self.svd.transform(counts)

    if single:
        return Embedding(name=texts[0], vector=vectors[0])

    members = [
        Embedding(name=text, vector=vector) for text, vector in zip(texts, vectors)
    ]
    return EmbeddingSet(*members)
def __getitem__(self, query: Union[str, List[str]]):
    """
    Retrieve a single embedding or a set of embeddings.

    A string containing spaces is split on `" "` and the vectors of its
    parts are summed. A token that is missing from the keyed vectors gets an
    all-zero vector of size `self.kv.vector_size`.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > from whatlies.language import GensimLanguage
    > lang = GensimLanguage("wordvectors.kv")
    > lang['computer']
    > lang = GensimLanguage("wordvectors.kv")
    > lang[['computer', 'human', 'dog']]
    ```
    """
    if not isinstance(query, str):
        return EmbeddingSet(*[self[tok] for tok in query])

    if " " in query:
        # Each part goes back through this method, so unknown parts
        # contribute zeros instead of raising.
        part_vectors = [self[part].vector for part in query.split(" ")]
        return Embedding(query, np.sum(part_vectors, axis=0))

    try:
        # No space left in the query, so this is a sum over exactly one vector.
        vec = np.sum([self.kv[query]], axis=0)
    except KeyError:
        vec = np.zeros(self.kv.vector_size)
    return Embedding(query, vec)
def _get_embedding(self, query: str):
    """
    Build the embedding for a single string.

    The first element of the model output is turned into an array, the rows
    flagged by the tokenizer's special-tokens mask are dropped, and the
    remaining rows are summed along axis 0.

    Arguments:
        query: the text to embed
    """
    # presumably one row per token, special tokens included - TODO confirm
    token_features = np.array(self.model(query, padding=False)[0])

    encoded = self.model.tokenizer(
        query, return_special_tokens_mask=True, return_tensors="np"
    )
    is_special = encoded["special_tokens_mask"][0]

    keep = np.logical_not(is_special)
    summed = token_features[keep].sum(axis=0)
    return Embedding(query, summed)
def __getitem__(self, item):
    """
    Retrieve a single embedding or a set of embeddings.

    The text is pushed through every component of `self.pipeline` and the
    sentence encoding that belongs to the entire utterance is returned.

    Arguments:
        item: single string or list of strings

    Raises:
        ValueError: if `item` is neither a string nor a list

    **Usage**

    ```python
    from whatlies.language import DIETLanguage

    lang = DIETLanguage("path/to/model.tar.gz")
    lang[['hi', 'hello', 'greetings']]
    ```
    """
    if isinstance(item, list):
        return EmbeddingSet(*[self[i] for i in item])
    if not isinstance(item, str):
        raise ValueError(f"Item must be list of strings got {item}.")

    with warnings.catch_warnings():
        # RuntimeWarnings raised while processing the message are silenced.
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        msg = Message({"text": item})
        for component in self.pipeline:
            component.process(msg)

        diagnostics = msg.as_dict_nlu()["diagnostic_data"]
        diet_keys = [key for key in diagnostics.keys() if "DIET" in key]
        diet_key = diet_keys[0]

        # It's assumed that the final token in the array here represents the
        # __CLS__ token, also known as the "sentence embedding".
        transformed = diagnostics[diet_key]["text_transformed"]
        sentence_vector = transformed[-1][-1]
        return Embedding(item, sentence_vector)
def __getitem__(
    self, query: Union[str, List[str]]
) -> Union[Embedding, EmbeddingSet]:
    """
    Retrieve a single embedding or a set of embeddings.

    When `self.signature` is `"encode_sequence"` the `"sequence_encoding"`
    output is summed over axis 1; otherwise the `"default"` output is used
    as is.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > from whatlies.language import ConveRTLanguage
    > lang = ConveRTLanguage()
    > lang['bank']
    > lang = ConveRTLanguage()
    > lang[['bank of the river', 'money on the bank', 'bank']]
    ```
    """
    if not isinstance(query, str):
        return EmbeddingSet(*[self[tok] for tok in query])

    # The model is fed a batch of one; element 0 is taken back out below.
    batch = tf.convert_to_tensor([query])
    outputs = self.model(batch)

    if self.signature == "encode_sequence":
        vec = outputs["sequence_encoding"].numpy().sum(axis=1)[0]
    else:
        vec = outputs["default"].numpy()[0]
    return Embedding(query, vec)
def from_names_X(cls, names, X):
    """
    Constructs an `EmbeddingSet` instance from the given embedding names and vectors.

    Arguments:
        names: an iterable containing the names of embeddings; it may be a
            one-shot iterable such as a generator
        X: an iterable of 1D vectors, or a 2D numpy array; it should have the
            same length as `names`

    Raises:
        ValueError: if the number of names and the number of vectors differ

    Usage:

    ```python
    from whatlies.embeddingset import EmbeddingSet

    names = ["foo", "bar", "buz"]
    vecs = [
        [0.1, 0.3],
        [0.7, 0.2],
        [0.1, 0.9],
    ]

    emb = EmbeddingSet.from_names_X(names, vecs)
    ```
    """
    # Unsized iterables (e.g. generators) have no len() and would be consumed
    # by a first pass, so they are materialised before being counted.
    if not hasattr(names, "__len__"):
        names = list(names)
    if not hasattr(X, "__len__"):
        X = list(X)

    X = np.array(X)
    if len(X) != len(names):
        raise ValueError(
            f"The number of given names ({len(names)}) and vectors ({len(X)}) should be the same."
        )
    return cls({n: Embedding(n, v) for n, v in zip(names, X)})
def get_embedding(vec, text):
    """Wrap a vector and its text into a single whatlies embedding.

    Args:
        vec (numpy.array): embedding vector for text
        text (str): text the vector belongs to; used as the embedding name

    Returns:
        whatlies.Embedding: embedding constructed with the given vector and text
    """
    embedding = Embedding(name=text, vector=vec)
    return embedding
def __getitem__(self, string):
    """
    Retrieve the embedding of a string, optionally restricted to a bracketed span.

    The string is split on single spaces. A word starting with `[` opens the
    span and a word ending with `]` closes it; when both markers are present
    the vector of that slice of the parsed document is used, otherwise the
    vector of the whole document is used.

    Arguments:
        string: the text to embed, e.g. `"[bank] of the river"`
    """
    doc = self.nlp(string)

    # `None` means "marker not seen". Using index 0 as the sentinel would make
    # a span that opens on the very first word indistinguishable from no span.
    start, end = None, None
    for idx, word in enumerate(string.split(" ")):
        # startswith/endswith are safe on the empty words produced by an
        # empty query or by consecutive spaces.
        if word.startswith("["):
            start = idx
        if word.endswith("]"):
            end = idx + 1

    if start is None or end is None:
        vec = doc.vector
    else:
        # NOTE(review): the offsets are positions in the space-split string and
        # are used directly as token offsets into `doc` - confirm the
        # tokenizer keeps the two aligned.
        vec = doc[start:end].vector
    return Embedding(string, vec)
def __getitem__(self, query):
    """
    Retrieve a single embedding or a set of embeddings.

    A string is looked up directly in `self.s2v`; anything else is treated
    as a collection of such strings.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > # `lang` is an instance of this language backend
    > lang['duck|NOUN']
    > lang[['duck|NOUN', 'duck|VERB']]
    ```
    """
    if not isinstance(query, str):
        return EmbeddingSet(*[self[tok] for tok in query])
    return Embedding(query, self.s2v[query])
def __getitem__(self, query: Union[str, List[str]]):
    """
    Retrieve a single embedding or a set of embeddings.

    A string is first checked with `self._input_str_legal` and then embedded
    with the model's `get_word_vector`; anything else is treated as a
    collection of such strings.

    Arguments:
        query: single string or list of strings

    **Usage**

    ```python
    > lang = FasttextLanguage("cc.en.300.bin")
    > lang['python']
    > lang[['python', 'snake']]
    > lang[['nobody expects', 'the spanish inquisition']]
    ```
    """
    if not isinstance(query, str):
        return EmbeddingSet(*[self[tok] for tok in query])

    # Validation runs before the model is queried.
    self._input_str_legal(query)
    word_vector = self.model.get_word_vector(query)
    return Embedding(query, word_vector)
def average(self, name=None):
    """
    Takes the average over all the embedding vectors in the embeddingset.
    Turns it into a new `Embedding`.

    Arguments:
        name: manually specify the name of the average embedding; when left
            empty the name becomes `"<name of this set>.average()"`

    Usage:

    ```python
    from whatlies.embeddingset import EmbeddingSet

    foo = Embedding("foo", [1.0, 0.0])
    bar = Embedding("bar", [0.0, 1.0])
    emb = EmbeddingSet(foo, bar)

    emb.average().vector                   # [0.5, 0.5]
    emb.average(name="the-average").vector # [0.5, 0.5]
    ```
    """
    if not name:
        name = f"{self.name}.average()"
    # Mean over axis 0 of the matrix returned by `to_X`.
    mean_vector = np.mean(self.to_X(), axis=0)
    return Embedding(name, mean_vector)
def _get_embedding(self, query: str) -> Embedding:
    """
    Build the embedding for a single string.

    The model is called on the text and the `.vector` attribute of its
    result becomes the embedding vector.

    Arguments:
        query: the text to embed
    """
    processed = self.model(query)
    return Embedding(query, processed.vector)
def _get_embedding(self, query: str) -> Embedding:
    """
    Build the embedding for a single string.

    The model is called on a one-element batch; its output is converted with
    `.numpy()` and the first row becomes the embedding vector.

    Arguments:
        query: the text to embed
    """
    batch_output = self.model([query])
    return Embedding(query, batch_output.numpy()[0])
def plot(
    self,
    kind: str = "arrow",
    x_axis: Union[int, str, Embedding] = 0,
    y_axis: Union[int, str, Embedding] = 1,
    axis_metric: Optional[Union[str, Callable, Sequence]] = None,
    x_label: Optional[str] = None,
    y_label: Optional[str] = None,
    title: Optional[str] = None,
    color: str = None,
    show_ops: bool = False,
    annot: bool = True,
    axis_option: Optional[str] = None,
):
    """
    Makes (perhaps inferior) matplotlib plot. Consider using `plot_interactive` instead.

    Every embedding in the set is reduced to an (x, y) pair and the resulting
    2D embeddings are handed to `handle_2d_plot`. The set itself is returned.

    Arguments:
        kind: what kind of plot to make, can be `scatter`, `arrow` or `text`
        x_axis: the x-axis to be used, must be given when dim > 2; if an integer, the corresponding
            dimension of embedding is used. A string is looked up in this set first.
        y_axis: the y-axis to be used, must be given when dim > 2; if an integer, the corresponding
            dimension of embedding is used. A string is looked up in this set first.
        axis_metric: the metric used to project each embedding on the axes; only used when the corresponding
            axis (i.e. `x_axis` or `y_axis`) is a string or an `Embedding` instance. It could be a string
            (`'cosine_similarity'`, `'cosine_distance'` or `'euclidean'`), or a callable that takes two vectors as input
            and returns a scalar value as output. To set different metrics for x- and y-axis, a list or a tuple of
            two elements could be given. By default (`None`), normalized scalar projection (i.e. `>` operator) is used.
        x_label: an optional label used for x-axis; if not given, it is set based on value of `x_axis`.
        y_label: an optional label used for y-axis; if not given, it is set based on value of `y_axis`.
        title: an optional title for the plot.
        color: the color of the dots
        show_ops: setting to also show the applied operations, only works for `text`
        annot: should the points be annotated
        axis_option: a string which is passed as `option` argument to `matplotlib.pyplot.axis` in order to control
            axis properties (e.g. using `'equal'` make circles shown circular in the plot). This might be useful
            for preserving geometric relationships (e.g. orthogonality) in the generated plot.
            See `matplotlib.pyplot.axis` [documentation](https://matplotlib.org/3.1.0/api/_as_gen/matplotlib.pyplot.axis.html#matplotlib-pyplot-axis)
            for possible values and their description.
    """
    # Axes given by name refer to embeddings stored in this set.
    if isinstance(x_axis, str):
        x_axis = self[x_axis]
    if isinstance(y_axis, str):
        y_axis = self[y_axis]

    # A list/tuple carries one metric per axis; anything else is shared.
    if isinstance(axis_metric, (list, tuple)):
        x_axis_metric, y_axis_metric = axis_metric[0], axis_metric[1]
    else:
        x_axis_metric = y_axis_metric = axis_metric

    projected = []
    for member in self.embeddings.values():
        x_coord, x_lab = member._get_plot_axis_value_and_label(
            x_axis, x_axis_metric, dir="x"
        )
        y_coord, y_lab = member._get_plot_axis_value_and_label(
            y_axis, y_axis_metric, dir="y"
        )
        projected.append(
            Embedding(name=member.name, vector=[x_coord, y_coord], orig=member.orig)
        )

    # Labels fall back to the ones reported for the last embedding in the loop.
    if x_label is None:
        x_label = x_lab
    if y_label is None:
        y_label = y_lab

    handle_2d_plot(
        projected,
        kind=kind,
        color=color,
        xlabel=x_label,
        ylabel=y_label,
        title=title,
        show_operations=show_ops,
        annot=annot,
        axis_option=axis_option,
    )
    return self