Example #1
from bpemb import BPEmb

from whatlies.embedding import Embedding
from whatlies.embeddingset import EmbeddingSet


class BPEmbLang:
    """
    This object is used to lazily fetch [Embedding][whatlies.embedding.Embedding]s or
    [EmbeddingSet][whatlies.embeddingset.EmbeddingSet]s from a Byte-Pair Encoding
    backend. This object is meant for retrieval, not plotting.

    This language represents token-free pre-trained subword embeddings. Originally created by
    Benjamin Heinzerling and Michael Strube.

    Important:
        These vectors will auto-download. You can also specify "multi" to download
        embeddings for multiple languages at the same time. A full list of available
        languages can be found [here](https://nlp.h-its.org/bpemb). The article that
        belongs to this work can be found [here](http://www.lrec-conf.org/proceedings/lrec2018/pdf/1049.pdf).

    Arguments:
        lang: name of the model to load

    **Usage**:

    ```python
    > from whatlies.language import BPEmbLang
    > lang = BPEmbLang(lang="en")
    > lang['python']
    > lang = BPEmbLang(lang="multi")
    > lang[['hund', 'hond', 'dog']]
    ```
    """
    def __init__(self, lang):
        self.module = BPEmb(lang=lang)

    def __getitem__(self, item):
        """
        Retrieve a single embedding or a set of embeddings. If an embedding contains multiple
        sub-tokens then we'll average them before retrieval.

        Arguments:
            item: single string or list of strings

        **Usage**
        ```python
        > lang = BPEmbLang(lang="en")
        > lang['python']
        > lang[['python', 'snake']]
        > lang[['nobody expects', 'the spanish inquisition']]
        ```
        """
        if isinstance(item, str):
            return Embedding(item, self.module.embed(item).mean(axis=0))
        if isinstance(item, list):
            return EmbeddingSet(*[self[i] for i in item])
        raise ValueError(f"Item must be a string or a list of strings, got {item}.")
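
# A minimal usage sketch for the class above. It assumes the whatlies package
# is installed (Embedding/EmbeddingSet come from there) and that the English
# byte-pair vectors are allowed to auto-download on first use.
if __name__ == "__main__":
    lang = BPEmbLang(lang="en")
    single = lang["python"]                  # one Embedding, averaged over sub-tokens
    several = lang[["python", "snake"]]      # an EmbeddingSet with one entry per query
    print(single.vector.shape)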
Example #2
import numpy as np
import scipy.spatial.distance

# `Distance` (base class) and `rake` (keyword extractor) come from the
# surrounding project and are not shown in this snippet.


class BPEmb_Embedding_distance(Distance):
    # When the object is initialised we choose the vocabulary size (vs), the
    # dimensionality of the embedding space (dim) and the language.
    # For more info: https://github.com/bheinzerling/bpemb
    def __init__(self,
                 lang="en",
                 dim=200,
                 vs=200000,
                 distance_metric="cosine"):
        from bpemb import BPEmb
        self.bpemb = BPEmb(lang=lang, dim=dim, vs=vs)
        self.distance_metric = distance_metric

    def distance(self, sentence, keyword):
        sentence_raked = rake(sentence, "")
        # Use only the embeddings of the sentence after rake has extracted its phrases
        sentence_embeddings = np.concatenate(
            [self.bpemb.embed(s) for s in sentence_raked])
        keyword_embeddings = self.bpemb.embed(keyword)
        # Find the distance between them with the configured metric (cosine by default)
        distance = scipy.spatial.distance.cdist(sentence_embeddings,
                                                keyword_embeddings,
                                                self.distance_metric)
        return distance.mean()

    def multiple_distances(self, sentence, list_keywords, stop_words):
        sentence_raked = rake(sentence, stop_words)
        # Use only the embeddings of the sentence after rake has extracted its phrases
        sentence_embeddings = np.concatenate(
            [self.bpemb.embed(s) for s in sentence_raked])
        result = []
        for keyword in list_keywords:
            keyword_embeddings = self.bpemb.embed(keyword)
            distance = scipy.spatial.distance.cdist(sentence_embeddings,
                                                    keyword_embeddings,
                                                    "cosine")
            result.append(distance.mean())
        return result, sentence_embeddings
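
# Hedged sketch of the distance computation above using only bpemb and scipy;
# `rake` and the `Distance` base class are project-specific and not reproduced
# here, and the vs/dim values in the example call are illustrative assumptions.
from bpemb import BPEmb
from scipy.spatial.distance import cdist

def bpemb_cosine_distance(sentence: str, keyword: str, bpemb: BPEmb) -> float:
    # embed() returns one vector per byte-pair sub-token of each input string
    sentence_embeddings = bpemb.embed(sentence)
    keyword_embeddings = bpemb.embed(keyword)
    # mean pairwise cosine distance between the two sets of sub-token vectors
    return float(cdist(sentence_embeddings, keyword_embeddings, "cosine").mean())

# Example call (downloads the vectors on first use):
# bpemb_cosine_distance("the cat sat on the mat", "feline", BPEmb(lang="en", dim=100, vs=50000))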
class BPEmbeddings(EmbeddingMan):
    def __init__(self, case_sensitive=False, dim=None, bp_vocab_size=0):
        super(BPEmbeddings, self).__init__(case_sensitive=case_sensitive,
                                           dim=dim)
        self.bp_vocab_size = bp_vocab_size
        self.model = BPEmb(lang="en", dim=self.dim, vs=self.bp_vocab_size)

    def get_embedding_vec(self, word):
        if self.model is None:
            self.model = BPEmb(lang="en", dim=self.dim, vs=self.bp_vocab_size)
        if not self.case_sensitive:
            word = word.lower()
        vecs = self.model.embed(word)
        return np.reshape(np.sum(vecs, axis=0), (self.dim, ))
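
# Quick illustration of what get_embedding_vec above does: summing the
# per-sub-token vectors yields one fixed-size vector per word. The dim/vs
# values are assumptions for illustration, not the project's configuration.
if __name__ == "__main__":
    from bpemb import BPEmb
    demo_model = BPEmb(lang="en", dim=100, vs=50000)
    vecs = demo_model.embed("stratford")      # shape: (n_subwords, 100)
    print(vecs.sum(axis=0).shape)             # (100,)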
Example #4
from typing import List

from bpemb import BPEmb


def get_multibpe_embeddings(x: List[str],
                            multibpemb=None,
                            vs=1000000,
                            dim=300):
    if multibpemb is None:
        multibpemb = BPEmb(lang="multi", vs=vs, dim=dim)

    embeddings = []
    for sentence in x:
        features = multibpemb.embed(sentence)
        embeddings.append(features)

    embeddings = pad(embeddings, [0 for _ in range(dim)], 32)
    return embeddings
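
# `pad` is not defined in this snippet. A hypothetical helper matching the call
# above might look like this: it truncates or right-pads each sequence of
# sub-token vectors with the given zero vector up to `max_len` rows.
import numpy as np

def pad(sequences, pad_value, max_len):
    padded = []
    for seq in sequences:
        rows = list(seq)[:max_len]
        while len(rows) < max_len:
            rows.append(pad_value)
        padded.append(np.asarray(rows, dtype="float32"))
    return np.stack(padded)   # shape: (n_sentences, max_len, dim)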
from bpemb import BPEmb
from numpy import ndarray

# EmbeddingsModel is the project's base class; its import is not shown here.


class BPEmbEmbeddingsModel(EmbeddingsModel):
    """
    BPEmb embeddings network from `BPEmb: Tokenization-free Pre-trained Subword Embeddings in 275 Languages
    <https://www.aclweb.org/anthology/L18-1473/>`_. The arguments are the same as the
    `BPEmb class <https://github.com/bheinzerling/bpemb/blob/master/bpemb/bpemb.py>`_.

    Params:
        kwargs: Same as the :class:`~bpemb.BPEmb` class.
    """
    def __init__(self, **kwargs) -> None:
        super().__init__()
        self.model = BPEmb(**kwargs)

    def __call__(self, word: str) -> ndarray:
        """
        Callable method to get a word vector.

        Args:
            word (str): Word to get vector.

        Return:
            The BP embedding for a word.
        """
        return self.model.embed(word)
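
# Hedged usage sketch for the model above: the kwargs are forwarded verbatim to
# bpemb.BPEmb, so the lang/dim/vs values below are illustrative assumptions.
if __name__ == "__main__":
    model = BPEmbEmbeddingsModel(lang="en", dim=100, vs=50000)
    vectors = model("stratford")   # ndarray with one row per byte-pair sub-token
    print(vectors.shape)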
Example #6
class EmCore(CoreBase):
    """Annotation core based on spacy parser, pos tagger"""
    def __init__(self, lang: str):
        if not can_process_lang(lang):
            raise ValueError(f"'{lang}' is not valid language id.")
        self.lang_model_definition = lang_models[lang]
        self.nlp = spacy.load(self.lang_model_definition.spacy_vocab_name)
        self.bpemb = BPEmb(lang=lang, dim=300)

    def can_process(self, doc: Document,
                    text_process_params: TextProcessParams) -> bool:
        return doc.main_lang == self.lang_model_definition.lang

    def _vectorize_entity(self, entity: Entity, kw: Span):
        vector = np.zeros((1, self.bpemb.dim), dtype='float32')
        counter = 0
        for t in kw:
            if not t.is_stop:
                w_vector = self.bpemb.embed(t.lemma_).sum(axis=0)
                vector += w_vector
                counter += 1
        if counter > 1:
            vector /= counter
        entity.vector = vector

    def _vectorize_sent(self, sent: Sentence, sp_sent: Span):
        vector = np.zeros((1, self.bpemb.dim), dtype='float32')
        counter = 0
        processable_pos = {'NOUN', 'PROPN', 'ADJ', 'VERB'}
        stop_words = self.nlp.Defaults.stop_words
        for t in sp_sent:
            if t.pos_ not in processable_pos:
                continue
            if t.is_stop or t.lower_ in stop_words or t.lemma_ in stop_words:
                continue
            w_vector = self.bpemb.embed(t.lemma_).sum(axis=0)
            vector += w_vector
            counter += 1
        if counter > 1:
            vector /= counter
        sent.embedding = vector

    @staticmethod
    def _count_tokens(kw: Entity) -> int:
        matches = _re_tokenizer.findall(kw.lemma)
        return len(matches)

    @staticmethod
    def _get_kw_indices(doc_vector: np.ndarray, kw_candidates: List[Entity],
                        kw_num: int, diversity_factor: float) -> List[int]:
        """ Method selects indices of keywords in kw_candidates list.
        Prototype: _MMR in https://github.com/swisscom/ai-research-keyphrase-extraction/blob/master/swisscom_ai/research_keyphrase/model/method.py
        original article: https://arxiv.org/pdf/1801.04470.pdf

        :param doc_vector: numpy array with the document embedding.
        :param kw_candidates: list of keyword candidates; each item is an Entity.
        :param kw_num: number of desired keywords.
        :param diversity_factor: controls the trade-off between informativeness and diversity, in [0..1].
        """
        if kw_num == 0 or kw_num is None:
            kw_num = 10
        kw_num = min(kw_num, len(kw_candidates))
        if kw_num == 0:
            return []
        if kw_num >= len(kw_candidates):
            return list(range(kw_num))

        kw_vectors_nd = np.vstack([entity.vector for entity in kw_candidates])
        selected_candidates = []
        unselected_candidates = [c for c in range(len(kw_candidates))]

        # select the first keyword
        kw_distances = cosine_similarity(doc_vector, kw_vectors_nd)
        kw_distances_1d = np.reshape(kw_distances, (kw_distances.shape[1]))
        selected: int = np.argmax(kw_distances_1d)
        selected_candidates.append(selected)
        unselected_candidates.remove(selected)
        kw_counter = EmCore._count_tokens(kw_candidates[selected])

        # select other N-1 keywords
        for _ in range(kw_num - 1):
            kw_vectors_selected_nd = np.vstack(
                [kw_candidates[ind].vector for ind in selected_candidates])
            kw_vectors_unselected_nd = np.vstack(
                [kw_candidates[ind].vector for ind in unselected_candidates])
            similarities = cosine_similarity(kw_vectors_unselected_nd,
                                             kw_vectors_selected_nd)
            max_similarity_per_unselected = np.max(similarities, axis=1)
            ranks = []
            for i, unselected_ind in enumerate(unselected_candidates):
                doc_sim = kw_distances_1d[unselected_ind]
                max_kw_sim = max_similarity_per_unselected[i]
                rank = (1 - diversity_factor
                        ) * doc_sim - diversity_factor * max_kw_sim
                ranks.append(rank)
            selected = unselected_candidates[np.argmax(ranks)]
            selected_candidates.append(selected)
            unselected_candidates.remove(selected)
            kw_counter += EmCore._count_tokens(kw_candidates[selected])
            if kw_counter >= kw_num:
                break

        return selected_candidates

    def _extract_keywords(self, doc: Document, kw_candidates, kw_num: int):
        res = {}
        for fragment, kw in kw_candidates:
            lemma: str = kw.lemma_
            if lemma not in res:
                entity = Entity(lemma, EntityKind.KEYWORD)
                res[lemma] = entity
                self._vectorize_entity(entity, kw)
            entity = res[lemma]
            entity.entries.append(fragment)

        entities = list(res.values())

        doc_vector = np.zeros((1, self.bpemb.dim), dtype='float32')
        count = 0
        for entity in entities:
            doc_vector += entity.vector * len(entity.entries)
            count += len(entity.entries)
        if count > 1:
            doc_vector /= count

        doc.embedding = doc_vector

        kw_indices = self._get_kw_indices(doc_vector, entities, kw_num,
                                          _diversity_factor)
        doc.keywords += [entities[ind] for ind in kw_indices]

    def _find_noun_chunks(self, sent: Span) -> List[Span]:
        result = []

        def add_buff_to_result(buff: List[Token]):
            if len(buff) > 0:
                doc: Doc = buff[0].doc
                start = buff[-1].idx
                end = buff[0].idx + len(buff[0])
                result.append(doc.char_span(start, end))
                buff.clear()

        nouns = {'NOUN', 'PROPN'}
        stop_words = self.nlp.Defaults.stop_words
        buff = []
        for t in reversed(sent):
            if t.is_stop or t.lower_ in stop_words or t.lemma_ in stop_words or len(
                    t) < 3:
                add_buff_to_result(buff)
                continue
            if t.pos_ in nouns:
                if len(buff) > 0 and buff[-1].pos_ not in nouns:
                    add_buff_to_result(buff)
                buff.append(t)
                continue
            if len(t) > 0 and t.pos_ == 'ADJ':
                buff.append(t)
                continue
            add_buff_to_result(buff)
        add_buff_to_result(buff)  # the buffer may still hold tokens
        return result

    @staticmethod
    def _extract_textrank_digest(doc: Document, summary_size: int):
        sent_len = len(doc.sentences)
        sim_mat = np.zeros([sent_len, sent_len])
        for y in range(sent_len):
            for x in range(y + 1, sent_len):
                sim = max(
                    0.0001,
                    cosine_similarity(doc.sentences[x].embedding,
                                      doc.sentences[y].embedding)[0, 0])
                sim_mat[y][x] = sim
                sim_mat[x][y] = sim_mat[y][x]
        for x in range(sent_len):
            sim_mat[x] /= sim_mat[x].sum()
        nx_graph = nx.from_numpy_array(sim_mat)
        try:
            scores = nx.pagerank(nx_graph)
        except nx.PowerIterationFailedConvergence as ex:
            doc.warnings.append(
                MessageWithCode(
                    3,
                    "Exception PowerIterationFailedConvergence - cannot calculate "
                    "TextRank matrix, fallback is used."))
            filtered_indices = range(summary_size)
        else:
            ranged_importances: List[_ImportanceIndex] = []
            for i in range(sent_len):
                doc.sentences[i].importance = scores[i]
                ranged_importances.append(_ImportanceIndex(scores[i], i))
            ranged_importances = sorted(ranged_importances,
                                        key=lambda x: x.importance,
                                        reverse=True)
            filtered_indices: List[int] = sorted(
                [ri.index for ri in ranged_importances[:summary_size]])
        doc.summary.clear()
        for i in filtered_indices:
            entity = Entity(str(doc.sentences[i]), EntityKind.SUMMARY_SENTENCE)
            entity.entries.append(doc.sentences[i])
            doc.summary.append(entity)

    def process_document(self, doc: Document,
                         text_process_params: TextProcessParams):
        doc.sentences.clear()
        doc.keywords.clear()
        doc.entities.clear()

        paragraphs: List[FragmentPositions] = parse_paragraphs(doc.text)
        kw_candidates = []
        entities = defaultdict(list)
        for p in paragraphs:
            paragraph_text = text_fragment(doc.text, p)
            pp = self.nlp(paragraph_text)
            for sent in pp.sents:
                doc_sent = Sentence(doc, p.start + sent.start_char,
                                    p.start + sent.end_char,
                                    self.lang_model_definition.lang)
                self._vectorize_sent(doc_sent, sent)
                doc.sentences.append(doc_sent)

                kw_candidates += [(Fragment(doc, p.start + kw.start_char,
                                            p.start + kw.end_char), kw)
                                  for kw in self._find_noun_chunks(sent)]
            for en in pp.ents:
                en_key = _NeKey(en.lemma_, en.label_)
                entities[en_key].append(
                    Fragment(doc, p.start + en.start_char,
                             p.start + en.end_char))

        # self._extract_textrank_digest(doc,
        #                              text_process_params.summary_size.calculate_size(len(doc.sentences)))
        self._extract_keywords(doc, kw_candidates,
                               text_process_params.keywords_number)

        # Extract summary
        ratio = text_process_params.summary_size.calculate_ratio(
            len(doc.sentences))
        summary = summarizer.summarize(
            doc.text,
            ratio=ratio,
            language=self.lang_model_definition.summa_lang)
        for s in summary.splitlines():
            doc.summary.append(Entity(s, EntityKind.SUMMARY_SENTENCE))

        for en_key, entries in entities.items():
            ent = Entity(en_key.lemma, EntityKind.NAMED, en_key.subtype)
            ent.entries += entries
            doc.entities.append(ent)
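
# Standalone sketch of the MMR-style selection implemented in
# EmCore._get_kw_indices: candidates are rewarded for similarity to the
# document and penalised for similarity to already-selected keywords.
# It works on plain numpy arrays instead of the project's Entity objects.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def mmr_select(doc_vector, candidate_vectors, kw_num, diversity_factor=0.5):
    # doc_vector: shape (1, dim); candidate_vectors: shape (n_candidates, dim)
    doc_sims = cosine_similarity(doc_vector, candidate_vectors).ravel()
    selected = [int(np.argmax(doc_sims))]
    unselected = [i for i in range(len(candidate_vectors)) if i not in selected]
    while len(selected) < kw_num and unselected:
        selected_vecs = candidate_vectors[selected]
        ranks = []
        for i in unselected:
            max_kw_sim = cosine_similarity(candidate_vectors[[i]], selected_vecs).max()
            ranks.append((1 - diversity_factor) * doc_sims[i]
                         - diversity_factor * max_kw_sim)
        best = unselected[int(np.argmax(ranks))]
        selected.append(best)
        unselected.remove(best)
    return selected

# Example: mmr_select(np.random.rand(1, 300), np.random.rand(20, 300), kw_num=5)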
Example #7
import warnings
from pathlib import Path
from typing import List, Union

import numpy as np
from bpemb import BPEmb
from sklearn.metrics import pairwise_distances

from whatlies.embedding import Embedding
from whatlies.embeddingset import EmbeddingSet

# SklearnTransformerMixin comes from elsewhere in the whatlies package.


class BytePairLanguage(SklearnTransformerMixin):
    """
    This object is used to lazily fetch [Embedding][whatlies.embedding.Embedding]s or
    [EmbeddingSet][whatlies.embeddingset.EmbeddingSet]s from a Byte-Pair Encoding
    backend. This object is meant for retrieval, not plotting.

    This language represents token-free pre-trained subword embeddings. Originally created by
    Benjamin Heinzerling and Michael Strube.

    Important:
        These vectors will auto-download via the [BPEmb package](https://nlp.h-its.org/bpemb/).
        You can also specify "multi" to download multilingual embeddings. A full list of available
        languages can be found [here](https://nlp.h-its.org/bpemb). The article that
        belongs to this work can be found [here](http://www.lrec-conf.org/proceedings/lrec2018/pdf/1049.pdf).
        Recognition should be given to Benjamin Heinzerling and Michael Strube for making these available.
        The available vocabulary sizes and dimensionalities can be verified
        on the project website. See [here](https://nlp.h-its.org/bpemb/en/) for an
        example link in English. Please credit the original authors if you use their work.

    Warning:
        This class used to be called `BytePairLang`.

    Arguments:
        lang: name of the model to load
        vs: vocabulary size of the byte pair model
        dim: the embedding dimensionality
        cache_dir: The folder in which downloaded BPEmb files will be cached

    The vocabulary sizes available from this backend are typically 1000,
    3000, 5000, 10000, 25000, 50000, 100000 or 200000. The available
    dimensionalities of the embeddings are typically 25, 50, 100, 200 and 300.

    **Usage**:

    ```python
    > from whatlies.language import BytePairLanguage
    > lang = BytePairLanguage(lang="en")
    > lang['python']
    > lang = BytePairLanguage(lang="multi")
    > lang[['hund', 'hond', 'dog']]
    ```
    """

    def __init__(
        self, lang, vs=10000, dim=100, cache_dir=Path.home() / Path(".cache/bpemb")
    ):
        self.lang = lang
        self.vs = vs
        self.dim = dim
        self.cache_dir = cache_dir
        self.module = BPEmb(lang=lang, vs=vs, dim=dim, cache_dir=cache_dir)

    def __getitem__(self, item):
        """
        Retrieve a single embedding or a set of embeddings. If an embedding contains multiple
        sub-tokens then we'll average them before retrieval.

        Arguments:
            item: single string or list of strings

        **Usage**
        ```python
        > lang = BytePairLanguage(lang="en")
        > lang['python']
        > lang[['python', 'snake']]
        > lang[['nobody expects', 'the spanish inquisition']]
        ```
        """
        if isinstance(item, str):
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                return Embedding(item, self.module.embed(item).mean(axis=0))
        if isinstance(item, list):
            return EmbeddingSet(*[self[i] for i in item])
        raise ValueError(f"Item must be a string or a list of strings, got {item}.")

    def _prepare_queries(self, lower):
        queries = [w for w in self.module.emb.vocab.keys()]
        if lower:
            queries = [w for w in queries if w.lower() == w]
        return queries

    def _calculate_distances(self, emb, queries, metric):
        vec = emb.vector
        vector_matrix = np.array([self[w].vector for w in queries])
        # there are NaNs returned, good to investigate later why that might be
        vector_matrix = np.array(
            [np.zeros(v.shape) if np.any(np.isnan(v)) else v for v in vector_matrix]
        )
        return pairwise_distances(vector_matrix, vec.reshape(1, -1), metric=metric)

    def score_similar(
        self,
        emb: Union[str, Embedding],
        n: int = 10,
        metric="cosine",
        lower=False,
    ) -> List:
        """
        Retrieve a list of (Embedding, score) tuples that are the most similar to the passed query.

        Arguments:
            emb: query to use
            n: the number of items you'd like to see returned
            metric: metric to use to calculate distance, must be scipy or sklearn compatible
            lower: only fetch lower case tokens

        Returns:
            A list of ([Embedding][whatlies.embedding.Embedding], score) tuples.
        """
        if isinstance(emb, str):
            emb = self[emb]

        queries = self._prepare_queries(lower=lower)
        distances = self._calculate_distances(emb=emb, queries=queries, metric=metric)
        by_similarity = sorted(zip(queries, distances), key=lambda z: z[1])

        if len(queries) < n:
            warnings.warn(
                f"We could only find {len(queries)} feasible words. Consider changing `top_n` or `lower`",
                UserWarning,
            )

        return [(self[q], float(d)) for q, d in by_similarity[:n]]

    def embset_similar(
        self,
        emb: Union[str, Embedding],
        n: int = 10,
        lower=False,
        metric="cosine",
    ) -> EmbeddingSet:
        """
        Retrieve an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] with the embeddings most similar to the passed query.

        Arguments:
            emb: query to use
            n: the number of items you'd like to see returned
            metric: metric to use to calculate distance, must be scipy or sklearn compatible
            lower: only fetch lower case tokens

        Important:
            This method is currently very slow without a reasonable `n` setting due to
            [this bug](https://github.com/facebookresearch/fastText/issues/1040).

        Returns:
            An [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] containing the similar embeddings.
        """
        embs = [
            w[0] for w in self.score_similar(emb=emb, n=n, lower=lower, metric=metric)
        ]
        return EmbeddingSet({w.name: w for w in embs})
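
# Minimal usage sketch for BytePairLanguage above (assumes whatlies and bpemb
# are installed; the default English 10k-vocabulary, 100-dim vectors download
# automatically on first use).
if __name__ == "__main__":
    lang = BytePairLanguage(lang="en")
    print(lang["python"].vector.shape)                  # (100,)
    for emb, score in lang.score_similar("python", n=5):
        print(emb.name, round(score, 3))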
#print(bpemb_en.encode("Stratford"))
#print(bpemb_en.embed("Stratford").shape)

with open("datasets/Chatbot/train.csv") as f:
    reader = csv.reader(f, delimiter='\t')
    max_len = 0
    y = []
    for row in tqdm(reader):
        y.append(row[1])
        sample_len = len(bpemb_en.encode(row[0]))
        max_len = max(max_len, sample_len)

#print(max_len)
#print(y[:10])

# label encoder
le = LabelEncoder()
encoded_labels = le.fit_transform(y)
#print(encoded_labels)
print(le.classes_)

x = None

y = to_categorical(encoded_labels, num_classes=len(le.classes_))
#print(y)

with open("datasets/Chatbot/train.csv") as f:
    reader = csv.reader(f, delimiter='\t')
    for row in tqdm(reader):
        embeddings = bpemb_en.embed(row[0])
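
# Hedged sketch (not from the original notebook): to reuse the per-row
# embeddings, one option is to right-pad each (n_subwords, dim) matrix with
# zero rows up to max_len so all samples can be stacked into a single array.
import numpy as np

def pad_embeddings(embedding_list, max_len):
    padded = []
    for emb in embedding_list:
        emb = emb[:max_len]
        padded.append(np.pad(emb, ((0, max_len - emb.shape[0]), (0, 0))))
    return np.stack(padded)   # shape: (n_samples, max_len, dim)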
ids


# In[73]:

bpemb_fr.vectors[ids].shape


# In[74]:

emb_layer(tensor(ids)).shape


# In[75]:

bpemb_fr.embed("Ceci est une phrase française").shape


# In[76]:

bpemb_en.most_similar("shire", topn=20)


# In[77]:

# Train - Test Split
X, y = sub_Eng, sub_Fren
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
X_train.shape, X_test.shape