def test_lookups_to_from_disk_via_vocab():
    table_name = "test"
    vocab = Vocab()
    vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
    assert table_name in vocab.lookups
    with make_tempdir() as tmpdir:
        vocab.to_disk(tmpdir)
        new_vocab = Vocab()
        new_vocab.from_disk(tmpdir)
    assert len(new_vocab.lookups) == len(vocab.lookups)
    assert table_name in new_vocab.lookups
    table = new_vocab.lookups.get_table(table_name)
    assert len(table) == 2
    assert table["hello"] == "world"
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    with make_tempdir() as d:
        file_path = d / "vocab"
        vocab1.to_disk(file_path)
        vocab2 = vocab2.from_disk(file_path)
    assert vocab2[strings[0]].norm_ == lex_attr
Example #4
def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
    nlp = English()
    nlp_plain = English()
    # load both vec and hashvec tables
    with make_tempdir() as tmpdir:
        p = tmpdir / "test.hashvec"
        with open(p, "w") as fileh:
            fileh.write(floret_vectors_hashvec_str)
        convert_vectors(nlp, p, truncate=0, prune=-1, mode="floret")
        p = tmpdir / "test.vec"
        with open(p, "w") as fileh:
            fileh.write(floret_vectors_vec_str)
        convert_vectors(nlp_plain, p, truncate=0, prune=-1)

    word = "der"
    # ngrams: full padded word + padded 2-grams + padded 3-grams
    ngrams = nlp.vocab.vectors._get_ngrams(word)
    assert ngrams == ["<der>", "<d", "de", "er", "r>", "<de", "der", "er>"]
    # rows: 2 rows per ngram
    rows = OPS.xp.asarray(
        [
            h % nlp.vocab.vectors.shape[0] for ngram in ngrams
            for h in nlp.vocab.vectors._get_ngram_hashes(ngram)
        ],
        dtype="uint32",
    )
    assert_equal(
        OPS.to_numpy(rows),
        numpy.asarray([5, 6, 7, 5, 8, 2, 8, 9, 3, 3, 4, 6, 7, 3, 0, 2]),
    )
    assert len(rows) == len(ngrams) * nlp.vocab.vectors.hash_count
    # all vectors are equivalent for plain static table vs. hash ngrams
    for word in nlp_plain.vocab.vectors:
        word = nlp_plain.vocab.strings.as_string(word)
        assert_almost_equal(nlp.vocab[word].vector,
                            nlp_plain.vocab[word].vector,
                            decimal=3)

        # every word has a vector
        assert nlp.vocab[word * 5].has_vector

    # n_keys is -1 for floret
    assert nlp_plain.vocab.vectors.n_keys > 0
    assert nlp.vocab.vectors.n_keys == -1

    # check that single and batched vector lookups are identical
    words = [s for s in nlp_plain.vocab.vectors]
    single_vecs = OPS.to_numpy(
        OPS.asarray([nlp.vocab[word].vector for word in words]))
    batch_vecs = OPS.to_numpy(nlp.vocab.vectors.get_batch(words))
    assert_equal(single_vecs, batch_vecs)

    # an empty key returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab[""].vector),
        numpy.zeros((nlp.vocab.vectors.shape[0], )),
    )
    # an empty batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch([""])),
        numpy.zeros((1, nlp.vocab.vectors.shape[0])),
    )
    # an empty key within a batch returns 0s
    assert_equal(
        OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]),
        numpy.zeros((nlp.vocab.vectors.shape[0], )),
    )

    # the loaded ngram vector table cannot be modified
    # except for clear: warning, then return without modifications
    vector = list(range(nlp.vocab.vectors.shape[1]))
    orig_bytes = nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.set_vector("the", vector)
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab[word].vector = vector
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.vectors.add("the", row=6)
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.warns(UserWarning):
        nlp.vocab.vectors.resize(shape=(100, 10))
    assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"])
    with pytest.raises(ValueError):
        nlp.vocab.vectors.clear()

    # data and settings are serialized correctly
    with make_tempdir() as d:
        nlp.vocab.to_disk(d)
        vocab_r = Vocab()
        vocab_r.from_disk(d)
        assert nlp.vocab.vectors.to_bytes() == vocab_r.vectors.to_bytes()
        assert_equal(OPS.to_numpy(nlp.vocab.vectors.data),
                     OPS.to_numpy(vocab_r.vectors.data))
        assert_equal(nlp.vocab.vectors._get_cfg(), vocab_r.vectors._get_cfg())
        assert_almost_equal(
            OPS.to_numpy(nlp.vocab[word].vector),
            OPS.to_numpy(vocab_r[word].vector),
            decimal=6,
        )
Example #5
# Imports inferred from the usage below; `detect` is assumed to come from
# langdetect, and pyLDAvis.sklearn requires a pyLDAvis release that still ships it.
import multiprocessing
import re
from collections import Counter, OrderedDict, defaultdict
from itertools import combinations
from math import factorial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn
import seaborn as sns
import spacy
from langdetect import detect
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from spacy.vocab import Vocab
from tqdm import tqdm


class NlPipe:
    """
    Class for creating LDA models using sklearn. Deprecated, as gensim is used instead.
    """
    def __init__(self,
                 list_of_docs,
                 document_ids=None,
                 language_model="en_core_web_lg",
                 tagger=False,
                 parser=False,
                 ner=False,
                 categorization=False,
                 remove_stopwords=True,
                 remove_punctuation=True,
                 set_lower=True,
                 remove_num=True,
                 expand_stopwords=True,
                 language_detection=False,
                 allowed_languages=frozenset({'en'})):
        """
        :param list_of_docs: List of strings where every document is one string.
        :param document_ids: The ids of the documents, matching the order of the list_of_docs
        :param language_model: Spacy language model to be used for text preprocessing
        :param tagger: Use spacy part-of-speech tagger.
        :param parser: Use spacy to annotate syntactic dependencies in documents.
        :param ner: Use spacy for entity recognition and annotation.
        :param categorization: Use spacy to assign document labels
        :param remove_stopwords: Remove stop words during text preprocessing.
        :param remove_punctuation: Remove punctuation during text preprocessing.
        :param set_lower: Convert all strings to lowercase during text preprocessing.
        :param remove_num: Remove numeric characters during text preprocessing.
        :param expand_stopwords: Strip non-word characters from the stop words and add the stripped variants to the stop word list.
        :param language_detection: Detect language of docs.
        :param allowed_languages: Allowed languages for the documents.
        """

        self.pipe_disable = []
        if not tagger:
            self.pipe_disable.append("tagger")
        if not parser:
            self.pipe_disable.append("parser")
        if not ner:
            self.pipe_disable.append("ner")
        if not categorization:
            self.pipe_disable.append("textcat")
        self.remove_punctuation = remove_punctuation
        self.remove_stop_words = remove_stopwords
        self.remove_num = remove_num
        self.set_lower = set_lower
        self.input_docs = list_of_docs
        self.document_ids = document_ids
        self.nlp = spacy.load(language_model)
        if expand_stopwords:
            stops = list(self.nlp.Defaults.stop_words)
            for stop in stops:
                self.nlp.Defaults.stop_words.add(re.sub(r"[\W]", "", stop))
        self.spacy_docs = None
        self.processed_docs = None
        self.vectorizer = None
        self.bag_of_words = None
        self.tf_idf = None
        self.preprocessing_batch_size = 500
        # leave two cores free, but always use at least one process
        self.processes = max(1, multiprocessing.cpu_count() - 2)
        self.lda_model = None
        self.lda_output = None
        self.grid_search = None
        self.evaluation_output = None
        self.result_df = None
        self.word_topic_df = None
        self.word_topic_intersection = None
        self.intersection_score = None
        self.allowed_languages = allowed_languages
        self.language_detection = language_detection
        self.spacy_vocab = None
        self.word_distance_dict = None
        self.word_topic_distance_sum = 0
        self.unigram_dict = None
        self.bigram_dict = None

    def enable_pipe_component(self, component):
        if component in self.pipe_disable:
            self.pipe_disable.remove(component)
            # todo: log whether the component was already enabled or has just been enabled

    def disable_pipe_component(self, component):
        if component not in self.pipe_disable:
            self.pipe_disable.append(component)
            # todo: log whether the component was already disabled or has just been disabled

    def preprocess_spacy(self):
        # todo: add language check
        if self.language_detection:
            self.spacy_docs = [
                doc for doc in tqdm(self.nlp.pipe(
                    self.input_docs,
                    disable=self.pipe_disable,
                    n_process=self.processes,
                    batch_size=self.preprocessing_batch_size),
                                    desc="Preprocessing text with spacy: ")
                if detect(doc.text) in self.allowed_languages
            ]
        else:
            self.spacy_docs = [
                doc for doc in tqdm(self.nlp.pipe(
                    self.input_docs,
                    disable=self.pipe_disable,
                    n_process=self.processes,
                    batch_size=self.preprocessing_batch_size),
                                    desc="Preprocessing text with spacy: ")
            ]

    def preprocess(self):
        self.processed_docs = []
        if not self.spacy_docs:
            self.preprocess_spacy()
        for spacy_doc in tqdm(
                self.spacy_docs,
                desc="Removing stop words/punctuation/numeric chars: "):
            doc = []
            for token in spacy_doc:
                if token.is_stop and self.remove_stop_words:
                    continue
                word = token.text
                if self.set_lower:
                    word = word.lower()
                if self.remove_num:
                    word = re.sub(r"[\d]", "", word)
                if self.remove_punctuation:
                    word = re.sub(r"[\W]", "", word)
                if len(word) >= 2:
                    doc.append(word)
            self.processed_docs.append(doc)

    def create_bag_of_words(self, n_grams=(1, 1), min_df=0.01, max_df=0.6):
        self.preprocess_spacy()
        self.preprocess()
        joined_docs = []
        for doc in self.processed_docs:
            joined_docs.append(" ".join(doc))
        self.vectorizer = CountVectorizer(lowercase=False,
                                          ngram_range=n_grams,
                                          min_df=min_df,
                                          max_df=max_df)
        self.bag_of_words = self.vectorizer.fit_transform(joined_docs)

    def create_tf_idf(self, n_grams=(1, 1), min_df=0.01, max_df=0.6):
        self.preprocess_spacy()
        self.preprocess()
        joined_docs = []
        for doc in self.processed_docs:
            joined_docs.append(" ".join(doc))
        self.vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=n_grams,
                                          min_df=min_df,
                                          max_df=max_df)
        self.tf_idf = self.vectorizer.fit_transform(joined_docs)

    def create_lda_model(self, no_topics=10, input_type="bag"):
        self.lda_model = LDA(n_jobs=self.processes, n_components=no_topics)
        if input_type == "bag":
            if self.bag_of_words is None:
                self.create_bag_of_words()
            self.lda_output = self.lda_model.fit_transform(self.bag_of_words)
        else:
            self.create_tf_idf()
            self.lda_output = self.lda_model.fit_transform(self.tf_idf)

    def search_best_model(self,
                          n_components=[2, 3, 4, 5, 10, 15, 20, 25],
                          learning_decay=[.5, .7, .9],
                          input_type="bag"):
        lda_model = LDA()
        self.grid_search = GridSearchCV(lda_model, {
            "n_components": n_components,
            "learning_decay": learning_decay
        })
        if input_type == "bag":
            if self.bag_of_words is None:
                self.create_bag_of_words()
            self.grid_search.fit(self.bag_of_words)
        else:
            if self.tf_idf is None:
                self.create_tf_idf()
            self.grid_search.fit(self.tf_idf)

    def create_document_topic_df(self,
                                 model=None,
                                 no_topics=10,
                                 input_type="bag",
                                 input_matrix=None):
        if model is None:
            self.create_lda_model(no_topics=no_topics, input_type=input_type)
        else:
            self.lda_model = model
        if input_matrix is not None:
            self.evaluation_output = self.lda_model.fit_transform(input_matrix)
        elif input_type == "bag":
            self.evaluation_output = self.lda_model.fit_transform(
                self.bag_of_words)
        else:
            self.evaluation_output = self.lda_model.fit_transform(self.tf_idf)
        self.result_df = pd.DataFrame(self.evaluation_output)
        if self.document_ids is not None and not self.language_detection:
            self.result_df.index = self.document_ids
        elif self.document_ids is not None and self.language_detection:
            raise Warning(
                "Using document ids and language detection together is not implemented (yet)."
            )
        dominant_topic = np.argmax(self.result_df.values, axis=1)
        self.result_df['dominant_topic'] = dominant_topic

    def plot_document_topic_distribution(self):
        #todo: log normalize
        counter = Counter(self.result_df.dominant_topic)
        topic_dict = OrderedDict(
            sorted(counter.items(), key=lambda x: x[1], reverse=True))
        sns.barplot(x=list(topic_dict.values()),
                    y=list(topic_dict.keys()),
                    order=list(topic_dict.keys()),
                    orient='h')
        plt.show()

    def evaluate_model(self, no_words=30):
        keywords = np.array(self.vectorizer.get_feature_names())
        topic_keywords = []
        for topic_weights in self.lda_model.components_:
            top_keyword_locations = (-topic_weights).argsort()[:no_words]
            topic_keywords.append(keywords.take(top_keyword_locations))
        self.word_topic_df = pd.DataFrame(
            topic_keywords, columns=[f"word_{x}" for x in range(no_words)])

    def evaluate_pyldavis(self):
        panel = pyLDAvis.sklearn.prepare(self.lda_model, self.bag_of_words,
                                         self.vectorizer)
        pyLDAvis.show(panel)

    def get_word_topic_intersection(self, no_words=30, no_topics=10):
        if (not isinstance(self.word_topic_df, pd.DataFrame)
                or self.word_topic_df.shape[1] != no_words):
            self.evaluate_model(no_words=no_words)
        intersection_list = []
        intersection_score = 0
        all_combinations = list(combinations(range(no_topics), 2))
        for x in range(no_topics):
            temp_list = []
            words_x = set(
                self.word_topic_df[self.word_topic_df.index == x].values[0])
            for y in range(no_topics):
                if x == y:
                    # a topic always fully overlaps with itself
                    temp_list.append(1)
                    continue
                words_y = set(
                    self.word_topic_df[self.word_topic_df.index == y].values[0])
                overlap = len(words_x.intersection(words_y)) / no_words
                temp_list.append(overlap)
                # count each unordered topic pair once for the overall score
                if (x, y) in all_combinations:
                    intersection_score += overlap
            intersection_list.append(temp_list)
        self.intersection_score = intersection_score / len(all_combinations)
        self.word_topic_intersection = pd.DataFrame(intersection_list)

    def get_topic_word_distance_sum(self, no_words=30):
        self.word_distance_dict = {}
        if (not isinstance(self.word_topic_df, pd.DataFrame)
                or self.word_topic_df.shape[1] != no_words):
            self.evaluate_model(no_words=no_words)
        if self.spacy_vocab is None:
            self.load_textgain_embs()
        for index in self.word_topic_df.index:
            topic_distance_sum = 0
            missing_count = 0
            for word_a, word_b in combinations(
                    self.word_topic_df[self.word_topic_df.index ==
                                       index].values[0], 2):
                if self.spacy_vocab.has_vector(
                        str(word_a)) and self.spacy_vocab.has_vector(
                            str(word_b)):
                    topic_distance_sum += np.linalg.norm(
                        self.spacy_vocab.get_vector(str(word_a)) -
                        self.spacy_vocab.get_vector(str(word_b)))
                else:
                    missing_count += 1
            self.word_distance_dict[index] = topic_distance_sum / (
                (factorial(no_words) /
                 (factorial(2) * factorial(no_words - 2))) - missing_count)
        self.word_topic_distance_sum = sum(
            self.word_distance_dict.values()) / len(
                self.word_distance_dict.keys())

        # todo: sum of distance between words in topic derived from word embedding
        # todo: sum of sum of distances divided by no topics

    def load_textgain_embs(self,
                           from_txt=False,
                           path="textgain_embeddings/spacy_vocab"):
        self.spacy_vocab = Vocab()
        if from_txt:
            with open(path) as f:
                for line in f:
                    split_line = line.split()
                    self.spacy_vocab.set_vector(
                        "".join(split_line[:-150]),
                        np.array([float(coord)
                                  for coord in split_line[-150:]]))
        else:
            self.spacy_vocab.from_disk(path)

    def calculate_coherence(self, type="cosine"):
        pass
        #todo: add coherence function here

    def calculate_jaccard(self):
        pass
        #todo: calculate jaccard distance here

    def calculate_cosine(self, word_1, word_2):
        return np.dot(
            word_1, word_2) / (np.linalg.norm(word_1) * np.linalg.norm(word_2))

    def calculate_dice(self):
        pass
        #todo: calculate dice coefficient here

    def calculate_centroid_sim(self):
        pass
        #todo: calculate centroid similarity here

    def calculate_word_probs(self):
        # todo: calculate unigram and bigram probabilities of the words
        self.unigram_dict = defaultdict(int)
        self.bigram_dict = defaultdict(int)
        unigram_count = 0
        bigram_count = 0
        for doc in tqdm(self.processed_docs,
                        desc="calculation uni- and bigram probabilities: "):
            for i, word in enumerate(doc):
                self.unigram_dict[word] += 1
                unigram_count += 1
                try:
                    self.bigram_dict[" ".join([word, doc[i + 1]])] += 1
                    bigram_count += 1
                except IndexError:
                    # the last token in a doc has no following word for a bigram
                    pass
        self.unigram_dict = {
            k: v / unigram_count
            for k, v in self.unigram_dict.items()
        }
        self.bigram_dict = {
            k: v / bigram_count
            for k, v in self.bigram_dict.items()
        }

    def calculate_pmi(self, word_1, word_2):
        if self.unigram_dict is None or self.bigram_dict is None:
            self.calculate_word_probs()
        return np.log2(self.bigram_dict[" ".join([word_1, word_2])] /
                       (self.unigram_dict[word_1] * self.unigram_dict[word_2]))

    def calculate_npmi(self, word_1, word_2):
        return self.calculate_pmi(word_1, word_2) / (
            -np.log(self.bigram_dict[" ".join([word_1, word_2])]))

    def get_weight_vectors(self, weight=2, type="npmi"):
        pass
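
# A minimal usage sketch for the NlPipe class above (not part of the original
# code). The toy document list and the "en_core_web_sm" model name are
# placeholders, and evaluate_model() relies on vectorizer.get_feature_names(),
# so an older scikit-learn release is assumed.
if __name__ == "__main__":
    docs = [
        "The quick brown fox jumps over the lazy dog.",
        "Topic models group documents by shared vocabulary.",
        "Word embeddings place similar words close together.",
    ]
    pipe = NlPipe(docs, language_model="en_core_web_sm")
    # Relax the document-frequency limits so the tiny corpus keeps its terms.
    pipe.create_bag_of_words(min_df=1, max_df=1.0)
    # Fit an LDA model on the bag of words and collect per-document topic weights.
    pipe.create_document_topic_df(no_topics=2)
    # Top keywords per topic end up in pipe.word_topic_df.
    pipe.evaluate_model(no_words=5)
    print(pipe.result_df)
    print(pipe.word_topic_df)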