Example #1
    def test_munge_complex(self):
        """Tests munge.split_corpus and munge.write_clean_corpus on a real-world
        example corpus (Don Quixote). Uses same tests from test_split_corpus_simple
        and test_write_clean_corpus_simple."""  # do i need to repeat what I'm checking for? or is this sufficient?

        # corpus_to_documents tests
        corpus = munge.import_corpus("test_files/quixote.txt")
        for doc in munge.corpus_to_documents(corpus)[:-1]:
            # all documents (except last) are between 250 and 500 words
            self.assertTrue(len(doc.split()) >= 250)
            self.assertTrue(len(doc.split()) <= 500)
            # all documents (except last) either end on punctuation or are exactly 500 words;
            # search 5 characters back to account for trailing characters like quotes and parentheses
            self.assertTrue(
                len(doc.split()) == 500 or "." in doc[-5:] or "!" in doc[-5:] or "?" in doc[-5:])

        # write_clean_corpus tests
        with open("test_files/munged_quixote.txt", "r") as in_file:
            for i, line in enumerate(in_file):
                features = line.split("\t")
                self.assertEqual(features[0], str(
                    self.sample_metadata["quixote"]["ids"][i]))
                self.assertEqual(
                    features[1], self.sample_metadata["quixote"]["names"][i])
                self.assertEqual(len(features), 3)
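
For reference when reading these tests, here is a minimal sketch of what munge.import_corpus might look like, assuming it returns the corpus as a single string and (per test_corpus_to_documents_simple below) concatenates the .txt files when given a directory; the actual munge implementation may differ:

import os


def import_corpus(path):
    """Hypothetical import_corpus: returns the contents of a .txt file as one
    string, or concatenates every .txt file in a directory, skipping other
    file types."""
    if os.path.isdir(path):
        texts = []
        for filename in sorted(os.listdir(path)):
            if filename.endswith(".txt"):
                with open(os.path.join(path, filename), "r") as in_file:
                    texts.append(in_file.read())
        return "\n".join(texts)
    with open(path, "r") as in_file:
        return in_file.read()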
Example #2
    def test_write_clean_corpus_simple(self):
        """Tests munge.write_clean_corpus on simple test files. checks:
                -every line of out file has correct formatting: < unique_id >\t < orig_doc_id >\t < text >
                -AssertionError is raised when list of unique_ids are not unique"""
        # every line has correct formatting <unique_id>\t<orig_doc_id>\t<text>
        with open("test_files/munged_angel_50.txt", "r") as in_file:
            for i, line in enumerate(in_file):
                features = line.split("\t")
                self.assertEqual(features[0], str(
                    self.sample_metadata["angel"]["ids"][i]))
                self.assertEqual(
                    features[1], self.sample_metadata["angel"]["names"][i])
                self.assertEqual(len(features), 3)

        # AssertionError is raised when ids are not unique
        corpus = munge.import_corpus("test_files/simple_angel_50.txt")
        split_angels = munge.corpus_to_documents(corpus)
        # slicing already returns a copy, so drop the last id and duplicate the new last one
        angel_ids_nonunique = self.sample_metadata["angel"]["ids"][:-1]
        angel_ids_nonunique.append(angel_ids_nonunique[-1])
        with self.assertRaises(AssertionError):  # the final id appears twice
            munge.write_clean_corpus(split_angels, angel_ids_nonunique, self.sample_metadata["angel"]["names"],
                                     "test_files/angels_nonunique.txt")

        # documents, ids, and names lists must be the same length
        with self.assertRaises(AssertionError):  # ids list is one shorter than the documents list
            munge.write_clean_corpus(split_angels, angel_ids_nonunique[:-1], self.sample_metadata["angel"]["names"],
                                     "test_files/angels_nonunique.txt")
Example #3
    def generate_metadata(self, keys, texts):
        """Creates a dictionary mapping each corpus key in keys to a dictionary of the
        documents, ids, and names generated from the corresponding corpus file in texts."""
        metadata = {}
        for i, corpus in enumerate(keys):
            docs = munge.corpus_to_documents(munge.import_corpus(texts[i]))
            ids = list(range(len(docs)))
            names = [corpus + str(x) for x in ids]
            metadata[corpus] = {"text": docs, "ids": ids, "names": names}
        return metadata
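
This helper would presumably be called from the fixture's setUp to build self.sample_metadata; a hypothetical sketch, with the corpus keys and file paths taken from the tests above:

    def setUp(self):
        """Hypothetical setUp that populates self.sample_metadata with the
        "angel" and "quixote" corpora referenced by the tests above."""
        self.sample_metadata = self.generate_metadata(
            ["angel", "quixote"],
            ["test_files/simple_angel_50.txt", "test_files/quixote.txt"])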
Example #4
    def test_corpus_to_documents_simple(self):
        """Tests munge.corpus_to_documents on simple test corpora. checks:
                -function can handle text files and directories of text files
                -each line (document) is between 250 and 500 words
                -each line either ends on punctuation or is 500 words"""

        # import_corpus handles text files as well as directories containing .txt files and other file types
        corpora = ["test_files/simple_whale_100.txt",
                   "test_files/simple_angel_50.txt", "test_files/"]
        for filename in corpora:
            corpus = munge.import_corpus(filename)
            for doc in munge.corpus_to_documents(corpus)[:-1]:
                # all documents (except last) are between 250 and 500 words
                self.assertTrue(len(doc.split()) >= 250)
                self.assertTrue(len(doc.split()) <= 500)
                # all documents (except last) either end on punctuation or are 500 words.
                # searching 5 characters back to account for extra characters like quotes and parentheses
                self.assertTrue(
                    len(doc.split()) == 500 or "." in doc[-5:] or "!" in doc[-5:] or "?" in doc[-5:])
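
These checks imply a splitting rule: accumulate words until a document has at least 250, break at the next sentence-ending punctuation, and force a break at 500 words. A minimal sketch of that rule, assuming corpus_to_documents takes the corpus string returned by import_corpus (not the actual munge implementation):

def corpus_to_documents(corpus):
    """Hypothetical corpus_to_documents: splits a corpus string into documents
    of 250-500 words, each ending on sentence punctuation unless the 500-word
    cap forces a break; only the last document may be shorter than 250 words."""
    documents = []
    current = []
    for word in corpus.split():
        current.append(word)
        # tolerate trailing quotes/parentheses by searching the last 5 characters
        ends_sentence = any(p in word[-5:] for p in ".!?")
        if len(current) == 500 or (len(current) >= 250 and ends_sentence):
            documents.append(" ".join(current))
            current = []
    if current:  # whatever remains becomes the (possibly short) final document
        documents.append(" ".join(current))
    return documents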
Example #5
    def _make_mallet_model(self, corpus_filepath, path_to_mallet, remove_stopwords, corpus_language, num_topics, **kwargs):
        """Returns a gensim-created topic model (class LdaMallet), and assigns class
        attributes _docs (an OrderedDict containing the preprocessed corpus documents)
        and _vocabulary (the corpus vocabulary (iter of str)). This function lowercases
        all words in the corpus, and removes stopwords if remove_stopwords is True.
        The keys for the document dictionary are unique document ids of the format
        "doc<i>" where <i> is the number of the document in the corpus."""
        munged_corpus = munge.corpus_to_doc_tokens(corpus_filepath)

        # lowercase the corpus and remove stopwords if requested
        if remove_stopwords:
            stop_words = stopwords.words(corpus_language)
            prepped_corpus = [
                [word.lower() for word in doc if word.lower() not in stop_words] for doc in munged_corpus]
        else:
            prepped_corpus = [[word.lower() for word in doc]
                              for doc in munged_corpus]
        # TODO (7/12/19 faunam): make lowercasing corpus optional

        id_to_word = corpora.Dictionary(prepped_corpus)
        # represent each document as a bag-of-words: a list of (word_id, word_count) tuples
        term_document_frequency = [
            id_to_word.doc2bow(doc) for doc in prepped_corpus]
        mallet_model = LdaMallet(path_to_mallet, corpus=term_document_frequency,
                                 id2word=id_to_word, num_topics=num_topics, **kwargs)

        docs = OrderedDict(("doc" + str(i), " ".join(doc))
                           for i, doc in enumerate(prepped_corpus))
        full_corpus = munge.corpus_to_documents(corpus_filepath)
        full_docs = OrderedDict(("doc" + str(i), doc)
                                for i, doc in enumerate(full_corpus))

        self._docs = docs
        self._full_docs = full_docs
        self._vocabulary = list(id_to_word.values())

        return mallet_model
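
For context, the imports this method relies on (gensim's Mallet wrapper lived in gensim.models.wrappers through gensim 3.x) and a hypothetical call, where topic_modeler stands in for an instance of the class defining _make_mallet_model and "path/to/mallet" is a placeholder for a local Mallet binary:

from collections import OrderedDict

from gensim import corpora
from gensim.models.wrappers import LdaMallet
from nltk.corpus import stopwords

import munge

# nltk.download("stopwords") may be required once before stopwords.words() works
model = topic_modeler._make_mallet_model(
    "test_files/quixote.txt", "path/to/mallet", remove_stopwords=True,
    corpus_language="english", num_topics=10)
print(model.show_topics(num_topics=10, num_words=5))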