Exemplo n.º 1
0
 def test_generate_neighbor_list(self):
     """
     Check the list that generate_neighbor_list() returns for document 1
     in a corpus holding all four fixture documents.

     Each entry is compared against a (doc_id, number) tuple; the number
     is presumably a distance from document 1 -- confirm against Corpus.
     """
     corpus = Corpus([self.d1, self.d2, self.d3, self.d4])
     pairs = corpus.generate_neighbor_list(self.d1)
     # Documents "1" and "3" share the value 0, so the first two entries
     # may legitimately come back in either order.
     accepted_orders = (
         (("1", 0), ("3", 0)),
         (("3", 0), ("1", 0)),
     )
     self.assertTrue(any(pairs[0] == first and pairs[1] == second
                         for first, second in accepted_orders))
     self.assertEqual(pairs[2], ("4", 2))
     self.assertEqual(pairs[3], ("2", 20))
Exemplo n.º 2
0
 def test_neighbors(self):
     """
     Check which documents neighbors() returns for document 1 as the
     second argument varies.

     The second argument appears to act as an inclusive bound on the
     values reported by generate_neighbor_list() -- confirm against Corpus.
     """
     corpus = Corpus([self.d1, self.d2, self.d3, self.d4])
     # (bound, ids expected back), checked in this order.
     cases = (
         (10, ["1", "3", "4"]),
         (20, ["1", "2", "3", "4"]),
         (0, ["1", "3"]),
     )
     for bound, expected_ids in cases:
         found = corpus.neighbors(self.d1, bound)
         found_ids = [doc.doc_id for doc in found]
         # Order of the returned documents is not part of the contract
         # being tested, hence the set comparison.
         self.assertEqual(set(expected_ids), set(found_ids))
Exemplo n.º 3
0
 def __init__(self, output=None, corpus=None,
              attribute="categories",
              categories=None, mode="combined"):
     """
     Create a CategoryToCorpus module, which loads a corpus with tagged
     documents.

     output is stored as the destination path for the serialized result.
     If corpus is passed in, it adds to an existing corpus; otherwise a
     new, empty Corpus is created.
     attribute is the name of the document field holding the categories.
     categories optionally restricts loading to the listed categories.
     mode is the corpus loading method to use.  If set to "combined", all
     documents in a category are concatenated to a single document.
     Otherwise each document is loaded separately.
     """
     self.output = output
     # Per-category document lists, filled when mode is not "combined".
     self.corpora = {}
     # combined mode has a single corpus.  Identity test on purpose:
     # "== None" would call the corpus object's own __eq__.
     if corpus is None:
         self.corpus = Corpus()
     else:
         self.corpus = corpus
     self.module_type = enumModuleType(enumModuleType.Document)
     self.module_processing_type = \
         enumModuleProcessingType(enumModuleProcessingType.PostProcess)
     self.attribute = attribute
     self.categories = categories
     self.mode = mode
     self.pp = pprint.PrettyPrinter(indent=4)
Exemplo n.º 4
0
class AddToCorpus(PipelineModule):
    """
    Pipeline module that adds every document it processes to a Corpus.
    """

    def __init__(self, output=None, corpus=None):
        """
        output is the path write() saves to; nothing is written when it
        is None.
        If corpus is passed in, documents are added to that corpus;
        otherwise a new, empty Corpus is created.
        """
        self.output = output
        # Identity test on purpose: "== None" would call the corpus
        # object's own __eq__.
        self.corpus = Corpus() if corpus is None else corpus
        self.module_type = enumModuleType(enumModuleType.Document)
        self.module_processing_type = \
            enumModuleProcessingType(enumModuleProcessingType.PostProcess)

    def process(self, data):
        """
        Add each document in data to the corpus.
        """
        for document in data:
            # Add the individual document, not the whole iterable.
            self.corpus.add(document)

    def post_process(self):
        """
        Return the corpus built up by process().
        """
        return self.corpus

    def as_json(self):
        """
        Return the corpus serialized as an indented, key-sorted JSON string.
        """
        return json.dumps(self.corpus, sort_keys=True, indent=4,
                          separators=(',', ': '))

    def write(self):
        """
        Write the JSON form of the corpus to self.output, if one was given.
        """
        if self.output is not None:
            # Serialize first so a failure does not truncate the file.
            serialized = self.as_json()
            with open(self.output, 'w') as f:
                f.write(serialized)
Exemplo n.º 5
0
class CategoryToCorpus(PipelineModule):
    """
    Pipeline module that groups tagged documents by category, either into
    one combined corpus or into per-category document lists.
    """

    def __init__(self, output=None, corpus=None,
                 attribute="categories",
                 categories=None, mode="combined"):
        """
        Create a CategoryToCorpus module, which loads a corpus with tagged
        documents.

        output is the path write() saves to; nothing is written when it
        is None.
        If corpus is passed in, it adds to an existing corpus; otherwise a
        new, empty Corpus is created.
        attribute is the name of the document field holding the categories.
        categories optionally restricts loading to the listed categories.
        mode is the corpus loading method to use.  If set to "combined", all
        documents in a category are concatenated to a single document.
        Otherwise each document is loaded separately.
        """
        self.output = output
        # Per-category document lists, filled when mode is not "combined".
        self.corpora = {}
        # combined mode has a single corpus.  Identity test on purpose:
        # "== None" would call the corpus object's own __eq__.
        if corpus is None:
            self.corpus = Corpus()
        else:
            self.corpus = corpus
        self.module_type = enumModuleType(enumModuleType.Document)
        self.module_processing_type = \
            enumModuleProcessingType(enumModuleProcessingType.PostProcess)
        self.attribute = attribute
        self.categories = categories
        self.mode = mode
        self.pp = pprint.PrettyPrinter(indent=4)

    def add_document(self, category, document):
        """
        File document under category.

        Outside "combined" mode the document is appended to that
        category's list in self.corpora.  In "combined" mode the
        document's text is appended to the corpus entry whose id is the
        category, or the document itself becomes that entry if none
        exists yet.
        """
        if self.mode != "combined":
            self.corpora.setdefault(category, []).append(document)
            return

        if category in self.corpus:
            merged = self.corpus[category]
            merged.update_text(unicode(merged) + " " + unicode(document))
        else:
            # NOTE(review): this re-labels the caller's document object;
            # a document carrying several new categories is re-labelled
            # and added once per category -- confirm this is intended.
            document.set_doc_id(category)
            self.corpus.add(document)

    def process(self, data):
        """
        Process the documents.  The code looks at the attribute named by
        self.attribute and files the document under the categories found
        there.  Only list values are acted on; documents without the
        attribute, or with a non-list value, are passed through untouched.
        If self.categories is set, only those categories are considered.

        This is a generator: every input document is yielded back, and
        nothing is filed until the generator is consumed.
        """
        for doc in data:
            if self.attribute in doc.document:
                tags = doc.document[self.attribute]
                if isinstance(tags, list):
                    if self.categories is None:
                        for tag in tags:
                            self.add_document(tag, doc)
                    else:
                        for category in self.categories:
                            if category in tags:
                                self.add_document(category, doc)
            yield doc

    def post_process(self):
        """
        Return the result once all data has been processed: the single
        corpus in "combined" mode, otherwise the dict of per-category
        document lists.
        """
        if self.mode != "combined":
            return self.corpora
        return self.corpus

    def as_json(self):
        """
        Return the result of post_process() serialized as an indented,
        key-sorted JSON string.
        """
        return json.dumps(self.post_process(), sort_keys=True, indent=4,
                          separators=(',', ': '))

    def write(self):
        """
        Write the JSON form of the result to self.output, if one was given.
        """
        if self.output is not None:
            # Serialize first so a failure does not truncate the file.
            serialized = self.as_json()
            with open(self.output, 'w') as f:
                f.write(serialized)

    def top_categories(self, n=10):
        """
        Print each configured category followed by its n ranked terms from
        the combined corpus.  Prints nothing when no categories were
        configured.
        """
        # self.categories defaults to None, which is not iterable.
        for doc_id in self.categories or ():
            print(str(doc_id))
            rt = self.corpus.ranked_terms(doc_id, n)
            print("  " + str(rt))
Exemplo n.º 6
0
 def setUp(self):
     """
     Build four small fixture documents (self.d1 .. self.d4) and a corpus
     holding the first three of them.
     """
     fixtures = (
         ("1", "This this."),
         ("2", "This is another test document."),
         ("3", "Two words."),
         ("4", "Three words."),
     )
     self.d1, self.d2, self.d3, self.d4 = [
         Document({"id": doc_id, "body": body}) for doc_id, body in fixtures
     ]
     # d4 is deliberately left out; tests that need it build their own corpus.
     self.corpus = Corpus([self.d1, self.d2, self.d3])
Exemplo n.º 7
0
class CorpusTestCase(unittest.TestCase):
    """
    Tests for Corpus term statistics and neighbor lookups, run against a
    handful of tiny fixture documents.
    """

    def setUp(self):
        """
        Build four fixture documents and a corpus holding the first three.
        """
        self.d1 = Document({"id": "1", "body": "This this."})
        self.d2 = Document({"id": "2", "body": "This is another test document."})
        self.d3 = Document({"id": "3", "body": "Two words."})
        self.d4 = Document({"id": "4", "body": "Three words."})
        # d4 is left out here; the neighbor tests build their own corpus.
        self.corpus = Corpus([self.d1, self.d2, self.d3])

    def test_df(self):
        """df() is expected to report in how many documents a term occurs."""
        self.assertEqual(self.corpus.df("this"), 2)
        self.assertEqual(self.corpus.df("is"), 1)
        self.assertEqual(self.corpus.df("two"), 1)
        self.assertEqual(self.corpus.df("."), 3)

    def test_idf(self):
        """idf() is expected to equal log(document count / df)."""
        # assume math.log is good
        # Floats are compared approximately so the test does not depend on
        # the exact order of operations inside Corpus.
        self.assertAlmostEqual(self.corpus.idf("this"), math.log(3.0 / 2.0))
        self.assertAlmostEqual(self.corpus.idf("is"), math.log(3.0 / 1.0))
        self.assertAlmostEqual(self.corpus.idf("."), math.log(3.0 / 3.0))

    def test_tf(self):
        """tf() is expected to be the term count over the document's token count."""
        self.assertAlmostEqual(self.corpus.tf("1", "this"), 2.0 / 3.0)
        self.assertAlmostEqual(self.corpus.tf("2", "this"), 1.0 / 6.0)
        self.assertAlmostEqual(self.corpus.tf("2", "is"), 1.0 / 6.0)
        self.assertAlmostEqual(self.corpus.tf("3", "."), 1.0 / 3.0)

    def test_tf_idf(self):
        """tf_idf() is expected to be the product of tf() and idf()."""
        self.assertAlmostEqual(self.corpus.tf_idf("1", "this"),
                               (2.0 / 3.0) * math.log(3.0 / 2.0))
        self.assertAlmostEqual(self.corpus.tf_idf("2", "this"),
                               (1.0 / 6.0) * math.log(3.0 / 2.0))
        self.assertAlmostEqual(self.corpus.tf_idf("2", "is"),
                               (1.0 / 6.0) * math.log(3.0 / 1.0))
        self.assertAlmostEqual(self.corpus.tf_idf("3", "."),
                               (1.0 / 3.0) * math.log(3.0 / 3.0))

    def test_vocabulary(self):
        """vocabulary() is expected to map each term to its corpus-wide count."""
        v = self.corpus.vocabulary()
        self.assertEqual(v["."], 3)
        self.assertEqual(v["this"], 3)
        self.assertEqual(v["another"], 1)

    def test_generate_doc_lens(self):
        """generate_doc_lens() is expected to fill doc_lens, keyed by doc id."""
        self.corpus.generate_doc_lens()
        # The expected values match the character length of each body.
        result = {"1": 10, "2": 30, "3": 10}
        self.assertEqual(self.corpus.doc_lens, result)

    def test_generate_neighbor_list(self):
        """
        Check the (doc_id, number) entries returned for document 1; the
        number is presumably a distance -- confirm against Corpus.
        """
        corpus = Corpus([self.d1, self.d2, self.d3, self.d4])
        neighbor_list = corpus.generate_neighbor_list(self.d1)
        # Documents "1" and "3" tie at 0, so their relative order is not
        # fixed; compare the first two entries order-insensitively.
        self.assertEqual(sorted(neighbor_list[:2]), [("1", 0), ("3", 0)])
        self.assertEqual(neighbor_list[2], ("4", 2))
        self.assertEqual(neighbor_list[3], ("2", 20))

    def test_neighbors(self):
        """
        Check which documents neighbors() returns for document 1 as the
        second argument (apparently an inclusive bound) varies.
        """
        corpus = Corpus([self.d1, self.d2, self.d3, self.d4])
        neighbors = corpus.neighbors(self.d1, 10)
        n = [doc.doc_id for doc in neighbors]
        self.assertEqual(set(["1", "3", "4"]), set(n))
        neighbors = corpus.neighbors(self.d1, 20)
        n = [doc.doc_id for doc in neighbors]
        self.assertEqual(set(["1", "2", "3", "4"]), set(n))
        neighbors = corpus.neighbors(self.d1, 0)
        n = [doc.doc_id for doc in neighbors]
        self.assertEqual(set(["1", "3"]), set(n))
Exemplo n.º 8
0
 def __init__(self, output=None, corpus=None):
     """
     output is stored as the destination path for the serialized corpus.
     If corpus is passed in, it is used as-is; otherwise a new, empty
     Corpus is created.
     """
     self.output = output
     # Identity test on purpose: "== None" would call the corpus
     # object's own __eq__.
     self.corpus = Corpus() if corpus is None else corpus
     self.module_type = enumModuleType(enumModuleType.Document)
     self.module_processing_type = \
         enumModuleProcessingType(enumModuleProcessingType.PostProcess)