class AddToCorpus(PipelineModule):
    """Pipeline module that collects the documents it processes in a corpus."""

    def __init__(self, output=None, corpus=None):
        """
        Create an AddToCorpus module.

        output is the path write() serializes the corpus to; when it is None,
        write() does nothing. corpus is an existing corpus to add to; when it
        is None a new Corpus is created.
        """
        self.output = output
        # Identity test: "== None" would go through any custom __eq__ defined
        # on the corpus object.
        self.corpus = Corpus() if corpus is None else corpus
        self.module_type = enumModuleType(enumModuleType.Document)
        self.module_processing_type = \
            enumModuleProcessingType(enumModuleProcessingType.PostProcess)

    def process(self, data):
        """
        Add every document in data to the corpus.

        data is an iterable of documents. Nothing is returned.
        """
        for document in data:
            # Add the individual document, not the whole data iterable.
            self.corpus.add(document)

    def post_process(self):
        """
        Return the corpus built up by process().
        """
        return self.corpus

    def as_json(self):
        """
        Return the corpus serialized as an indented, key-sorted JSON string.
        """
        return json.dumps(self.corpus, sort_keys=True, indent=4,
                          separators=(',', ': '))

    def write(self):
        """
        Write the JSON form of the corpus to self.output, if one was given.
        """
        if self.output is not None:
            # Serialize first so a failure does not leave a truncated file.
            serialized = self.as_json()
            with open(self.output, 'w') as f:
                f.write(serialized)
class CategoryToCorpus(PipelineModule):
    """Pipeline module that groups tagged documents by category."""

    def __init__(self, output=None, corpus=None, attribute="categories",
                 categories=None, mode="combined"):
        """
        Create a CategoryToCorpus module, which loads a corpus with tagged
        documents. If corpus is passed in, it adds to an existing corpus.

        output is the path write() serializes the result to; when it is None,
        write() does nothing.

        attribute is the key looked up in each document to find its list of
        categories.

        categories optionally restricts processing to the listed categories;
        when it is None every category found on a document is used.

        mode is the corpus loading method to use. If set to "combined", all
        documents in a category are concatenated to a single document.
        Otherwise each document is loaded separately.
        """
        self.output = output
        # Used when mode is not "combined": category -> list of documents.
        self.corpora = {}
        # combined mode has a single corpus
        self.corpus = Corpus() if corpus is None else corpus
        self.module_type = enumModuleType(enumModuleType.Document)
        self.module_processing_type = \
            enumModuleProcessingType(enumModuleProcessingType.PostProcess)
        self.attribute = attribute
        self.categories = categories
        self.mode = mode
        self.pp = pprint.PrettyPrinter(indent=4)

    def add_document(self, category, document):
        """
        File document under category.

        When mode is not "combined" the document is appended to the list kept
        for that category in self.corpora. In "combined" mode the document's
        text is appended, separated by a space, to the corpus entry whose id
        is the category; the first document seen for a category becomes that
        entry.
        """
        if self.mode != "combined":
            self.corpora.setdefault(category, []).append(document)
            return

        if category in self.corpus:
            merged = self.corpus[category]
            merged.update_text(unicode(merged) + " " + unicode(document))
        else:
            # NOTE(review): this re-labels the caller's document object in
            # place. A document carrying several categories may therefore be
            # added under one id and then renamed by a later category --
            # confirm whether a copy should be stored instead.
            document.set_doc_id(category)
            self.corpus.add(document)

    def process(self, data):
        """
        Process the documents.

        For each document, the value stored under self.attribute is read. If
        it is a list, the document is filed under each category in it, or,
        when self.categories is set, only under those of self.categories that
        appear in the list. Documents without the attribute, or whose value
        is not a list, are not filed.

        This is a generator: every document is yielded once it has been
        looked at, and nothing is filed until the result is iterated.
        """
        for doc in data:
            if self.attribute in doc.document:
                tags = doc.document[self.attribute]
                if isinstance(tags, list):
                    if self.categories is None:
                        for tag in tags:
                            self.add_document(tag, doc)
                    else:
                        for category in self.categories:
                            if category in tags:
                                self.add_document(category, doc)
            yield doc

    def post_process(self):
        """
        Method that gets run after all data has been processed.

        Returns the single corpus in "combined" mode, otherwise the dictionary
        mapping each category to its list of documents.
        """
        if self.mode != "combined":
            return self.corpora
        return self.corpus

    def as_json(self):
        """
        Return the result of post_process() as an indented, key-sorted JSON
        string.
        """
        result = self.corpora if self.mode != "combined" else self.corpus
        return json.dumps(result, sort_keys=True, indent=4,
                          separators=(',', ': '))

    def write(self):
        """
        Write the JSON form of the result to self.output, if one was given.
        """
        if self.output is not None:
            # Serialize first so a failure does not leave a truncated file.
            serialized = self.as_json()
            with open(self.output, 'w') as f:
                f.write(serialized)

    def top_categories(self, n=10):
        """
        Print each configured category followed by the result of
        self.corpus.ranked_terms(category, n).

        Prints nothing when no categories were configured.
        """
        # self.categories defaults to None, which cannot be iterated.
        for doc_id in self.categories or ():
            print(str(doc_id))
            ranked = self.corpus.ranked_terms(doc_id, n)
            print(" " + str(ranked))