def graph_preprocess(self, dataset):
    """
    Preprocesses the whole graph database,
    especially the cooccurrences used for further graph generation
    """
    self.logger.debug("starting graph_preprocess")
    storage = self.get_storage(dataset, create=True, drop_tables=False)
    if storage == self.STATUS_ERROR:
        yield self.STATUS_ERROR
        return
    yield self.STATUS_RUNNING
    ngramgraphconfig = self.config['datamining']['NGramGraph']
    datasetObj = storage.loadCorpora(dataset)
    ### cooccurrences calculated for each period
    for corpusid in datasetObj['edges']['Corpus'].keys():
        period = storage.loadCorpus(corpusid)
        if period is None:
            continue
        ngram_index = set(period.edges['NGram'].keys())
        doc_index = set(period.edges['Document'].keys())
        if len(ngram_index) == 0:
            self.logger.warning("period %s has NO NGram indexed, skipping graph_preprocess" % period.id)
            continue
        if len(doc_index) == 0:
            self.logger.warning("period %s has NO Document indexed, skipping graph_preprocess" % period.id)
            continue
        yield self.STATUS_RUNNING
        # temporary graph writer used only during preprocessing
        cooc_writer = self._new_graph_writer(
            dataset,
            [period['id']],
            "preprocess tmp graph",
            storage,
            generate=False,
            preprocess=True
        )
        yield self.STATUS_RUNNING
        ngram_matrix_reducer = matrix.MatrixReducer(ngram_index)
        ngram_graph_preprocess = _dynamic_get_class("tinasoft.pytextminer.graph.subgraph", "NgramGraphPreprocess")
        ngramsubgraph_gen = process_ngram_subgraph(
            self.config,
            dataset,
            [period],
            ngram_index,
            doc_index,
            ngram_matrix_reducer,
            ngramgraphconfig,
            cooc_writer,
            storage,
            ngram_graph_preprocess
        )
        # drive the subgraph generator to exhaustion, yielding a status
        # after each step so callers can poll progress
        try:
            while 1:
                yield self.STATUS_RUNNING
                ngramsubgraph_gen.next()
        except StopIteration:
            pass
    # once every period is preprocessed, export the dataset's master whitelist
    self.logger.debug("exporting master whitelist")
    whitelistlabel = "%s_master" % datasetObj['id']
    outpath = self._get_whitelist_filepath(whitelistlabel)
    # this whitelist == dataset
    newwl = whitelist.Whitelist(whitelistlabel, whitelistlabel)
    # dataset's storage == master whitelist's storage
    del newwl.storage
    newwl.storage = storage
    yield self.STATUS_RUNNING
    # exports the dataset's whitelist
    whitelist_exporter = Writer("whitelist://" + outpath)
    yield abspath(whitelist_exporter.write_whitelist(newwl, datasetObj['id'], status="w"))
    return
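# Usage sketch: a minimal example of how a caller might drive the
# status-yielding generator above. `app` stands for an instance of the
# controller class exposing graph_preprocess and the STATUS_* constants;
# the helper name and dataset id are illustrative assumptions.
def run_graph_preprocess(app, dataset_id):
    generator = app.graph_preprocess(dataset_id)
    result = None
    try:
        while 1:
            step = generator.next()
            if step == app.STATUS_ERROR:
                raise RuntimeError("graph_preprocess failed on %s" % dataset_id)
            if step != app.STATUS_RUNNING:
                # the last meaningful yield is the master whitelist's path
                result = step
    except StopIteration:
        return result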
        )
        extractorGenerator = extract.index()
        # consume the extractor generator, reporting progress at each step
        while 1:
            extractorGenerator.next()
            yield self.STATUS_RUNNING
    except IOError, ioe:
        self.logger.error("%s" % ioe)
        yield self.STATUS_ERROR
        return
    except StopIteration:
        # extraction finished: export the new whitelist, seeding the status
        # column from the user whitelist when one is configured
        whitelist_exporter = Writer("whitelist://" + outpath)
        try:
            master_user_whitelist = self._import_whitelist(self.config['general']['userwhitelist'])
            self.logger.debug(
                "user whitelist found, contains %d valid NGrams" % len(master_user_whitelist['edges']['form_label'].keys())
            )
            whitelist_exporter.write_whitelist(newwl, corporaObj.id, minoccs=minoccs, status=master_user_whitelist)
        except Exception:
            self.logger.warning("user whitelist not found at %s" % self.config['general']['userwhitelist'])
            whitelist_exporter.write_whitelist(newwl, corporaObj.id, minoccs=minoccs, status="")
        yield abspath(outpath)
        return

def index_file(self, path, dataset, whitelistpath, format='tinacsv'):
    """
    pytextminer's indexation controller:
    whitelist + source file => session database
    """
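# Usage sketch: the Writer factory above is addressed with a
# "<format>://<path>" string. Assuming Writer and the Whitelist class are
# importable as below (import paths inferred from the module references
# above; the file path and dataset id are illustrative), exporting any
# whitelist object to disk reduces to:
from tinasoft.data import Writer
from tinasoft.pytextminer import whitelist

demo_wl = whitelist.Whitelist("demo_whitelist", "demo_whitelist")
exporter = Writer("whitelist:///tmp/demo_whitelist.csv")
exporter.write_whitelist(demo_wl, "demo_dataset", minoccs=1, status="w")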
            filters,
            stemmer.Nltk(),
            tokenizer.NGramTokenizer
        )
        extractorGenerator = extract.index()
        while 1:
            extractorGenerator.next()
            yield self.STATUS_RUNNING
    except IOError, ioe:
        self.logger.error("%s" % ioe)
        yield self.STATUS_ERROR
        return
    except StopIteration:
        # extraction finished: export the new whitelist
        whitelist_exporter = Writer("whitelist://" + outpath)
        whitelist_exporter.write_whitelist(newwl, corporaObj.id, minoccs=minoccs)
        yield abspath(outpath)
        return

def index_file(self, path, dataset, whitelistpath, format='tinacsv', overwrite=False):
    """
    pytextminer's indexation controller:
    whitelist + source file => session database
    """
    self._load_config()
    try:
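# Usage sketch for index_file: index a source file against a previously
# exported whitelist and poll the generator for progress. The application
# object `app`, the file paths and the dataset id are illustrative
# assumptions; the keyword names match the signature above.
indexer = app.index_file(
    path="pubmed_sample.csv",
    dataset="demo_dataset",
    whitelistpath="demo_dataset_master.csv",
    format='tinacsv',
    overwrite=False
)
try:
    while 1:
        step = indexer.next()
        if step == app.STATUS_ERROR:
            break
except StopIteration:
    pass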
def save(self, whitelistobj):
    """
    Writes a whitelist object to the path configured
    at config['output']['whitelist']['path']
    """
    wlexporter = Writer("whitelist://" + self.config['output']['whitelist']['path'])
    wlexporter.write_whitelist(whitelistobj, None, status="w")
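# Usage sketch: save() only reads config['output']['whitelist']['path'],
# so a minimal configuration for the surrounding class could look like
# this (structure inferred from the method above; the path is illustrative):
config = {
    'output': {
        'whitelist': {
            'path': '/tmp/session_whitelist.csv'
        }
    }
}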