def export_cooc(self, dataset, periods, outpath=None, minCooc=1, whitelistpath=None):
    """
    Exports the dataset's cooccurrence matrix for a list of periods
    to a text file.

    dataset -- dataset (corpora) identifier in the storage
    periods -- list of period identifiers to export
    outpath -- destination text file path; when None, defaults to a
        user filepath under 'cooccurrences' named after the joined periods
    minCooc -- minimum cooccurrence value passed to the exporter (default 1)
    whitelistpath -- optional path to an ngrams whitelist file
        (bug fix: the original body read this name but it was never
        declared as a parameter, raising NameError at runtime)

    Returns self.STATUS_ERROR if the storage cannot be opened, otherwise
    the generator returned by the coocmatrix exporter.
    """
    self._load_config()
    storage = self.get_storage(dataset, create=True, drop_tables=False)
    if storage == self.STATUS_ERROR:
        return self.STATUS_ERROR
    if outpath is None:
        outpath = self._get_user_filepath(
            dataset,
            'cooccurrences',
            "%s-export_cooc.txt" % "+".join(periods)
        )
    whitelist = None
    if whitelistpath is not None:
        whitelist = self._import_whitelist(whitelistpath)
    # NOTE(review): the loaded whitelist is not handed to the exporter,
    # although the docstring mentions an ngrams whitelist — confirm whether
    # export_from_storage should receive it for filtering
    exporter = Writer('coocmatrix://' + outpath)
    # export_from_storage is a generator
    return exporter.export_from_storage(storage, periods, minCooc)
def _new_graph_writer(self, dataset, periods, whitelistid, storage=None, generate=True, preprocess=False):
    """
    Creates the GEXF graph exporter.

    Builds the metadata block describing the future gexf file (parameters,
    description, creators, date, empty node categories), then opens a
    gexf Writer configured from the 'datamining' config section and starts
    a new graph on it.

    Returns the initialized graph writer.
    """
    # metadata attached to the future gexf file
    graph_metadata = {
        'parameters': {
            'periods': "+".join(periods),
            'whitelist': whitelistid,
            'dataset': dataset,
            'layout/algorithm': 'tinaforce',
            'rendering/edge/shape': 'curve',
            'data/source': 'browser'
        },
        'description': "a tinasoft graph",
        'creators': ["CREA Lab, CNRS/Ecole Polytechnique UMR 7656 (Fr)"],
        'date': "%s" % datetime.now().strftime("%Y-%m-%d"),
        'nodes': {
            'NGram': {},
            'Document': {}
        }
    }
    writer = Writer('gexf://', **self.config['datamining'])
    writer.new_graph(storage, graph_metadata, periods, generate, preprocess)
    return writer
def graph_preprocess(self, dataset):
    """
    Preprocesses the whole graph database,
    specially cooccurrences used for further graph generation.

    Generator: yields STATUS_ERROR if the storage cannot be opened,
    STATUS_RUNNING while working, and finally the absolute path of the
    exported master whitelist file.
    """
    self.logger.debug("starting graph_preprocess")
    storage = self.get_storage(dataset, create=True, drop_tables=False)
    if storage == self.STATUS_ERROR:
        yield self.STATUS_ERROR
        return
    yield self.STATUS_RUNNING
    ngramgraphconfig = self.config['datamining']['NGramGraph']
    datasetObj = storage.loadCorpora(dataset)
    ### cooccurrences calculated for each period
    for corpusid in datasetObj['edges']['Corpus'].keys():
        period = storage.loadCorpus( corpusid )
        if period is None: continue
        ngram_index = set( period.edges['NGram'].keys() )
        doc_index = set( period.edges['Document'].keys() )
        # a period is only usable if it indexes both NGrams and Documents
        if len(ngram_index) == 0:
            self.logger.warning("period %s has NO NGram indexed, skipping graph_preprocess"%period.id)
            continue
        if len(doc_index) == 0:
            self.logger.warning("period %s has NO Document indexed, skipping graph_preprocess"%period.id)
            continue
        yield self.STATUS_RUNNING
        # temporary graph writer (generate=False, preprocess=True): used only
        # to store preprocessed cooccurrences, not to render a final graph
        cooc_writer = self._new_graph_writer(
            dataset,
            [period['id']],
            "preprocess tmp graph",
            storage,
            generate=False,
            preprocess=True
        )
        yield self.STATUS_RUNNING
        ngram_matrix_reducer = matrix.MatrixReducer(ngram_index)
        # the subgraph preprocessor class is resolved dynamically by name
        ngram_graph_preprocess = _dynamic_get_class("tinasoft.pytextminer.graph.subgraph", "NgramGraphPreprocess")
        ngramsubgraph_gen = process_ngram_subgraph(
            self.config,
            dataset,
            [period],
            ngram_index,
            doc_index,
            ngram_matrix_reducer,
            ngramgraphconfig,
            cooc_writer,
            storage,
            ngram_graph_preprocess
        )
        # drive the subgraph generator to exhaustion (Python 2 generator
        # protocol), yielding a running status between steps
        try:
            while 1:
                yield self.STATUS_RUNNING
                ngramsubgraph_gen.next()
        except StopIteration:
            # NOTE(review): the master-whitelist export and the final return
            # live inside this except handler, so the method stops after the
            # first fully processed period — confirm this is intended
            self.logger.debug("exporting master whitelist")
            whitelistlabel = "%s_master"%datasetObj['id']
            outpath = self._get_whitelist_filepath(whitelistlabel)
            # this whitelist == dataset
            newwl = whitelist.Whitelist(whitelistlabel, whitelistlabel)
            # dataset's storage == master whitelist's storage
            del newwl.storage
            newwl.storage = storage
            yield self.STATUS_RUNNING
            # exports the dataset's whitelist
            whitelist_exporter = Writer("whitelist://"+outpath)
            yield abspath( whitelist_exporter.write_whitelist(newwl, datasetObj['id'], status="w"))
            return
corporaObj, filters, stemmer.Nltk(), tokenizer.NGramTokenizer ) extractorGenerator = extract.index() while 1: extractorGenerator.next() yield self.STATUS_RUNNING except IOError, ioe: self.logger.error("%s"%ioe) yield self.STATUS_ERROR return except StopIteration: whitelist_exporter = Writer("whitelist://"+outpath) whitelist_exporter.write_whitelist(newwl, corporaObj.id, minoccs=minoccs) yield abspath(outpath) return def index_file(self, path, dataset, whitelistpath, format='tinacsv', overwrite=False, ): """ pytextminer's indexation controler : whitelist + source file => session database """ self._load_config()
corporaObj, filters, stemmer.Nltk(), tokenizer.NGramTokenizer ) extractorGenerator = extract.index() while 1: extractorGenerator.next() yield self.STATUS_RUNNING except IOError, ioe: self.logger.error("%s"%ioe) yield self.STATUS_ERROR return except StopIteration: whitelist_exporter = Writer("whitelist://"+outpath) try: master_user_whitelist = self._import_whitelist(self.config['general']['userwhitelist']) self.logger.debug( "user whitelist found, contains %d valid NGrams"%len(master_user_whitelist['edges']['form_label'].keys()) ) whitelist_exporter.write_whitelist(newwl, corporaObj.id, minoccs=minoccs, status=master_user_whitelist) except Exception: self.logger.warning("user whitelist not found at %s"%self.config['general']['userwhitelist']) whitelist_exporter.write_whitelist(newwl, corporaObj.id, minoccs=minoccs, status="") yield abspath(outpath) return def index_file(self, path, dataset, whitelistpath, format='tinacsv'
def save(self, whitelistobj):
    """
    Persists a whitelist object to the configured output path.

    Opens a whitelist Writer on the path found at
    config['output']['whitelist']['path'] and writes *whitelistobj*
    with status "w".
    """
    target_path = self.config['output']['whitelist']['path']
    exporter = Writer("whitelist://" + target_path)
    exporter.write_whitelist(whitelistobj, None, status="w")