def generate_graph(self, dataset, periods=None, outpath=None, ngramgraphconfig=None, documentgraphconfig=None, exportedges=True):
    """
    Generates the NGram and Document graphs from indexed NGram/Document/Corpus
    records for a list of @periods, exports the corresponding graphs to storage
    and writes the complete graph to a GEXF file for use in tinaweb.

    This is a generator used by the server loop:
      - yields STATUS_RUNNING while work is in progress,
      - yields STATUS_ERROR (then raises) on fatal problems,
      - finally yields the absolute path to the GEXF nodes file.

    @param dataset: dataset (corpora) identifier
    @param periods: corpus/period id or list of ids; None = all periods of the dataset
    @param outpath: optional label inserted into the generated file name
    @param ngramgraphconfig: overrides merged into config['datamining']['NGramGraph']
    @param documentgraphconfig: overrides merged into config['datamining']['DocumentGraph']
    @param exportedges: kept for API compatibility (edges are always exported)
    @return: absolute path to the GEXF nodes file (last yielded value)
    """
    self._load_config()
    if not documentgraphconfig:
        documentgraphconfig = {}
    if not ngramgraphconfig:
        ngramgraphconfig = {}
    # updates default config with parameters
    update_ngramconfig = self.config['datamining']['NGramGraph']
    update_ngramconfig.update(ngramgraphconfig)
    update_documentconfig = self.config['datamining']['DocumentGraph']
    update_documentconfig.update(documentgraphconfig)

    storage = self.get_storage(dataset, create=False, drop_tables=False)
    if storage == self.STATUS_ERROR:
        yield self.STATUS_ERROR
        return

    if periods is None:
        self.logger.debug("no periods parameters, will use all periods from the dataset")
        corpora = storage.loadCorpora(dataset)
        if corpora is None:
            yield self.STATUS_ERROR
            # FIX: the dataset id was never interpolated into the message
            # (the original raised the literal string "%s dataset not found...")
            raise Exception("%s dataset not found in database" % dataset)
        else:
            periods = corpora['edges']['Corpus'].keys()
    if not isinstance(periods, list):
        periods = [periods]

    # params_string formed with the proximities and the periods,
    # used to build a unique output file name
    params_string = "%s_%s" % (update_ngramconfig['proximity'], update_documentconfig['proximity'])
    if len(periods) > 1:
        sortedperiods = sorted(periods)
        params_string += "_from_%s_to_%s" % (sortedperiods[0], sortedperiods[-1])
    else:
        params_string += "_" + periods[0]
    # outpath is an optional label but used into the file path
    if outpath is None:
        outpath = self._get_user_filepath(dataset, 'gexf', "%s-graph" % params_string)
    else:
        outpath = self._get_user_filepath(dataset, 'gexf', "%s_%s-graph" % (params_string, outpath))
    outpath = abspath(outpath + ".gexf")

    GEXFWriter = self._new_graph_writer(
        dataset,
        periods,
        "None",
        storage,
        generate=True,
        preprocess=False
    )

    periods_to_process = []
    ngram_index = set([])
    doc_index = set([])
    # checks periods and construct nodes' indices
    for period in periods:
        corpus = storage.loadCorpus(period)
        yield self.STATUS_RUNNING
        if corpus is not None:
            periods_to_process += [corpus]
            # union of this period's NGram ids into the global index
            ngram_index |= set(corpus['edges']['NGram'].keys())
            yield self.STATUS_RUNNING
            # union of this period's Document ids into the global index
            doc_index |= set(corpus['edges']['Document'].keys())
        else:
            self.logger.debug('Period %s not found in database, skipping' % str(period))

    if len(ngram_index) == 0 or len(doc_index) == 0:
        yield self.STATUS_ERROR
        errmsg = "Graph not generated because : NGram index length = %d, Document index length = %d" % (len(ngram_index), len(doc_index))
        self.logger.warning(errmsg)
        raise RuntimeError(errmsg)

    # hack resolving the proximity parameter ambiguity:
    # the EquivalenceIndex proximity needs the total document count
    if update_ngramconfig['proximity'] == 'EquivalenceIndex':
        update_ngramconfig['nb_documents'] = len(doc_index)

    # the proximity name doubles as the matrix class name, resolved dynamically
    ngram_graph_class = _dynamic_get_class("tinasoft.pytextminer.graph.subgraph", "NgramGraph")
    ngram_matrix_class = _dynamic_get_class("tinasoft.pytextminer.graph.matrix", update_ngramconfig['proximity'])
    ngram_matrix_reducer = ngram_matrix_class(ngram_index)
    self.logger.debug("finished preparing params for generate_graph")

    # ngramgraph proximity is based on previously stored cooccurrences
    ngramsubgraph_gen = process_ngram_subgraph(
        self.config,
        dataset,
        periods_to_process,
        ngram_index,
        doc_index,
        ngram_matrix_reducer,
        update_ngramconfig,
        GEXFWriter,
        storage,
        ngram_graph_class
    )
    try:
        while 1:
            yield self.STATUS_RUNNING
            ngramsubgraph_gen.next()
    except StopIteration:
        self.logger.debug("finished NGramGraph")

    doc_graph_class = _dynamic_get_class("tinasoft.pytextminer.graph.subgraph", "DocGraph")
    doc_matrix_reducer = matrix.MatrixReducerMaxDegree(doc_index)
    docsubgraph_gen = process_document_subgraph(
        self.config,
        dataset,
        periods_to_process,
        ngram_index,
        doc_index,
        doc_matrix_reducer,
        update_documentconfig,
        GEXFWriter,
        storage,
        doc_graph_class
    )
    try:
        while 1:
            yield self.STATUS_RUNNING
            docsubgraph_gen.next()
    except StopIteration:
        self.logger.debug("finished DocGraph")

    self.logger.warning("exporting the full graph to %s" % outpath)
    GEXFWriter.graph['parameters']['data/source'] = "standalone"
    GEXFWriter.finalize(outpath, exportedges=True)
    # returns the absolute path of outpath
    yield outpath
    return
def graph_preprocess(self, dataset):
    """
    Preprocesses the whole graph database, specially cooccurrences used for
    further graph generation, then exports the dataset's master whitelist.

    This is a generator used by the server loop:
      - yields STATUS_RUNNING while work is in progress,
      - yields STATUS_ERROR (then raises) on fatal problems,
      - finally yields the absolute path to the exported master whitelist file.

    @param dataset: dataset (corpora) identifier
    @return: absolute path to the master whitelist file (last yielded value)
    """
    self.logger.debug("starting graph_preprocess")
    storage = self.get_storage(dataset, create=True, drop_tables=False)
    if storage == self.STATUS_ERROR:
        yield self.STATUS_ERROR
        return
    yield self.STATUS_RUNNING
    ngramgraphconfig = self.config['datamining']['NGramGraph']
    datasetObj = storage.loadCorpora(dataset)
    # FIX: guard against a missing dataset (same handling as generate_graph);
    # the original would have crashed on datasetObj['edges'] below
    if datasetObj is None:
        yield self.STATUS_ERROR
        raise Exception("%s dataset not found in database" % dataset)
    ### cooccurrences calculated for each period
    for corpusid in datasetObj['edges']['Corpus'].keys():
        period = storage.loadCorpus(corpusid)
        if period is None:
            continue
        ngram_index = set(period.edges['NGram'].keys())
        doc_index = set(period.edges['Document'].keys())
        if len(ngram_index) == 0:
            self.logger.warning("period %s has NO NGram indexed, skipping graph_preprocess" % period.id)
            continue
        if len(doc_index) == 0:
            self.logger.warning("period %s has NO Document indexed, skipping graph_preprocess" % period.id)
            continue
        yield self.STATUS_RUNNING
        # temporary graph writer used only to accumulate preprocess results
        cooc_writer = self._new_graph_writer(
            dataset,
            [period['id']],
            "preprocess tmp graph",
            storage,
            generate=False,
            preprocess=True
        )
        yield self.STATUS_RUNNING
        ngram_matrix_reducer = matrix.MatrixReducer(ngram_index)
        ngram_graph_preprocess = _dynamic_get_class("tinasoft.pytextminer.graph.subgraph", "NgramGraphPreprocess")
        ngramsubgraph_gen = process_ngram_subgraph(
            self.config,
            dataset,
            [period],
            ngram_index,
            doc_index,
            ngram_matrix_reducer,
            ngramgraphconfig,
            cooc_writer,
            storage,
            ngram_graph_preprocess
        )
        # drain the subgraph generator for this period
        try:
            while 1:
                yield self.STATUS_RUNNING
                ngramsubgraph_gen.next()
        except StopIteration:
            pass
    # the master whitelist depends only on the dataset, not on any single
    # period, so it is exported exactly once after all periods are processed
    self.logger.debug("exporting master whitelist")
    whitelistlabel = "%s_master" % datasetObj['id']
    outpath = self._get_whitelist_filepath(whitelistlabel)
    # this whitelist == dataset
    newwl = whitelist.Whitelist(whitelistlabel, whitelistlabel)
    # dataset's storage == master whitelist's storage
    del newwl.storage
    newwl.storage = storage
    yield self.STATUS_RUNNING
    # exports the dataset's whitelist
    whitelist_exporter = Writer("whitelist://" + outpath)
    yield abspath(whitelist_exporter.write_whitelist(newwl, datasetObj['id'], status="w"))
    return