Exemplo n.º 1
0
    def generate_graph(self,
            dataset,
            periods=None,
            outpath=None,
            ngramgraphconfig=None,
            documentgraphconfig=None,
            exportedges=True
        ):
        """
        Generates the graphs from indexed NGrams/Document/Corpus
        given a list of @periods, then exports the corresponding graph
        to storage and optionally the complete graph to a gexf file
        for use in tinaweb.

        Generator protocol: yields STATUS_RUNNING while working,
        STATUS_ERROR on failure, and finally the absolute path to the
        GEXF nodes file.

        @param dataset: dataset identifier used to open the storage
        @param periods: one period id or a list of them; None means
            "all periods found in the dataset"
        @param outpath: optional label inserted into the generated file path
        @param ngramgraphconfig: overrides merged over the NGramGraph defaults
        @param documentgraphconfig: overrides merged over the DocumentGraph defaults
        @param exportedges: whether edges are written into the final gexf export
        @return absolute path to the GEXF nodes file (last yielded value)
        """
        self._load_config()

        if not documentgraphconfig: documentgraphconfig = {}
        if not ngramgraphconfig: ngramgraphconfig = {}
        # copy the defaults BEFORE merging caller parameters: the original
        # code updated the dicts stored in self.config in place, leaking
        # per-call parameters (e.g. nb_documents below) into later calls
        update_ngramconfig = dict(self.config['datamining']['NGramGraph'])
        update_ngramconfig.update(ngramgraphconfig)
        update_documentconfig = dict(self.config['datamining']['DocumentGraph'])
        update_documentconfig.update(documentgraphconfig)

        storage = self.get_storage(dataset, create=False, drop_tables=False)
        if storage == self.STATUS_ERROR:
            yield self.STATUS_ERROR
            return

        if periods is None:
            self.logger.debug("no periods parameters, will use all periods from the dataset")
            corpora = storage.loadCorpora(dataset)

            if corpora is None:
                yield self.STATUS_ERROR
                # fixed: the format string previously had no argument,
                # producing a literal "%s" in the error message
                raise Exception("%s dataset not found in database" % dataset)
            else:
                periods = corpora['edges']['Corpus'].keys()

        # accept a single period id as well as a list
        if not isinstance(periods, list):
            periods = [periods]

        # params_string identifies the proximity pair and the period range
        # inside the generated file name
        params_string = "%s_%s"%(update_ngramconfig['proximity'], update_documentconfig['proximity'])
        if len(periods) > 1:
            sortedperiods = sorted(periods)
            params_string += "_from_%s_to_%s"%(sortedperiods[0], sortedperiods[-1])
        else:
            # %s conversion keeps this safe for non-string period ids
            params_string += "_%s" % periods[0]
        # outpath is an optional label but used into the file path
        if outpath is None:
            outpath = self._get_user_filepath(dataset, 'gexf', "%s-graph"%params_string)
        else:
            outpath = self._get_user_filepath(dataset, 'gexf', "%s_%s-graph"%(params_string, outpath))
        outpath = abspath( outpath + ".gexf" )

        GEXFWriter = self._new_graph_writer(
            dataset,
            periods,
            "None",
            storage,
            generate=True,
            preprocess=False
        )

        periods_to_process = []
        ngram_index = set([])
        doc_index = set([])
        # checks periods and constructs the nodes' indices as the union of
        # every period's NGram and Document edge keys
        for period in periods:
            corpus = storage.loadCorpus( period )
            yield self.STATUS_RUNNING
            if corpus is not None:
                periods_to_process += [corpus]
                # union
                ngram_index |= set( corpus['edges']['NGram'].keys() )
                yield self.STATUS_RUNNING
                # union
                doc_index |= set( corpus['edges']['Document'].keys() )
            else:
                self.logger.debug('Period %s not found in database, skipping'%str(period))

        # nothing indexed: abort before instantiating any graph machinery
        if len(ngram_index) == 0 or len(doc_index) == 0:
            yield self.STATUS_ERROR
            errmsg = "Graph not generated because : NGram index length = %d, Document index length = %d"%(len(ngram_index),len(doc_index))
            self.logger.warning(errmsg)
            raise RuntimeError(errmsg)

        # EquivalenceIndex is the only proximity needing the corpus size;
        # the matrix class itself is resolved dynamically below
        if update_ngramconfig['proximity']=='EquivalenceIndex':
            update_ngramconfig['nb_documents'] = len(doc_index)

        ngram_graph_class = _dynamic_get_class("tinasoft.pytextminer.graph.subgraph", "NgramGraph")
        ngram_matrix_class = _dynamic_get_class("tinasoft.pytextminer.graph.matrix", update_ngramconfig['proximity'])
        ngram_matrix_reducer = ngram_matrix_class(ngram_index)
        self.logger.debug("finished preparing params for generate_graph")
        # ngramgraph proximity is based on previously stored
        ngramsubgraph_gen = process_ngram_subgraph(
            self.config,
            dataset,
            periods_to_process,
            ngram_index,
            doc_index,
            ngram_matrix_reducer,
            update_ngramconfig,
            GEXFWriter,
            storage,
            ngram_graph_class
        )
        # drive the subgraph generator to exhaustion, relaying progress
        try:
            while 1:
                yield self.STATUS_RUNNING
                ngramsubgraph_gen.next()
        except StopIteration:
            self.logger.debug("finished NGramGraph")

        doc_graph_class = _dynamic_get_class("tinasoft.pytextminer.graph.subgraph", "DocGraph")

        doc_matrix_reducer = matrix.MatrixReducerMaxDegree( doc_index )
        docsubgraph_gen = process_document_subgraph(
            self.config,
            dataset,
            periods_to_process,
            ngram_index,
            doc_index,
            doc_matrix_reducer,
            update_documentconfig,
            GEXFWriter,
            storage,
            doc_graph_class
        )
        # same exhaustion loop for the document subgraph
        try:
            while 1:
                yield self.STATUS_RUNNING
                docsubgraph_gen.next()
        except StopIteration:
            self.logger.debug("finished DocGraph")

        self.logger.warning("exporting the full graph to %s"%outpath)
        GEXFWriter.graph['parameters']['data/source'] = "standalone"
        # honor the exportedges parameter (previously accepted but ignored;
        # the default True preserves the old behavior)
        GEXFWriter.finalize(outpath, exportedges=exportedges)
        # returns the absolute path of outpath
        yield outpath
        return
Exemplo n.º 2
0
    def graph_preprocess(self, dataset):
        """
        Preprocesses the whole graph database
        specially cooccurrences used for further graph generation

        Generator protocol: yields STATUS_RUNNING/STATUS_ERROR while
        working and finally the absolute path of the exported master
        whitelist file.

        @param dataset: dataset identifier used to open (or create) the storage
        """
        self.logger.debug("starting graph_preprocess")

        # create=True: the storage is created on the fly if missing
        storage = self.get_storage(dataset, create=True, drop_tables=False)
        if storage == self.STATUS_ERROR:
            yield self.STATUS_ERROR
            return

        yield self.STATUS_RUNNING
        ngramgraphconfig = self.config['datamining']['NGramGraph']
        datasetObj = storage.loadCorpora(dataset)

        ### cooccurrences calculated for each period
        # NOTE(review): the `return` at the bottom sits inside this loop's
        # except handler, so only the FIRST period that reaches StopIteration
        # is preprocessed before the generator ends — confirm intentional
        for corpusid in datasetObj['edges']['Corpus'].keys():
            period = storage.loadCorpus( corpusid )
            if period is None: continue
            ngram_index = set( period.edges['NGram'].keys() )
            doc_index = set( period.edges['Document'].keys() )
            # skip periods with nothing indexed on either axis
            if len(ngram_index) == 0:
                self.logger.warning("period %s has NO NGram indexed, skipping graph_preprocess"%period.id)
                continue
            if len(doc_index) == 0:
                self.logger.warning("period %s has NO Document indexed, skipping graph_preprocess"%period.id)
                continue

            yield self.STATUS_RUNNING
            # temporary writer used only to hold the preprocessing graph
            cooc_writer = self._new_graph_writer(
                dataset,
                [period['id']],
                "preprocess tmp graph",
                storage,
                generate=False,
                preprocess=True
            )

            yield self.STATUS_RUNNING
            ngram_matrix_reducer = matrix.MatrixReducer(ngram_index)
            ngram_graph_preprocess = _dynamic_get_class("tinasoft.pytextminer.graph.subgraph", "NgramGraphPreprocess")
            ngramsubgraph_gen = process_ngram_subgraph(
                self.config,
                dataset,
                [period],
                ngram_index,
                doc_index,
                ngram_matrix_reducer,
                ngramgraphconfig,
                cooc_writer,
                storage,
                ngram_graph_preprocess
            )

            # drive the subgraph generator to exhaustion, relaying progress
            try:
                while 1:
                    yield self.STATUS_RUNNING
                    ngramsubgraph_gen.next()
            except StopIteration:
                self.logger.debug("exporting master whitelist")

                whitelistlabel = "%s_master"%datasetObj['id']
                outpath = self._get_whitelist_filepath(whitelistlabel)
                # this whitelist == dataset
                newwl = whitelist.Whitelist(whitelistlabel, whitelistlabel)
                # dataset's storage == master whitelist's storage
                # (drop the whitelist's own storage before rebinding it)
                del newwl.storage
                newwl.storage = storage
                yield self.STATUS_RUNNING
                # exports the dataset's whitelist
                whitelist_exporter = Writer("whitelist://"+outpath)
                yield abspath( whitelist_exporter.write_whitelist(newwl, datasetObj['id'], status="w"))
                return