Example #1
0
 def export_cooc(self,
         dataset,
         periods,
         outpath=None,
         minCooc=1,
         whitelistpath=None
     ):
     """
     Exports the database cooccurrences for a list of periods to a text
     file and returns the exporter's generator.

     dataset -- dataset (corpora) identifier used to open the storage
     periods -- list of period ids; also used to build the default filename
     outpath -- destination file path; when None it is generated under the
         user's 'cooccurrences' directory from the joined period ids
     minCooc -- minimum cooccurrence value passed to the exporter
     whitelistpath -- optional path to an ngram whitelist file
         (BUGFIX: the original body read an undefined name `whitelistpath`,
         raising NameError; it is now a real keyword parameter defaulting
         to None, which preserves the previous signature for all callers)

     Returns the generator produced by Writer.export_from_storage, or
     self.STATUS_ERROR if the storage could not be opened.
     """
     self._load_config()
     storage = self.get_storage(dataset, create=True, drop_tables=False)
     if storage == self.STATUS_ERROR:
         return self.STATUS_ERROR
     if outpath is None:
         outpath = self._get_user_filepath(
             dataset,
             'cooccurrences',
             "%s-export_cooc.txt"%"+".join(periods)
         )
     whitelist = None
     if whitelistpath is not None:
         whitelist = self._import_whitelist(whitelistpath)
     # NOTE(review): the imported whitelist is not forwarded to the
     # exporter below — confirm whether export_from_storage should
     # receive it, as the docstring mentions an ngrams whitelist
     exporter = Writer('coocmatrix://'+outpath)
     # export_from_storage returns a generator
     return exporter.export_from_storage( storage, periods, minCooc )
Example #2
0
 def _new_graph_writer(self, dataset, periods, whitelistid, storage=None, generate=True, preprocess=False):
     """
     Builds and returns the GEXF graph exporter, pre-filled with the
     metadata describing this dataset / periods / whitelist combination.
     """
     writer = Writer('gexf://', **self.config['datamining'])
     # parameters block embedded into the future gexf file
     parameters = {
         'periods' : "+".join(periods),
         'whitelist': whitelistid,
         'dataset': dataset,
         'layout/algorithm': 'tinaforce',
         'rendering/edge/shape': 'curve',
         'data/source': 'browser'
     }
     meta = {
         'parameters': parameters,
         'description': "a tinasoft graph",
         'creators': ["CREA Lab, CNRS/Ecole Polytechnique UMR 7656 (Fr)"],
         'date': datetime.now().strftime("%Y-%m-%d"),
         'nodes': {
             'NGram' : {},
             'Document': {}
         }
     }
     writer.new_graph( storage, meta, periods, generate, preprocess)
     return writer
Example #3
0
    def graph_preprocess(self, dataset):
        """
        Preprocesses the whole graph database,
        specially cooccurrences used for further graph generation.

        Generator protocol: yields STATUS_RUNNING while working,
        STATUS_ERROR (then returns) if the storage cannot be opened,
        and finally the absolute path of the exported master whitelist.
        """
        self.logger.debug("starting graph_preprocess")

        storage = self.get_storage(dataset, create=True, drop_tables=False)
        if storage == self.STATUS_ERROR:
            # signal the failure to the consumer before stopping
            yield self.STATUS_ERROR
            return

        yield self.STATUS_RUNNING
        ngramgraphconfig = self.config['datamining']['NGramGraph']
        datasetObj = storage.loadCorpora(dataset)

        ### cooccurrences calculated for each period
        for corpusid in datasetObj['edges']['Corpus'].keys():
            period = storage.loadCorpus( corpusid )
            if period is None: continue
            # indexes of the NGrams and Documents attached to this period
            ngram_index = set( period.edges['NGram'].keys() )
            doc_index = set( period.edges['Document'].keys() )
            # a period with no indexed NGrams or Documents cannot produce
            # cooccurrences: warn and move to the next period
            if len(ngram_index) == 0:
                self.logger.warning("period %s has NO NGram indexed, skipping graph_preprocess"%period.id)
                continue
            if len(doc_index) == 0:
                self.logger.warning("period %s has NO Document indexed, skipping graph_preprocess"%period.id)
                continue

            yield self.STATUS_RUNNING
            # temporary graph writer used only during preprocessing
            # (generate=False, preprocess=True)
            cooc_writer = self._new_graph_writer(
                dataset,
                [period['id']],
                "preprocess tmp graph",
                storage,
                generate=False,
                preprocess=True
            )

            yield self.STATUS_RUNNING
            ngram_matrix_reducer = matrix.MatrixReducer(ngram_index)
            # class is resolved dynamically from its module path
            ngram_graph_preprocess = _dynamic_get_class("tinasoft.pytextminer.graph.subgraph", "NgramGraphPreprocess")
            ngramsubgraph_gen = process_ngram_subgraph(
                self.config,
                dataset,
                [period],
                ngram_index,
                doc_index,
                ngram_matrix_reducer,
                ngramgraphconfig,
                cooc_writer,
                storage,
                ngram_graph_preprocess
            )

            # drive the subgraph generator to exhaustion (Python 2 .next());
            # its StopIteration is the signal that preprocessing finished,
            # so the whitelist export below lives in the except branch
            try:
                while 1:
                    yield self.STATUS_RUNNING
                    ngramsubgraph_gen.next()
            except StopIteration:
                self.logger.debug("exporting master whitelist")

                whitelistlabel = "%s_master"%datasetObj['id']
                outpath = self._get_whitelist_filepath(whitelistlabel)
                # this whitelist == dataset
                newwl = whitelist.Whitelist(whitelistlabel, whitelistlabel)
                # dataset's storage == master whitelist's storage:
                # drop the whitelist's own storage and share the dataset's
                del newwl.storage
                newwl.storage = storage
                yield self.STATUS_RUNNING
                # exports the dataset's whitelist
                whitelist_exporter = Writer("whitelist://"+outpath)
                # final yield: absolute path of the written whitelist file
                yield abspath( whitelist_exporter.write_whitelist(newwl, datasetObj['id'], status="w"))
                return
Example #4
0
                corporaObj,
                filters,
                stemmer.Nltk(),
                tokenizer.NGramTokenizer
            )
            extractorGenerator = extract.index()
            
            while 1:
                extractorGenerator.next()
                yield self.STATUS_RUNNING
        except IOError, ioe:
            self.logger.error("%s"%ioe)
            yield self.STATUS_ERROR
            return
        except StopIteration:
            whitelist_exporter = Writer("whitelist://"+outpath)
            whitelist_exporter.write_whitelist(newwl, corporaObj.id, minoccs=minoccs)
            yield abspath(outpath)
            return

    def index_file(self,
            path,
            dataset,
            whitelistpath,
            format='tinacsv',
            overwrite=False,
        ):
        """
        pytextminer's indexation controler : whitelist + source file => session database
        """
        self._load_config()
Example #5
0
                corporaObj,
                filters,
                stemmer.Nltk(),
                tokenizer.NGramTokenizer
            )
            extractorGenerator = extract.index()
            
            while 1:
                extractorGenerator.next()
                yield self.STATUS_RUNNING
        except IOError, ioe:
            self.logger.error("%s"%ioe)
            yield self.STATUS_ERROR
            return
        except StopIteration:
            whitelist_exporter = Writer("whitelist://"+outpath)
            try:
                master_user_whitelist = self._import_whitelist(self.config['general']['userwhitelist'])
                self.logger.debug( "user whitelist found, contains %d valid NGrams"%len(master_user_whitelist['edges']['form_label'].keys()) )
                whitelist_exporter.write_whitelist(newwl, corporaObj.id, minoccs=minoccs, status=master_user_whitelist)
            except Exception:
                self.logger.warning("user whitelist not found at %s"%self.config['general']['userwhitelist'])
                whitelist_exporter.write_whitelist(newwl, corporaObj.id, minoccs=minoccs, status="")
            yield abspath(outpath)
            return

    def index_file(self,
            path,
            dataset,
            whitelistpath,
            format='tinacsv'
Example #6
0
File: output.py  Project: moma/easiparse
 def save(self, whitelistobj):
     """Persist *whitelistobj* through the configured whitelist output path."""
     target = self.config['output']['whitelist']['path']
     exporter = Writer("whitelist://" + target)
     exporter.write_whitelist(whitelistobj, None, status="w")