예제 #1
0
 def __init__(self, id, label, edges=None, **metas):
     """
     Run the WhitelistFile and PyTextMiner initialisers, then attach a
     storage handle and an empty corpus cache to the instance.

     The PyTextMiner part always starts from an empty dict as content.
     """
     # two parent classes: each initialiser is invoked explicitly
     whitelist.WhitelistFile.__init__(self)
     initial_content = {}
     PyTextMiner.__init__(self, initial_content, id, label, edges, **metas)
     # original author's note: same database as tinasoft, but temporary
     self.storage = self._get_storage()
     # corpus cache, empty at construction time
     self.corpus = dict()
예제 #2
0
 def __init__(self, id, content=None, edges=None, **metas):
     """
     Build the object from its id.

     When no content is supplied, the id itself is used as content.
     The id is handed to PyTextMiner.__init__ twice (presumably once as
     id and once as label -- confirm against PyTextMiner).
     """
     payload = id if content is None else content
     PyTextMiner.__init__(self, payload, id, id, edges, **metas)
예제 #3
0
 def __init__(self, content, id, label, edges=None, **metas):
     """
     Delegate construction to PyTextMiner, then make sure self.edges
     holds a 'keyword' entry (an empty dict when none was provided).
     """
     PyTextMiner.__init__(self, content, id, label, edges, **metas)
     if 'keyword' in self.edges:
         # keyword edges already present: leave them untouched
         return
     self.edges['keyword'] = {}
예제 #4
0
 def getNormId(tokens):
     """
     Return the NGram id matching a list of tokens.

     Every token goes through NGram.normalize and the resulting list is
     handed to PyTextMiner.getId. Meant for callers outside the class.
     """
     return PyTextMiner.getId([NGram.normalize(token) for token in tokens])
예제 #5
0
 def addForm(self, form_tokens, form_postag=None, form_occs=1):
     """
     Register one form of this object in its 'label' and 'postag' edges.

     form_tokens -- tokens of the form; PyTextMiner.form_label turns them
                    into the key used for both edges
     form_postag -- tags stored in the 'postag' edge, ["?"] when omitted
     form_occs   -- value passed to the 'label' edge (default 1)
     """
     postag = ["?"] if form_postag is None else form_postag
     label = PyTextMiner.form_label(form_tokens)
     self.addEdge('label', label, form_occs)
     self.addEdge('postag', label, postag)
예제 #6
0
 def __init__(self, tokenlist, id=None, label=None, edges=None, postag=None, **metas):
     """
     Create the object from a list of tokens.

     The tokens are normalized with NGram.normalize and the normalized
     list becomes the content handed to PyTextMiner.__init__.
     Original author's note: normalize must be a local value for
     pickling reasons.
     """
     normalized_tokens = [NGram.normalize(token) for token in tokenlist]
     # without explicit edges, start from empty 'label' and 'postag' edges
     if edges is None:
         edges = {'label': {}, 'postag': {}}
     # the postag is carried as a meta, ["?"] standing for "not provided"
     metas["postag"] = ["?"] if postag is None else postag
     PyTextMiner.__init__(self, normalized_tokens, id, label, edges, **metas)
     # record the normalized tokens as a first form before returning
     self.addForm(normalized_tokens, metas["postag"], 1)
예제 #7
0
    def ngramize(ngrams, minSize, maxSize, tagTokens, filters, stemmer):
        """
        Shared n-gram extraction routine.

        Slides windows of minSize to maxSize words over the content of
        tagTokens, builds an NGram for each window and merges the ones
        accepted by filtering.apply_filters into the ngrams dict.
        Entries are keyed by the id computed from the stemmed window.
        The ngrams dict is updated in place and returned.

        @tagTokens == [[word1 tokens], [word2 tokens], etc]
        """
        # words extracted from tagTokens, and their stemmed counterparts
        words = tagger.TreeBankPosTagger.getContent(tagTokens)
        stems = [stemmer.stem(word) for word in words]
        # tags extracted from tagTokens
        tags = tagger.TreeBankPosTagger.getTag(tagTokens)
        total = len(words)
        for start in range(total):
            # a window may not run past the last word
            longest = min(maxSize, total - start)
            for size in range(minSize, longest + 1):
                stop = start + size
                # id comes from the stems, label from the real tokens
                ngid = ngram.NGram.getNormId(stems[start:stop])
                window = words[start:stop]
                window_tags = tags[start:stop]
                candidate = ngram.NGram(
                    window,
                    id = ngid,
                    label = PyTextMiner.form_label(window),
                    occs = 1,
                    postag = window_tags
                )
                if filtering.apply_filters(candidate, filters) is not True:
                    continue
                if ngid not in ngrams:
                    # first time this id is seen
                    ngrams[ngid] = candidate
                    continue
                # id already known: add the form and accumulate occurrences
                known = ngrams[ngid]
                known.addForm(window, window_tags, 1)
                known.updateMajorForm()
                known['occs'] += candidate['occs']
        return ngrams
예제 #8
0
 def __init__(self, name, edges=None, **metas):
     """
     Build the object from its name, which is passed to
     PyTextMiner.__init__ twice (presumably as both id and label --
     confirm against PyTextMiner).

     The content is the list of keys found under edges['Corpus'], or an
     empty list when edges is None or has no 'Corpus' entry.
     """
     # list of corpus id
     content = []
     if edges is not None and 'Corpus' in edges:
         # Copy the keys into a real list: on Python 3 .keys() is a live
         # view that follows later changes to edges['Corpus'] and cannot
         # be pickled; on Python 2 this is an equivalent list copy.
         content = list(edges['Corpus'].keys())
     PyTextMiner.__init__(self, content, name, name, edges=edges, **metas)
예제 #9
0
 def __init__(self, id, label, edges=None, **metas):
     """
     Run the PyTextMiner initialiser with an empty dict as content, then
     attach a storage handle and an empty corpus cache to the instance.
     """
     initial_content = {}
     PyTextMiner.__init__(self, initial_content, id, label, edges, **metas)
     # original author's note: same database as tinasoft, but temporary
     self.storage = self._get_storage()
     # corpus cache, empty at construction time
     self.corpus = dict()