def __init__(self, id, label, edges=None, **metas):
    """Initialise the node via both parents, then attach storage and a corpus cache."""
    # double heritage: the file-handling parent must be set up first
    whitelist.WhitelistFile.__init__(self)
    PyTextMiner.__init__(self, {}, id, label, edges, **metas)
    # per-corpus cache, filled lazily
    self.corpus = {}
    # temporary storage backend: same database engine as tinasoft
    self.storage = self._get_storage()
def __init__(self, id, content=None, edges=None, **metas):
    """Create the node; when no content is supplied, the id doubles as content."""
    PyTextMiner.__init__(self, id if content is None else content, id, id, edges, **metas)
def __init__(self, content, id, label, edges=None, **metas):
    """Build the node and guarantee that a 'keyword' edge bucket exists."""
    PyTextMiner.__init__(self, content, id, label, edges, **metas)
    # create the keyword edge map only if it is not already present
    self.edges.setdefault('keyword', {})
def getNormId(tokens):
    """
    Utility returning a normalized NGram ID for a given token list.
    For external use.
    """
    # normalize every surface token, then derive the unique id from the result
    return PyTextMiner.getId([NGram.normalize(token) for token in tokens])
def addForm(self, form_tokens, form_postag=None, form_occs=1):
    """Register one surface form: bump its label edge and record its POS tags."""
    label = PyTextMiner.form_label(form_tokens)
    self.addEdge('label', label, form_occs)
    # missing POS information falls back to a single wildcard tag
    self.addEdge('postag', label, form_postag if form_postag is not None else ["?"])
def __init__(self, tokenlist, id=None, label=None, edges=None, postag=None, **metas):
    """
    Build an NGram from a raw token list.

    Normalization is performed locally (not delegated) for pickling reasons.
    """
    # normalized token list becomes the content and seeds the first form
    norm = [NGram.normalize(token) for token in tokenlist]
    # carry the POS tags through metas, defaulting to a wildcard tag
    metas["postag"] = postag if postag is not None else ["?"]
    if edges is None:
        # fresh edge maps for the two form-related relations
        edges = {'label': {}, 'postag': {}}
    PyTextMiner.__init__(self, norm, id, label, edges, **metas)
    # register the normalized form as the initial major form
    self.addForm(norm, metas["postag"], 1)
def ngramize(ngrams, minSize, maxSize, tagTokens, filters, stemmer):
    """
    Common ngramizing method: slides windows of size minSize..maxSize over the
    tagged tokens and accumulates filtered NGram instances into `ngrams`.

    @param ngrams    dict of already-collected NGram instances keyed by normalized id
    @param minSize   smallest ngram length to extract
    @param maxSize   largest ngram length to extract
    @param tagTokens [[word1 tokens], [word2 tokens], ...] as produced by the tagger
    @param filters   filter chain passed to filtering.apply_filters
    @param stemmer   object exposing stem(word)
    @return          the updated `ngrams` dict
    """
    # words and POS tags extracted from the tagger output
    content = tagger.TreeBankPosTagger.getContent(tagTokens)
    tags = tagger.TreeBankPosTagger.getTag(tagTokens)
    # stemmed words drive the normalized id; surface words keep the label readable
    stemmedcontent = [stemmer.stem(word) for word in content]
    length = len(content)
    for i in range(length):
        for n in range(minSize, maxSize + 1):
            if i + n > length:
                # longer windows starting at i cannot fit either: stop early
                break
            window = content[i:i + n]
            # id is made from the stemmed content, label from the real tokens
            ngid = ngram.NGram.getNormId(stemmedcontent[i:i + n])
            ng = ngram.NGram(
                window,
                id=ngid,
                label=PyTextMiner.form_label(window),
                occs=1,
                postag=tags[i:i + n]
            )
            # `is True` kept on purpose: a filter must return exactly True to accept
            if filtering.apply_filters(ng, filters) is True:
                if ngid in ngrams:
                    # known ngram: record this surface form and bump its count
                    ngrams[ngid].addForm(window, tags[i:i + n], 1)
                    ngrams[ngid].updateMajorForm()
                    ngrams[ngid]['occs'] += ng['occs']
                else:
                    ngrams[ngid] = ng
    return ngrams
def __init__(self, name, edges=None, **metas):
    """Initialise the node using the ids of attached corpora as its content."""
    # content holds the attached Corpus ids when any were supplied
    if edges is not None and 'Corpus' in edges:
        corpus_ids = edges['Corpus'].keys()
    else:
        corpus_ids = []
    PyTextMiner.__init__(self, corpus_ids, name, name, edges=edges, **metas)
def __init__(self, id, label, edges=None, **metas):
    """Set up the node with empty content, a temp storage backend and a corpus cache."""
    PyTextMiner.__init__(self, {}, id, label, edges, **metas)
    # per-corpus cache, filled lazily
    self.corpus = {}
    # temporary storage: same database engine as tinasoft
    self.storage = self._get_storage()