def ngramStemmer(self, wordList, size, equality):
    """Reduce wordList according to the n-gram stemming method.

    Every word is compared with each word that precedes it in wordList.
    When the similarity of their n-gram lists exceeds ``equality`` the two
    words are treated as conflatable: the longer one is discarded and the
    shorter one is kept (on equal length the later word is discarded).

    Parameters:
        wordList: indexable sequence of hashable terms.
        size: accepted for interface compatibility only; this method
            always derives bigrams (n = 2) and never reads the value.
            NOTE(review): confirm whether ``size`` was meant to be the
            n-gram size before wiring it in.
        equality: similarity threshold; a pair is conflated when
            ``similarity - equality > 0``.

    Returns:
        A list of the distinct terms of wordList that were not discarded,
        in order of first occurrence.
    """
    ngramAdvas = Advas("", "")

    # Derive the bigram list of every word exactly once instead of
    # re-deriving it for each pair.
    # NOTE(review): assumes compareNgramLists does not modify its
    # arguments -- verify against its implementation.
    ngramLists = []
    for term in wordList:
        termNgram = Ngram(term, 2)
        termNgram.deriveNgrams()
        ngramLists.append(termNgram.getNgrams())

    # Terms to be removed from the result.
    stopTerms = set()
    for i, term1 in enumerate(wordList):
        for j in range(i):
            term2 = wordList[j]
            ngramSimilarity = ngramAdvas.compareNgramLists(
                ngramLists[i], ngramLists[j])
            if ngramSimilarity - equality > 0:
                # The terms are similar enough to be conflated:
                # drop the longer term, keep the shorter one.
                if len(term2) > len(term1):
                    stopTerms.add(term2)
                else:
                    stopTerms.add(term1)

    # Remove every stopped term and collapse duplicates, keeping the
    # order of first appearance so the result is deterministic.
    keptTerms = []
    seenTerms = set()
    for term in wordList:
        if term in stopTerms or term in seenTerms:
            continue
        seenTerms.add(term)
        keptTerms.append(term)
    return keptTerms
def getNgramsByWord(self, word, ngramSize):
    """Return the n-grams of a single word.

    Parameters:
        word: the term handed to Ngram.
        ngramSize: the n-gram size handed to Ngram; a falsy value
            (0, None, ...) yields an empty list without building an Ngram.

    Returns:
        Whatever Ngram.getNgrams() yields when deriveNgrams() reports
        success, otherwise an empty list.
    """
    if ngramSize:
        ngramObject = Ngram(word, ngramSize)
        if ngramObject.deriveNgrams():
            return ngramObject.getNgrams()

    # Either no usable size was given or the derivation failed.
    return []
def calcSuccVariety(self):
    """Collect, for each letter of self.term, the letters that follow it.

    The term is split into two-letter combinations (bigrams); the first
    character of each bigram is mapped to the list of distinct second
    characters seen after it.

    Returns:
        A dict mapping a leading character to the list of its distinct
        successor characters. Each list is filled in the order in which
        the bigrams are returned by Ngram.getNgrams().
    """
    # derive two-letter combinations
    ngramObject = Ngram(self.term, 2)
    ngramObject.deriveNgrams()

    # Record each distinct successor of every leading letter. The
    # membership test below handles repeated bigrams, so no intermediate
    # set (with its arbitrary iteration order) is needed.
    varietyList = {}
    for entry in ngramObject.getNgrams():
        letter1 = entry[0]
        letter2 = entry[1]
        # setdefault replaces dict.has_key(), which no longer exists in
        # Python 3.
        successors = varietyList.setdefault(letter1, [])
        if letter2 not in successors:
            successors.append(letter2)

    return varietyList
def getNgramsByLine(self, ngramSize):
    """Return one n-gram list per line of the text.

    Parameters:
        ngramSize: the n-gram size handed to Ngram; a falsy value
            (0, None, ...) yields an empty list without splitting the text.

    Returns:
        A list with one entry per line returned by self.splitParagraph():
        the result of Ngram.getNgrams() when deriveNgrams() reports
        success for that line, otherwise an empty list.
    """
    if not ngramSize:
        return []

    ngramsPerLine = []
    for line in self.splitParagraph():
        lineNgram = Ngram(line, ngramSize)
        # deriveNgrams() is evaluated first; getNgrams() only on success
        ngramsPerLine.append(
            lineNgram.getNgrams() if lineNgram.deriveNgrams() else [])
    return ngramsPerLine