def termDefWordvector(self):
    """
    Add term definition to term - word vector dictionary

    For every node of self.goGraph, the definition and name of the term with
    that accession are fetched from the database, lower-cased, joined with a
    comma and tokenized. Tokens found in self.stopwords are dropped, the rest
    are stemmed and counted. The resulting {stem: count} dictionary is stored
    in self.termWordvector[term]; stems that are a single punctuation
    character are not counted. When the query returns several rows for one
    term, the vector of the last row is the one that is kept.
    """
    stemmer = porterStemmer.PorterStemmer()
    # A set gives constant-time membership tests inside the token loop.
    punctuations = set(string.punctuation)
    cursor = self.db.cursor()
    try:
        for term in self.goGraph.nodes():
            # NOTE(review): the accession is interpolated directly into the
            # SQL text. Switch to a parameterized query once the paramstyle
            # of the database driver behind self.db is confirmed.
            query = "SELECT TD.term_definition,T.name from term_definition as TD, term as T where T.id=TD.term_id and T.acc=('%s')"%(term)
            cursor.execute(query)
            for row in cursor.fetchall():
                termDef = row[0].lower()
                termName = row[1].lower()
                wordvector = {}
                for word in word_tokenize(termDef + "," + termName):
                    # Stop words are matched on the raw token, before stemming.
                    if word in self.stopwords:
                        continue
                    word = stemmer.stem(word, 0, len(word) - 1)
                    # Skip punctuation here instead of counting it and
                    # deleting the key afterwards.
                    if word in punctuations:
                        continue
                    wordvector[word] = wordvector.get(word, 0) + 1
                self.termWordvector[term] = wordvector
    finally:
        # Release the cursor even when a query or the tokenizer fails.
        cursor.close()
def __init__(self):
    """Set the input file name, create the helper objects and empty lookup tables."""
    # Numeric settings.
    self.magicMatchNumber = 1
    self.maxSentencesPerSummary = 6

    # Relation file to read. Other files that were used here at some point:
    # 'relations2.dat', 'relations3.dat', '../ok-relations-phase1.dat'.
    self.inputFileName = 'eval-relations-phase2.dat'

    # Helper objects.
    self.relationReader = RelationReader()
    self.stopWorder = stopWorder.StopWorder()
    self.stemmer = porterStemmer.PorterStemmer()

    # Both tables start out empty.
    self.articleMatches, self.articleToRelation = {}, {}
def stemming(a, b):
    """
    Return True when a and b are equal word-for-word after Porter stemming.

    Both strings are split on whitespace. They match only if they contain the
    same number of words and every pair of words at the same position has the
    same stem.
    """
    aWords = a.split()
    bWords = b.split()
    # Cheap rejection first: no stemmer is built for inputs that cannot match.
    if len(aWords) != len(bWords):
        return False
    p = PorterStemmer.PorterStemmer()
    for aWord, bWord in zip(aWords, bWords):
        if p.stem(aWord, 0, len(aWord) - 1) != p.stem(bWord, 0, len(bWord) - 1):
            return False
    return True
def stemmer(line):
    """
    Return line lower-cased, with every run of letters replaced by its stem.

    A word is a maximal run of characters for which isalpha() is true. Each
    word is lower-cased and passed through the Porter stemmer; every other
    character is copied to the output lower-cased. A word that ends the line
    without a following non-letter is stemmed and emitted as well (it used to
    be dropped).
    """
    if not line:
        return ''
    p = porterStemmer.PorterStemmer()
    pieces = []
    word = ''
    for c in line:
        if c.isalpha():
            word += c.lower()
            continue
        # A non-letter ends the current word, if there is one.
        if word:
            pieces.append(p.stem(word, 0, len(word) - 1))
            word = ''
        pieces.append(c.lower())
    # Flush the last word when the line does not end with a separator.
    if word:
        pieces.append(p.stem(word, 0, len(word) - 1))
    return ''.join(pieces)
def __init__(self):
    """Create the helper objects, reset the counters and open the output file."""
    # Relation file to read. Other files that were used here at some point:
    # '../engine/relations.dat', 'relations2.dat', 'relations3.dat',
    # '../ok-relations-phase1.dat'.
    self.inputFileName = 'eval-relations-phase2.dat'
    self.magicMatchNumber = 1

    # Bookkeeping starts out empty.
    self.relationCount = 0
    self.relations, self.index = {}, {}

    # Helper objects.
    self.stopWorder = stopWorder.StopWorder()
    self.stemmer = porterStemmer.PorterStemmer()
    self.relationReader = RelationReader()

    # Output file; opened for writing here and kept on the instance.
    self.relationFile = open('out-relationCountIndex.dat', 'w')
def __init__(self, f, sexualList, violentList, adultList, sexualBase, violentBase, adultBase):
    """
    Store the file, the three word lists and the three base values, reset
    all word counters to 0.0, create a Porter stemmer and call setWords().

    @param f            stored as self.theFile
    @param sexualList   stored as self.sList
    @param violentList  stored as self.vList
    @param adultList    stored as self.aList
    @param sexualBase   stored as self.sBase
    @param violentBase  stored as self.vBase
    @param adultBase    stored as self.aBase
    """
    self.theFile = f

    # Word lists and base values, one of each per category.
    self.sList, self.sBase = sexualList, sexualBase
    self.vList, self.vBase = violentList, violentBase
    self.aList, self.aBase = adultList, adultBase

    # Counters are floats from the start.
    self.totalWords = self.sexualWords = 0.0
    self.violentWords = self.adultContentWords = 0.0

    self.stemmer = porterStemmer.PorterStemmer()
    # Must run last: every attribute above is set before setWords() is called.
    self.setWords()
def createDocuments(self, xmlfile):
    """
    Return documents dictionary, pmid as key, a wordvector of title and abstract as value

    The text of the "Title" element (followed by ". ") and the text of the
    "Abstract" element of every "PubMedArticle" element are concatenated and
    passed to self.wordVector together with a Porter stemmer and
    self.stopwords. A missing or empty title or abstract is skipped; articles
    with neither are left out of the result. An article without a "PMID"
    element raises AttributeError.

    @param xmlfile path of the xml file that contains all pubmed articles
    @return documents
    """
    with open(xmlfile, 'rt') as f:
        root = ET.parse(f).getroot()

    articles = {}
    for article in root.iter("PubMedArticle"):
        pmid = article.find("PMID").text
        titleAbstract = ""
        title = article.find("Title")
        # find() returns None for a missing element and .text is None for an
        # empty one; both cases mean "no title".
        if title is not None and title.text is not None:
            titleAbstract = title.text + ". "
        abstract = article.find("Abstract")
        if abstract is not None and abstract.text is not None:
            titleAbstract += abstract.text
        if titleAbstract:
            articles[pmid] = titleAbstract

    # One stemmer instance is shared by all articles.
    stemmer = porterStemmer.PorterStemmer()
    documents = {}
    for pmid, titleAbstract in articles.items():
        documents[pmid] = self.wordVector(titleAbstract, stemmer, self.stopwords)
    return documents
def __init__(self, theConn):
    """
    Keep the database connection, point it at the filerater schema and
    create a Porter stemmer.

    @param theConn open database connection; stored as self.conn
    """
    self.conn = theConn
    cur = theConn.cursor()
    try:
        cur.execute("SET search_path TO filerater")
    finally:
        # Release the cursor even if the statement fails.
        cur.close()
    self.stem = porterStemmer.PorterStemmer()
## porterStemmerTest.py tests the accuracy of the Porter Stemmer
## Author: Robert Lowman
## Date: 12/8/16
import unittest
import porterStemmer

stemmer = porterStemmer.PorterStemmer()

# Each line of the test file is stemmed and compared with the line at the
# same position in the results file. The with-statement closes both files
# when the comparison is done or fails.
with open('PorterStemmerTest.txt', 'r') as testFile, \
        open('PorterStemmerResults.txt', 'r') as resultFile:
    for line in testFile:
        theLine = line.strip()
        # readline() returns '' once the results file is exhausted, so extra
        # test words are still compared (against an empty expected stem).
        temp = resultFile.readline().strip()
        stem = stemmer.stripWord(theLine)
        if temp != stem:
            # A single parenthesised argument prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print(theLine + " became " + stem + " after stemming, but "
                  + temp + " was expected.\n")