Exemplo n.º 1
0
	def termDefWordvector(self):
		"""
		Fill self.termWordvector with a stemmed word-count vector per term.

		For every node of self.goGraph the definition and name are fetched
		from the database, lower-cased, joined with a comma and tokenized.
		Tokens found in self.stopwords are skipped, the rest are stemmed and
		counted.  Keys that are punctuation marks are dropped before the
		vector is stored under the term.
		"""
		dbCursor = self.db.cursor()
		porter = porterStemmer.PorterStemmer()
		punctuationMarks = list(string.punctuation)
		for term in self.goGraph.nodes():
			sql = "SELECT TD.term_definition,T.name from term_definition as TD, term as T where T.id=TD.term_id and T.acc=('%s')"%(term)
			dbCursor.execute(sql)
			for definition, name in dbCursor.fetchall():
				text = definition.lower() + "," + name.lower()
				counts = {}
				for token in word_tokenize(text):
					if token not in self.stopwords:
						stem = porter.stem(token, 0, len(token) - 1)
						counts[stem] = counts.get(stem, 0) + 1
				# Discard the entries whose key is a punctuation mark.
				wordvector = {stem: count for stem, count in counts.items()
						if stem not in punctuationMarks}
				# Each row overwrites the vector stored by a previous row of the same term.
				self.termWordvector[term] = wordvector
Exemplo n.º 2
0
 def __init__(self):
     """Set the input file name, create the helper objects and reset state."""
     # Name of the input data file.  Alternatives tried earlier:
     # 'relations2.dat', 'relations3.dat', '../ok-relations-phase1.dat'.
     self.inputFileName = 'eval-relations-phase2.dat'

     # Helper objects.
     self.relationReader = RelationReader()
     self.stopWorder = stopWorder.StopWorder()
     self.stemmer = porterStemmer.PorterStemmer()

     # Lookup tables, empty until populated elsewhere.
     self.articleMatches, self.articleToRelation = {}, {}

     # Numeric settings.
     self.magicMatchNumber = 1
     self.maxSentencesPerSummary = 6
Exemplo n.º 3
0
def stemming(a, b):
    """Tell whether two phrases are equal word-by-word after stemming.

    Both strings are split on whitespace.  The result is False when the
    word counts differ or when any pair of words at the same position has
    different stems; otherwise it is True.
    """
    porter = PorterStemmer.PorterStemmer()
    left, right = a.split(), b.split()
    if len(left) != len(right):
        return False

    def stem(word):
        # The stemmer takes the word plus the first and last index to use.
        return porter.stem(word, 0, len(word) - 1)

    return all(stem(x) == stem(y) for x, y in zip(left, right))
Exemplo n.º 4
0
def stemmer(line):
    """Return *line* lower-cased with every alphabetic run replaced by its stem.

    Consecutive alphabetic characters are collected into a word; when a
    non-alphabetic character is reached the word is stemmed and emitted,
    followed by that character (lower-cased).  A word that ends the line
    without a trailing non-alphabetic character is emitted as well, so
    input such as "running" is no longer silently dropped.
    """
    p = porterStemmer.PorterStemmer()
    pieces = []
    word = ''
    for c in line:
        if c.isalpha():
            word += c.lower()
            continue
        if word:
            pieces.append(p.stem(word, 0, len(word) - 1))
            word = ''
        pieces.append(c.lower())
    # Flush the word still pending when the line ends on a letter.
    if word:
        pieces.append(p.stem(word, 0, len(word) - 1))
    return ''.join(pieces)
Exemplo n.º 5
0
 def __init__(self):
     """Create the helper objects, reset counters and maps, open the output file."""
     # Text helpers.
     self.stopWorder = stopWorder.StopWorder()
     self.stemmer = porterStemmer.PorterStemmer()

     # Name of the input data file.  Alternatives tried earlier:
     # '../engine/relations.dat', 'relations2.dat', 'relations3.dat',
     # '../ok-relations-phase1.dat'.
     self.inputFileName = 'eval-relations-phase2.dat'
     self.relationReader = RelationReader()

     # Counters and maps, empty until populated elsewhere.
     self.relationCount = 0
     self.relations, self.index = {}, {}

     # Output file; it is opened for writing here and not closed in this method.
     self.relationFile = open('out-relationCountIndex.dat', 'w')
     self.magicMatchNumber = 1
Exemplo n.º 6
0
 def __init__(self, f, sexualList, violentList, adultList, sexualBase,
              violentBase, adultBase):
     """Store the arguments, zero the word counters and call setWords().

     f is kept as self.theFile; the three *List arguments are kept as
     sList/vList/aList and the three *Base arguments as sBase/vBase/aBase.
     """
     self.sList, self.vList, self.aList = sexualList, violentList, adultList
     self.theFile = f

     # All counters start at 0.0 (floats).
     self.totalWords = self.sexualWords = 0.0
     self.violentWords = self.adultContentWords = 0.0

     self.sBase, self.vBase, self.aBase = sexualBase, violentBase, adultBase
     self.stemmer = porterStemmer.PorterStemmer()

     # Called last, once every attribute above has been assigned.
     self.setWords()
Exemplo n.º 7
0
	def createDocuments(self, xmlfile):
		"""
		Return documents dictionary, pmid as key, a wordvector of title and abstract as value

		For every "PubMedArticle" element the title text (followed by ". ")
		and the abstract text are concatenated.  A missing or empty "Title"
		or "Abstract" element is ignored; articles with neither are skipped.
		An article without a "PMID" element still raises AttributeError.

		@param xmlfile path of the XML file that contains the PubMed articles
		@return documents dictionary mapping the PMID text to the value
		        returned by self.wordVector for the title and abstract
		"""
		stemmer = porterStemmer.PorterStemmer()
		articles = {}
		with open(xmlfile, 'rt') as f:
			root = ET.parse(f).getroot()
			for article in root.iter("PubMedArticle"):
				pmid = article.find("PMID").text
				titleAbstract = ""
				title = article.find("Title")
				# find() returns None for a missing element and .text is
				# None for an empty one; both cases are skipped.
				if title is not None and title.text:
					titleAbstract += title.text + ". "
				abstract = article.find("Abstract")
				if abstract is not None and abstract.text:
					titleAbstract += abstract.text
				if titleAbstract:
					articles[pmid] = titleAbstract
		# Vectorize after parsing so a repeated PMID is processed only once
		# (the last occurrence wins).
		documents = {}
		for pmid, titleAbstract in articles.items():
			documents[pmid] = self.wordVector(titleAbstract, stemmer, self.stopwords)
		return documents
Exemplo n.º 8
0
 def __init__(self, theConn):
     """Keep the connection, set its search path and create the stemmer.

     theConn must provide cursor(); the cursor is used once to execute
     "SET search_path TO filerater" and is closed even if that statement
     raises, in which case the exception propagates to the caller.
     """
     self.conn = theConn
     cur = theConn.cursor()
     try:
         cur.execute("SET search_path TO filerater")
     finally:
         # Release the cursor whether or not the statement succeeded.
         cur.close()
     self.stem = porterStemmer.PorterStemmer()
Exemplo n.º 9
0
## porterStemmerTest.py tests the accuracy of the Porter Stemmer
## Author: Robert Lowman
## Date: 12/8/16
import unittest
import porterStemmer

stemmer = porterStemmer.PorterStemmer()
# Stem every line of the test file and compare it with the line at the same
# position in the results file, reporting each mismatch.  The `with` block
# closes both files when the comparison finishes or fails.
with open('PorterStemmerTest.txt', 'r') as testFile, \
        open('PorterStemmerResults.txt', 'r') as resultFile:
    for line in testFile:
        theLine = line.strip()
        # readline() returns '' once the results file is exhausted.
        temp = resultFile.readline().strip()
        stem = stemmer.stripWord(theLine)
        if temp != stem:
            # Single-argument print(...) gives the same output on Python 2 and 3.
            print(theLine + " became " + stem + " after stemming, but "
                  + temp + " was expected.\n")