def boolean_search(self, text):
    """Evaluate a one-term or two-term boolean query against the inverted index.

    text -- either "term" or "term AND|OR term"; terms are Porter-stemmed
            before lookup.
    Returns the matching postings concatenated into a single string
    (each posting rendered with str()), as the original contract required.
    Unknown terms now yield an empty posting list instead of raising KeyError.
    """
    stemmer = PorterStemmer.PorterStemmer()
    words = text.split()
    if len(words) == 1:  # was words.__len__() — use the idiomatic len()
        term = stemmer.stem(words[0], 0, len(words[0]) - 1)
        results = self._inverted_index.get(term, [])
    else:
        left_term = stemmer.stem(words[0], 0, len(words[0]) - 1)
        right_term = stemmer.stem(words[2], 0, len(words[2]) - 1)
        left = self._inverted_index.get(left_term, [])
        right = self._inverted_index.get(right_term, [])
        if words[1] == "AND":
            results = set(left) & set(right)
        elif words[1] == "OR":
            results = set(left).union(right)
        else:
            # unrecognized operator: original fell through with results == []
            results = []
    return "".join(str(posting) for posting in results)
def __init__(self, win, startTime, rootPath):
    """Initialise indexer state for a crawl rooted at rootPath.

    win       -- window/handle stored for later use
    startTime -- externally supplied start timestamp
    rootPath  -- root directory for the file scan
    """
    import HTMLParser  # Python 2 stdlib module; local import kept as-is
    self.win = win
    self.StartTime = startTime
    self.rootPath = rootPath
    # running counters updated during the crawl
    self.DocID = 0
    self.WordID = 0
    self.StemmedWordID = 0
    self.DirCount = 0
    self.FilesCount = 0
    self.WordCount = 0
    self.StemmedWordCount = 0
    self.ElapsedTime = ""
    self.ParseStatus = "Indexing in Progress..."
    self.KeyColumnNames = ""
    self.UseStemmer = False
    self.Stemmer = None
    #self.SetupTextCatDB()
    #DBFunctions.SetupTextCatTables(Globals.TextCatFileName)
    # create the SQLite index tables up front (side effect on the DB file)
    DBFunctions.SetupSqliteIndexTables(Globals.TextCatFileName)
    self.EventStart = time.time()
    # NOTE(review): Globals.Stemmer is presumably a user-selected option
    # string -- confirm against the settings/UI code.
    if Globals.Stemmer == "Porter Stemmer":
        self.Stemmer = PorterStemmer()
    self.FileScanStartTime = time.time()
    self.fout = None
def boolean_search(self, text):
    """Evaluate a boolean query: "a AND b", "a OR b", or a single term.

    Returns a postings list; AND/OR are delegated to self.computeAND /
    self.computeOR with the *unstemmed* operand words.
    """
    results = []
    count = 0
    PS = PorterStemmer.PorterStemmer()
    stemmed_Search =[]
    words = text.split()
    for word in words:
        stemmed_Search.append(PS.stem(word, 0, len(word)-1))
    #print(stemmed_Search)
    # NOTE(review): `count` is never incremented in this loop, so for a
    # query "a AND b" the operands are words[-1] and words[0] (i.e. b, a).
    # That happens to select both operands for a 3-token query, but looks
    # accidental -- confirm the intended operand order with computeAND/OR.
    for word in words:
        if(re.search('AND', word)):#if AND is in the query
            results+=self.computeAND(words[count-1], words[count])
            return results
        if(re.search('OR', word)):#if OR is in the query
            #print(words[count - 1])
            #print(words[count])
            results += self.computeOR(words[count-1], words[count])
            return results
    # single-term query: look up the stemmed first word directly
    if(self._inverted_index.__contains__(stemmed_Search[0])):#for the cases without and/or
        #print(stemmed_Search[0])
        #print(self._inverted_index['footbal'])
        results = self._inverted_index[stemmed_Search[0]]
    return results
def get_stems():
    """
    Returns the array of the filtered stems according to the conditions
    mentioned in the paper (stems longer than 2 chars, duplicates dropped).

    @return: stemarray

    Fixes vs. the original:
    - `word` was read before ever being assigned (NameError on the first
      alphabetic character); it is now initialised per line.
    - a word with no trailing delimiter was never flushed; it is now.
    - the file is closed via a context manager.
    """
    stemarray = []
    p = ps.PorterStemmer()
    with open("./part-of-speech.txt", 'r') as infile:
        for raw_line in infile:
            # keep only the token before the first tab, as before
            entry = raw_line.split('\t')[0]
            output = ''
            word = ''
            for c in entry:
                if c.isalpha():
                    word += c.lower()
                else:
                    if word:
                        output += p.stem(word, 0, len(word) - 1)
                        word = ''
                    output += c.lower()
            if word:  # flush a trailing word that had no delimiter after it
                output += p.stem(word, 0, len(word) - 1)
            # same filter as the original conditional-expression hack,
            # written as a plain statement
            if len(output) > 2 and output not in stemarray:
                stemarray.append(output)
    return stemarray
def stemming(self, tokens):
    """Return the Porter-stemmed form of every token, preserving order."""
    stemmer = PorterStemmer.PorterStemmer()
    return [stemmer.stem(tok, 0, len(tok) - 1) for tok in tokens]
def __init__(self, win, startTime): import HTMLParser self.win = win self.StartTime = startTime self.DocID = 0 self.WordID = 0 self.StemmedWordID = 0 self.DirCount = 0 self.FilesCount = 0 self.WordCount = 0 self.StemmedWordCount = 0 self.ElapsedTime = "" self.ParseStatus = "Indexing in Progress..." self.KeyColumnNames = "" self.UseStemmer = False self.Stemmer = None #self.SetupTextCatDB() DBFunctions.SetupTextCatTables(Globals.TextCatFileName) """ self.timerStatus = wx.Timer(id=wx.NewId(), owner=self) self.Bind(wx.EVT_TIMER, self.OnTimerStatusTimer, id=self.timerStatus.GetId()) """ self.EventStart = time.time() self.splitter = re.compile(r'\W*') #self.DigitWord = re.compile(r'[a-z]*\d+[a-z]*', re.I) if Globals.Stemmer == "Porter Stemmer": self.Stemmer = PorterStemmer() #self.UseStemmer = True self.htmlParser = HTMLParser.HTMLParser(self.Stemmer) self.textParser = TextParser.TextParser(self.Stemmer) """
def stemming(self, tokens):
    """Porter-stem each token and return the stemmed list.

    Fixes vs. the original:
    - the body was wrapped in `if __name__ == '__main__':`, so when the
      module was imported the guard was False and the final
      `return stemmed_tokens` raised UnboundLocalError.
    - the unused `files = self._documents` local is removed.
    - one stemmer instance is reused instead of constructing a new
      PorterStemmer for every token.
    """
    stemmer = PorterStemmer.PorterStemmer()
    stemmed_tokens = []
    for token in tokens:
        stemmed_tokens.append(stemmer.stem(token, 0, len(token) - 1))
    return stemmed_tokens
def stemming(self, tokens):
    """Stem every token with the Porter stemmer and return the new list."""
    porter = PorterStemmer.PorterStemmer()
    return [porter.stem(w, 0, len(w) - 1) for w in tokens]
def stemming(self, tokens):
    """Apply the Porter stemmer to each token; returns the stemmed list."""
    stemmer = PorterStemmer.PorterStemmer()
    output = []
    for token in tokens:
        output.append(stemmer.stem(token, 0, len(token) - 1))
    return output
def stemming(self, tokens):
    """Return a list with the Porter stem of every input token."""
    porter = PorterStemmer.PorterStemmer()
    return [porter.stem(t, 0, len(t) - 1) for t in tokens]
def __init__(self): f = formatter.NullFormatter( ) #formatter.AbstractFormatter(formatter.DumbWriter()) #htmllib.HTMLParser.__init__(self, f) sgmllib.SGMLParser.__init__(self, f) self.SqliteDB = SqliteDatabase(Globals.DBName) self.Stemmer = PorterStemmer() self.ReadStopWords('stopwords.txt') #self.textData = "" #self.BitMap = BitMap #self.WordFrequency = {} #self.splitter = re.compile(r'\W+', re.I) self.splitter = re.compile(r'\s+', re.I) #self.badWords = re.compile(r'.*\\*\/*_*\d+.*\\*\/*_*.*', re.I) self.DigitWord = re.compile(r'\b\d+\b', re.I) self.AlphaNumericWord = re.compile(r'\w+', re.I) #self.doubleSlashes = re.compile(r'\\*', re.I) self.tagType = "" self.REUTERSTOPICS = "" self.LEWISSPLIT = "" self.CGISPLIT = "" self.NEWID = "" self.DATE = "" self.MKNOTE = "" self.TOPICS = "" self.PLACES = "" self.UNKNOWN = "" self.AUTHOR = "" self.DATELINE = "" self.TITLE = "" self.TOPICS = "" self.PLACES = "" self.PEOPLE = "" self.ORGS = "" self.EXCHANGES = "" self.COMPANIES = "" self.TEXTTYPE = "" self.DateHandled = False self.InTagDate = False self.MknoteHandled = False self.InTagMknote = False self.InTagTitle = False self.InTagDateline = False self.InTagBody = False self.InTagTopics = False self.InTagPlaces = False self.InTagPeople = False self.InTagOrgs = False self.InTagExchanges = False self.InTagCompanies = False self.InTagAuthor = False self.InTagUnknown = False
def stemWords(list_of_tokens):
    """Lowercase every token; Porter-stem only the purely alphabetic ones.

    Tokens containing any non-alphabetic character are passed through
    lowercased but unstemmed.
    """
    porter = PorterStemmer.PorterStemmer()
    stemmed = []
    for token in list_of_tokens:
        lowered = token.lower()
        if token.isalpha():
            stemmed.append(porter.stem(lowered, 0, len(token) - 1))
        else:
            # non-alphabetic character present: no stemming
            stemmed.append(lowered)
    return stemmed
def Rocchio(self, invertedFile, documentsList, relevantDocs):
    """Rocchio relevance feedback (relevant-docs-only variant, Python 2).

    invertedFile  -- {term: {docId: ...}} postings map
    documentsList -- {docId: doc} where doc has "tfVector" and "IsRelevant"
    relevantDocs  -- iterable of relevant docIds
    Side effects: updates self.query in place and pickles it to
    'output_lucene_after_relevance_feedback.txt'.
    """
    p = PorterStemmer.PorterStemmer()
    weights = {}
    # initialise a zero weight for every (optionally stemmed) index term
    for term in invertedFile.iterkeys():
        sterm = term
        if STEM_IN_ROCCHIO:
            sterm = p.stem(term.lower(), 0, len(term) - 1)
        weights[ sterm] = 0.0 #initialize weight vector for each key in inverted file
    print ''
    relevantDocsTFWeights = {}
    # ------------------------------------- #
    # Compute relevantDocsTFWeights and nonrelevantDocsTFWeights vectors
    for docId in relevantDocs:
        doc = documentsList[docId]
        for term in doc["tfVector"]:
            sterm = term
            if STEM_IN_ROCCHIO:
                sterm = p.stem(term.lower(), 0, len(term) - 1)
            # accumulate term frequencies across all relevant documents
            if sterm in relevantDocsTFWeights:
                relevantDocsTFWeights[sterm] = relevantDocsTFWeights[
                    sterm] + doc["tfVector"][term]
            else:
                relevantDocsTFWeights[sterm] = doc["tfVector"][term]
    # ------------------------------------- #
    # Compute Rocchio vector
    for term in invertedFile.iterkeys():
        idf = math.log(
            float(len(documentsList)) / float(len(invertedFile[term].keys())), 10)
        sterm = term
        if STEM_IN_ROCCHIO:
            sterm = p.stem(term.lower(), 0, len(term) - 1)
        # Terms 2 and 3 of Rocchio algorithm
        # NOTE(review): relevantDocsTFWeights[sterm] raises KeyError when a
        # term occurs in a relevant doc's postings but not its tfVector
        # accumulation -- confirm inputs guarantee this cannot happen.
        for docId in invertedFile[term].iterkeys():
            if documentsList[docId]['IsRelevant'] == 1:
                # Term 2: Relevant documents weights normalized and given BETA weight
                weights[sterm] = weights[sterm] + constants.BETA * idf * (
                    relevantDocsTFWeights[sterm] / len(relevantDocs))
        # Term 1 of Rocchio, query terms
        # NOTE(review): the sibling Rocchio implementation scales the old
        # query by constants.ALPHA here; this one uses BETA -- suspected
        # copy/paste error, confirm against the intended formula.
        if term in self.query:
            self.query[term] = constants.BETA * self.query[term] + weights[
                sterm] #build new query vector of weights
        elif weights[sterm] > 0:
            self.query[term] = weights[sterm]
    with open('output_lucene_after_relevance_feedback.txt', 'w') as file:
        file.write(pickle.dumps(self.query))
def TokenStem(document):
    """Lowercase, stop-word-filter, strip non-alphanumerics, and Porter-stem
    each whitespace-separated word of document; returns the token list.

    Improvement: stopwords.words('english') was re-fetched (an expensive
    NLTK call returning a list) for every single word; it is now fetched
    once and held in a set for O(1) membership tests.
    """
    steemer = PorterStemmer()
    stop_set = set(stopwords.words('english'))  # hoisted out of the loop
    returner = []
    for word in document.lower().split(' '):
        if word not in stop_set:
            word = re.sub('[^A-Za-z0-9]+', '', word)
            word = steemer.stem(word, 0, len(word) - 1)
            returner.append(word)
    return returner
def clear(self, dataset, patter, replace=' ', join=' '):
    """Clean a text dataset: apply the regex substitution, lowercase,
    tokenise, drop stop words (self.lang), Porter-stem, and re-join.

    Returns the list of cleaned documents.

    Fixes vs. the original:
    - `corpus` was built and then silently discarded (no return statement);
      it is now returned. Callers that ignored the (None) result are
      unaffected.
    - the stemmer and the stop-word set were rebuilt for every row; both
      are now created once.
    """
    corpus = []
    ps = PorterStemmer()
    stop_set = set(stopwords.words(self.lang))
    for e in dataset.values:
        review = re.sub(patter, replace, e)
        tokens = review.lower().split()
        stems = [ps.stem(word) for word in tokens if not word in stop_set]
        corpus.append(join.join(stems))
    return corpus
def porter_stemmer(self, words_list):
    """Porter-stem the purely alphabetic words; pass others through unchanged."""
    stemmer = PorterStemmer()
    return [
        stemmer.stem(word, 0, len(word) - 1) if word.isalpha() else word
        for word in words_list
    ]
def stem_text(text):
    """Porter-stem every alphabetic run in text, lowercasing everything and
    keeping non-alphabetic characters in place.

    Improvement: the original unconditionally stemmed the final word buffer
    even when it was empty (text ending in a non-alphabetic character);
    the flush is now guarded, matching the in-loop flush.
    """
    stemmer = ps.PorterStemmer()
    output = ''
    word = ''
    for c in text:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += stemmer.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    if word:  # flush a trailing word that had no delimiter after it
        output += stemmer.stem(word, 0, len(word) - 1)
    return output
def content_terms_generate(self, content):
    """Build a {stemmed term: frequency} dict from content.

    The text is lowercased and whitespace-split, stop words
    (self.stop_words) are removed, and each surviving word is
    Porter-stemmed before counting.

    Fix vs. the original: stop words were removed with list.remove() while
    iterating the same list, which skips the element immediately following
    every removed word (and only removes the first occurrence). Filtering
    is now done with a comprehension, so *all* stop words are dropped and
    nothing is skipped.
    """
    stemmer = ps.PorterStemmer()
    term_list = content.strip().lower().split()
    kept = [word for word in term_list if word not in self.stop_words]
    content_dict = dict()
    for term in kept:
        stemmed = stemmer.stem(term, 0, len(term) - 1)
        content_dict[stemmed] = content_dict.get(stemmed, 0) + 1
    return content_dict
def stemm(line):
    """Porter-stem each alphabetic run in line.

    Returns the stems joined by single spaces, with a trailing space after
    the last stem; all non-alphabetic characters are dropped.
    """
    stemmer = PorterStemmer()
    line += " "  # sentinel delimiter so the final word is always flushed
    pieces = []
    buf = ''
    for ch in line:
        if ch.isalpha():
            buf += ch.lower()
        else:
            if buf:
                pieces.append(stemmer.stem(buf, 0, len(buf) - 1))
                pieces.append(" ")
                buf = ''
    return "".join(pieces)
def add_page_to_index_re(index, url, content):
    """Index the words of an HTML page found at url.

    Extracts text between tags with a regex, splits it on punctuation, and
    adds each word longer than 2 characters to the index via add_to_index.
    Always returns 0 (the counter `i` is never incremented -- preserved for
    interface compatibility).

    Fix vs. the original: the length filter was `if word > 2`, a
    string-to-int comparison (TypeError on Python 3, always True on
    Python 2); it is now `len(word) > 2` as clearly intended.
    """
    i = 0
    # it is not a good idea to use regular expression to parse html;
    # in practice use a DOM parser (kept from the original author's note)
    regex = re.compile(r'(?<!script)[>](?![\s\#\'-<]).+?[<]')
    p = PorterStemmer.PorterStemmer()  # kept for the disabled stemming below
    for words in regex.findall(content):
        word_list = split_string(words, """ ,"!-.()<>[]{};:?!-=`&""")
        for word in word_list:
            #word = stem(word,p)
            if len(word) > 2:
                add_to_index(index, word, url)
    return i
def removeCommonPluralsN7(fileName):
    """Prune the global FileList[fileName] lexical-criteria examples
    (Python 2).

    1. Drops N7 examples whose word is a common always-plural noun.
    2. If exactly one N7 example remains and it is a second-exemplar word,
       drops it too.
    3. For every lexical criterion, removes later examples whose Porter
       stem duplicates an earlier example's stem.
    """
    try:
        #remove plurals that are normally used in plural form
        #clothes, lots, pants
        #Also, we need to look at plurals that have suffixes
        CommonPlurals=["CLOTHES", "LOTS", "PANTS", "SCISSORS","SHORTS", "TROUSERS","TONGS","PLIERS","GLASSES","STAIRS"]
        SecondExamplar=["BLOCKS", "GRAPES", "SHOES"]
        LexicalCriteria=["N1","N2","N3","N4","N5","N6","N7","N8","N9","N10","N11","V1","V2","V8","V10","V11","V14","V17","Q1","Q2","Q4","Q8","Q9","S5","S10"]
        ps=PorterStemmer.PorterStemmer()
        examplesToDelete=[]
        # collect, then remove, so we never mutate the list while scanning it
        for i in range(len(FileList[fileName]["N7"])):
            if FileList[fileName]["N7"][i][1] in CommonPlurals:
                examplesToDelete.append(FileList[fileName]["N7"][i])
        for i in examplesToDelete:
            FileList[fileName]["N7"].remove(i)
        if len(FileList[fileName]["N7"])==1:
            if FileList[fileName]["N7"][0][1] in SecondExamplar:
                FileList[fileName]["N7"].pop(0)
        for lc in LexicalCriteria:
            l2=[]  # stems of examples kept so far for this criterion
            examplesToDelete=[]
            for i in range(len(FileList[fileName][lc])):
                present=False
                for l1 in l2:
                    if ps.stem(l1.lower(),0,len(l1)-1)==ps.stem(FileList[fileName][lc][i][1].lower(),0,len(FileList[fileName][lc][i][1].lower())-1):
                        present=True
                        break
                if present==True:
                    examplesToDelete.append(FileList[fileName][lc][i])
                else:
                    l2.append(FileList[fileName][lc][i][1])
            # NOTE(review): the reverse()/remove()/reverse() dance makes
            # remove() match the *last* equal element -- presumably
            # intentional; confirm.
            FileList[fileName][lc].reverse()
            for l1 in examplesToDelete:
                # NOTE(review): the bare `FileList` below is a no-op
                # expression statement -- almost certainly a stray leftover.
                FileList
                FileList[fileName][lc].remove(l1)
            FileList[fileName][lc].reverse()
        return
    except:
        # NOTE(review): bare except hides all errors, including typos in
        # the code above -- consider narrowing.
        print "Error occured while trying to remove plurals for N7"
def text_snippet(self, terms, start, length):
    """
    Return a snippet from pos start to end with highlighted terms
    start - the "word" position (as opposed to characater position)
    length - how many words to include

    The query terms are Porter-stemmed and each stem's occurrences in the
    snippet are wrapped in ANSI blue escape codes.

    Improvement: a new PorterStemmer was constructed inside the highlight
    loop (once per term); a single instance is now created before the loop.
    """
    start_found = False
    new_start = 0
    new_end = 0
    pos = start
    # walk the words once to locate the snippet window boundaries
    for term in self.text.split(" "):
        pos = pos - 1
        if not start_found:
            new_start = new_start + 1
        else:
            new_end = new_end + 1
        if not start_found and pos <= 0:
            pos = length
            start_found = True
        elif pos <= 0:
            break
    new_end = new_start + new_end
    snippet = " ".join(self.text.split(" ")[new_start:new_end])
    p = PorterStemmer.PorterStemmer()  # hoisted out of the loop below
    for term in terms:
        term = p.stem(term, 0,len(term)-1)
        # match the stem preceded by a separator and highlight through the
        # end of the word (case-insensitive)
        snippet = re.sub('(?i)([\s.,=?!:@<>()\"-;\'&_\\{\\}\\|\\[\\]\\\\]' + \
                         re.escape(term) + \
                         "[^\s.,=?!:@<>()\"-;\'&_\\{\\}\\|\\[\\]\\\\]*)",
                         '\033[94m\\1\033[0m', snippet)
    return snippet
def __init__(self): f = formatter.NullFormatter( ) #formatter.AbstractFormatter(formatter.DumbWriter()) #htmllib.HTMLParser.__init__(self, f) sgmllib.SGMLParser.__init__(self, f) self.SqliteDB = SqliteDatabase(Globals.DBName) self.Stemmer = PorterStemmer() #self.textData = "" #self.BitMap = BitMap #self.WordFrequency = {} self.splitter = re.compile(r'\W+', re.I) #self.splitter = re.compile(r'\s+', re.I) #self.badWords = re.compile(r'.*\\*\/*_*\d+.*\\*\/*_*.*', re.I) #self.DigitWord = re.compile(r'\b\d+\b', re.I) self.DigitWord = re.compile(r'[a-z]*\d+[a-z]*', re.I) self.AlphaNumericWord = re.compile(r'[a-z]*\W+[a-z]*', re.I) self.AlphabeticWord = re.compile(r'[a-z]+') #self.doubleSlashes = re.compile(r'\\*', re.I) self.BodyData = ""
from PorterStemmer import * import re import math vocab = [] #This is an array for the unique vocabulary. p = PorterStemmer() #Here the porter stemmer is initialized. number_of_total_senses = 0 #This variable keeps the total number of senses. senses = { } #This dictionary is to keep the number of existance of all different senses. training_set = {} #This is going to be a nested dictionary. sense_word = {} cosine_similarity_dict = {} unique_vocab = {} count = 0 result_text = "" word1 = "" #The first word to be compared. word2 = "" #The second word to be compared. total_document = "" #This is to keep all the sentences under one example. #Here the output file işs created. def print_out(line): output_file = open("output.txt", "a") output_file.write(line) output_file.close() return #This function is used to use stemmer. def stemm(line, words): element = ''
import sys import PorterStemmer p = PorterStemmer.PorterStemmer() import math import numpy as np output = "" terms = {} docs = {} import json stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "not", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"] queries = [] with open ("./cran_qry.txt", 'r') as infile: while 1: word = '' line = infile.readline() if '.I' in line:
def stemming(self, tokens):
    """Porter-stem every token; returns a new list in the original order."""
    stemmer = PorterStemmer.PorterStemmer()
    stemmed_tokens = []
    for word in tokens:
        stemmed_tokens.append(stemmer.stem(word, 0, len(word) - 1))
    return stemmed_tokens
def main():
    """Stem the single word given on the command line and print the result.

    Fix: the classic PorterStemmer exposes stem(word, start, end) -- the
    three-argument form used everywhere else in this codebase. The original
    one-argument call ps.stem(argv[1]) raises TypeError against that
    implementation.
    """
    if len(argv) < 2:
        exit('Usage: ./stemWord word')
    stemmer = PorterStemmer.PorterStemmer()
    word = argv[1]
    print(stemmer.stem(word, 0, len(word) - 1))
def stem(terms):
    """Return the Porter stem of every term, in order."""
    stemmer = PorterStemmer.PorterStemmer()
    stemmed = []
    for term in terms:
        stemmed.append(stemmer.stem(term, 0, len(term) - 1))
    return stemmed
stopWordSet.add(str.strip(line)) # extract docID, title, and text from the XML xmlparse = XMLParser(collection) parsedPages = xmlparse.parseCollection() # maps from terms to smaller dictionaries bigDict = dict() # maps from terms to idf idfDict = dict() # maps from docID to euclidian normalization factor normalizationDict = dict() pstemmer = PorterStemmer.PorterStemmer() # the number of documents N = float(len(parsedPages)) # set up the inverted index for p in parsedPages: docID = p._id # write the title index titleIndex.write(str(p._id) + "\t" + p._title + "\n") # split the terms into a list listOfTerms = re.split('[^a-z0-9]+', (p._title + "\n" + p._text).lower()) if listOfTerms[0] == '':
def Rocchio(self, invertedFile, documentsList, relevantDocs, nonrelevantDocs):
    """Rocchio relevance feedback with relevant and non-relevant document
    sets (Python 2).

    invertedFile    -- {term: {docId: ...}} postings map
    documentsList   -- {docId: doc} where doc has "tfVector" and "IsRelevant"
    relevantDocs    -- iterable of relevant docIds
    nonrelevantDocs -- iterable of non-relevant docIds
    Updates self.query in place (ALPHA * old + BETA * relevant
    - GAMMA * nonrelevant, IDF-scaled) and returns it.
    """
    p = PorterStemmer.PorterStemmer()
    weights = {}
    # initialise a zero weight for every (optionally stemmed) index term
    for term in invertedFile.iterkeys():
        sterm = term
        if constants.STEM_IN_ROCCHIO:
            sterm = p.stem(term.lower(), 0,len(term)-1)
        weights[sterm] = 0.0 #initialize weight vector for each key in inverted file
    print ''
    relevantDocsTFWeights = {}
    nonrelevantDocsTFWeights = {}
    # ------------------------------------- #
    # Compute relevantDocsTFWeights and nonrelevantDocsTFWeights vectors
    for docId in relevantDocs:
        doc = documentsList[docId]
        for term in doc["tfVector"]:
            sterm = term
            if constants.STEM_IN_ROCCHIO:
                sterm = p.stem(term.lower(), 0,len(term)-1)
            # combining frequencies from individual document's tfVectors
            if sterm in relevantDocsTFWeights:
                relevantDocsTFWeights[sterm] = relevantDocsTFWeights[sterm] + doc["tfVector"][term]
            else:
                relevantDocsTFWeights[sterm] = doc["tfVector"][term]
    for docId in nonrelevantDocs:
        doc = documentsList[docId]
        for term in doc["tfVector"]:
            sterm = term
            if constants.STEM_IN_ROCCHIO:
                sterm = p.stem(term.lower(), 0,len(term)-1)
            if sterm in nonrelevantDocsTFWeights:
                nonrelevantDocsTFWeights[sterm] = nonrelevantDocsTFWeights[sterm] + doc["tfVector"][term]
            else:
                nonrelevantDocsTFWeights[sterm] = doc["tfVector"][term]
    # ------------------------------------- #
    # Compute Rocchio vector
    for term in invertedFile.iterkeys():
        idf = math.log(float(len(documentsList)) / float(len(invertedFile[term].keys())), 10)
        sterm = term
        if constants.STEM_IN_ROCCHIO:
            sterm = p.stem(term.lower(), 0,len(term)-1)
        # Terms 2 and 3 of Rocchio algorithm
        # NOTE(review): the TF-weight lookups below raise KeyError if a term
        # appears in a document's postings but was never accumulated above --
        # confirm the inputs make this impossible.
        for docId in invertedFile[term].iterkeys():
            if documentsList[docId]['IsRelevant'] == 1:
                # Term 2: Relevant documents weights normalized and given BETA weight
                weights[sterm] = weights[sterm] + constants.BETA * idf * (float(relevantDocsTFWeights[sterm]) / len(relevantDocs))
            else:
                # Term 3: NonRelevant documents weights normalized and given GAMMA weight
                weights[sterm] = weights[sterm] - constants.GAMMA * idf * (nonrelevantDocsTFWeights[sterm]/len(nonrelevantDocs))
        # Term 1 of Rocchio: fold the old query weight in with ALPHA
        if term in self.query:
            self.query[term] = constants.ALPHA * self.query[term] + weights[sterm] #build new query vector of weights
        elif weights[sterm] > 0:
            self.query[term] = weights[sterm]
    return self.query