def indexDoc(self, doc):  # indexing a Document object
    '''Index a document using the simple SPIMI algorithm; no need to store
    blocks because the collection is small. Save/load the whole index instead.'''
    # Index only title and body, using helper functions defined in util.py.
    titletokens = word_tokenize(doc.subject)
    bodytokens = word_tokenize(doc.body)
    tokens = titletokens + bodytokens

    for position, token in enumerate(tokens):
        # Skip stop words (do not pop from the list while iterating,
        # otherwise the token that follows would be skipped).
        if isStopWord(token):
            continue
        # Stem the token and record its position in the document.
        stemmedtoken = stemming(token)
        if stemmedtoken in self.items:
            self.items[stemmedtoken].add(doc.docID, position, doc.class_name)
        else:
            item = IndexItem(stemmedtoken)
            item.add(doc.docID, position, doc.class_name)
            self.items[stemmedtoken] = item
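# A minimal sketch of the IndexItem / Posting classes the indexers in this section
# assume; the actual skeleton is not shown here, so the attribute names (term,
# posting, sortedp, positions) are assumptions inferred from how they are used
# above and below. The class_name parameter is accepted but unused in this sketch.
class Posting:
    def __init__(self, docID):
        self.docID = docID
        self.positions = []          # token positions of the term in this document

class IndexItem:
    def __init__(self, term):
        self.term = term
        self.posting = {}            # docID -> Posting
        self.sortedp = {}            # postings sorted by docID (filled by sort())

    def add(self, docID, pos, class_name=None):
        # Record one occurrence of the term at position pos in document docID.
        if docID not in self.posting:
            self.posting[docID] = Posting(docID)
        self.posting[docID].positions.append(pos)

    def sort(self):
        # Keep positions ascending and order the postings by docID.
        for p in self.posting.values():
            p.positions.sort()
        self.sortedp = dict(sorted(self.posting.items()))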
def preprocessing(self):
    '''Apply the same preprocessing steps used by indexing, plus the provided
    spelling corrector. Note that the spelling corrector should be applied
    before stopword removal and stemming (why?). Returns a list of terms.'''
    # Lower-case the query and remove numbers.
    self.raw_query = self.raw_query.lower()
    self.raw_query = re.sub(r'\d+', '', self.raw_query)
    # Tokenize on word characters, which also drops punctuation.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(self.raw_query)
    # Spell-correct first, then remove stop words and stem.
    self.words = []
    for token in tokens:
        token = norvig_spell.correction(token)
        if not util.isStopWord(token):
            self.words.append(util.stemming(token))
    return self.words
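# A minimal sketch of the util.py helpers (isStopWord, stemming) used throughout
# these snippets; the real util.py is not shown in this section, so this is an
# assumption based on the NLTK stopword list and Porter stemmer typically used.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

_stop_words = set(stopwords.words('english'))
_stemmer = PorterStemmer()

def isStopWord(word):
    # True if the lower-cased word is an English stop word.
    return word.lower() in _stop_words

def stemming(word):
    # Reduce the word to its Porter stem.
    return _stemmer.stem(word)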
def preprocessing(self, qid):
    '''Apply the same preprocessing steps used by indexing, plus the provided
    spelling corrector. Note that the spelling corrector should be applied
    before stopword removal and stemming (why?).'''
    qbody = self.raw_query.get(qid)
    # self.convertFromMap(qbody)  # self.docs
    print("Below is the query: ")
    # Some query IDs (e.g. 005) have no entry in query.text; accessing
    # qbody.text then raises, and the except block reports the missing ID.
    try:
        print(qbody.text)
        qbody = re.sub("[^a-z0-9]+", " ", str(qbody.text).lower())
    except Exception:
        print("Query ID which has no text: ", qid)
        raise
    tokens = nltk.tokenize.word_tokenize(qbody)
    corrected_tokens = [correction(word) for word in tokens]       # spell check first
    converted_tokens = [word.lower() for word in corrected_tokens]
    # Remove stop words and stem what remains.
    clean_query = []
    for word in converted_tokens:
        if not util.isStopWord(word):
            clean_query.append(util.stemming(word))
    if len(clean_query) > 0:
        self.query.append(clean_query)
    print("Query after spell check and removing the stop words: ", self.query)
def indexDoc(self, doc):  # indexing a Document object
    '''Index a document using the simple SPIMI algorithm; no need to store
    blocks because the collection is small. Save/load the whole index instead.'''
    # Tokenize the document body.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(doc.body)
    self.len_body = len(tokens)

    for posi, token in enumerate(tokens):
        # Skip stop words (checked before stemming; no popping while iterating).
        if isStopWord(token):
            continue
        stemmed_term = stemming(token)
        if stemmed_term in self.items:
            if doc.docID in self.items[stemmed_term]:
                # Same term seen again in the same document: append the position.
                self.items[stemmed_term][doc.docID].append(posi)
            else:
                # Same term in a new document: start a new postings entry.
                self.items[stemmed_term][doc.docID] = [posi]
        else:
            # New term: create its postings dictionary.
            self.items[stemmed_term] = {doc.docID: [posi]}
        # Normalized term frequency = occurrences of the term / document length.
        tf = len(self.items[stemmed_term][doc.docID]) / self.len_body
        self.term_freq.setdefault(stemmed_term, {})[doc.docID] = tf
def preprocessing(self):
    """Apply the same preprocessing steps used by indexing, plus the provided
    spelling corrector. Note that the spelling corrector should be applied
    before stopword removal and stemming (why?)."""
    print(self.raw_query)
    tokens = word_tokenize(self.raw_query)
    # Keep alphabetic tokens only (drops punctuation and numbers), lower-case
    # them, and run the norvig_spell corrector.
    alpha_tokens = [
        norvig_spell.correction(token.lower()) for token in tokens if token.isalpha()
    ]
    # Remove stop words, then stem.
    self.tokens = [
        util.stemming(token) for token in alpha_tokens if not util.isStopWord(token)
    ]
    return self.tokens
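# A minimal usage sketch for the query preprocessing above, assuming a
# QueryProcessor-style class whose constructor stores the raw query string in
# self.raw_query (the class name and constructor signature are assumptions).
qp = QueryProcessor("what similarity laws must be obeyed")
terms = qp.preprocessing()
print(terms)  # spell-corrected, stop words (e.g. "what", "be") removed, remaining terms stemmed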
def preprocessing(self):
    '''Apply the same preprocessing steps used by indexing, plus the provided
    spelling corrector. Note that the spelling corrector should be applied
    before stopword removal and stemming (why?). Stores a list of terms per query.'''
    for x in self.raw_query:
        # Lower-case and tokenize the query text.
        terms = util.query_lower(self.raw_query[x].text)
        # Spelling correction is done before stopword removal and stemming.
        terms = [norvig_spell.correction(t) for t in terms]
        # Remove stop words, then stem what remains.
        terms = [util.stemming(t) for t in terms if not util.isStopWord(t)]
        QueryProcessor.preprocessed_query[x] = terms
def preprocessing(self):
    '''Apply the same preprocessing steps used by indexing, plus the provided
    spelling corrector. Corrected words are stored in Query; stemmed
    non-stopwords are stored in Queryterm.'''
    # Tokenize the query and drop punctuation.
    tokenizer = RegexpTokenizer(r'\w+')
    querytokens = tokenizer.tokenize(self.raw_query)
    # Spell-correct each token, then filter stop words and stem.
    for token in querytokens:
        to_lower = ''.join(norvig_spell.words(token))  # lower-cases the token (list joined back to a string)
        spellcorrection = norvig_spell.correction(to_lower)
        Query.append(spellcorrection)
        if not isStopWord(spellcorrection):
            Queryterm.append(stemming(spellcorrection))
def preprocessing(self):
    tokenizer = RegexpTokenizer(r'\w+')
    querytokens = tokenizer.tokenize(self.raw_query)
    self.q_tf_dino = len(querytokens)  # query length, used as the tf denominator
    # Spell-correct every token first.
    querytokens = [norvig_spell.correction(token) for token in querytokens]
    # Remove stop words, then stem the remaining tokens (build a new list
    # instead of popping while iterating, which would skip tokens).
    querytokens = [stemming(token) for token in querytokens if not isStopWord(token)]
    print("Query tokens", querytokens)
    return querytokens
def test():
    '''Test your code thoroughly; put the test cases here.'''
    # Check that the tokenizer returns tokens without punctuation.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize("this is me checking , . (tokenization ' \" )/\\ ")
    print(tokens)
    # Check that isStopWord returns True only for real stop words.
    candidates = ['you', 'i', 'me', 'my', 'myself', 'bad', 'good']
    for word in candidates:
        if isStopWord(word):
            print("stopword", word)
        else:
            print("Not stopword", word)
    # Check that stemming reduces words to their root forms.
    stem = ['stemming', 'cars', 'experimental', 'coming']
    rootwords = [stemming(s) for s in stem]
    print("words post stemming")
    print(rootwords)
    print('Pass')
def preprocessing(self, qid):
    '''Apply the same preprocessing steps used by indexing, plus the provided
    spelling corrector. Note that the spelling corrector should be applied
    before stopword removal and stemming (why?).'''
    qbody = self.raw_query.get(qid)
    # Some query IDs have no text in query.text; the substitution below
    # raises an exception in that case.
    try:
        qbody = re.sub("[^a-z0-9]+", " ", str(qbody.text).lower())
    except Exception:
        print("Query ID which has no text: ", qid)
        raise
    reduced = nltk.tokenize.word_tokenize(qbody)
    correctedwords = [correction(word) for word in reduced]        # spell check first
    lowercasewords = [word.lower() for word in correctedwords]
    # Remove stop words and stem the remaining terms.
    notstopwords = []
    for word in lowercasewords:
        if not util.isStopWord(word):
            notstopwords.append(util.stemming(word))
    if len(notstopwords) > 0:
        self.query.append(notstopwords)
    print("1...", self.query)
def remove_stop_word(tok):
    # Return an empty string for stop words so callers can filter them out.
    return "" if util.isStopWord(tok) else tok
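# A minimal usage sketch for remove_stop_word: drop the empty strings it returns
# for stop words (the token list here is only illustrative).
tokens = ['the', 'experimental', 'investigation', 'of', 'a', 'boundary', 'layer']
content_terms = [t for t in (remove_stop_word(tok) for tok in tokens) if t]
print(content_terms)  # stop words such as 'the', 'of', 'a' are dropped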
import util

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

print(util.isStopWord("hel"))
print(util.stemming("Running"))

example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)
# Keep only the tokens that are not stop words.
filtered_sentence = [w for w in word_tokens if w not in stop_words]
print(word_tokens)
print(filtered_sentence)
def indexDoc(self, Pdoc):  # indexing a Document object
    '''Index a document using the simple SPIMI algorithm; no need to store
    blocks because the collection is small. Save/load the whole index instead.'''
    # Indexing the body only; helper functions come from util.py.
    # (1) Convert to lower case while tokenizing.
    Predictionary = {}
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(Pdoc.body)
    for tokenpos, token in enumerate(tokens):
        IndexItemobj = IndexItem(token.lower())
        if IndexItemobj.term not in Predictionary:
            # (2) Remove stopwords: a stop word is never added to Predictionary.
            if not isStopWord(IndexItemobj.term):
                # Store the token position for this docID in the term's posting.
                IndexItemobj.posting[int(Pdoc.docID)] = [tokenpos]
                Predictionary[IndexItemobj.term] = IndexItemobj.posting
        else:
            # Predictionary already contains the token: append the new position
            # rather than replacing the existing posting.
            docIDlist = Predictionary[IndexItemobj.term]
            if int(Pdoc.docID) not in docIDlist:
                docIDlist[int(Pdoc.docID)] = [tokenpos]
            else:
                docIDlist[int(Pdoc.docID)].append(tokenpos)

    # (3) Stemming: merge the postings of tokens that share the same stem.
    Stemdictionary = {}
    for keytoken, values in Predictionary.items():
        stem = stemming(keytoken)
        if stem not in Stemdictionary:
            Stemdictionary[stem] = values
        else:
            stemlist = Stemdictionary[stem]
            for valuekey, valueposition in values.items():
                merged = stemlist.setdefault(valuekey, [])
                merged.extend(valueposition)
                merged.sort()  # keep token positions sorted after merging
    Doclist.append(Stemdictionary)

    # Single-pass in-memory indexing: build the inverted index with SPIMI-INVERT.
    for termdata in Doclist:
        for token, posting in termdata.items():
            if token not in dictionary:
                # Add the term with its posting if the dictionary does not contain it.
                dictionary[token] = [posting]
            elif posting not in dictionary[token]:
                # Otherwise append the posting to the existing postings list.
                dictionary[token].append(posting)
def indexDoc(self, docs):  # indexing a Document object
    '''Index a document using the simple SPIMI algorithm; no need to store
    blocks because the collection is small. Save/load the whole index instead.'''
    # Index only title and body: (1) lower-case, (2) remove stopwords, (3) stem.
    t = docs.title.lower()
    b = docs.body.lower()
    self.nDocs = self.nDocs + 1
    # Remove numbers.
    t = re.sub(r'\d+', '', t)
    b = re.sub(r'\d+', '', b)
    # Tokenize title and body, dropping stop words.
    tokenizer = RegexpTokenizer(r'\w+')
    words = []
    for token in tokenizer.tokenize(t):
        if not util.isStopWord(token):
            words.append(token)
    for token in tokenizer.tokenize(b):
        if not util.isStopWord(token):
            words.append(token)
    # Stemming.
    word = [util.stemming(w) for w in words]
    # Store the terms of each document as a list, used later to compute the
    # unit vector of the document.
    self.dictionary[docs.docID] = []
    # Add each term to the index.
    for pos, term in enumerate(word):
        if term not in self.items:
            self.items[term] = IndexItem(term)      # one IndexItem object per term
        self.items[term].add(docs.docID, pos + 1)   # record document and position in the posting
        self.dictionary[docs.docID].append(term)    # record the term as appearing in this document
    self.dictionary[docs.docID] = set(self.dictionary[docs.docID])  # drop duplicate terms
    # Once every document has been indexed, compute the tf-idf weights.
    if self.nDocs == self.N:
        # Sort the index by term.
        self.index = OrderedDict(sorted(self.items.items()))
        # Document frequency of each term; also sort each posting list by docID.
        for term in self.index:
            self.df[term] = len(self.index[term].posting)
            self.index[term].sort()
        # Term weight per document: (1 + log10(tf)) * idf.
        for term in self.index:
            self.tf[term] = {}
            for docc in self.index[term].sortedp:
                positions = self.index[term].sortedp[docc].positions
                if len(positions) == 0:
                    self.tf[term][docc] = 0
                else:
                    self.tf[term][docc] = (1 + math.log10(len(positions))) * self.idf(term)
        # Document lengths: square root of the sum of squared term weights.
        sums = {}
        for d in self.dictionary:
            sums[d] = 0
            for term in self.dictionary[d]:
                sums[d] += math.pow(self.tf[term][d], 2)
            sums[d] = math.sqrt(sums[d])
        # Normalize each weight by the document length (cosine normalization)
        # and cache the idf of each term.
        for term in self.index:
            self.idfs[term] = self.idf(term)
            for docc in self.index[term].sortedp:
                if sums[docc] == 0:
                    self.tf[term][docc] = 0
                else:
                    self.tf[term][docc] = self.tf[term][docc] / sums[docc]
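# A minimal sketch of the idf helper the weighting above relies on; the actual
# self.idf is not shown in this section, so this is an assumption using the
# standard log10(N / df) formulation with the df dictionary built in indexDoc.
import math

def idf(self, term):
    # Inverse document frequency: log10(total number of documents / document frequency).
    df = self.df.get(term, 0)
    return math.log10(self.N / df) if df > 0 else 0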