def preprocessing(self):
    '''apply the same preprocessing steps used by indexing, also use the
    provided spelling corrector. Note that the spelling corrector should be
    applied before stopword removal and stemming (why?)'''
    # ToDo: return a list of terms
    # lower-case the query
    self.raw_query = self.raw_query.lower()
    # eliminate numbers
    self.raw_query = re.sub(r'\d+', '', self.raw_query)
    # tokenize
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(self.raw_query)
    self.words = []
    # spell correction, stop word removal, stemming
    for token in tokens:
        token = norvig_spell.correction(token)
        if not util.isStopWord(token):
            self.words.append(util.stemming(token))
    return self.words
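# Why the spelling corrector runs before stopword removal and stemming: a
# misspelled stopword such as "teh" is only matched by the stopword list after
# it has been corrected to "the", and the corrector's dictionary holds full
# words, so already-stemmed tokens would not be corrected reliably. A minimal
# illustrative sketch with a toy corrector and stopword set (hypothetical
# helpers, not the assignment's norvig_spell or util functions):
def _toy_correction(token):
    return {'teh': 'the', 'experimentt': 'experiment'}.get(token, token)

def _toy_preprocess(raw_query, stopwords=frozenset({'the', 'of'})):
    tokens = [t.lower() for t in raw_query.split()]
    corrected = [_toy_correction(t) for t in tokens]      # correct spelling first
    return [t for t in corrected if t not in stopwords]   # then remove stopwords

# _toy_preprocess("teh experimentt of flow") == ['experiment', 'flow']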
def preprocessing(self):
    '''apply the same preprocessing steps used by indexing, also use the
    provided spelling corrector. Note that the spelling corrector should be
    applied before stopword removal and stemming (why?)'''
    # ToDo: return a list of terms
    ps = PorterStemmer()
    # read the stopword list once instead of re-opening the file per token
    with open("stopwords") as f:
        stopwords = set(line.strip() for line in f)
    for q in self.raw_query:
        # tokenize and lower-case the query text
        query_tokens = word_tokenize(self.raw_query[q].text)
        query_tokens = [token.lower() for token in query_tokens]
        # spelling correction comes before stopword removal and stemming
        query_tokens = [correction(token) for token in query_tokens]
        # stem, then drop any token that appears in the stopword list
        query_tokens = [ps.stem(token) for token in query_tokens]
        query_tokens = [token for token in query_tokens if token not in stopwords]
        self.preprocessed_query_tokens[q] = query_tokens
def preprocessing(self):
    '''apply the same preprocessing steps used by indexing, also use the
    provided spelling corrector. Note that the spelling corrector should be
    applied before stopword removal and stemming (why?)'''
    # ToDo: return a list of terms
    # Tokenize and lowercase the query into a list of tokens
    token_list = util.tokenize_doc(self.raw_query)

    # Helper function that replaces stopwords with an empty string so token
    # positions are preserved
    def remove_stop_word(tok):
        return "" if util.isStopWord(tok) else tok

    # Correct the spelling of each token
    tokens_corrected_spell = [correction(tok) for tok in token_list]
    # Blank out the stopwords
    token_list_no_stopword = [remove_stop_word(tok) for tok in tokens_corrected_spell]
    # Stem the remaining tokens
    stemmed_token_list = [util.stemming(tok) for tok in token_list_no_stopword]
    return stemmed_token_list
def preprocessing(self, qid):
    '''apply the same preprocessing steps used by indexing, also use the
    provided spelling corrector. Note that the spelling corrector should be
    applied before stopword removal and stemming (why?)'''
    qbody = self.raw_query.get(qid)
    print("Below is the query: ")
    print(qbody.text)
    # Some query IDs (e.g. 005) have no text in query.text, so the regex below
    # may raise; report the offending query ID before re-raising.
    try:
        qbody = re.sub("[^a-z0-9]+", " ", str(qbody.text))
    except Exception:
        print("Query ID which is not having text: ", qid)
        raise
    tokens = nltk.tokenize.word_tokenize(qbody)
    corrected_tokens = [correction(word) for word in tokens]  # spell check
    converted_tokens = [word.lower() for word in corrected_tokens]
    # remove stop words and stem the remaining terms
    clean_query = []
    for word in converted_tokens:
        if not util.isStopWord(word):
            clean_query.append(util.stemming(word))
    if len(clean_query) > 0:
        self.query.append(clean_query)
    print("Query after spell check and removing the stop words: ", self.query)
def tokenize_text_for_q(self, doc):
    tokenizer = RegexpTokenizer(r'\w+')
    list_token = tokenizer.tokenize(doc)
    # Because of the limited size of our corpus, spelling correction gives a
    # slight boost. With a larger corpus you would not do this, especially
    # given the simplicity of the spelling corrector.
    list_token = [
        correction(word.lower()) if word not in self.known_words else word.lower()
        for word in list_token
    ]
    return list_token
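# The membership test against self.known_words skips the (slow) corrector for
# words the corpus already contains. A minimal sketch of how such a vocabulary
# set could be built, assuming a hypothetical iterable of corpus documents
# `corpus_texts` (this helper is not part of the original class):
from nltk.tokenize import RegexpTokenizer

def build_known_words(corpus_texts):
    tokenizer = RegexpTokenizer(r'\w+')
    known = set()
    for text in corpus_texts:
        known.update(word.lower() for word in tokenizer.tokenize(text))
    return known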
def preprocessing(self): """ apply the same preprocessing steps used by indexing, also use the provided spelling corrector. Note that spelling corrector should be applied before stopword removal and stemming (why?)""" print(self.raw_query) tokens = word_tokenize(self.raw_query) alpha_tokens = [ norvig_spell.correction(token) for token in tokens if token.isalpha() ] # tokenizing the query,norvig_spell check and removing punctuations self.tokens = [ util.stemming(token.lower()) for token in alpha_tokens if not util.isStopWord(token) ] # remove stopwords return self.tokens
def preprocessing(self):
    '''apply the same preprocessing steps used by indexing, also use the
    provided spelling corrector. Note that the spelling corrector should be
    applied before stopword removal and stemming (why?)'''
    # ToDo: return a list of terms
    for x in self.raw_query:
        lower_case = util.query_lower(self.raw_query[x].text)
        # spelling correction is done before stemming and stop word removal
        lower_case = [norvig_spell.correction(y) for y in lower_case]
        lower_case = [util.stemming(y) for y in lower_case]
        # drop any terms that are still stop words after stemming
        lower_case = [term for term in lower_case if not util.isStopWord(term)]
        QueryProcessor.preprocessed_query[x] = lower_case
def preprocessing(self):
    '''apply the same preprocessing steps used by indexing, also use the
    provided spelling corrector'''
    # ToDo: return a list of terms
    # tokenize the query and strip punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    querytoken = tokenizer.tokenize(self.raw_query)
    # check each query token for spelling errors, correct it, and store the
    # corrected words in Query
    for token in querytoken:
        to_lower = ''.join(norvig_spell.words(token))  # convert token list to string
        spellcorrection = norvig_spell.correction(to_lower)
        Query.append(spellcorrection)
        if not isStopWord(spellcorrection):
            stemqueryterm = stemming(spellcorrection)
            Queryterm.append(stemqueryterm)
def preprocessing(self):
    tokenizer = RegexpTokenizer(r'\w+')
    querytokens = tokenizer.tokenize(self.raw_query)
    self.q_tf_dino = len(querytokens)  # length of the query
    # spell-correct each token
    for counter, querytoken in enumerate(querytokens):
        querytokens[counter] = norvig_spell.correction(querytoken)
    # stem each token and drop stop words; build a new list rather than
    # popping in place, which would skip elements during iteration
    stemmed_tokens = []
    for querytoken in querytokens:
        if not isStopWord(querytoken):
            stemmed_tokens.append(stemming(querytoken))
    querytokens = stemmed_tokens
    print("Query tokens", querytokens)
    return querytokens
def preprocessing(self):
    '''apply the same preprocessing steps used by indexing, also use the
    provided spelling corrector. Note that the spelling corrector should be
    applied before stopword removal and stemming (why?)'''
    # ToDo: return a list of terms
    corrected_terms_list = []
    for term in self.raw_query.split(' '):  # split on whitespace
        corrected_terms_list.append(norvig_spell.correction(term))
    # since the Cranfield dataset was used for spelling correction, the
    # spurious term 'gw' can appear; remove it here if present
    try:
        corrected_terms_list.remove('gw')
    except ValueError:
        pass
    corrected_terms_text = ' '.join(corrected_terms_list)
    terms = util.splitDoc(corrected_terms_text)
    return terms  # list of terms
def preprocessing(self, qid):
    '''apply the same preprocessing steps used by indexing, also use the
    provided spelling corrector. Note that the spelling corrector should be
    applied before stopword removal and stemming (why?)'''
    # ToDo: return a list of terms
    qbody = self.raw_query.get(qid)
    cqObj = CranFile('query.text')
    # Some query IDs have no text in query.text, so the regex below may raise;
    # report the offending query ID before re-raising.
    try:
        qbody = re.sub("[^a-z0-9]+", " ", str(qbody.text))
    except Exception:
        print("Query ID which is not having text: ", qid)
        raise
    reduced = nltk.tokenize.word_tokenize(qbody)
    correctedwords = [correction(word) for word in reduced]
    lowercasewords = [word.lower() for word in correctedwords]
    # drop stop words and stem the remaining terms
    notstopwords = []
    for word in lowercasewords:
        if not util.isStopWord(word):
            notstopwords.append(util.stemming(word))
    if len(notstopwords) > 0:
        self.query.append(notstopwords)
    print("1...", self.query)
def correct(self, sentence, position):
    word = sentence[position]
    import norvig_spell
    return norvig_spell.correction(word)
def spellcheck(word):
    # `word` is expected to be an iterable of tokens; returns the
    # spell-corrected token list
    spellcheckedwords = []
    for x in word:
        spellcheckedwords.append(correction(x))
    return spellcheckedwords
def vectorQuery(self, k):
    '''vector query processing, using the cosine similarity.'''
    # ToDo: return top k pairs of (docID, similarity), ranked by their cosine
    # similarity with the query in descending order.
    # Term frequency or TF-IDF can be used to construct the vectors.
    vectorResult = []
    cf = CranFile('cran.all')
    documentVector = {}
    queryVector = {}
    ps = PorterStemmer()
    # load the stopword list once instead of re-opening the file per token
    with open("stopwords") as f:
        stopwords = set(line.strip() for line in f)
    for q in self.raw_query:
        if q != self.queryId:
            continue
        # preprocess the query: tokenize, lower-case, spell-correct, stem,
        # and remove stop words
        query_tokens = word_tokenize(self.raw_query[q].text)
        query_tokens = [token.lower() for token in query_tokens]
        query_tokens = [correction(token) for token in query_tokens]
        query_tokens = [ps.stem(token) for token in query_tokens]
        query_tokens = [token for token in query_tokens if token not in stopwords]
        # build the query vector: tf-idf weight for terms found in the index,
        # zero otherwise
        for token in query_tokens:
            if token in self.index.items:
                wordfreq = query_tokens.count(token)
                queryVector[token] = self.index.items[token].get('idf') * (1 + math.log(wordfreq, 10))
            else:
                queryVector[token] = 0
        docidScorepair = {}
        for doc in cf.docs:
            # build the document vector from the title and body tokens
            tokens = word_tokenize(doc.title) + word_tokenize(doc.body)
            tokens = [token.lower() for token in tokens]
            tokens = [token for token in tokens if token not in stopwords]
            tokens = [ps.stem(token) for token in tokens]
            for token in tokens:
                if token in self.index.items:
                    tf = self.index.items[token].get('posting').get(doc.docID).get('termfreq')
                    documentVector[token] = (1 + math.log(tf, 10)) * self.index.items[token].get('idf')
                else:
                    documentVector[token] = 0
            # normalize the document vector; the query vector is left
            # unnormalized, which does not change the ranking for a fixed query
            sumofsquaresdocument = sum(w * w for w in documentVector.values())
            try:
                norm = 1 / math.sqrt(sumofsquaresdocument)
            except ZeroDivisionError:
                norm = 0
            for term in documentVector:
                documentVector[term] = documentVector[term] * norm
            # dot product between the query vector and the document vector
            cosineVector = {}
            for term in queryVector:
                cosineVector[term] = documentVector.get(term, 0) * queryVector[term]
            # document score
            docidScorepair[doc.docID] = sum(cosineVector.values())
            documentVector = {}
        self.intermediateResultVectorQuery[q] = docidScorepair
        queryVector = {}
        # rank documents by score and keep the top k
        counterObject = Counter(self.intermediateResultVectorQuery[q])
        high = counterObject.most_common(k)
        if k == 3:
            print(high)
        vectorResult = [i[0] for i in counterObject.most_common(k)]
    return vectorResult
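# vectorQuery above normalizes only the document vector. For reference, a
# minimal sketch of full cosine similarity between two sparse term-weight
# dictionaries (a standalone helper with illustrative inputs, not part of the
# original class):
import math

def cosine_similarity(query_vec, doc_vec):
    # dot product over the terms the two vectors share
    dot = sum(w * doc_vec.get(term, 0.0) for term, w in query_vec.items())
    q_norm = math.sqrt(sum(w * w for w in query_vec.values()))
    d_norm = math.sqrt(sum(w * w for w in doc_vec.values()))
    if q_norm == 0 or d_norm == 0:
        return 0.0
    return dot / (q_norm * d_norm)

# cosine_similarity({'flow': 1.2, 'shear': 0.8}, {'flow': 0.5, 'wing': 0.3})
# -> approximately 0.71; dividing by the constant query norm does not change
# the ranking of documents for a fixed query.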
def spell_correction(self, list_token):
    # spell-correct every token in the list
    return [correction(item) for item in list_token]