def printDocsHelper(fileT, k):
    """Stem one token, echo it, and record it under document key ``k``.

    Tokens present in the module-level ``stopwords`` collection are ignored.
    Any other token is stemmed, stripped of all whitespace and printed; if
    the resulting stem is longer than one character and is not itself a
    stop word it is appended to ``newDict[k]``.

    Args:
        fileT: Raw token to process.
        k: Key into the module-level ``newDict``. ``newDict[k]`` must
            already support ``append`` (presumably a list or a
            ``defaultdict(list)`` -- confirm against the caller).
    """
    if fileT in stopwords:
        return

    # Classic Porter interface: stem(word, first_index, last_index).
    stemmed = PorterStemmer().stem(fileT, 0, len(fileT) - 1)
    stemmed = re.sub(r'\s', '', stemmed)

    # Call form behaves the same on Python 2 (single argument) and Python 3.
    # NOTE(review): the original layout was lost; the echo is assumed to
    # apply only to non-stop-word tokens -- confirm.
    print(stemmed)

    if len(stemmed) > 1 and stemmed not in stopwords:
        newDict[k].append(stemmed)
def process(self):
    """
    Read the tab-separated input file and normalise its text.

    For every ``key<TAB>text`` line this performs:
    - punctuation and digit removal
    - lower-casing and tokenization on single spaces
    - stop word removal
    - stemming

    Lines that do not split into exactly two tab-separated fields are
    skipped.

    Returns:
        A dict mapping each stripped key to its processed text when
        ``self.writerFlag`` is false. When it is true the processed lines
        (prefixed with the upper-cased key) are written to ``self.oFile``
        and ``None`` is returned. ``None`` is also returned after a file
        error has been printed.
    """
    try:
        with open(self.stopwordFile, "r") as stop_file:
            # Whole-word set. Testing membership against the raw file text
            # is a substring match and would drop words such as "he" just
            # because "the" is listed.
            # assumes stop words are separated by whitespace and/or commas
            # -- TODO confirm against the stop-word file format.
            stop_words = set(stop_file.read().replace(",", " ").split())
    except (OSError, IOError) as e:
        print("Wrong stopwords file name or file path", e)
        return None

    stemmer = PorterStemmer()
    data_dic = {}
    # One table that deletes punctuation and digits in a single pass.
    strip_table = str.maketrans('', '', string.punctuation + "0123456789")
    out_file = None
    try:
        if self.writerFlag:
            out_file = open(self.oFile, "w")
        try:
            with open(self.iFile) as in_file:
                for line in in_file:
                    try:
                        key, val = line.split("\t")
                    except ValueError:
                        continue
                    tokens = val.translate(strip_table).lower().strip().split(" ")
                    # Each kept stem is prefixed with one space, matching
                    # the historical output format exactly.
                    stems = "".join(
                        " " + stemmer.stem(token)
                        for token in tokens
                        if token.strip() and token.strip() not in stop_words
                    )
                    if self.writerFlag:
                        out_file.write(" %s \t%s \n" % (key.upper(), stems))
                    else:
                        data_dic[key.strip()] = stems.strip()
        finally:
            # Close the output file even when reading the input fails.
            if out_file is not None:
                out_file.close()
    except (OSError, IOError) as e:
        print("Wrong input file name or file path", e)
        return None

    return None if self.writerFlag else data_dic
def printDocsHelper(fileT, k):
    """Stem one token and record it in its per-word file.

    Tokens present in the module-level ``stopwords`` collection, and stems
    that are one character or shorter or are themselves stop words, are
    ignored. For any other stem the file ``./wordFiles/<stem>`` is opened in
    append mode and, when ``checkifWritten`` allows it, a line
    ``<stem> <k> <frequency>`` is appended.

    Args:
        fileT: Raw token to process.
        k: Identifier written next to the stem (presumably a document id --
            confirm against the caller).

    Returns:
        1 when the token produced a usable stem, otherwise 0.
        NOTE(review): the original layout was lost; ``return 1`` is assumed
        to apply to every usable stem, not only when a line was written --
        confirm.
    """
    if fileT in stopwords:
        return 0

    # Classic Porter interface: stem(word, first_index, last_index).
    stemmed = PorterStemmer().stem(fileT, 0, len(fileT) - 1)
    stemmed = re.sub(r'\s', '', stemmed)
    if len(stemmed) <= 1 or stemmed in stopwords:
        return 0

    path = "./wordFiles/" + stemmed
    # Opening in append mode first guarantees the file exists before the
    # helpers below are given its path; ``with`` closes the handle, which
    # the previous version never did.
    with open(path, 'a') as word_file:
        initFreq = checkforFrequency(k, path)
        if checkifWritten(path, k):
            word_file.write(stemmed + " " + str(k) + " " + str(initFreq))
            word_file.write("\n")
    return 1
def tokenize(document, stem):
    """Split a document's headline, graphic and text into lower-case tokens.

    Args:
        document: Object exposing string attributes ``headline``,
            ``graphic`` and ``text``.
        stem: When true, every token is reduced with the Porter stemmer.

    Returns:
        A list of non-empty tokens in headline, graphic, text order.
    """
    # The stemmer is only built when it is actually needed.
    stemmer = PorterStemmer() if stem else None
    tokens = []
    for text in (document.headline, document.graphic, document.text):
        # Runs of word characters: the same tokens as splitting on every
        # non-word character and discarding the empty pieces.
        words = re.findall(r'\w+', text.lower())
        if stem:
            # Classic Porter interface: stem(word, first_index, last_index).
            words = [stemmer.stem(w, 0, len(w) - 1) for w in words]
        tokens.extend(words)
    # Guard against a stemmer that returns an empty string.
    return [t for t in tokens if t]
def build_word_index(doc_dict):
    '''
    Build the word index dictionary and the list of posting triples.

    ``doc_dict`` is read with the integer keys 1..len(doc_dict); element
    ``[1]`` of each value is the text to index and empty texts are skipped.
    Returns ``(dictionary, term_frequecy_list)`` where ``dictionary`` maps a
    stemmed word to its id (ids start at 1) and each list entry is
    ``[word_id, doc_index, running_count_of_that_stem_in_the_document]``.
    The module-level ``idx`` counter is reset and advanced as a side effect.
    '''
    global idx
    dictionary = {}
    term_frequecy_list = []
    stopWords = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def append_to_word_list(text, doc_index):
        global idx
        stem_counts = {}
        # Distinct alphabetic words of the document, in sorted order.
        for word in sorted(set(re.findall("[a-zA-Z]+", text))):
            if word in stopWords:
                continue
            word = stemmer.stem(word, 0, len(word) - 1)
            # update frequency of term
            stem_counts[word] = stem_counts.get(word, 0) + 1
            # first sighting of a stem gets the next free id
            if word not in dictionary:
                dictionary[word] = idx
                idx += 1
            term_frequecy_list.append(
                [dictionary[word], doc_index, stem_counts[word]])

    idx = 1
    for doc_index in range(1, len(doc_dict) + 1):
        body = doc_dict[doc_index][1]
        if body != '':
            append_to_word_list(body, doc_index)
    return dictionary, term_frequecy_list
def calculate_bm25(topic_id, topic, token_token_id, postings_list, doc_id_no,
                   average_doc_length, stem, docs_path):
    """Calculates BM25 for a topic against all LATimes Documents, returns
    ordered dictionary of doc_no to ranking

    Each distinct query term contributes exactly once, weighted by its
    query frequency. Query terms that are missing from ``token_token_id``
    contribute nothing instead of raising ``KeyError``.

    Args:
        topic_id: Identifier used only in the progress message.
        topic: Query passed to ``tokenize``.
        token_token_id: Mapping from token to token id.
        postings_list: Indexable by token id; each entry is a flat sequence
            ``[doc_id, count, doc_id, count, ...]``.
        doc_id_no: Indexable by doc id, giving the doc number; its length is
            used as the collection size.
        average_doc_length: Average document length of the collection.
        stem: When true, query tokens are stemmed before lookup.
        docs_path: Location handed to ``getDocument.retrieve_by_docno``.

    Returns:
        ``OrderedDict`` of doc_no -> score, highest score first.
    """
    query_tokens = tokenize(topic)
    N = len(doc_id_no)
    p = PorterStemmer()

    # Collapse repeated query terms: qf already carries the repetition, so
    # scoring every occurrence separately would count the term qf times.
    query_freq = {}
    for token in query_tokens:
        query_freq[token] = query_freq.get(token, 0) + 1

    doc_no_score = {}
    doc_lengths = {}  # doc_no -> length, so each document is fetched once
    for token, qf in query_freq.items():
        # Calculate tf in query
        token_tf = ((K2 + 1) * qf) / (K2 + qf)

        if stem:
            token = p.stem(token, 0, len(token) - 1)
        try:
            token_id = token_token_id[token]
        except KeyError:
            # Term never seen in the collection: no document can match it.
            continue
        postings = postings_list[token_id]

        # Calculate idf. Postings follow format: [doc_id, count]
        n_i = len(postings) // 2
        a = (N - n_i + 0.5) / (n_i + 0.5)
        token_idf = math.log(a)

        # Calculate tf for docs
        for doc_id, fi in zip(postings[::2], postings[1::2]):
            doc_no = doc_id_no[doc_id]
            if doc_no not in doc_lengths:
                document = getDocument.retrieve_by_docno(docs_path, doc_no)
                doc_lengths[doc_no] = document.length
            K = K1 * ((1 - B) + B * (doc_lengths[doc_no] / average_doc_length))
            doc_tf = ((K1 + 1) * fi) / (K + fi)
            score = doc_tf * token_tf * token_idf
            doc_no_score[doc_no] = doc_no_score.get(doc_no, 0) + score

    sorted_doc_no_score = OrderedDict(
        sorted(doc_no_score.items(), key=lambda t: t[1], reverse=True))
    print("Calculated scores for query: {}".format(topic_id))
    return sorted_doc_no_score
import sys
import re
from porterStemmer import PorterStemmer
from collections import defaultdict
import copy

porter = PorterStemmer()


class QueryIndex:
    """Holds the in-memory index structures used to answer queries."""

    def __init__(self):
        self.index = {}
        self.titleIndex = {}
        self.tf = {}  # term frequencies
        self.idf = {}  # inverse document frequencies

    def intersectLists(self, lists):
        """Return the elements common to every list in ``lists``.

        An empty ``lists`` gives ``[]``. A single list is returned as a
        copy. The caller's ``lists`` is left untouched.
        """
        if len(lists) == 0:
            return []
        # start intersecting from the smaller list
        ordered = sorted(lists, key=len)
        # Explicit fold instead of reduce(): reduce is not a builtin on
        # Python 3 and this file does not import it from functools.
        common = ordered[0]
        for other in ordered[1:]:
            common = set(common) & set(other)
        return list(common)

    def getStopwords(self):
        """Load ``self.stopwordsFile`` (one stop word per line) into ``self.sw``."""
        # ``with`` closes the file even if reading fails part-way.
        with open(self.stopwordsFile, 'r') as f:
            stopwords = [line.rstrip() for line in f]
        # dict keys give constant-time membership tests; values are None
        self.sw = dict.fromkeys(stopwords)

    def getTerms(self, line):
        # lower-case the incoming line
        line = line.lower()
strg = "" for elements in temp: strg += " " + elements punc1 = [",","-","=","/","'",";","^","+","|",":","<",">","`","&","(",")"] punc2 = [".",'"',"[","]","?","!","*","%","{","}"] for punc in punc1: if punc in strg: strg = strg.replace(punc," ") for punc in punc2: if punc in strg: strg = strg.replace(punc,"") strg = strg.split() finallist = [x for x in strg if x not in stop] p = PorterStemmer() midlist = [(p.stem(word, 0, len(word)-1)) for word in finallist] newlist = [x for x in midlist if x not in stop] finalstring = ''.join(" " + x for x in newlist) queryhashmap[key] = finalstring.strip() avgdoclen = 46.25 #avgdoclen = 46.2484394507 #zipfs avgdoclen def calcOBM25(OBM25dict,docid,doclen,termfreq,df): b = 0.6 #0.2-1.0 k = 1.6 #1.2-2.0 idf = log(3204.0/df) numerator = termfreq * float(k+1.0) denominator = termfreq + k*(1.0 - b + (b*doclen)/avgdoclen)
def processEmail(email_contents):
    """Preprocess the body of an email and return its vocabulary indices.

    The text is lower-cased; HTML tags are stripped; numbers, URLs, email
    addresses and dollar signs are replaced by the placeholder words
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split on punctuation and whitespace, every word is reduced to its
    alphanumeric characters and stemmed, and the processed words are echoed
    to stdout.

    Args:
        email_contents: Raw email body as a string.

    Returns:
        A list with, for each processed word found in the vocabulary, the
        position of that word in the list returned by ``getVocabList()``
        (the position ``list.index`` would report).
    """
    # Load Vocabulary
    vocabList = getVocabList()
    # Position of the first occurrence of every vocabulary word, built once
    # so each lookup is O(1) instead of a linear ``list.index`` scan.
    vocab_index = {}
    for position, vocab_word in enumerate(vocabList):
        vocab_index.setdefault(vocab_word, position)

    # ========================== Preprocess Email ===========================

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced by a space
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = re.sub(r'[0-9]+', ' number ', email_contents)

    # Handle URLS: strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', ' httpaddr ',
                            email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', email_contents)

    # Handle $ sign
    email_contents = re.sub(r'[$]+', ' dollar ', email_contents)

    # Split on punctuation and whitespace. The hyphen is escaped so it is a
    # literal; unescaped, ``.-:`` formed a range that only matched extra
    # characters (digits, already replaced above) by accident.
    raw_words = re.split(r'[ @$/#.\-:&*+=\[\]?!(){},">_<;%\n\r]',
                         email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    stemmer = PorterStemmer()
    non_alnum = re.compile(r'[^a-zA-Z0-9]')
    processed_email = []
    word_indices = []
    for word in raw_words:
        word = non_alnum.sub('', word).strip()
        # Skip the word if it is too short -- before it can reach the
        # echoed output or the stemmer.
        if len(word) < 1:
            continue
        word = stemmer.stem(word)
        processed_email.append(word)

        # Record the vocabulary position of the stemmed word, if any.
        index = vocab_index.get(word)
        if index is not None:
            word_indices.append(index)

    print(' '.join(processed_email))

    # Print footer
    print('\n\n=========================')

    return word_indices
def querySearcher(self):
    """This is the main function which performs the AND, OR, AND NOT, BUT NOT and OR NOT operations

    The input file is pre-processed and indexed, the parenthesised parts of
    ``self.query`` are evaluated by ``self.queryCalculator``, and the
    remaining flat query is reduced in three passes (AND, then OR, then the
    NOT combinations). Every intermediate result is stored in ``res`` under
    a textual key and the matching span of the query is replaced by a
    placeholder token (``<n>_firstItr`` ... ``<n>_fourthItr``) that
    ``mapKey`` maps back to that key. The final state is handed to
    ``self.queryAnswer``.

    ``self.query`` is overwritten with its lower-cased form in which "but"
    has been replaced by "and".

    Any ordinary exception raised along the way is reported as
    'The term is not present in the Documents'.

    NOTE(review): the original indentation of this method was lost; the
    nesting below is the most plausible reading. In particular the loop
    that combines "and"/"or" with the NOT vector is assumed to run after,
    not inside, the loop that builds the vector -- confirm.
    """
    try:
        stemmer = PorterStemmer()
        preProcess = PreProcessing(False, self.iFile, "", self.stopwordFile)
        preProcessRes = preProcess.process()
        createIndex = InvertedIndexGenerator(False, preProcessRes, "")
        mainIndex = createIndex.generate()

        originalquery = self.query
        self.query = self.query.lower()
        # "but" is handled exactly like "and" from here on.
        self.query = self.query.replace('but', 'and')
        querySep = list(self.parenthetic_contents(self.query))
        res = self.queryCalculator(querySep, mainIndex, stemmer, preProcessRes)

        # Flatten the query: drop grouping characters, then stem each token.
        tempQuery = self.query
        tempQuery = tempQuery.replace('{', '')
        tempQuery = tempQuery.replace('}', '')
        tempQuery = tempQuery.replace('(', '')
        tempQuery = tempQuery.replace(')', '')
        tempQuery = tempQuery.replace('/', '')
        mapKey = {}
        quryStem = []
        for t in tempQuery.split(" "):
            quryStem.append(stemmer.stem(t))
        tempQuery = ' '.join(quryStem)

        # Replace every already-evaluated sub-query by a placeholder token.
        for i, r in enumerate(res.keys()):
            mapKey["%d_%s" % (i, "firstItr")] = r
            tempQuery = tempQuery.replace(r, "%d_%s" % (i, "firstItr"))

        # Make plain index terms addressable next to the sub-query results;
        # on a key collision the index entry wins.
        res = {**res, **mainIndex}
        andPro = tempQuery.split(" ")

        # AND operation
        for index, term in enumerate(andPro):
            if term == "and":
                if andPro[index + 1] == "not":
                    # "and not" is handled in the NOT pass below.
                    continue
                else:
                    # Operands are either placeholders (resolved through
                    # mapKey) or plain terms used as res keys directly.
                    if mapKey.get(andPro[index - 1], -1) == -1:
                        tempKeyFirst = andPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[andPro[index - 1]]
                    if mapKey.get(andPro[index + 1], -1) == -1:
                        tempKeySecond = andPro[index + 1]
                    else:
                        tempKeySecond = mapKey[andPro[index + 1]]
                    res["%s and %s" % (andPro[index - 1], andPro[index + 1])] = {}
                    for k in res[tempKeyFirst].keys():
                        res["%s and %s" % (andPro[index - 1], andPro[index + 1])][k] = res[tempKeyFirst][
                            k] and res[tempKeySecond][k]
                    tempQuery = tempQuery.replace(
                        "%s and %s" % (andPro[index - 1], andPro[index + 1]),
                        "%d_%s" % (index, "secondItr"))
                    mapKey["%d_%s" % (index, "secondItr")] = "%s and %s" % (
                        andPro[index - 1], andPro[index + 1])

        # OR operation
        orPro = tempQuery.split(" ")
        for index, term in enumerate(orPro):
            if term == "or":
                if orPro[index + 1] == "not":
                    # "or not" is handled in the NOT pass below.
                    continue
                else:
                    if mapKey.get(orPro[index - 1], -1) == -1:
                        tempKeyFirst = orPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[orPro[index - 1]]
                    if mapKey.get(orPro[index + 1], -1) == -1:
                        tempKeySecond = orPro[index + 1]
                    else:
                        tempKeySecond = mapKey[orPro[index + 1]]
                    res["%s or %s" % (orPro[index - 1], orPro[index + 1])] = {}
                    for k in res[tempKeyFirst].keys():
                        res["%s or %s" % (orPro[index - 1], orPro[index + 1])][k] = res[
                            tempKeyFirst][k] or res[tempKeySecond][k]
                    tempQuery = tempQuery.replace(
                        "%s or %s" % (orPro[index - 1], orPro[index + 1]),
                        "%d_%s" % (index, "thirdItr"))
                    mapKey["%d_%s" % (index, "thirdItr")] = "%s or %s" % (
                        orPro[index - 1], orPro[index + 1])

        # AND NOT, OR NOT, BUT NOT operations
        notPro = tempQuery.split(" ")
        # First build the negated vector of the operand that follows "not".
        for index, term in enumerate(notPro):
            if term == "not":
                tempKeyNot = {}
                if mapKey.get(notPro[index + 1], -1) == -1:
                    tempKeySecond = notPro[index + 1]
                else:
                    tempKeySecond = mapKey[notPro[index + 1]]
                for k in res[tempKeySecond].keys():
                    if not res[tempKeySecond][k] == True:
                        tempKeyNot[k] = 1
                    else:
                        tempKeyNot[k] = 0

        # Then combine the left operand with that negated vector. The right
        # operand sits two tokens after the connective ("x and not y").
        for index, term in enumerate(notPro):
            if term == "and":
                if mapKey.get(notPro[index - 1], -1) == -1:
                    tempKeyFirst = notPro[index - 1]
                else:
                    tempKeyFirst = mapKey[notPro[index - 1]]
                res["%s and not %s" % (notPro[index - 1], notPro[index + 2])] = {}
                for kee in res[tempKeyFirst].keys():
                    res["%s and not %s" % (notPro[index - 1], notPro[index + 2])][kee] = res[
                        tempKeyFirst][kee] and tempKeyNot[kee]
                tempQuery = tempQuery.replace(
                    "%s and not %s" % (notPro[index - 1], notPro[index + 2]),
                    "%d_%s" % (index, "fourthItr"))
                mapKey["%d_%s" % (index, "fourthItr")] = "%s and not %s" % (
                    notPro[index - 1], notPro[index + 2])
            if term == "or":
                if mapKey.get(notPro[index - 1], -1) == -1:
                    tempKeyFirst = notPro[index - 1]
                else:
                    tempKeyFirst = mapKey[notPro[index - 1]]
                res["%s or not %s" % (notPro[index - 1], notPro[index + 2])] = {}
                for kee in res[tempKeyFirst].keys():
                    res["%s or not %s" % (notPro[index - 1], notPro[index + 2])][kee] = res[
                        tempKeyFirst][kee] or tempKeyNot[kee]
                tempQuery = tempQuery.replace(
                    "%s or not %s" % (notPro[index - 1], notPro[index + 2]),
                    "%d_%s" % (index, "fourthItr"))
                mapKey["%d_%s" % (index, "fourthItr")] = "%s or not %s" % (
                    notPro[index - 1], notPro[index + 2])

        self.queryAnswer(originalquery, tempQuery, mapKey, res)
    except Exception:
        # Deliberately broad best-effort handler, but no longer a bare
        # ``except:``, so Ctrl-C and interpreter exit are not swallowed.
        print('The term is not present in the Documents')
def applyStem(word):
    """Return the Porter stem of ``word``.

    The whole word is stemmed: the stemmer is given the index of the first
    (0) and of the last (``len(word) - 1``) character.
    """
    return PorterStemmer().stem(word, 0, len(word) - 1)
import os
import sys
import re as reg
import math
from array import array
from collections import defaultdict
from porterStemmer import PorterStemmer
import copy

#Creating object of PorterStemmer Class
portstemmer_obj = PorterStemmer()


class CreateIndex:
    """Accumulates the structures of an inverted index while it is built."""

    def __init__(self):
        self.mainindex = defaultdict(list)
        self.termfrequency = defaultdict(list)
        self.documentfrequency = defaultdict(int)
        self.totaldocuments = 0
        self.indexanditstitle = defaultdict(list)

    def findstopwords(self):
        """Load ``stopwords.txt`` (one stop word per line) into ``self.stop_words``."""
        # ``with`` closes the file even if reading or decoding fails.
        with open('stopwords.txt', 'r', encoding='UTF-8') as stopwordsfile:
            stopwords = [line.rstrip() for line in stopwordsfile]
        # dict keys give constant-time membership tests; values are None
        self.stop_words = dict.fromkeys(stopwords)

    def get_important_terms(self , lines):
        #get the useful words and terms from the text
        lines = lines.lower()
        # everything except lower-case letters, digits and spaces becomes a space
        lines = reg.sub(r'[^a-z0-9 ]' , ' ' , lines)
        lines = lines.split()
        lines = [ele for ele in lines if ele not in self.stop_words]
def __init__(self):
    """Create the punctuation set, the stemmer and the stop-word list.

    ``loadStopWords`` is called last (presumably to fill
    ``self.stopWordsList`` -- confirm in that method).
    """
    # The backslash is spelled ``\\``: a bare ``\]`` is an invalid escape
    # sequence that newer Python versions warn about. The resulting set of
    # characters is unchanged and, as before, does not contain the
    # apostrophe.
    self.remove_punctuation_set = set('!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')
    self.stemmer = PorterStemmer()
    self.stopWordsList = []
    self.loadStopWords()