def processQuery():
    f = open("Queries.txt")
    text = f.read()
    lines = text.splitlines()
    for i in lines:
        raw_query = i.replace("\n", "").lower()
        query = re.compile(r'(\w+)', re.DOTALL).findall(raw_query)
        print(query)
        queryNum = query[0]
        query = re.compile(r'[a-z]+', re.DOTALL).findall(raw_query)
        query = list(filter(None, query))
        stemmer = PorterStemmer()
        # a list (not a lazy map object) so the query can be reused by every model below
        query = [stemmer.stem(word, 0, len(word) - 1) for word in query]
        queryLen = len(query)
        # run the models
        okapiTF(query, queryNum)
        tfIdf(query, queryNum, queryLen)
        smoothing(query, queryNum, 'Laplace')
        smoothing(query, queryNum, 'Jelinek-Mercer')
        bm25(query, queryNum)
    print("Queries processed")
class Parser:
    # A processor for removing the commoner morphological and inflexional
    # endings from words in English
    stemmer = None
    stopwords = []

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.p = re.compile(r"&.{1,5}?;|[!-@[-`{-~]")
        for file in glob.glob(os.path.dirname(__file__) + "/stopwords/*/*.txt"):
            self.stopwords += [line.strip() for line in open(file).readlines()]
        self.stopwords.append("the")

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = self.p.sub(" ", string)
        string = string.lower()
        return string

    def removeStopwords(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def tokenise(self, string, stem=False):
        """ break string up into tokens and stem words """
        string = self.clean(string)
        words = string.split()
        if stem:
            return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
        return words

    def tokenize(self, string, stem=False):
        # American-spelling alias; delegate to tokenise and return its result
        return self.tokenise(string, stem=stem)
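# Hedged usage sketch for the Parser above. It assumes porterStemmer.PorterStemmer,
# re, glob and os are imported in this module and that a stopwords/*/*.txt
# directory exists next to it; the sample sentence is illustrative only.
parser = Parser()
words = parser.tokenise("Connecting to the Server failed repeatedly!")
print(parser.removeStopwords(words))   # cleaned, lower-cased tokens with stop words dropped
print(parser.tokenise("Connecting to the Server failed repeatedly!", stem=True))  # same tokens, stemmed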
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        print fileT
        if (len(fileT) > 1) and (fileT not in stopwords):
            newDict[k].append(fileT)
class Parser:

    def __init__(self):
        self.remove_punctuation_set = set('!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
        self.stemmer = PorterStemmer()
        self.stopWordsList = []
        self.loadStopWords()

    def fullParse(self, words_list):
        ''' words_list is an array '''
        stopped = self.removeStopWords(words_list)
        cleaned = self.cleanCaseAndPunctuation(stopped)
        stopped = self.stemWords(cleaned)
        return stopped

    def stemWords(self, words_list):
        stemmed = []
        for word in words_list:
            word = self.stemmer.stem(word, 0, len(word) - 1)
            stemmed.append(word)
        return stemmed

    def removeStopWords(self, words_list):
        # keep only the words that are not in the stop word list
        non_stop_list = []
        for word in words_list:
            if word.strip() not in self.stopWordsList:
                non_stop_list.append(word)
        return non_stop_list

    def cleanCaseAndPunctuation(self, words_list):
        clean_list = []
        for word in words_list:
            word = word.lower()
            if not word.startswith('http'):
                clean = ''.join([c for c in word if c not in self.remove_punctuation_set])
                if clean:
                    clean_list.append(clean)
        return clean_list

    def printStopWords(self):
        print "****************************************************************"
        print "                          STOP WORDS"
        print "****************************************************************"
        print self.stopWordsList

    def loadStopWords(self):
        ''' happens on __init__ '''
        for line in open(STOPWORDS_FILE):
            self.stopWordsList.append(line.strip())
def process(self):
    """
    Reads the input text file and performs:
      - punctuation and number removal
      - tokenization and lower-casing
      - stop word removal
      - stemming
    """
    try:
        # read stop words into a set so membership tests match whole words
        stopWords = set(open(self.stopwordFile, "r").read().split())
        try:
            if self.writerFlag == True:
                outFile = open(self.oFile, "w")
            stemmer = PorterStemmer()
            dataDic = {}
            translator = str.maketrans('', '', string.punctuation)
            nTranslator = str.maketrans('', '', "0123456789")
            with open(self.iFile) as f:
                for line in f:
                    try:
                        (key, val) = line.split("\t")
                    except ValueError:
                        continue
                    stringToWrite = ""
                    val = val.translate(translator)
                    val = val.translate(nTranslator)
                    val = val.lower().strip().split(" ")
                    if self.writerFlag == True:
                        stringToWrite = "%s %s \t" % (stringToWrite, key.upper())
                    for words in val:
                        if words.strip() and words.strip() not in stopWords:
                            stringToWrite = "%s %s" % (stringToWrite, stemmer.stem(words))
                    stringToWrite = "%s \n" % (stringToWrite)
                    if self.writerFlag == False:
                        dataDic[key.strip()] = stringToWrite.strip()
                    else:
                        outFile.write(stringToWrite)
            if self.writerFlag == True:
                outFile.close()
            else:
                return dataDic
        except (OSError, IOError) as e:
            print("Wrong input file name or file path", e)
    except (OSError, IOError) as e:
        print("Wrong stopwords file name or file path", e)
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        if (len(fileT) > 1) and (fileT not in stopwords):
            fileT = "./wordFiles/" + fileT
            FILE = open(fileT, 'a')
            initFreq = checkforFrequency(k, fileT)
            if checkifWritten(fileT, k):
                FILE.write(str(fileT[12:]) + " " + str(k) + " " + str(initFreq))
                FILE.write("\n")
                return 1
    return 0
def tokenize(document, stem):
    tokens = []
    p = PorterStemmer()
    for text in document.headline, document.graphic, document.text:
        # Lowercase and split on non-alphanumerics
        text = text.lower()
        text_tokens = re.split(r'\W', text)
        if stem:
            stem_tokens = []
            for t in text_tokens:
                t = p.stem(t, 0, len(t) - 1)
                stem_tokens.append(t)
            text_tokens = stem_tokens
        tokens += text_tokens
    # Remove empty strings in resulting tokens list
    tokens = list(filter(None, tokens))
    return tokens
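# Hedged usage sketch for tokenize above. The Document namedtuple is an
# assumption standing in for whatever parsed-document class the project uses;
# only the headline, graphic and text attributes are required.
from collections import namedtuple

Document = namedtuple("Document", ["headline", "graphic", "text"])
doc = Document(headline="City Council Votes", graphic="", text="The council voted 7-2 on Tuesday.")
print(tokenize(doc, stem=True))  # lower-cased (and stemmed) tokens from all three fields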
def calculate_bm25(topic_id, topic, token_token_id, postings_list, doc_id_no,
                   average_doc_length, stem, docs_path):
    """Calculates BM25 for a topic against all LATimes documents; returns an
    ordered dictionary of doc_no to ranking."""
    query_tokens = tokenize(topic)
    doc_no_score = {}
    N = len(doc_id_no)
    p = PorterStemmer()
    # Calculate tf in query, and idf
    for token in query_tokens:
        qf = query_tokens.count(token)
        token_tf = ((K2 + 1) * qf) / (K2 + qf)
        # Calculate idf
        if stem:
            token = p.stem(token, 0, len(token) - 1)
        token_id = token_token_id[token]
        postings = postings_list[token_id]
        # Postings follow format: [doc_id, count]
        n_i = len(postings[::2])
        a = (N - n_i + 0.5) / (n_i + 0.5)
        token_idf = math.log(a)
        # Calculate tf for docs
        for i in range(0, len(postings), 2):
            doc_id = postings[i]
            doc_no = doc_id_no[doc_id]
            document = getDocument.retrieve_by_docno(docs_path, doc_no)
            fi = postings[i + 1]
            K = K1 * ((1 - B) + B * (document.length / average_doc_length))
            doc_tf = ((K1 + 1) * fi) / (K + fi)
            score = doc_tf * token_tf * token_idf
            if doc_no in doc_no_score:
                doc_no_score[doc_no] = doc_no_score[doc_no] + score
            else:
                doc_no_score[doc_no] = score
    sorted_doc_no_score = OrderedDict(sorted(doc_no_score.items(),
                                             key=lambda t: t[1], reverse=True))
    print("Calculated scores for query: {}".format(topic_id))
    return sorted_doc_no_score
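# The per-term score accumulated above is standard Okapi BM25:
#   idf      = ln((N - n_i + 0.5) / (n_i + 0.5))
#   tf_doc   = ((k1 + 1) * f_i) / (k1 * ((1 - b) + b * dl / avdl) + f_i)
#   tf_query = ((k2 + 1) * qf) / (k2 + qf)
# A minimal self-contained sketch of that formula follows; the default k1, k2
# and b values are common choices, not necessarily the module's K1, K2, B.
import math

def bm25_term_score(qf, fi, n_i, N, dl, avdl, k1=1.2, k2=7.0, b=0.75):
    """Contribution of one query term to one document's BM25 score."""
    idf = math.log((N - n_i + 0.5) / (n_i + 0.5))
    tf_doc = ((k1 + 1) * fi) / (k1 * ((1 - b) + b * dl / avdl) + fi)
    tf_query = ((k2 + 1) * qf) / (k2 + qf)
    return idf * tf_doc * tf_query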
        temp_string += " " + words

    punc_type_1 = [",", "-", "=", "/", "\\", "'", ";", "^", "+", "|",
                   ":", "<", ">", "`", "&", "(", ")"]
    punc_type_2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}", "$"]
    for punc in punc_type_1:
        if punc in temp_string:
            temp_string = temp_string.replace(punc, " ")
    for punc in punc_type_2:
        if punc in temp_string:
            temp_string = temp_string.replace(punc, "")
    temp_string = temp_string.split()
    final_word_list = [x for x in temp_string if x not in stop_words]
    p = PorterStemmer()
    mid_list = [p.stem(word, 0, len(word) - 1) for word in final_word_list]
    new_list = [x for x in mid_list if x not in stop_words]
    final_string = ''.join(" " + x for x in new_list)
    query_hashmap[key] = final_string.strip()
    #print query_hashmap[key]  # printing each query after stopping and stemming

model_dict = {}
query_word_count = defaultdict(float)
file1 = open("file1.txt", "r").readlines()
for key in query_hashmap.keys():
    query = query_hashmap[key].split()
    print query
    query = map(str.lower, query)
    query_dict = {}
    help_dict = {}
def applyStem(word):
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses the body of an email and returns a list of
    #word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >
    # (and does not have any < or > inside the tag) and replaces it with a space
    email_contents = re.compile('<[^<>]+>').sub(' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\\s]*').sub(' httpaddr ', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\\s]+@[^\\s]+').sub(' emailaddr ', email_contents)

    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)

    # Other
    email_contents = re.split('[ @$/#.-:&*+=\\[\\]?!(){},\">_<;%\\n\\r]', email_contents)
    email_contents = [word for word in email_contents if len(word) > 0]

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    # Process file
    stemmer = PorterStemmer()
    processed_email = []
    for word in email_contents:
        # Remove any remaining non-alphanumeric characters, then stem
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

        # Skip the word if it is too short
        if len(word) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable str. You should look up str in the
        #               vocabulary list (vocabList). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if str = 'action', then you should
        #               look up the vocabulary list to find where in vocabList
        #               'action' appears. For example, if vocabList{18} =
        #               'action', then, you should add 18 to the word_indices
        #               vector (e.g., word_indices = [word_indices ; 18]; ).
        #
        # Note: vocabList{idx} returns the word with index idx in the
        #       vocabulary list.
        #
        # Note: You can use strcmp(str1, str2) to compare two strings (str1 and
        #       str2). It will return 1 only if the two strings are equivalent.
        try:
            index = vocabList.index(word)
        except ValueError:
            pass
        else:
            word_indices.append(index)
        # =============================================================

    print(' '.join(processed_email))

    # Print footer
    print('\n\n=========================')

    return word_indices
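# Hedged usage sketch for processEmail above. It assumes the surrounding module
# provides re, PorterStemmer and getVocabList() (which reads the exercise's
# vocabulary file); the sample email text is illustrative only.
if __name__ == '__main__':
    sample = ("Anyone knows how much it costs to host a web portal? "
              "Visit http://example.com or mail info@example.com for $10 offers.")
    print(processEmail(sample))  # indices of the sample's words found in the vocabulary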
    for elements in temp:
        strg += " " + elements
    punc1 = [",", "-", "=", "/", "'", ";", "^", "+", "|",
             ":", "<", ">", "`", "&", "(", ")"]
    punc2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}"]
    for punc in punc1:
        if punc in strg:
            strg = strg.replace(punc, " ")
    for punc in punc2:
        if punc in strg:
            strg = strg.replace(punc, "")
    strg = strg.split()
    finallist = [x for x in strg if x not in stop]
    p = PorterStemmer()
    midlist = [p.stem(word, 0, len(word) - 1) for word in finallist]
    newlist = [x for x in midlist if x not in stop]
    finalstring = ''.join(" " + x for x in newlist)
    queryhashmap[key] = finalstring.strip()

avgdoclen = 46.25
#avgdoclen = 46.2484394507  # zipfs avgdoclen

def calcOBM25(OBM25dict, docid, doclen, termfreq, df):
    b = 0.6  # 0.2 - 1.0
    k = 1.6  # 1.2 - 2.0
    idf = log(3204.0 / df)
    numerator = termfreq * float(k + 1.0)
    denominator = termfreq + k * (1.0 - b + (b * doclen) / avgdoclen)
    score = idf * (numerator / denominator)
def querySearcher(self):
    """This is the main function which performs the AND, OR, AND NOT,
    BUT NOT and OR NOT operations."""
    try:
        stemmer = PorterStemmer()
        preProcess = PreProcessing(False, self.iFile, "", self.stopwordFile)
        preProcessRes = preProcess.process()
        createIndex = InvertedIndexGenerator(False, preProcessRes, "")
        mainIndex = createIndex.generate()
        originalquery = self.query
        self.query = self.query.lower()
        self.query = self.query.replace('but', 'and')
        querySep = list(self.parenthetic_contents(self.query))
        res = self.queryCalculator(querySep, mainIndex, stemmer, preProcessRes)
        tempQuery = self.query
        tempQuery = tempQuery.replace('{', '')
        tempQuery = tempQuery.replace('}', '')
        tempQuery = tempQuery.replace('(', '')
        tempQuery = tempQuery.replace(')', '')
        tempQuery = tempQuery.replace('/', '')
        mapKey = {}
        quryStem = []
        for t in tempQuery.split(" "):
            quryStem.append(stemmer.stem(t))
        tempQuery = ' '.join(quryStem)
        for i, r in enumerate(res.keys()):
            mapKey["%d_%s" % (i, "firstItr")] = r
            tempQuery = tempQuery.replace(r, "%d_%s" % (i, "firstItr"))
        res = {**res, **mainIndex}

        """AND operation"""
        andPro = tempQuery.split(" ")
        for index, term in enumerate(andPro):
            if term == "and":
                if andPro[index + 1] == "not":
                    continue
                else:
                    if mapKey.get(andPro[index - 1], -1) == -1:
                        tempKeyFirst = andPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[andPro[index - 1]]
                    if mapKey.get(andPro[index + 1], -1) == -1:
                        tempKeySecond = andPro[index + 1]
                    else:
                        tempKeySecond = mapKey[andPro[index + 1]]
                    res["%s and %s" % (andPro[index - 1], andPro[index + 1])] = {}
                    for k in res[tempKeyFirst].keys():
                        res["%s and %s" % (andPro[index - 1], andPro[index + 1])][k] = \
                            res[tempKeyFirst][k] and res[tempKeySecond][k]
                    tempQuery = tempQuery.replace(
                        "%s and %s" % (andPro[index - 1], andPro[index + 1]),
                        "%d_%s" % (index, "secondItr"))
                    mapKey["%d_%s" % (index, "secondItr")] = "%s and %s" % (
                        andPro[index - 1], andPro[index + 1])

        """OR operation"""
        orPro = tempQuery.split(" ")
        for index, term in enumerate(orPro):
            if term == "or":
                if orPro[index + 1] == "not":
                    continue
                else:
                    if mapKey.get(orPro[index - 1], -1) == -1:
                        tempKeyFirst = orPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[orPro[index - 1]]
                    if mapKey.get(orPro[index + 1], -1) == -1:
                        tempKeySecond = orPro[index + 1]
                    else:
                        tempKeySecond = mapKey[orPro[index + 1]]
                    res["%s or %s" % (orPro[index - 1], orPro[index + 1])] = {}
                    for k in res[tempKeyFirst].keys():
                        res["%s or %s" % (orPro[index - 1], orPro[index + 1])][k] = \
                            res[tempKeyFirst][k] or res[tempKeySecond][k]
                    tempQuery = tempQuery.replace(
                        "%s or %s" % (orPro[index - 1], orPro[index + 1]),
                        "%d_%s" % (index, "thirdItr"))
                    mapKey["%d_%s" % (index, "thirdItr")] = "%s or %s" % (
                        orPro[index - 1], orPro[index + 1])

        """AND NOT, OR NOT, BUT NOT operations"""
        notPro = tempQuery.split(" ")
        for index, term in enumerate(notPro):
            if term == "not":
                tempKeyNot = {}
                if mapKey.get(notPro[index + 1], -1) == -1:
                    tempKeySecond = notPro[index + 1]
                else:
                    tempKeySecond = mapKey[notPro[index + 1]]
                for k in res[tempKeySecond].keys():
                    if not res[tempKeySecond][k] == True:
                        tempKeyNot[k] = 1
                    else:
                        tempKeyNot[k] = 0
        for index, term in enumerate(notPro):
            if term == "and":
                if mapKey.get(notPro[index - 1], -1) == -1:
                    tempKeyFirst = notPro[index - 1]
                else:
                    tempKeyFirst = mapKey[notPro[index - 1]]
                res["%s and not %s" % (notPro[index - 1], notPro[index + 2])] = {}
                for kee in res[tempKeyFirst].keys():
                    res["%s and not %s" % (notPro[index - 1], notPro[index + 2])][kee] = \
                        res[tempKeyFirst][kee] and tempKeyNot[kee]
                tempQuery = tempQuery.replace(
                    "%s and not %s" % (notPro[index - 1], notPro[index + 2]),
                    "%d_%s" % (index, "fourthItr"))
                mapKey["%d_%s" % (index, "fourthItr")] = "%s and not %s" % (
                    notPro[index - 1], notPro[index + 2])
            if term == "or":
                if mapKey.get(notPro[index - 1], -1) == -1:
                    tempKeyFirst = notPro[index - 1]
                else:
                    tempKeyFirst = mapKey[notPro[index - 1]]
                res["%s or not %s" % (notPro[index - 1], notPro[index + 2])] = {}
                for kee in res[tempKeyFirst].keys():
                    res["%s or not %s" % (notPro[index - 1], notPro[index + 2])][kee] = \
                        res[tempKeyFirst][kee] or tempKeyNot[kee]
                tempQuery = tempQuery.replace(
                    "%s or not %s" % (notPro[index - 1], notPro[index + 2]),
                    "%d_%s" % (index, "fourthItr"))
                mapKey["%d_%s" % (index, "fourthItr")] = "%s or not %s" % (
                    notPro[index - 1], notPro[index + 2])

        self.queryAnswer(originalquery, tempQuery, mapKey, res)
    except:
        print('The term is not present in the Documents')
",", "-", "=", "/", "\\", "'", ";", "^", "+", "|", ":", "<", ">", "`", "&", "(", ")" ] punc_type_2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}", "$"] for punc in punc_type_1: if punc in temp_string: temp_string = temp_string.replace(punc, " ") for punc in punc_type_2: if punc in temp_string: temp_string = temp_string.replace(punc, "") temp_string = temp_string.split() final_word_list = [x for x in temp_string if x not in stop_words] p = PorterStemmer() mid_list = [(p.stem(word, 0, len(word) - 1)) for word in final_word_list] new_list = [x for x in mid_list if x not in stop_words] final_string = ''.join(" " + x for x in new_list) query_hashmap[key] = final_string.strip() #print query_hashmap[key] # printing each query after stopping and stemming model_dict = {} query_word_count = defaultdict(float) file1 = open("file1.txt", "r").readlines() for key in query_hashmap.keys(): query = query_hashmap[key].split() #print query #query with uppercase query = map(str.lower, query) print query #query with lowercase query_dict = {}
    #defining punctuations to be eliminated
    punct_list_2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}", "$"]
    #removing punctuations
    for punct in punct_list_1:
        if punct in key_text:
            key_text = key_text.replace(punct, " ")
    for punct in punct_list_2:
        if punct in key_text:
            key_text = key_text.replace(punct, "")
    key_text = key_text.split()
    #removing stop words
    text_wo_stop_punct = [x for x in key_text if x not in stop_word_file]
    p = PorterStemmer()
    midlist = [p.stem(word, 0, len(word) - 1) for word in text_wo_stop_punct]
    newlist = [x for x in midlist if x not in stop_word_file]
    finaltext = ''.join(" " + x for x in newlist)
    dict_map[key] = finaltext.strip()

print "Completed stemming and stopping"

dict_word_ID_map = {}
i = 1
print "Assigning IDs to words..........please wait"
for key in dict_map.keys():
    for word in dict_map[key].split():
        if word not in dict_word_ID_map:
            dict_word_ID_map[word] = i
            i += 1
i.create_mat()
#print(i.id_title)
#print(i.token_set)
while True:
    st = raw_input("search karo: ")
    st = st.lower()
    l = st.split()
    main = []
    for token in l:
        match, index = i.find_match(token)
        if match == 100:
            main.append(token)
        else:
            main.append(index)
    line = [index_porter.stem(word, 0, len(word) - 1) for word in main]
    rank_list = i.rank_doc(line)
    if len(rank_list) == 0:
        print("NO RESULTS FOUND")
    else:
        print("SHOWING RESULTS FOR " + " ".join(main))
        for x in rank_list:
            print(x[0], x[1])
            print(i.id_title[str(x[0])][0])
            h1 = 'https://en.wikipedia.org/wiki?curid=' + str(x[0])
def stemWords(tokens):
    stemmer = PorterStemmer()
    for i in range(len(tokens)):
        tokens[i] = stemmer.stem(tokens[i], 0, len(tokens[i]) - 1)
    return tokens
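# Hedged usage sketch for stemWords above (the token list is illustrative;
# PorterStemmer is assumed to be the same porterStemmer module used throughout):
tokens = ["running", "flies", "happily"]
print(stemWords(tokens))  # stems each token in place and returns the same list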
#!/usr/bin/env python
# encoding=utf8
from porterStemmer import PorterStemmer

if __name__ == "__main__":
    stemmer = PorterStemmer()
    word = "Keyphrases"
    result = stemmer.stem(word, 0, len(word) - 1)
    print result
def parseDocs():
    global terms
    global documents
    files = os.listdir("./cacm")
    sp = open("stopWords.txt")
    stopData = sp.read()
    stopTerms = stopData.lower().split("\n")
    stopTerms = stopTerms[:len(stopTerms) - 1]
    filep1 = open("terms.txt", "a")
    filep2 = open("mappings.txt", "a")
    filep3 = open("documents.txt", "a")
    termId = 1
    for f in files:
        fp = open("./cacm/" + f)
        documentName = f.split(".")[0]
        documentId = documentName.split("-")[1]
        line = fp.read()
        data = re.compile(r'.*?<pre>(.*?)</pre>', re.DOTALL).match(line).group(1)
        data = data.replace("\n", " ")
        splitword = re.compile(r'CA\d+', re.DOTALL).findall(data)[0]
        text = data.split(splitword)
        words = text[0]
        words = words.replace("CACM", " ")
        words = words.lower()
        words = re.compile(r'(\w+)', re.DOTALL).findall(words)
        stemmer = PorterStemmer()
        # a list (not a lazy map object) so it can be measured, counted and iterated below
        words = [stemmer.stem(word, 0, len(word) - 1) for word in words]
        docLength = len(words)
        global totalDocLength
        totalDocLength += docLength
        count = collections.Counter(words)
        filep3.write(documentId + " " + documentName + " " + str(docLength) + "\n")
        for term in words:
            if term not in stopTerms and term not in stopList:
                global numOfTerms
                numOfTerms += 1
                if term in terms:
                    attributes = terms[term]
                    idterm = attributes[0]
                    tf = count[term]
                    documentDetails = attributes[3]
                    latestDoc = len(documentDetails)
                    lastTermId = documentDetails[latestDoc - 1]
                    if documentId == lastTermId[0]:
                        ctf = attributes[1]
                        ctf = ctf + 1
                        df = attributes[2]
                        terms[term] = idterm, ctf, df, documents[term]
                    else:
                        documents[term] = documents[term] + [[documentId, documentName, docLength, tf]]
                        ctf = attributes[1]
                        ctf = ctf + 1
                        df = attributes[2]
                        df = df + 1
                        terms[term] = idterm, ctf, df, documents[term]
                if term not in terms:
                    ctf = 1
                    tf = count[term]
                    df = 1
                    documents[term] = [[documentId, documentName, docLength, tf]]
                    terms[term] = termId, ctf, df, documents[term]
                    termId += 1
    for key in terms:
        attributes = terms[key]
        key_termName = key
        key_termId = attributes[0]
        key_ctf = attributes[1]
        key_df = attributes[2]
        key_documents = attributes[3]
        offsetLength = len(str(key_termId)) + 1
        filep2.write(str(key_termId) + " ")
        for doc in key_documents:
            docId = doc[0]
            tf = doc[3]
            offsetLength += len(docId) + len(str(tf)) + 2
            filep2.write(docId + " " + str(tf) + " ")
        filep2.write("\n")
        global offset
        filep1.write(key_termName + " " + str(key_termId) + " " + str(key_ctf) + " " +
                     str(key_df) + " " + str(offset) + " " + str(offsetLength) + "\n")
        offset += offsetLength + 1