def __init__(self,allFiles): self.parser = Parser() self.initializeVectorSpace(allFiles)
class TFIDF: #adding all the files to the dictionary mypath=dirname(getcwd())+"\\data" documentFreqDict={} vectorSpaceSet=set() fileDetails={} nestedDict = defaultdict(dict) queryTFIDFDict={} N=0 parser = None def __init__(self,allFiles): self.parser = Parser() self.initializeVectorSpace(allFiles) def initializeVectorSpace(self,allFiles): #try: #setting the of the document to be zero docId=1 for file in allFiles: fileName=self.mypath+"\\"+file print(fileName) if file.endswith(".txt"): self.fileDetails[docId]=fileName infile=open(fileName,"r") document = infile.read() infile.close() #tokenized=self.tokenization(document) tokenized=self.parser.tokenise_and_remove_stop_words(document) tempSet=set(tokenized) print(tempSet) self.vectorSpaceSet=self.vectorSpaceSet.union(tempSet) for term in tempSet: #normalized length self.nestedDict[term][docId]=tokenized.count(term)/len(tokenized) docId+=1 print(self.nestedDict) print(len(self.nestedDict)) self.calculateDocumentFrequency() self.N =len(self.fileDetails) print(self.N) #except: # print("Exception occured")''' #Tokenization function- It splits the files in to lines and then lines into words, removes the punctuation '''def tokenization(self,document): try: characters="~@#$%^&*()_-+=!|'\".,!;:\n\t\\\"?!{}[]<>" words = document.lower().split() return [word.strip(characters) for word in words] except: print("Exception occured") ''' def calculateDocumentFrequency(self): #calculates the number of documents this given keyword appears for word in self.vectorSpaceSet: self.documentFreqDict[word]=len(self.nestedDict[word]) def inverseDocumentFrequency(self,term): if term in self.vectorSpaceSet: print(self.documentFreqDict[term]) print(self.N) return 1+math.log(self.N/self.documentFreqDict[term]) else: return 0.0 def imp(self,term,docId): if docId in self.nestedDict[term]: return self.nestedDict[term][docId]*self.inverseDocumentFrequency(term) else: return 0.0 def cosineSimilarity(self,query,docId): similarity=0.0 aSquare=0.0 bSquare=0.0 lengthOfQuery=len(query) for term in query: #normalized frequency self.queryTFIDFDict[term]=(query.count(term)/lengthOfQuery)*self.inverseDocumentFrequency(term) aSquare+=math.pow(self.queryTFIDFDict[term],2) bSquare+=math.pow(self.imp(term,docId),2) similarity += self.queryTFIDFDict[term]*self.imp(term,docId) modAmodB=math.sqrt(aSquare)*math.sqrt(bSquare) similarity=similarity/modAmodB return similarity #creating Query vector for the dot product with document vector #Function that creates a set which has all the words from all the the documents, called as corpus dictionary, or the vector spcae which has the document vectors #each term is defined as a unit vector,document vector \vec d is just the vector whose component in the direction \vec e_t of term t is a measure of the importance #of term t in the document d def searchRelevantDocument(self,lst): inputQuery=lst.strip() inputQuery=self.parser.tokenise_and_remove_stop_words(inputQuery) if inputQuery==[]: sys.exit() else: scoresList=[] final_set=set() relevant_document_ids =[set(self.nestedDict[term].keys()) for term in inputQuery] relevant_document_ids=list(relevant_document_ids) #add a if clause if(len(relevant_document_ids)>2): final_set=(relevant_document_ids[0]).intersection((relevant_document_ids[1])) for i in range(2,len(relevant_document_ids)): final_set=set(final_set).intersection(relevant_document_ids[i]) elif(len(relevant_document_ids)==2): final_set=(relevant_document_ids[0]).intersection((relevant_document_ids[1])) else: final_set=(relevant_document_ids[0]).intersection((relevant_document_ids[0])) final_set=list(final_set) for id1 in final_set: result=[(self.cosineSimilarity(inputQuery,id1)),self.fileDetails[id1]] scoresList.append(result) scoresList.sort() resStr='' for i in range(len(scoresList)): fname=str(scoresList[i][1]).split("/") resStr+=str(i+1)+" "+str(fname[len(fname)-1])+"\n\n" return resStr def intersection(self,sets): """Returns the intersection of all sets in the list sets. Requires that the list sets contains at least one element, otherwise it raises an error.""" print( (set.intersection, [s for s in sets])) return (set.intersection, [s for s in sets])