from math import log10

# DocClass, DocFreqTable and WordTable are the project's Django models;
# getASCII, getTuple, Tuple, loadStopWords and get_my_string, together with
# the FREQ_THRESHOLD_PERCENT, STOP_WORD_LIST_FILENAME and
# MAX_NUM_OF_WORDS_READ constants, are defined elsewhere in the project.

def wKNN(filename, k):
    NUM_OF_DOCS = DocClass.objects.count()
    FREQ_THRESHOLD = NUM_OF_DOCS * FREQ_THRESHOLD_PERCENT
    # Build the vocabulary: keep only words whose document frequency is at
    # most the threshold (very frequent words carry little information).
    docInstances = DocFreqTable.objects.filter(docFreq__lte=FREQ_THRESHOLD)
    wordList = [getASCII(inst.word) for inst in docInstances]

    # Read the input file and record which of its words are keywords,
    # skipping stop words and capping the number of words read.
    stopwords = loadStopWords(STOP_WORD_LIST_FILENAME)
    with open(filename, encoding="latin-1") as filePointer:
        wordsInFile = filePointer.read().split()
    fileKeyWords = {}
    cnt = 0
    for word in wordsInFile:
        if getASCII(word) in stopwords:
            continue
        fileKeyWords[getASCII(word)] = True
        cnt += 1
        if cnt == MAX_NUM_OF_WORDS_READ:
            break

    # Presence-vector tuple for the input file.
    inputFileTuple = getTuple(wordList, fileKeyWords)

    # Map each training document to its class name (assuming the number of
    # documents can be stored in main memory).
    docList = []
    docToClassName = {}
    for inst in DocClass.objects.all():
        docList.append(inst.docName)
        docToClassName[inst.docName] = inst.className

    # One presence vector per training document.
    numRows = len(docList)
    numCols = len(wordList)
    knnMat = [[0] * numCols for _ in range(numRows)]
    for rowNumber, doc in enumerate(docList):
        docKeyWords = {}
        for wordInstance in WordTable.objects.filter(docName=doc):
            docKeyWords[wordInstance.word] = True
        knnMat[rowNumber] = getTuple(wordList, docKeyWords).list

    # Distance from the input file to every training document.
    distanceClassList = []
    for i in range(numRows):
        newTuple = Tuple()
        newTuple.list = knnMat[i]
        newTuple.count = numCols
        dis = inputFileTuple.distance(newTuple)
        distanceClassList.append((dis, docToClassName[docList[i]]))
    distanceClassList.sort()

    # Accumulate inverse-distance weights over the k nearest neighbours
    # (the 0.01 offset avoids division by zero).
    classCount = {}
    cnt = 0
    for dis, className in distanceClassList:
        wt = 1 / (dis + 0.01)
        classCount[className] = classCount.get(className, 0) + wt
        cnt += 1
        if cnt == k:
            break

    # Negate the weights so an ascending sort puts the heaviest class first.
    classCountList = sorted((-wt, className) for className, wt in classCount.items())
    # Both numerator and denominator are negated, so the percentages are positive.
    sum_rows = sum(negWt for negWt, _ in classCountList)
    answer_list = [(negWt / sum_rows * 100, className)
                   for negWt, className in classCountList]
    return answer_list
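# wKNN depends on a small presence-vector abstraction (Tuple, getTuple,
# getASCII) implemented elsewhere in the project. The sketch below is a
# hypothetical stand-in showing the assumed shape of those helpers; in
# particular, Hamming distance is an assumption here, not necessarily the
# project's actual metric.
class Tuple:
    def __init__(self):
        self.list = []   # binary presence vector over the global word list
        self.count = 0   # length of the vector

    def distance(self, other):
        # Hamming distance between two presence vectors (assumed metric).
        return sum(1 for a, b in zip(self.list, other.list) if a != b)

def getTuple(wordList, keyWords):
    # Mark, for every vocabulary word, whether it occurs in the document.
    t = Tuple()
    t.list = [1 if w in keyWords else 0 for w in wordList]
    t.count = len(wordList)
    return t

def getASCII(word):
    # Normalise a token to plain ASCII (assumed behaviour).
    return word.encode("ascii", "ignore").decode("ascii")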
def NaiveBayes(file):
    # Extract tokens from the document:
    #   1. take at most MAX_NUM_OF_WORDS_READ distinct tokens,
    #   2. keep only purely alphabetic tokens (filters meaningless strings),
    #   3. skip stop words.
    docCount = DocClass.objects.count()
    with open(file, encoding="latin-1") as fp:
        W = fp.read().split()
    cnt = 0
    processedW = {}
    stopwords = loadStopWords(STOP_WORD_LIST_FILENAME)
    for w_ in W:
        w = get_my_string(w_.lower())
        if not w.isalpha():
            continue
        if w in processedW:
            continue
        if w in stopwords:
            continue
        processedW[w] = 1
        cnt += 1
        if cnt == MAX_NUM_OF_WORDS_READ:
            break

    # All distinct class names, as a list of {'className': ...} dicts.
    C = DocClass.objects.all().values('className').distinct()
    # Number of terms in the vocabulary, used in the smoothing denominator.
    B = DocFreqTable.objects.all().count()
    # (negated log-probability, class) pairs.
    scores = []

    # Outer loop: for every class.
    for c in C:
        cName = c['className']
        # All documents belonging to the current class.
        docsC = DocClass.objects.filter(className=cName).values("docName")
        priorC = len(docsC) / docCount
        # prob accumulates the log-probability that the test document
        # belongs to this class, starting from the class prior.
        prob = log10(priorC)
        # Total number of (document, term) rows for this class: the
        # cumulative term count used as the smoothing denominator.
        denC = WordTable.objects.filter(docName__in=docsC).count()
        # Inner loop: for every term.
        for t in processedW:
            # Number of documents of the current class containing the term.
            numC = WordTable.objects.filter(docName__in=docsC, word=t).count()
            # How much evidence the term provides that this is the correct
            # class, with add-one (Laplace) smoothing.
            prob = prob + log10((1 + numC) / (denC + B))
        scores.append((-prob, cName))

    # Ascending sort on the negated scores puts the best class first;
    # negate again to restore the original log-probabilities.
    scores.sort()
    answer_list = [(-negProb, cName) for negProb, cName in scores]
    return answer_list
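# Example invocation, assuming a configured Django environment and a
# hypothetical test file. Both classifiers return (score, className) pairs
# with the best class first: wKNN scores are percentages of the total
# neighbour weight, NaiveBayes scores are log10 probabilities.
if __name__ == "__main__":
    knn_scores = wKNN("test_doc.txt", 5)
    nb_scores = NaiveBayes("test_doc.txt")
    print("wKNN top class:       %s (%.1f%%)" % (knn_scores[0][1], knn_scores[0][0]))
    print("NaiveBayes top class: %s (log-prob %.2f)" % (nb_scores[0][1], nb_scores[0][0]))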