def __getdKlArray__(self):
    """Score every word vector with a KL-style measure and rank the results.

    For each entry of ``self.words`` two quantities are accumulated over the
    entries of its ``groupVector`` (``Kt`` and ``KLt`` in the original
    naming); the score stored for the word is ``Kt - KLt``.

    Returns:
        A list of ``[score, wordVector]`` pairs sorted by score, highest
        first.
    """
    print(f"Calculating kl information for {self.__strTypeDictionary__()}")
    ranked = []
    bar = defaultProgress(len(self.words)).start()
    for done, wordVector in enumerate(self.words, start=1):
        klt = 0
        wordProbability = 0
        logArgument = 0
        for index, groupedWord in enumerate(wordVector.groupVector):
            classProbability = (
                self.totalClassDocuments[index] / self.totalDocuments
            )
            weighted = classProbability * wordVector.weights[index]
            # Kt part: accumulated for every group, present or not.
            wordProbability += weighted
            if groupedWord is None:
                continue
            # KLt part: only groups that actually hold this word contribute.
            klt -= weighted * math.log(
                groupedWord.documents / self.totalClassDocuments[index])
            logArgument += groupedWord.documents / self.totalDocuments
        # NOTE(review): math.log raises ValueError if logArgument is still 0
        # (no group holds the word) — presumably excluded upstream; confirm.
        kt = -wordProbability * math.log(logArgument)
        ranked.append([kt - klt, wordVector])
        bar.update(done)
    bar.finish()
    ranked.sort(key=lambda pair: pair[0], reverse=True)
    return ranked
def copyFiles(datasetPath, files, destinationPath):
    """Copy each named file from one directory to another.

    Args:
        datasetPath: Directory the files are read from.
        files: Sized collection of file names relative to ``datasetPath``.
        destinationPath: Directory the files are written to, under the
            same names.
    """
    bar = defaultProgress(len(files)).start()
    for copied, name in enumerate(files, start=1):
        source = f"{datasetPath}/{name}"
        target = f"{destinationPath}/{name}"
        copyfile(source, target)
        bar.update(copied)
    bar.finish()
def startTest(self, maxLength=-1, kl=False):
    """Classify every test document with both dictionaries and record accuracy.

    The dictionaries are obtained from ``self.__setSelectFeature__``. For
    each document the predicted group is the index of the smallest value
    returned by ``classifyDictionary``; a prediction is correct when the
    train group at that index has the same name as the test group.

    Args:
        maxLength: Forwarded unchanged to ``__setSelectFeature__``.
        kl: Forwarded unchanged to ``__setSelectFeature__``.

    Returns:
        A ``(mbmTest, mmTest)`` tuple of the ``Test`` objects that were
        appended to ``self.resultMBMTest`` and ``self.resultMMTest``.
        When no test document exists both accuracies are ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    mbmDictionary, mmDictionary = self.__setSelectFeature__(maxLength, kl)
    testedFiles = 0
    correctMBM = 0
    correctMM = 0
    for testGroup in self.testGroups:
        print(f"Testing file in group {testGroup.name}")
        bar = defaultProgress(len(testGroup.documents)).start()
        for done, document in enumerate(testGroup.documents, start=1):
            mbmWeights = mbmDictionary.classifyDictionary(
                document.dictionary)
            mmWeights = mmDictionary.classifyDictionary(
                document.dictionary)
            # min() keeps the first index on ties, matching the strict ">"
            # scan it replaces. Each model is scanned over its own length so
            # one result list no longer bounds the other.
            mbmGuess = min(range(len(mbmWeights)),
                           key=lambda position: mbmWeights[position])
            mmGuess = min(range(len(mmWeights)),
                          key=lambda position: mmWeights[position])
            if self.trainGroups[mbmGuess].name == testGroup.name:
                correctMBM += 1
            if self.trainGroups[mmGuess].name == testGroup.name:
                correctMM += 1
            testedFiles += 1
            bar.update(done)
        bar.finish()
        print(f"Done testing group {testGroup.name}")
    if testedFiles:
        accuracyMBM = correctMBM / testedFiles
        accuracyMM = correctMM / testedFiles
    else:
        # Nothing was tested: report zero accuracy rather than dividing by 0.
        accuracyMBM = 0.0
        accuracyMM = 0.0
    mbmTest = Test("MBM", len(mbmDictionary.words), accuracyMBM)
    self.resultMBMTest.append(mbmTest)
    mmTest = Test("MM", len(mmDictionary.words), accuracyMM)
    self.resultMMTest.append(mmTest)
    return mbmTest, mmTest
def featureSelection(self, maxLength):
    """Keep only the leading entries of ``self.activeInformation``.

    The second element of each of the first ``maxLength`` entries is
    collected and handed to ``self.__setNewWordsVectors__``. If fewer
    entries exist, all of them are used.

    Args:
        maxLength: Upper bound on the number of entries to keep. A value
            below one selects nothing.
    """
    print(f"Selecting feature for {self.__strTypeDictionary__()}")
    limit = min(maxLength, len(self.activeInformation))
    bar = defaultProgress(limit).start()
    remainingWords = []
    for position in range(limit):
        remainingWords.append(self.activeInformation[position][1])
        # Report the number of entries completed (1..limit), as the other
        # progress loops in this file do.
        bar.update(position + 1)
    bar.finish()
    print("Done selecting feature")
    self.__setNewWordsVectors__(remainingWords)
def createParameters(self):
    """Recompute the weights of every word and the start weights.

    Calls ``resetStartWeight`` and ``__setUpTotalWordsCount__`` first, then
    for each entry of ``self.words`` calls its ``updateWeights`` and passes
    it to ``self.__updateStartWeights__``.
    """
    self.__setUpTotalWordsCount__()
    print(
        f"Starting calculating parameters for {self.__strTypeDictionary__()}"
    )
    bar = defaultProgress(len(self.words)).start()
    self.resetStartWeight()
    # NOTE(review): second call to __setUpTotalWordsCount__ kept on purpose;
    # it looks redundant but its side effects are not visible here — confirm.
    self.__setUpTotalWordsCount__()
    for done, word in enumerate(self.words, start=1):
        word.updateWeights()
        self.__updateStartWeights__(word)
        bar.update(done)
    bar.finish()
    print("Parameters created")
def cleanDictionary(self):
    """Drop every word whose summed clean value is not greater than one.

    For each entry of ``self.words`` the results of
    ``self.__cleanValueWord__`` are summed over the non-``None`` entries of
    its ``groupVector``. Words with a sum above 1 are kept, in their
    original order; ``self.words`` is replaced by the kept list and the
    number of removed words is printed.
    """
    print(f"Cleaning {self.__strTypeDictionary__()}")
    bar = defaultProgress(len(self.words)).start()
    keptWords = []
    for done, wordVector in enumerate(self.words, start=1):
        # Identity test: a missing group entry is exactly None, and "is not"
        # cannot be affected by a custom __eq__/__ne__ on the entries.
        wordInDocuments = sum(
            self.__cleanValueWord__(groupedWord)
            for groupedWord in wordVector.groupVector
            if groupedWord is not None
        )
        if wordInDocuments > 1:
            keptWords.append(wordVector)
        # Report the number of words completed (1..n), as the other progress
        # loops in this file do.
        bar.update(done)
    bar.finish()
    removedWords = len(self.words) - len(keptWords)
    self.words = keptWords
    print(f"Removed {removedWords} words")
def readDocuments(self, stopWords=None, headers=None, fastReading=False):
    """Read every document of the group and rebuild the group dictionary.

    The group dictionary is cleaned first. Each document is read with
    ``readWords``; every word of the document's dictionary is added to the
    group dictionary as a ``GroupedWord``, after which the document's read
    words are cleared. ``setTotalCountedWords`` is called once all
    documents are processed.

    Args:
        stopWords: Forwarded to ``document.readWords``. Defaults to a new
            empty list for each call.
        headers: Forwarded to ``document.readWords``. Defaults to a new
            empty list for each call.
        fastReading: Forwarded to ``document.readWords``.
    """
    # Fresh lists per call: a literal [] default would be one shared object
    # that any callee could mutate for all later calls.
    stopWords = [] if stopWords is None else stopWords
    headers = [] if headers is None else headers
    self.dictionary.clean()
    print(f"Start reading group {self.name}, type: {self.type}")
    bar = defaultProgress(len(self.documents)).start()
    for done, document in enumerate(self.documents, start=1):
        document.readWords(stopWords, headers, fastReading)
        for word in document.dictionary.words:
            self.dictionary.searchAndAddWord(
                GroupedWord(word.text, self, word.counted, 1))
        document.clearReadedWords()
        bar.update(done)
    self.setTotalCountedWords()
    bar.finish()
    print(f"Done reading group {self.name}")
def createDictionary(self):
    """Build the MBM and MM weighted dictionaries from the train groups.

    Reads the dataset first when ``self.datasetReaded`` is ``False``. Every
    word of every train group's dictionary is added to both weighted
    dictionaries; each dictionary is then cleaned, has its parameters
    created and its feature information set up. The resulting word counts
    are printed.
    """
    if self.datasetReaded is False:
        self.readDataset()
    self.mbmWeightedDictionary = MBMWeightedDictionary(self.trainGroups)
    self.mmWeightedDictionary = MMWeightedDictionary(self.trainGroups)
    print("Creating weight")
    for group in self.trainGroups:
        print(f"Adding weight from group {group.name}")
        groupWords = group.dictionary.words
        bar = defaultProgress(len(groupWords)).start()
        for added, word in enumerate(groupWords, start=1):
            self.mbmWeightedDictionary.searchAndAddWord(word)
            self.mmWeightedDictionary.searchAndAddWord(word)
            bar.update(added)
        bar.finish()
        print(f"Done adding weight from group {group.name}")

    # Each stage runs on MBM first, then MM, before the next stage starts.
    weightedDictionaries = (
        self.mbmWeightedDictionary,
        self.mmWeightedDictionary,
    )
    for weighted in weightedDictionaries:
        weighted.cleanDictionary()
    for weighted in weightedDictionaries:
        weighted.createParameters()
    for weighted in weightedDictionaries:
        weighted.setUpFeatureInformation()

    mbmSummary = (
        f"Dictionary MBM created with {len(self.mbmWeightedDictionary.words)} words"
    )
    print(bcolors.OKGREEN + mbmSummary + bcolors.ENDC)
    mmSummary = (
        f"Dictionary MM created with {len(self.mmWeightedDictionary.words)} words"
    )
    print(bcolors.OKGREEN + mmSummary + bcolors.ENDC)
def __getMutualInformationArray__(self):
    """Score every word vector with a mutual-information sum and rank them.

    For each entry of ``self.words`` and each position of its
    ``groupVector`` two terms are added: one for the count of the word in
    that group, one for the remainder of that group's total. A term is
    skipped when its count is zero.

    Returns:
        A list of ``[score, wordVector]`` pairs sorted by score, highest
        first.
    """
    print(
        f"Calculating mutual information for {self.__strTypeDictionary__()}"
    )
    scored = []
    bar = defaultProgress(len(self.words)).start()
    for done, wordVector in enumerate(self.words, start=1):
        score = 0
        wordTotal = wordVector.getSumOfCounted()
        for index, groupedWord in enumerate(wordVector.groupVector):
            # A missing group entry counts as zero occurrences.
            inGroup = 0 if groupedWord is None else groupedWord.counted
            groupTotal = self.totalClassWords[index]
            restOfGroup = groupTotal - inGroup
            if inGroup != 0:
                score += (inGroup / self.totalWords) * math.log(
                    (inGroup * self.totalWords) / (wordTotal * groupTotal))
            if restOfGroup != 0:
                score += (restOfGroup / self.totalWords) * math.log(
                    (restOfGroup * self.totalWords)
                    / ((self.totalWords - wordTotal) * groupTotal))
        scored.append([score, wordVector])
        bar.update(done)
    bar.finish()
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored