def load(self):
    """Load the dataset, cloning raw sources and building a JSON cache on first run.

    On the first run (no cached copy on disk) the sources are cloned,
    loaded in memory, filtered, and the resulting training/testing split
    is persisted. On later runs the cached copy is read back instead.

    :return: self (fluent interface)
    """
    alreadyExists = self.__create_folders()
    if not alreadyExists:
        # dataset folders were just created: fetch the raw file sources
        self.__cloneFilesSources()
    cacheUrl = FileManager.getDatasetCopyFileUrl()
    if os.path.exists(cacheUrl):
        # a cached copy exists: restore the split from disk
        cached = json.loads(FileManager.readFile(cacheUrl))
        self.Dataset.training = cached['training']
        self.Dataset.testing = cached['testing']
    else:
        # build the dataset from scratch ...
        self.__loadInMemory()
        self.__filterSources()
        # ... and persist it for the next run
        snapshot: dict = {'training': self.Dataset.training, 'testing': self.Dataset.testing}
        FileManager.writeFile(cacheUrl, json.dumps(snapshot))
    return self
def __calculateTokensEntropyLoss(self, dataset: str):
    """Select the most discriminative tokens per language via an entropy-loss score.

    For every token occurring in `dataset`, measures how much the token's
    presence/absence changes the class (language) entropy, then keeps the
    first `number_of_tokens_for_language` tokens per language and writes
    the selection to the features file. Skips all work if that file
    already exists.

    :param dataset: dataset split identifier, forwarded to `extractSources`
    :return: self (fluent interface)
    """
    if os.path.exists(FileManager.getFeaturesFileUrl(self.type)):
        # features were already computed and exported for this type
        return self

    sources, languages = self.extractSources(dataset)

    # token -> languages of the examples that CONTAIN the token
    withTokensOccurencyMap: dict = {}
    # token -> languages of the examples that LACK the token
    withoutTokensOccurencyMap: dict = {}

    for index, source in enumerate(sources):
        language = languages[index]
        tokens = set(source.split(' '))
        for token in tokens:
            withTokensOccurencyMap.setdefault(token, []).append(language)

    for index, source in enumerate(sources):
        language = languages[index]
        tokens = set(source.split(' '))
        for token in withTokensOccurencyMap:
            if token not in tokens:
                withoutTokensOccurencyMap.setdefault(token, []).append(language)

    # per-language, per-token occurrence counts
    tokensMetrics: dict = {}
    for language in ConfigurationManager.getLanguages():
        tokensMetrics[language] = {}
        for token in withTokensOccurencyMap:
            withLanguages = withTokensOccurencyMap[token]
            # BUG FIX: a token present in *every* example never gets an
            # entry in withoutTokensOccurencyMap — the original code
            # raised KeyError here; default to an empty list instead.
            withoutLanguages = withoutTokensOccurencyMap.get(token, [])
            tokensMetrics[language][token] = {
                'numberOfExamplesWithFeatureF': len(withLanguages),
                'numberOfExamplesWithoutFeatureF': len(withoutLanguages),
                'numberOfPositiveExamplesWithFeatureF': len(
                    [lg for lg in withLanguages if lg == language]),
                'numberOfPositiveExamplesWithoutFeatureF': len(
                    [lg for lg in withoutLanguages if lg == language]),
            }

    languageFeatures = {}
    tokensEntropyLoss: dict = {}
    numberOfExamples = self.Dataset.countExamples(dataset)
    N_OF_TOKENS_FOR_LANGUAGE: int = self.config[
        'number_of_tokens_for_language']
    for language in ConfigurationManager.getLanguages():
        tokensEntropyLoss[language] = {}
        numberOfPositiveExamples: int = self.Dataset.getCounters(
            dataset)[language]
        for token in tokensMetrics[language]:
            metrics = tokensMetrics[language][token]
            numberOfExamplesWithFeatureF = metrics[
                'numberOfExamplesWithFeatureF']
            numberOfExamplesWithoutFeatureF = metrics[
                'numberOfExamplesWithoutFeatureF']
            numberOfPositiveExamplesWithFeatureF = metrics[
                'numberOfPositiveExamplesWithFeatureF']
            numberOfPositiveExamplesWithoutFeatureF = metrics[
                'numberOfPositiveExamplesWithoutFeatureF']

            # preparing entropy formula vars
            pr_C: float = numberOfPositiveExamples / numberOfExamples
            pr_f: float = numberOfExamplesWithFeatureF / numberOfExamples
            pr_C_f: float = numberOfPositiveExamplesWithFeatureF / numberOfExamplesWithFeatureF
            # BUG FIX: guard the division — the "without" count is 0 for a
            # token contained in every example; the clamps below turn the
            # 0.0 into .0001 just like any other zero probability.
            pr_C_notf: float = (
                numberOfPositiveExamplesWithoutFeatureF / numberOfExamplesWithoutFeatureF
                if numberOfExamplesWithoutFeatureF > 0 else 0.0)

            # clamp probabilities away from exactly 0 and 1 so log2 stays finite
            # TODO: use this https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range
            pr_C = (pr_C if pr_C > 0 else .0001)
            pr_f = (pr_f if pr_f > 0 else .0001)
            pr_C_f = (pr_C_f if pr_C_f > 0 else .0001)
            pr_C_notf = (pr_C_notf if pr_C_notf > 0 else .0001)
            pr_C = (pr_C if pr_C < 1 else .9999)
            pr_f = (pr_f if pr_f < 1 else .9999)
            pr_C_f = (pr_C_f if pr_C_f < 1 else .9999)
            pr_C_notf = (pr_C_notf if pr_C_notf < 1 else .9999)

            # binary entropies: overall, given token present, given token absent
            e = -(pr_C * math.log2(pr_C)) - (
                (1 - pr_C) * math.log2(1 - pr_C))
            e_f = -(pr_C_f * math.log2(pr_C_f)) - (
                (1 - pr_C_f) * math.log2(1 - pr_C_f))
            e_not_f = -(pr_C_notf * math.log2(pr_C_notf)) - (
                (1 - pr_C_notf) * math.log2(1 - pr_C_notf))

            # NOTE(review): classic information gain would SUBTRACT the
            # e_not_f term (e - pr_f*e_f - (1-pr_f)*e_not_f); the original
            # adds it. Preserved as-is — confirm intent.
            tokensEntropyLoss[language][token] = e - (e_f * pr_f) + (e_not_f * (1 - pr_f))

        # NOTE(review): the original comment claimed "desc order", but
        # sorted() without reverse=True is ascending. Preserved as-is —
        # confirm which end of the ranking is intended.
        tokensEntropyLoss[language] = {
            k: v
            for k, v in sorted(tokensEntropyLoss[language].items(),
                               key=lambda item: item[1])
        }
        # take first n tokens
        languageFeatures[language] = list(
            tokensEntropyLoss[language].keys())[:N_OF_TOKENS_FOR_LANGUAGE]

    # export the selected tokens per language
    FileManager.writeFile(FileManager.getFeaturesFileUrl(self.type),
                          json.dumps(languageFeatures))
    return self
def exportClassificationReport(self, report: str):
    """Persist the given classification report for this extractor's type."""
    reportUrl = FileManager.getReportFileUrl(self.type)
    FileManager.writeFile(reportUrl, report)
def exportVocabulary(self, indexes):
    """Serialize the vocabulary indexes as JSON and persist them.

    :return: self (fluent interface)
    """
    vocabularyUrl = FileManager.getVocabularyFileUrl(self.type)
    serialized = json.dumps(indexes)
    FileManager.writeFile(vocabularyUrl, serialized)
    return self