def test(self):
    if not os.path.exists(FileManager.getTrainedModelFileUrl(self.type)):
        raise Exception('You can\'t test a model without training it')

    # label encoder
    Y_Encoder = preprocessing.LabelEncoder()
    Y_Encoder.fit(ConfigurationManager.getLanguages())

    # preparing features
    X, languages = self.__prepareFeatures('testing', True)

    # import trained model
    self.importScikitTrainedModel()

    # make predictions
    Y_real = Y_Encoder.transform(languages)
    Y_predicted = self.model.predict(X)

    # metrics
    accuracy = accuracy_score(Y_real, Y_predicted)
    report = classification_report(Y_real, Y_predicted, target_names=Y_Encoder.classes_)

    print(' > [BAYES] classification report exported!')
    # multiply before rounding so the percentage doesn't pick up float noise
    print(' > [BAYES] total accuracy = ' + "{:.2f}".format(accuracy * 100) + '%')

    # export the classification report
    self.exportClassificationReport(str(report))

    return self
def main():
    data = {"success": False}

    languages = ConfigurationManager.getLanguages()
    matched = 0
    totalExamples = 0

    for languageFolder in FileManager.getLanguagesFolders(FileManager.datasets['testing']['url']):
        language = str(languageFolder.name).lower()

        for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
            totalExamples += 1

            X_test = []
            originalFileContent = FileManager.readFile(FileManager.getOriginalFileUrl(exampleFolder.path))
            code_snip = originalFileContent
            # print(code_snip, file=sys.stdout)

            word_vec = convert_text_to_index_array(code_snip)
            X_test.append(word_vec)
            X_test = pad_sequences(X_test, maxlen=100)
            # print(X_test[0].reshape(1, X_test.shape[1]), file=sys.stdout)

            y_prob = model.predict(X_test[0].reshape(1, X_test.shape[1]), batch_size=1, verbose=2)[0]
            a = np.array(y_prob)
            idx = np.argmax(a)

            if str(languages[idx]) == language:
                matched += 1

            # data["predictions"] = []
            # for i in range(len(languages)):
            #     # print(languages[i], file=sys.stdout)
            #     r = {"label": languages[i], "probability": format(y_prob[i] * 100, '.2f')}
            #     data["predictions"].append(r)

    print('')
    print('')
    print('totalExamples = ' + str(totalExamples))
    print('matched = ' + str(matched))
    print('matched / totalExamples = ' + str(matched / totalExamples))
    print('')
    print('')
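# Note on the preprocessing in main() above (toy values, not project data): each snippet is
# converted to a list of word indexes by convert_text_to_index_array() and then padded or
# truncated to a fixed length of 100 before being fed to the Keras model. With Keras'
# pad_sequences defaults (padding='pre', truncating='pre'):
#
#   from keras.preprocessing.sequence import pad_sequences
#
#   pad_sequences([[3, 7, 2]], maxlen=5)            # -> array([[0, 0, 3, 7, 2]])  (left-padded)
#   pad_sequences([[9, 1, 4, 8, 6, 5]], maxlen=5)   # -> array([[1, 4, 8, 6, 5]])  (left-truncated)
#
# np.argmax(y_prob) then selects the index of the language with the highest predicted probability.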
def train(self):
    if os.path.exists(FileManager.getTrainedModelFileUrl(self.type)):
        return self

    # preparing features
    X, languages = self.__prepareFeatures('training', False)

    # label encoder
    Y_Encoder = preprocessing.LabelEncoder()
    Y_Encoder.fit(ConfigurationManager.getLanguages())

    # (X, Y) creation
    Y = Y_Encoder.transform(languages)

    # prepare model
    self.__prepareModel()

    # training
    self.model.fit(X, Y)

    # export the trained model
    self.exportScikitTrainedModel()

    return self
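# Sketch of the label-encoding step shared by train() and test() above (the language list is a
# hypothetical example, not the project's configuration): scikit-learn's LabelEncoder maps class
# names to integer ids in sorted order, so fitting it on the same ConfigurationManager.getLanguages()
# list in both methods yields consistent ids.
#
#   from sklearn import preprocessing
#
#   encoder = preprocessing.LabelEncoder()
#   encoder.fit(['java', 'python', 'ruby'])     # encoder.classes_ -> ['java', 'python', 'ruby']
#   encoder.transform(['python', 'java'])       # -> array([1, 0])
#   encoder.inverse_transform([2])              # -> array(['ruby'])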
#!/usr/bin/env python3

import re as regex

from utils import ConfigurationManager
from keras.preprocessing.text import text_to_word_sequence

RESERVED_WORDS: list = ConfigurationManager.getReservedWords()
ESCAPED_TOKENS = ConfigurationManager.escaped_tokens
TOKENIZER_CONFIG: dict = ConfigurationManager.tokenizerConfiguration


class Parser:
    def __init__(self):
        self.ORIGINAL_URI: str = "_MISSING_"
        self.PARSED_URI: str = "_MISSING_"

    def initialize(self, originalUri, parsedUri):
        self.ORIGINAL_URI = originalUri
        self.PARSED_URI = parsedUri

    ##

    def _isLineAComment(self, line: str):
        # create tokens list
        words: list = line.split(' ')
        # check if it contains 'reserved words'
        if any(el in RESERVED_WORDS for el in words):
            return False
        # remove the first occurrence
        words.pop(0)
        # check if the line contains at least 3 words
def __calculateTokensEntropyLoss(self, dataset: str):
    if os.path.exists(FileManager.getFeaturesFileUrl(self.type)):
        return self

    sources, languages = self.extractSources(dataset)

    # map each token to the languages of the examples that contain it ...
    withTokensOccurencyMap: dict = {}
    # ... and to the languages of the examples that do not contain it
    withoutTokensOccurencyMap: dict = {}

    for index, source in enumerate(sources):
        language = languages[index]
        tokens = set(source.split(' '))
        for token in tokens:
            if token not in withTokensOccurencyMap:
                withTokensOccurencyMap[token] = []
            withTokensOccurencyMap[token].append(language)

    for index, source in enumerate(sources):
        language = languages[index]
        tokens = set(source.split(' '))
        for token in withTokensOccurencyMap:
            if token not in tokens:
                if token not in withoutTokensOccurencyMap:
                    withoutTokensOccurencyMap[token] = []
                withoutTokensOccurencyMap[token].append(language)

    # per-language, per-token counters used by the entropy formula below
    tokensMetrics: dict = {}
    for language in ConfigurationManager.getLanguages():
        tokensMetrics[language] = {}
        for token in withTokensOccurencyMap:
            tokensMetrics[language][token] = {
                'numberOfExamplesWithFeatureF': len(withTokensOccurencyMap[token]),
                # a token present in every example never reaches 'withoutTokensOccurencyMap',
                # so default to an empty list instead of raising a KeyError
                'numberOfExamplesWithoutFeatureF': len(withoutTokensOccurencyMap.get(token, [])),
                'numberOfPositiveExamplesWithFeatureF': len([
                    lg for lg in withTokensOccurencyMap[token] if lg == language
                ]),
                'numberOfPositiveExamplesWithoutFeatureF': len([
                    lg for lg in withoutTokensOccurencyMap.get(token, []) if lg == language
                ]),
            }

    languageFeatures = {}
    tokensEntropyLoss: dict = {}

    numberOfExamples = self.Dataset.countExamples(dataset)
    N_OF_TOKENS_FOR_LANGUAGE: int = self.config['number_of_tokens_for_language']

    for language in ConfigurationManager.getLanguages():
        tokensEntropyLoss[language] = {}
        numberOfPositiveExamples: int = self.Dataset.getCounters(dataset)[language]

        for token in tokensMetrics[language]:
            tokensEntropyLoss[language][token] = 0
            metrics = tokensMetrics[language][token]

            numberOfExamplesWithFeatureF = metrics['numberOfExamplesWithFeatureF']
            numberOfExamplesWithoutFeatureF = metrics['numberOfExamplesWithoutFeatureF']
            numberOfPositiveExamplesWithFeatureF = metrics['numberOfPositiveExamplesWithFeatureF']
            numberOfPositiveExamplesWithoutFeatureF = metrics['numberOfPositiveExamplesWithoutFeatureF']

            # preparing entropy formula vars
            pr_C: float = numberOfPositiveExamples / numberOfExamples
            pr_f: float = numberOfExamplesWithFeatureF / numberOfExamples
            pr_C_f: float = numberOfPositiveExamplesWithFeatureF / numberOfExamplesWithFeatureF
            # guard against tokens that occur in every example (no "without feature" examples)
            pr_C_notf: float = numberOfPositiveExamplesWithoutFeatureF / max(numberOfExamplesWithoutFeatureF, 1)

            # TODO: use this https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range
            # clamp probabilities away from 0 and 1 so the logarithms below stay defined
            pr_C = (pr_C if pr_C > 0 else .0001)
            pr_f = (pr_f if pr_f > 0 else .0001)
            pr_C_f = (pr_C_f if pr_C_f > 0 else .0001)
            pr_C_notf = (pr_C_notf if pr_C_notf > 0 else .0001)
            pr_C = (pr_C if pr_C < 1 else .9999)
            pr_f = (pr_f if pr_f < 1 else .9999)
            pr_C_f = (pr_C_f if pr_C_f < 1 else .9999)
            pr_C_notf = (pr_C_notf if pr_C_notf < 1 else .9999)

            # calculating token's entropy
            e = -(pr_C * math.log2(pr_C)) - ((1 - pr_C) * math.log2(1 - pr_C))
            e_f = -(pr_C_f * math.log2(pr_C_f)) - ((1 - pr_C_f) * math.log2(1 - pr_C_f))
            e_not_f = -(pr_C_notf * math.log2(pr_C_notf)) - ((1 - pr_C_notf) * math.log2(1 - pr_C_notf))

            tokensEntropyLoss[language][token] = e - (e_f * pr_f) + (e_not_f * (1 - pr_f))

        # sort entropy values in descending order (highest score first, matching the export below)
        tokensEntropyLoss[language] = {
            k: v for k, v in sorted(tokensEntropyLoss[language].items(),
                                    key=lambda item: item[1],
                                    reverse=True)
        }

        # take the first n tokens
        languageFeatures[language] = list(tokensEntropyLoss[language].keys())[:N_OF_TOKENS_FOR_LANGUAGE]

    # export tokens with maximum entropy loss
    FileManager.writeFile(FileManager.getFeaturesFileUrl(self.type), json.dumps(languageFeatures))

    return self
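# Worked example of the per-token entropy-loss score computed above (hypothetical counts, not
# project data): suppose a dataset of 100 examples, 25 of them positive for the language
# (pr_C = 0.25), and a token that appears in 20 examples (pr_f = 0.2), 10 of them positive
# (pr_C_f = 0.5), leaving 15 positives among the 80 examples without the token (pr_C_notf = 0.1875):
#
#   e       = -(0.25 * log2(0.25)) - (0.75 * log2(0.75))           ~ 0.8113
#   e_f     = -(0.5 * log2(0.5)) - (0.5 * log2(0.5))               =  1.0
#   e_not_f = -(0.1875 * log2(0.1875)) - (0.8125 * log2(0.8125))   ~ 0.6962
#
#   score   = e - (e_f * pr_f) + (e_not_f * (1 - pr_f))
#           ~ 0.8113 - 0.2 + 0.5570 ~ 1.1683
#
# Tokens are then ranked by this score per language and the top
# {number_of_tokens_for_language} become that language's features.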
from click_help_colors import HelpColorsGroup, HelpColorsCommand

import os
import sys
import inspect
import click

from sys import argv
from utils import ConfigurationManager
from commands import product, component, releaseNote, auth, changelog, repo

currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

confManager = ConfigurationManager()


@click.group(help="Atlassian CLI",
             cls=HelpColorsGroup,
             help_headers_color='yellow',
             help_options_color='green')
@click.pass_context
# TODO: add url validation under commands when they are needed
def cli(ctx):
    ctx.ensure_object(dict)
    # if not confManager.is_config_valid():
    #     sys.exit("ERROR: You have not configured the CLI. run atlcli auth login first.")
    pass
def __cloneFilesSources(self):
    SOURCE_URL = FileManager.datasets['source']['url']
    TRAINING_URL = FileManager.datasets['training']['url']
    TESTING_URL = FileManager.datasets['testing']['url']

    # foreach directory in '/Lang' folder ...
    languagesExamplesCounter = {}
    for languageFolder in [f for f in os.scandir(SOURCE_URL) if f.is_dir()]:
        language = str(languageFolder.name).lower()
        languagesExamplesCounter[language] = 0

        # parse only selected languages
        if language in ConfigurationManager.getLanguages():

            # preparing an empty {languageFolder.name} folder for each dataset
            if not os.path.isdir(os.path.join(TRAINING_URL, language)):
                os.mkdir(os.path.join(TRAINING_URL, language))
            if not os.path.isdir(os.path.join(TESTING_URL, language)):
                os.mkdir(os.path.join(TESTING_URL, language))

            # count the examples for each language
            for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                for _ in FileManager.getExampleFiles(exampleFolder.path):
                    languagesExamplesCounter[language] += 1

            # skip languages whose examples counter is less than {TRAINING_EXAMPLES_NUMBER}
            if languagesExamplesCounter[language] < TRAINING_EXAMPLES_NUMBER:
                print(' > [dataset] the total number of examples for the ' + language +
                      ' is less than ' + str(TRAINING_EXAMPLES_NUMBER))
                continue

            # pick the example indexes used for training
            # (range upper bound is exclusive, so add 1: otherwise the last example can never be
            # selected and random.sample fails when the counter equals TRAINING_EXAMPLES_NUMBER)
            indexesOfTrainingExamples = random.sample(
                range(1, languagesExamplesCounter[language] + 1),
                TRAINING_EXAMPLES_NUMBER
            )

            # list all examples in {languageFolder.name} folder
            exampleIndex = 0
            for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):

                # list all example versions in {exampleFolder.name} folder
                for exampleVersionFile in FileManager.getExampleFiles(exampleFolder.path):
                    exampleIndex += 1

                    # move the file to the right dataset
                    if exampleIndex in indexesOfTrainingExamples:
                        DATASET_TYPE = TRAINING_URL
                    else:
                        DATASET_TYPE = TESTING_URL

                    # prepare the destination folder
                    example = str(exampleVersionFile.name).lower()
                    exampleFolderUri = os.path.join(DATASET_TYPE, language, example)
                    os.mkdir(exampleFolderUri)

                    # copy the ORIGINAL source file content
                    originalFileUri = FileManager.getOriginalFileUrl(exampleFolderUri)
                    FileManager.createFile(originalFileUri)
                    shutil.copyfile(exampleVersionFile.path, originalFileUri)

                    # create the 'PARSED' version of the original file
                    parsedFileUri = FileManager.getParsedFileUrl(exampleFolderUri)
                    FileManager.createFile(parsedFileUri)

                    parser = Parser()
                    parser.initialize(originalFileUri, parsedFileUri)
                    parser.parse()

    return self