Example #1
    def test(self):
        if not os.path.exists(FileManager.getTrainedModelFileUrl(self.type)):
            raise Exception('You can\'t test a model without training it')

        # label encoder
        Y_Encoder = preprocessing.LabelEncoder()
        Y_Encoder.fit(ConfigurationManager.getLanguages())

        # preparing features
        X, languages = self.__prepareFeatures('testing', True)

        # import trained model
        self.importScikitTrainedModel()

        # make predictions
        Y_real = Y_Encoder.transform(languages)
        Y_predicted = self.model.predict(X)

        # metrics
        accuracy = accuracy_score(Y_real, Y_predicted)
        report = classification_report(Y_real,
                                       Y_predicted,
                                       target_names=Y_Encoder.classes_)
        # export the classification report
        self.exportClassificationReport(str(report))
        print(' >  [BAYES]  classification report exported!')
        print(' >  [BAYES]  total accuracy = {:.2f}%'.format(accuracy * 100))

        return self
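For reference, a minimal standalone sketch of the same LabelEncoder round-trip, using made-up labels and predictions (the real label set comes from ConfigurationManager.getLanguages()):

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report

# hypothetical label set standing in for ConfigurationManager.getLanguages()
languages = ['java', 'python', 'ruby']

encoder = preprocessing.LabelEncoder()
encoder.fit(languages)

# made-up ground truth and predictions (already integer-encoded)
Y_real = encoder.transform(['python', 'java', 'ruby', 'python'])
Y_predicted = [1, 0, 1, 1]  # the 'ruby' example misclassified as 'python'

print(accuracy_score(Y_real, Y_predicted))  # 0.75
print(classification_report(Y_real, Y_predicted,
                            target_names=encoder.classes_))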
Example #2
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# `model`, `convert_text_to_index_array`, `ConfigurationManager` and
# `FileManager` are module-level names defined elsewhere in this script
def main():
    data = {"success": False}
    languages = ConfigurationManager.getLanguages()

    matched = 0
    totalExamples = 0

    for languageFolder in FileManager.getLanguagesFolders(FileManager.datasets['testing']['url']):
        language = str(languageFolder.name).lower()
        for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
            totalExamples += 1

            # read the original (unparsed) source file for this example
            originalFileContent = FileManager.readFile(FileManager.getOriginalFileUrl(exampleFolder.path))
            # convert the source into a padded index sequence and predict
            word_vec = convert_text_to_index_array(originalFileContent)
            X_test = pad_sequences([word_vec], maxlen=100)
            y_prob = model.predict(X_test[0].reshape(1, X_test.shape[1]), batch_size=1, verbose=2)[0]

            # the predicted language is the class with the highest probability
            idx = np.argmax(y_prob)
            if str(languages[idx]) == language:
                matched += 1

            # data["predictions"] = []
            # for i in range(len(languages)):
            #     # print(languages[i], file=sys.stdout)
            #     r = {"label": languages[i], "probability": format(y_prob[i] * 100, '.2f')}
            #     data["predictions"].append(r)

    print('')
    print('totalExamples = ' + str(totalExamples))
    print('matched = ' + str(matched))
    if totalExamples > 0:
        print('matched / totalExamples  = ' + str(matched / totalExamples))
    print('')
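The prediction step above reduces to an argmax over the network's probability vector. A tiny self-contained sketch with made-up probabilities (not real model output):

import numpy as np

languages = ['java', 'python', 'ruby']  # hypothetical class order

y_prob = np.array([0.10, 0.75, 0.15])  # fake softmax output for one example

idx = np.argmax(y_prob)
print(languages[idx])                        # python
print('{:.2f}%'.format(y_prob[idx] * 100))   # 75.00%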
Example #3
    def train(self):
        if os.path.exists(FileManager.getTrainedModelFileUrl(self.type)):
            return self

        # preparing features
        X, languages = self.__prepareFeatures('training', False)

        # label encoder
        Y_Encoder = preprocessing.LabelEncoder()
        Y_Encoder.fit(ConfigurationManager.getLanguages())

        # (X, Y) creation
        Y = Y_Encoder.transform(languages)

        # prepare model
        self.__prepareModel()
        # training
        self.model.fit(X, Y)
        # export the trained model
        self.exportScikitTrainedModel()

        return self
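__prepareModel() is not shown in this snippet; given the [BAYES] logging in the test method, one plausible stand-in is a scikit-learn naive Bayes classifier. A minimal sketch, assuming continuous features and made-up data:

from sklearn.naive_bayes import GaussianNB

X = [[0.1, 0.9], [0.8, 0.2], [0.2, 0.7]]  # made-up feature rows
Y = [1, 0, 1]                             # integer-encoded language labels
model = GaussianNB().fit(X, Y)
print(model.predict([[0.15, 0.85]]))      # [1]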
Example #4
#!/usr/bin/env python3

import re as regex
from utils import ConfigurationManager
from keras.preprocessing.text import text_to_word_sequence

RESERVED_WORDS: list = ConfigurationManager.getReservedWords()
ESCAPED_TOKENS = ConfigurationManager.escaped_tokens
TOKENIZER_CONFIG: dict = ConfigurationManager.tokenizerConfiguration


class Parser:
    def __init__(self):
        self.ORIGINAL_URI: str = "_MISSING_"
        self.PARSED_URI: str = "_MISSING_"

    def initialize(self, originalUri, parsedUri):
        self.ORIGINAL_URI = originalUri
        self.PARSED_URI = parsedUri

    ##

    def _isLineAComment(self, line: str):
        # create tokens list
        words: list = line.split(' ')
        # check if contains 'reserved words'
        if any(el in RESERVED_WORDS for el in words):
            return False
        # remove the first token
        words.pop(0)
        # heuristic: treat the line as a comment if it still contains at least 3 words
        return len(words) >= 3
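A quick illustration of the heuristic on sample lines; the reserved-word list below is made up, while the real one comes from ConfigurationManager.getReservedWords():

# hypothetical reserved words standing in for the project's real list
RESERVED_WORDS = ['def', 'return', 'import']

def is_line_a_comment(line: str) -> bool:
    words = line.split(' ')
    if any(el in RESERVED_WORDS for el in words):
        return False
    words.pop(0)
    return len(words) >= 3

print(is_line_a_comment('# computes the running total'))  # True
print(is_line_a_comment('def main():'))                   # False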
Example #5
    def __calculateTokensEntropyLoss(self, dataset: str):
        if os.path.exists(FileManager.getFeaturesFileUrl(self.type)):
            return self

        sources, languages = self.extractSources(dataset)
        withTokensOccurencyMap: dict = {}
        withoutTokensOccurencyMap: dict = {}

        for index, source in enumerate(sources):
            language = languages[index]
            tokens = set(source.split(' '))
            for token in tokens:
                if token not in withTokensOccurencyMap:
                    withTokensOccurencyMap[token] = []
                withTokensOccurencyMap[token].append(language)

        for index, source in enumerate(sources):
            language = languages[index]
            tokens = set(source.split(' '))
            for token in withTokensOccurencyMap:
                if token not in tokens:
                    if token not in withoutTokensOccurencyMap:
                        withoutTokensOccurencyMap[token] = []
                    withoutTokensOccurencyMap[token].append(language)

        tokensMetrics: dict = {}

        for language in ConfigurationManager.getLanguages():
            tokensMetrics[language] = {}
            for token in withTokensOccurencyMap:
                # a token present in every source has no 'without' entry
                withToken: list = withTokensOccurencyMap[token]
                withoutToken: list = withoutTokensOccurencyMap.get(token, [])
                tokensMetrics[language][token] = {
                    'numberOfExamplesWithFeatureF': len(withToken),
                    'numberOfExamplesWithoutFeatureF': len(withoutToken),
                    'numberOfPositiveExamplesWithFeatureF': len(
                        [lg for lg in withToken if lg == language]),
                    'numberOfPositiveExamplesWithoutFeatureF': len(
                        [lg for lg in withoutToken if lg == language]),
                }

        languageFeatures = {}
        tokensEntropyLoss: dict = {}
        numberOfExamples = self.Dataset.countExamples(dataset)
        N_OF_TOKENS_FOR_LANGUAGE: int = self.config[
            'number_of_tokens_for_language']

        for language in ConfigurationManager.getLanguages():
            tokensEntropyLoss[language] = {}
            numberOfPositiveExamples: int = self.Dataset.getCounters(
                dataset)[language]
            for token in tokensMetrics[language]:
                tokensEntropyLoss[language][token] = 0
                metrics = tokensMetrics[language][token]
                numberOfExamplesWithFeatureF = metrics[
                    'numberOfExamplesWithFeatureF']
                numberOfExamplesWithoutFeatureF = metrics[
                    'numberOfExamplesWithoutFeatureF']
                numberOfPositiveExamplesWithFeatureF = metrics[
                    'numberOfPositiveExamplesWithFeatureF']
                numberOfPositiveExamplesWithoutFeatureF = metrics[
                    'numberOfPositiveExamplesWithoutFeatureF']
                # preparing entropy formula vars (guarding empty denominators)
                pr_C: float = numberOfPositiveExamples / numberOfExamples
                pr_f: float = numberOfExamplesWithFeatureF / numberOfExamples
                pr_C_f: float = numberOfPositiveExamplesWithFeatureF / max(numberOfExamplesWithFeatureF, 1)
                pr_C_notf: float = numberOfPositiveExamplesWithoutFeatureF / max(numberOfExamplesWithoutFeatureF, 1)

                # clamp probabilities into (0, 1) so log2 stays defined
                # TODO: use this https://scikit-learn.org/stable/modules/preprocessing.html#scaling-features-to-a-range
                pr_C = min(max(pr_C, .0001), .9999)
                pr_f = min(max(pr_f, .0001), .9999)
                pr_C_f = min(max(pr_C_f, .0001), .9999)
                pr_C_notf = min(max(pr_C_notf, .0001), .9999)

                # calculating token's entropy
                e = -(pr_C * math.log2(pr_C)) - (
                    (1 - pr_C) * math.log2(1 - pr_C))
                e_f = -(pr_C_f * math.log2(pr_C_f)) - (
                    (1 - pr_C_f) * math.log2(1 - pr_C_f))
                e_not_f = -(pr_C_notf * math.log2(pr_C_notf)) - (
                    (1 - pr_C_notf) * math.log2(1 - pr_C_notf))
                tokensEntropyLoss[language][token] = e - (e_f *
                                                          pr_f) + (e_not_f *
                                                                   (1 - pr_f))

            # sort entropy values in descending order
            tokensEntropyLoss[language] = {
                k: v
                for k, v in sorted(tokensEntropyLoss[language].items(),
                                   key=lambda item: item[1],
                                   reverse=True)
            }
            # take the first n tokens (those with maximum entropy loss)
            languageFeatures[language] = list(
                tokensEntropyLoss[language].keys())[:N_OF_TOKENS_FOR_LANGUAGE]

        # export tokens with maximum entropy loss
        FileManager.writeFile(FileManager.getFeaturesFileUrl(self.type),
                              json.dumps(languageFeatures))

        return self
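The core of the method is the per-token entropy combination. A standalone sketch with made-up counts (the helper name and all numbers are illustrative only):

import math

def binary_entropy(p: float) -> float:
    # H(p) = -p*log2(p) - (1-p)*log2(1-p), with p clamped into (0, 1)
    p = min(max(p, .0001), .9999)
    return -(p * math.log2(p)) - ((1 - p) * math.log2(1 - p))

# made-up counts: 100 examples, 40 of language C; token f occurs in 30
# examples, 25 of which are language C; of the 70 without f, 15 are C
pr_C = 40 / 100
pr_f = 30 / 100
pr_C_f = 25 / 30
pr_C_notf = 15 / 70

e = binary_entropy(pr_C)
e_f = binary_entropy(pr_C_f)
e_not_f = binary_entropy(pr_C_notf)

# the same combination used in the method above
entropy_loss = e - (e_f * pr_f) + (e_not_f * (1 - pr_f))
print('{:.4f}'.format(entropy_loss))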
Example #6
import click
from click_help_colors import HelpColorsGroup, HelpColorsCommand
import os
import sys
import inspect
from sys import argv
from utils import ConfigurationManager
from commands import product, component, releaseNote, auth, changelog, repo


currentdir = os.path.dirname(os.path.abspath(
    inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)


confManager = ConfigurationManager()


@click.group(help="Atlassian CLI",
             cls=HelpColorsGroup,
             help_headers_color='yellow',
             help_options_color='green')
@click.pass_context
# TODO: add url validation under commands when they are needed
def cli(ctx):
    ctx.ensure_object(dict)

    # if not confManager.is_config_valid():
    #     sys.exit("ERROR: You have not configured the CLI. run atlcli auth login first.")
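To run the group as a script, the usual click entry point would be (assuming this file is the CLI's main module):

if __name__ == '__main__':
    cli(obj={})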
Example #7
    def __cloneFilesSources(self):
        SOURCE_URL = FileManager.datasets['source']['url']
        TRAINING_URL = FileManager.datasets['training']['url']
        TESTING_URL = FileManager.datasets['testing']['url']

        # for each directory in the '/Lang' folder ...
        languagesExamplesCounter = {}
        for languageFolder in [f for f in os.scandir(SOURCE_URL) if f.is_dir()]:
            language = str(languageFolder.name).lower()
            languagesExamplesCounter[language] = 0
            # parse only selected languages
            if language in ConfigurationManager.getLanguages():
                # preparing empty {languageFolder.name} for each dataset
                if not (os.path.isdir(os.path.join(TRAINING_URL, language))):
                    os.mkdir(os.path.join(TRAINING_URL, language))
                if not (os.path.isdir(os.path.join(TESTING_URL, language))):
                    os.mkdir(os.path.join(TESTING_URL, language))

                # count the examples for each language
                for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                    for _ in FileManager.getExampleFiles(exampleFolder.path):
                        languagesExamplesCounter[language] += 1

                # skip languages with fewer examples than {TRAINING_EXAMPLES_NUMBER}
                if languagesExamplesCounter[language] < TRAINING_EXAMPLES_NUMBER:
                    print(' >  [dataset] the total number of examples for the '
                          + language + ' is less than ' + str(TRAINING_EXAMPLES_NUMBER))
                    continue

                # randomly pick which example indexes go into the training set
                indexesOfTrainingExamples = random.sample(
                    range(1, languagesExamplesCounter[language] + 1),
                    TRAINING_EXAMPLES_NUMBER
                )

                # list all examples in {languageFolder.name} folder
                exampleIndex = 0
                for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                    # list all examples versions in {exampleFolder.name} folder
                    for exampleVersionFile in FileManager.getExampleFiles(exampleFolder.path):
                        exampleIndex += 1
                        # assign the file to the right dataset
                        if exampleIndex in indexesOfTrainingExamples:
                            DATASET_TYPE = TRAINING_URL
                        else:
                            DATASET_TYPE = TESTING_URL

                        # prepare destination folder
                        example = str(exampleVersionFile.name).lower()
                        exampleFolderUri = os.path.join(DATASET_TYPE, language, example)
                        os.mkdir(exampleFolderUri)
                        # copy the ORIGINAL source file content
                        originalFileUri = FileManager.getOriginalFileUrl(exampleFolderUri)
                        FileManager.createFile(originalFileUri)
                        shutil.copyfile(exampleVersionFile.path, originalFileUri)
                        # create the 'PARSED' version of the original file
                        parsedFileUri = FileManager.getParsedFileUrl(exampleFolderUri)
                        FileManager.createFile(parsedFileUri)
                        parser = Parser()
                        parser.initialize(originalFileUri, parsedFileUri)
                        parser.parse()

        return self
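The split logic reduces to sampling index positions for the training set and routing the rest to testing. A toy illustration with made-up names and sizes:

import random

TRAINING_EXAMPLES_NUMBER = 3  # hypothetical quota
examples = ['ex1', 'ex2', 'ex3', 'ex4', 'ex5']

trainingIndexes = random.sample(range(1, len(examples) + 1),
                                TRAINING_EXAMPLES_NUMBER)

for index, example in enumerate(examples, start=1):
    dataset = 'training' if index in trainingIndexes else 'testing'
    print(example, '->', dataset)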