# Example #1
def main():
    """Train and persist a Conv1D+LSTM source-code language classifier.

    Reads every ORIGINAL example file from the training dataset, tokenizes
    the code, trains the network, then writes the tokenizer word index, the
    trained model and its weights under ``tmp/`` and prints the validation
    metrics.
    """
    # load the dataset
    datasetManager = DatasetManager()
    datasetManager.initialize('CNN').load()

    # collect every original source file and its language label
    code_archive = []
    languages = []
    for languageFolder in FileManager.getLanguagesFolders(
            FileManager.datasets['training']['url']):
        for exampleFolder in FileManager.getExamplesFolders(
                languageFolder.path):
            originalFileUrl = FileManager.getOriginalFileUrl(
                exampleFolder.path)
            code_archive.append(FileManager.readFile(originalFileUrl))
            languages.append(str(languageFolder.name).lower())

    # hyper-parameters
    max_features = 100000    # vocabulary size kept by the tokenizer
    sequence_length = 100    # tokens per sample after padding/truncation
    embed_dim = 128
    lstm_out = 64
    batch_size = 32
    epochs = 30
    test_size = 0.001        # NOTE(review): leaves ~0.1% for validation — confirm intent

    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(code_archive)
    # persist the word index so inference can rebuild the same vocabulary
    FileManager.createFile(
        os.path.join(FileManager.getRootUrl(), 'tmp/wordindex.json'),
        json.dumps(tokenizer.word_index))

    X = tokenizer.texts_to_sequences(code_archive)
    X = pad_sequences(X, sequence_length)
    Y = pd.get_dummies(languages)
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=test_size)

    # Conv1D feature extractor stacked on an LSTM classifier
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=sequence_length))
    model.add(
        Conv1D(filters=128,
               kernel_size=3,
               padding='same',
               dilation_rate=1,
               activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(
        Conv1D(filters=64,
               kernel_size=3,
               padding='same',
               dilation_rate=1,
               activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(lstm_out))
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(Dense(len(Y.columns), activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # the History object returned by fit() was never used — don't keep it
    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)

    # save model and weights for later inference
    model.save(os.path.join(FileManager.getRootUrl(), 'tmp/code_model.h5'))
    model.save_weights(
        os.path.join(FileManager.getRootUrl(), 'tmp/code_model_weights.h5'))

    score, acc = model.evaluate(X_test,
                                Y_test,
                                verbose=2,
                                batch_size=batch_size)
    print(model.metrics_names)
    print("Validation loss: %f" % score)
    print("Validation acc: %f" % acc)
# Example #2
    def __cloneFilesSources(self):
        """Split the raw '/Lang' sources into training and testing datasets.

        For each language selected in the configuration, a random subset of
        ``TRAINING_EXAMPLES_NUMBER`` example files is copied into the
        training dataset and the remainder into the testing dataset; every
        copied example also gets a 'PARSED' version produced by ``Parser``.

        Returns:
            self, to allow fluent chaining.
        """
        SOURCE_URL = FileManager.datasets['source']['url']
        TRAINING_URL = FileManager.datasets['training']['url']
        TESTING_URL = FileManager.datasets['testing']['url']

        # foreach directory in '/Lang' folder ...
        languagesExamplesCounter = {}
        for languageFolder in [f for f in os.scandir(SOURCE_URL) if f.is_dir()]:
            language = str(languageFolder.name).lower()
            languagesExamplesCounter[language] = 0
            # parse only selected languages
            if language in ConfigurationManager.getLanguages():
                # preparing empty {languageFolder.name} folder for each dataset
                if not (os.path.isdir(os.path.join(TRAINING_URL, language))):
                    os.mkdir(os.path.join(TRAINING_URL, language))
                if not (os.path.isdir(os.path.join(TESTING_URL, language))):
                    os.mkdir(os.path.join(TESTING_URL, language))

                # count example files for this language
                for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                    for _ in FileManager.getExampleFiles(exampleFolder.path):
                        languagesExamplesCounter[language] += 1

                # skip languages with fewer examples than {TRAINING_EXAMPLES_NUMBER}
                if languagesExamplesCounter[language] < TRAINING_EXAMPLES_NUMBER:
                    print(' >  [dataset] the total number of examples for the '
                          + language + ' is less than ' + str(TRAINING_EXAMPLES_NUMBER))
                    continue

                # pick which example indexes go to the training dataset.
                # NOTE: exampleIndex below runs 1..counter inclusive, so the
                # population must be range(1, counter + 1); the previous
                # range(1, counter) silently excluded the last example and
                # raised ValueError when the counter was exactly equal to
                # TRAINING_EXAMPLES_NUMBER.
                indexesOfTrainingExamples = random.sample(
                    range(1, languagesExamplesCounter[language] + 1),
                    TRAINING_EXAMPLES_NUMBER
                )

                # list all examples in {languageFolder.name} folder
                exampleIndex = 0
                for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                    # list all examples versions in {exampleFolder.name} folder
                    for exampleVersionFile in FileManager.getExampleFiles(exampleFolder.path):
                        exampleIndex += 1
                        # move file to the right dataset
                        if exampleIndex in indexesOfTrainingExamples:
                            DATASET_TYPE = TRAINING_URL
                        else:
                            DATASET_TYPE = TESTING_URL

                        # prepare destination folder
                        example = str(exampleVersionFile.name).lower()
                        exampleFolderUri = os.path.join(DATASET_TYPE, language, example)
                        os.mkdir(exampleFolderUri)
                        # copy the ORIGINAL source file content
                        originalFileUri = FileManager.getOriginalFileUrl(exampleFolderUri)
                        FileManager.createFile(originalFileUri)
                        shutil.copyfile(exampleVersionFile.path, originalFileUri)
                        # create the 'PARSED' version of the original file
                        parsedFileUri = FileManager.getParsedFileUrl(exampleFolderUri)
                        FileManager.createFile(parsedFileUri)
                        parser = Parser()
                        parser.initialize(originalFileUri, parsedFileUri)
                        parser.parse()

        return self