def getBestClassifierForEachProgram(possibleTargetColumns, possiblePrograms, possibleClassifiers, bestProgram_Classifier_Parameters, writeFile=False):
    '''
    Determine, for each target column and each program, the classifier with
    the best F1 score.

    When `writeFile` is True the metrics are recomputed from the per-classifier
    result files and '<baseFolder>/bestClassifierForEachProgram.csv' is
    (re)written for each target column; otherwise the previously written CSV
    files are simply read back.

    Returns a DataFrame with columns ['Column', 'Program', 'Classifier',
    'Parameter'], sorted by column and case-insensitive program name.
    '''
    bestProgram_Classifier = pd.DataFrame()

    for targetColumn in possibleTargetColumns:
        baseFolder = '{}/ML/Results/{}/Classification'.format(os.getcwd(), targetColumn)
        fileName = '{}/bestClassifierForEachProgram.csv'.format(baseFolder)

        if writeFile:
            # Walk through every program
            for programName in possiblePrograms:
                program_ClassifierMetrics = pd.DataFrame()

                # Walk through every classifier of that program
                for classifier in possibleClassifiers:
                    programFileName = '{}/{}_{}.csv'.format(baseFolder, programName, classifier)
                    if util.pathExists(programFileName):
                        # Collect the metrics for this program/classifier pair
                        accuracy, precision, recall, f1 = getMLMetricsFromClassificationFile(
                            programFileName, targetColumn, programName)
                        newMutantsMetrics = pd.DataFrame(
                            data=[[programName, accuracy * 100, precision * 100,
                                   recall * 100, f1 * 100, classifier]],
                            columns=['ProgramName', 'Accuracy', 'Precision',
                                     'Recall', 'F1', 'Classifier'])
                        program_ClassifierMetrics = pd.concat(
                            [program_ClassifierMetrics, newMutantsMetrics])

                # Pick the best classifier for this program.
                # BUGFIX: the original tested the existence flag of the LAST
                # classifier's file only, skipping programs whose last
                # classifier file was missing (and raising NameError when the
                # classifier list was empty). Testing the accumulated metrics
                # considers the program whenever at least one file was found.
                if not program_ClassifierMetrics.empty:
                    bestClassifier = program_ClassifierMetrics.sort_values(
                        'F1', ascending=False).head(n=1)['Classifier'].values[0]
                    bestFile = '{}/{}_{}.csv'.format(baseFolder, programName, bestClassifier)
                    newFile = '{}/{}.csv'.format(baseFolder, programName)

                    # Look up the parameter that was chosen for this winner
                    bestParameter = bestProgram_Classifier_Parameters.query(
                        'Column == \'{}\' and Program == \'{}\' and Classifier == \'{}\''.format(
                            targetColumn, programName, bestClassifier))
                    if not bestParameter.empty:
                        bestParameter = bestParameter['Parameter'].values[0]
                    else:
                        bestParameter = ''

                    newBestProgram_Classifier = pd.DataFrame(
                        data=[[targetColumn, programName, bestClassifier, bestParameter]],
                        columns=['Column', 'Program', 'Classifier', 'Parameter'])
                    bestProgram_Classifier = pd.concat(
                        [bestProgram_Classifier, newBestProgram_Classifier])
                    # Promote the winning classifier's file to '<program>.csv'
                    copyfile(bestFile, newFile)

            # Write the per-column file, ordered case-insensitively by program
            bestProgram_Classifier['Program.UPPER'] = bestProgram_Classifier["Program"].str.upper()
            bestProgram_Classifier = bestProgram_Classifier.sort_values(by=['Column', 'Program.UPPER'])
            del bestProgram_Classifier['Program.UPPER']
            util.writeDataFrameInCsvFile(
                fileName,
                bestProgram_Classifier.query('Column == \'{}\''.format(targetColumn)))
        else:
            # Read the previously generated file back instead of recomputing
            newBestProgram_Classifier = util.createDataFrameFromCSV(
                fileName, hasHeader=True, columnIndex=0)
            bestProgram_Classifier = pd.concat(
                [bestProgram_Classifier, newBestProgram_Classifier])

    # Final case-insensitive ordering of the accumulated result
    bestProgram_Classifier['Program.UPPER'] = bestProgram_Classifier["Program"].str.upper()
    bestProgram_Classifier = bestProgram_Classifier.sort_values(by=['Column', 'Program.UPPER'])
    del bestProgram_Classifier['Program.UPPER']
    return bestProgram_Classifier
def summarizeClassifications(targetColumn, possiblePrograms, df_Programs_BestClassifiers, overwrite=False):
    '''
    Analyze the mutant-classification results of every program (using each
    program's best classifier) and summarize them into a single CSV file.

    Returns the summary DataFrame. When the summary file already exists and
    `overwrite` is False, the file is simply read back from disk.
    '''
    baseFolder = '{}/ML/Results/{}/Classification'.format(os.getcwd(), targetColumn)
    fileName = 'ClassificationSummary'
    summaryFile = '{}/ML/Results/{}/Classification/{}.csv'.format(
        os.getcwd(), targetColumn, fileName)
    mutantsData = pd.DataFrame()

    # Keep only the rows belonging to the requested target column
    df_Programs_BestClassifiers = df_Programs_BestClassifiers.query(
        'Column == \'{}\''.format(targetColumn))

    if overwrite or not util.pathExists(summaryFile):
        for programName in possiblePrograms:
            # Best classifier previously computed for this program
            bestClassifier = df_Programs_BestClassifiers.query(
                'ProgramName == \'{}\''.format(programName)).loc[:, 'Classifier'].values[0]
            programFileBestClassifier = '{}/{}_{}.csv'.format(
                baseFolder, programName, bestClassifier)
            if util.pathExists(programFileBestClassifier):
                classificationResult = util.createDataFrameFromCSV(
                    programFileBestClassifier, hasHeader=True)
                # pd.concat replaces the removed DataFrame.append (pandas 2.x)
                mutantsData = pd.concat([mutantsData, classificationResult],
                                        ignore_index=True)
        util.writeDataFrameInCsvFile(summaryFile, mutantsData)
        return mutantsData
    elif util.pathExists(summaryFile):
        return util.createDataFrameFromCSV(summaryFile, True)
def _writeEssentialDataset(mutantsInfo, datasetBaseFolder, sessionName, datasetName, minimal_Equivalent):
    '''
    Compute the essential mutant info and write it twice: appended to the
    shared '<datasetName>/mutants.csv' (header only when the file is being
    created) and overwritten to the per-program '<datasetName>/Programs/'
    file.
    '''
    essentialInfo = computeEssencialInfo(mutantsInfo, minimal_Equivalent=minimal_Equivalent)

    # Single file accumulating the mutants of every program
    essentialFileName = '{}/{}/mutants.csv'.format(datasetBaseFolder, datasetName)
    util.writeDataFrameInCsvFile(
        essentialFileName,
        essentialInfo,
        sep=',',
        mode='a+',
        header=not util.pathExists(essentialFileName),  # header only on creation
        index=False)

    # One file per program containing all of its mutants
    essentialFileName = '{}/{}/Programs/{}.csv'.format(
        datasetBaseFolder, datasetName, sessionName)
    util.writeDataFrameInCsvFile(essentialFileName,
                                 essentialInfo,
                                 sep=',',
                                 mode='w+',
                                 header=True,
                                 index=False)


def main(_baseExperimentFolder, _baseFolder, executionMode):
    '''
    Run the Proteum session and/or analyze its results for one program.

    Args:
        _baseExperimentFolder (str): root folder of the experiment.
        _baseFolder (str): folder of the program under test; its last path
            component names the source file/session.
        executionMode (int|str): 1 - run and analyze | 2 - run | 3 - analyze.
    '''
    executionMode = int(executionMode)  # accept both str and int callers
    if 1 <= executionMode <= 3:
        print(
            '####################################################################'
        )
        print('#\t Executing script to find minimal mutants properties\t #')
        print('#\t\t ' + util.formatNow() + '\t\t\t #')
        print(
            '####################################################################'
        )

        ####################
        # Set main variables
        ####################
        baseExperimentFolder = _baseExperimentFolder
        baseFolder = _baseFolder
        sourceFile = baseFolder[baseFolder.rfind("/") + 1:]
        sessionName = sourceFile
        executableFile = sessionName
        executionType = "research"
        directory = baseFolder
        units = util.getContentFromFile(
            "{baseFolder}/unit.txt".format(baseFolder=baseFolder))

        if executionMode in (1, 2):
            #################
            # Execute proteum
            #################
            executeProteum(baseFolder, sourceFile, sessionName,
                           executableFile, executionType, directory, units)

        if executionMode in (1, 3):
            #####################
            # Get minimal mutants
            #####################
            print('\n##### \tBuscando mutantes minimais ' + util.formatNow() +
                  '\t#####')
            minimalMutants = getMinimalMutants(baseFolder, sourceFile)

            # GFC simplification intentionally disabled: pre-generated GFC
            # files are used instead.
            #prot2PokeMain("{baseFolder}/__{sourceFile}.gfc".format(
            #    baseFolder=baseFolder, sourceFile=sourceFile))

            ################################
            # Get basic mutants informations
            ################################
            print('\n##### \tBuscando e calculando informações dos mutantes '
                  + util.formatNow() + '\t#####')
            mutantsHeader, mutantsInfo = getMutantsInfo(
                baseFolder, minimalMutants, sessionName, units)

            ################################################
            # Write csv file with basic mutants informations
            ################################################
            print('\n##### \tGerando arquivo com informações dos mutantes ' +
                  util.formatNow() + '\t#####')
            fileNameResults = "{baseFolder}/log/{sessionName}_result.csv".format(
                sessionName=sessionName, baseFolder=baseFolder)
            util.writeInCsvFile(fileNameResults, mutantsInfo, mutantsHeader)

            ###########################################################
            # Write mutants info to compute machine learning algorithms
            ###########################################################
            datasetBaseFolder = '{}/ML/Dataset'.format(
                util.getPreviousFolder(baseExperimentFolder))
            # MINIMAL and EQUIVALENT datasets share the same writing logic
            _writeEssentialDataset(mutantsInfo, datasetBaseFolder,
                                   sessionName, 'MINIMAL', 0)
            _writeEssentialDataset(mutantsInfo, datasetBaseFolder,
                                   sessionName, 'EQUIVALENT', 1)
def classify(newDataSetFileName, resultDataSetFileName, targetColumn, classifier, algorithmParameter, programToClassify):
    """
    Function responsable to classify a new data set as equivalent or minimal
    from predictive models are existing

    Args:
        newDataSetFileName (str): File name containing new mutants to be classified
        resultDataSetFileName (str): File name to be generated with the
            classification result. This file contains the same row number than
            'newDataSetFileName'.
        targetColumn (str): Column to be classified. Must be 'MINIMAL' ou 'EQUIVALENT'.
        classifier (str): The classifier algorithm used to predict the new data
            inputed. Must be 'KNN', 'DT', 'RF', 'SVM', 'LDA', 'LR' or 'GNB'
        algorithmParameter (int): The parameter to be used on classifier. This
            parameter Must be K, as the number of neighbors on KNN, or min
            sample split to Decision Tree and RandomForest.
        programToClassify (str): Program excluded from the training set and
            targeted by the prediction.
    """
    ######################
    # --- Setting datasets
    targetColumnName = targetColumn
    targetColumn = '_IM_{}'.format(targetColumn)
    trainDataSetFileName = 'ML/Dataset/{}/mutants.csv'.format(targetColumnName)

    # Column layout depends on which property is being predicted
    if targetColumn == '_IM_MINIMAL':
        columnNames = getColumnNames_lastMinimal()
    elif targetColumn == '_IM_EQUIVALENT':
        columnNames = getColumnNames_lastEquivalent()

    ###################
    # --- PreProcessing
    # --- Import: train on every program except the one being classified
    trainDataSet = importDataSet(trainDataSetFileName)
    trainDataSet = trainDataSet.query(
        '_IM_PROGRAM != \'{}\''.format(programToClassify))
    newDataSetFrame = importDataSet(newDataSetFileName)

    # --- PreProccess: the one-hot domains must cover the union of the
    # categories seen in the train and test sets
    operatorsToTrain = list(set(trainDataSet['_IM_OPERATOR'].values))
    typeStatementsToTrain = list(set(
        trainDataSet['_IM_TYPE_STATEMENT'].values))
    operatorsToTest = list(set(newDataSetFrame['_IM_OPERATOR'].values))
    typeStatementsToTest = list(
        set(newDataSetFrame['_IM_TYPE_STATEMENT'].values))
    allOperators = list(set(operatorsToTrain + operatorsToTest))
    allTypeStatement = list(set(typeStatementsToTrain + typeStatementsToTest))

    trainDataSetFrame, numProperties, numColumnsToDelete_train, _, _, groupedDataSetFrame = preProcessing(
        trainDataSet, targetColumn, columnNames, [], [], allOperators,
        allTypeStatement)
    newDataSetFrame, numProperties, numColumnsToDelete_test, _, _, groupedDataSetFrame = preProcessing(
        newDataSetFrame, targetColumn, columnNames, [], [], allOperators,
        allTypeStatement, False)

    # Separate the data into X (values) and y (target value)
    X_train = trainDataSetFrame.iloc[:, :-1].values
    X_test = newDataSetFrame.iloc[:, :-1].values
    y_train = trainDataSetFrame.iloc[:, numProperties +
                                     numColumnsToDelete_train].values

    ##############################################################################
    # --- Classify and write new CSV with informations about the prediction result
    y_test = trainingAndPredictions(classifier, algorithmParameter, X_train,
                                    y_train, X_test)

    # Array with the result of each prediction: 1 for correct, 0 for incorrect
    # (enumerate replaces the original zip(range(len(...)), ...) idiom)
    result = [
        1 if predicted == groupedDataSetFrame[targetColumn][iCount] else 0
        for iCount, predicted in enumerate(y_test)
    ]

    ##############################
    # --- Metrics about prediction
    predictedDF = pd.DataFrame(groupedDataSetFrame)
    predictedDF['PREDICTED'] = y_test
    predictedDF['RESULT'] = result

    # '<name>_result.csv' holds only the predicted labels, one per line
    onlyResultDataSetFileName = str(resultDataSetFileName).replace(
        '.csv', '_result.csv')
    util.writeInCsvFile(onlyResultDataSetFileName,
                        [str(value) for value in y_test])
    util.writeDataFrameInCsvFile(resultDataSetFileName, predictedDF)
def analyzeClassificationsFromEachProgram(targetColumn, possiblePrograms, bestProgram_Classifier, overwrite=False):
    '''
    Analyze the mutant-classification results and compute the ML metrics
    (accuracy, precision, recall, F1) for each program, using the best
    classifier/parameter previously selected for it.

    The metrics are written to 'ML_Metrics.csv'; when that file already
    exists and `overwrite` is False it is read back instead. Returns a
    DataFrame sorted by column and case-insensitive program name.
    '''
    baseFolder = '{}/ML/Results/{}/Classification'.format(
        os.getcwd(), targetColumn)
    fileName = 'ML_Metrics'
    # TODO: also generate '{}/Metrics_AllClassifiers.csv'.format(baseFolder) here
    metricsFile = '{}/ML/Results/{}/Classification/{}.csv'.format(
        os.getcwd(), targetColumn, fileName)
    mutantsMetrics = pd.DataFrame()

    if overwrite or not util.pathExists(metricsFile):
        for file in util.getFilesInFolder(baseFolder):
            # File name (without extension) identifies the program
            programName = util.getPathName(file)
            programName = programName[:programName.find('.')]
            if programName in possiblePrograms:
                programInfo_ClassifierParameter = bestProgram_Classifier.query(
                    'Column == \'{}\' and Program == \'{}\''.format(
                        targetColumn, programName))
                classifier = programInfo_ClassifierParameter[
                    'Classifier'].values[0]
                parameter = programInfo_ClassifierParameter[
                    'Parameter'].values[0]

                accuracy, precision, recall, f1 = getMLMetricsFromClassificationFile(
                    file, targetColumn, programName)
                newMutantsMetrics = pd.DataFrame(data=[[
                    programName, targetColumn, classifier, parameter,
                    accuracy * 100, precision * 100, recall * 100, f1 * 100
                ]],
                                                 columns=[
                                                     'ProgramName', 'Column',
                                                     'Classifier', 'Parameter',
                                                     'Accuracy', 'Precision',
                                                     'Recall', 'F1'
                                                 ])
                # pd.concat replaces the removed DataFrame.append (pandas 2.x)
                mutantsMetrics = pd.concat([mutantsMetrics, newMutantsMetrics])

        # Sort case-insensitively by program name before persisting
        mutantsMetrics['ProgramName.UPPER'] = mutantsMetrics[
            "ProgramName"].str.upper()
        mutantsMetrics = mutantsMetrics.sort_values(
            by=['Column', 'ProgramName.UPPER'])
        del mutantsMetrics['ProgramName.UPPER']
        util.writeDataFrameInCsvFile(metricsFile, mutantsMetrics)
    elif util.pathExists(metricsFile):
        mutantsMetrics = util.createDataFrameFromCSV(metricsFile, True)
        mutantsMetrics['ProgramName.UPPER'] = mutantsMetrics[
            "ProgramName"].str.upper()
        mutantsMetrics = mutantsMetrics.sort_values(
            by=['Column', 'ProgramName.UPPER'])
        del mutantsMetrics['ProgramName.UPPER']
    return mutantsMetrics
def analyzeResults(possibleTargetColumns, possibleClassifiers, overwriteFullFile=False):
    """
    Analyze each of 30 run for each classifier with each target column and
    calculate the best (calculating the mean) metric for each ones
    (classifier and target column).

    Parameters
    ----------
    possibleTargetColumns (list)
        Array containing all possible columns that can be sorted (MINIMAL and
        EQUIVALENT)
    possibleClassifiers (list)
        Array containing all possible classifiers that can be used (KNN, DT,
        RF, SVM, LDA, LR and GNB)
    overwriteFullFile (bool)
        Boolean indicating whether the file 'Summary_All30Runs' should be
        overwritten.

    Returns
    -------
    (DataFrame, DataFrame, DataFrame)
        experimentResults - Dataframe containing all values of all predictive
            models and 30 executions. Corresponds to the file
            'Summary_All30Runs.csv'
        bestParameterResults - Dataframe containing all values of the
            predictive models that was the best and 30 executions.
            Corresponds to the file 'Summary_BestClassifiers_All30Runs.csv'
        classifiersBestParameter - Dataframe containing the average values of
            the classifiers with the best results. Corresponds to the
            'Summary_Classifiers.csv' file
    """
    # Dataframe with all execution results
    experimentResults = pd.DataFrame()
    # Dataframe with the indication of the best parameter for each classifier
    summaryClassifiersBestParameter = pd.DataFrame()

    # Base folder
    baseResultsFolderName = '{}/ML/Results'.format(os.getcwd())
    # FullFile
    fullFileName = '{}/Summary/Summary_All30Runs.csv'.format(
        baseResultsFolderName)
    # FullFile for the best classifiers
    fullBestClassifiersFileName = '{}/Summary/Summary_BestClassifiers_All30Runs.csv'.format(
        baseResultsFolderName)
    # SummaryFile
    summaryFileName = '{}/Summary/Summary_Classifiers.csv'.format(
        baseResultsFolderName)
    # CustomParameterResults
    summaryCustomParametersFileName = '{}/Summary/Summary_CustomParameters.csv'.format(
        baseResultsFolderName)

    if overwriteFullFile or not util.pathExists(fullFileName):
        # Overwriting or no cached file: read every run and rebuild the files.
        # Cycles through all columns (Minimal and equivalent)
        for targetColumn in possibleTargetColumns:
            # Cycle through all classifiers
            for classifier in possibleClassifiers:
                # Result of the 30 executions of this classifier
                classifierResults = pd.DataFrame()

                # Cycles through all executions (from 1 to 30)
                for iCount in range(1, 31, 1):
                    # Fetch execution results
                    classifierRunResultFileName = '{}/{}_{}/{}.csv'.format(
                        baseResultsFolderName, targetColumn, iCount,
                        classifier)
                    classifierRunResult = util.createDataFrameFromCSV(
                        classifierRunResultFileName,
                        hasHeader=True,
                        separator=';',
                        initialLine=5)

                    # Accumulate BEFORE adding the run-metadata columns so
                    # classifierResults keeps only the raw metric columns
                    classifierResults = pd.concat(
                        [classifierResults, classifierRunResult])

                    # Insert information related to that execution.
                    # BUGFIX: the original wrapped the frame in
                    # pd.DataFrame(...) before insert(), which only mutates
                    # the source frame when the wrapper shares its data —
                    # not guaranteed (and broken under pandas copy-on-write).
                    classifierRunResult.insert(0, 'TargetColumn', targetColumn,
                                               True)
                    classifierRunResult.insert(1, 'Classifier', classifier,
                                               True)
                    classifierRunResult.insert(2, 'Run', iCount, True)

                    # Concatenates this execution to all others
                    experimentResults = pd.concat(
                        [experimentResults, classifierRunResult])

                # Calculates the best parameter for that classifier
                parameters = classifierResults['SampleSplit'].unique()
                parameterMetrics = pd.DataFrame()
                for parameter in parameters:
                    # Search only the results of that parameter
                    resultsFromThisParameter = classifierResults.query(
                        'SampleSplit == \'{}\''.format(parameter))
                    # Collect the average of the metrics of the 30 executions
                    meanAccuracy = np.mean(
                        resultsFromThisParameter['Accuracy'])
                    meanPrecision = np.mean(
                        resultsFromThisParameter['Precision'])
                    meanRecall = np.mean(resultsFromThisParameter['Recall'])
                    meanF1 = np.mean(resultsFromThisParameter['F1'])
                    stdF1 = np.std(resultsFromThisParameter['F1'])
                    parameterMetrics = pd.concat([
                        parameterMetrics,
                        pd.DataFrame(data=[[
                            targetColumn, classifier, parameter, meanAccuracy,
                            meanPrecision, meanRecall, meanF1, stdF1
                        ]],
                                     columns=[
                                         'TargetColumn', 'Classifier',
                                         'Parameter', 'Accuracy', 'Precision',
                                         'Recall', 'F1', 'StdDevF1'
                                     ])
                    ])

                # Highest mean F1 wins
                bestParameter = parameterMetrics.sort_values(
                    by=['F1'], ascending=False).head(n=1)
                summaryClassifiersBestParameter = pd.concat(
                    [summaryClassifiersBestParameter, bestParameter])

        # Write a file with all the results
        util.writeDataFrameInCsvFile(fullFileName,
                                     experimentResults,
                                     index=False)
        # Write a file with only the best parameters for each classifier
        util.writeDataFrameInCsvFile(summaryFileName,
                                     summaryClassifiersBestParameter,
                                     index=False)
        # Write a file with all the results but only for the best parameters
        bestParameterResults = getRunsOnlyFromBestParameter(
            experimentResults, summaryClassifiersBestParameter,
            possibleTargetColumns)
        util.writeDataFrameInCsvFile(fullBestClassifiersFileName,
                                     bestParameterResults,
                                     index=False)
    else:
        # Search for existing files
        experimentResults = util.createDataFrameFromCSV(
            fullFileName, True, ',')
        summaryClassifiersBestParameter = util.createDataFrameFromCSV(
            summaryFileName, True, ',')
        bestParameterResults = util.createDataFrameFromCSV(
            fullBestClassifiersFileName, True, ',')

    # Get the results from custom parameters
    customParameterResults = summarizeRunsFromCustomParameter(
        getRunsFromCustomParameters(experimentResults))
    util.writeDataFrameInCsvFile(summaryCustomParametersFileName,
                                 customParameterResults,
                                 index=False)

    return experimentResults, bestParameterResults, summaryClassifiersBestParameter
def getBestClassifierForPrograms(program=None, targetColumn=None, write=True, overwrite=False):
    '''
    Function responsible to analyze the results from all programs with all
    classifiers.

    If every program and column is being analyzed and `write` is True, the
    metrics files are (re)written (respecting `overwrite`); otherwise the
    previously generated 'ML_Metrics.csv' files are read back.

    Returns a dataframe with the best classifier (highest F1) for each
    program and its metrics.
    '''
    # SetUp
    possibleTargetColumns, possibleClassifiers, possiblePrograms = setUp()

    # Best classifier per program - ProgramName, Column, Classifier, metrics
    df_Programs_BestClassifiers = pd.DataFrame()

    # Resolve optional filters
    programs = [program] if program is not None else possiblePrograms
    columns = [targetColumn] if targetColumn is not None else possibleTargetColumns

    # Files are only written when the full set of programs/columns is analyzed
    mustWrite = write and program is None and targetColumn is None
    if mustWrite:
        for _column in columns:
            baseFolder = '{}/ML/Results/{}/Classification'.format(
                os.getcwd(), _column)
            # Programs with the best classifier for each one and its metrics
            df_Program_Metrics_BestClassifier = pd.DataFrame()
            # Programs with all classifiers for each one and their metrics
            df_Program_Metrics_AllClassifiers = pd.DataFrame()

            for programName in programs:
                df_Program_Classifiers_Metrics = pd.DataFrame()
                for classifier in possibleClassifiers:
                    fileName = '{}/{}_{}.csv'.format(baseFolder, programName,
                                                     classifier)
                    if util.pathExists(fileName):
                        classificationResult = util.createDataFrameFromCSV(
                            fileName, hasHeader=True)
                        y_correct = classificationResult.loc[:, '_IM_{}'.format(
                            _column)].values
                        y_predicted = classificationResult.loc[:,
                                                               'PREDICTED'].values
                        accuracy, precision, recall, f1, _, _, _, _, _, _ = evaluatingClassification(
                            y_correct, y_predicted)
                        # BUGFIX: the original column list also contained
                        # 'Parameter' (8 names for 7 data values), which makes
                        # the DataFrame constructor raise ValueError.
                        newDataFrame = pd.DataFrame(data=[[
                            programName, _column, classifier, accuracy * 100,
                            precision * 100, recall * 100, f1 * 100
                        ]],
                                                    columns=[
                                                        'ProgramName',
                                                        'Column', 'Classifier',
                                                        'Accuracy',
                                                        'Precision', 'Recall',
                                                        'F1'
                                                    ])
                        df_Program_Classifiers_Metrics = pd.concat([
                            df_Program_Classifiers_Metrics, newDataFrame
                        ])

                # Keep every classifier's metrics, best first
                df_Program_Metrics_AllClassifiers = pd.concat([
                    df_Program_Metrics_AllClassifiers,
                    df_Program_Classifiers_Metrics.sort_values(
                        'F1', ascending=False)
                ])

                # The best classifier for this program is the top F1 row
                df_Program_Classifiers_Metrics = df_Program_Classifiers_Metrics.sort_values(
                    'F1', ascending=False).head(n=1)
                df_Program_Metrics_BestClassifier = pd.concat([
                    df_Program_Metrics_BestClassifier,
                    df_Program_Classifiers_Metrics
                ])
                df_Programs_BestClassifiers = pd.concat([
                    df_Programs_BestClassifiers, df_Program_Classifiers_Metrics
                ])

            # Persist the per-column metrics files (unless already present
            # and overwrite is False)
            fileName = '{}/ML_Metrics.csv'.format(baseFolder)
            fileNameAllClassifiers = '{}/Metrics_AllClassifiers.csv'.format(
                baseFolder)
            if not util.pathExists(fileName) or overwrite:
                util.writeDataFrameInCsvFile(
                    fileName, df_Program_Metrics_BestClassifier)
                util.writeDataFrameInCsvFile(
                    fileNameAllClassifiers, df_Program_Metrics_AllClassifiers)
    else:
        # Read the previously written metrics files back
        for _column in columns:
            baseFolder = '{}/ML/Results/{}/Classification'.format(
                os.getcwd(), _column)
            fileName = '{}/ML_Metrics.csv'.format(baseFolder)
            df_Programs_BestClassifiers = pd.concat([
                df_Programs_BestClassifiers,
                util.createDataFrameFromCSV(fileName, True)
            ])
    return df_Programs_BestClassifiers