def getDataByFile(file, metric):
    """Load a classification CSV and split its rows by target column.

    Parameters
    ----------
    file : str
        Path of the CSV file, forwarded to util.createDataFrameFromCSV.
    metric : str
        Metric name; must be one of 'Accuracy', 'Precision', 'Recall', 'F1'.
        (Only validated here — the split itself does not depend on it.)

    Returns
    -------
    (DataFrame, DataFrame)
        Rows whose TargetColumn equals 'MINIMAL' and 'EQUIVALENT', respectively.
    """
    # BUG FIX: the original called sys.exit() with no message, terminating with
    # exit status 0 and silently masking the invalid argument. Exit with an
    # explanatory message (and non-zero status) instead.
    if metric not in ['Accuracy', 'Precision', 'Recall', 'F1']:
        sys.exit("Invalid metric '{}': expected one of Accuracy, Precision, Recall or F1".format(metric))

    data = util.createDataFrameFromCSV(file, True)
    # Wrap once instead of once per query (the original built the frame twice).
    frame = pd.DataFrame(data)
    minimal = frame.query('TargetColumn == \'MINIMAL\'')
    equivalent = frame.query('TargetColumn == \'EQUIVALENT\'')
    return minimal, equivalent
def _sortByColumnAndProgramName(frame):
    # Sort by Column then by case-insensitive program name; the helper column
    # used for the case-insensitive key is temporary and removed afterwards.
    ordered = frame.copy()
    ordered['Program.UPPER'] = ordered['Program'].str.upper()
    ordered = ordered.sort_values(by=['Column', 'Program.UPPER'])
    del ordered['Program.UPPER']
    return ordered


def getBestClassifierForEachProgram(possibleTargetColumns, possiblePrograms, possibleClassifiers, bestProgram_Classifier_Parameters, writeFile = False):
    """Select, per program and target column, the classifier with the best F1 score.

    When ``writeFile`` is True, every '<program>_<classifier>.csv' result file is
    read, the classifier with the highest F1 is chosen, its result file is copied
    to '<program>.csv' and a 'bestClassifierForEachProgram.csv' summary is written
    per target column. Otherwise the previously written summaries are loaded.

    Parameters
    ----------
    possibleTargetColumns : list
        Target columns to process (e.g. MINIMAL and EQUIVALENT).
    possiblePrograms : list
        Program names whose classification results are analyzed.
    possibleClassifiers : list
        Classifier identifiers used in the result file names.
    bestProgram_Classifier_Parameters : DataFrame
        Table with Column/Program/Classifier/Parameter used to look up the
        parameter of the winning classifier.
    writeFile : bool
        True to recompute and (re)write the summary files, False to read them.

    Returns
    -------
    DataFrame
        Columns Column/Program/Classifier/Parameter, sorted by column and
        case-insensitive program name.
    """
    bestProgram_Classifier = pd.DataFrame()

    for targetColumn in possibleTargetColumns:
        baseFolder = '{}/ML/Results/{}/Classification'.format(os.getcwd(), targetColumn)
        fileName = '{}/bestClassifierForEachProgram.csv'.format(baseFolder)

        if writeFile:
            # Walk through every program
            for programName in possiblePrograms:
                program_ClassifierMetrics = pd.DataFrame()

                # Collect the metrics of every classifier available for this program
                for classifier in possibleClassifiers:
                    programFileName = '{}/{}_{}.csv'.format(baseFolder, programName, classifier)
                    if util.pathExists(programFileName):
                        # Fetch the metrics of this program / classifier pair
                        accuracy, precision, recall, f1 = getMLMetricsFromClassificationFile(programFileName, targetColumn, programName)
                        newMutantsMetrics = pd.DataFrame(
                            data=[[programName, accuracy * 100, precision * 100, recall * 100, f1 * 100, classifier]],
                            columns=['ProgramName', 'Accuracy', 'Precision', 'Recall', 'F1', 'Classifier'])
                        # DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
                        program_ClassifierMetrics = pd.concat([program_ClassifierMetrics, newMutantsMetrics])

                # BUG FIX: the original tested `fileExists`, i.e. the existence flag of
                # the LAST classifier file only, so a program whose final classifier
                # file was missing got skipped even when other classifiers had results.
                # Checking the accumulated metrics covers "at least one file existed".
                if not program_ClassifierMetrics.empty:
                    bestClassifier = program_ClassifierMetrics.sort_values('F1', ascending=False).head(n=1)['Classifier'].values[0]
                    bestFile = '{}/{}_{}.csv'.format(baseFolder, programName, bestClassifier)
                    newFile = '{}/{}.csv'.format(baseFolder, programName)

                    # Look up the parameter registered for the winning classifier
                    bestParameter = bestProgram_Classifier_Parameters.query(
                        'Column == \'{}\' and Program == \'{}\' and Classifier == \'{}\''.format(targetColumn, programName, bestClassifier))
                    bestParameter = bestParameter['Parameter'].values[0] if not bestParameter.empty else ''

                    newBestProgram_Classifier = pd.DataFrame(
                        data=[[targetColumn, programName, bestClassifier, bestParameter]],
                        columns=['Column', 'Program', 'Classifier', 'Parameter'])
                    bestProgram_Classifier = pd.concat([bestProgram_Classifier, newBestProgram_Classifier])

                    # Promote the winning classifier's result file to '<program>.csv'
                    copyfile(bestFile, newFile)

            # Write the summary for this target column
            bestProgram_Classifier = _sortByColumnAndProgramName(bestProgram_Classifier)
            util.writeDataFrameInCsvFile(fileName, bestProgram_Classifier.query('Column == \'{}\''.format(targetColumn)))
        else:
            newBestProgram_Classifier = util.createDataFrameFromCSV(fileName, hasHeader=True, columnIndex=0)
            bestProgram_Classifier = pd.concat([bestProgram_Classifier, newBestProgram_Classifier])

    bestProgram_Classifier = _sortByColumnAndProgramName(bestProgram_Classifier)
    return bestProgram_Classifier
def getMLMetricsFromClassificationFile(fileName, targetColumn, programName):
    """Read one classification result file and compute its ML metrics.

    Parameters
    ----------
    fileName : str
        Path of the classification result CSV.
    targetColumn : str
        Target column name; ground truth is read from '_IM_<targetColumn>'.
    programName : str
        Program name (kept for interface compatibility; not used here).

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1) comparing the ground-truth column
        against the 'PREDICTED' column.
    """
    resultFrame = util.createDataFrameFromCSV(fileName, hasHeader=True)
    groundTruth = resultFrame.loc[:, '_IM_{}'.format(targetColumn)].values
    predictions = resultFrame.loc[:, 'PREDICTED'].values
    # evaluatingClassification also returns TPR/FPR and the confusion-matrix
    # counts; only the four headline metrics are needed by the callers.
    metrics = evaluatingClassification(groundTruth, predictions)
    accuracy, precision, recall, f1 = metrics[:4]
    return (accuracy, precision, recall, f1)
def summarizeClassifications(targetColumn, possiblePrograms, df_Programs_BestClassifiers, overwrite=False):
    """Summarize the mutant classification results of all programs.

    For each program, the result file produced by its best classifier is read
    and concatenated into a single summary, which is written to
    'ClassificationSummary.csv' under the target column's Classification folder.

    Parameters
    ----------
    targetColumn : str
        Target column being summarized (e.g. MINIMAL or EQUIVALENT).
    possiblePrograms : list
        Program names to include in the summary.
    df_Programs_BestClassifiers : DataFrame
        Table with at least Column/ProgramName/Classifier, indicating the best
        classifier of each program.
    overwrite : bool
        When True, rebuild the summary even if the file already exists.

    Returns
    -------
    DataFrame
        Concatenation of every best-classifier result file, or the previously
        written summary when it exists and ``overwrite`` is False.
    """
    baseFolder = '{}/ML/Results/{}/Classification'.format(os.getcwd(), targetColumn)
    fileName = 'ClassificationSummary'
    summaryFile = '{}/ML/Results/{}/Classification/{}.csv'.format(os.getcwd(), targetColumn, fileName)

    mutantsData = pd.DataFrame()
    # Keep only the rows that refer to the requested target column
    df_Programs_BestClassifiers = df_Programs_BestClassifiers.query('Column == \'{}\''.format(targetColumn))

    if overwrite or not util.pathExists(summaryFile):
        for programName in possiblePrograms:
            # NOTE(review): .values[0] assumes every program appears in
            # df_Programs_BestClassifiers — confirm against the callers.
            bestClassifier = df_Programs_BestClassifiers.query(
                'ProgramName == \'{}\''.format(programName)).loc[:, 'Classifier'].values[0]
            programFileBestClassifier = '{}/{}_{}.csv'.format(baseFolder, programName, bestClassifier)
            if util.pathExists(programFileBestClassifier):
                classificationResult = util.createDataFrameFromCSV(programFileBestClassifier, hasHeader=True)
                # DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
                mutantsData = pd.concat([mutantsData, classificationResult], ignore_index=True)
        util.writeDataFrameInCsvFile(summaryFile, mutantsData)
        return mutantsData

    # The summary already exists and must not be overwritten: reuse it.
    # (In the original this was a redundant `elif pathExists` — when the first
    # condition is False the file necessarily exists.)
    return util.createDataFrameFromCSV(summaryFile, True)
def _sortMetricsByColumnAndProgram(metricsFrame):
    # Sort by Column then by case-insensitive program name; the helper column
    # used for the case-insensitive key is temporary and removed afterwards.
    ordered = metricsFrame.copy()
    ordered['ProgramName.UPPER'] = ordered['ProgramName'].str.upper()
    ordered = ordered.sort_values(by=['Column', 'ProgramName.UPPER'])
    del ordered['ProgramName.UPPER']
    return ordered


def analyzeClassificationsFromEachProgram(targetColumn, possiblePrograms, bestProgram_Classifier, overwrite=False):
    """Analyze each program's classification results and collect per-program metrics.

    Every file in the target column's Classification folder that matches a known
    program is scored (accuracy/precision/recall/F1, as percentages) using the
    classifier/parameter registered for it, and the result is written to
    'ML_Metrics.csv' (or reloaded from it when it already exists).

    Parameters
    ----------
    targetColumn : str
        Target column being analyzed (e.g. MINIMAL or EQUIVALENT).
    possiblePrograms : list
        Program names accepted when scanning the folder.
    bestProgram_Classifier : DataFrame
        Table with Column/Program/Classifier/Parameter indicating the best
        classifier and parameter of each program.
    overwrite : bool
        When True, recompute the metrics even if the file already exists.

    Returns
    -------
    DataFrame
        Columns ProgramName/Column/Classifier/Parameter/Accuracy/Precision/
        Recall/F1, sorted by column and case-insensitive program name.
    """
    baseFolder = '{}/ML/Results/{}/Classification'.format(os.getcwd(), targetColumn)
    fileName = 'ML_Metrics'
    # TODO: the '{}/Metrics_AllClassifiers.csv'.format(baseFolder) file still
    # needs to be created in this function.
    metricsFile = '{}/ML/Results/{}/Classification/{}.csv'.format(os.getcwd(), targetColumn, fileName)

    mutantsMetrics = pd.DataFrame()
    if overwrite or not util.pathExists(metricsFile):
        for file in util.getFilesInFolder(baseFolder):
            # Derive the program name from the file name (text before the first dot)
            programName = util.getPathName(file)
            programName = programName[:programName.find('.')]
            if programName in possiblePrograms:
                programInfo_ClassifierParameter = bestProgram_Classifier.query(
                    'Column == \'{}\' and Program == \'{}\''.format(targetColumn, programName))
                classifier = programInfo_ClassifierParameter['Classifier'].values[0]
                parameter = programInfo_ClassifierParameter['Parameter'].values[0]

                accuracy, precision, recall, f1 = getMLMetricsFromClassificationFile(file, targetColumn, programName)
                newMutantsMetrics = pd.DataFrame(
                    data=[[programName, targetColumn, classifier, parameter,
                           accuracy * 100, precision * 100, recall * 100, f1 * 100]],
                    columns=['ProgramName', 'Column', 'Classifier', 'Parameter',
                             'Accuracy', 'Precision', 'Recall', 'F1'])
                # DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
                mutantsMetrics = pd.concat([mutantsMetrics, newMutantsMetrics])

        # Sort BEFORE writing so the CSV on disk stays ordered (as the original did)
        mutantsMetrics = _sortMetricsByColumnAndProgram(mutantsMetrics)
        util.writeDataFrameInCsvFile(metricsFile, mutantsMetrics)
    else:
        # File already exists: reuse it (the original's redundant `elif
        # pathExists` is equivalent to a plain else here).
        mutantsMetrics = util.createDataFrameFromCSV(metricsFile, True)
        mutantsMetrics = _sortMetricsByColumnAndProgram(mutantsMetrics)

    return mutantsMetrics
def analyzeResults(possibleTargetColumns, possibleClassifiers, overwriteFullFile=False):
    """ Analyze each of 30 run for each classifier with each target column and calculate the best (calculating the mean) metric for each ones (classifier and target column).

    Parameters
    ----------
    possibleTargetColumns (list)
        Array containing all possible columns that can be sorted (MINIMAL and EQUIVALENT)
    possibleClassifiers (list)
        Array containing all possible classifiers that can be used (KNN, DT, RF, SVM, LDA, LR and GNB)
    overwriteFullFile (bool)
        Boolean indicating whether the file 'Summary_All30Runs' should be overwritten.

    Returns
    -------
    (DataFrame, DataFrame, DataFrame)
        experimentResults - Dataframe containing all values of all predictive models and 30 executions.
            Corresponds to the file 'Summary_All30Runs.csv'
        bestParameterResults - Dataframe containing all values of the predictive models that was the best and 30 executions.
            Corresponds to the file 'Summary_BestClassifiers_All30Runs.csv'
        classifiersBestParameter - Dataframe containing the average values of the classifiers with the best results.
            Corresponds to the 'Summary_Classifiers.csv' file
    """
    # Dataframe with all execution results
    experimentResults = pd.DataFrame()
    # Dataframe with the indication of the best parameter for each classifier
    summaryClassifiersBestParameter = pd.DataFrame()

    # Base folder
    baseResultsFolderName = '{}/ML/Results'.format(os.getcwd())
    # Full file with all 30 runs of every model
    fullFileName = '{}/Summary/Summary_All30Runs.csv'.format(baseResultsFolderName)
    # Full file restricted to the best classifiers
    fullBestClassifiersFileName = '{}/Summary/Summary_BestClassifiers_All30Runs.csv'.format(baseResultsFolderName)
    # Summary file (one row per classifier, best parameter only)
    summaryFileName = '{}/Summary/Summary_Classifiers.csv'.format(baseResultsFolderName)
    # Summary of the custom-parameter runs
    summaryCustomParametersFileName = '{}/Summary/Summary_CustomParameters.csv'.format(baseResultsFolderName)

    if overwriteFullFile or not util.pathExists(fullFileName):
        # If you are going to overwrite the existing file or if it does not
        # exist, do all the reading and write a new file.
        # Cycles through all columns (Minimal and equivalent)
        for targetColumn in possibleTargetColumns:
            # Cycle through all classifiers
            for classifier in possibleClassifiers:
                # Dataframe containing the result of the 30 executions of each program
                classifierResults = pd.DataFrame()
                # Cycles through all executions (from 1 to 30)
                for iCount in range(1, 31, 1):
                    # Fetch execution results
                    classifierRunResultFileName = '{}/{}_{}/{}.csv'.format(
                        baseResultsFolderName, targetColumn, iCount, classifier)
                    classifierRunResult = util.createDataFrameFromCSV(
                        classifierRunResultFileName, hasHeader=True, separator=';', initialLine=5)
                    # Concatenates the result of this execution to the other
                    # executions that in the end will be 30.
                    # NOTE(review): DataFrame.append was removed in pandas 2.0 —
                    # this whole function needs migrating to pd.concat.
                    classifierResults = classifierResults.append(classifierRunResult)
                    # Insert information related to that execution.
                    # NOTE(review): pd.DataFrame(classifierRunResult) wraps the
                    # same data in a new frame; whether these .insert calls are
                    # visible on classifierRunResult (and hence reach
                    # experimentResults below) depends on pandas' copy
                    # semantics — confirm TargetColumn/Classifier/Run really
                    # appear in 'Summary_All30Runs.csv'.
                    pd.DataFrame(classifierRunResult).insert(0, 'TargetColumn', targetColumn, True)
                    pd.DataFrame(classifierRunResult).insert(1, 'Classifier', classifier, True)
                    pd.DataFrame(classifierRunResult).insert(2, 'Run', iCount, True)
                    # Concatenates this execution to all others
                    experimentResults = experimentResults.append(classifierRunResult)

                # Calculates the best parameter for that classifier
                parameters = classifierResults['SampleSplit'].unique()
                parameterMetrics = pd.DataFrame()
                for parameter in parameters:
                    # Search only the results of that parameter
                    resultsFromThisParameter = classifierResults.query(
                        'SampleSplit == \'{}\''.format(parameter))
                    # Collect the average of the metrics of the 30 executions
                    meanAccuracy = np.mean(resultsFromThisParameter['Accuracy'])
                    meanPrecision = np.mean(resultsFromThisParameter['Precision'])
                    meanRecall = np.mean(resultsFromThisParameter['Recall'])
                    meanF1 = np.mean(resultsFromThisParameter['F1'])
                    stdF1 = np.std(resultsFromThisParameter['F1'])
                    parameterMetrics = parameterMetrics.append(
                        pd.DataFrame(data=[[
                            targetColumn, classifier, parameter, meanAccuracy,
                            meanPrecision, meanRecall, meanF1, stdF1
                        ]], columns=[
                            'TargetColumn', 'Classifier', 'Parameter', 'Accuracy',
                            'Precision', 'Recall', 'F1', 'StdDevF1'
                        ]))
                # Keep only the parameter with the highest mean F1
                bestParameter = parameterMetrics.sort_values(
                    by=['F1'], ascending=False).head(n=1)
                summaryClassifiersBestParameter = summaryClassifiersBestParameter.append(
                    bestParameter)

        # Write a file with all the results
        util.writeDataFrameInCsvFile(fullFileName, experimentResults, index=False)
        # Write a file with only the best parameters for each classifier
        util.writeDataFrameInCsvFile(summaryFileName, summaryClassifiersBestParameter, index=False)
        # Write a file with all the results but only for the best parameters
        # (exclude the non-best executions)
        bestParameterResults = getRunsOnlyFromBestParameter(
            experimentResults, summaryClassifiersBestParameter, possibleTargetColumns)
        util.writeDataFrameInCsvFile(fullBestClassifiersFileName, bestParameterResults, index=False)
    else:
        # Search for existing files instead of recomputing everything
        experimentResults = util.createDataFrameFromCSV(fullFileName, True, ',')
        summaryClassifiersBestParameter = util.createDataFrameFromCSV(summaryFileName, True, ',')
        bestParameterResults = util.createDataFrameFromCSV(fullBestClassifiersFileName, True, ',')

    # Get the results from custom parameters
    customParameterResults = summarizeRunsFromCustomParameter(
        getRunsFromCustomParameters(experimentResults))
    util.writeDataFrameInCsvFile(summaryCustomParametersFileName, customParameterResults, index=False)

    return experimentResults, bestParameterResults, summaryClassifiersBestParameter
def getBestClassifierForPrograms(program=None, targetColumn=None, write=True, overwrite=False):
    """Analyze the results of all programs with all classifiers.

    For each target column and program, scores every classifier's result file
    and keeps the one with the highest F1. When reading everything (no program
    or column filter and ``write`` is True), 'ML_Metrics.csv' and
    'Metrics_AllClassifiers.csv' are written per target column; otherwise the
    previously written 'ML_Metrics.csv' files are loaded.

    Parameters
    ----------
    program : str or None
        Restrict the analysis to one program; None analyzes all programs.
    targetColumn : str or None
        Restrict the analysis to one target column; None analyzes all columns.
    write : bool
        Allow writing the metric files (only effective with no filters).
    overwrite : bool
        When True, rewrite the metric files even if they already exist.

    Returns
    -------
    DataFrame
        Best classifier per program with its metrics (percentages), columns
        ProgramName/Column/Classifier/Parameter/Accuracy/Precision/Recall/F1.
    """
    # SetUp
    possibleTargetColumns, possibleClassifiers, possiblePrograms = setUp()

    # Best classifier for each program - ProgramName, Column, Classifier, Parameter, metrics
    df_Programs_BestClassifiers = pd.DataFrame()

    # Resolve the optional filters
    programs = [program] if program is not None else possiblePrograms
    columns = [targetColumn] if targetColumn is not None else possibleTargetColumns

    # Files are only (re)written when every program and column is analyzed
    mustWrite = write and program is None and targetColumn is None

    if mustWrite:
        for _column in columns:
            baseFolder = '{}/ML/Results/{}/Classification'.format(os.getcwd(), _column)

            # Programs with the best classifier of each one and its metrics
            df_Program_Metrics_BestClassifier = pd.DataFrame()
            # Programs with ALL classifiers of each one and their metrics
            df_Program_Metrics_AllClassifiers = pd.DataFrame()

            for programName in programs:
                df_Program_Classifiers_Metrics = pd.DataFrame()
                for classifier in possibleClassifiers:
                    fileName = '{}/{}_{}.csv'.format(baseFolder, programName, classifier)
                    if util.pathExists(fileName):
                        classificationResult = util.createDataFrameFromCSV(fileName, hasHeader=True)
                        y_correct = classificationResult.loc[:, '_IM_{}'.format(_column)].values
                        y_predicted = classificationResult.loc[:, 'PREDICTED'].values
                        accuracy, precision, recall, f1, _, _, _, _, _, _ = evaluatingClassification(
                            y_correct, y_predicted)
                        # BUG FIX: the original supplied 7 values for 8 columns
                        # (the 'Parameter' value was missing), which raises a
                        # ValueError in pandas. An empty placeholder keeps the
                        # schema aligned with analyzeClassificationsFromEachProgram.
                        newDataFrame = pd.DataFrame(
                            data=[[programName, _column, classifier, '',
                                   accuracy * 100, precision * 100, recall * 100, f1 * 100]],
                            columns=['ProgramName', 'Column', 'Classifier', 'Parameter',
                                     'Accuracy', 'Precision', 'Recall', 'F1'])
                        # DataFrame.append was removed in pandas 2.0; use pd.concat.
                        df_Program_Classifiers_Metrics = pd.concat(
                            [df_Program_Classifiers_Metrics, newDataFrame])

                # Guard: sorting an empty frame by 'F1' would raise a KeyError
                # when no result file exists for this program.
                if not df_Program_Classifiers_Metrics.empty:
                    df_Program_Metrics_AllClassifiers = pd.concat([
                        df_Program_Metrics_AllClassifiers,
                        df_Program_Classifiers_Metrics.sort_values('F1', ascending=False)])
                    # Keep only the best classifier for this program
                    df_Program_Classifiers_Metrics = df_Program_Classifiers_Metrics.sort_values(
                        'F1', ascending=False).head(n=1)
                    df_Program_Metrics_BestClassifier = pd.concat(
                        [df_Program_Metrics_BestClassifier, df_Program_Classifiers_Metrics])
                    df_Programs_BestClassifiers = pd.concat(
                        [df_Programs_BestClassifiers, df_Program_Classifiers_Metrics])

            # Save the dataframes (the original re-tested mustWrite here, which
            # is always True inside this branch).
            fileName = '{}/ML_Metrics.csv'.format(baseFolder)
            fileNameAllClassifiers = '{}/Metrics_AllClassifiers.csv'.format(baseFolder)
            if not util.pathExists(fileName) or overwrite:
                util.writeDataFrameInCsvFile(fileName, df_Program_Metrics_BestClassifier)
                util.writeDataFrameInCsvFile(fileNameAllClassifiers, df_Program_Metrics_AllClassifiers)
    else:
        # Load the previously computed metrics instead of recomputing them
        for _column in columns:
            baseFolder = '{}/ML/Results/{}/Classification'.format(os.getcwd(), _column)
            fileName = '{}/ML_Metrics.csv'.format(baseFolder)
            df_Programs_BestClassifiers = pd.concat(
                [df_Programs_BestClassifiers, util.createDataFrameFromCSV(fileName, True)])

    return df_Programs_BestClassifiers