Пример #1
0
def testHypothesis(metric):
    '''
        Run pairwise hypothesis tests between the classifiers, for each target column.

        The MINIMAL and EQUIVALENT data are read from the summary file of the 30 runs.
        For each of them, every unordered pair of classifiers is compared on the
        `metric` column: runWilcoxon is used when one of the two classifiers is 'GNB'
        and runTTestPaired otherwise. Both receive 0.05 as their last argument
        (presumably the significance level).

        metric: name of the column of the summary data to be compared.
    '''
    from itertools import combinations

    summaryFile = '{}/ML/Results/Summary/Summary_BestClassifiers_All30Runs.csv'.format(os.getcwd())
    # NOTE(review): 'F1' is passed here regardless of `metric` - confirm that this is intended
    minimalData, equivalentData = getDataByFile(summaryFile, 'F1')

    # ========================================
    # ===== Hypothesis Testing ===============
    # ========================================
    for model, data in [['MINIMAL', minimalData], ['EQUIVALENT', equivalentData]]:
        banner = (len(model) + 20 + 2) * '='
        print(banner)
        print('===== {} ==============='.format(model))
        print(banner)

        # combinations() yields every pair once, in list order, and works on a copy,
        # so the list returned by getPossibleClassifiers() is never mutated
        classifiers = list(getPossibleClassifiers())

        for classifierA, classifierB in combinations(classifiers, 2):
            print('{} x {}'.format(classifierA, classifierB))
            dataA = data.query('Classifier == \'{}\''.format(classifierA))
            dataB = data.query('Classifier == \'{}\''.format(classifierB))

            if 'GNB' in (classifierA, classifierB):
                runWilcoxon(dataA[metric], dataB[metric], 0.05)
            else:
                runTTestPaired(dataA[metric], dataB[metric], 0.05)
            print('')
        print('')
Пример #2
0
def setUp():
    '''
        Build the sets of values accepted by the experiment.

        Returns a tuple with, in this order: the target columns ('MINIMAL' and
        'EQUIVALENT'), the result of getPossibleClassifiers() and the list with
        util.getPathName() applied to each entry of util.getPrograms().
    '''
    targetColumns = ['MINIMAL', 'EQUIVALENT']
    classifiers = getPossibleClassifiers()

    programNames = []
    for program in util.getPrograms():
        programNames.append(util.getPathName(program))

    return targetColumns, classifiers, programNames
Пример #3
0
def testDataDistribution(metric):
    '''
        Run runShapiroTest on the `metric` column of each classifier, for each target column.

        The MINIMAL and EQUIVALENT data are read from the summary file of the 30 runs;
        for every classifier returned by getPossibleClassifiers() the rows of that
        classifier are selected and their `metric` values are handed to
        runShapiroTest together with 0.05.

        metric: name of the column of the summary data to be tested.
    '''
    summaryFile = '{}/ML/Results/Summary/Summary_BestClassifiers_All30Runs.csv'.format(os.getcwd())
    minimalData, equivalentData = getDataByFile(summaryFile, 'F1')

    # ============================================
    # ===== Test Data distribution ===============
    # ============================================
    dataByModel = (('MINIMAL', minimalData), ('EQUIVALENT', equivalentData))

    for modelName, modelData in dataByModel:
        for classifierName in getPossibleClassifiers():
            print('{} - {}'.format(modelName, classifierName))

            # Only the rows produced by the current classifier
            classifierRows = modelData.query('Classifier == \'{}\''.format(classifierName))
            runShapiroTest(classifierRows[metric], 0.05)

            print('')
        print('')
Пример #4
0
def executeAll(targetColumns,
               classifiers,
               specifiedProgram=None,
               executeWithBestParameter=False):
    '''
        Function used to execute all classifiers in all columns to be sorted

        targetColumns: columns to be classified.
        classifiers: classifiers to be executed; when None, the result of
            getPossibleClassifiers() is used.
        specifiedProgram: forwarded as it is to crossValidation.
        executeWithBestParameter: when True, bestParameter(column, classifier) is
            forwarded to crossValidation as `parameter`; otherwise None is forwarded.
    '''
    # The informed classifiers used to be ignored in favor of getPossibleClassifiers();
    # the function is now only the fallback for callers that do not inform them
    if classifiers is None:
        classifiers = getPossibleClassifiers()

    for column in targetColumns:
        for classifier in classifiers:
            print('Classifier: {} | Column: {}'.format(classifier, column))

            if executeWithBestParameter:
                parameter = bestParameter(column, classifier)
            else:
                parameter = None

            crossValidation(column,
                            classifier,
                            specifiedProgram,
                            parameter=parameter)
Пример #5
0
def experiment():
    '''
        Summarize the "gbs" results in a single CSV file.

        For every target column and classifier (both taken in sorted order), the
        value returned by getMaxF1 is read from the original result file and from
        the result file of each dropable column ("<classifier> - gbs_['<column>'].csv").
        Each value is printed, and one row [targetColumn, classifier, values] per
        pair is written to ML/Results/gbs.csv under the current working directory.
    '''
    baseFolder = '{}/ML/Results'.format(os.getcwd())

    targetColumns = getPossibleTargetColumns()
    targetColumns.sort()

    classifiers = getPossibleClassifiers()
    classifiers.sort()

    dropableColumns = getDropableColumns()
    dropableColumns.sort()

    def readF1(fileName):
        # Value reported by getMaxF1 for the file, as a float
        return float(getMaxF1(fileName))

    summaryRows = []

    for targetColumn in targetColumns:
        for classifier in classifiers:
            # Result obtained without dropping any column
            originalF1 = readF1('{}/{}/{}.csv'.format(baseFolder, targetColumn,
                                                      classifier))
            print('{} - {} - Original: {:.2f}'.format(targetColumn, classifier,
                                                      originalF1))
            scores = [originalF1]

            # Results obtained dropping one column at a time
            for column in dropableColumns:
                droppedF1 = readF1('{}/{}/{} - gbs_[\'{}\'].csv'.format(
                    baseFolder, targetColumn, classifier, column))
                print('{} - {} - {}: {:.2f}'.format(targetColumn, classifier,
                                                    column, droppedF1))
                scores.append(droppedF1)

            summaryRows.append([targetColumn, classifier, scores])

    util.writeInCsvFile('{}/ML/Results/gbs.csv'.format(os.getcwd()),
                        summaryRows)
Пример #6
0
def classify_main(arguments):
    '''
        Function responsible for receiving a mutant dataset and classifying those mutants as minimal, equivalent or traditional.

        arguments: list of command line arguments; position 0 is ignored.
            Recognized options:
                --column <name>     | --allColumns
                --program <name>    | --allPrograms
                --classifier <name> | --bestClassifier | --allClassifiers
                --parameter <int>
                --allParameters

        When the column, the program or the classifier selection is missing, the
        corresponding messages are printed and nothing is executed. An option
        that expects a value but is the last argument is treated as not informed.
    '''
    # Possible parameters
    possibleTargetColumns = getPossibleTargetColumns()
    possibleClassifiers = getPossibleClassifiers()
    possiblePrograms = [
        util.getPathName(program)
        for program in util.getPrograms('{}/Programs'.format(os.getcwd()))
    ]

    # Parameters used for each classifier/column when --allParameters is not informed
    classifiersWithoutParameter = ('SVM', 'LDA', 'LR', 'GNB')
    defaultParameters = {
        ('KNN', 'MINIMAL'): [1],
        ('KNN', 'EQUIVALENT'): [11],
        ('DT', 'MINIMAL'): [15],
        ('DT', 'EQUIVALENT'): [35],
        ('RF', 'MINIMAL'): [5],
        ('RF', 'EQUIVALENT'): [15],
    }

    def valueAfter(position):
        # Value that follows an option, or None when the option is the last argument
        nextPosition = position + 1
        return arguments[nextPosition] if nextPosition < len(arguments) else None

    # Parameters
    targetColumn = None
    allTargetColumns = False
    programToClassify = None
    classifier = None
    # NOTE(review): --parameter is parsed but never used below - confirm whether
    # it should replace the default parameters
    algorithmParameter = None
    executeAllPrograms = False
    executeBestClassifierForProgram = False
    programsBestClassifiers = None
    executeAllClassifiers = False
    executeAllParameters = False

    # Walk through all the arguments
    for iCount in range(1, len(arguments)):
        arg = arguments[iCount]
        if arg == '--column':
            targetColumn = valueAfter(iCount)
        elif arg == '--allColumns':
            allTargetColumns = True
        elif arg == '--program':
            programToClassify = valueAfter(iCount)
        elif arg == '--allPrograms':
            executeAllPrograms = True
        elif arg == '--classifier':
            classifier = valueAfter(iCount)
        elif arg == '--bestClassifier':
            executeBestClassifierForProgram = True
            programsBestClassifiers = analyzes.getBestClassifierForPrograms()
        elif arg == '--allClassifiers':
            executeAllClassifiers = True
        elif arg == '--parameter':
            informedParameter = valueAfter(iCount)
            if informedParameter is not None:
                algorithmParameter = int(informedParameter)
        elif arg == '--allParameters':
            executeAllParameters = True

    withoutProgramMessage = 'Please specify the program correctly. The {program} could be ' + str(
        possiblePrograms)
    withoutColumnMessage = 'Please specify the target column throught --column {targetColumn}. The {targetColumn} could be ' + str(
        possibleTargetColumns)
    withoutClassifierMessage = 'Please specify the classifier to be used throught --classifier {classifier}. The {classifier} could be ' + str(
        possibleClassifiers)
    errorMessage = ''

    if not allTargetColumns and (targetColumn is None
                                 or targetColumn not in possibleTargetColumns):
        errorMessage = '{}{}\n'.format(errorMessage, withoutColumnMessage)

    if programToClassify is None and not executeAllPrograms:
        errorMessage = '{}{}\n'.format(errorMessage, withoutProgramMessage)

    if classifier is None and not executeBestClassifierForProgram and not executeAllClassifiers:
        errorMessage = '{}{}\n'.format(errorMessage, withoutClassifierMessage)

    if len(errorMessage) > 0:
        print(errorMessage)
        return

    if executeAllPrograms:
        programsToBeClassified = possiblePrograms.copy()
    else:
        programsToBeClassified = [programToClassify]

    if allTargetColumns:
        targetColumns = possibleTargetColumns.copy()
    else:
        targetColumns = [targetColumn]

    for column in targetColumns:
        for program in programsToBeClassified:
            if executeBestClassifierForProgram:
                classifier, _ = programsBestClassifiers['{}_{}'.format(
                    program, column)]

            # --allClassifiers takes precedence over a single/best classifier
            if executeAllClassifiers:
                classifiers = possibleClassifiers
            else:
                classifiers = [classifier]

            for _classifier in classifiers:

                if executeAllParameters:
                    parameters = getPossibleParameters(_classifier)
                elif _classifier in classifiersWithoutParameter:
                    parameters = ['']
                else:
                    parameters = defaultParameters.get((_classifier, column))
                    if parameters is None:
                        # Without this guard `parameters` would be unbound, or would
                        # silently keep the value of the previous iteration
                        print(
                            'No default parameter for classifier {} and column {}. Skipping.'
                            .format(_classifier, column))
                        continue

                for parameter in parameters:
                    # Suffix that identifies the classifier and the parameter in the results file
                    complementClassifierName = '_{}'.format(
                        _classifier) if executeAllClassifiers else ''
                    complementClassifierName = '{baseName}{parameter}'.format(
                        baseName=complementClassifierName,
                        parameter='_{}'.format(parameter)
                        if executeAllParameters else '')
                    dataSetFileName = '{}/ML/Dataset/{}/Programs/{}.csv'.format(
                        os.getcwd(), column, program)
                    resultDataSetFileName = '{baseFolder}/ML/Results/{targetColumn}/Classification/{programName}{complement}.csv'.format(
                        baseFolder=os.getcwd(),
                        targetColumn=column,
                        programName=program,
                        complement=complementClassifierName)

                    print(
                        '\nProgram: {} | Column: {} | Classifier: {} | Parameter: {}'
                        .format(program, column, _classifier, parameter))
                    # NOTE(review): an empty parameter (default of SVM, LDA, LR and GNB)
                    # is only reported, never classified - confirm that this is intended
                    if parameter != '':
                        classify(dataSetFileName, resultDataSetFileName,
                                 column, _classifier, parameter, program)
Пример #7
0
def debug_main(arguments):
    '''
        Main function performed at the time of running the experiment

        arguments: list of command line arguments; position 0 is ignored.
            When position 1 is '--all', every classifier is executed for every
            target column through executeAll; when it is '--allPbP',
            executeAllEachProgram is used instead. Otherwise the recognized
            options are:
                --column <name>      target column (required)
                --classifier <name>  classifier (required)
                --program <name>     program to be used (optional)
                --pbp                run the cross validation once per possible program
                --best               use bestParameter(column, classifier) as parameter

        Invalid or missing values are reported with a printed message and nothing
        is executed. An option that expects a value but is the last argument is
        treated as not informed.
    '''
    # Possible parameters
    possibleTargetColumns = getPossibleTargetColumns()
    possibleClassifiers = getPossibleClassifiers()
    possiblePrograms = [
        util.getPathName(program)
        for program in util.getPrograms('{}/Programs'.format(os.getcwd()))
    ]

    def valueAfter(position):
        # Value that follows an option, or None when the option is the last argument
        nextPosition = position + 1
        return arguments[nextPosition] if nextPosition < len(arguments) else None

    # Parameters
    targetColumn = None
    classifier = None
    columnsToDrop = []
    columnsToAdd = []
    program = None
    programByProgram = False
    executeWithBestParameter = False

    # Walk through all the arguments
    for iCount in range(1, len(arguments)):
        arg = arguments[iCount]
        if arg == '--column':
            targetColumn = valueAfter(iCount)
        elif arg == '--classifier':
            classifier = valueAfter(iCount)
        elif arg == '--program':
            program = valueAfter(iCount)
        elif arg == '--pbp':
            programByProgram = True
        elif arg == '--best':
            executeWithBestParameter = True

    if len(arguments) > 1:
        if arguments[1] == '--all':
            # Execute all classifiers with all classifications. The best parameter is
            # resolved by executeAll for each column/classifier, so nothing is passed
            # in the specifiedProgram position (it used to receive `parameter`)
            executeAll(possibleTargetColumns,
                       possibleClassifiers,
                       executeWithBestParameter=executeWithBestParameter)
            return
        elif arguments[1] == '--allPbP':
            # Execute all, but program by program
            executeAllEachProgram(possibleTargetColumns, possibleClassifiers,
                                  possiblePrograms, executeWithBestParameter)
            return

    withoutColumnMessage = 'Please specify the target column throught --column {targetColumn}. The {targetColumn} could be ' + str(
        possibleTargetColumns)
    withoutClassifierMessage = 'Please specify the classifier throught --classifier {classifier}. The {classifier} could be ' + str(
        possibleClassifiers)
    withoutProgramMessage = 'Please specify the program correctly. The {program} could be ' + str(
        possiblePrograms)
    errorMessage = ''
    if targetColumn is None or targetColumn not in possibleTargetColumns:
        errorMessage = '{}{}\n'.format(errorMessage, withoutColumnMessage)

    # An unknown classifier is reported here instead of being silently ignored
    if classifier is None or classifier not in possibleClassifiers:
        errorMessage = '{}{}\n'.format(errorMessage, withoutClassifierMessage)

    if program is not None and program not in possiblePrograms:
        errorMessage = '{}{}\n'.format(errorMessage, withoutProgramMessage)

    if len(errorMessage) > 0:
        print(errorMessage)
        return

    # Set the best parameter if it is necessary. It is done only after the
    # validation, so bestParameter always receives a valid column and classifier
    parameter = bestParameter(targetColumn,
                              classifier) if executeWithBestParameter else None

    # Execute cross validation
    if programByProgram:
        programsToExecute = possiblePrograms
    else:
        programsToExecute = [program]

    for specifiedProgram in programsToExecute:
        crossValidation(targetColumn,
                        classifier,
                        specifiedProgram,
                        columnsToDrop,
                        columnsToAdd,
                        parameter=parameter)
Пример #8
0
def crossValidation(targetColumn,
                    classifier,
                    specifiedProgram=None,
                    columnsToDrop=None,
                    columnsToAdd=None,
                    printResults=False,
                    parameter=None):
    '''
        Prepare the dataset and the results file name, then run crossValidation_main.

        targetColumn: 'MINIMAL' or 'EQUIVALENT' (it must also be one of
            getPossibleTargetColumns()).
        classifier: one of getPossibleClassifiers().
        specifiedProgram: when informed, the dataset of this program is used
            instead of the dataset with all mutants, and the results go to the
            'Programs' folder.
        columnsToDrop: columns forwarded to crossValidation_main; when not empty
            they are appended to the results file name (' - gbs_[...]').
        columnsToAdd: columns forwarded to crossValidation_main; when not empty
            they are appended to the results file name (' - gfs_[...]').
        printResults: not used by this function.
        parameter: forwarded to crossValidation_main; when informed, the results
            file name receives the '_bestParameter' suffix.

        Returns None. Nothing is executed when the classifier or the target
        column is not supported.
    '''
    # None replaces the mutable [] defaults, which are shared between calls
    columnsToDrop = [] if columnsToDrop is None else columnsToDrop
    columnsToAdd = [] if columnsToAdd is None else columnsToAdd

    if classifier not in getPossibleClassifiers(
    ) or targetColumn not in getPossibleTargetColumns():
        return None

    ####################################
    # --- Setting independent properties
    maxNeighbors = 40
    maxSamplesSplit = 100
    maxIterations = maxNeighbors if classifier == 'KNN' else maxSamplesSplit

    ######################
    # --- Setting datasets
    targetColumnName = targetColumn
    targetColumn = '_IM_{}'.format(targetColumn)

    # Verify if a specified program was set to be classified
    if specifiedProgram is not None:
        dataSetFileName = 'ML/Dataset/{}/Programs/{}.csv'.format(
            targetColumnName, specifiedProgram)
    else:
        dataSetFileName = 'ML/Dataset/{}/mutants.csv'.format(targetColumnName)

    if targetColumn == '_IM_MINIMAL':
        #####################
        # --- Setting columns
        columnNames = getColumnNames_lastMinimal()

        print('####################################################')
        print(' ----- Calculando para identificar mutantes minimais')

    elif targetColumn == '_IM_EQUIVALENT':
        #####################
        # --- Setting columns
        columnNames = getColumnNames_lastEquivalent()

        print('########################################################')
        print(' ----- Calculando para identificar mutantes equivalentes')
    else:
        return None

    ###################
    # --- PreProcessing
    dataSet = importDataSet(dataSetFileName)

    ##############################
    # --- Setting results filename
    # (the suffix is not named bestParameter, so the function with that name is not shadowed)
    bestParameterSuffix = '_bestParameter' if parameter is not None else ''
    gbs = ' - gbs_{columns}'.format(
        columns=columnsToDrop) if len(columnsToDrop) > 0 else ''
    # The columns were never formatted here, leaving a literal '{columns}' in the file name
    gfs = ' - gfs_{columns}'.format(
        columns=columnsToAdd) if len(columnsToAdd) > 0 else ''
    if specifiedProgram is None:
        resultsFileName = 'ML/Results/{targetColumnName}/{classifier}{bestParameter}{gbs}{gfs}.csv'.format(
            targetColumnName=targetColumnName,
            classifier=classifier,
            gbs=gbs,
            gfs=gfs,
            bestParameter=bestParameterSuffix)
    else:
        resultsFileName = 'ML/Results/{targetColumnName}/Programs/{specifiedProgram}_{classifier}{bestParameter}.csv'.format(
            targetColumnName=targetColumnName,
            specifiedProgram=specifiedProgram,
            classifier=classifier,
            bestParameter=bestParameterSuffix)

    ##########################################
    # --- Executing classifier | KNN, DT or RF
    print(' ----- {}'.format(classifier))
    crossValidation_main(dataSet,
                         targetColumn,
                         classifier,
                         maxIterations,
                         resultsFileName,
                         columnNames,
                         columnsToDrop,
                         columnsToAdd,
                         parameter=parameter)