def pintoKNN(outDir, projectBasePath, projectName, kf):
    """Cross-validate a 1-nearest-neighbour classifier on CountVectorizer features.

    The data points returned by ``flast.getDataPointsInfo`` are vectorized
    with ``CountVectorizer``, labelled 1 (flaky) or 0 (non-flaky), and then
    evaluated fold by fold with a brute-force Euclidean 1-NN classifier.

    Args:
        outDir: Existing directory used for a temporary pickle file whose
            size is reported as the storage cost; the file is removed again.
        projectBasePath: Forwarded to ``flast.getDataPointsInfo``.
        projectName: Forwarded to ``flast.getDataPointsInfo``.
        kf: Splitter exposing ``split(X, y)`` that yields
            ``(trainIndices, testIndices)`` pairs (presumably a scikit-learn
            cross-validator -- confirm against the caller).

    Returns:
        Tuple ``(avgP, avgR, storage, avgTPrep, avgTPred)``:
        average precision (``"-"`` when no fold produced a defined
        precision), average recall, size in bytes of the pickled data set,
        average preparation time and average per-test prediction time.
        When every fold is skipped, all four averages are ``"-"``.
    """
    v0 = time.perf_counter()
    dataPointsFlaky, dataPointsNonFlaky = flast.getDataPointsInfo(
        projectBasePath, projectName)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky
    countVec = CountVectorizer()
    Z = countVec.fit_transform(dataPoints)
    # Each row is densified separately, so the stacked array is 3-D
    # (one 2-D array per sample); it is flattened per fold below.
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1] * len(dataPointsFlaky) +
                              [0] * len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # storage: a 1-NN model has to keep the whole labelled data set, so its
    # footprint is measured as the size of the pickled (data, labels) pair.
    pickleDumpKNN = os.path.join(outDir, "PintoKNN.pickle")
    try:
        with open(pickleDumpKNN, "wb") as pickleFile:
            pickle.dump((dataPointsList, dataLabelsList), pickleFile)
        storage = os.path.getsize(pickleDumpKNN)
    finally:
        # Never leave the temporary file behind, even if dumping failed.
        if os.path.exists(pickleDumpKNN):
            os.remove(pickleDumpKNN)

    avgP, avgR = 0, 0
    avgTPrep, avgTPred = 0, 0
    successFold, precisionFold = 0, 0
    for (trnIdx, tstIdx) in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[
            tstIdx]

        # A fold without flaky samples on either side cannot be evaluated.
        numFlakyTrain = int(trainLabels.sum())
        numFlakyTest = int(testLabels.sum())
        if numFlakyTrain == 0 or numFlakyTest == 0:
            print("Skipping fold...")
            print(" Flaky Train Tests", numFlakyTrain)
            print(" Flaky Test Tests", numFlakyTest)
            continue

        successFold += 1

        # prepare the data in the right format for kNN:
        # (nSamples, nx, ny) -> (nSamples, nx * ny)
        trainData = trainData.reshape((len(trainData), -1))
        testData = testData.reshape((len(testData), -1))

        # training
        t0 = time.perf_counter()
        classifier = KNeighborsClassifier(algorithm="brute",
                                          metric="euclidean",
                                          weights="uniform",
                                          n_neighbors=1,
                                          n_jobs=1)
        classifier.fit(trainData, trainLabels)
        trainTime = time.perf_counter() - t0

        # testing
        p0 = time.perf_counter()
        predictLabels = classifier.predict(testData)
        testTime = time.perf_counter() - p0

        # Vectorization time is attributed proportionally: the training
        # share goes to preparation, a single sample's share to prediction.
        preparationTime = (vecTime * len(trainData) /
                           len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime /
                                                        len(testData))
        (precision, recall) = flast.computeResults(testLabels, predictLabels)
        print(precision, recall)
        # computeResults reports an undefined precision as "-".
        if precision != "-":
            precisionFold += 1
            avgP += precision
        avgR += recall
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if successFold == 0:
        # Every fold was skipped: report "undefined" instead of dividing
        # by zero, using the same "-" marker already used for precision.
        return ("-", "-", storage, "-", "-")

    if precisionFold == 0:
        avgP = "-"
    else:
        avgP /= precisionFold
    avgR /= successFold
    avgTPrep /= successFold
    avgTPred /= successFold

    return (avgP, avgR, storage, avgTPrep, avgTPred)
Пример #2
0
    for projectName in projectList:
        for threshold in [0.5, 0.95]:
            p0 = time.time()
            print("#" * 80)
            print("TESTING {}".format(projectName.upper()))

            pName = projectName.split("/")[-1]
            outputFile = f"{outDir}/{pName}___t{threshold}.csv"
            with open(outputFile, "w") as fileOut:
                fileOut.write(
                    "fold,testSetSize,numFlakyTrainSet,numNonFlakyTrainSet,numFlakyTestSet,numNonFlakyTestSet,vecTime,trainTime,testTime,avgPredTime,f-measure,precision,recall,accuracy,tp,fp,fn,tn\n"
                )

            # data points vectorization
            v0 = time.perf_counter()
            dataPointsFlaky, dataPointsNonFlaky = flast.getDataPointsInfo(
                projectBasePath, projectName)
            dataPoints = dataPointsFlaky + dataPointsNonFlaky
            Z = flast.flastVectorization(dataPoints,
                                         reduceDim=reduceDim,
                                         dim=dim,
                                         eps=eps)
            dataPointsList = np.array(
                [Z[i].toarray() for i in range(Z.shape[0])])
            dataLabelsList = np.array([1] * len(dataPointsFlaky) +
                                      [0] * len(dataPointsNonFlaky))
            v1 = time.perf_counter()
            vecTime = v1 - v0

            for testSetSize in testSetSizes:
                t0 = time.time()
                print()
def flastKNN(outDir, projectBasePath, projectName, kf, dim, eps, k, sigma,
             params):
    """Cross-validate the FLAST classifier on one project.

    The data points returned by ``flast.getDataPointsInfo`` are vectorized
    with ``flast.flastVectorization``, labelled 1 (flaky) or 0 (non-flaky),
    and evaluated fold by fold through ``flast.flastClassification``.

    Args:
        outDir: Existing directory used for a temporary pickle file whose
            size is reported as the storage cost; the file is removed again.
        projectBasePath: Forwarded to ``flast.getDataPointsInfo``.
        projectName: Forwarded to ``flast.getDataPointsInfo``.
        kf: Splitter exposing ``split(X, y)`` that yields
            ``(trainIndices, testIndices)`` pairs (presumably a scikit-learn
            cross-validator -- confirm against the caller).
        dim: Forwarded to ``flast.flastVectorization`` as ``dim``.
        eps: Forwarded to ``flast.flastVectorization`` as ``eps``.
        k: Forwarded to ``flast.flastClassification``; also part of the
            temporary pickle file name.
        sigma: Forwarded to ``flast.flastClassification``; also part of the
            temporary pickle file name.
        params: Forwarded unchanged to ``flast.flastClassification``.

    Returns:
        Tuple ``(avgP, avgR, storage, avgTPrep, avgTPred)``:
        average precision (``"-"`` when no fold produced a defined
        precision), average recall, size in bytes of the pickled data set,
        average preparation time and average per-test prediction time.
        When every fold is skipped, all four averages are ``"-"``.
    """
    v0 = time.perf_counter()
    dataPointsFlaky, dataPointsNonFlaky = flast.getDataPointsInfo(
        projectBasePath, projectName)
    dataPoints = dataPointsFlaky + dataPointsNonFlaky
    Z = flast.flastVectorization(dataPoints, dim=dim, eps=eps)
    # Each row is densified separately, so the stacked array is 3-D
    # (one 2-D array per sample); it is flattened per fold below.
    dataPointsList = np.array([Z[i].toarray() for i in range(Z.shape[0])])
    dataLabelsList = np.array([1] * len(dataPointsFlaky) +
                              [0] * len(dataPointsNonFlaky))
    vecTime = time.perf_counter() - v0

    # storage: measured as the size of the pickled (data, labels) pair.
    pickleDumpKNN = os.path.join(outDir,
                                 "flast-k{}-sigma{}.pickle".format(k, sigma))
    try:
        with open(pickleDumpKNN, "wb") as pickleFile:
            pickle.dump((dataPointsList, dataLabelsList), pickleFile)
        storage = os.path.getsize(pickleDumpKNN)
    finally:
        # Never leave the temporary file behind, even if dumping failed.
        if os.path.exists(pickleDumpKNN):
            os.remove(pickleDumpKNN)

    avgP, avgR = 0, 0
    avgTPrep, avgTPred = 0, 0
    successFold, precisionFold = 0, 0
    for (trnIdx, tstIdx) in kf.split(dataPointsList, dataLabelsList):
        trainData, testData = dataPointsList[trnIdx], dataPointsList[tstIdx]
        trainLabels, testLabels = dataLabelsList[trnIdx], dataLabelsList[
            tstIdx]

        # A fold without flaky samples on either side cannot be evaluated.
        numFlakyTrain = int(trainLabels.sum())
        numFlakyTest = int(testLabels.sum())
        if numFlakyTrain == 0 or numFlakyTest == 0:
            print("Skipping fold...")
            print(" Flaky Train Tests", numFlakyTrain)
            print(" Flaky Test Tests", numFlakyTest)
            continue

        successFold += 1

        # prepare the data in the right format for kNN:
        # (nSamples, nx, ny) -> (nSamples, nx * ny)
        trainData = trainData.reshape((len(trainData), -1))
        testData = testData.reshape((len(testData), -1))
        trainTime, testTime, predictLabels = flast.flastClassification(
            trainData, trainLabels, testData, sigma, k, params)

        # Vectorization time is attributed proportionally: the training
        # share goes to preparation, a single sample's share to prediction.
        preparationTime = (vecTime * len(trainData) /
                           len(dataPoints)) + trainTime
        predictionTime = (vecTime / len(dataPoints)) + (testTime /
                                                        len(testData))
        (precision, recall) = flast.computeResults(testLabels, predictLabels)
        print(precision, recall)
        # computeResults reports an undefined precision as "-".
        if precision != "-":
            precisionFold += 1
            avgP += precision
        avgR += recall
        avgTPrep += preparationTime
        avgTPred += predictionTime

    if successFold == 0:
        # Every fold was skipped: report "undefined" instead of dividing
        # by zero, using the same "-" marker already used for precision.
        return ("-", "-", storage, "-", "-")

    if precisionFold == 0:
        avgP = "-"
    else:
        avgP /= precisionFold
    avgR /= successFold
    avgTPrep /= successFold
    avgTPred /= successFold

    return (avgP, avgR, storage, avgTPrep, avgTPred)