Пример #1
0
def generateExamples(dataset, predClasses):
    '''Given the entire dataset and a predicted label, print TOP_EG number of example pairs.'''
    # Init
    labels = [dataset.dict_Index_Cluster[i] for i in predClasses]
    prettyLabels = [i.replace('\n', ' ') for i in labels]
    examplesList = []
    
    print 'Top-{} Predicted Class-IDs:\n{}'.format(TOP_K, predClasses)
    print 'TOP-{} Predicted Class-Labels (Error-IDs and Unique Repairs):\n{}\nExamples:'.format(TOP_K, prettyLabels)

    # For each label
    numC = 0
    for label in labels:
        dataSamples = dataset.dict_ClusterRaw_DataSamples[label] # Fetch all training examples
        srcTrgtPairs = [(H.joinList(i.src, ''), H.joinList(i.trgt, '')) for i in dataSamples] # Create list of (before, after)
        
        topEgs = H.getTop_K(srcTrgtPairs, TOP_EG) # Sort them based on frequency and fetch Top-EG #examples

        for i in range(len(topEgs)): # For each example
            eg = topEgs[i]
            before, after = eg[0].replace('\n', ''), eg[1].replace('\n', '') # Remove new-line characters
            if before.strip() == '': before = '// Empty Line' # Deal with empty lines
            if after.strip() == '': after = '// Empty Line' # Deal with empty lines

            print 'Eg #{} class-{} before: {}'.format(i+1, predClasses[numC], before) # Print them
            print 'Eg #{} class-{} after : {}'.format(i+1, predClasses[numC], after)
            
            examplesList.append(eg)
        numC += 1

    # Extend to the examplesList, proportionately    
    return examplesList
Пример #2
0
def writeTypeKind():
    path = './data/'
    nameRead = path + 'subset-srcTrgtPairs.csv'
    nameWrite = path + 'TokenKind.csv'
    headers, lines = H.readCSV(nameRead)
    writeH = ['spell', 'kind', 'cursorTypeKind']
    dictSpell = collections.defaultdict(lambda: {})

    count = 0
    for line in lines:
        srcText = line[headers.index('sourceText')]
        codeObj = Code(srcText)
        for token in codeObj.getTokens():
            cToken = CToken(token, codeObj)

            dictSpell[cToken.spell][str(cToken.kind) + '!@#$%' +
                                    str(cToken.cursorType)] = 0

        count += 1
        print count, line[headers.index('sourceID')]

    writeL = [[spell,
               kindType.split('!@#$%')[0],
               kindType.split('!@#$%')[1]] for spell in dictSpell
              for kindType in dictSpell[spell]]
    H.writeCSV(nameWrite, writeH, writeL)
Пример #3
0
def writeAbstractions():
    path = './data/'
    nameRead = path + 'subset-srcTrgtPairs'
    nameWrite = nameRead + '_newAbs'

    headers, lines = H.readCSV(nameRead + '.csv')
    headers += ['NEW_SrcAbs', 'NEW_TrgtAbs']
    writeLines = []

    count = 0
    for line in lines[:10]:
        writeLine = line
        srcText = line[headers.index('sourceText')]
        trgtText = line[headers.index('targetText')]

        for text, hname in zip([srcText, trgtText], ['', '']):
            codeObj = Code(text)
            absLines = getProgAbstraction(codeObj)
            writeLine.append(H.joinLL(absLines))

        count += 1
        print count, line[headers.index('sourceID')]
        writeLines.append(writeLine)

    H.writeCSV(nameWrite + '.csv', headers, writeLines)
Пример #4
0
def writeErrSets(fname):
    '''Invoke this function on a "clean" dataset - a dataset.csv which doesn't contain the ErrSet column.'''
    headers, lines = H.readCSV(fname)
    headers.append("ErrSet")
    dictErrDiff = {} # {CompErr1:ErrSet1, ...}
    allErrs = readAllErrors()
    count = 0
    print 'Total #src-target pairs=',len(lines)

    indexErrClang = headers.index("errorClang")
    indexErrLLVM = headers.index("errorLLVM")
    indexLineNums = headers.index("lineNums_Abs")
    indexDi, indexDd = headers.index("diffAbs_ins"), headers.index("diffAbs_del")

    for line in lines:        
        count += 1
        if count%1000==0:
            print count,'/',len(lines),'done ...'

        diffsI, diffsD = line[indexDi].splitlines(), line[indexDd].splitlines()
        errClang, errLLVM, diffLineNums = line[indexErrClang], line[indexErrLLVM], set(line[indexLineNums].splitlines())
        errClang, errLLVM = errClang.replace('\r', '\n'), errLLVM.replace('\r', '\n')

        errSet, errExpList, compLineNums = getErrSet(allErrs, dictErrDiff, errClang) # Get the err-set (unique rep for set of errors)
        clusterErr(errSet, diffsI, diffsD) # Cluster the diffs (add the diff to dictErrDiff)
        errSet.calcIntersection(compLineNums, diffLineNums) # Update counts to calc precision-recall of compiler lineNums

        line.append(errSet.key)
        
    H.writeCSV(fname, headers, lines)
    writeAllErrs(allErrs)
    writeClusterErr(dictErrDiff)
Пример #5
0
def writeSummary(row):
    '''Append one result row to the summary CSV (fname_summary).

    row -- list of values in the column order below; the groups mirror the way
           recordAccModel assembles its row.
    '''
    runInfo = [
        'time_Recorded', 'train_set', 'predict_set', 'totalNumPairs',
        'Train,Valid,Test', 'numClasses',
    ]
    modelInfo = [
        'modelName', 'modelSummary', 'Pred@1,3,5', 'precision', 'recall',
        'trainTime', 'max_seq_length', 'max_vocab_size',
    ]
    hyperParams = ['TRAIN_MULT_FACTOR', 'EPOCHS', 'EMBEDDING_VECTOR_LENGTH']
    details = [
        'classMapping_RawCluster', 'confusion_matrix',
        'acc', 'loss', 'val_acc', 'val_loss',
    ]
    H.appendCSV(fname_summary, runInfo + modelInfo + hyperParams + details, [row])
Пример #6
0
    def getAbstractLiteral(self):
        '''Add the abstraction of a literal token via self.addAbstract.

        Quoted literals (cursor type CONSTANTARRAY, or a spelling that starts and
        ends with a quote character) are emitted as: opening quote, then either the
        format specifiers found inside, or a <kind>_CHAR / <kind>_STRING
        placeholder, then the closing quote. Nothing is emitted between the quotes
        when they are empty.
        Other literals become <kind>_INT or <kind>_DOUBLE; anything else becomes
        <kind>_<cursorType> and is reported through H.errorLog unless the cursor
        type is INVALID.
        '''
        quotes = ['\'', '"']

        # TypeKind.CONSTANTARRAY, or TypeKind.INT with single quotes (char),
        # or Invalids with double quotes
        flagIsString = False
        if self.cursorType == TypeKind.CONSTANTARRAY:
            flagIsString = True
        elif (len(self.spell) >= 2 and self.spell[0] in quotes
              and self.spell[-1] in quotes):
            flagIsString = True

        if flagIsString:
            self.addAbstract(self.spell[0], self.spell[0])  # Add First Quote

            intermediateStr = self.spell[1:-1]
            # Ignore 0 length LITERAL, to differentiate those cases when nothing
            # exists inside quotes
            if len(intermediateStr) > 0:
                # If String, abstract format spec (%d), special chars,...
                formatSpecs = self.extractFormatSpec(intermediateStr)

                if len(formatSpecs) > 0:
                    # Format specifiers replace the Char/String placeholder.
                    # Explicit loop: map() used only for its side effects does
                    # nothing on interpreters where map is lazy.
                    for formatSpec in formatSpecs:
                        self.addAbstract(formatSpec, formatSpec)
                elif len(intermediateStr) == 1:
                    # Character: exactly one char and no format specifiers
                    self.addAbstract(str(self.kind) + '_CHAR', intermediateStr)
                else:
                    # String: more than one char and no format specifiers
                    self.addAbstract(str(self.kind) + '_STRING', intermediateStr)

            self.addAbstract(self.spell[-1], self.spell[-1])  # Add Last Quote

        elif isInt(self.spell):  # If actually an integer literal
            self.addAbstract(str(self.kind) + '_INT', self.spell)
        elif isFloat(self.spell):  # If actually a float literal
            self.addAbstract(str(self.kind) + "_DOUBLE", self.spell)
        else:
            # Neither String, nor int/float: fall back to the cursorType
            # (can't abstract - mostly Invalid)
            literalType = str(self.kind) + '_' + str(self.cursorType)
            self.addAbstract(literalType, self.spell)
            if self.cursorType != TypeKind.INVALID:
                # Log the "special" type of Literal (unless it is INVALID)
                H.errorLog([['CodeID', self.codeID],
                            ['AbstractToken new literal-type', literalType],
                            ['lineNum', self.lineNum],
                            ['spell', self.spell]])
Пример #7
0
    def setDict_Indices(self):
        '''Once raw cluster/label dicts are created, assign indices to them.

        Keys are visited in the order produced by H.sortDictLen_Rev (per the
        original comment: "inverse length", then "key ascending"). Each key gets
        the current size of its index dict plus one, so an initially empty dict
        is numbered 1, 2, 3, ... The reverse maps (index -> key) are rebuilt at
        the end.
        '''
        def numberKeys(sortedItems, indexByKey):
            # Only the key of each (key, dataSamples) item is needed here
            for key, _samples in sortedItems:
                indexByKey[key] = len(indexByKey) + 1

        numberKeys(H.sortDictLen_Rev(self.dict_ClusterRaw_DataSamples),
                   self.dict_Cluster_Index)
        numberKeys(H.sortDictLen_Rev(self.dict_Label_DataSamples),
                   self.dict_Label_Index)

        self.dict_Index_Cluster = dict(
            (index, cluster) for cluster, index in self.dict_Cluster_Index.items())
        self.dict_Index_Label = dict(
            (index, label) for label, index in self.dict_Label_Index.items())
Пример #8
0
    def setDict_Cluster(self, fname, H_src=None, H_trgt=None):
        '''Load the dataset file and group its rows into DataSamples per raw cluster.

        fname  -- dataset path; read with pandas.read_excel when the name contains
                  '.xlsx', otherwise with H.readCSV.
        H_src  -- optional header (exact spelling) of the column holding the source
                  of each pair; attached to every DataSample when given.
        H_trgt -- optional header (exact spelling) of the target column, likewise.

        The columns named by self.trainSet / self.predictSet and the 'errset'
        column are located case-insensitively. Rows whose predict value is NaN
        are skipped. The result is stored in self.dict_ClusterRaw_DataSamples,
        after which small clusters are filtered and labels/indices are assigned.

        Raises ValueError when one of the required columns is missing.
        '''
        self.dict_ClusterRaw_DataSamples = {}
        if '.xlsx' in fname:
            df = pandas.read_excel(fname, converters={'clusterID': str, 'subClassID': str})
            headers, lines = df.columns.tolist(), df.values
        else:
            headers, lines = H.readCSV(fname)

        # A real list (not map()) so that .index() works on every interpreter
        hList = [header.lower() for header in headers]
        indexTrain = hList.index(self.trainSet.lower())
        indexPredict = hList.index(self.predictSet.lower())
        i_errSet = hList.index('errset')

        # Column positions do not change per row: resolve them once up front
        i_src = headers.index(H_src) if H_src else None
        i_trgt = headers.index(H_trgt) if H_trgt else None

        for l in lines:
            trainRaw, predictRaw = l[indexTrain], l[indexPredict]
            errSet = l[i_errSet]
            # If headers for source-target pairs are given, associate them with the dataSample
            src = l[i_src] if i_src is not None else None
            trgt = l[i_trgt] if i_trgt is not None else None

            # NaN is the only value unequal to itself: skips empty pandas cells
            if predictRaw == predictRaw:
                d = DataSample(str(trainRaw), errSet, str(predictRaw), self.predictSet, src, trgt)
                self.dict_ClusterRaw_DataSamples.setdefault(d.clusterRaw, []).append(d)

        # Once raw cluster/label dicts are created,
        # filter out small clusters, assign labels and indices to rest
        self.setDict_filterSize()
        self.setDict_labels()
        self.setDict_Indices()
Пример #9
0
    def splitTrainTest(self):
        print colored('\tSplitting Train+Test ...', 'magenta')    
        self.num_classes = len(self.dict_ClusterRaw_DataSamples) + 1
        self.num_labels = len(self.dict_Label_DataSamples) 
        print 'NumClasses=', self.num_classes - 1
        print 'NumLabels=', self.num_labels - 1

        for clusterRaw, dataSamples in H.sortDictLen_Rev(self.dict_ClusterRaw_DataSamples):
            clusterIndex = self.dict_Cluster_Index[clusterRaw]
            li = dataSamples
            labelList = li[0].labelList # Pick any dataSamples labelList - would be the same for all similar clusterRaw
            labelIndices = [self.dict_Label_Index[label] for label in labelList]

            numTrain = int(math.ceil(CF.TRAIN_SPLIT * len(li)))
            numValid = int(math.floor(CF.VALIDATION_SPLIT * len(li)))
            numTest =  len(li) - numTrain - numValid
            print 'Class-',clusterIndex, 'NumTrain=', numTrain, 'NumValid=', numValid, 'NumTest=', numTest
            self.X_train_DataSample.extend(li[:numTrain])
            self.X_valid_DataSample.extend(li[numTrain : numTrain+numValid])
            self.X_test_DataSample.extend(li[numTrain + numValid :])

            self.y_train_cluster.extend([clusterIndex] * numTrain)
            self.y_valid_cluster.extend([clusterIndex] * numValid)
            self.y_test_cluster.extend([clusterIndex] * (len(li) - numTrain - numValid))
            
            self.y_train_label.extend([labelIndices] * numTrain)
            self.y_valid_label.extend([labelIndices] * numValid)
            self.y_test_label.extend([labelIndices] * (len(li) - numTrain - numValid))
    
        self.X_train_rawText = [i.rawText for i in self.X_train_DataSample]
        self.X_valid_rawText = [i.rawText for i in self.X_valid_DataSample]
        self.X_test_rawText = [i.rawText for i in self.X_test_DataSample] 
Пример #10
0
    def writeConfMat(self, confMat):
        '''Write the confusion matrix rows to the CSV file CF.fnameConfMat.

        confMat -- iterable of row objects providing getCSV_Acc(), egBefore and
                   getCSV_Conf().

        Each CSV row is the accuracy columns, the example, and then the
        flattened pairs from getCSV_Conf(). Only the first such pair has named
        header columns ('predClass-1', 'predCount-1').
        '''
        headers = [
            'actualClass', '#test-count', 'precision', 'recall', 'egBefore',
            'predClass-1', 'predCount-1'
        ]

        rows = []
        for confRow in confMat:
            row = confRow.getCSV_Acc() + [confRow.egBefore]
            for pair in confRow.getCSV_Conf():
                row.extend(pair)  # Flatten each pair into consecutive cells
            rows.append(row)

        H.writeCSV(CF.fnameConfMat, headers, rows)
Пример #11
0
def recordAccModel(modelName, dataset):
    '''Train and evaluate modelName on dataset, then log the outcome via writeSummary.

    The summary row holds, in order: timestamp, train/predict set names, number
    of pairs, the (train, valid, test) split fractions, number of classes, the
    model name/summary, Pred@K, precision, recall, training time, sequence and
    vocabulary limits, hyper-parameters, the index -> class mapping, the
    confusion matrix string and the rounded training-history metrics.
    '''
    results = trainTest(modelName, dataset)
    strDeepModel, h, trainTime, prec, recall, strConfMat, predAtK = results

    currTime = datetime.datetime.now().ctime()
    numPairs = dataset.getTotalNumPairs()
    testSplit = 1 - CF.TRAIN_SPLIT - CF.VALIDATION_SPLIT
    tvt = tuple(round(frac, 2)
                for frac in (CF.TRAIN_SPLIT, CF.VALIDATION_SPLIT, testSplit))

    # Cluster-level mapping by default, label-level mapping for multi-class datasets
    dict_index, num_classes = dataset.dict_Cluster_Index, dataset.num_classes
    if dataset.multiClass:
        dict_index, num_classes = dataset.dict_Label_Index, dataset.num_labels

    mappingLines = []
    for i, j in H.sortDictVal(dict_index):
        mappingLines.append(str(j) + ' -> ' + str(i))
    classMapStr = '\n'.join(mappingLines)

    # NOTE(review): "- 1" matches num_classes (set to #clusters + 1 in
    # splitTrainTest); num_labels is set without the + 1 there, so the value
    # recorded in multi-class mode may be one too low - confirm.
    row = [
        currTime, trainSet, predictSet, numPairs, tvt, num_classes - 1,
        modelName, strDeepModel, predAtK, prec, recall, trainTime,
        dataset.max_seq_length, dataset.max_vocab_size,
        TRAIN_MULT_FACTOR, EPOCHS, CF.EMBEDDING_VECTOR_LENGTH,
        classMapStr, strConfMat,
    ]
    row += [roundH(h, metric) for metric in ('acc', 'loss', 'val_acc', 'val_loss')]

    writeSummary(row)
Пример #12
0
    def getAbstractIdentifier(self, symbTable):
        '''If Identifier, add the type of the identifier as Abstract token (except for special cases).

        symbTable -- symbol table offering insertToken(spell, cursor) and
                     lookup(spell).

        Directive tokens (#include<>) and spellings listed in
        CF.IncludeIdentifiers (e.g. printf) are kept verbatim. For every other
        identifier the token is inserted into the symbol table and every type
        found by the lookup is added as an abstraction of the spelling. If the
        lookup finds nothing, the clang cursorType is used instead.
        A mismatch between a single symbol-table type and the clang type is
        reported through H.errorLog.
        '''
        if self.flagIsDirective:
            # Directive declaration: add actual spellings (and not invalid-types)
            self.addAbstract(self.spell, self.spell)
        elif self.spell in CF.IncludeIdentifiers:  # Handle specials like printf
            self.addAbstract(self.spell, self.spell)
        else:  # All other cursorTypes
            # Check & add unknown variable/func declaration to Symbol-Table
            symbTable.insertToken(self.spell, self.cursor)
            symbTypes = symbTable.lookup(self.spell)  # Try to fetch type from symbTable

            if len(symbTypes) > 0:
                # Lookup success: self.spell is the concretization of every
                # abstract type found. Explicit loop: map() used only for its
                # side effects does nothing on interpreters where map is lazy.
                for symbType in symbTypes:
                    self.addAbstract(symbType, self.spell)

                # Log error in case SymbTable and Clang differ in claimed Type,
                # unless the clang type is INVALID or FUNCTIONPROTO
                if (len(symbTypes) == 1
                        and self.cursorType != TypeKind.INVALID
                        and self.cursorType != TypeKind.FUNCTIONPROTO):
                    if symbTypes[0] != self.cursorType:
                        H.errorLog(
                            [['CodeID', self.codeID],
                             [
                                 'AbstractToken SymbTab & Clang mismatch type',
                                 str(symbTypes[0]) + ' and ' +
                                 str(self.cursorType)
                             ], ['lineNum', self.lineNum],
                             ['spell', self.spell]])

            else:
                # symbTable doesn't have the type: insert the cursorType
                # (probably INVALID type)
                self.addAbstract(self.cursorType, self.spell)
Пример #13
0
def readAllErrors():
    '''Load the stored error indexing from CF.fnameErrorIDs, if that file can be read.

    The file stems from some previous run (or semester); reusing it keeps the
    indexing stable (per the original note: the most frequent compiler error
    gets index 1).

    Returns a dict {error_message: Error(error_message, index=index)}; the dict
    is empty when reading the file raises IOError.
    Raises ValueError when the 'index' or 'error_message' column is missing.
    '''
    # Only the file read is guarded, so an IOError raised while building the
    # entries can no longer be swallowed and leave a partial index behind.
    try:
        headers, lines = H.readCSV(CF.fnameErrorIDs)
    except IOError:
        return {}

    indexIndex, indexErrExp = headers.index('index'), headers.index('error_message')

    allErrs = {}
    for line in lines:
        index, errExp = line[indexIndex], line[indexErrExp]
        allErrs[errExp] = Error(errExp, index=index)

    return allErrs
Пример #14
0
def getBuggyAbsLine(codeText):
    '''Given codeText, return the abstraction of its erroneous lines and their line numbers.

    Returns (absLinesBuggy, lineNums): two parallel lists with one entry per
    distinct line reported by codeObj.getSevereErrors(), in order of first
    appearance. Reported lines outside 1..len(absLines) are ignored.
    '''
    codeObj = Code(codeText)
    absLines = getProgAbstraction(codeObj)

    absLinesBuggy, lineNums = [], []
    for err in codeObj.getSevereErrors():
        lineNum = err.line  # 1-based line reported for this error
        if not (lineNum > 0 and lineNum <= len(absLines)):
            continue  # Compiler pointed outside the abstracted program

        absLine = H.joinList(absLines[lineNum - 1], ' ')
        if lineNum in lineNums:
            continue  # Each buggy line is reported only once

        absLinesBuggy.append(absLine)
        lineNums.append(lineNum)

    return absLinesBuggy, lineNums
Пример #15
0
 def addRaw_Bigram(self):
     '''Append one "<first><BIGRAM><second>" token to self.rawText per pair from H.pairwise.

     The pairs are taken from the whitespace-split rawText as it is on entry;
     the new tokens are appended space-separated, in pair order.
     '''
     bigramTokens = [first + '<BIGRAM>' + second
                     for first, second in H.pairwise(self.rawText.split())]
     self.rawText += ''.join(' ' + token for token in bigramTokens)
Пример #16
0
def printProgAbstraction(fnamePath):
    codeText = open(fnamePath).read()
    codeObj = Code(codeText)
    absLines = getProgAbstraction(codeObj)
    for line in absLines:
        print H.joinList(line, ' ')
Пример #17
0
 def getCSV_Conf(self):
     '''Return the entries of self.predClasses as a list of (key, value) tuples.

     The order is the one produced by H.sortDictVal(..., reverse=True) -
     presumably descending by value, i.e. most frequently predicted class first.
     '''
     pairs = []
     for predClass, count in H.sortDictVal(self.predClasses, reverse=True):
         pairs.append((predClass, count))
     return pairs
Пример #18
0
    def calcConfMat(self):
        '''Compute Pred@1/3/5 on the test set and fill the confusion matrix from the top-1 predictions.

        For each K in [1, 3, 5] the predictions from self.deepModel.getPrediction(K)
        are compared with self.y_test: an actual class counts as a hit when it is
        among the predicted classes. Pred@K is the percentage of hits, rounded
        to 2 decimals; it is printed and collected.
        Only for K == 1 the entries of self.confMatrix are updated (truePos,
        falseNeg, falsePos, updatePred) and the sorted matrix is written through
        self.writeConfMat.

        Returns (strConf, predAtK): the joined string of the sorted confusion
        matrix rows and the list of the three Pred@K values.
        '''
        print colored('\n\tConfusion Matrix: ...', 'magenta')
        predAtK = []

        for topK in [1, 3, 5]:
            countM, countN = 0, 0  # Hits / misses over all actual classes for this K
            predClasses_Tests = self.deepModel.getPrediction(topK)

            for actClasses_bin, predClasses_bin in zip(self.y_test,
                                                       predClasses_Tests):
                if self.multiClass:
                    # Positions holding a 1 are the active classes.
                    # Add +1 to index since off-by-one with dict_index2class
                    actClasses_indices = [
                        index + 1 for index in range(len(actClasses_bin))
                        if actClasses_bin[index] == 1
                    ]
                    predClasses_indices = [
                        index + 1 for index in range(len(predClasses_bin))
                        if predClasses_bin[index] == 1
                    ]
                else:
                    # Single actual class = position of the maximum; the prediction
                    # entry is used as-is as the collection of predicted indices
                    actClasses_indices, predClasses_indices = [
                        np.argmax(actClasses_bin)
                    ], predClasses_bin

                # Translate indices into class names
                actClasses = [
                    self.dict_index2class[index]
                    for index in actClasses_indices
                ]
                predClasses = [
                    self.dict_index2class[index]
                    for index in predClasses_indices
                ]

                for actClass in actClasses:
                    if actClass in predClasses:  # True-Positive
                        countM += 1
                        if topK == 1:  # Conf Matrix only for Pred@1
                            self.confMatrix[actClass].truePos += 1
                            self.confMatrix[actClass].updatePred(actClass)

                    else:  # False-Negative: Not predicted at all
                        countN += 1
                        if topK == 1:  # Conf Matrix only for Pred@1
                            self.confMatrix[actClass].falseNeg += 1

                            for predClass in predClasses:  # Add all confusion labels
                                self.confMatrix[actClass].updatePred(predClass)

                if topK == 1:  # Conf Matrix only for Pred@1
                    for predClass in predClasses:
                        if predClass not in actClasses:  # False-Positive: Predicted, but falsely
                            self.confMatrix[predClass].falsePos += 1

            if topK == 1:  # Conf Matrix only for Pred@1
                # strConf is assigned here only; K == 1 comes first in the list
                # above, so it is always bound before the return
                sortedConfMat = [
                    self.confMatrix[i] for i in self.getSortedConfMat()[0]
                ]
                strConf = H.joinList(sortedConfMat)
                self.writeConfMat(sortedConfMat)

            # NOTE(review): raises ZeroDivisionError when no actual class was seen
            # (e.g. empty self.y_test), since both counters stay 0
            prec_at_k = round(100 * float(countM) / (countM + countN), 2)
            predAtK.append(prec_at_k)
            print 'Pred@{}= {}'.format(topK, prec_at_k)

        return strConf, predAtK
Пример #19
0
 def __str__(self):
     '''Return the abstract tokens joined by single spaces (via H.joinList).'''
     spaceSeparated = H.joinList(self.abstractTokens, ' ')
     return spaceSeparated