Exemplo n.º 1
0
class ModelBuilder():
    '''
    def datagen(self, batchSize, xMaxLen, yMaxLen):
        while self.ObjInd < self.ObjMax:
            end = batchSize
            if end + self.ObjInd > self.ObjMax:
                end = self.ObjMax - self.ObjInd
            x = np.zeros((end, xMaxLen, self.totalDictionaryLength))
            y = np.zeros((end, yMaxLen, self.totalDictionaryLength))
            for j in range(self.ObjInd, self.ObjInd + end):
                valueX = self.X[j]
                noZerosToPad = xMaxLen - len(valueX)
                if noZerosToPad > 0:
                    valueX = self.coder.applyPadding(valueX, noZerosToPad)
                valueY = self.Y[j]
                noZerosToPad = yMaxLen - len(valueY)
                if noZerosToPad > 0:
                    valueY = self.coder.applyPadding(valueY, noZerosToPad)
                self.coder.convertToOneHot(valueX, x[j - self.ObjInd])
                self.coder.convertToOneHot(valueY, y[j - self.ObjInd])
            self.ObjInd += end
            #some code here to load and manipulate data into x and y. Mostly numpy functions
            yield x,y
    '''
    def build(self, checker, startK, startBatch):
        # Initialize coder
        print("Initializing coder...")
        self.dictionary = Dictionary(checker)
        self.coder = Coder(self.dictionary)
        self.totalDictionaryLength = self.dictionary.length(
        )  # + globals.firstAvailableToken

        # Load training data from file
        print("Loading training data...")
        data = []
        with open(config.cfTrainFilenameFormat.format(checker), "r") as f:
            data = f.readlines()
        random.shuffle(data)
        dataLen = len(data)
        print("Done, fetched {0} records".format(dataLen))
        if dataLen < 1:
            print("No data found")
            return

        # Json load
        print("Converting to objects...")
        self.X = []
        self.Y = []
        self.ObjInd = 0
        self.ObjMax = dataLen
        xMaxLen = 0
        yMaxLen = 0
        for record in data:
            obj = json.loads(record[:-1])
            self.X.append(obj['x'])
            self.Y.append(obj['y'])
            if len(obj['x']) > xMaxLen:
                xMaxLen = len(obj['x'])
            if len(obj['y']) > yMaxLen:
                yMaxLen = len(obj['y'])

        # Padding
        print("Counted input and output lengths (X = {0}, Y = {1})...".format(
            xMaxLen, yMaxLen))

        # Preparing model
        print("Preparing model...")
        batchSaveIndex = 0
        batchSaveCounter = 0
        batchSaveThreshold = 10000

        if startK == 0 and startBatch == 0:
            model = Sequential()
            model.add(
                LSTM(config.cfTrainHiddenSize,
                     input_shape=(xMaxLen, self.totalDictionaryLength)))
            model.add(RepeatVector(yMaxLen))
            for _ in range(config.cfTrainNumLayers):
                model.add(LSTM(config.cfTrainHiddenSize,
                               return_sequences=True))
            model.add(TimeDistributed(Dense(self.totalDictionaryLength)))
            model.add(Activation('softmax'))
            model.compile(loss='categorical_crossentropy',
                          optimizer='rmsprop',
                          metrics=['accuracy'])
        else:
            modelFormat = 'checkpoint_epoch_{0}.{1}.h5'.format(
                startK - 1, checker)
            if startBatch > 0:
                batchSaveIndex = int(startBatch / batchSaveThreshold)
                modelFormat = 'checkpoint_epoch_b{2}.{0}.{1}.h5'.format(
                    startK, checker, batchSaveIndex - 1)
            model = load_model(modelFormat)
        '''
        print("Converting data...")
        X_s = np.zeros((dataLen, xMaxLen, self.totalDictionaryLength))
        Y_s = np.zeros((dataLen, yMaxLen, self.totalDictionaryLength))
        for j in range(dataLen):
            valueX = X[j]
            noZerosToPad = xMaxLen - len(valueX)
            if noZerosToPad > 0:
                valueX = self.coder.applyPadding(valueX, noZerosToPad)
            valueY = Y[j]
            noZerosToPad = yMaxLen - len(valueY)
            if noZerosToPad > 0:
                valueY = self.coder.applyPadding(valueY, noZerosToPad)
            self.coder.convertToOneHot(valueX, X_s[j])
            self.coder.convertToOneHot(valueY, Y_s[j])
        '''
        # Training model
        '''
        print("Training...")
        for k in range(startK, config.cfTrainNoEpochs):
            #model.fit(X_s, Y_s, epochs=1)#, validation_split=0.2)
            model.fit(self.datagen(config.cfTrainBatchSize, xMaxLen, yMaxLen), epochs=1, steps_per_epoch=103)#, validation_split=0.2)
            model.save('checkpoint_epoch_{0}.{1}.h5'.format(k, checker))
        '''
        #"""
        print("Training model...")
        for k in range(startK, config.cfTrainNoEpochs):
            i = 0
            model.reset_metrics()
            if k == startK:
                i = startBatch
            while i < dataLen:
                end = i + config.cfTrainBatchSize
                if end > dataLen:
                    end = dataLen
                #'''
                X_s = np.zeros((end - i, xMaxLen, self.totalDictionaryLength))
                Y_s = np.zeros((end - i, yMaxLen, self.totalDictionaryLength))
                for j in range(i, end):
                    valueX = self.X[j]
                    noZerosToPad = xMaxLen - len(
                        valueX)  #int((xMaxLen - len(valueX)) / 2)
                    if noZerosToPad > 0:
                        valueX = self.coder.applyPadding(valueX, noZerosToPad)
                    valueY = self.Y[j]
                    noZerosToPad = yMaxLen - len(
                        valueY)  #int((yMaxLen - len(valueY)) / 2)
                    if noZerosToPad > 0:
                        valueY = self.coder.applyPadding(valueY, noZerosToPad)
                    zerosX = np.zeros((xMaxLen, self.totalDictionaryLength))
                    zerosY = np.zeros((yMaxLen, self.totalDictionaryLength))
                    X_s[j - i] = self.coder.convertToOneHot(valueX, zerosX)
                    Y_s[j - i] = self.coder.convertToOneHot(valueY, zerosY)
                result = model.train_on_batch(X_s, Y_s, reset_metrics=False)
                #'''
                #result = model.train_on_batch(X_s[i:end], Y_s[i:end])
                #del X_s
                #del Y_s
                print(
                    "[{2}] Done batch {0}-{1} (loss: {3:.3f}, accuracy: {4:.3f})"
                    .format(i, end, k, result[0], result[1]))
                i += config.cfTrainBatchSize
                batchSaveCounter += config.cfTrainBatchSize
                if batchSaveCounter >= batchSaveThreshold:
                    batchSaveCounter = 0
                    model.save('checkpoint_epoch_b{2}.{0}.{1}.h5'.format(
                        k, checker, batchSaveIndex))
                    batchSaveIndex += 1
            model.save('checkpoint_epoch_{0}.{1}.h5'.format(k, checker))
            batchSaveIndex = 0
            batchSaveCounter = 0
        #"""
        print("All done, exiting...")
Exemplo n.º 2
0
class Predictor():
    def __init__(self):
        self.vcs = GitProvider(config.getRepoDir())
        self.ccdb = CCDatabase(config.getCcDbFile())
        self.codeChecker = CodeChecker(config.getRepoDir())
        self.checkers = Checkers()
        self.loadCommitList()

    def loadCommitList(self):
        self.commits = self.vcs.getAllVersions(config.getBranch())
        self.currentCommitIndex = 0

    def convertFilePathToRepoRelativePath(self, path):
        return os.path.relpath(path, config.getRepoDir())

    def getDiffResolvedIds(self):
        resolved = self.codeChecker.diffResolved(config.getCcRunName(),
                                                 config.getTmpDir(), self.ccdb)
        ids = []
        for bug in resolved:
            ids.append(bug['reportId'])
        return ids

    def predict(self, id, checker):
        # Load all bugs
        print("Loading bug data...")
        ids = []
        if id == -1:
            bugs = self.ccdb.getAllBugsForChecker(checker)
            ids = [x[0] for x in bugs]
        else:
            ids.append(id)

        # Loading model
        print("Loading model...")
        model = load_model(config.cfModelFilenameFormat.format(checker))
        model.summary()
        vLabels = ['NOT OK', 'OK', 'Skipped']

        # Initialize coder
        print("Initializing coder...")
        self.dictionary = Dictionary(checker)
        self.coder = Coder(self.dictionary)
        self.totalDictionaryLength = self.dictionary.length()

        # Predicting
        print("Starting predictions...")
        for i in ids:
            allData = self.ccdb.getBugData(i)
            if allData.getChecker(
            ) not in globals.availableCheckers or allData.getChecker(
            ) != checker:
                print("Bug #{0} - checker not supported".format(i))
            else:
                # Load extra tokens from checker message
                checkerInfo = self.checkers.extractTokensForChecker(
                    allData.getChecker(), allData.getMessage())
                # Retrieve code fragment with bug
                fileRelativePath = self.convertFilePathToRepoRelativePath(
                    allData.getFile())
                fullCodeWithBug = self.vcs.getFileContents(
                    fileRelativePath, self.commits[self.currentCommitIndex])
                extractor = CodeExtractor(allData)
                extractor.loadCodeFromText(fullCodeWithBug)
                extractor.extractBugCode()
                bugCodeFragment = extractor.getBugCodeFragment()
                fixCodeFragment = ''
                # Encode it
                encodedBugData, initialUnkList = self.coder.encode(
                    bugCodeFragment, checkerData=checkerInfo)
                # Convert to one-hot
                MODEL_X_MAX_LEN = model.get_layer(index=0).input_shape[1]
                if len(encodedBugData) > MODEL_X_MAX_LEN:
                    print(
                        "Bug #{0} - Code too big for model, ignored".format(i))
                    continue
                elif id == -1:
                    print("Bug #{0} - Good to go".format(i))
                    continue
                noZerosToPad = MODEL_X_MAX_LEN - len(encodedBugData)
                if noZerosToPad > 0:
                    encodedBugData = self.coder.applyPadding(
                        encodedBugData, noZerosToPad)
                X = np.zeros((1, MODEL_X_MAX_LEN, self.totalDictionaryLength))
                X[0] = self.coder.convertToOneHot(
                    encodedBugData,
                    np.zeros((MODEL_X_MAX_LEN, self.totalDictionaryLength)))
                # Predict and convert from one-hot
                Y = self.coder.convertFromOneHot(model.predict(X)[0])
                print(Y)
                # Decode
                Y = self.coder.removePadding(Y)
                fixCodeFragment = self.coder.decode(Y, initialUnkList)

                #Verify?
                vStatus = 2
                if config.cfVerifyPrediction:
                    # Apply fix in source code file
                    extractor.applyFix(fixCodeFragment)
                    extractor.saveToFile(allData.getFile())
                    # Run CodeChecker and analyze code
                    self.codeChecker.check(True)
                    resolvedIds = self.getDiffResolvedIds()
                    # Check if ID is resolved in tmp folder
                    isFixed = i in resolvedIds
                    # Set vStatus accordingly
                    if isFixed:
                        vStatus = 1
                    else:
                        vStatus = 0
                #Print
                print("Bug #{0} - summary".format(i))
                print("== Code fragment with bug ==")
                print(bugCodeFragment)
                print("== Suggested fix ==")
                print(fixCodeFragment)
                print("Verification: {0}".format(vLabels[vStatus]))
                a = ' '
                while a != 'y' and a != 'n':
                    a = input("Apply fix? (y/n): ")
                if a == 'y':
                    if not config.cfVerifyPrediction:
                        # Apply fix in source code file
                        extractor.applyFix(fixCodeFragment)
                        extractor.saveToFile(allData.getFile())
                elif config.cfVerifyPrediction:
                    # Revert file contents
                    self.vcs.checkout(self.commits[self.currentCommitIndex])
                print('Done')
        print("All done, exiting...")
Exemplo n.º 3
0
    def main(self):
        # Do analysis
        shutil.rmtree(config.getTmpDir())
        self.codeChecker.check(True)

        # Diff new
        newBugs = self.getDiffNew()

        if len(newBugs) < 1:
            print('No new bugs introduced, commit is accepted!')
            return
        
        print("New bugs found! Count: {0}. Attempting repairs...".format(len(newBugs)))

        # Load models
        models = {}
        for checker in globals.availableCheckers:
            models[checker] = load_model(config.cfModelFilenameFormat.format(checker))

        # Load all content from files having new
        files = set([self.convertFilePathToRepoRelativePath(x.getFile()) for x in newBugs])
        fileContents = {}
        for f in files:
            fn = config.getRepoDir() + f
            with open(fn, 'r') as fh:
                fileContents[f] = ''.join(fh.readlines())

        # For each file sort by bug line desc
        suggestions = []
        validSuggestions = 0
        for f in files:
            bugs = [x for x in newBugs if self.convertFilePathToRepoRelativePath(x.getFile()) == f]
            bugs.sort(key=lambda x: x.getLine(), reverse=True)
            print("=== File: {0} ===".format(f))
            # For each bug get a suggestion and test it
            for b in bugs:
                print("L{0}, Type: {1}".format(b.getLine(), b.getChecker()))
                # Prepare useful data
                dictionary = Dictionary(b.getChecker())
                coder = Coder(dictionary)
                totalDictionaryLength = dictionary.length()
                # Prepare and extract bug fragment
                checkerInfo = self.checkers.extractTokensForChecker(b.getChecker(), b.getMessage())
                extractor = CodeExtractor(b)
                extractor.loadCodeFromText(fileContents[f])
                extractor.extractBugCode()
                bugCodeFragment = extractor.getBugCodeFragment()
                fixCodeFragment = ''
                # Encode it
                encodedBugData, initialUnkList = coder.encode(bugCodeFragment, checkerData = checkerInfo)
                # Convert to one-hot
                MODEL_X_MAX_LEN = models[b.getChecker()].get_layer(index = 0).input_shape[1]

                if len(encodedBugData) > MODEL_X_MAX_LEN:
                    print("Ignored: Code too big for model")
                    continue

                noZerosToPad = MODEL_X_MAX_LEN - len(encodedBugData)
                if noZerosToPad > 0:
                    encodedBugData = coder.applyPadding(encodedBugData, noZerosToPad)
                X = np.zeros((1, MODEL_X_MAX_LEN, totalDictionaryLength))
                X[0] = coder.convertToOneHot(encodedBugData, np.zeros((MODEL_X_MAX_LEN, totalDictionaryLength)))
                # Predict and convert from one-hot
                Y = coder.convertFromOneHot(models[b.getChecker()].predict(X)[0])
                Y = coder.removePadding(Y)
                # Decode
                fixCodeFragment = coder.decode(Y, initialUnkList)[:-1]
                
                #Verify?
                vStatus = 2
                if config.cfVerifyPrediction:
                    # Apply fix in source code file
                    extractor.applyFix(fixCodeFragment)
                    extractor.saveToFile(b.getFile())
                    # Run CodeChecker and analyze code
                    shutil.rmtree(config.getTmpDir())
                    compilationLog = self.codeChecker.check(True)
                    newBugsAfterFix = self.getDiffNew()
                    # Check if ID is resolved in tmp folder
                    isFixed = 'Build failed' not in compilationLog
                    for nb in newBugsAfterFix:
                        if self.isBugDataEqual(b, nb):
                            isFixed = False
                    # Set vStatus accordingly
                    if isFixed:
                        vStatus = 1
                    else:
                        vStatus = 0
                    # Revert file
                    extractor.loadCodeFromText(fileContents[f])
                    extractor.saveToFile(b.getFile())
                if vStatus == 0:
                    print("Verification: Negative, cannot be applied")
                elif vStatus == 1:
                    print("Verification: Positive, can be applied")
                    validSuggestions += 1
                elif vStatus == 2:
                    print("Verification: Skipped")
                    validSuggestions += 1
                sugg = SuggestionData(f, b, bugCodeFragment, fixCodeFragment, vStatus)
                suggestions.append(sugg)
        print("Valid suggestions prepared for {0} / {1} bugs.".format(validSuggestions, len(newBugs)))

        if validSuggestions > 0:
            print("Apply valid suggestions (a), display them (d), ignore them (i) or abort commit (q)?")
            apply = False
            choice = True
            while choice:
                c = sys.stdin.read(1)
                if c == 'a':
                    apply = True
                    choice = False
                    print("Applying fixes...")
                elif c == 'i':
                    choice = False
                    print("Fixes ignored...")
                elif c == 'd':
                    self.displaySuggestions(suggestions)
                    print("Apply valid suggestions (a), ignore them (i) or abort commit (q)?")
                elif c == 'q':
                    print("Aborting commit...")
                    sys.exit(1)
            if apply:
                self.applyValidFixes(suggestions, files)
                print("Fixes applied!")
        if validSuggestions != len(newBugs):
            print("Unable to fix all bugs, continue with commit (c) or abort (q)?")
            choice = True
            while choice:
                c = sys.stdin.read(1)
                if c == 'c':
                    choice = False
                    print("Continuing...")
                elif c == 'q':
                    print("Aborting commit...")
                    sys.exit(1)
        else:
            print("Bugs corrected, commit is good to go!")