class ModelBuilder():
    """Trains (or resumes training of) the sequence-to-sequence model of one checker.

    The training file named by ``config.cfTrainFilenameFormat`` holds one JSON
    object per line with an ``'x'`` (input sequence) and a ``'y'`` (target
    sequence) entry. Samples are padded to the longest sequence found in the
    file and one-hot encoded batch by batch, so the fully encoded data set
    never has to be held in memory at once.
    """

    @staticmethod
    def _parseRecords(lines):
        """Decode JSON training records into parallel X / Y lists.

        Args:
            lines: iterable of strings, each holding one JSON object with the
                keys ``'x'`` and ``'y'``. Surrounding whitespace (including a
                missing or present trailing newline) is ignored.

        Returns:
            Tuple ``(xs, ys, xMaxLen, yMaxLen)``: the decoded input and target
            sequences plus the length of the longest one on each side.
        """
        xs = []
        ys = []
        xMaxLen = 0
        yMaxLen = 0
        for record in lines:
            # strip() rather than record[:-1]: the last line of a file does not
            # necessarily end with a newline, and slicing would then cut off
            # the closing brace of the JSON object.
            obj = json.loads(record.strip())
            xs.append(obj['x'])
            ys.append(obj['y'])
            xMaxLen = max(xMaxLen, len(obj['x']))
            yMaxLen = max(yMaxLen, len(obj['y']))
        return xs, ys, xMaxLen, yMaxLen

    def _createModel(self, xMaxLen, yMaxLen):
        """Build and compile a fresh model sized for the given sequence lengths."""
        model = Sequential()
        model.add(
            LSTM(config.cfTrainHiddenSize,
                 input_shape=(xMaxLen, self.totalDictionaryLength)))
        model.add(RepeatVector(yMaxLen))
        for _ in range(config.cfTrainNumLayers):
            model.add(LSTM(config.cfTrainHiddenSize, return_sequences=True))
        model.add(TimeDistributed(Dense(self.totalDictionaryLength)))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])
        return model

    def _encodeSample(self, sample, maxLen):
        """Pad one sequence to ``maxLen`` and convert it to one-hot form."""
        noZerosToPad = maxLen - len(sample)
        if noZerosToPad > 0:
            sample = self.coder.applyPadding(sample, noZerosToPad)
        return self.coder.convertToOneHot(
            sample, np.zeros((maxLen, self.totalDictionaryLength)))

    def _encodeBatch(self, start, end, xMaxLen, yMaxLen):
        """One-hot encode the samples ``start`` (inclusive) to ``end`` (exclusive)."""
        X_s = np.zeros((end - start, xMaxLen, self.totalDictionaryLength))
        Y_s = np.zeros((end - start, yMaxLen, self.totalDictionaryLength))
        for j in range(start, end):
            X_s[j - start] = self._encodeSample(self.X[j], xMaxLen)
            Y_s[j - start] = self._encodeSample(self.Y[j], yMaxLen)
        return X_s, Y_s

    def build(self, checker, startK, startBatch):
        """Train the model for ``checker``, writing checkpoints along the way.

        Args:
            checker: checker name; selects the dictionary, the training file
                and is embedded in every checkpoint file name.
            startK: index of the first epoch to run. ``0`` together with
                ``startBatch == 0`` creates a new model; any other combination
                resumes from a previously saved checkpoint.
            startBatch: sample offset at which the first epoch starts.

        Checkpoints are written as ``checkpoint_epoch_{epoch}.{checker}.h5``
        after every epoch and as
        ``checkpoint_epoch_b{index}.{epoch}.{checker}.h5`` roughly every
        10000 trained samples within an epoch.
        """
        # Initialize coder
        print("Initializing coder...")
        self.dictionary = Dictionary(checker)
        self.coder = Coder(self.dictionary)
        self.totalDictionaryLength = self.dictionary.length()

        # Load training data from file
        print("Loading training data...")
        with open(config.cfTrainFilenameFormat.format(checker), "r") as f:
            # Blank lines carry no record; keeping them would break parsing and
            # desynchronise the record count from the parsed samples.
            data = [line for line in f if line.strip()]
        random.shuffle(data)
        dataLen = len(data)
        print("Done, fetched {0} records".format(dataLen))
        if dataLen < 1:
            print("No data found")
            return

        # Json load
        print("Converting to objects...")
        self.ObjInd = 0
        self.ObjMax = dataLen
        self.X, self.Y, xMaxLen, yMaxLen = self._parseRecords(data)
        print("Counted input and output lengths (X = {0}, Y = {1})...".format(
            xMaxLen, yMaxLen))

        # Preparing model
        print("Preparing model...")
        batchSaveIndex = 0
        batchSaveCounter = 0
        batchSaveThreshold = 10000
        if startK == 0 and startBatch == 0:
            model = self._createModel(xMaxLen, yMaxLen)
        else:
            # Resume from the checkpoint of the previous epoch, or - when
            # starting mid-epoch - from the last intermediate checkpoint.
            modelFormat = 'checkpoint_epoch_{0}.{1}.h5'.format(
                startK - 1, checker)
            if startBatch > 0:
                batchSaveIndex = int(startBatch / batchSaveThreshold)
                modelFormat = 'checkpoint_epoch_b{2}.{0}.{1}.h5'.format(
                    startK, checker, batchSaveIndex - 1)
            model = load_model(modelFormat)

        # Training model
        print("Training model...")
        for k in range(startK, config.cfTrainNoEpochs):
            model.reset_metrics()
            # Only the first epoch of a resumed run starts mid-way.
            i = startBatch if k == startK else 0
            while i < dataLen:
                end = min(i + config.cfTrainBatchSize, dataLen)
                X_s, Y_s = self._encodeBatch(i, end, xMaxLen, yMaxLen)
                result = model.train_on_batch(X_s, Y_s, reset_metrics=False)
                print(
                    "[{2}] Done batch {0}-{1} (loss: {3:.3f}, accuracy: {4:.3f})"
                    .format(i, end, k, result[0], result[1]))
                i += config.cfTrainBatchSize
                batchSaveCounter += config.cfTrainBatchSize
                if batchSaveCounter >= batchSaveThreshold:
                    batchSaveCounter = 0
                    model.save('checkpoint_epoch_b{2}.{0}.{1}.h5'.format(
                        k, checker, batchSaveIndex))
                    batchSaveIndex += 1
            model.save('checkpoint_epoch_{0}.{1}.h5'.format(k, checker))
            batchSaveIndex = 0
            batchSaveCounter = 0
        print("All done, exiting...")
class Predictor():
    """Interactive driver that asks a trained model for fixes to stored bugs."""

    def __init__(self):
        """Create the repository, database and analyzer helpers and load the commits."""
        self.vcs = GitProvider(config.getRepoDir())
        self.ccdb = CCDatabase(config.getCcDbFile())
        self.codeChecker = CodeChecker(config.getRepoDir())
        self.checkers = Checkers()
        self.loadCommitList()

    def loadCommitList(self):
        """Fetch all versions of the configured branch and point at the first one."""
        self.commits = self.vcs.getAllVersions(config.getBranch())
        self.currentCommitIndex = 0

    def convertFilePathToRepoRelativePath(self, path):
        """Return ``path`` expressed relative to the configured repository directory."""
        return os.path.relpath(path, config.getRepoDir())

    def getDiffResolvedIds(self):
        """Return the ``reportId`` of every bug the analyzer diff reports as resolved."""
        resolvedBugs = self.codeChecker.diffResolved(config.getCcRunName(),
                                                     config.getTmpDir(),
                                                     self.ccdb)
        return [bug['reportId'] for bug in resolvedBugs]

    def predict(self, id, checker):
        """Suggest a fix for one bug, or dry-run over every bug of ``checker``.

        Args:
            id: identifier of the bug to fix. ``-1`` selects all bugs stored
                for ``checker``; in that mode each bug is only checked for
                fitting into the model and no prediction is made.
            checker: name of the checker whose model and dictionary are used.
        """
        # Load all bugs
        print("Loading bug data...")
        if id == -1:
            bugIds = [row[0] for row in self.ccdb.getAllBugsForChecker(checker)]
        else:
            bugIds = [id]

        # Loading model
        print("Loading model...")
        model = load_model(config.cfModelFilenameFormat.format(checker))
        model.summary()
        # Indexed by the verification status computed below
        statusLabels = ['NOT OK', 'OK', 'Skipped']

        # Initialize coder
        print("Initializing coder...")
        self.dictionary = Dictionary(checker)
        self.coder = Coder(self.dictionary)
        self.totalDictionaryLength = self.dictionary.length()

        # Predicting
        print("Starting predictions...")
        for bugId in bugIds:
            bugData = self.ccdb.getBugData(bugId)
            bugChecker = bugData.getChecker()
            supported = (bugChecker in globals.availableCheckers
                         and bugChecker == checker)
            if not supported:
                print("Bug #{0} - checker not supported".format(bugId))
                continue

            # Load extra tokens from checker message
            checkerInfo = self.checkers.extractTokensForChecker(
                bugData.getChecker(), bugData.getMessage())

            # Retrieve code fragment with bug
            relativePath = self.convertFilePathToRepoRelativePath(
                bugData.getFile())
            currentCommit = self.commits[self.currentCommitIndex]
            sourceWithBug = self.vcs.getFileContents(relativePath,
                                                     currentCommit)
            extractor = CodeExtractor(bugData)
            extractor.loadCodeFromText(sourceWithBug)
            extractor.extractBugCode()
            bugCodeFragment = extractor.getBugCodeFragment()

            # Encode it
            encodedBug, unknownTokens = self.coder.encode(
                bugCodeFragment, checkerData=checkerInfo)

            # The first layer's input shape fixes the longest accepted sequence
            maxInputLen = model.get_layer(index=0).input_shape[1]
            if len(encodedBug) > maxInputLen:
                print("Bug #{0} - Code too big for model, ignored".format(
                    bugId))
                continue
            if id == -1:
                print("Bug #{0} - Good to go".format(bugId))
                continue

            # Convert to one-hot
            missing = maxInputLen - len(encodedBug)
            if missing > 0:
                encodedBug = self.coder.applyPadding(encodedBug, missing)
            oneHotShape = (maxInputLen, self.totalDictionaryLength)
            X = np.zeros((1,) + oneHotShape)
            X[0] = self.coder.convertToOneHot(encodedBug,
                                              np.zeros(oneHotShape))

            # Predict and convert from one-hot
            Y = self.coder.convertFromOneHot(model.predict(X)[0])
            print(Y)

            # Decode
            Y = self.coder.removePadding(Y)
            fixCodeFragment = self.coder.decode(Y, unknownTokens)

            # Verify? (2 = skipped, 1 = bug resolved, 0 = bug still present)
            vStatus = 2
            if config.cfVerifyPrediction:
                # Apply fix in source code file
                extractor.applyFix(fixCodeFragment)
                extractor.saveToFile(bugData.getFile())
                # Run CodeChecker and see whether this bug is now resolved
                self.codeChecker.check(True)
                vStatus = 1 if bugId in self.getDiffResolvedIds() else 0

            # Print
            print("Bug #{0} - summary".format(bugId))
            print("== Code fragment with bug ==")
            print(bugCodeFragment)
            print("== Suggested fix ==")
            print(fixCodeFragment)
            print("Verification: {0}".format(statusLabels[vStatus]))

            answer = ' '
            while answer not in ('y', 'n'):
                answer = input("Apply fix? (y/n): ")

            if answer == 'y':
                # With verification enabled the fix is already in the file
                if not config.cfVerifyPrediction:
                    # Apply fix in source code file
                    extractor.applyFix(fixCodeFragment)
                    extractor.saveToFile(bugData.getFile())
            elif config.cfVerifyPrediction:
                # Revert file contents
                self.vcs.checkout(self.commits[self.currentCommitIndex])
            print('Done')
        print("All done, exiting...")
def main(self):
    """Analyze the working tree, propose model-generated fixes and ask the user.

    Steps:
      1. Run the analyzer and collect the newly introduced bugs; return
         immediately when there are none.
      2. For every new bug (per file, bottom-most line first) ask the model of
         its checker for a fix and, if ``config.cfVerifyPrediction`` is set,
         verify it by re-running the analyzer, then restore the file.
      3. Let the user apply, display or ignore the valid suggestions, or
         abort the commit.

    Verification status per suggestion: ``0`` = fix rejected, ``1`` = fix
    verified, ``2`` = verification skipped.

    Exits the process with status 1 when the user aborts, or when a choice is
    required but standard input is exhausted.
    """
    # Function-scope import: the file's import block is not visible from here.
    import os

    def resetTmpDir():
        # A missing directory (e.g. on the very first run) is already clean.
        try:
            shutil.rmtree(config.getTmpDir())
        except FileNotFoundError:
            pass

    def readChoice(options):
        # Read single characters until one of `options` is typed. read(1)
        # returns '' at EOF; without this check a closed or non-interactive
        # stdin would make the prompt spin forever.
        while True:
            c = sys.stdin.read(1)
            if c == '':
                print("No input available, aborting commit...")
                sys.exit(1)
            if c in options:
                return c

    # Do analysis
    resetTmpDir()
    self.codeChecker.check(True)

    # Diff new
    newBugs = self.getDiffNew()
    if len(newBugs) < 1:
        print('No new bugs introduced, commit is accepted!')
        return
    print("New bugs found! Count: {0}. Attempting repairs...".format(len(newBugs)))

    # Load models
    models = {}
    for checker in globals.availableCheckers:
        models[checker] = load_model(config.cfModelFilenameFormat.format(checker))

    # Group the new bugs by repo-relative file path (single pass over the bugs)
    bugsByFile = {}
    for bug in newBugs:
        relPath = self.convertFilePathToRepoRelativePath(bug.getFile())
        bugsByFile.setdefault(relPath, []).append(bug)
    files = set(bugsByFile)

    # Load all content from files having new bugs
    fileContents = {}
    for f in files:
        # os.path.join copes with a repo dir with or without trailing separator
        with open(os.path.join(config.getRepoDir(), f), 'r') as fh:
            fileContents[f] = fh.read()

    suggestions = []
    validSuggestions = 0
    for f in files:
        # Descending line order, so bugs lower in the file are handled first
        bugs = sorted(bugsByFile[f], key=lambda x: x.getLine(), reverse=True)
        print("=== File: {0} ===".format(f))
        # For each bug get a suggestion and test it
        for b in bugs:
            print("L{0}, Type: {1}".format(b.getLine(), b.getChecker()))
            if b.getChecker() not in models:
                # No model was loaded for this checker; skip instead of
                # failing with a KeyError further down.
                print("Ignored: checker not supported")
                continue
            model = models[b.getChecker()]

            # Prepare useful data
            dictionary = Dictionary(b.getChecker())
            coder = Coder(dictionary)
            totalDictionaryLength = dictionary.length()

            # Prepare and extract bug fragment
            checkerInfo = self.checkers.extractTokensForChecker(b.getChecker(), b.getMessage())
            extractor = CodeExtractor(b)
            extractor.loadCodeFromText(fileContents[f])
            extractor.extractBugCode()
            bugCodeFragment = extractor.getBugCodeFragment()

            # Encode it
            encodedBugData, initialUnkList = coder.encode(bugCodeFragment, checkerData=checkerInfo)

            # Convert to one-hot; the first layer's input shape fixes the
            # longest sequence the model accepts
            MODEL_X_MAX_LEN = model.get_layer(index=0).input_shape[1]
            if len(encodedBugData) > MODEL_X_MAX_LEN:
                print("Ignored: Code too big for model")
                continue
            noZerosToPad = MODEL_X_MAX_LEN - len(encodedBugData)
            if noZerosToPad > 0:
                encodedBugData = coder.applyPadding(encodedBugData, noZerosToPad)
            X = np.zeros((1, MODEL_X_MAX_LEN, totalDictionaryLength))
            X[0] = coder.convertToOneHot(encodedBugData, np.zeros((MODEL_X_MAX_LEN, totalDictionaryLength)))

            # Predict and convert from one-hot
            Y = coder.convertFromOneHot(model.predict(X)[0])
            Y = coder.removePadding(Y)

            # Decode
            fixCodeFragment = coder.decode(Y, initialUnkList)[:-1]

            # Verify?
            vStatus = 2
            if config.cfVerifyPrediction:
                # Apply fix in source code file
                extractor.applyFix(fixCodeFragment)
                extractor.saveToFile(b.getFile())
                # Run CodeChecker and analyze code
                resetTmpDir()
                compilationLog = self.codeChecker.check(True)
                newBugsAfterFix = self.getDiffNew()
                # Fixed means: the build still works and the bug is gone
                isFixed = 'Build failed' not in compilationLog
                for nb in newBugsAfterFix:
                    if self.isBugDataEqual(b, nb):
                        isFixed = False
                vStatus = 1 if isFixed else 0
                # Revert file
                extractor.loadCodeFromText(fileContents[f])
                extractor.saveToFile(b.getFile())

            if vStatus == 0:
                print("Verification: Negative, cannot be applied")
            elif vStatus == 1:
                print("Verification: Positive, can be applied")
                validSuggestions += 1
            elif vStatus == 2:
                print("Verification: Skipped")
                validSuggestions += 1
            sugg = SuggestionData(f, b, bugCodeFragment, fixCodeFragment, vStatus)
            suggestions.append(sugg)

    print("Valid suggestions prepared for {0} / {1} bugs.".format(validSuggestions, len(newBugs)))
    if validSuggestions > 0:
        print("Apply valid suggestions (a), display them (d), ignore them (i) or abort commit (q)?")
        apply = False
        while True:
            c = readChoice(('a', 'i', 'd', 'q'))
            if c == 'd':
                # Show the suggestions, then ask again
                self.displaySuggestions(suggestions)
                print("Apply valid suggestions (a), ignore them (i) or abort commit (q)?")
                continue
            if c == 'q':
                print("Aborting commit...")
                sys.exit(1)
            if c == 'a':
                apply = True
                print("Applying fixes...")
            else:
                print("Fixes ignored...")
            break
        if apply:
            self.applyValidFixes(suggestions, files)
            print("Fixes applied!")

    # NOTE(review): kept at function level so the user is still asked when no
    # suggestion at all was valid - confirm against the original layout.
    if validSuggestions != len(newBugs):
        print("Unable to fix all bugs, continue with commit (c) or abort (q)?")
        if readChoice(('c', 'q')) == 'q':
            print("Aborting commit...")
            sys.exit(1)
        print("Continuing...")
    else:
        print("Bugs corrected, commit is good to go!")