def testDatabaseStoreAndClean(self):
    """Exercise CFDatabase store/query/clean against a throwaway SQLite file."""
    tmpPath = config.getRepoDir() + "/testtmp.sqlite"
    tmpFile = Path(tmpPath)
    db = CFDatabase(tmpPath)
    # A freshly created database must hold no fix data.
    self.assertEqual(len(db.getAllFixData()), 0)
    # One stored record is visible globally and only under its own checker.
    db.store('', '', 'a')
    self.assertEqual(len(db.getAllFixData()), 1)
    self.assertEqual(len(db.getFixDataForChecker('a')), 1)
    self.assertEqual(len(db.getFixDataForChecker('b')), 0)
    # clean() empties the database again.
    db.clean()
    self.assertEqual(len(db.getAllFixData()), 0)
    # Drop the handle before removing the backing file.
    del db
    tmpFile.unlink()
class DictionaryBuilder():
    """Builds the token-label dictionary for a single checker.

    Reads every (bug, fix) record for the checker from the fix database,
    tokenizes both snippets plus the tokens extracted from the checker's
    message, selects label candidates, and writes the uniqued label list
    to the checker's dictionary file as JSON.
    """

    def __init__(self):
        self.db = CFDatabase(config.getCfDbFile())  # fix-data storage
        self.lexer = CxxLexer()                     # C++ tokenizer
        self.checkers = Checkers()                  # per-checker helpers

    def build(self, checker):
        """Build and write the label dictionary for `checker`.

        Side effects: writes config.cfDictFilenameFormat.format(checker)
        and prints progress to stdout (per-record progress to stderr).
        """
        # Load all data from DB
        print("Fetching data from database...")
        allData = self.db.getFixDataForChecker(checker)
        allDataLen = len(allData)
        print("Done, fetched {0} records".format(allDataLen))
        if allDataLen < 1:
            print("No data found")
            return

        # Tokenize all code snippets and extract extra tokens from checker's messages
        # Labelize all tokens existing only in fixed code (added data)
        # Labelize all tokens appearing more than X times
        # Labelize all C++ STL names (namespaces, constants, defines, variables, functions, headers, numeric literals)
        # Labelize all UNK token indexes
        print("Converting to tokens...")
        tokens = deque()  # per-record lists of tokens introduced by the fix
        tokensLen = 0     # total tokens counted (original assigned this twice; duplicate removed)
        labels = {}       # token value -> occurrence count
        # Sentinels; always overwritten because allDataLen >= 1 here.
        minTokens1Len = 9999
        minTokens2Len = 9999
        maxTokens1Len = 0
        maxTokens2Len = 0
        # Occurrence count per token ID. IDs 0/349/351 are pre-seeded to 1 so
        # T_ZERO, T_SOS and T_UNK always end up in the label DB.
        uniqTokenIDs = {tid: 0 for tid in range(globals.firstAvailableToken)}
        uniqTokenIDs[0] = 1    # T_ZERO
        uniqTokenIDs[349] = 1  # T_SOS
        uniqTokenIDs[351] = 1  # T_UNK
        for i in range(allDataLen):
            # Tokenize bug snippet, fix snippet and checker-message extras
            tokens1 = self.lexer.tokenize(allData[i][1])
            tokens2 = self.lexer.tokenize(allData[i][2])
            extra = self.checkers.extractTokensForChecker(
                checker, allData[i][4])

            # Extract tokens present only in the fixed code
            newTokens = []
            for token2 in tokens2:
                matchFound = False
                for token1 in tokens1:
                    if token1['token'] == token2['token'] and token1[
                            'has_value'] == token2['has_value']:
                        # Valueless tokens match by kind alone; valued ones
                        # must also agree on the value.
                        if not token1['has_value'] or token1[
                                'value'] == token2['value']:
                            matchFound = True
                            break  # first match suffices; original kept scanning
                if not matchFound:
                    newTokens.append(token2)

            tokens1Len = len(tokens1)
            tokens2Len = len(tokens2)

            # Statistics
            minTokens1Len = min(minTokens1Len, tokens1Len)
            minTokens2Len = min(minTokens2Len, tokens2Len)
            maxTokens1Len = max(maxTokens1Len, tokens1Len)
            maxTokens2Len = max(maxTokens2Len, tokens2Len)

            # Count occurrences of each label
            allTokens = tokens1 + tokens2 + extra
            for token in allTokens:
                value = token['value'] if token[
                    'has_value'] else globals.emptyValue
                labels[value] = labels.get(value, 0) + 1
                # NOTE(review): this raises KeyError for token IDs outside the
                # pre-initialized range — presumably the lexer never emits
                # such IDs; confirm against CxxLexer.
                uniqTokenIDs[int(token['token'])] += 1
                tokensLen += 1
            if len(newTokens) > 0:
                tokens.append(newTokens)
            print('Done {0}, processed {1} tokens ({2}/{3}/{4}/{5})'.format(
                i + 1, len(allTokens), tokens1Len, tokens2Len, len(extra),
                len(newTokens)), file=sys.stderr)
        print("Done, converted {0} tokens".format(tokensLen))

        # Labelizing
        labelDb = [globals.emptyValue]

        # UNK
        print("Adding UNK token labels")
        for i in range(config.cfNoOfUnkTokens):
            labelDb.append("UNK_{0}".format(i))
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # Common occurrences
        print("Filtering labels, selecting only those with > {0} occurrences".
              format(config.cfLabelThreshold))
        for key, count in labels.items():
            if count > config.cfLabelThreshold:
                labelDb.append(key)
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # New tokens in fixed code
        print("Filtering labels, selecting only tokens introduced with fix")
        for entry in tokens:
            for token in entry:
                if token['has_value']:
                    labelDb.append(token['value'])
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # STL part
        # Token IDs
        for i in range(globals.firstAvailableToken):
            if uniqTokenIDs[i] > 0:
                labelDb.append("T_{0}".format(i))

        # Printout
        print("Uniqueing labels")
        labelsUnique = list(set(labelDb))
        print("Done, current label DB has {0} entries".format(
            len(labelsUnique)))
        print("Data set info")
        print("Min no of tokens (bug): {0}".format(minTokens1Len))
        print("Min no of tokens (fix): {0}".format(minTokens2Len))
        print("Max no of tokens (bug): {0}".format(maxTokens1Len))
        print("Max no of tokens (fix): {0}".format(maxTokens2Len))
        print("Extracted labels:")
        print(labelsUnique)
        print("Token uses:")
        for i in range(globals.firstAvailableToken):
            if uniqTokenIDs[i] > 0:
                print("{0}: {1}".format(i, uniqTokenIDs[i]))

        # Save to file
        print("Writing to dictionary file")
        with open(config.cfDictFilenameFormat.format(checker), "w") as f:
            f.write(json.dumps(labelsUnique))
        print("Done, exiting...")
class LearningDataBuilder():
    """Builds encoded (bug, fix) training pairs for a single checker."""

    def __init__(self):
        self.db = CFDatabase(config.getCfDbFile())  # fix-data storage
        self.checkers = Checkers()                  # per-checker helpers

    def build(self, checker):
        """Encode every fix record for `checker` into the training file.

        Each accepted record is written as one JSON line {'x': bug, 'y': fix};
        records with unparsed tokens or over-length sequences are skipped.
        """
        # Initialize coder
        print("Initializing coder...")
        self.dictionary = Dictionary(checker)
        self.coder = Coder(self.dictionary)

        # Load all data from DB
        print("Fetching data from database...")
        allData = self.db.getFixDataForChecker(checker)
        allDataLen = len(allData)
        print("Done, fetched {0} records".format(allDataLen))
        if allDataLen < 1:
            print("No data found")
            return

        # Encode all data
        print("Encoding all data and writing to output file...")
        (maxBug, maxFix,
         maxUnk) = self.checkers.getModelStatsForChecker(checker)
        with open(config.cfTrainFilenameFormat.format(checker), 'w') as f:
            for i, record in enumerate(allData):
                checkerInfo = self.checkers.extractTokensForChecker(
                    checker, record[4])
                encodedBugData, initialUnkList = self.coder.encode(
                    record[1], checkerData=checkerInfo)
                encodedFixData, finalUnkList = self.coder.encode(
                    record[2], unkList=initialUnkList, reverse=False)
                # All four messages share the same format arguments.
                stats = (i + 1, len(finalUnkList), len(encodedBugData),
                         len(encodedFixData))
                if -1 in encodedBugData:
                    print(
                        "{0}: [{2} - {3} ({1})] Some tokens were not parsed (bug), ignoring (lenUnk = {1})"
                        .format(*stats))
                elif -1 in encodedFixData:
                    print(
                        "{0}: [{2} - {3} ({1})] Some tokens were not parsed (fix), ignoring (lenUnk = {1})"
                        .format(*stats))
                elif len(encodedBugData) > maxBug or len(
                        encodedFixData) > maxFix or len(finalUnkList) > maxUnk:
                    print(
                        "{0}: [{2} - {3} ({1})] Some tokens were not parsed (lengths), ignoring (lenUnk = {1})"
                        .format(*stats))
                else:
                    print("{0}: [{2} - {3} ({1})] Done (lenUnk = {1})".format(
                        *stats))
                    f.write(
                        json.dumps({
                            'x': encodedBugData,
                            'y': encodedFixData
                        }) + '\n')
                print('Done {0}'.format(i + 1), file=sys.stderr)
        print("All done, exiting...")