Example #1
def testDatabaseStoreAndClean(self):
    # Create a fresh temporary database file inside the repository directory
    dbPath = config.getRepoDir() + "/testtmp.sqlite"
    dbFile = Path(dbPath)
    db = CFDatabase(dbPath)
    # A new database starts empty; a stored record is queryable by checker
    self.assertEqual(len(db.getAllFixData()), 0)
    db.store('', '', 'a')
    self.assertEqual(len(db.getAllFixData()), 1)
    self.assertEqual(len(db.getFixDataForChecker('a')), 1)
    self.assertEqual(len(db.getFixDataForChecker('b')), 0)
    # clean() drops all stored fix data
    db.clean()
    self.assertEqual(len(db.getAllFixData()), 0)
    # Release the handle and remove the temporary file
    del db
    dbFile.unlink()
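For context, a minimal sketch of the scaffolding this test method assumes; the test-case class name is a placeholder, and config and CFDatabase are project-local modules whose import paths are omitted:

import unittest
from pathlib import Path  # used by the test above

class TestCFDatabase(unittest.TestCase):  # hypothetical class name
    # testDatabaseStoreAndClean from above would be defined here
    pass

if __name__ == "__main__":
    unittest.main()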
import json
import sys
from collections import deque

# config, globals, CFDatabase, CxxLexer and Checkers are project-local
# modules; their import paths are omitted here.

class DictionaryBuilder:
    def __init__(self):
        self.db = CFDatabase(config.getCfDbFile())
        self.lexer = CxxLexer()
        self.checkers = Checkers()

    def build(self, checker):
        # Load all data from DB
        print("Fetching data from database...")
        allData = self.db.getFixDataForChecker(checker)
        allDataLen = len(allData)
        print("Done, fetched {0} records".format(allDataLen))
        if allDataLen < 1:
            print("No data found")
            return

        # Tokenize all code snippets and extract extra tokens from checker's messages
        # Labelize all tokens existing only in fixed code (added data)
        # Labelize all tokens appearing more than X times
        # Labelize all C++ STL names (namespaces, constants, defines, variables, functions, headers, numeric literals)
        # Labelize all UNK token indexes
        print("Converting to tokens...")
        tokens = deque()
        tokensLen = 0
        labels = {}
        i = 0

        # Track min/max snippet lengths for the statistics printout below
        minTokens1Len = 9999
        minTokens2Len = 9999
        maxTokens1Len = 0
        maxTokens2Len = 0

        # Usage counter for every known token ID; special tokens always count as used
        uniqTokenIDs = {}
        for tid in range(globals.firstAvailableToken):
            uniqTokenIDs[tid] = 0
        uniqTokenIDs[0] = 1  # T_ZERO
        uniqTokenIDs[349] = 1  # T_SOS
        uniqTokenIDs[351] = 1  # T_UNK

        while i < allDataLen:
            # Tokenize bug code, fixed code and the checker's message
            tokens1 = self.lexer.tokenize(allData[i][1])
            tokens2 = self.lexer.tokenize(allData[i][2])
            extra = self.checkers.extractTokensForChecker(
                checker, allData[i][4])
            newTokens = []

            # Collect tokens that appear only in the fixed code (added data)
            for token2 in tokens2:
                matchFound = False
                for token1 in tokens1:
                    if (token1['token'] == token2['token']
                            and token1['has_value'] == token2['has_value']):
                        if token1['has_value']:
                            if token1['value'] == token2['value']:
                                matchFound = True
                        else:
                            matchFound = True
                    if matchFound:
                        break
                if not matchFound:
                    newTokens.append(token2)
            tokens1Len = len(tokens1)
            tokens2Len = len(tokens2)

            # Statistics
            if tokens1Len < minTokens1Len:
                minTokens1Len = tokens1Len
            if tokens2Len < minTokens2Len:
                minTokens2Len = tokens2Len
            if tokens1Len > maxTokens1Len:
                maxTokens1Len = tokens1Len
            if tokens2Len > maxTokens2Len:
                maxTokens2Len = tokens2Len

            # Count occurrences of each label value and each token ID
            allTokens = tokens1 + tokens2 + extra
            for token in allTokens:
                value = globals.emptyValue
                if token['has_value']:
                    value = token['value']
                if value in labels:
                    labels[value] += 1
                else:
                    labels[value] = 1
                uniqTokenIDs[int(token['token'])] += 1
                tokensLen += 1
            if len(newTokens) > 0:
                tokens.append(newTokens)
            i += 1
            print('Done {0}, processed {1} tokens ({2}/{3}/{4}/{5})'.format(
                i, len(allTokens), tokens1Len, tokens2Len, len(extra),
                len(newTokens)),
                  file=sys.stderr)
        print("Done, converted {0} tokens".format(tokensLen))

        # Labelizing
        labelDb = [globals.emptyValue]
        # UNK
        print("Adding UNK token labels")
        for i in range(config.cfNoOfUnkTokens):
            labelDb.append("UNK_{0}".format(i))
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # Common occurrences
        print("Filtering labels, selecting only those with > {0} occurrences".
              format(config.cfLabelThreshold))
        for key, count in labels.items():
            if count > config.cfLabelThreshold:
                labelDb.append(key)
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # New tokens in fixed code
        print("Filtering labels, selecting only tokens introduced with fix")
        for entry in tokens:
            for token in entry:
                if token['has_value']:
                    labelDb.append(token['value'])
        print("Done, current label DB has {0} entries".format(len(labelDb)))

        # STL part (labelizing STL names is not implemented here)

        # Token IDs: label every token ID that occurred at least once
        for i in range(globals.firstAvailableToken):
            if uniqTokenIDs[i] > 0:
                labelDb.append("T_{0}".format(i))

        # Printout
        print("Deduplicating labels")
        labelsUnique = list(set(labelDb))  # note: set() does not preserve order
        print("Done, current label DB has {0} entries".format(
            len(labelsUnique)))
        print("Data set info")
        print("Min no of tokens (bug): {0}".format(minTokens1Len))
        print("Min no of tokens (fix): {0}".format(minTokens2Len))
        print("Max no of tokens (bug): {0}".format(maxTokens1Len))
        print("Max no of tokens (fix): {0}".format(maxTokens2Len))
        print("Extracted labels:")
        print(labelsUnique)
        print("Token uses:")
        for i in range(globals.firstAvailableToken):
            if uniqTokenIDs[i] > 0:
                print("{0}: {1}".format(i, uniqTokenIDs[i]))

        # Save to file
        print("Writing to dictionary file")
        with open(config.cfDictFilenameFormat.format(checker), "w") as f:
            f.write(json.dumps(labelsUnique))
        print("Done, exiting...")
Example #3
import json
import sys

# config, CFDatabase, Checkers, Dictionary and Coder are project-local
# modules; their import paths are omitted here.

class LearningDataBuilder:
    def __init__(self):
        self.db = CFDatabase(config.getCfDbFile())
        self.checkers = Checkers()

    def build(self, checker):
        # Initialize coder
        print("Initializing coder...")
        self.dictionary = Dictionary(checker)
        self.coder = Coder(self.dictionary)

        # Load all data from DB
        print("Fetching data from database...")
        allData = self.db.getFixDataForChecker(checker)
        allDataLen = len(allData)
        print("Done, fetched {0} records".format(allDataLen))
        if allDataLen < 1:
            print("No data found")
            return

        # Encode all data
        print("Encoding all data and writing to output file...")
        i = 0
        # Per-checker limits: max encoded bug/fix lengths and max UNK count
        (maxBug, maxFix,
         maxUnk) = self.checkers.getModelStatsForChecker(checker)
        with open(config.cfTrainFilenameFormat.format(checker), 'w') as f:
            while i < allDataLen:
                checkerInfo = self.checkers.extractTokensForChecker(
                    checker, allData[i][4])
                # Encode the bug snippet, then the fix; the fix encoding
                # reuses the UNK list built while encoding the bug, so both
                # sides share the same UNK indexes
                encodedBugData, initialUnkList = self.coder.encode(
                    allData[i][1], checkerData=checkerInfo)
                encodedFixData, finalUnkList = self.coder.encode(
                    allData[i][2], unkList=initialUnkList, reverse=False)
                # -1 marks a token the coder could not encode; skip the record
                if -1 in encodedBugData:
                    print(
                        "{0}: [{2} - {3} ({1})] Some tokens were not parsed (bug), ignoring (lenUnk = {1})"
                        .format(i + 1, len(finalUnkList), len(encodedBugData),
                                len(encodedFixData)))
                elif -1 in encodedFixData:
                    print(
                        "{0}: [{2} - {3} ({1})] Some tokens were not parsed (fix), ignoring (lenUnk = {1})"
                        .format(i + 1, len(finalUnkList), len(encodedBugData),
                                len(encodedFixData)))
                # Skip records that exceed the per-checker model limits
                elif len(encodedBugData) > maxBug or len(
                        encodedFixData) > maxFix or len(finalUnkList) > maxUnk:
                    print(
                        "{0}: [{2} - {3} ({1})] Some tokens were not parsed (lengths), ignoring (lenUnk = {1})"
                        .format(i + 1, len(finalUnkList), len(encodedBugData),
                                len(encodedFixData)))
                else:
                    print("{0}: [{2} - {3} ({1})] Done (lenUnk = {1})".format(
                        i + 1, len(finalUnkList), len(encodedBugData),
                        len(encodedFixData)))
                    f.write(
                        json.dumps({
                            'x': encodedBugData,
                            'y': encodedFixData
                        }) + '\n')

                i += 1
                print('Done {0}'.format(i), file=sys.stderr)

        print("All done, exiting...")