def _termDictsFromContext(self, context, symbol):
    """Build the per-function expression vectors and the global n-gram
    counter for `symbol` over a context's origin and neighbours.

    Returns a (vecs, allNgrams) tuple, or None when no expressions were
    found anywhere.  `context.neighbours` is temporarily extended with
    `context.origin` and restored before returning.

    (Removed: unused local `nOcc` and several lines of commented-out
    alternative weighting code.)
    """
    vecs = NameToDictMap()
    allNgrams = OccurrenceCounter()
    # Treat the origin like a neighbour for the duration of the scan.
    context.neighbours.append(context.origin)
    for neighbour in context.neighbours:
        location = neighbour.location
        expressions = self.treeToExprConverter.getExpressionsForSymbol(
            location, symbol)
        neighbour.setExpressions(expressions)
        # add null-vector for function if it does not contain expressions
        if len(expressions) == 0:
            vecs.add(None, location)
        for expr in expressions:
            # binary weighting: each expression contributes 1.0
            vecs.setItem(expr, location, 1.0)
            allNgrams.add(expr)
    # Restore neighbours to its original state (drop the origin again).
    context.neighbours.pop()
    if len(vecs.d) == 0 or len(allNgrams.d) == 0:
        return None
    return (vecs, allNgrams)
class SinkMatrixCreator:
    """Creates term-document matrices for the set of functions that
    call a given sink, based on a pickled function -> sub-tree map."""

    def __init__(self, projectRoot):
        self.projectRoot = projectRoot
        # programDir is projectRoot with its last three path components
        # stripped (trailing slash kept).
        self.programDir = '/'.join(self.projectRoot.split('/')[:-3]) + '/'
        self.sinkUserProvider = SinkUserProvider(self.projectRoot + '../../')

    def createMatrixForSink(self, sinkName):
        """Return a term-document matrix over all callers of sinkName."""
        (unused, callsToSink) = self.sinkUserProvider.getSinkByName(sinkName)
        functionNames = self.uniq(
            ['%s%s' % (self.programDir, c[1]) for c in callsToSink])
        return self.createMatrixForFunctionNames(functionNames)

    def createMatrixForFunctionNames(self, functionNames):
        """Build a term-document matrix for the given function names.

        NOTE: this operation loses TF-IDF weighting (every occurrence is
        re-added with weight 1); the original author doubted this is the
        way to go.
        """
        self._loadFunc2SubtreesMap()
        self.nameToDictMap = NameToDictMap()
        self.allSymbolsDict = OccurrenceCounter()
        nameDictMapToMatrix = NameDictMapToMatrix()
        functions = [(doc, self.func2SubtreesMap.d[doc])
                     for doc in functionNames]
        for (doc, func) in functions:
            for (ngram, nOccurrences) in func.iteritems():
                # register the n-gram once per occurrence
                for unused in xrange(nOccurrences):
                    self.nameToDictMap.add(ngram, doc)
                    self.allSymbolsDict.add(ngram)
        nameDictMapToMatrix.convertFromDicts(self.nameToDictMap,
                                             self.allSymbolsDict)
        return nameDictMapToMatrix.termDocumentMatrix

    def _loadFunc2SubtreesMap(self):
        """Load the pickled function -> sub-tree occurrence map."""
        filename = self.projectRoot + 'func2SubtreesMap.pickl'
        # Fixed: previously used file(filename) without ever closing it,
        # leaking the file handle.
        with open(filename, 'rb') as f:
            self.func2SubtreesMap = pickle.load(f)

    def uniq(self, seq, idfun=None):
        """Return seq with duplicates removed, order preserving.

        idfun maps each item to its identity key (default: the item).
        """
        if idfun is None:
            def idfun(x):
                return x
        seen = {}
        result = []
        for item in seq:
            marker = idfun(item)
            if marker in seen:
                continue
            seen[marker] = 1
            result.append(item)
        return result
class SinkMatrixCreator:
    """Creates term-document matrices for the set of functions that
    call a given sink, based on a pickled function -> sub-tree map."""

    def __init__(self, projectRoot):
        self.projectRoot = projectRoot
        # programDir is projectRoot with its last three path components
        # stripped (trailing slash kept).
        self.programDir = '/'.join(self.projectRoot.split('/')[:-3]) + '/'
        self.sinkUserProvider = SinkUserProvider(self.projectRoot + '../../')

    def createMatrixForSink(self, sinkName):
        """Return a term-document matrix over all callers of sinkName."""
        (unused, callsToSink) = self.sinkUserProvider.getSinkByName(sinkName)
        functionNames = self.uniq([
            '%s%s' % (self.programDir, c[1]) for c in callsToSink])
        return self.createMatrixForFunctionNames(functionNames)

    def createMatrixForFunctionNames(self, functionNames):
        """Build a term-document matrix for the given function names.

        NOTE: this operation loses TF-IDF weighting (every occurrence is
        re-added with weight 1); the original author doubted this is the
        way to go.
        """
        self._loadFunc2SubtreesMap()
        self.nameToDictMap = NameToDictMap()
        self.allSymbolsDict = OccurrenceCounter()
        nameDictMapToMatrix = NameDictMapToMatrix()
        functions = [(doc, self.func2SubtreesMap.d[doc])
                     for doc in functionNames]
        for (doc, func) in functions:
            for (ngram, nOccurrences) in func.iteritems():
                # register the n-gram once per occurrence
                for unused in xrange(nOccurrences):
                    self.nameToDictMap.add(ngram, doc)
                    self.allSymbolsDict.add(ngram)
        nameDictMapToMatrix.convertFromDicts(self.nameToDictMap,
                                             self.allSymbolsDict)
        return nameDictMapToMatrix.termDocumentMatrix

    def _loadFunc2SubtreesMap(self):
        """Load the pickled function -> sub-tree occurrence map."""
        filename = self.projectRoot + 'func2SubtreesMap.pickl'
        # Fixed: previously used file(filename) without ever closing it,
        # leaking the file handle.
        with open(filename, 'rb') as f:
            self.func2SubtreesMap = pickle.load(f)

    def uniq(self, seq, idfun=None):
        """Return seq with duplicates removed, order preserving.

        idfun maps each item to its identity key (default: the item).
        """
        if idfun is None:
            def idfun(x):
                return x
        seen = {}
        result = []
        for item in seq:
            marker = idfun(item)
            if marker in seen:
                continue
            seen[marker] = 1
            result.append(item)
        return result
def calculateCheckVectors(WFuncs, CFuncs, F, binary=True, alpha=1, weighByF = False):
    """Project each function in WFuncs onto the check-space.

    Per symbol s of a function: score is (occurrences - alpha*nChecks)*w
    when s is checked inside the function, occurrences*w when s is
    checked elsewhere only, and 0 when s is never checked at all (not in
    F).  w is F[s] if weighByF, else 1.0; with binary=True both
    occurrence and check counts are clamped to 1.  Functions without any
    checks become explicit null-vectors.
    """
    WDict = NameToDictMap()
    for (functionLocation, symbols) in WFuncs.d.iteritems():
        if functionLocation not in CFuncs.d:
            # No checks at all: projected onto the check-space this
            # function is the NULL-vector.
            WDict.d[functionLocation] = {}
            continue
        CFunc = CFuncs.d[functionLocation]
        for (symbol, nOccurrences) in symbols.iteritems():
            count = 1 if binary else nOccurrences
            if symbol not in F:
                # This symbol is never checked anywhere.
                WDict.setItem(symbol, functionLocation, 0)
                continue
            weight = F[symbol] if weighByF else 1.0
            if symbol in CFunc:
                nChecks = 1 if binary else CFunc[symbol]
                score = (count - alpha * nChecks) * weight
            else:
                score = count * weight
            WDict.setItem(symbol, functionLocation, score)
    return WDict
def relevancyWeighting(checkVectors, featureDir):
    """Re-weight check vectors by subtracting the distance-weighted mean
    check vector of each function's k nearest neighbours.

    Entries that would go negative after subtraction are clamped to 0.
    Returns a NameToDictMap of the re-weighted vectors.

    (Fixed: file handles from file() were never closed; removed unused
    local `gamma`; replaced deprecated has_key.)
    """
    k = 20
    with open(featureDir + 'termDocMatrix.pickl', 'rb') as f:
        termDocMatrix = pickle.load(f)
    functionLocations = termDocMatrix.index2Doc

    # it doesn't make much sense that we use euclidean distances here,
    # should be L1, but I can't calculate L1 on the sparse matrices for now.
    from scipy.spatial.distance import squareform
    with open(featureDir + 'D_euclidean.pickl', 'rb') as f:
        D = squareform(pickle.load(f))

    anomalyCalculator = AnomalyCalculator()
    (NNV, NNI) = anomalyCalculator.calculateNearestNeighbours(k, D)

    WDict = NameToDictMap()
    for i in xrange(len(functionLocations)):
        location = functionLocations[i]
        if location not in checkVectors.d:
            continue
        # NOTE(review): this aliases the input dict; setItem below then
        # appears to mutate checkVectors in place — confirm NameToDictMap
        # semantics if that matters to callers.
        WDict.d[location] = checkVectors.d[location]

        indices = NNI[:, i]
        locations = [functionLocations[j] for j in indices]
        V = [checkVectors.d[l] for l in locations if l in checkVectors.d]
        distances = [NNV[j, i] for j in xrange(len(locations))
                     if locations[j] in checkVectors.d]

        # len(V) may be unequal to k if at least one of the nearest
        # neighbours has no checks. It is then a null-vector, so we're
        # implicitly adding it in mean-calculation
        meanVector = {}
        for (v, d) in zip(V, distances):
            for (name, score) in v.iteritems():
                contribution = (1 - d) * (float(score) / k)
                try:
                    meanVector[name] += contribution
                except KeyError:
                    meanVector[name] = contribution

        for (name, score) in checkVectors.d[location].iteritems():
            if name in meanVector:
                score -= meanVector[name]
                if score < 0:
                    score = 0
            WDict.setItem(name, location, score)
    return WDict
def relevancyWeighting(checkVectors, featureDir):
    """Re-weight check vectors by subtracting the distance-weighted mean
    check vector of each function's k nearest neighbours.

    Entries that would go negative after subtraction are clamped to 0.
    Returns a NameToDictMap of the re-weighted vectors.

    (Fixed: file handles from file() were never closed; removed unused
    local `gamma`; replaced deprecated has_key.)
    """
    k = 20
    with open(featureDir + 'termDocMatrix.pickl', 'rb') as f:
        termDocMatrix = pickle.load(f)
    functionLocations = termDocMatrix.index2Doc

    # it doesn't make much sense that we use euclidean distances here,
    # should be L1, but I can't calculate L1 on the sparse matrices for now.
    from scipy.spatial.distance import squareform
    with open(featureDir + 'D_euclidean.pickl', 'rb') as f:
        D = squareform(pickle.load(f))

    anomalyCalculator = AnomalyCalculator()
    (NNV, NNI) = anomalyCalculator.calculateNearestNeighbours(k, D)

    WDict = NameToDictMap()
    for i in xrange(len(functionLocations)):
        location = functionLocations[i]
        if location not in checkVectors.d:
            continue
        # NOTE(review): this aliases the input dict; setItem below then
        # appears to mutate checkVectors in place — confirm NameToDictMap
        # semantics if that matters to callers.
        WDict.d[location] = checkVectors.d[location]

        indices = NNI[:, i]
        locations = [functionLocations[j] for j in indices]
        V = [checkVectors.d[l] for l in locations if l in checkVectors.d]
        distances = [NNV[j, i] for j in xrange(len(locations))
                     if locations[j] in checkVectors.d]

        # len(V) may be unequal to k if at least one of the nearest
        # neighbours has no checks. It is then a null-vector, so we're
        # implicitly adding it in mean-calculation
        meanVector = {}
        for (v, d) in zip(V, distances):
            for (name, score) in v.iteritems():
                contribution = (1 - d) * (float(score) / k)
                try:
                    meanVector[name] += contribution
                except KeyError:
                    meanVector[name] = contribution

        for (name, score) in checkVectors.d[location].iteritems():
            if name in meanVector:
                score -= meanVector[name]
                if score < 0:
                    score = 0
            WDict.setItem(name, location, score)
    return WDict
class FeatureArray(object):
    """Collects labelled feature vectors plus a global symbol counter.

    Items are stringified before insertion; a label with no items gets
    an explicit null-vector entry.
    """

    def __init__(self):
        self.vecs = NameToDictMap()            # per-label feature vectors
        self.allSymbols = OccurrenceCounter()  # global symbol counts

    def add(self, label, items):
        """Register every item under `label` (null-vector when empty)."""
        if len(items) == 0:
            self.vecs.add(None, label)
            return
        for element in items:
            key = str(element)
            self.vecs.add(key, label)
            self.allSymbols.add(key)

    def __iter__(self):
        """Iterate over (label, vector) pairs."""
        return iter(self.vecs.iteritems())
def createMatrixForFunctionNames(self, functionNames):
    """Build a term-document matrix for the given function names from
    the pickled function -> sub-tree occurrence map."""
    self._loadFunc2SubtreesMap()
    self.nameToDictMap = NameToDictMap()
    self.allSymbolsDict = OccurrenceCounter()
    nameDictMapToMatrix = NameDictMapToMatrix()
    # Resolve all sub-tree maps up front before registering anything.
    functions = [(doc, self.func2SubtreesMap.d[doc])
                 for doc in functionNames]
    for (doc, subtrees) in functions:
        for (ngram, nOccurrences) in subtrees.iteritems():
            # register the n-gram once per occurrence
            for unused in xrange(nOccurrences):
                self.nameToDictMap.add(ngram, doc)
                self.allSymbolsDict.add(ngram)
    nameDictMapToMatrix.convertFromDicts(self.nameToDictMap,
                                         self.allSymbolsDict)
    return nameDictMapToMatrix.termDocumentMatrix
def _termDictsFromContext(self, context, symbol):
    """Build the per-function expression vectors and the global n-gram
    counter for `symbol` over a context's origin and neighbours.

    Returns a (vecs, allNgrams) tuple, or None when no expressions were
    found anywhere.  `context.neighbours` is temporarily extended with
    `context.origin` and restored before returning.

    (Removed: unused local `nOcc` and several lines of commented-out
    alternative weighting code.)
    """
    vecs = NameToDictMap()
    allNgrams = OccurrenceCounter()
    # Treat the origin like a neighbour for the duration of the scan.
    context.neighbours.append(context.origin)
    for neighbour in context.neighbours:
        location = neighbour.location
        expressions = self.treeToExprConverter.getExpressionsForSymbol(
            location, symbol)
        neighbour.setExpressions(expressions)
        # add null-vector for function if it does not contain expressions
        if len(expressions) == 0:
            vecs.add(None, location)
        for expr in expressions:
            # binary weighting: each expression contributes 1.0
            vecs.setItem(expr, location, 1.0)
            allNgrams.add(expr)
    # Restore neighbours to its original state (drop the origin again).
    context.neighbours.pop()
    if len(vecs.d) == 0 or len(allNgrams.d) == 0:
        return None
    return (vecs, allNgrams)
def calculateCheckVectors(WFuncs, CFuncs, F, binary=True, alpha=1, weighByF=False):
    """Project each function in WFuncs onto the check-space.

    Per symbol s of a function: score is (occurrences - alpha*nChecks)*w
    when s is checked inside the function, occurrences*w when s is
    checked elsewhere only, and 0 when s is never checked at all (not in
    F).  w is F[s] if weighByF, else 1.0; with binary=True both
    occurrence and check counts are clamped to 1.  Functions without any
    checks become explicit null-vectors.
    """
    WDict = NameToDictMap()
    for (functionLocation, symbols) in WFuncs.d.iteritems():
        if functionLocation not in CFuncs.d:
            # No checks at all: projected onto the check-space this
            # function is the NULL-vector.
            WDict.d[functionLocation] = {}
            continue
        CFunc = CFuncs.d[functionLocation]
        for (symbol, nOccurrences) in symbols.iteritems():
            count = 1 if binary else nOccurrences
            if symbol not in F:
                # This symbol is never checked anywhere.
                WDict.setItem(symbol, functionLocation, 0)
                continue
            weight = F[symbol] if weighByF else 1.0
            if symbol in CFunc:
                nChecks = 1 if binary else CFunc[symbol]
                score = (count - alpha * nChecks) * weight
            else:
                score = count * weight
            WDict.setItem(symbol, functionLocation, score)
    return WDict
def createMatrixForFunctionNames(self, functionNames):
    """Build a term-document matrix for the given function names from
    the pickled function -> sub-tree occurrence map."""
    self._loadFunc2SubtreesMap()
    self.nameToDictMap = NameToDictMap()
    self.allSymbolsDict = OccurrenceCounter()
    nameDictMapToMatrix = NameDictMapToMatrix()
    # Resolve all sub-tree maps up front before registering anything.
    functions = [(doc, self.func2SubtreesMap.d[doc])
                 for doc in functionNames]
    for (doc, subtrees) in functions:
        for (ngram, nOccurrences) in subtrees.iteritems():
            # register the n-gram once per occurrence
            for unused in xrange(nOccurrences):
                self.nameToDictMap.add(ngram, doc)
                self.allSymbolsDict.add(ngram)
    nameDictMapToMatrix.convertFromDicts(self.nameToDictMap,
                                         self.allSymbolsDict)
    return nameDictMapToMatrix.termDocumentMatrix
def __init__(self):
    # Per-label feature vectors — presumably label -> {symbol: count};
    # confirm against NameToDictMap.
    self.vecs = NameToDictMap()
    # Global occurrence counts over all symbols seen so far.
    self.allSymbols = OccurrenceCounter()