def _getFunctionNamesFromSink(self, sinkName):
    """Return the de-duplicated, projectRoot-prefixed names of callers of a sink.

    The sink is looked up by name through a SinkUserProvider built on
    self.projectRoot. The lookup result is unpacked as a pair whose second
    member is the collection of calls to the sink; for every call, element
    [1] is appended to self.projectRoot and the resulting list is passed
    through self.unique().
    """
    provider = SinkUserProvider(self.projectRoot)
    # The first member of the pair is not needed here.
    _, calls = provider.getSinkByName(sinkName)

    prefixedNames = []
    for call in calls:
        # presumably call[1] is the function's location relative to the
        # project root -- confirm against SinkUserProvider
        prefixedNames.append(self.projectRoot + call[1])
    return self.unique(prefixedNames)
class SinkMatrixCreator:
    """Create term-document matrices for the functions that call a sink.

    The creator reads the pickled 'func2SubtreesMap.pickl' found under
    projectRoot and converts the n-gram counts of the selected functions
    into a matrix by way of NameToDictMap, OccurrenceCounter and
    NameDictMapToMatrix.
    """

    def __init__(self, projectRoot):
        """Remember projectRoot and derive the paths that depend on it.

        programDir is projectRoot with its last three '/'-separated
        components removed, followed by '/' (e.g. 'a/b/c/d/' -> 'a/b/').
        The SinkUserProvider is rooted two directories above projectRoot.
        """
        self.projectRoot = projectRoot
        self.programDir = '/'.join(self.projectRoot.split('/')[:-3]) + '/'
        self.sinkUserProvider = SinkUserProvider(self.projectRoot + '../../')

    def createMatrixForSink(self, sinkName):
        """Return the term-document matrix of all functions calling sinkName.

        Element [1] of every call to the sink is prefixed with programDir
        to form the function name; duplicates are dropped (first
        occurrence wins) before the matrix is built.
        """
        (unused, callsToSink) = self.sinkUserProvider.getSinkByName(sinkName)
        functionNames = self.uniq(
            ['%s%s' % (self.programDir, c[1]) for c in callsToSink])
        return self.createMatrixForFunctionNames(functionNames)

    # Original author's note: this operation loses TF-IDF.
    # I don't think that's the way to go.
    def createMatrixForFunctionNames(self, functionNames):
        """Build a term-document matrix restricted to functionNames.

        Reloads func2SubtreesMap from disk on every call and resets
        self.nameToDictMap and self.allSymbolsDict.

        Raises KeyError if a name is missing from func2SubtreesMap.d.
        """
        self._loadFunc2SubtreesMap()

        self.nameToDictMap = NameToDictMap()
        self.allSymbolsDict = OccurrenceCounter()
        nameDictMapToMatrix = NameDictMapToMatrix()

        # Resolve every name up front so that an unknown function name
        # fails before anything has been added to the maps.
        functions = [(doc, self.func2SubtreesMap.d[doc])
                     for doc in functionNames]

        for (doc, func) in functions:
            for (ngram, nOccurrences) in func.iteritems():
                # Replay each n-gram once per occurrence.
                # NOTE(review): both add() calls are taken to belong to the
                # innermost loop -- confirm against the upstream file.
                for unused in xrange(nOccurrences):
                    self.nameToDictMap.add(ngram, doc)
                    self.allSymbolsDict.add(ngram)

        nameDictMapToMatrix.convertFromDicts(self.nameToDictMap,
                                             self.allSymbolsDict)
        return nameDictMapToMatrix.termDocumentMatrix

    def _loadFunc2SubtreesMap(self):
        """Unpickle <projectRoot>func2SubtreesMap.pickl into self.func2SubtreesMap.

        The file is opened in binary mode and closed deterministically.
        """
        filename = self.projectRoot + 'func2SubtreesMap.pickl'
        # SECURITY: pickle can execute arbitrary code while loading; only
        # point projectRoot at data produced by a trusted run.
        with open(filename, 'rb') as pickleFile:
            self.func2SubtreesMap = pickle.load(pickleFile)

    def uniq(self, seq, idfun=None):
        """Return the items of seq without duplicates, preserving order.

        Two items are duplicates when idfun maps them to the same
        (hashable) marker; the first such item is kept. With idfun left
        as None the items themselves are the markers.
        """
        if idfun is None:
            def idfun(x):
                return x
        seen = set()
        result = []
        for item in seq:
            marker = idfun(item)
            if marker in seen:
                continue
            seen.add(marker)
            result.append(item)
        return result
def sinkSnippetEmbedder(projectRoot, sinkOfInterest, configuration):
    """Embed the users of one sink and return the resulting term-document matrix.

    configuration must provide the keys 'ngramN' and 'smallerNgramsToo';
    both are handed to the SinkSnippetEmbedder constructor. A progress
    line naming the sink is printed first.
    """
    # A single parenthesised argument prints the same text whether print
    # is a statement or a function.
    print('embed for sink: %s' % sinkOfInterest)

    provider = SinkUserProvider(projectRoot)
    sink = provider.getSinkByName(sinkOfInterest)

    ngramN = configuration['ngramN']
    smallerNgramsToo = configuration['smallerNgramsToo']
    embedder = SinkSnippetEmbedder(projectRoot, ngramN, smallerNgramsToo)

    # embedSinkUsers() yields a (name, matrix) pair; only the matrix is
    # of interest to the caller.
    (name, termDocMatrix) = embedder.embedSinkUsers(sink)  #@UnusedVariable
    return termDocMatrix
def embedFunctionsUsingSink(projectRoot, filterName, ngramN,
                            smallerNgramsToo, sinkOfInterest):
    """Embed the users of sinkOfInterest and save the result.

    The output name is '<filterName>_<ngramN>.pickl'. Returns whatever the
    embedder's save(name, sinkOfInterest) call returns.
    """
    # Imported locally, so the tools package is only loaded when this
    # function is actually called.
    from tools.SinkSnippetEmbedder.SinkSnippetEmbedder import SinkSnippetEmbedder
    from tools.SinkSnippetEmbedder.SinkUserProvider import SinkUserProvider

    outputName = '%s_%d.pickl' % (filterName, ngramN)

    snippetEmbedder = SinkSnippetEmbedder(projectRoot, ngramN,
                                          smallerNgramsToo)
    provider = SinkUserProvider(projectRoot)
    sinkToEmbed = provider.getSinkByName(sinkOfInterest)

    # The return value is ignored here; the result is written out through
    # save() below.
    snippetEmbedder.embedSinkUsers(sinkToEmbed)
    return snippetEmbedder.save(outputName, sinkOfInterest)
class SinkMatrixCreator:
    """Create term-document matrices for the functions that call a sink.

    The creator reads the pickled 'func2SubtreesMap.pickl' found under
    projectRoot and converts the n-gram counts of the selected functions
    into a matrix by way of NameToDictMap, OccurrenceCounter and
    NameDictMapToMatrix.
    """

    def __init__(self, projectRoot):
        """Remember projectRoot and derive the paths that depend on it.

        programDir is projectRoot with its last three '/'-separated
        components removed, followed by '/' (e.g. 'a/b/c/d/' -> 'a/b/').
        The SinkUserProvider is rooted two directories above projectRoot.
        """
        self.projectRoot = projectRoot
        self.programDir = '/'.join(self.projectRoot.split('/')[:-3]) + '/'
        self.sinkUserProvider = SinkUserProvider(self.projectRoot + '../../')

    def createMatrixForSink(self, sinkName):
        """Return the term-document matrix of all functions calling sinkName.

        Element [1] of every call to the sink is prefixed with programDir
        to form the function name; duplicates are dropped (first
        occurrence wins) before the matrix is built.
        """
        (unused, callsToSink) = self.sinkUserProvider.getSinkByName(sinkName)
        functionNames = self.uniq(
            ['%s%s' % (self.programDir, c[1]) for c in callsToSink])
        return self.createMatrixForFunctionNames(functionNames)

    # Original author's note: this operation loses TF-IDF.
    # I don't think that's the way to go.
    def createMatrixForFunctionNames(self, functionNames):
        """Build a term-document matrix restricted to functionNames.

        Reloads func2SubtreesMap from disk on every call and resets
        self.nameToDictMap and self.allSymbolsDict.

        Raises KeyError if a name is missing from func2SubtreesMap.d.
        """
        self._loadFunc2SubtreesMap()

        self.nameToDictMap = NameToDictMap()
        self.allSymbolsDict = OccurrenceCounter()
        nameDictMapToMatrix = NameDictMapToMatrix()

        # Resolve every name up front so that an unknown function name
        # fails before anything has been added to the maps.
        functions = [(doc, self.func2SubtreesMap.d[doc])
                     for doc in functionNames]

        for (doc, func) in functions:
            for (ngram, nOccurrences) in func.iteritems():
                # Replay each n-gram once per occurrence.
                # NOTE(review): both add() calls are taken to belong to the
                # innermost loop -- confirm against the upstream file.
                for unused in xrange(nOccurrences):
                    self.nameToDictMap.add(ngram, doc)
                    self.allSymbolsDict.add(ngram)

        nameDictMapToMatrix.convertFromDicts(self.nameToDictMap,
                                             self.allSymbolsDict)
        return nameDictMapToMatrix.termDocumentMatrix

    def _loadFunc2SubtreesMap(self):
        """Unpickle <projectRoot>func2SubtreesMap.pickl into self.func2SubtreesMap.

        The file is opened in binary mode and closed deterministically.
        """
        filename = self.projectRoot + 'func2SubtreesMap.pickl'
        # SECURITY: pickle can execute arbitrary code while loading; only
        # point projectRoot at data produced by a trusted run.
        with open(filename, 'rb') as pickleFile:
            self.func2SubtreesMap = pickle.load(pickleFile)

    def uniq(self, seq, idfun=None):
        """Return the items of seq without duplicates, preserving order.

        Two items are duplicates when idfun maps them to the same
        (hashable) marker; the first such item is kept. With idfun left
        as None the items themselves are the markers.
        """
        if idfun is None:
            def idfun(x):
                return x
        seen = set()
        result = []
        for item in seq:
            marker = idfun(item)
            if marker in seen:
                continue
            seen.add(marker)
            result.append(item)
        return result
def _getAvailableSinks(self):
    """Return the sinks reported by a SinkUserProvider built on self.projectRoot.

    self.args.min_calls_to_sink is forwarded unchanged to getSinks();
    presumably it is a minimum call-count threshold -- confirm against
    SinkUserProvider.
    """
    return SinkUserProvider(self.projectRoot).getSinks(
        self.args.min_calls_to_sink)
def _getFunctionNamesFromSink(self, sinkName):
    """Collect the projectRoot-prefixed names of the functions calling a sink.

    A SinkUserProvider built on self.projectRoot supplies the sink, which
    is unpacked as a two-member pair; only the second member (the calls to
    the sink) is used. Element [1] of each call is appended to
    self.projectRoot, and the list of names is filtered through
    self.unique() before being returned.
    """
    sinkEntry = SinkUserProvider(self.projectRoot).getSinkByName(sinkName)
    # Unpacking (rather than indexing) keeps the requirement that the
    # entry has exactly two members.
    (_ignored, callEntries) = sinkEntry

    candidateNames = []
    for entry in callEntries:
        candidateNames.append(self.projectRoot + entry[1])
    return self.unique(candidateNames)
def __init__(self, projectRoot):
    """Store projectRoot and set up the directory and provider derived from it.

    programDir becomes projectRoot minus its last three '/'-separated
    components, terminated by '/': 'a/b/c/d/' yields 'a/b/'. The
    SinkUserProvider is created for projectRoot + '../../'.
    """
    self.projectRoot = projectRoot

    # With a trailing '/', the final component is the empty string, so
    # cutting three components moves up two directory levels.
    pathComponents = self.projectRoot.split('/')
    keptComponents = pathComponents[:-3]
    self.programDir = '/'.join(keptComponents) + '/'

    providerRoot = self.projectRoot + '../../'
    self.sinkUserProvider = SinkUserProvider(providerRoot)
def _getAvailableSinks(self):
    """Return the sinks reported for the project directory given in self.args.

    A SinkUserProvider is created on self.args.projectDirectory, and
    self.args.min_calls_to_sink is forwarded unchanged to its getSinks();
    presumably a minimum call-count threshold -- confirm against
    SinkUserProvider.
    """
    # Imported locally, so the tools package is only loaded when sinks
    # are actually requested.
    from tools.SinkSnippetEmbedder.SinkUserProvider import SinkUserProvider

    return SinkUserProvider(self.args.projectDirectory).getSinks(
        self.args.min_calls_to_sink)