class ExpressionTreeEmbedder:
    """Embed the expressions a symbol takes part in as a term-document matrix.

    For the origin of a context and each of its neighbours, the expressions
    reported by ``TreeToExpressionConverter`` for the symbol are collected and
    handed to ``NameDictMapToMatrix`` for conversion.
    """

    def __init__(self):
        self.exprTreeProvider = ExpressionTreeProvider()
        self.trackingInfoProvider = TrackingInfoProvider()
        self.treeToExprConverter = TreeToExpressionConverter()

    def embed(self, context, symbol):
        """Return the term-document matrix for ``symbol`` within ``context``.

        Returns None when no vectors or no expressions could be collected.
        """
        return self._termDocumentMatrixFromContext(context, symbol)

    def _termDocumentMatrixFromContext(self, context, symbol):
        """Collect the term dictionaries and convert them into a matrix."""
        termDicts = self._termDictsFromContext(context, symbol)
        if termDicts is None:
            return None

        (vecs, allNgrams) = termDicts

        # The converter is kept on the instance (as before) so that the one
        # used for the most recent embedding stays reachable from outside.
        self.nameDictMapToMatrix = NameDictMapToMatrix()
        self.nameDictMapToMatrix.convertFromDicts(vecs, allNgrams)
        return self.nameDictMapToMatrix.termDocumentMatrix

    def _termDictsFromContext(self, context, symbol):
        """Return ``(vecs, allNgrams)`` for the context, or None if empty.

        ``vecs`` maps each location to the expressions found there and
        ``allNgrams`` counts every expression that was seen. As a side
        effect, ``setExpressions`` is called on the origin and on every
        neighbour.
        """
        vecs = NameToDictMap()
        allNgrams = OccurrenceCounter()

        # The origin is processed exactly like a neighbour. It is appended
        # only for the duration of the loop; the finally-clause guarantees
        # the caller's list is restored even if a lookup below raises.
        context.neighbours.append(context.origin)
        try:
            for neighbour in context.neighbours:
                location = neighbour.location
                expressions = self.treeToExprConverter.getExpressionsForSymbol(
                    location, symbol)
                neighbour.setExpressions(expressions)

                # Add a null-vector for the function if it does not contain
                # any expressions.
                if not expressions:
                    vecs.add(None, location)

                for expr in expressions:
                    vecs.setItem(expr, location, 1.0)
                    allNgrams.add(expr)
        finally:
            context.neighbours.pop()

        if not vecs.d or not allNgrams.d:
            return None

        return (vecs, allNgrams)

    def getAllConditionNodes(self):
        """Return ``self.tree.conditionalNodes``."""
        # NOTE(review): self.tree is not assigned anywhere in this class, so
        # this raises AttributeError unless a caller sets it first - confirm
        # whether this method is still in use.
        return self.tree.conditionalNodes
class ExpressionTreeEmbedder:
    """Build a term-document matrix from the expressions involving a symbol.

    The expressions are gathered, via ``TreeToExpressionConverter``, for the
    origin of a context and for each of its neighbours, then converted by
    ``NameDictMapToMatrix``.
    """

    def __init__(self):
        self.exprTreeProvider = ExpressionTreeProvider()
        self.trackingInfoProvider = TrackingInfoProvider()
        self.treeToExprConverter = TreeToExpressionConverter()

    def embed(self, context, symbol):
        """Return the term-document matrix for ``symbol`` within ``context``.

        Returns None when no vectors or no expressions could be collected.
        """
        return self._termDocumentMatrixFromContext(context, symbol)

    def _termDocumentMatrixFromContext(self, context, symbol):
        """Gather the term dictionaries and turn them into a matrix."""
        collected = self._termDictsFromContext(context, symbol)
        if collected is None:
            return None

        (vecs, allNgrams) = collected

        # Stored on the instance (unchanged behaviour) so the converter of
        # the most recent embedding remains accessible to other code.
        self.nameDictMapToMatrix = NameDictMapToMatrix()
        self.nameDictMapToMatrix.convertFromDicts(vecs, allNgrams)
        return self.nameDictMapToMatrix.termDocumentMatrix

    def _termDictsFromContext(self, context, symbol):
        """Return ``(vecs, allNgrams)`` for the context, or None if empty.

        ``vecs`` records, per location, the expressions found there;
        ``allNgrams`` counts every expression encountered. ``setExpressions``
        is invoked on the origin and on each neighbour as a side effect.
        """
        vecs = NameToDictMap()
        allNgrams = OccurrenceCounter()

        # Treat the origin as one more neighbour while iterating. The
        # try/finally makes sure the temporary entry is removed again even
        # when one of the calls inside the loop fails.
        context.neighbours.append(context.origin)
        try:
            for neighbour in context.neighbours:
                location = neighbour.location
                expressions = self.treeToExprConverter.getExpressionsForSymbol(
                    location, symbol)
                neighbour.setExpressions(expressions)

                # Add a null-vector for the function if it does not contain
                # any expressions.
                if not expressions:
                    vecs.add(None, location)

                for expr in expressions:
                    vecs.setItem(expr, location, 1.0)
                    allNgrams.add(expr)
        finally:
            context.neighbours.pop()

        if not vecs.d or not allNgrams.d:
            return None

        return (vecs, allNgrams)

    def getAllConditionNodes(self):
        """Return ``self.tree.conditionalNodes``."""
        # NOTE(review): nothing in this class assigns self.tree; unless a
        # caller sets it beforehand this raises AttributeError - confirm
        # whether the method is still needed.
        return self.tree.conditionalNodes
def createMatrixForFunctionNames(self, functionNames):
    """Build a term-document matrix restricted to the given functions.

    The per-function n-gram counts are read from ``self.func2SubtreesMap``
    (after ``_loadFunc2SubtreesMap`` has run) and replayed, one ``add`` per
    occurrence, into a fresh ``NameToDictMap`` / ``OccurrenceCounter`` pair
    stored on ``self.nameToDictMap`` and ``self.allSymbolsDict``.

    Every name is looked up in the map before any count is replayed.
    """
    self._loadFunc2SubtreesMap()

    self.nameToDictMap = NameToDictMap()
    self.allSymbolsDict = OccurrenceCounter()
    matrixBuilder = NameDictMapToMatrix()

    # Resolve all requested functions up front.
    selected = []
    for name in functionNames:
        selected.append((name, self.func2SubtreesMap.d[name]))

    for name, ngramCounts in selected:
        for ngram, count in ngramCounts.iteritems():
            # Replay each occurrence individually.
            for _ in xrange(count):
                self.nameToDictMap.add(ngram, name)
                self.allSymbolsDict.add(ngram)

    matrixBuilder.convertFromDicts(self.nameToDictMap, self.allSymbolsDict)
    return matrixBuilder.termDocumentMatrix
def createMatrixForFunctionNames(self, functionNames):
    """Return a term-document matrix covering only ``functionNames``.

    After ``_loadFunc2SubtreesMap`` has run, the n-gram counts stored in
    ``self.func2SubtreesMap`` for each requested function are re-added,
    occurrence by occurrence, to a new ``NameToDictMap`` and a new
    ``OccurrenceCounter``. Both are kept on the instance as
    ``self.nameToDictMap`` and ``self.allSymbolsDict``.

    All names are resolved against the map before anything is re-added.
    """
    self._loadFunc2SubtreesMap()

    self.nameToDictMap = NameToDictMap()
    self.allSymbolsDict = OccurrenceCounter()
    converter = NameDictMapToMatrix()

    requested = [
        (functionName, self.func2SubtreesMap.d[functionName])
        for functionName in functionNames
    ]

    for (functionName, subtreeCounts) in requested:
        for (subtree, timesSeen) in subtreeCounts.iteritems():
            # One add() per recorded occurrence.
            for _occurrence in xrange(timesSeen):
                self.nameToDictMap.add(subtree, functionName)
                self.allSymbolsDict.add(subtree)

    converter.convertFromDicts(self.nameToDictMap, self.allSymbolsDict)
    termDocMatrix = converter.termDocumentMatrix
    return termDocMatrix
class SinkSnippetEmbedder:
    """Embed the code areas around the calls to a sink as a matrix.

    ``projectRoot`` is joined to file names by plain string concatenation
    throughout, so it must end with a path separator.
    """

    def __init__(self, projectRoot, ngramN, smallerNgramsToo):
        self.projectRoot = projectRoot
        self.nCalls = self._determineTotalNumberOfCalls()
        self.callAreaExtractor = SinkSnippetExtractor()
        self.embedder = Embedder(projectRoot)
        self.embedder.configureNgramCalculator(ngramN, smallerNgramsToo)
        self.nameDictMapToMatrix = NameDictMapToMatrix()

    def _isSinkCalledTooOften(self, callsToSink):
        """Return True if the sink's share of all calls exceeds the bound."""
        # If calls to this function make up more than
        # UPPER_BOUND_FOR_NUMBER_OF_CALLS_AS_FRACTION of all calls in the
        # project, the sink is called too often to be interesting.
        fraction = float(len(callsToSink)) / self.nCalls
        if fraction > UPPER_BOUND_FOR_NUMBER_OF_CALLS_AS_FRACTION:
            print('Sink called too often')
            return True
        return False

    def _isSinkNotCalledOftenEnough(self, callsToSink):
        """Return True if the sink has fewer calls than the lower bound."""
        return len(callsToSink) < LOWER_BOUND_FOR_NUMBER_OF_CALLS

    def _determineTotalNumberOfCalls(self):
        """Return the total number of calls recorded in the call index."""
        # NOTE: pickle.load executes whatever the file instructs it to; only
        # point this at project directories you trust.
        # The open mode is left at its default because the mode used by the
        # writer of callIndex.pickl is not visible from here.
        with open(self.projectRoot + 'callIndex.pickl') as indexFile:
            callIndex = pickle.load(indexFile)
        return numpy.sum([len(v) for v in callIndex.d.itervalues()])

    def embedSinkUsers(self, sink):
        """Embed the area around every call to ``sink``.

        ``sink[1]`` holds the labels of the calls to the sink and ``sink[0]``
        is handed back unchanged as the first element of the result.

        Returns ``(sink[0], termDocumentMatrix)``, or ``(None, None)`` when
        the sink is called too often or not often enough.
        """
        callsToSink = sink[1]

        if self._isSinkCalledTooOften(callsToSink):
            return (None, None)

        if self._isSinkNotCalledOftenEnough(callsToSink):
            print('Sink not called often enough')
            return (None, None)

        getSinkAreaSubtree = self.callAreaExtractor.getSinkAreaSubtree
        filterAndAddAST = self.embedder.filterAndAddAST

        for label in callsToSink:
            areaSubtree = getSinkAreaSubtree(self.projectRoot, label)
            filterAndAddAST(label, areaSubtree)

        (vecs, allNgrams) = self.embedder.getMaps()
        self.nameDictMapToMatrix.convertFromDicts(vecs, allNgrams)
        return (sink[0], self.nameDictMapToMatrix.termDocumentMatrix)

    def save(self, name, sinkName):
        """Pickle the current maps below the project's embeddings directory.

        The files ``func2SubtreesMap.pickl`` and ``allSubtreesDict.pickl``
        are written to ``<projectRoot>embeddings/<name>/sinks/<sinkName>/``;
        missing directories along that path are created one level at a time.
        """
        import os

        embeddingsDir = self.projectRoot + 'embeddings'
        thisEmbeddingDir = embeddingsDir + '/' + name
        sinkEmbeddingDir = thisEmbeddingDir + '/' + 'sinks'
        thisSinkEmbeddingDir = sinkEmbeddingDir + '/' + sinkName

        for directory in (embeddingsDir, thisEmbeddingDir,
                          sinkEmbeddingDir, thisSinkEmbeddingDir):
            if not os.path.exists(directory):
                os.mkdir(directory)

        # Pickle streams are bytes, so the files are opened in binary mode;
        # the with-blocks make sure the data is flushed and the handles are
        # closed before returning.
        mapPath = thisSinkEmbeddingDir + '/func2SubtreesMap.pickl'
        with open(mapPath, 'wb') as mapFile:
            pickle.dump(self.nameDictMapToMatrix.nameDictMap, mapFile)

        symbolsPath = thisSinkEmbeddingDir + '/allSubtreesDict.pickl'
        with open(symbolsPath, 'wb') as symbolsFile:
            pickle.dump(self.nameDictMapToMatrix.allSymbolsDict, symbolsFile)