def __init__(self, projectRoot, ngramN, smallerNgramsToo):
    """Set up the instance for the project located at ``projectRoot``.

    ``ngramN`` and ``smallerNgramsToo`` are forwarded unchanged to the
    embedder's ``configureNgramCalculator``; presumably they are the n-gram
    size and a flag for also including shorter n-grams (confirm in
    ``Embedder``).
    """
    # Keep this order: the project root is stored before the call count is
    # determined, since the helper may read it from the instance.
    self.projectRoot = projectRoot
    self.nCalls = self._determineTotalNumberOfCalls()

    self.callAreaExtractor = SinkSnippetExtractor()

    # The embedder is bound to the same project root and configured
    # immediately after construction.
    ngramEmbedder = self.embedder = Embedder(projectRoot)
    ngramEmbedder.configureNgramCalculator(ngramN, smallerNgramsToo)

    self.nameDictMapToMatrix = NameDictMapToMatrix()
def _termDocumentMatrixFromContext(self, context, symbol): x = self._termDictsFromContext(context, symbol) if x == None: return None (vecs, allNgrams) = x self.nameDictMapToMatrix = NameDictMapToMatrix() self.nameDictMapToMatrix.convertFromDicts(vecs, allNgrams) termDocMatrix = self.nameDictMapToMatrix.termDocumentMatrix return termDocMatrix
def main(projectRoot, tfidf=True):
    """Convert a project's n-gram maps into a term-document matrix and save it.

    The file names ``func2SubtreesMap.pickl`` and ``allSubtreesDict.pickl``
    are appended to ``projectRoot`` by plain string concatenation, so a
    directory path must already end with its path separator. Both paths are
    handed to ``NameDictMapToMatrix.convertFromFiles``.

    projectRoot -- path prefix of the project; also passed to
                   ``converter.save``.
    tfidf       -- when true (the default), ``tfidf()`` is called on the
                   resulting term-document matrix before saving.
    """
    nameDictMapFilename = projectRoot + 'func2SubtreesMap.pickl'
    allSymbolsFilename = projectRoot + 'allSubtreesDict.pickl'

    # NOTE: the matrix is rebuilt on every run. An earlier check that skipped
    # the conversion when 'termDocMatrix.pickl' already existed under
    # projectRoot is disabled.
    converter = NameDictMapToMatrix()
    converter.convertFromFiles(nameDictMapFilename, allSymbolsFilename)

    if tfidf:
        converter.termDocumentMatrix.tfidf()

    converter.save(projectRoot)
# createMatrixForFunctionNames(self, functionNames)
#
# Builds and returns a term-document matrix restricted to the functions named
# in functionNames.
#
# Steps, as visible in the code below:
#   1. Calls self._loadFunc2SubtreesMap().
#   2. Replaces self.nameToDictMap and self.allSymbolsDict with fresh
#      NameToDictMap / OccurrenceCounter instances (previous contents are
#      discarded).
#   3. Looks up every name via self.func2SubtreesMap.d[doc]; a name missing
#      from that mapping makes the lookup fail (KeyError if .d is a plain
#      dict -- TODO confirm).
#   4. Iterates each function's (ngram, nOccurrences) pairs and replays them
#      into the two accumulators via their add() methods.
#   5. Converts the accumulators with a local NameDictMapToMatrix and returns
#      its termDocumentMatrix.
#
# NOTE(review): this definition is collapsed onto a single line, so the loop
# nesting cannot be read from the text. In particular it is not visible
# whether self.allSymbolsDict.add(ngram) runs once per occurrence (inside the
# xrange loop) or once per ngram. Confirm against the formatted file before
# restructuring.
# NOTE(review): iteritems() and xrange are Python 2 only.
def createMatrixForFunctionNames(self, functionNames): self._loadFunc2SubtreesMap() self.nameToDictMap = NameToDictMap() self.allSymbolsDict = OccurrenceCounter() nameDictMapToMatrix = NameDictMapToMatrix() functions = [(doc, self.func2SubtreesMap.d[doc]) for doc in functionNames] for (doc, func) in functions: for (ngram, nOccurrences) in func.iteritems(): for unused in xrange(nOccurrences): self.nameToDictMap.add(ngram, doc) self.allSymbolsDict.add(ngram) nameDictMapToMatrix.convertFromDicts(self.nameToDictMap, self.allSymbolsDict) newTermDocMatrix = nameDictMapToMatrix.termDocumentMatrix return newTermDocMatrix