def compareSimilarity(file, label, margin=0.1):
    total = 0
    nBiggerMargin = 0
    nBigger = 0

    with open(file, 'r') as f:
        for l in f:
            bug, pos, neg = l.strip().split(',')

            bugText = concatenateSummaryAndDescription(bugReportDataset.getBug(bug))
            posText = concatenateSummaryAndDescription(bugReportDataset.getBug(pos))
            negText = concatenateSummaryAndDescription(bugReportDataset.getBug(neg))

            ftrsBug, ftrsPos, ftrsNeg = vectorizer.transform([bugText, posText, negText])

            simPos = cosine_similarity(ftrsBug, ftrsPos)
            simNeg = cosine_similarity(ftrsBug, ftrsNeg)

            if simPos - simNeg >= margin:
                nBiggerMargin += 1

            if simPos > simNeg:
                nBigger += 1

            total += 1

    print("%s - triplets where the difference between the positive and negative similarities is bigger than %.2f: %.3f%% (%d/%d)" % (
        label, margin, nBiggerMargin * 100.0 / total, nBiggerMargin, total))
    print("%s - triplets where the positive similarity is bigger than the negative one: %.3f%% (%d/%d)" % (
        label, nBigger * 100.0 / total, nBigger, total))
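# Usage sketch (hypothetical file names; assumes `bugReportDataset` and `vectorizer`
# are the module-level objects built elsewhere in this script, and that each triplet
# file contains one "anchor,positive,negative" line of bug ids):
#
#   compareSimilarity('dataset/validation_triplets.txt', 'validation', margin=0.1)
#   compareSimilarity('dataset/test_triplets.txt', 'test', margin=0.1)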
def pregenerateBugEmbedding(self, bugIds):
    # Generate the bag-of-words representation of each bug
    texts = [concatenateSummaryAndDescription(self.bugReportDatabase.getBug(bugId)) for bugId in bugIds]
    vectors = self.tfIdfVectorizer.transform(texts)

    # Cache one sparse row vector per bug id
    for idx, bugId in enumerate(bugIds):
        self.bugEmbeddingById[bugId] = vectors[idx]
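# Sketch of how the cached vectors could be scored afterwards (hypothetical helper,
# not part of the original class; assumes scikit-learn's cosine_similarity and that
# pregenerateBugEmbedding was already called for both ids):
#
#   from sklearn.metrics.pairwise import cosine_similarity
#
#   def scorePair(self, bugId1, bugId2):
#       # Each cached entry is a 1 x vocabulary sparse row vector.
#       return cosine_similarity(self.bugEmbeddingById[bugId1],
#                                self.bugEmbeddingById[bugId2])[0, 0]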
def generateNegativeListSparseVector(listSize, bugReportDatabase, bugIds, vectorizerClass, masterBugIdByBugId,
                                     normalize, nProcesses, sameProd=False):
    bugTexts = [concatenateSummaryAndDescription(bugReportDatabase.getBug(bugId)) for bugId in bugIds]

    logger = logging.getLogger(__name__)
    logger.info("Transforming text to vector")
    vectors = vectorizerClass.transform(bugTexts)

    if normalize:
        logger.info("Normalizing vectors to length 1")
        matrixRep = sklearn.preprocessing.normalize(vectors)
    else:
        matrixRep = vectors

    # Cache the similarity list of the bugs
    similarityIterByBugId = {}
    logger.info("Starting to cache the similarity list")

    # Only bugs that belong to a master set and appear in bugIds get a similarity list
    bugToCreateList = []
    bugPosition = {id: idx for idx, id in enumerate(bugIds)}

    for master in bugReportDatabase.getMasterSetById(bugIds).values():
        for bugId in master:
            if bugPosition.get(bugId, -1) != -1:
                bugToCreateList.append(bugId)

    def parallel(chunk, queue, index):
        logger = logging.getLogger()
        logger.info("Process %s started to compute the similarity for %d duplicate bugs. Start idx: %d" % (
            os.getpid(), len(chunk), index))
        start = time()

        for idx, bugIdx in enumerate(chunk):
            bugId = bugToCreateList[bugIdx]
            position = bugPosition[bugId]
            bugRep = matrixRep[position]
            product = bugReportDatabase.getBug(bugId)['product'] if sameProd else None

            # Cosine similarity of this bug against every other bug (vectors are normalized)
            similarityMatrix = matrixRep.dot(bugRep.T).toarray()
            masterId = masterBugIdByBugId.get(bugId)
            nMostSimilar = heapq.nlargest(listSize,
                                          iterateSimList(bugIds, similarityMatrix, masterId, masterBugIdByBugId,
                                                         product, bugReportDatabase))
            output = (bugId, nMostSimilar)

            if (idx + 1) % 100 == 0:
                t = time() - start
                logger.info("%s computed similarity list for %d of %d in %f seconds" % (
                    os.getpid(), idx + 1, len(chunk), t))
                start = time()

            queue.put(output)

        # An empty message signals that this process is done
        queue.put([])
        logger.info("Process %s has finished" % (os.getpid()))

    bugIdxs = list(range(len(bugToCreateList)))
    q = Queue()
    processes = []

    for idx, chunk in enumerate(createChunks(bugIdxs, nProcesses)):
        arr = RawArray(c_ulong, chunk)
        processes.append(multiprocessing.Process(target=parallel, args=(arr, q, idx)))

    for p in processes:
        p.start()

    count = 0

    while True:
        try:
            out = q.get()

            if len(out) == 0:
                count += 1
                logger.info("One process has finished! Count: %d/%d" % (count, len(processes)))
            else:
                bugId, simList = out
                similarityIterByBugId[bugId] = ([int(negBugId) for cosine, negBugId in simList],
                                                [cosine for cosine, negBugId in simList[:30]],
                                                [cosine for cosine, negBugId in simList[-30:]])

                nProcessedBugs = len(similarityIterByBugId)

                if nProcessedBugs % 100 == 0:
                    logger.info("Main Thread: Processed %d" % (nProcessedBugs))

                if nProcessedBugs == len(bugIdxs):
                    logger.info("It is over!")
                    break

            if count == len(processes):
                break
        except Empty as e:
            pass

    return similarityIterByBugId
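# Sketch of how the returned cache can be consumed (hypothetical call and variable
# names, not part of the original function). Each entry maps a duplicate bug id to a
# tuple of (candidate ids in decreasing cosine order, top-30 cosines, bottom-30 cosines):
#
#   similarityListByBugId = generateNegativeListSparseVector(
#       30, bugReportDatabase, bugIds, tfIdfVectorizer, masterBugIdByBugId,
#       normalize=True, nProcesses=6)
#
#   negIds, topScores, bottomScores = similarityListByBugId[someDuplicateId]
#   hardNegatives = negIds[:10]  # most similar non-duplicate candidates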
bugIds, duplicateByBugId, pairs, validations = loadData(args.input)
biggestValidation = validations[-1]

bugReportDataset = BugReportDatabase.fromJson(args.bug_dataset)
bugIds = list(bugIds)

similarityListByDuplicate = []

if args.model_type == 'tfidf':
    # Load model
    global vectorByBug
    vectorByBug = {}

    tfIdfVectorizer = pickle.load(open(args.model, 'rb'))

    # Generate the bag-of-words representation of each bug
    texts = [concatenateSummaryAndDescription(bugReportDataset.getBug(bugId)) for bugId in bugIds]
    vectors = tfIdfVectorizer.transform(texts)

    for idx, bugId in enumerate(bugIds):
        vectorByBug[bugId] = vectors[idx]
else:
    # We can't import torch without allocating a GPU in the Cedar cluster.
    from experiments.duplicate_bug_detection_deep_learning import generateBugEmbeddings, \
        calculateSimilarityScoresDL, CosinePrediction, getDataHandlerLexiconEmb, getModel
    import torch
    import torch.nn.functional as F
    from util.torch_util import softmaxPrediction, getVariable
            bugId1, bugId2, label = l.strip().split(',')

            trainingBugs.add(bugId1)
            trainingBugs.add(bugId2)
    else:
        logger.info("Reading training")
        bugDataset = BugDataset(args.training)
        trainingBugs.update(bugDataset.bugIds)

    logger.info("Preprocessing and fitting data")
    trainingText = []

    for bugId in trainingBugs:
        bugReport = bugReportDataset.getBug(bugId)
        text = concatenateSummaryAndDescription(bugReport)
        trainingText.append(text)

    if args.load:
        logger.info('Loading object')
        vectorizer = pickle.load(open(args.load, 'rb'))
    else:
        tokenizer = WhitespaceTokenizer() if args.space_tokenize else MultiLineTokenizer()
        stemmer = SnowballStemmer('english', ignore_stopwords=True) if args.stemmer else None
        stopWords = set(stopwords.words('english') + list(string.punctuation) + ["n't", "'t"])
        filters = [TransformNumberToZeroFilter(), StripPunctuactionFilter(), DectectNotUsualWordFilter()]

        tokenizerStemmer = ClassicalPreprocessing(tokenizer, stemmer, stopWords, filters)

        logger.info("Using %s to tokenize" % tokenizer.__class__.__name__)

        if args.trigram:
def __init__(self, bugReportDatabase, bugIds, vectorizerClass, bugToCache, mastersetByBugId, normalize):
    self.bugReportDatabase = bugReportDatabase
    self.bugIds = bugIds
    self.vectorizerClass = vectorizerClass

    bugTexts = [concatenateSummaryAndDescription(bugReportDatabase.getBug(bugId)) for bugId in bugIds]

    self.logger = logging.getLogger(__name__)
    self.logger.info("Transforming text to vector")
    vectors = self.vectorizerClass.transform(bugTexts)

    if normalize:
        self.logger.info("Normalizing vectors to length 1")
        self.matrixRep = sklearn.preprocessing.normalize(vectors)
    else:
        self.matrixRep = vectors

    self.sparseRepByBugId = {}

    for bugId, representation in zip(self.bugIds, self.matrixRep):
        self.sparseRepByBugId[bugId] = representation.T

    # Cache the similarity list of the bugs
    self.similarityIterByBugId = {}
    self.logger.info("Starting to cache the similarity list")

    def parallel(chunk, queue, index):
        logger = logging.getLogger()
        logger.info("Process %s started to compute the similarity for %d duplicate bugs. Start idx: %d" % (
            os.getpid(), len(chunk), index))

        output = []
        start = time()

        for idx, bugId in enumerate(chunk):
            bugId = str(bugId)
            simList = self.generateSimilarityList(bugId, mastersetByBugId[bugId], iterator=False)
            output.append((bugId, simList))

            if (idx + 1) % 100 == 0:
                t = time() - start
                self.logger.info("%s computed similarity list for %d of %d in %f seconds" % (
                    os.getpid(), idx + 1, len(chunk), t))

        queue.put(output)

    q = Queue()
    nProcesses = 6
    processes = []

    for idx, chunk in enumerate(createChunks(bugToCache, nProcesses)):
        arr = RawArray(c_ulong, [int(bugId) for bugId in chunk])
        processes.append(multiprocessing.Process(target=parallel, args=(arr, q, idx)))

    for p in processes:
        p.start()

    count = 0

    while True:
        try:
            # Each process sends its whole batch of similarity lists at once
            for bugId, simList in q.get():
                self.similarityIterByBugId[bugId] = iter(simList)

            count += 1

            if count == len(processes):
                break
        except Empty as e:
            pass
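# Sketch of how the cached iterators might be consumed when sampling negatives
# (hypothetical usage; `anchorBugId` is illustrative and the exact item format
# depends on what generateSimilarityList returns):
#
#   nextCandidate = next(self.similarityIterByBugId[anchorBugId])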