def compareSimilarity(file, label, margin=0.1):
    # For each "anchor,positive,negative" triplet in 'file', compare the TF-IDF cosine
    # similarity of the (anchor, positive) pair against the (anchor, negative) pair.
    total = 0
    nBiggerMargin = 0
    nBigger = 0
    f = open(file, 'r')

    for l in f:
        bug, pos, neg = l.strip().split(',')

        bugText = concatenateSummaryAndDescription(bugReportDataset.getBug(bug))
        posText = concatenateSummaryAndDescription(bugReportDataset.getBug(pos))
        negText = concatenateSummaryAndDescription(bugReportDataset.getBug(neg))

        ftrsBug, ftrsPos, ftrsNeg = vectorizer.transform([bugText, posText, negText])

        simPos = cosine_similarity(ftrsBug, ftrsPos)
        simNeg = cosine_similarity(ftrsBug, ftrsNeg)

        if simPos - simNeg >= margin:
            nBiggerMargin += 1

        if simPos > simNeg:
            nBigger += 1

        total += 1

    f.close()

    print("%s - triplets which the difference between positive and negative is bigger than %d: %.3f (%d/%d)" % (
        label, margin, nBiggerMargin * 100.0 / total, nBiggerMargin, total))
    print("%s - triplets which the positive similarity is bigger than the negative: %.3f (%d/%d)" % (
        label, nBigger * 100.0 / total, nBigger, total))
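
# --- Usage sketch (not part of the original example) ---
# A minimal sketch of how compareSimilarity might be driven. The function above reads the
# module-level globals 'bugReportDataset' and 'vectorizer', so the sketch populates them.
# BugReportDatabase and concatenateSummaryAndDescription are assumed to be importable from
# the project; the file paths and the 'validation' label are placeholders, not real names.
import pickle
from sklearn.metrics.pairwise import cosine_similarity

bugReportDataset = BugReportDatabase.fromJson('bug_reports.json')        # placeholder path
vectorizer = pickle.load(open('tfidf_model.pkl', 'rb'))                  # placeholder: a fitted TfidfVectorizer
compareSimilarity('validation_triplets.txt', 'validation', margin=0.1)   # one "anchor,positive,negative" triplet per line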
    def pregenerateBugEmbedding(self, bugIds):
        # Generate bag of words representation for each bug
        texts = [
            concatenateSummaryAndDescription(
                self.bugReportDatabase.getBug(bugId)) for bugId in bugIds
        ]
        vectors = self.tfIdfVectorizer.transform(texts)

        for idx, bugId in enumerate(bugIds):
            self.bugEmbeddingById[bugId] = vectors[idx]
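
    # --- Usage sketch (not part of the original example) ---
    # A minimal sketch, not the author's code: once pregenerateBugEmbedding has filled
    # self.bugEmbeddingById, scoring a candidate duplicate pair is just a cosine similarity
    # between the two cached sparse rows. The method name 'pairSimilarity' is hypothetical.
    def pairSimilarity(self, bugId1, bugId2):
        from sklearn.metrics.pairwise import cosine_similarity

        # Each cached embedding is a 1 x vocabulary_size sparse row produced by transform()
        emb1 = self.bugEmbeddingById[bugId1]
        emb2 = self.bugEmbeddingById[bugId2]

        # cosine_similarity returns a 1x1 matrix for two single-row inputs
        return float(cosine_similarity(emb1, emb2)[0, 0])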
Example no. 3
def generateNegativeListSparseVector(listSize, bugReportDatabase, bugIds, vectorizerClass, masterBugIdByBugId,
                                     normalize, nProcesses, sameProd=False):
    # For each duplicate bug, compute (in parallel) the 'listSize' non-duplicate bugs that are
    # most similar to it under TF-IDF cosine similarity, and return them indexed by bug id.
    bugTexts = [concatenateSummaryAndDescription(bugReportDatabase.getBug(bugId)) for bugId in bugIds]
    logger = logging.getLogger(__name__)

    logger.info("Transforming text to vector")
    vectors = vectorizerClass.transform(bugTexts)

    if normalize:
        logger.info("Normalizing vectors to length 1")
        matrixRep = sklearn.preprocessing.normalize(vectors)
    else:
        matrixRep = vectors

    similarityIterByBugId = {}

    # Cache the similarity list of the bugs
    logger.info("Starting to cache the similarity list")

    bugToCreateList = []
    bugPosition = {bugId: idx for idx, bugId in enumerate(bugIds)}

    for master in bugReportDatabase.getMasterSetById(bugIds).values():
        for bugId in master:
            if bugPosition.get(bugId, -1) != -1:
                bugToCreateList.append(bugId)

    def parallel(chunk, queue, index):
        logger = logging.getLogger()
        logger.info(
            "Process %s started to compute the similarity for %d duplicate bugs. Start idx: %d" % (
                os.getpid(), len(chunk), index))

        start = time()
        for idx, bugIdx in enumerate(chunk):
            bugId = bugToCreateList[bugIdx]
            position = bugPosition[bugId]
            bugRep = matrixRep[position]
            product = bugReportDatabase.getBug(bugId)['product'] if sameProd else None

            similarityMatrix = matrixRep.dot(bugRep.T).toarray()

            masterId = masterBugIdByBugId.get(bugId)
            nMostSimilar = heapq.nlargest(listSize,
                                          iterateSimList(bugIds, similarityMatrix, masterId, masterBugIdByBugId,
                                                         product, bugReportDatabase))

            output = (bugId, nMostSimilar)

            if (idx + 1) % 100 == 0:
                t = time() - start
                logger.info("%s computed similarity list for %d of %d in %f seconds" % (
                    os.getpid(), idx + 1, len(chunk), t))
                start = time()

            queue.put(output)

        # An empty list is the sentinel signalling that this worker has finished
        queue.put([])

        logger.info("Process %s has finished" % (os.getpid()))

    bugIdxs = list(range(len(bugToCreateList)))

    q = Queue()
    processes = []

    for idx, chunk in enumerate(createChunks(bugIdxs, nProcesses)):
        arr = RawArray(c_ulong, chunk)
        processes.append(multiprocessing.Process(target=parallel, args=(arr, q, idx)))

    for p in processes:
        p.start()

    count = 0

    while True:
        try:
            # A timeout makes the Empty handler below reachable if a worker stalls
            out = q.get(timeout=60)

            if len(out) == 0:
                count += 1
                logger.info("One process was ended up! Count: %d/%d" % (count, len(processes)))
            else:
                bugId, simList = out
                similarityIterByBugId[bugId] = ([int(negBugId) for cosine, negBugId in simList],
                                                [cosine for cosine, negBugId in simList[:30]],
                                                [cosine for cosine, negBugId in simList[-30:]])

            nProcessedBugs = len(similarityIterByBugId)

            if nProcessedBugs % 100 == 0:
                logger.info("Main Thread: Processed %d " % (nProcessedBugs))

            if nProcessedBugs == len(bugIdxs):
                logger.info("It is over!")
                break

            if count == len(processes):
                break
        except Empty:
            continue

    return similarityIterByBugId
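
# --- Usage sketch (not part of the original example) ---
# A minimal sketch, under assumptions, of how generateNegativeListSparseVector might be called.
# BugReportDatabase, BugDataset and the pickled TF-IDF model mirror how they are used elsewhere
# on this page; every path and the listSize/nProcesses values are placeholders. The mapping
# masterBugIdByBugId is rebuilt from getMasterSetById, assumed to map master id -> duplicate set.
if __name__ == '__main__':
    import pickle

    bugReportDatabase = BugReportDatabase.fromJson('bug_reports.json')   # placeholder path
    bugDataset = BugDataset('training_split.txt')                        # placeholder path
    bugIds = list(bugDataset.bugIds)

    masterBugIdByBugId = {bugId: masterId
                          for masterId, dupSet in bugReportDatabase.getMasterSetById(bugIds).items()
                          for bugId in dupSet}

    tfIdfVectorizer = pickle.load(open('tfidf_model.pkl', 'rb'))         # placeholder: fitted TfidfVectorizer

    negativeListByBugId = generateNegativeListSparseVector(
        listSize=30, bugReportDatabase=bugReportDatabase, bugIds=bugIds,
        vectorizerClass=tfIdfVectorizer, masterBugIdByBugId=masterBugIdByBugId,
        normalize=True, nProcesses=4)

    # Each value is (negative bug ids, cosines of the head of the list, cosines of its tail),
    # following the tuple built in the loop above.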
Example no. 4
    bugIds, duplicateByBugId, pairs, validations = loadData(args.input)

    biggestValidation = validations[-1]
    bugReportDataset = BugReportDatabase.fromJson(args.bug_dataset)
    bugIds = list(bugIds)
    similarityListByDuplicate = []

    if args.model_type == 'tfidf':
        # Load Model
        global vectorByBug
        vectorByBug = {}
        tfIdfVectorizer = pickle.load(open(args.model, 'rb'))

        # Generate bag of words representation for each bug
        texts = [
            concatenateSummaryAndDescription(bugReportDataset.getBug(bugId))
            for bugId in bugIds
        ]
        vectors = tfIdfVectorizer.transform(texts)

        for idx, bugId in enumerate(bugIds):
            vectorByBug[bugId] = vectors[idx]

    else:
        # We can't import torch without allocating a GPU on the Cedar cluster.
        from experiments.duplicate_bug_detection_deep_learning import generateBugEmbeddings, \
            calculateSimilarityScoresDL, \
            CosinePrediction, getDataHandlerLexiconEmb, getModel
        import torch
        import torch.nn.functional as F
        from util.torch_util import softmaxPrediction, getVariable
                bugId1, bugId2, label = l.strip().split(',')

                trainingBugs.add(bugId1)
                trainingBugs.add(bugId2)
        else:
            logger.info("Reading training")
            bugDataset = BugDataset(args.training)
            trainingBugs.update(bugDataset.bugIds)


    logger.info("Preprocessing and fitting data")
    trainingText = []

    for bugId in trainingBugs:
        bugReport = bugReportDataset.getBug(bugId)
        text = concatenateSummaryAndDescription(bugReport)
        trainingText.append(text)

    if args.load:
        logger.info('Loading vectorizer object')
        vectorizer = pickle.load(open(args.load, 'rb'))
    else:
        tokenizer = WhitespaceTokenizer() if args.space_tokenize else MultiLineTokenizer()
        stemmer = SnowballStemmer('english', ignore_stopwords=True) if args.stemmer else None
        stopWords = set(stopwords.words('english') + list(string.punctuation) + ["n't", "'t", ])
        filters = [TransformNumberToZeroFilter(), StripPunctuactionFilter(), DectectNotUsualWordFilter()]
        tokenizerStemmer = ClassicalPreprocessing(tokenizer, stemmer, stopWords, filters)

        logger.info("Using %s to tokenize" % tokenizer.__class__.__name__)

        if args.trigram:
    def __init__(self, bugReportDatabase, bugIds, vectorizerClass, bugToCache,
                 mastersetByBugId, normalize):
        self.bugReportDatabase = bugReportDatabase
        self.bugIds = bugIds
        self.vectorizerClass = vectorizerClass

        bugTexts = [
            concatenateSummaryAndDescription(bugReportDatabase.getBug(bugId))
            for bugId in bugIds
        ]
        self.logger = logging.getLogger(__name__)

        self.logger.info("Transforming text to vector")
        vectors = self.vectorizerClass.transform(bugTexts)

        if normalize:
            self.logger.info("Normalizing vectors to length 1")
            self.matrixRep = sklearn.preprocessing.normalize(vectors)
        else:
            self.matrixRep = vectors

        self.sparseRepByBugId = {}
        for bugId, representation in zip(self.bugIds, self.matrixRep):
            self.sparseRepByBugId[bugId] = representation.T

        self.similarityIterByBugId = {}

        # Cache the similarity list of the bugs
        self.logger.info("Starting to cache the similarity list")

        def parallel(chunk, queue, index):
            logger = logging.getLogger()
            logger.info(
                "Process %s started to compute the similarity for %d duplicate bugs. Start idx: %d"
                % (os.getpid(), len(chunk), index))

            output = []
            start = time()
            for idx, bugId in enumerate(chunk):
                bugId = str(bugId)
                simList = self.generateSimilarityList(bugId,
                                                      mastersetByBugId[bugId],
                                                      iterator=False)
                output.append((bugId, simList))

                if (idx + 1) % 100 == 0:
                    t = time() - start
                    self.logger.info(
                        "%s computed similarity list for %d of %d in %f seconds"
                        % (os.getpid(), idx + 1, len(chunk), t))

            queue.put(output)

        q = Queue()
        nProcesses = 6
        processes = []
        for idx, chunk in enumerate(createChunks(bugToCache, nProcesses)):
            arr = RawArray(c_ulong, [int(bugId) for bugId in chunk])
            processes.append(
                multiprocessing.Process(target=parallel, args=(arr, q, idx)))

        for p in processes:
            p.start()

        count = 0

        while True:
            try:
                # A timeout makes the Empty handler below reachable if a worker stalls
                for bugId, simList in q.get(timeout=60):
                    self.similarityIterByBugId[bugId] = iter(simList)

                count += 1

                if count == len(processes):
                    break
            except Empty:
                continue
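
    # --- Usage sketch (not part of the original example) ---
    # A minimal sketch, not the author's code: the iterators cached in __init__ can feed
    # negative sampling by yielding, on demand, the next most similar non-duplicate bugs for
    # an anchor. The method name 'nextNegatives' is hypothetical and it assumes each cached
    # element is a (cosine, negBugId) pair, as in generateNegativeListSparseVector above.
    def nextNegatives(self, bugId, n):
        simIter = self.similarityIterByBugId[bugId]
        negatives = []

        for _ in range(n):
            try:
                cosine, negBugId = next(simIter)
            except StopIteration:
                # No more cached candidates for this anchor bug
                break
            negatives.append(negBugId)

        return negatives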