Exemplo n.º 1
0
    def test_parse_info_level1(self):
        """Parse the level-1 test dump and verify what lands in MongoDB.

        Loads the test configuration, runs ``IRText.parse_info_level1`` on the
        sample file, then checks the resulting text collection: its size, the
        summary stored for bug 100000, and that no stored summary still
        matches the HTML-entity regex.
        """
        from ir_log import IRLog
        from ir_text import IRText
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper

        log = IRLog.get_instance()
        config = IRConfig.get_instance()
        assert config is not None

        log.start_log()
        try:
            config.load('../data/test/bug_test.cfg')
            IRText.parse_info_level1('../data/test/info_level1_test')
        finally:
            # Pair start_log() with stop_log() even when parsing raises.
            log.stop_log()

        con = IRMongodbHelper.get_instance().get_connection()
        db = con[config.get('bug_db_name')]
        assert db is not None
        col = db[config.get('bug_text_collection_name')]
        assert col is not None
        # in the test data, we have 1000 in total.
        # within, 40 have no resolution, 154 are incomplete
        # NOTE(review): 1000 - 40 - 154 is 806, not 833; the asserted value is
        # kept as found -- confirm which figure in the comment is stale.
        assert 833 == col.count()
        assert 'gnome is full of bugs ! (100000 currently)' == \
                col.find({'bug_id': 100000})[0]["summ"]

        # No summary may match any of the HTML-entity prefixes.
        res = col.find(
            {"summ": {'$regex': '(&gt)|(&lt)|(&quot)|(&apo)s|(&amp)'}})
        assert res.count() == 0
Exemplo n.º 2
0
 def test_get_stacktrace_text_of_bug(self):
     """Fetch the stack-trace text of bug 104400 and write it to the log.

     The test configuration is loaded first; nothing is asserted, so the
     test only fails if one of the calls raises.
     """
     from ir_log import IRLog
     from ir_config import IRConfig
     from ir_text import IRText
     settings = IRConfig.get_instance()
     settings.load('../data/test/bug_test.cfg')
     text = IRText.get_stacktrace_text_of_bug(104400)
     logger = IRLog.get_instance()
     logger.println('stacktrace_text: %s' % (text))
Exemplo n.º 3
0
 def __is_collection_close(self):
     """Check if the operation is conducted after the collection is closed.

     Does nothing while ``self.__is_closed`` is false. Otherwise the error
     is logged and the operation is aborted.

     Raises:
         AssertionError: the collection has already been closed.
     """
     if self.__is_closed:
         from ir_log import IRLog
         message = 'Error! Cannot write to closed collection.'
         IRLog.get_instance().println(message)
         # Raise explicitly: a bare ``assert False`` is compiled away under
         # ``python -O``, which would let the operation continue silently.
         raise AssertionError(message)
Exemplo n.º 4
0
    def test_filter(self):
        """Run ``IRSTTools.filter`` over every stored description.

        Smoke test: the test data is parsed into MongoDB first, then each
        document's "desc" field is passed through the filter. The results are
        not checked beyond being unpackable into two values.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper
        from ir_gnome_st_tools import IRSTTools
        from ir_text import IRText

        log = IRLog.get_instance()
        config = IRConfig.get_instance()

        log.start_log()
        try:
            config.load('../data/test/bug_test.cfg')
            IRText.parse_info_level1('../data/test/info_level1_test')

            con = IRMongodbHelper.get_instance().get_connection()
            db = con[config.get('bug_db_name')]
            assert db is not None
            col = db[config.get('bug_text_collection_name')]
            assert col is not None
            # Maybe a bug here:
            # The test of filter (originally) depends on parse_info_level1
            # But parse_info_level1 seems to invoke filter...
            for bug in col.find():
                # TODO: it's not correct. no stacktrace in desc
                # The unpacking is kept as an implicit check that filter()
                # returns exactly two values; the values are unused.
                _desc, _stack = IRSTTools.filter(bug["desc"])
        finally:
            # Pair start_log() with stop_log() even when a step above fails.
            log.stop_log()
Exemplo n.º 5
0
 def test_get_stacktrace_text_of_bug(self):
     """Fetch the stack-trace text of bug 104400 and write it to the log.

     The test configuration is loaded first; nothing is asserted, so the
     test only fails if one of the calls raises.
     """
     from ir_log import IRLog
     from ir_config import IRConfig
     from ir_text import IRText
     settings = IRConfig.get_instance()
     settings.load('../data/test/bug_test.cfg')
     text = IRText.get_stacktrace_text_of_bug(104400)
     logger = IRLog.get_instance()
     logger.println('stacktrace_text: %s' % (text))
Exemplo n.º 6
0
 def run(self):
     """Serve queued messages for this session until it expires.

     For every message taken from ``self.__msg_queue``:
       1. set phase: each key found in SET_COMMANDS updates
          ``self.__report`` from the message value;
       2. do phase: each key found in CTL_COMMANDS is executed against the
          report and the response pack; a SIGNAL_BREAK result marks the
          session as expired (remaining commands of the message still run);
       3. the response pack is completed, logged and sent back over the
          message's connection.

     The session is removed from the dispatcher when the loop ends, whether
     it ends normally, by time-out, or because a handler raised.
     """
     from ir_log import IRLog
     try:
         session_state = STATE_ALIVE
         while session_state == STATE_ALIVE:
             try:
                 # NOTE(review): a blocking get() without a timeout waits
                 # forever on a standard Queue, so the Queue.Empty branch
                 # below looks unreachable -- confirm whether a timeout
                 # argument was intended here.
                 msgpack = self.__msg_queue.get(True)
                 conn = msgpack['connection']
                 respack = msgpack['respack']
                 respack[SESSION_ID] = msgpack[SESSION_ID]
                 # set phase
                 for key, value in msgpack.items():
                     if key in SET_COMMANDS:
                         self.__report = SET_COMMANDS[key](self.__report,
                                                           value)
                 # do phase
                 signal = SIGNAL_CONTINUE
                 for key, value in msgpack.items():
                     if key in CTL_COMMANDS:
                         signal = CTL_COMMANDS[key](self.__report, respack)
                         if signal == SIGNAL_BREAK:
                             session_state = STATE_EXPIRED
                 self.__pack_report_info(respack)
                 IRLog.get_instance().println(
                     'Send message: %s' % str(respack))
                 conn.send(str(respack))
             except Queue.Empty:
                 IRLog.get_instance().println(
                     'Session %d time out' % self.__id, 2)
                 break
     finally:
         # Always unregister, otherwise a handler exception would leave a
         # dead session registered in the dispatcher.
         self.__dispatcher.remove_session(self.__id)
Exemplo n.º 7
0
        def func_each_line(line):
            """Store the report described by one info-level-1 line.

            Reports with no resolution or with resolution "INCOMPLETE" are
            skipped, as are reports for which ``cls.is_drop_report`` is true
            (those are logged at level 3). Every other report gets one
            document in ``collection`` (id, summary, description, stacktrace)
            and one in ``collection_basic`` (id, creation time, product).
            """
            (bug_id, summary, raw_description, resolution, create_ts,
             product) = cls.__extract_information_from_info_level1_line(line)

            if resolution is None or resolution == "INCOMPLETE":
                return

            # Split the raw description into plain text and stack trace.
            description, stacktrace = cls.extract_raw_description_info(
                raw_description, community_name)

            if cls.is_drop_report(description):
                from ir_log import IRLog
                IRLog.get_instance().println(
                    'Drop report#=%d because it '
                    'contains unrecognizable stacktrace.' % bug_id, 3)
                return

            text_document = {
                bug_id_name: bug_id,
                summary_name: summary,
                description_name: description,
                stacktrace_name: stacktrace,
            }
            collection.insert(text_document)

            basic_document = {
                bug_id_name: bug_id,
                create_ts_name: create_ts,
                product_name: product,
            }
            collection_basic.insert(basic_document)
Exemplo n.º 8
0
 def run(self):
     """Serve queued messages for this session until it expires.

     For every message taken from ``self.__msg_queue``:
       1. set phase: each key found in SET_COMMANDS updates
          ``self.__report`` from the message value;
       2. do phase: each key found in CTL_COMMANDS is executed against the
          report and the response pack; a SIGNAL_BREAK result marks the
          session as expired (remaining commands of the message still run);
       3. the response pack is completed, logged and sent back over the
          message's connection.

     The session is removed from the dispatcher when the loop ends, whether
     it ends normally, by time-out, or because a handler raised.
     """
     from ir_log import IRLog
     try:
         session_state = STATE_ALIVE
         while session_state == STATE_ALIVE:
             try:
                 # NOTE(review): a blocking get() without a timeout waits
                 # forever on a standard Queue, so the Queue.Empty branch
                 # below looks unreachable -- confirm whether a timeout
                 # argument was intended here.
                 msgpack = self.__msg_queue.get(True)
                 conn = msgpack['connection']
                 respack = msgpack['respack']
                 respack[SESSION_ID] = msgpack[SESSION_ID]
                 # set phase
                 for key, value in msgpack.items():
                     if key in SET_COMMANDS:
                         self.__report = SET_COMMANDS[key](self.__report,
                                                           value)
                 # do phase
                 signal = SIGNAL_CONTINUE
                 for key, value in msgpack.items():
                     if key in CTL_COMMANDS:
                         signal = CTL_COMMANDS[key](self.__report, respack)
                         if signal == SIGNAL_BREAK:
                             session_state = STATE_EXPIRED
                 self.__pack_report_info(respack)
                 IRLog.get_instance().println(
                     'Send message: %s' % str(respack))
                 conn.send(str(respack))
             except Queue.Empty:
                 IRLog.get_instance().println(
                     'Session %d time out' % self.__id, 2)
                 break
     finally:
         # Always unregister, otherwise a handler exception would leave a
         # dead session registered in the dispatcher.
         self.__dispatcher.remove_session(self.__id)
Exemplo n.º 9
0
    def __show_similarity_distribution(self, sorted_similarities):
        """Show the distribution of similarities.

        Logs how many reports score within the top 10% of the score span and
        the score found at the 0/20/40/60/80/100% positions of the list.

        Args:
            sorted_similarities: [(bug_id, (score, ...))]. The first entry is
                read as the maximum score and the last as the minimum, so the
                list is expected to be sorted by score, highest first.
        """
        from ir_log import IRLog
        tot = len(sorted_similarities)
        if tot == 0:
            # Nothing to summarise; the [0] lookups and the division by
            # ``tot`` below would raise on an empty list.
            return
        logger = IRLog.get_instance()
        # Debug output of the best match. The call form with a single
        # argument prints the same text on Python 2 and Python 3.
        print(sorted_similarities[0])
        max_score = sorted_similarities[0][1][0]
        min_score = sorted_similarities[-1][1][0]
        score_span = 0.1
        near_threshold = max_score - (max_score - min_score) * score_span
        # number of near top: count the leading run of scores above the
        # threshold (the list is sorted, so stop at the first miss).
        near_one_number = 0
        for item in sorted_similarities:
            if item[1][0] > near_threshold:
                near_one_number += 1
            else:
                break
        logger.println('%d in %d (%f) reports have score '
                       'greater than %f (%f of the score span)' %
                       (near_one_number, tot, float(near_one_number) / tot,
                        near_threshold, score_span))
        # quantiles
        quantiles = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
        for quan in quantiles:
            # Clamp so that the 100% quantile maps to the last element.
            pos = min(int(quan * tot), tot - 1)
            logger.println('Top %d: %f'
                           % (int(quan * 100), sorted_similarities[pos][1][0]))
Exemplo n.º 10
0
    def __show_similarity_distribution(self, sorted_similarities):
        """Show the distribution of similarities.

        Logs how many reports score within the top 10% of the score span and
        the score found at the 0/20/40/60/80/100% positions of the list.

        Args:
            sorted_similarities: [(bug_id, (score, ...))]. The first entry is
                read as the maximum score and the last as the minimum, so the
                list is expected to be sorted by score, highest first.
        """
        from ir_log import IRLog
        tot = len(sorted_similarities)
        if tot == 0:
            # Nothing to summarise; the [0] lookups and the division by
            # ``tot`` below would raise on an empty list.
            return
        logger = IRLog.get_instance()
        # Debug output of the best match. The call form with a single
        # argument prints the same text on Python 2 and Python 3.
        print(sorted_similarities[0])
        max_score = sorted_similarities[0][1][0]
        min_score = sorted_similarities[-1][1][0]
        score_span = 0.1
        near_threshold = max_score - (max_score - min_score) * score_span
        # number of near top: count the leading run of scores above the
        # threshold (the list is sorted, so stop at the first miss).
        near_one_number = 0
        for item in sorted_similarities:
            if item[1][0] > near_threshold:
                near_one_number += 1
            else:
                break
        logger.println('%d in %d (%f) reports have score '
                       'greater than %f (%f of the score span)' %
                       (near_one_number, tot, float(near_one_number) / tot,
                        near_threshold, score_span))
        # quantiles
        quantiles = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
        for quan in quantiles:
            # Clamp so that the 100% quantile maps to the last element.
            pos = min(int(quan * tot), tot - 1)
            logger.println('Top %d: %f'
                           % (int(quan * 100), sorted_similarities[pos][1][0]))
Exemplo n.º 11
0
    def test_filter(self):
        """Run ``IRSTTools.filter`` over every stored description.

        Smoke test: the test data is parsed into MongoDB first, then each
        document's "desc" field is passed through the filter. The results are
        not checked beyond being unpackable into two values.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper
        from ir_gnome_st_tools import IRSTTools
        from ir_text import IRText

        log = IRLog.get_instance()
        config = IRConfig.get_instance()

        log.start_log()
        try:
            config.load('../data/test/bug_test.cfg')
            IRText.parse_info_level1('../data/test/info_level1_test')

            con = IRMongodbHelper.get_instance().get_connection()
            db = con[config.get('bug_db_name')]
            assert db is not None
            col = db[config.get('bug_text_collection_name')]
            assert col is not None
            # Maybe a bug here:
            # The test of filter (originally) depends on parse_info_level1
            # But parse_info_level1 seems to invoke filter...
            for bug in col.find():
                # TODO: it's not correct. no stacktrace in desc
                # The unpacking is kept as an implicit check that filter()
                # returns exactly two values; the values are unused.
                _desc, _stack = IRSTTools.filter(bug["desc"])
        finally:
            # Pair start_log() with stop_log() even when a step above fails.
            log.stop_log()
Exemplo n.º 12
0
    def similarity_over_all(self):
        """Calculate similarity between bug (summary, description) over
         all.

        Candidate reports are read from the basic collection: same product as
        this report, creation time greater than this report's creation time
        minus the search span, and id not in ``self.__exclude_report_ids``.
        The report carrying the dummy bug id is skipped.

        Returns:
            dict, {bug_id -> [score, summary_score, description_score, stacktrace_score]}
            (each value is whatever ``similarity_with`` returns).
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_text import IRText
        from ir_tfidf import IRTFIDF

        logger = IRLog.get_instance()
        config = IRConfig.get_instance()
        # Two 365-day years; presumably in seconds, matching the unit of the
        # stored creation timestamps -- TODO confirm.
        search_time_span = 2 * 3600 * 24 * 365

        bug_id_name = config.get('bug_id_name')
        create_ts_name = config.get('bug_create_ts_name')
        product_name = config.get('bug_product_name')

        basic_collection = IRCollection('bug_db_name',
                                        'bug_basic_collection_name', 'r')

        reports2scan = basic_collection.find({
            product_name: self.get_product(),
            create_ts_name: {
                '$gt': self.get_create_ts() - search_time_span
            },
            bug_id_name: {
                '$nin': self.__exclude_report_ids
            }
        })
        result = {}
        logger.println('Comparing with %d reports.'
                       % (reports2scan.count()))

        # Debug output. The call form with a single argument prints the same
        # text on Python 2 and Python 3.
        print(self.__summary_text)
        print(self.__description_text)

        # Stack traces of the other reports are only needed when this report
        # has one itself; decide once instead of on every iteration.
        own_stacktrace = self.get_stacktrace()
        compare_stacktrace = (own_stacktrace is not None
                              and len(own_stacktrace) > 0)

        for report in reports2scan:
            bug_id = report[bug_id_name]
            if bug_id == self.get_dummy_bug_id():
                continue
            # because we don't want to load stacktrace in case of self.__stacktrace
            #    being none, we create and fill the info of report manually
            other_report = IRReport("", "")
            other_report.__summary_tfidf, other_report.__description_tfidf = \
                    IRTFIDF.get_tfidf_of_bug(bug_id)
            if compare_stacktrace:
                other_report.__stacktrace = IRText.get_stacktrace_of_bug(
                    bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
            result[bug_id] = self.similarity_with(other_report)

        return result
Exemplo n.º 13
0
 def __del__(self):
     """Complain when a writable collection is destroyed while still open.

     Applies to collections opened in mode 'w' or 'a' whose
     ``__is_closed`` flag is not set: the error is logged and an
     AssertionError is raised.
     """
     if self.__mode in ('w', 'a') and not self.__is_closed:
         from ir_log import IRLog
         IRLog.get_instance().println('Error! Collection in modifying mode '
                                      'is destoried before being closed.')
         # Raise explicitly: a bare ``assert False`` is compiled away under
         # ``python -O``. Note that Python reports exceptions raised in
         # __del__ on stderr instead of propagating them, so the log line
         # above is the reliable signal.
         raise AssertionError(
             'Collection in modifying mode destroyed before being closed.')
Exemplo n.º 14
0
    def test_parse_info_level1(self):
        """Parse the level-1 test dump and verify what lands in MongoDB.

        Loads the test configuration, runs ``IRText.parse_info_level1`` on the
        sample file, then checks the resulting text collection: its size, the
        summary stored for bug 100000, and that no stored summary still
        matches the HTML-entity regex.
        """
        from ir_log import IRLog
        from ir_text import IRText
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper

        log = IRLog.get_instance()
        config = IRConfig.get_instance()
        assert config is not None

        log.start_log()
        try:
            config.load('../data/test/bug_test.cfg')
            IRText.parse_info_level1('../data/test/info_level1_test')
        finally:
            # Pair start_log() with stop_log() even when parsing raises.
            log.stop_log()

        con = IRMongodbHelper.get_instance().get_connection()
        db = con[config.get('bug_db_name')]
        assert db is not None
        col = db[config.get('bug_text_collection_name')]
        assert col is not None
        # in the test data, we have 1000 in total.
        # within, 40 have no resolution, 154 are incomplete
        # NOTE(review): 1000 - 40 - 154 is 806, not 833; the asserted value is
        # kept as found -- confirm which figure in the comment is stale.
        assert 833 == col.count()
        assert 'gnome is full of bugs ! (100000 currently)' == \
                col.find({'bug_id': 100000})[0]["summ"]

        # No summary may match any of the HTML-entity prefixes.
        res = col.find(
            {"summ": {'$regex': '(&gt)|(&lt)|(&quot)|(&apo)s|(&amp)'}})
        assert res.count() == 0
Exemplo n.º 15
0
    def show_dict_compare(cls, dicta, dictb, log_level = 1):
        """Compare and print two BoW.

        One line is logged per term that occurs in either dict: the term,
        its count in dicta and its count in dictb (0 when absent). Lines are
        ordered by the count both dicts have in common (the smaller of the
        two counts), largest first.

        Args:
            dicta: dict, term -> count (may be None)
            dictb: dict, term -> count (may be None)
            log_level: int, passed through to the logger
        """

        from ir_log import IRLog
        keys = set()
        for bag in (dicta, dictb):
            if bag is not None:
                for key in bag:
                    keys.add(key)
        # sort by common num
        common_num = []
        for key in keys:
            counta = dicta.get(key, 0) if dicta is not None else 0
            countb = dictb.get(key, 0) if dictb is not None else 0
            common_num.append((key, min(counta, countb), counta, countb))
        # A key function works on Python 2 and 3 alike (``cmp=`` was removed
        # in Python 3) and keeps the same stable, descending order.
        common_num.sort(key=lambda entry: entry[1], reverse=True)
        # print it out
        logger = IRLog.get_instance()
        for term, _common, counta, countb in common_num:
            logger.println('%16s\t%8d\t%8d' % (term, counta, countb),
                           log_level)
Exemplo n.º 16
0
    def test_cache_all_data(self):
        """Cache all document-count data using the test configuration.

        Smoke test: it passes as long as ``IRDocumentCount.cache_all_data``
        does not raise.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_document_count import IRDocumentCount

        log = IRLog.get_instance()
        log.start_log()
        try:
            IRConfig.get_instance().load('../data/test/bug_test.cfg')
            IRDocumentCount.cache_all_data()
        finally:
            # The log was started above but never stopped; pair the two
            # calls, even when caching fails.
            log.stop_log()
Exemplo n.º 17
0
 def __is_modification_legal_in_current_mode(self):
     """Check if current mode supports modifying operation.

     First runs the closed-collection check, then rejects collections
     opened in read mode ('r'). Returns nothing when the write is legal.

     Raises:
         AssertionError: the collection was opened in read mode (the
             closed-collection check may raise as well).
     """
     self.__is_collection_close()
     if self.__mode == 'r':
         from ir_log import IRLog
         message = ('Error! Cannot write to collection being opened in '
                    'read mode.')
         IRLog.get_instance().println(message)
         # Raise explicitly: a bare ``assert False`` is compiled away under
         # ``python -O``, which would let the write go through.
         raise AssertionError(message)
Exemplo n.º 18
0
    def similarities_and_duplicates(self):
        """Calculate the similarities over all existing reports and return
        the similar reports and duplicate reports.

        Returns:
            (similar, duplicates): two lists of ``(bug_id, scores)`` items
            taken from ``similarity_over_all()``, ordered by ``scores[0]``,
            highest first. Both lists are empty when there is nothing to
            compare with or when the best score is below
            'bug_no_similar_threshold'. ``duplicates`` is a prefix of
            ``similar`` (at most 'bug_duplicate_number' items) and is only
            filled when the similar reports span no more than
            'bug_duplicate_threshold' duplicate groups.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_duplicate_group import IRDuplicateGroup

        config = IRConfig.get_instance()
        logger = IRLog.get_instance()
        duplicate_num = config.get_int('bug_duplicate_number', 5)
        duplicate_threshold = config.get_int('bug_duplicate_threshold', 10)
        max_similar_number = config.get_int('bug_similar_max', 10000000)
        similar_threshold_percent = config.get_float(
            'bug_similar_threshold_percent', 0.8)
        no_similar_threshold = config.get_float(
            'bug_no_similar_threshold', 0.65)

        # sorted() works on both the list returned by items() on Python 2
        # and the view returned on Python 3.
        similarities = sorted(self.similarity_over_all().items(),
                              key=lambda item: item[1][0], reverse=True)
        if not similarities:
            return [], []
        # report scoring
        logger.println('Max score report: %s' % str(similarities[0]))
        if len(similarities) > 1:
            logger.println('Second score report: %s' % str(similarities[1]))

        # find cutting edge of similar reports
        max_score = similarities[0][1][0]
        min_score = similarities[-1][1][0]
        logger.println('max score:%f, min score: %f' % (max_score, min_score))
        logger.println('no threshold:%f' % no_similar_threshold)
        if max_score < no_similar_threshold:
            return [], []

        similar_threshold_percent_cut = min_score + \
            (max_score - min_score) * similar_threshold_percent
        # Debug output; formatted so it prints identically on Python 2 and 3.
        print('cut: %s' % similar_threshold_percent_cut)

        # NOTE(review): __binary_search_less presumably returns the index of
        # the first item scoring below the cut -- confirm against the helper.
        cut_position = min(
            max_similar_number,
            self.__binary_search_less(similarities, lambda x: x[1][0],
                                      similar_threshold_percent_cut))
        logger.println('Get %d similar reports.' % cut_position)
        similar_reports = similarities[:cut_position]
        # find number of duplicate groups in similar reports
        group_set = set()
        for report in similar_reports:
            group_set.add(IRDuplicateGroup.get_group_of_bug(report[0]))
        # Reports that belong to no group do not count as a group.
        group_set.discard(None)
        duplicate_reports = []
        if len(group_set) <= duplicate_threshold:
            duplicate_reports = similarities[:min(cut_position, duplicate_num)]
        return similar_reports, duplicate_reports
Exemplo n.º 19
0
    def test_get_summary_and_description_of_bug(self):
        """Fetch summary and description of bug 100000 and log both.

        The test configuration is loaded first; nothing is asserted beyond
        the result being unpackable into two values.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_text import IRText

        settings = IRConfig.get_instance()
        settings.load('../data/test/bug_test.cfg')
        summary, description = IRText.get_summary_and_description_of_bug(
            100000)
        entries = (('summary: %s', summary),
                   ('description: %s', description))
        for template, value in entries:
            IRLog.get_instance().println(template % (value))
Exemplo n.º 20
0
    def test_get_squared_length(self):
        """Check ``IRTFIDF.get_squared_length`` on a two-term vector.

        For weights 0.4 and 0.6 the expected value is
        0.4**2 + 0.6**2 == 0.52, compared with a small tolerance.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        log = IRLog.get_instance()
        log.start_log()
        try:
            IRConfig.get_instance().load('../data/test/bug_test.cfg')
            summary = {'firefox': 0.4, 'chrome': 0.6}
            assert abs(IRTFIDF.get_squared_length(summary) - 0.52) < 0.00001
        finally:
            # The log was started above but never stopped; pair the two
            # calls, even when the assertion fails.
            log.stop_log()
Exemplo n.º 21
0
    def test_cache_all_data(self):
        """Cache all TF-IDF data using the test configuration.

        Logging is switched on for the duration of the call; the test
        passes as long as nothing raises.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        current_log = IRLog.get_instance
        current_log().start_log()
        settings = IRConfig.get_instance()
        settings.load('../data/test/bug_test.cfg')
        IRTFIDF.cache_all_data()
        current_log().stop_log()
Exemplo n.º 22
0
    def test_generate_document_count(self):
        """Generate the document counts in batch using the test configuration.

        Logging is switched on for the duration of the call; the test
        passes as long as nothing raises.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_document_count import IRDocumentCount

        current_log = IRLog.get_instance
        current_log().start_log()
        settings = IRConfig.get_instance()
        settings.load('../data/test/bug_test.cfg')
        IRDocumentCount.batch_generate_document_count()
        current_log().stop_log()
Exemplo n.º 23
0
    def print_similarity_score(cls, report_a, report_b):
        """Log the similarity between two reports, in total and per part.

        Warning: report_a is primary -- the scores come from
        ``report_a.similarity_with(report_b)``, and the order is critical
        for an asymmetric algorithm.
        """
        from ir_log import IRLog

        scores = report_a.similarity_with(report_b)
        total, summary, description, stacktrace = scores
        template = ('[Similarity] %f '
                    '=[Summary]%f[Description]%f[Stacktrace]%f')
        logger = IRLog.get_instance()
        logger.println(template % (total, summary, description, stacktrace))
Exemplo n.º 24
0
    def test_get_summary_and_description_of_bug(self):
        """Fetch summary and description of bug 100000 and log both.

        The test configuration is loaded first; nothing is asserted beyond
        the result being unpackable into two values.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_text import IRText

        settings = IRConfig.get_instance()
        settings.load('../data/test/bug_test.cfg')
        summary, description = IRText.get_summary_and_description_of_bug(
            100000)
        entries = (('summary: %s', summary),
                   ('description: %s', description))
        for template, value in entries:
            IRLog.get_instance().println(template % (value))
Exemplo n.º 25
0
    def print_similarity_score(cls, report_a, report_b):
        """Log the similarity between two reports, in total and per part.

        Warning: report_a is primary -- the scores come from
        ``report_a.similarity_with(report_b)``, and the order is critical
        for an asymmetric algorithm.
        """
        from ir_log import IRLog

        scores = report_a.similarity_with(report_b)
        total, summary, description, stacktrace = scores
        template = ('[Similarity] %f '
                    '=[Summary]%f[Description]%f[Stacktrace]%f')
        logger = IRLog.get_instance()
        logger.println(template % (total, summary, description, stacktrace))
Exemplo n.º 26
0
 def get_int(self, name, default_value = None):
     """Get the int value with the given name.

     Args:
         name: key handed to ``self.get``.
         default_value: handed to ``self.get`` as its default, and returned
             unchanged when the looked-up value cannot be converted.

     Returns:
         ``int(value)`` on success, otherwise ``default_value`` (after
         logging the value that could not be converted).
     """
     res = self.get(name, default_value)
     try:
         return int(res)
     except (ValueError, TypeError):
         # ValueError: non-numeric string; TypeError: a value int() does
         # not accept at all, e.g. None.
         from ir_log import IRLog
         # %s, not %d: whatever reaches this branch is by definition not a
         # number, and '%d' raises TypeError for a string.
         IRLog.get_instance().println('Could not convert %s to int.'
                                      % (res,))
         return default_value
Exemplo n.º 27
0
    def get_term_by_simple_entropy(cls, diff, sim_bug_ids, penalty_terms=None):
        """Get the best term which has most entropy in diff.

        Every term in the second set of a diff entry is credited with the
        similarity score of the report at the same index in sim_bug_ids. A
        term's ratio is its credited weight divided by the total weight, and
        its score is ``(1 - 2 * |ratio - 0.5|) * idf``: terms carried by
        about half of the total weight score highest. The ten best
        candidates are logged.

        Args:
            diff: [(set, set)], generated by get_all_reports_difference
            sim_bug_ids: sequence indexed like diff; ``entry[1][0]`` is read
                as the similarity score of that report
            penalty_terms: optional container of terms to ignore

        Returns:
            str, The term; None when no term can be scored (no candidate
            terms, or a total weight of zero).
        """
        from ir_config import IRConfig
        from ir_document_count import IRDocumentCount
        from ir_tfidf import IRTFIDF
        from ir_log import IRLog

        # count the occurrence of term, weighted by report similarity
        termcount = {}
        total_score = 0.0
        for index, delta in enumerate(diff):
            weight = sim_bug_ids[index][1][0]
            total_score += weight
            for term in delta[1]:
                if penalty_terms is not None and term in penalty_terms:
                    continue
                termcount[term] = termcount.get(term, 0.0) + weight
        if total_score == 0:
            # Empty diff or all-zero weights: the ratio below would divide
            # by zero, and there is nothing meaningful to rank.
            return None

        # calculate the value and pick the most
        description_name = IRConfig.get_instance().get('bug_description_name')
        max_score = -1.0
        max_score_term = None
        # scoreboard is only used for the debug log below
        scoreboard = []
        for term in termcount:
            ratio = termcount[term] / total_score
            ig_score = -2.0 * abs(ratio - 0.5) + 1
            idf = IRTFIDF.get_unit_idf(
                IRDocumentCount.get_documentcount(term, description_name))
            score = ig_score * idf
            scoreboard.append((term, score, ig_score, idf))
            if score > max_score:
                max_score = score
                max_score_term = term
        # A key function works on Python 2 and 3 alike (``cmp=`` was removed
        # in Python 3) and keeps the same stable, descending order.
        scoreboard.sort(key=lambda entry: entry[1], reverse=True)
        logger = IRLog.get_instance()
        logger.println(
            'Candidate keywords: %s' %
            ','.join(['word', 'score', 'ig_score', 'idf']))
        logger.println('\n'.join(
            ','.join([t[0], str(t[1]), str(t[2]), str(t[3])])
            for t in scoreboard[:10]))
        return max_score_term
Exemplo n.º 28
0
def server_cache(msg, res):
    """Cache text, TF-IDF and document-count data, in that order.

    A log line is written before and after caching. Neither ``msg`` nor
    ``res`` is used.

    Returns:
        SIGNAL_CONTINUE
    """
    from ir_log import IRLog
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    IRLog.get_instance().println('Server is caching data')
    for data_source in (IRText, IRTFIDF, IRDocumentCount):
        data_source.cache_all_data()
    IRLog.get_instance().println('Server cached data')
    return SIGNAL_CONTINUE
Exemplo n.º 29
0
def server_cache(msg, res):
    """Cache text, TF-IDF and document-count data, in that order.

    A log line is written before and after caching. Neither ``msg`` nor
    ``res`` is used.

    Returns:
        SIGNAL_CONTINUE
    """
    from ir_log import IRLog
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    IRLog.get_instance().println('Server is caching data')
    for data_source in (IRText, IRTFIDF, IRDocumentCount):
        data_source.cache_all_data()
    IRLog.get_instance().println('Server cached data')
    return SIGNAL_CONTINUE
Exemplo n.º 30
0
    def test_batch_generate_tfidf(self):
        """Generate the TF-IDF data in batch using the test configuration.

        Logging is switched on for the duration of the call; the test
        passes as long as nothing raises.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        current_log = IRLog.get_instance
        current_log().start_log()
        settings = IRConfig.get_instance()
        settings.load('../data/test/bug_test.cfg')
        IRTFIDF.batch_generate_tfidf()
        current_log().stop_log()
Exemplo n.º 31
0
    def similarity_over_all(self):
        """Calculate similarity between bug (summary, description) over
         all.

        Candidate reports are read from the basic collection: same product as
        this report, creation time greater than this report's creation time
        minus the search span, and id not in ``self.__exclude_report_ids``.
        The report carrying the dummy bug id is skipped.

        Returns:
            dict, {bug_id -> [score, summary_score, description_score, stacktrace_score]}
            (each value is whatever ``similarity_with`` returns).
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_text import IRText
        from ir_tfidf import IRTFIDF

        logger = IRLog.get_instance()
        config = IRConfig.get_instance()
        # Two 365-day years; presumably in seconds, matching the unit of the
        # stored creation timestamps -- TODO confirm.
        search_time_span = 2 * 3600 * 24 * 365

        bug_id_name = config.get('bug_id_name')
        create_ts_name = config.get('bug_create_ts_name')
        product_name = config.get('bug_product_name')

        basic_collection = IRCollection('bug_db_name',
                                        'bug_basic_collection_name', 'r')

        reports2scan = basic_collection.find({
            product_name: self.get_product(),
            create_ts_name: {
                '$gt': self.get_create_ts() - search_time_span
            },
            bug_id_name: {
                '$nin': self.__exclude_report_ids
            }
        })
        result = {}
        logger.println('Comparing with %d reports.'
                       % (reports2scan.count()))

        # Debug output. The call form with a single argument prints the same
        # text on Python 2 and Python 3.
        print(self.__summary_text)
        print(self.__description_text)

        # Stack traces of the other reports are only needed when this report
        # has one itself; decide once instead of on every iteration.
        own_stacktrace = self.get_stacktrace()
        compare_stacktrace = (own_stacktrace is not None
                              and len(own_stacktrace) > 0)

        for report in reports2scan:
            bug_id = report[bug_id_name]
            if bug_id == self.get_dummy_bug_id():
                continue
            # because we don't want to load stacktrace in case of self.__stacktrace
            #    being none, we create and fill the info of report manually
            other_report = IRReport("", "")
            other_report.__summary_tfidf, other_report.__description_tfidf = \
                    IRTFIDF.get_tfidf_of_bug(bug_id)
            if compare_stacktrace:
                other_report.__stacktrace = IRText.get_stacktrace_of_bug(
                    bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
            result[bug_id] = self.similarity_with(other_report)

        return result
Exemplo n.º 32
0
    def get_term_by_simple_entropy(cls, diff, sim_bug_ids, penalty_terms=None):
        """Get the best term which has most entropy in diff.

        Every term in the second set of a diff entry is credited with the
        similarity score of the report at the same index in sim_bug_ids. A
        term's ratio is its credited weight divided by the total weight, and
        its score is ``(1 - 2 * |ratio - 0.5|) * idf``: terms carried by
        about half of the total weight score highest. The ten best
        candidates are logged.

        Args:
            diff: [(set, set)], generated by get_all_reports_difference
            sim_bug_ids: sequence indexed like diff; ``entry[1][0]`` is read
                as the similarity score of that report
            penalty_terms: optional container of terms to ignore

        Returns:
            str, The term; None when no term can be scored (no candidate
            terms, or a total weight of zero).
        """
        from ir_config import IRConfig
        from ir_document_count import IRDocumentCount
        from ir_tfidf import IRTFIDF
        from ir_log import IRLog

        # count the occurrence of term, weighted by report similarity
        termcount = {}
        total_score = 0.0
        for index, delta in enumerate(diff):
            weight = sim_bug_ids[index][1][0]
            total_score += weight
            for term in delta[1]:
                if penalty_terms is not None and term in penalty_terms:
                    continue
                termcount[term] = termcount.get(term, 0.0) + weight
        if total_score == 0:
            # Empty diff or all-zero weights: the ratio below would divide
            # by zero, and there is nothing meaningful to rank.
            return None

        # calculate the value and pick the most
        description_name = IRConfig.get_instance().get('bug_description_name')
        max_score = -1.0
        max_score_term = None
        # scoreboard is only used for the debug log below
        scoreboard = []
        for term in termcount:
            ratio = termcount[term] / total_score
            ig_score = -2.0 * abs(ratio - 0.5) + 1
            idf = IRTFIDF.get_unit_idf(
                IRDocumentCount.get_documentcount(term, description_name))
            score = ig_score * idf
            scoreboard.append((term, score, ig_score, idf))
            if score > max_score:
                max_score = score
                max_score_term = term
        # A key function works on Python 2 and 3 alike (``cmp=`` was removed
        # in Python 3) and keeps the same stable, descending order.
        scoreboard.sort(key=lambda entry: entry[1], reverse=True)
        logger = IRLog.get_instance()
        logger.println(
            'Candidate keywords: %s' %
            ','.join(['word', 'score', 'ig_score', 'idf']))
        logger.println('\n'.join(
            ','.join([t[0], str(t[1]), str(t[2]), str(t[3])])
            for t in scoreboard[:10]))
        return max_score_term
Exemplo n.º 33
0
    def test_get_termcount_of_bug(self):
        """Fetch the term counts of bug 100000 and log them.

        Asserts that both the summary and the description term counts exist,
        then prints each one through ``IRTermCount.show_dict_compare``
        against an empty dict.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary, description = IRTermCount.get_termcount_of_bug(100000)
        # Identity checks: ``None != x`` would go through the operand's own
        # comparison method.
        assert summary is not None
        assert description is not None
        logger = IRLog.get_instance()
        for title, termcount in (('Summary', summary),
                                 ('Description', description)):
            logger.println(title)
            IRTermCount.show_dict_compare(termcount, {})
Exemplo n.º 34
0
    def test_get_termcount_of_bug(self):
        """Fetch the term counts of bug 100000 and log them.

        Asserts that both the summary and the description term counts exist,
        then prints each one through ``IRTermCount.show_dict_compare``
        against an empty dict.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary, description = IRTermCount.get_termcount_of_bug(100000)
        # Identity checks: ``None != x`` would go through the operand's own
        # comparison method.
        assert summary is not None
        assert description is not None
        logger = IRLog.get_instance()
        for title, termcount in (('Summary', summary),
                                 ('Description', description)):
            logger.println(title)
            IRTermCount.show_dict_compare(termcount, {})
Exemplo n.º 35
0
    def test_get_documentcount(self):
        """Check that both call forms of ``get_documentcount`` agree.

        Called with only a term, the result is unpacked as
        (summary count, description count). Each of the two must equal the
        result of the call that also names the corresponding field.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_document_count import IRDocumentCount

        log = IRLog.get_instance()
        log.start_log()
        try:
            # NOTE(review): no configuration file is loaded in this test;
            # presumably it relies on one loaded earlier in the run --
            # confirm.
            config = IRConfig.get_instance()
            summary, description = IRDocumentCount.get_documentcount('click')
            log.println('\'click\', Document Count of summary: %d, '
                        'description: %d' % (summary, description))

            assert summary == IRDocumentCount.get_documentcount(
                'click', config.get('bug_summary_name'))
            assert description == IRDocumentCount.get_documentcount(
                'click', config.get('bug_description_name'))
        finally:
            # The log was started above but never stopped; pair the two
            # calls, even when an assertion fails.
            log.stop_log()
Exemplo n.º 36
0
 def compare_and_print_termcount(cls, title_a, report_a, title_b, report_b):
     """Log the term counts of two reports side by side.

     Args:
         title_a: label printed for report_a
         report_a: object providing get_summary_and_description_termcount()
         title_b: label printed for report_b
         report_b: object providing get_summary_and_description_termcount()
     """
     from ir_log import IRLog
     from ir_term_count import IRTermCount

     summary_a, description_a = (
         report_a.get_summary_and_description_termcount())
     summary_b, description_b = (
         report_b.get_summary_and_description_termcount())
     titles = (title_a, title_b)
     log = IRLog.get_instance()
     # summary term counts first, then the description ones
     log.println('[Termcount][Summary][%s][%s]' % titles)
     IRTermCount.show_dict_compare(summary_a, summary_b)
     log.println('[Termcount][Description][%s][%s]' % titles)
     IRTermCount.show_dict_compare(description_a, description_b)
Exemplo n.º 37
0
    def __report_result(self, bug_id, hit, nothit, duplicates):
        """
        Print the evaluation result of one bug and return its scores.

        bug_id: id of the evaluated bug (formatted with %d)
        hit: actual duplicates found by algorithm
        nothit: actual non-duplicates, but are detected as duplicate by
            algorithm
        duplicates: the actual duplicates of the bug
        Return: precision, recall (each is 0.0 when its denominator is 0)
        """
        from ir_log import IRLog
        hit_count = len(hit)
        nothit_count = len(nothit)
        duplicate_count = len(duplicates)
        # everything the algorithm reported, right or wrong
        detected_count = hit_count + nothit_count
        if detected_count == 0:
            precision = 0.0
        else:
            precision = float(hit_count) / detected_count
        if duplicate_count == 0:
            recall = 0.0
        else:
            recall = float(hit_count) / duplicate_count
        log = IRLog.get_instance()
        log.println('Bug %d, precision %f, recall %f, ' \
                'duplicate size %d' \
                % (bug_id, precision, recall, duplicate_count), 2)
        # The loop variable is deliberately not called bug_id: under
        # Python 2 a list comprehension rebinds names of the enclosing
        # function, which would clobber the parameter.
        log.println('Hit %d duplicates: %s' \
                % (hit_count, ','.join([str(item) for item in hit])), 1)
        log.println('Hit %d nonduplicates: %s' \
                % (nothit_count, ','.join([str(item) for item in nothit])), 1)
        log.println('Actual %d duplicates: %s' \
                % (duplicate_count,
                   ','.join([str(item) for item in duplicates])), 1)
        return precision, recall
Exemplo n.º 38
0
 def compare_and_print_termcount(cls, title_a, report_a,
                                 title_b, report_b):
     """Log the term counts of two reports side by side.

     Args:
         title_a: label printed for report_a
         report_a: object providing get_summary_and_description_termcount()
         title_b: label printed for report_b
         report_b: object providing get_summary_and_description_termcount()
     """
     from ir_log import IRLog
     from ir_term_count import IRTermCount

     summary_a, description_a = (
         report_a.get_summary_and_description_termcount())
     summary_b, description_b = (
         report_b.get_summary_and_description_termcount())
     titles = (title_a, title_b)
     log = IRLog.get_instance()
     # summary term counts first, then the description ones
     log.println('[Termcount][Summary][%s][%s]' % titles)
     IRTermCount.show_dict_compare(summary_a, summary_b)
     log.println('[Termcount][Description][%s][%s]' % titles)
     IRTermCount.show_dict_compare(description_a, description_b)
Exemplo n.º 39
0
    def __report_result(self, bug_id, hit, nothit, duplicates):
        """
        Print the evaluation result of one bug and return its scores.

        bug_id: id of the evaluated bug (formatted with %d)
        hit: actual duplicates found by algorithm
        nothit: actual non-duplicates, but are detected as duplicate by
            algorithm
        duplicates: the actual duplicates of the bug
        Return: precision, recall (each is 0.0 when its denominator is 0)
        """
        from ir_log import IRLog
        hit_count = len(hit)
        nothit_count = len(nothit)
        duplicate_count = len(duplicates)
        # everything the algorithm reported, right or wrong
        detected_count = hit_count + nothit_count
        if detected_count == 0:
            precision = 0.0
        else:
            precision = float(hit_count) / detected_count
        if duplicate_count == 0:
            recall = 0.0
        else:
            recall = float(hit_count) / duplicate_count
        log = IRLog.get_instance()
        log.println('Bug %d, precision %f, recall %f, ' \
                'duplicate size %d' \
                % (bug_id, precision, recall, duplicate_count), 2)
        # The loop variable is deliberately not called bug_id: under
        # Python 2 a list comprehension rebinds names of the enclosing
        # function, which would clobber the parameter.
        log.println('Hit %d duplicates: %s' \
                % (hit_count, ','.join([str(item) for item in hit])), 1)
        log.println('Hit %d nonduplicates: %s' \
                % (nothit_count, ','.join([str(item) for item in nothit])), 1)
        log.println('Actual %d duplicates: %s' \
                % (duplicate_count,
                   ','.join([str(item) for item in duplicates])), 1)
        return precision, recall
Exemplo n.º 40
0
    def __generate_sample_over_a_list(self, infile, group_ids, sample_num, drop_rate):
        """
        Sample bugs from the given groups and write one generated report
        per sampled bug to infile, each followed by a newline.

        infile: object with a write() method that receives the reports
        group_ids: a list of group_ids
        sample_num: the number of bugs being sampled
        drop_rate: the probability of chance to drop a word (handed on to
            the single-bug generator)
        """
        from ir_log import IRLog
        for sampled_id in self.__get_sample_bugs_within_groups(group_ids,
                                                               sample_num):
            generated = self.__generate_single_bug(sampled_id, drop_rate)
            # log the id of every bug that has been processed
            IRLog.get_instance().println('%d' % sampled_id)
            infile.write('%s\n' % (generated.to_string()))
Exemplo n.º 41
0
    def test_similarities_and_duplicates(self):
        """Log the similar and the duplicate reports found for bug 100000.

        Only the first element of every returned item is printed.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        log = IRLog.get_instance()
        log.start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        similarities, duplicates = \
            IRReport(100000).similarities_and_duplicates()
        log.println('Report %d' % (100000))
        similar_ids = ','.join([str(entry[0]) for entry in similarities])
        log.println('%d Similar Reports: %s'
                    % (len(similarities), similar_ids))
        duplicate_ids = ','.join([str(entry[0]) for entry in duplicates])
        log.println('%d Duplicate Reports: %s'
                    % (len(duplicates), duplicate_ids))
        log.stop_log()
Exemplo n.º 42
0
    def test_stemming(self):
        """Log how each stemmer setting stems a handful of sample words.

        For every word the 'stemmer' configuration entry is switched to each
        of the stemmer names in turn and the first token returned by
        IRTermCount.do_stemming is recorded. Nothing is asserted; the
        results are only printed.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        words = ['discrimination', 'disgusting', 'visualization', 'configuration']
        stemmer_names = ['porter', 'lancaster', 'snowball']
        # NOTE: the 'stemmer' entry is left at the last name tried.
        for word in words:
            stemmed = []
            for name in stemmer_names:
                IRConfig.get_instance().set('stemmer', name)
                tokens = IRTermCount.do_stemming([word])
                stemmed.append(':'.join([name, tokens[0]]))
            IRLog.get_instance().println('%s > %s' % (word, ', '.join(stemmed)))
Exemplo n.º 43
0
    def test_create_new_report(self):
        """Create an IRReport from raw texts and exercise its accessors.

        The summary and description texts must be returned unchanged; the
        tfidf and squared-length getters are only called, their results are
        not checked.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        log = IRLog.get_instance()
        log.start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary_text = 'Firefox crashed'
        description_text = ('When I was openning history folder, the f**king'
                            ' Firefox just crashed!')
        new_report = IRReport(summary_text, description_text)
        assert summary_text == new_report.get_summary_text()
        assert description_text == new_report.get_description_text()
        new_report.get_summary_and_description_tfidf()
        new_report.get_summary_and_description_tfidf_squared_length()
        log.stop_log()
Exemplo n.º 44
0
 def compare_and_print_tfidf(cls, title_a, report_a,
                             title_b, report_b):
     """Log the tfidf of two reports side by side.

     The field names handed to IRTFIDF.show_dict_compare are read from the
     'bug_summary_name' and 'bug_description_name' configuration entries.

     Args:
         title_a: label printed for report_a
         report_a: object providing get_summary_and_description_tfidf()
         title_b: label printed for report_b
         report_b: object providing get_summary_and_description_tfidf()
     """
     from ir_log import IRLog
     from ir_config import IRConfig
     from ir_tfidf import IRTFIDF

     config = IRConfig.get_instance()
     summary_field = config.get('bug_summary_name')
     description_field = config.get('bug_description_name')
     summary_a, description_a = report_a.get_summary_and_description_tfidf()
     summary_b, description_b = report_b.get_summary_and_description_tfidf()
     titles = (title_a, title_b)
     log = IRLog.get_instance()
     log.println('[TFIDF][Summary][%s][%s]' % titles)
     IRTFIDF.show_dict_compare(summary_a, summary_b, summary_field)
     log.println('[TFIDF][Description][%s][%s]' % titles)
     IRTFIDF.show_dict_compare(description_a, description_b,
                               description_field)
Exemplo n.º 45
0
    def test_progress_bar(self):
        """Drive two IRProgressBar instances through the values 0..1000.

        One bar is created per title, each with its own trailing constructor
        arguments; every bar must be created successfully and is then fed
        each value in order.
        """
        from ir_log import IRLog
        from ir_log import IRProgressBar

        IRLog.get_instance().start_log(True)
        # (title, remaining constructor arguments) of each bar to exercise
        runs = (
            ('ProgressBar Output Not Verbose', (False, 0, 1)),
            ('ProgressBar Output Verbose', (True, 1, 0)),
        )
        for title, extra_args in runs:
            bar = IRProgressBar(1000, title, *extra_args)
            assert bar is not None
            for value in range(1001):
                bar.set_value(value)
        # NOTE(review): the sibling tests finish with stop_log(); this one
        # calls start_log() again -- possibly a typo, confirm the intent.
        IRLog.get_instance().start_log()
Exemplo n.º 46
0
    def test_create_new_report(self):
        """Create an IRReport from raw texts and exercise its accessors.

        The summary and description texts must be returned unchanged; the
        tfidf and squared-length getters are only called, their results are
        not checked.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        log = IRLog.get_instance()
        log.start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary_text = 'Firefox crashed'
        description_text = ('When I was openning history folder, the f**king'
                            ' Firefox just crashed!')
        new_report = IRReport(summary_text, description_text)
        assert summary_text == new_report.get_summary_text()
        assert description_text == new_report.get_description_text()
        new_report.get_summary_and_description_tfidf()
        new_report.get_summary_and_description_tfidf_squared_length()
        log.stop_log()
Exemplo n.º 47
0
    def test_is_in_same_duplicate_group(self):
        """Log the three id lists returned for bug 100000.

        IRDuplicateGroup.is_in_same_duplicate_group is called with the bug
        id and a single (id, score) candidate; its three results are printed
        as comma separated ids. Nothing is asserted.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_duplicate_group import IRDuplicateGroup

        log = IRLog.get_instance()
        log.start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        candidates = [(100000, 0.93)]
        hit, nothit, duplicates = \
            IRDuplicateGroup.is_in_same_duplicate_group(100000, candidates)
        hit_ids = ','.join([str(item) for item in hit])
        log.println('Hit: %s' % (hit_ids))
        nothit_ids = ','.join([str(item) for item in nothit])
        log.println('Not Hit: %s' % (nothit_ids))
        duplicate_ids = ','.join([str(item) for item in duplicates])
        log.println('Actual Duplicate: %s' % (duplicate_ids))
        log.stop_log()
Exemplo n.º 48
0
    def test_progress_bar(self):
        """Drive two IRProgressBar instances through the values 0..1000.

        One bar is created per title, each with its own trailing constructor
        arguments; every bar must be created successfully and is then fed
        each value in order.
        """
        from ir_log import IRLog
        from ir_log import IRProgressBar

        IRLog.get_instance().start_log(True)
        # (title, remaining constructor arguments) of each bar to exercise
        runs = (
            ('ProgressBar Output Not Verbose', (False, 0, 1)),
            ('ProgressBar Output Verbose', (True, 1, 0)),
        )
        for title, extra_args in runs:
            bar = IRProgressBar(1000, title, *extra_args)
            assert bar is not None
            for value in range(1001):
                bar.set_value(value)
        # NOTE(review): the sibling tests finish with stop_log(); this one
        # calls start_log() again -- possibly a typo, confirm the intent.
        IRLog.get_instance().start_log()
Exemplo n.º 49
0
    def test_tfidf_asm_similarity(self):
        """Check IRTFIDF.tfidf_asm_similarity on three small term vectors.

        Expected, within a tolerance of 0.0001:
            similarity(vec_a, vec_b) == 1.0
            similarity(vec_a, vec_c) == 0.5
        In addition the call with the extra arguments (None, ['ie'], 100)
        must score lower than the plain call on the same two vectors.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_tfidf import IRTFIDF

        log = IRLog.get_instance()
        log.start_log()
        try:
            IRConfig.get_instance().load('../data/test/bug_test.cfg')

            vec_a = {'firefox': 1, 'chrome': 1}
            vec_b = {'firefox': 1, 'chrome': 1, 'ie': 1}
            vec_c = {'firefox': 1, 'windows': 1, 'linux': 1}

            delta = 0.0001
            assert abs(1.0 - IRTFIDF.tfidf_asm_similarity(vec_a, vec_b)) < delta
            assert abs(0.5 - IRTFIDF.tfidf_asm_similarity(vec_a, vec_c)) < delta
            assert IRTFIDF.tfidf_asm_similarity(vec_a, vec_b) > \
                    IRTFIDF.tfidf_asm_similarity(vec_a, vec_b, None, ['ie'], 100)
        finally:
            # The sibling tests pair start_log() with stop_log(); do the
            # same here, even when an assertion fails.
            log.stop_log()
Exemplo n.º 50
0
    def test_is_in_same_duplicate_group(self):
        """Log the three id lists returned for bug 100000.

        IRDuplicateGroup.is_in_same_duplicate_group is called with the bug
        id and a single (id, score) candidate; its three results are printed
        as comma separated ids. Nothing is asserted.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_duplicate_group import IRDuplicateGroup

        log = IRLog.get_instance()
        log.start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        candidates = [(100000, 0.93)]
        hit, nothit, duplicates = \
            IRDuplicateGroup.is_in_same_duplicate_group(100000, candidates)
        hit_ids = ','.join([str(item) for item in hit])
        log.println('Hit: %s' % (hit_ids))
        nothit_ids = ','.join([str(item) for item in nothit])
        log.println('Not Hit: %s' % (nothit_ids))
        duplicate_ids = ','.join([str(item) for item in duplicates])
        log.println('Actual Duplicate: %s' % (duplicate_ids))
        log.stop_log()
Exemplo n.º 51
0
    def get_artifact(self):
        """
        Bring this artifact up to date, dependencies first.

        get_artifact() is called on every dependency (so the dependencies
        are updated recursively). Afterwards this artifact's action is run,
        provided there is an action and __need_update() reports that an
        update is required.

        Raises:
            AssertionError: the action ran but is_success() is false.
        """

        # get all the dependencies (only the objects matter, not their keys)
        for dependency in self.dependencies.values():
            dependency.get_artifact()
        # need update ?
        if self.action is None or not self.__need_update():
            return
        self.action()
        if not self.is_success():
            from ir_log import IRLog
            message = "Fail to generate %s." % self.id
            IRLog.get_instance().println(message)
            # Raised explicitly: an `assert False` statement is removed when
            # Python runs with -O, which would let the failure pass silently.
            raise AssertionError(message)
Exemplo n.º 52
0
    def test_similarities_and_duplicates(self):
        """Log the similar and the duplicate reports found for bug 100000.

        Only the first element of every returned item is printed.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport

        log = IRLog.get_instance()
        log.start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        similarities, duplicates = \
            IRReport(100000).similarities_and_duplicates()
        log.println('Report %d' % (100000))
        similar_ids = ','.join([str(entry[0]) for entry in similarities])
        log.println('%d Similar Reports: %s'
                    % (len(similarities), similar_ids))
        duplicate_ids = ','.join([str(entry[0]) for entry in duplicates])
        log.println('%d Duplicate Reports: %s'
                    % (len(duplicates), duplicate_ids))
        log.stop_log()
Exemplo n.º 53
0
    def test_create_new_sentence(self):
        """Create an IRSentence from raw text and exercise its accessors.

        The text and the bug id must be returned unchanged and the sentence
        must report that it contains the term 'folder'; the term count and
        tfidf getters are only called, their results are not checked.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_sentence import IRSentence

        log = IRLog.get_instance()
        log.start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        bug_id = 10000
        description_text = ('When I was openning history folder, the f**king'
                            ' Firefox just crashed!')
        sentence = IRSentence(description_text, bug_id)
        assert description_text == sentence.get_text()
        assert bug_id == sentence.get_bug_id()
        assert sentence.contain_term('folder')
        sentence.get_termcount()
        sentence.get_tfidf()
        log.stop_log()
Exemplo n.º 54
0
    def show_dict_compare(cls, dicta, dictb, field_name = 'summ',
                          log_level = 1):
        """Compare and print the tfidf of two tfidf.

        One row is printed per term occurring in either dict, ordered by the
        product of the two tfidf values, largest first. A term missing from
        a dict (or a dict that is None) counts as 0.0.

        Args:
            dicta: dict, TFIDF (may be None)
            dictb: dict, TFIDF (may be None)
            field_name: str, summary or description?
            log_level: int, passed to println for every row
        """

        from ir_log import IRLog
        from ir_mongodb_helper import IRCollection
        from ir_document_count import IRDocumentCount

        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'r')
        # union of the terms of both vectors
        keys = set()
        if dicta is not None:
            keys.update(dicta)
        if dictb is not None:
            keys.update(dictb)
        # sort by product
        product = []
        for key in keys:
            tfidf_a = 0.0
            tfidf_b = 0.0
            if dicta is not None and key in dicta:
                tfidf_a = dicta[key]
            if dictb is not None and key in dictb:
                tfidf_b = dictb[key]
            documentcount = IRDocumentCount.get_documentcount(
                key, field_name, documentcount_collection)
            idf = cls.get_idf(documentcount)
            product.append((key, tfidf_a*tfidf_b, tfidf_a, tfidf_b, documentcount, idf))
        # key= instead of cmp=: same (stable) ordering, but also accepted by
        # Python 3, where list.sort() has no cmp argument.
        product.sort(key=lambda entry: entry[1], reverse=True)
        # print it out
        # NOTE(review): the header is logged without log_level while the
        # rows below pass it -- confirm whether that is intended.
        IRLog.get_instance().println('%16s\t%8s\t%8s\t%8s\t%8s\t%8s' \
                % ('term', 'tfidf a', 'tfidf b', 'doccount', 'idf', 'sim'))
        for item in product:
            IRLog.get_instance().println('%16s\t%8f\t%8f\t%8d\t%8f\t%8f' \
                    % (item[0], item[2], item[3], item[4], item[5], item[1]), log_level)