예제 #1
0
    def __generate_single_bug(self, bug_id, drop_rate):
        """Build a report from a stored bug, optionally with dropped text.

        Summary, description, stacktrace, creation timestamp and product
        are all read through IRText for the given bug id.

        Args:
            bug_id: int, original bug id.
            drop_rate: float, 0.0 for not drop, 1.0 for totally drop.
                Values not greater than 0.001 leave the text untouched.

        Returns:
            IRReport
        """
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_report import IRReport

        # Fetch the stored texts and the basic info of the original bug.
        text_summary, text_description = \
            IRText.get_summary_and_description_of_bug(bug_id)
        timestamp, product_name = IRText.get_basic_info_of_bug(bug_id)

        drop_requested = drop_rate > 0.001
        if drop_requested:
            incomplete = IRTermCount.create_incomplete_report(
                text_summary, text_description, drop_rate)
            text_summary, text_description = incomplete
            # Single-argument print: same output with or without parentheses.
            print(text_description)

        report = IRReport(text_summary, text_description)
        stacktrace = IRText.get_stacktrace_of_bug(bug_id)
        report.set_stacktrace(stacktrace)
        report.set_dummy_bug_id(bug_id)
        report.set_basic_info(timestamp, product_name)
        return report
예제 #2
0
    def test_compare_stackinfo(self):
        """Compare the first stored stacktrace with every stored stacktrace.

        Loads the test configuration, parses '../data/test/stacktrace_test'
        through IRText.parse_info_level1, then logs the 'weight' and 'max'
        results of IRSTTools.compare_stackinfo for each document of the bug
        text collection (the first document is also compared with itself).
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper
        from ir_gnome_st_tools import IRSTTools
        from ir_text import IRText

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRText.parse_info_level1('../data/test/stacktrace_test')

        con = IRMongodbHelper.get_instance().get_connection()
        db = con[IRConfig.get_instance().get('bug_db_name')]
        assert db is not None
        col = db[IRConfig.get_instance().get('bug_text_collection_name')]
        assert col is not None

        # Reference stacktrace: the first document of the collection.
        st1 = col.find()[0]["stacktrace"]

        # Iterate the cursor once. Indexing a cursor by position inside the
        # loop sends a separate skip/limit query for every document.
        for bug in col.find():
            st2 = bug["stacktrace"]
            result_weight = IRSTTools.compare_stackinfo(st1, st2, 'weight')
            result_max = IRSTTools.compare_stackinfo(st1, st2, 'max')
            IRLog.get_instance().println('Weight: %f, Max: %f' \
                    % (result_weight, result_max))

        IRLog.get_instance().stop_log()
예제 #3
0
    def test_compare_stackinfo(self):
        """Compare the first stored stacktrace with every stored stacktrace.

        Loads the test configuration, parses '../data/test/stacktrace_test'
        through IRText.parse_info_level1, then logs the 'weight' and 'max'
        results of IRSTTools.compare_stackinfo for each document of the bug
        text collection (the first document is also compared with itself).
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper
        from ir_gnome_st_tools import IRSTTools
        from ir_text import IRText

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRText.parse_info_level1('../data/test/stacktrace_test')

        con = IRMongodbHelper.get_instance().get_connection()
        db = con[IRConfig.get_instance().get('bug_db_name')]
        assert db is not None
        col = db[IRConfig.get_instance().get('bug_text_collection_name')]
        assert col is not None

        # Reference stacktrace: the first document of the collection.
        st1 = col.find()[0]["stacktrace"]

        # Iterate the cursor once. Indexing a cursor by position inside the
        # loop sends a separate skip/limit query for every document.
        for bug in col.find():
            st2 = bug["stacktrace"]
            result_weight = IRSTTools.compare_stackinfo(st1, st2, 'weight')
            result_max = IRSTTools.compare_stackinfo(st1, st2, 'max')
            IRLog.get_instance().println('Weight: %f, Max: %f' \
                    % (result_weight, result_max))

        IRLog.get_instance().stop_log()
예제 #4
0
    def test_filter(self):
        """Run IRSTTools.filter over the "desc" field of every stored bug.

        The test configuration is loaded and
        '../data/test/info_level1_test' is parsed first; the filter results
        are unpacked but not checked.
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper
        from ir_gnome_st_tools import IRSTTools
        from ir_text import IRText
        import pymongo

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRText.parse_info_level1('../data/test/info_level1_test')

        connection = IRMongodbHelper.get_instance().get_connection()
        db_name = IRConfig.get_instance().get('bug_db_name')
        database = connection[db_name]
        assert database is not None
        collection_name = IRConfig.get_instance().get(
            'bug_text_collection_name')
        text_collection = database[collection_name]
        assert text_collection is not None

        # Maybe a bug here:
        # The test of filter (originally) depends on parse_info_level1
        # But parse_info_level1 seems to invoke filter...
        for stored_bug in text_collection.find():
            # TODO: it's not correct. no stacktrace in desc
            raw_description = stored_bug["desc"]
            filtered_desc, stacktrace = IRSTTools.filter(raw_description)

        IRLog.get_instance().stop_log()
예제 #5
0
    def test_filter(self):
        """Run IRSTTools.filter over the "desc" field of every stored bug.

        The test configuration is loaded and
        '../data/test/info_level1_test' is parsed first; the filter results
        are unpacked but not checked.
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper
        from ir_gnome_st_tools import IRSTTools
        from ir_text import IRText
        import pymongo

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRText.parse_info_level1('../data/test/info_level1_test')

        connection = IRMongodbHelper.get_instance().get_connection()
        db_name = IRConfig.get_instance().get('bug_db_name')
        database = connection[db_name]
        assert database is not None
        collection_name = IRConfig.get_instance().get(
            'bug_text_collection_name')
        text_collection = database[collection_name]
        assert text_collection is not None

        # Maybe a bug here:
        # The test of filter (originally) depends on parse_info_level1
        # But parse_info_level1 seems to invoke filter...
        for stored_bug in text_collection.find():
            # TODO: it's not correct. no stacktrace in desc
            raw_description = stored_bug["desc"]
            filtered_desc, stacktrace = IRSTTools.filter(raw_description)

        IRLog.get_instance().stop_log()
예제 #6
0
    def __generate_single_bug(self, bug_id, drop_rate):
        """Build a report from a stored bug, optionally with dropped text.

        Summary, description, stacktrace, creation timestamp and product
        are all read through IRText for the given bug id.

        Args:
            bug_id: int, original bug id.
            drop_rate: float, 0.0 for not drop, 1.0 for totally drop.
                Values not greater than 0.001 leave the text untouched.

        Returns:
            IRReport
        """
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_report import IRReport

        # Fetch the stored texts and the basic info of the original bug.
        text_summary, text_description = \
            IRText.get_summary_and_description_of_bug(bug_id)
        timestamp, product_name = IRText.get_basic_info_of_bug(bug_id)

        drop_requested = drop_rate > 0.001
        if drop_requested:
            incomplete = IRTermCount.create_incomplete_report(
                text_summary, text_description, drop_rate)
            text_summary, text_description = incomplete
            # Single-argument print: same output with or without parentheses.
            print(text_description)

        report = IRReport(text_summary, text_description)
        stacktrace = IRText.get_stacktrace_of_bug(bug_id)
        report.set_stacktrace(stacktrace)
        report.set_dummy_bug_id(bug_id)
        report.set_basic_info(timestamp, product_name)
        return report
예제 #7
0
    def test_parse_info_level1(self):
        """Parse the level-1 info test file and check the stored documents.

        Asserts that the bug text collection holds 833 documents, that bug
        100000 has the expected summary, and that no stored summary matches
        the HTML-entity regex.
        """
        # import sys
        # sys.path.append('../bin/')
        from ir_log import IRLog
        from ir_text import IRText
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        assert IRConfig.get_instance() is not None
        IRText.parse_info_level1('../data/test/info_level1_test')
        IRLog.get_instance().stop_log()

        connection = IRMongodbHelper.get_instance().get_connection()
        db_name = IRConfig.get_instance().get('bug_db_name')
        database = connection[db_name]
        assert database is not None
        collection_name = IRConfig.get_instance().get(
            'bug_text_collection_name')
        text_collection = database[collection_name]
        assert text_collection is not None

        # in the test data, we have 1000 in total.
        # within, 40 have no resolution, 154 are incomplete
        stored_total = text_collection.count()
        assert stored_total == 833
        first_match = text_collection.find({'bug_id': 100000})[0]
        assert first_match["summ"] == \
            'gnome is full of bugs ! (100000 currently)'

        # No summary may still contain one of these entity fragments.
        entity_pattern = '(&gt)|(&lt)|(&quot)|(&apo)s|(&amp)'
        leftovers = text_collection.find(
            {"summ": {'$regex': entity_pattern}})
        assert leftovers.count() == 0
예제 #8
0
    def test_parse_info_level1(self):
        """Parse the level-1 info test file and check the stored documents.

        Asserts that the bug text collection holds 833 documents, that bug
        100000 has the expected summary, and that no stored summary matches
        the HTML-entity regex.
        """
        # import sys
        # sys.path.append('../bin/')
        from ir_log import IRLog
        from ir_text import IRText
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        assert IRConfig.get_instance() is not None
        IRText.parse_info_level1('../data/test/info_level1_test')
        IRLog.get_instance().stop_log()

        connection = IRMongodbHelper.get_instance().get_connection()
        db_name = IRConfig.get_instance().get('bug_db_name')
        database = connection[db_name]
        assert database is not None
        collection_name = IRConfig.get_instance().get(
            'bug_text_collection_name')
        text_collection = database[collection_name]
        assert text_collection is not None

        # in the test data, we have 1000 in total.
        # within, 40 have no resolution, 154 are incomplete
        stored_total = text_collection.count()
        assert stored_total == 833
        first_match = text_collection.find({'bug_id': 100000})[0]
        assert first_match["summ"] == \
            'gnome is full of bugs ! (100000 currently)'

        # No summary may still contain one of these entity fragments.
        entity_pattern = '(&gt)|(&lt)|(&quot)|(&apo)s|(&amp)'
        leftovers = text_collection.find(
            {"summ": {'$regex': entity_pattern}})
        assert leftovers.count() == 0
예제 #9
0
def server_cache(msg, res):
    """Fill the IRText, IRTFIDF and IRDocumentCount caches, in that order.

    Args:
        msg: not used by this handler.
        res: not used by this handler.

    Returns:
        SIGNAL_CONTINUE
    """
    from ir_log import IRLog
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount

    IRLog.get_instance().println('Server is caching data')
    for cache_owner in (IRText, IRTFIDF, IRDocumentCount):
        cache_owner.cache_all_data()
    IRLog.get_instance().println('Server cached data')
    return SIGNAL_CONTINUE
예제 #10
0
def server_cache(msg, res):
    """Fill the IRText, IRTFIDF and IRDocumentCount caches, in that order.

    Args:
        msg: not used by this handler.
        res: not used by this handler.

    Returns:
        SIGNAL_CONTINUE
    """
    from ir_log import IRLog
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount

    IRLog.get_instance().println('Server is caching data')
    for cache_owner in (IRText, IRTFIDF, IRDocumentCount):
        cache_owner.cache_all_data()
    IRLog.get_instance().println('Server cached data')
    return SIGNAL_CONTINUE
예제 #11
0
    def do_remove_bad_reports(cls, config_file):
        """Remove every report whose description is flagged for dropping.

        Loads config_file, walks the cursor returned by
        IRText.get_iterator(None), collects the ids of the reports for which
        IRText.is_drop_report(description) is true, and removes those ids
        from the text, tfidf and duplicate collections.

        Args:
            config_file: str, path handed to IRConfig.load.
        """

        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        import ir_mongodb_helper
        from ir_text import IRText

        config = IRConfig.get_instance()
        config.load(config_file)
        bug_id_name = config.get('bug_id_name')
        bug_description_name = config.get('bug_description_name')
        text_cursor = IRText.get_iterator(None)
        remove_ids = []

        def iter_text(item):
            # Remember the id of each report that should be dropped.
            if IRText.is_drop_report(item[bug_description_name]):
                remove_ids.append(item[bug_id_name])
                IRLog.get_instance().println(
                    'Remove report#=%d' % item[bug_id_name], 3)
        IRProgressBar.execute_iteration_for_cursor(text_cursor, iter_text)

        # remove from all database
        def remove_from_collection(collection_cfg_name):
            collection = ir_mongodb_helper.IRCollection(
                'bug_db_name', collection_cfg_name, 'a')
            try:
                # Filter on the configured id field, the same one the ids
                # were read from, instead of a hard-coded 'bug_id'.
                # NOTE(review): assumes all three collections store the id
                # under bug_id_name - confirm for tfidf/duplicate.
                collection.remove({bug_id_name: {'$in': remove_ids}})
            finally:
                # Close the collection even when the removal fails.
                collection.close()

        remove_from_collection('bug_text_collection_name')
        remove_from_collection('bug_tfidf_collection_name')
        remove_from_collection('bug_duplicate_collection_name')
예제 #12
0
 def test_get_stacktrace_text_of_bug(self):
     """Fetch the stacktrace text of bug 104400 and write it to the log."""
     from ir_log import IRLog
     from ir_config import IRConfig
     from ir_text import IRText

     IRConfig.get_instance().load('../data/test/bug_test.cfg')
     bug_id = 104400
     fetched_text = IRText.get_stacktrace_text_of_bug(bug_id)
     IRLog.get_instance().println(
         'stacktrace_text: %s' % (fetched_text))
예제 #13
0
    def similarity_over_all(self):
        """Calculate similarity between this report and stored reports.

        Candidates are read from the basic-info collection: same product as
        this report, creation timestamp greater than this report's
        timestamp minus the search span, and id not in the excluded ids.
        The report whose id equals this report's dummy bug id is skipped.

        Returns:
            dict, {bug_id -> result of similarity_with}; described by the
            original author as
            [score, summary_score, description_score, stacktrace_score].
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_text import IRText
        from ir_tfidf import IRTFIDF

        logger = IRLog.get_instance()
        # 2 * 365 days; presumably seconds, i.e. create_ts is a Unix
        # timestamp - TODO confirm.
        search_time_span = 2 * 3600 * 24 * 365

        bug_id_name = IRConfig.get_instance().get('bug_id_name')

        create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')
        product_name = IRConfig.get_instance().get('bug_product_name')

        basic_collection = IRCollection('bug_db_name',
                                        'bug_basic_collection_name', 'r')

        reports2scan = basic_collection.find({
            product_name: self.get_product(),
            create_ts_name: {
                '$gt': self.get_create_ts() - search_time_span
            },
            bug_id_name: {
                '$nin': self.__exclude_report_ids
            }
        })
        logger.println('Comparing with %d reports.'
                       % (reports2scan.count()))

        # Debug output kept from the original; single-argument print works
        # the same with or without parentheses.
        print(self.__summary_text)
        print(self.__description_text)

        # Neither value depends on the scanned report, so evaluate them once
        # instead of once (or twice) per iteration.
        own_stacktrace = self.get_stacktrace()
        need_stacktrace = own_stacktrace is not None and \
            len(own_stacktrace) > 0
        own_dummy_id = self.get_dummy_bug_id()

        result = {}
        for report in reports2scan:
            bug_id = report[bug_id_name]
            if bug_id == own_dummy_id:
                continue
            # because we don't want to load stacktrace in case of
            #    self.__stacktrace being none, we create and fill the info
            #    of report manually
            other_report = IRReport("", "")
            other_report.__summary_tfidf, other_report.__description_tfidf = \
                    IRTFIDF.get_tfidf_of_bug(bug_id)
            # if self.__stacktrace is empty, we don't need to do this
            if need_stacktrace:
                other_report.__stacktrace = IRText.get_stacktrace_of_bug(
                    bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
            result[bug_id] = self.similarity_with(other_report)

        return result
예제 #14
0
 def batch_generate_term_count(cls):
     """Compute the term count of every stored text and save it.

     Each document yielded by IRText.get_iterator({}) is passed through
     cls.calculate_term_count and the result is inserted into the term
     count collection, which is indexed on the bug id afterwards.
     """
     from ir_log import IRProgressBar
     from ir_text import IRText
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection

     # Field names, with the defaults used when the config has none.
     id_key = IRConfig.get_instance().get('bug_id_name', 'bug_id')
     summary_key = IRConfig.get_instance().get('bug_summary_name', 'summ')
     description_key = IRConfig.get_instance().get(
         'bug_description_name', 'desc')

     target = IRCollection(
         'bug_db_name', 'bug_termcount_collection_name', 'w')

     def store_term_count(bug):
         # Convert one text document into its term-count document.
         summary_bow, description_bow = cls.calculate_term_count(
             bug[summary_key], bug[description_key])
         record = {
             id_key: bug[id_key],
             summary_key: summary_bow,
             description_key: description_bow,
         }
         target.insert(record)

     all_texts = IRText.get_iterator({})
     IRProgressBar.execute_iteration_for_cursor(
         all_texts, store_term_count, "From Text to Term Count")
     index_spec = [(id_key, IRCollection.ASCENDING)]
     target.create_index(index_spec)
     target.close()
예제 #15
0
 def test_get_stacktrace_text_of_bug(self):
     """Fetch the stacktrace text of bug 104400 and write it to the log."""
     from ir_log import IRLog
     from ir_config import IRConfig
     from ir_text import IRText

     IRConfig.get_instance().load('../data/test/bug_test.cfg')
     bug_id = 104400
     fetched_text = IRText.get_stacktrace_text_of_bug(bug_id)
     IRLog.get_instance().println(
         'stacktrace_text: %s' % (fetched_text))
예제 #16
0
    def test_get_summary_and_description_of_bug(self):
        """Fetch the summary and description of bug 100000 and log both."""

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_text import IRText

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        bug_id = 100000
        bug_summary, bug_description = \
            IRText.get_summary_and_description_of_bug(bug_id)
        IRLog.get_instance().println(
            'summary: %s' % (bug_summary))
        IRLog.get_instance().println(
            'description: %s' % (bug_description))
예제 #17
0
    def test_parse_dump_file(self):
        """Parse the dump test file and expect 1000 stored documents."""
        from ir_log import IRLog
        from ir_text import IRText
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        assert IRConfig.get_instance() is not None
        IRText.parse_dump_file('../data/test/dump_file_test')
        IRLog.get_instance().stop_log()

        connection = IRMongodbHelper.get_instance().get_connection()
        db_name = IRConfig.get_instance().get('bug_db_name')
        database = connection[db_name]
        assert database is not None
        collection_name = IRConfig.get_instance().get(
            'bug_text_collection_name')
        text_collection = database[collection_name]
        assert text_collection is not None

        stored_total = text_collection.count()
        assert stored_total == 1000
예제 #18
0
    def test_parse_dump_file(self):
        """Parse the dump test file and expect 1000 stored documents."""
        from ir_log import IRLog
        from ir_text import IRText
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        assert IRConfig.get_instance() is not None
        IRText.parse_dump_file('../data/test/dump_file_test')
        IRLog.get_instance().stop_log()

        connection = IRMongodbHelper.get_instance().get_connection()
        db_name = IRConfig.get_instance().get('bug_db_name')
        database = connection[db_name]
        assert database is not None
        collection_name = IRConfig.get_instance().get(
            'bug_text_collection_name')
        text_collection = database[collection_name]
        assert text_collection is not None

        stored_total = text_collection.count()
        assert stored_total == 1000
예제 #19
0
    def test_get_summary_and_description_of_bug(self):
        """Fetch the summary and description of bug 100000 and log both."""

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_text import IRText

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        bug_id = 100000
        bug_summary, bug_description = \
            IRText.get_summary_and_description_of_bug(bug_id)
        IRLog.get_instance().println(
            'summary: %s' % (bug_summary))
        IRLog.get_instance().println(
            'description: %s' % (bug_description))
예제 #20
0
    def similarity_over_all(self):
        """Calculate similarity between this report and stored reports.

        Candidates are read from the basic-info collection: same product as
        this report, creation timestamp greater than this report's
        timestamp minus the search span, and id not in the excluded ids.
        The report whose id equals this report's dummy bug id is skipped.

        Returns:
            dict, {bug_id -> result of similarity_with}; described by the
            original author as
            [score, summary_score, description_score, stacktrace_score].
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_text import IRText
        from ir_tfidf import IRTFIDF

        logger = IRLog.get_instance()
        # 2 * 365 days; presumably seconds, i.e. create_ts is a Unix
        # timestamp - TODO confirm.
        search_time_span = 2 * 3600 * 24 * 365

        bug_id_name = IRConfig.get_instance().get('bug_id_name')

        create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')
        product_name = IRConfig.get_instance().get('bug_product_name')

        basic_collection = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'r')

        reports2scan = basic_collection.find({
            product_name : self.get_product(),
            create_ts_name : {'$gt' : self.get_create_ts() - search_time_span},
            bug_id_name : {'$nin' : self.__exclude_report_ids} })
        logger.println('Comparing with %d reports.'
                       % (reports2scan.count()))

        # Debug output kept from the original; single-argument print works
        # the same with or without parentheses.
        print(self.__summary_text)
        print(self.__description_text)

        # Neither value depends on the scanned report, so evaluate them once
        # instead of once (or twice) per iteration.
        own_stacktrace = self.get_stacktrace()
        need_stacktrace = own_stacktrace is not None and \
            len(own_stacktrace) > 0
        own_dummy_id = self.get_dummy_bug_id()

        result = {}
        for report in reports2scan:
            bug_id = report[bug_id_name]
            if bug_id == own_dummy_id:
                continue
            # because we don't want to load stacktrace in case of
            #    self.__stacktrace being none, we create and fill the info
            #    of report manually
            other_report = IRReport("", "")
            other_report.__summary_tfidf, other_report.__description_tfidf = \
                    IRTFIDF.get_tfidf_of_bug(bug_id)
            # if self.__stacktrace is empty, we don't need to do this
            if need_stacktrace:
                other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
            result[bug_id] = self.similarity_with(other_report)

        return result
예제 #21
0
 def get_stacktrace(self):
     """Return the stacktrace of this report.

     Without a bug id the value held by the instance is returned as is.
     With a bug id, a held value is reused when caching is allowed;
     otherwise the stacktrace is loaded through IRText and, when caching
     is allowed, kept on the instance.
     """
     # No bug id: only the in-memory value exists.
     if self.__bug_id is None:
         return self.__stacktrace

     cache_enabled = self.__allow_cache
     if cache_enabled and self.__stacktrace is not None:
         return self.__stacktrace

     from ir_text import IRText
     loaded = IRText.get_stacktrace_of_bug(self.__bug_id)
     if cache_enabled:
         self.__stacktrace = loaded
     return loaded
예제 #22
0
 def get_stacktrace(self):
     """Return the stacktrace of this report.

     Without a bug id the value held by the instance is returned as is.
     With a bug id, a held value is reused when caching is allowed;
     otherwise the stacktrace is loaded through IRText and, when caching
     is allowed, kept on the instance.
     """
     # No bug id: only the in-memory value exists.
     if self.__bug_id is None:
         return self.__stacktrace

     cache_enabled = self.__allow_cache
     if cache_enabled and self.__stacktrace is not None:
         return self.__stacktrace

     from ir_text import IRText
     loaded = IRText.get_stacktrace_of_bug(self.__bug_id)
     if cache_enabled:
         self.__stacktrace = loaded
     return loaded
예제 #23
0
 def get_basic_info(self):
     """Return the (create_ts, product) pair of this report.

     Without a bug id the values held by the instance are returned as
     they are. With a bug id, held values are reused when caching is
     allowed and both are set; otherwise they are loaded through IRText
     and, when caching is allowed, kept on the instance.
     """
     # No bug id: only the in-memory values exist.
     if self.__bug_id is None:
         return self.__create_ts, self.__product

     cache_enabled = self.__allow_cache
     if cache_enabled and self.__create_ts is not None \
             and self.__product is not None:
         return self.__create_ts, self.__product

     from ir_text import IRText
     loaded_ts, loaded_product = IRText.get_basic_info_of_bug(self.__bug_id)
     if cache_enabled:
         self.__create_ts = loaded_ts
         self.__product = loaded_product
     return loaded_ts, loaded_product
예제 #24
0
 def get_basic_info(self):
     """Return the (create_ts, product) pair of this report.

     Without a bug id the values held by the instance are returned as
     they are. With a bug id, held values are reused when caching is
     allowed and both are set; otherwise they are loaded through IRText
     and, when caching is allowed, kept on the instance.
     """
     # No bug id: only the in-memory values exist.
     if self.__bug_id is None:
         return self.__create_ts, self.__product

     cache_enabled = self.__allow_cache
     if cache_enabled and self.__create_ts is not None \
             and self.__product is not None:
         return self.__create_ts, self.__product

     from ir_text import IRText
     loaded_ts, loaded_product = IRText.get_basic_info_of_bug(self.__bug_id)
     if cache_enabled:
         self.__create_ts = loaded_ts
         self.__product = loaded_product
     return loaded_ts, loaded_product
예제 #25
0
 def get_summary_and_description_text(self):
     """Return the summary and description text of this report.

     Without a bug id the texts held by the instance are returned as they
     are. With a bug id, held texts are reused when caching is allowed and
     both are set; otherwise they are loaded through IRText and, when
     caching is allowed, kept on the instance.

     Returns:
         list, [summary_text, description_text]. Every path returns a
         list; previously the freshly loaded path alone returned a tuple.
     """
     if self.__bug_id is None:
         return [self.__summary_text, self.__description_text]
     if self.__allow_cache and \
             self.__summary_text is not None and \
             self.__description_text is not None:
         return [self.__summary_text, self.__description_text]
     from ir_text import IRText
     summary, description = \
         IRText.get_summary_and_description_of_bug(self.__bug_id)
     if self.__allow_cache:
         self.__summary_text, self.__description_text = \
             summary, description
     # Same container type as the two early returns above.
     return [summary, description]
예제 #26
0
 def get_summary_and_description_text(self):
     """Return the summary and description text of this report.

     Without a bug id the texts held by the instance are returned as they
     are. With a bug id, held texts are reused when caching is allowed and
     both are set; otherwise they are loaded through IRText and, when
     caching is allowed, kept on the instance.

     Returns:
         list, [summary_text, description_text]. Every path returns a
         list; previously the freshly loaded path alone returned a tuple.
     """
     if self.__bug_id is None:
         return [self.__summary_text, self.__description_text]
     if self.__allow_cache and \
             self.__summary_text is not None and \
             self.__description_text is not None:
         return [self.__summary_text, self.__description_text]
     from ir_text import IRText
     summary, description = \
         IRText.get_summary_and_description_of_bug(self.__bug_id)
     if self.__allow_cache:
         self.__summary_text, self.__description_text = \
             summary, description
     # Same container type as the two early returns above.
     return [summary, description]
예제 #27
0
    def do_test_over_file(self, filename):
        """Do test over the file.

        Every line is parsed with IRReport.from_string. The similar and
        duplicate ids found for the report are checked against its real
        duplicate group, and precision/recall are accumulated. Lines whose
        real duplicate group is empty are not counted. Finally a header row
        and one row of averages are logged as comma-separated values.

        When no case (or no duplicate case) was counted, the matching
        averages are reported as 0.0 and the count as 0, instead of raising
        ZeroDivisionError or printing a count of 1.0.

        Args:
            filename: str, the input file which generated by
                generate_incomplete_test_file.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_duplicate_group import IRDuplicateGroup
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_tfidf import IRTFIDF
        from ir_report import IRReport
        from ir_document_count import IRDocumentCount

        IRText.cache_all_data()
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()

        remove_self_bug_id = IRConfig.get_instance().get_bool(
            'remove_self_bug_id', True)

        sim_tot_precision = 0.0
        sim_tot_recall = 0.0
        sim_bi_tot_recall = 0.0
        sim_tot_size = 0

        dup_tot_precision = 0.0
        dup_tot_recall = 0.0
        dup_bi_tot_recall = 0.0
        dup_num = 0
        test_num = 0

        # 'with' closes the file even when a report fails to evaluate.
        with open(filename, 'r') as infile:
            for line in infile:
                IRLog.get_instance().println('----test----')
                # NOTE(review): the original called line.strip() here and
                # discarded the result, so from_string has always received
                # the raw line. That is kept; confirm whether stripping was
                # intended.
                new_report = IRReport.from_string(line)
                dummy_bug_id = new_report.get_dummy_bug_id()
                # do test for single
                similarities, duplicates = \
                    new_report.similarities_and_duplicates()
                sim_ids = [sim[0] for sim in similarities]
                dup_ids = [dup[0] for dup in duplicates]
                IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
                IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))
                # evaluate sim
                sim_hit, sim_nothit, real_duplicates = \
                    IRDuplicateGroup.is_in_same_duplicate_group(
                        dummy_bug_id, sim_ids, remove_self_bug_id)
                # some group contain only one
                if len(real_duplicates) == 0:
                    continue
                test_num += 1

                precision, recall = self.__report_result(
                    dummy_bug_id, sim_hit, sim_nothit, real_duplicates)

                sim_tot_precision += precision
                sim_tot_recall += recall
                sim_tot_size += len(sim_ids)
                sim_bi_tot_recall += 1 if recall > 0.0 else 0

                if len(dup_ids) > 0:
                    dup_num += 1
                    dup_hit, dup_nothit, real_duplicates = \
                        IRDuplicateGroup.is_in_same_duplicate_group(
                            dummy_bug_id, dup_ids, remove_self_bug_id)
                    precision, recall = self.__report_result(
                        dummy_bug_id, dup_hit, dup_nothit, real_duplicates)
                    dup_tot_precision += precision
                    dup_tot_recall += recall
                    dup_bi_tot_recall += 1 if recall > 0.0 else 0

        # general conclusion
        # Divide by 1 when nothing was counted; the totals are 0.0 then.
        sim_divisor = test_num or 1
        dup_divisor = dup_num or 1
        header = ['#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',
                  '#dup', 'dup pre', 'dup rec', 'dup birec']
        values = [str(test_num),
                  str(sim_tot_precision / sim_divisor),
                  str(sim_tot_recall / sim_divisor),
                  str(sim_bi_tot_recall / sim_divisor),
                  str(float(sim_tot_size) / sim_divisor),
                  str(dup_num),
                  str(dup_tot_precision / dup_divisor),
                  str(dup_tot_recall / dup_divisor),
                  str(dup_bi_tot_recall / dup_divisor)]
        IRLog.get_instance().println(','.join(header))
        IRLog.get_instance().println(','.join(values))
예제 #28
0
 def test_cache_all_data(self):
     """Load the test configuration and run IRText.cache_all_data."""
     from ir_log import IRLog
     from ir_config import IRConfig
     from ir_text import IRText

     config_path = '../data/test/bug_test.cfg'
     IRConfig.get_instance().load(config_path)
     IRText.cache_all_data()
0
    def do_test_over_file(self, filename):
        """Do test over the file.

        Every line is parsed with IRReport.from_string. The similar and
        duplicate ids found for the report are checked against its real
        duplicate group, and precision/recall are accumulated. Lines whose
        real duplicate group is empty are not counted. Finally a header row
        and one row of averages are logged as comma-separated values.

        When no case (or no duplicate case) was counted, the matching
        averages are reported as 0.0 and the count as 0, instead of raising
        ZeroDivisionError or printing a count of 1.0.

        Args:
            filename: str, the input file which generated by
                generate_incomplete_test_file.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_duplicate_group import IRDuplicateGroup
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_tfidf import IRTFIDF
        from ir_report import IRReport
        from ir_document_count import IRDocumentCount

        IRText.cache_all_data()
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()

        remove_self_bug_id = IRConfig.get_instance().get_bool(
            'remove_self_bug_id', True)

        sim_tot_precision = 0.0
        sim_tot_recall = 0.0
        sim_bi_tot_recall = 0.0
        sim_tot_size = 0

        dup_tot_precision = 0.0
        dup_tot_recall = 0.0
        dup_bi_tot_recall = 0.0
        dup_num = 0
        test_num = 0

        # 'with' closes the file even when a report fails to evaluate.
        with open(filename, 'r') as infile:
            for line in infile:
                IRLog.get_instance().println('----test----')
                # NOTE(review): the original called line.strip() here and
                # discarded the result, so from_string has always received
                # the raw line. That is kept; confirm whether stripping was
                # intended.
                new_report = IRReport.from_string(line)
                dummy_bug_id = new_report.get_dummy_bug_id()
                # do test for single
                similarities, duplicates = \
                    new_report.similarities_and_duplicates()
                sim_ids = [sim[0] for sim in similarities]
                dup_ids = [dup[0] for dup in duplicates]
                IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
                IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))
                # evaluate sim
                sim_hit, sim_nothit, real_duplicates = \
                    IRDuplicateGroup.is_in_same_duplicate_group(
                        dummy_bug_id, sim_ids, remove_self_bug_id)
                # some group contain only one
                if len(real_duplicates) == 0:
                    continue
                test_num += 1

                precision, recall = self.__report_result(
                    dummy_bug_id, sim_hit, sim_nothit, real_duplicates)

                sim_tot_precision += precision
                sim_tot_recall += recall
                sim_tot_size += len(sim_ids)
                sim_bi_tot_recall += 1 if recall > 0.0 else 0

                if len(dup_ids) > 0:
                    dup_num += 1
                    dup_hit, dup_nothit, real_duplicates = \
                        IRDuplicateGroup.is_in_same_duplicate_group(
                            dummy_bug_id, dup_ids, remove_self_bug_id)
                    precision, recall = self.__report_result(
                        dummy_bug_id, dup_hit, dup_nothit, real_duplicates)
                    dup_tot_precision += precision
                    dup_tot_recall += recall
                    dup_bi_tot_recall += 1 if recall > 0.0 else 0

        # general conclusion
        # Divide by 1 when nothing was counted; the totals are 0.0 then.
        sim_divisor = test_num or 1
        dup_divisor = dup_num or 1
        header = ['#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',
                  '#dup', 'dup pre', 'dup rec', 'dup birec']
        values = [str(test_num),
                  str(sim_tot_precision / sim_divisor),
                  str(sim_tot_recall / sim_divisor),
                  str(sim_bi_tot_recall / sim_divisor),
                  str(float(sim_tot_size) / sim_divisor),
                  str(dup_num),
                  str(dup_tot_precision / dup_divisor),
                  str(dup_tot_recall / dup_divisor),
                  str(dup_bi_tot_recall / dup_divisor)]
        IRLog.get_instance().println(','.join(header))
        IRLog.get_instance().println(','.join(values))
예제 #30
0
 def test_cache_all_data(self):
     """Load the test configuration and run IRText.cache_all_data."""
     from ir_log import IRLog
     from ir_config import IRConfig
     from ir_text import IRText

     config_path = '../data/test/bug_test.cfg'
     IRConfig.get_instance().load(config_path)
     IRText.cache_all_data()
예제 #31
0
 def iter_text(item):
     """Record and log the id of item when its description is droppable.

     Reads bug_description_name, bug_id_name and remove_ids from the
     enclosing scope.
     """
     if not IRText.is_drop_report(item[bug_description_name]):
         return
     remove_ids.append(item[bug_id_name])
     message = 'Remove report#=%d' % item[bug_id_name]
     IRLog.get_instance().println(message, 3)