Пример #1
0
    def get_summary_and_description_of_bug(cls, bug_id):
        """Get summary and description of a bug.

        When caching is enabled and the bug is already cached, the cached
        pair is returned without reading the config or the database.
        Otherwise the bug is looked up in the text collection and, when
        caching is enabled, the result is stored in the cache.

        Args:
            bug_id: int

        Returns:
            (str, str), (summary, description). Both are empty strings
            when the bug has no record in the text collection.
        """
        # Serve cache hits first, like the other cached getters do.
        if cls.__is_cache and bug_id in cls.__cache_summary_description:
            return cls.__cache_summary_description[bug_id]

        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        text_collection = IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'r')
        res = text_collection.find({bug_id_name: bug_id})
        summary = ''
        description = ''
        if res.count() > 0:
            # Index the cursor once and reuse the record for both fields.
            bug = res[0]
            summary = bug[summary_name]
            description = bug[description_name]
        if cls.__is_cache:
            cls.__cache_summary_description[bug_id] = (summary, description)
        return summary, description
Пример #2
0
    def cache_all_data(cls):
        """Read every document-count record into the in-memory cache.

        Turns caching on, then stores for each term a
        (summary count, description count) pair. A count that is missing
        from a record is stored as 0.
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        # field names come from the config
        summary_key = IRConfig.get_instance().get('bug_summary_name')
        description_key = IRConfig.get_instance().get('bug_description_name')
        term_key = IRConfig.get_instance().get('bug_term_name')

        cls.__is_cache = True
        collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'r')

        def store_counts(record):
            # default to 0 for a field the record does not carry
            summary_count = 0
            if summary_key in record:
                summary_count = record[summary_key]
            description_count = 0
            if description_key in record:
                description_count = record[description_key]
            counts = (summary_count, description_count)
            cls.__cache_document_count[record[term_key]] = counts

        IRProgressBar.execute_iteration_for_cursor(
            collection.find({}), store_counts, "Caching Document Count")
Пример #3
0
    def get_tfidf_of_bug(cls, bug_id):
        """Get tfidf of a bug.

        A cached value is returned when caching is enabled and the bug is
        in the cache. Otherwise the tfidf collection is queried and, when
        caching is enabled, the result is remembered.

        Args:
            bug_id: int

        Returns:
            (dict, dict), (TFIDF of summary, TFIDF of description). Both
            are empty dicts when the bug has no record in the collection.
        """

        if cls.__is_cache and bug_id in cls.__cache:
            return cls.__cache[bug_id]
        # load from db
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        tfidf_collection = IRCollection('bug_db_name',
                                        'bug_tfidf_collection_name', 'r')
        find_result = tfidf_collection.find({bug_id_name: bug_id})
        summary = {}
        description = {}
        if find_result.count() > 0:
            # Index the cursor once and reuse the record for both fields.
            record = find_result[0]
            summary = record[summary_name]
            description = record[description_name]
        if cls.__is_cache:
            cls.__cache[bug_id] = (summary, description)
        return summary, description
Пример #4
0
    def get_tfidf_of_bug(cls, bug_id):
        """Get tfidf of a bug.

        A cached value is returned when caching is enabled and the bug is
        in the cache. Otherwise the tfidf collection is queried and, when
        caching is enabled, the result is remembered.

        Args:
            bug_id: int

        Returns:
            (dict, dict), (TFIDF of summary, TFIDF of description). Both
            are empty dicts when the bug has no record in the collection.
        """

        if cls.__is_cache and bug_id in cls.__cache:
            return cls.__cache[bug_id]
        # load from db
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        tfidf_collection = IRCollection(
            'bug_db_name', 'bug_tfidf_collection_name', 'r')
        find_result = tfidf_collection.find({bug_id_name: bug_id})
        summary = {}
        description = {}
        if find_result.count() > 0:
            # Index the cursor once and reuse the record for both fields.
            record = find_result[0]
            summary = record[summary_name]
            description = record[description_name]
        if cls.__is_cache:
            cls.__cache[bug_id] = (summary, description)
        return summary, description
Пример #5
0
    def batch_generate_tfidf(cls):
        """Batch calculate TFIDF for every bug and store it in mongodb.

        Caches all document counts, then walks over the term counts of
        all bugs, calculates the TFIDF of the summary and of the
        description of each bug and inserts one record per bug into the
        tfidf collection. Afterwards an ascending index on the bug id is
        created. The tfidf collection is closed even if a step fails.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_document_count import IRDocumentCount
        from ir_term_count import IRTermCount
        # get config
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        tfidf_algorithm = config.get('tfidf_algorithm')
        # prepare collections
        IRDocumentCount.cache_all_data()
        tfidf_collection = IRCollection(
            'bug_db_name', 'bug_tfidf_collection_name', 'w')
        try:
            # batch calculate tfidf
            termcount_iterator = IRTermCount.get_iterator()
            bug_count = termcount_iterator.count()

            def iter_term_count(bug):
                # NOTE(review): the None argument looks like the optional
                # document count collection -- confirm against
                # calculate_tfidf.
                summary_tfidf = cls.calculate_tfidf(
                    bug[summary_name], summary_name, bug_count, None,
                    tfidf_algorithm)
                description_tfidf = cls.calculate_tfidf(
                    bug[description_name], description_name, bug_count, None,
                    tfidf_algorithm)
                tfidf_collection.insert({bug_id_name: bug[bug_id_name],
                                         summary_name: summary_tfidf,
                                         description_name: description_tfidf})

            IRProgressBar.execute_iteration_for_cursor(
                termcount_iterator, iter_term_count, "Calculating TFIDF")
            tfidf_collection.create_index(
                [(bug_id_name, IRCollection.ASCENDING)])
        finally:
            # Release the write collection even when a step above raises.
            tfidf_collection.close()
Пример #6
0
    def get_summary_and_description_of_bug(cls, bug_id):
        """Get summary and description of a bug.

        When caching is enabled and the bug is already cached, the cached
        pair is returned without reading the config or the database.
        Otherwise the bug is looked up in the text collection and, when
        caching is enabled, the result is stored in the cache.

        Args:
            bug_id: int

        Returns:
            (str, str), (summary, description). Both are empty strings
            when the bug has no record in the text collection.
        """
        # Serve cache hits first, like the other cached getters do.
        if cls.__is_cache and bug_id in cls.__cache_summary_description:
            return cls.__cache_summary_description[bug_id]

        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        text_collection = IRCollection('bug_db_name',
                                       'bug_text_collection_name', 'r')
        res = text_collection.find({bug_id_name: bug_id})
        summary = ''
        description = ''
        if res.count() > 0:
            # Index the cursor once and reuse the record for both fields.
            bug = res[0]
            summary = bug[summary_name]
            description = bug[description_name]
        if cls.__is_cache:
            cls.__cache_summary_description[bug_id] = (summary, description)
        return summary, description
Пример #7
0
    def get_termcount_of_bug(cls, bug_id):
        """Get termcount of a bug.

        When caching is enabled and the bug is already cached, the cached
        pair is returned without reading the config or the database.
        Otherwise the termcount collection is queried and, when caching
        is enabled, the result is stored in the cache.

        Args:
            bug_id: int

        Returns:
            (dict, dict), (termcount of summary, termcount of description).
            Both are empty dicts when the bug has no record in the
            collection.
        """

        # Serve cache hits first, like the other cached getters do.
        if cls.__is_cache and bug_id in cls.__cache_term_count:
            return cls.__cache_term_count[bug_id]

        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        termcount_collection = IRCollection(
            'bug_db_name', 'bug_termcount_collection_name', 'r')
        res = termcount_collection.find({bug_id_name: bug_id})
        summary = {}
        description = {}
        if res.count() > 0:
            # Index the cursor once and reuse the record for both fields.
            record = res[0]
            summary = record[summary_name]
            description = record[description_name]
        if cls.__is_cache:
            cls.__cache_term_count[bug_id] = (summary, description)
        return summary, description
Пример #8
0
    def get_stacktrace_of_bug(cls, bug_id):
        """Fetch the stacktrace of a bug, consulting the cache first.

        Args:
            bug_id: int

        Returns:
            [[str]], [[signature]]. An empty list when the bug has no
            record in the text collection.
        """
        if cls.__is_cache and bug_id in cls.__cache_stacktrace:
            return cls.__cache_stacktrace[bug_id]
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        id_key = IRConfig.get_instance().get('bug_id_name')
        stacktrace_key = IRConfig.get_instance().get('bug_stacktrace_name')
        collection = IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'r')
        matches = collection.find({id_key: bug_id})
        # fall back to an empty stacktrace when nothing matches
        trace = matches[0][stacktrace_key] if matches.count() > 0 else []
        if cls.__is_cache:
            cls.__cache_stacktrace[bug_id] = trace
        return trace
Пример #9
0
    def cache_all_data(cls):
        """Load the text data of every bug into memory.

        Enables caching, resets the summary/description cache and the
        stacktrace cache, then fills both from the text collection. The
        collection is closed even if the iteration fails.
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        # get config
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')
        stacktrace_name = config.get('bug_stacktrace_name')
        # caching data
        cls.set_is_cache(True)
        text_collection = \
                IRCollection('bug_db_name', 'bug_text_collection_name', 'r')
        cls.__cache_summary_description = {}
        cls.__cache_stacktrace = {}

        def iter_func(bug):
            cls.__cache_summary_description[bug[bug_id_name]] = \
                    (bug[summary_name], bug[description_name])
            cls.__cache_stacktrace[bug[bug_id_name]] = bug[stacktrace_name]

        try:
            IRProgressBar.execute_iteration_for_cursor(text_collection.find(),
                                                       iter_func,
                                                       'Caching Text Data')
        finally:
            # Release the collection even when iterating fails part-way.
            text_collection.close()
Пример #10
0
    def get_stacktrace_of_bug(cls, bug_id):
        """Fetch the stacktrace of a bug, consulting the cache first.

        Args:
            bug_id: int

        Returns:
            [[str]], [[signature]]. An empty list when the bug has no
            record in the text collection.
        """
        if cls.__is_cache and bug_id in cls.__cache_stacktrace:
            return cls.__cache_stacktrace[bug_id]
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        id_key = IRConfig.get_instance().get('bug_id_name')
        stacktrace_key = IRConfig.get_instance().get('bug_stacktrace_name')
        collection = IRCollection('bug_db_name',
                                  'bug_text_collection_name', 'r')
        matches = collection.find({id_key: bug_id})
        # fall back to an empty stacktrace when nothing matches
        trace = matches[0][stacktrace_key] if matches.count() > 0 else []
        if cls.__is_cache:
            cls.__cache_stacktrace[bug_id] = trace
        return trace
Пример #11
0
    def similarity_over_all(self):
        """Calculate similarity between this bug (summary, description)
        and the stored reports.

        The reports compared are those of the same product whose create
        timestamp is greater than this report's create timestamp minus
        search_time_span, excluding the ids in self.__exclude_report_ids
        and the dummy bug id.

        Returns:
            dict, {bug_id -> [score, summary_score, description_score, stacktrace_score]}
            (each value is whatever similarity_with returns)
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_text import IRText
        from ir_tfidf import IRTFIDF

        logger = IRLog.get_instance()
        config = IRConfig.get_instance()
        # presumably two years expressed in seconds -- confirm the unit of
        # the create timestamps
        search_time_span = 2 * 3600 * 24 * 365

        bug_id_name = config.get('bug_id_name')
        create_ts_name = config.get('bug_create_ts_name')
        product_name = config.get('bug_product_name')

        basic_collection = IRCollection('bug_db_name',
                                        'bug_basic_collection_name', 'r')

        reports2scan = basic_collection.find({
            product_name: self.get_product(),
            create_ts_name: {
                '$gt': self.get_create_ts() - search_time_span
            },
            bug_id_name: {
                '$nin': self.__exclude_report_ids
            }
        })
        result = {}
        logger.println('Comparing with %d reports.' % (reports2scan.count()))

        # print(x) with a single argument is valid on Python 2 and 3 alike
        print(self.__summary_text)
        print(self.__description_text)

        for report in reports2scan:
            bug_id = report[bug_id_name]
            if bug_id == self.get_dummy_bug_id():
                continue
            # because we don't want to load stacktrace in case of
            #    self.__stacktrace being none, we create and fill the info
            #    of report manually
            other_report = IRReport("", "")
            other_report.__summary_tfidf, other_report.__description_tfidf = \
                    IRTFIDF.get_tfidf_of_bug(bug_id)
            # if self.__stacktrace is empty, we don't need to do this
            own_stacktrace = self.get_stacktrace()
            if own_stacktrace is not None and len(own_stacktrace) > 0:
                other_report.__stacktrace = IRText.get_stacktrace_of_bug(
                    bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
            result[bug_id] = self.similarity_with(other_report)

        return result
Пример #12
0
    def get_documentcount(cls,
                          term,
                          field=None,
                          documentcount_collection=None):
        """Look up in how many documents a term occurs.

        Args:
            term: str
            field: str, name of the field whose count is wanted. None asks
                for both counts.
            documentcount_collection: collection to query. It is opened
                here when None is given.

        Returns:
            if field is None: (int, int), (summary document count, description document count)
            else: int, the document count of corresponding field. 0 for a
                field that is neither the summary nor the description field.
        """

        if cls.__is_cache and term in cls.__cache_document_count:
            if field is None:
                return cls.__cache_document_count[term]
            from ir_config import IRConfig
            summary_name = IRConfig.get_instance().get('bug_summary_name')
            description_name = IRConfig.get_instance().get(
                'bug_description_name')
            if field == summary_name:
                return cls.__cache_document_count[term][0]
            if field == description_name:
                return cls.__cache_document_count[term][1]
            return 0
        # not served from the cache: query the collection
        from ir_mongodb_helper import IRCollection
        from ir_config import IRConfig
        term_name = IRConfig.get_instance().get('bug_term_name')
        summary_name = IRConfig.get_instance().get('bug_summary_name')
        description_name = IRConfig.get_instance().get('bug_description_name')
        if documentcount_collection is None:
            documentcount_collection = IRCollection(
                'bug_db_name', 'bug_documentcount_collection_name', 'r')
        res = documentcount_collection.find({term_name: term})
        # a count missing from the record stays 0
        summary = description = 0
        if res.count() > 0:
            if summary_name in res[0]:
                summary = res[0][summary_name]
            if description_name in res[0]:
                description = res[0][description_name]
        if cls.__is_cache:
            cls.__cache_document_count[term] = (summary, description)
        # pick what the caller asked for
        if field is None:
            return summary, description
        if field == summary_name:
            return summary
        if field == description_name:
            return description
        return 0
Пример #13
0
    def similarity_over_all(self):
        """Calculate similarity between this bug (summary, description)
        and the stored reports.

        The reports compared are those of the same product whose create
        timestamp is greater than this report's create timestamp minus
        search_time_span, excluding the ids in self.__exclude_report_ids
        and the dummy bug id.

        Returns:
            dict, {bug_id -> [score, summary_score, description_score, stacktrace_score]}
            (each value is whatever similarity_with returns)
        """

        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_text import IRText
        from ir_tfidf import IRTFIDF

        logger = IRLog.get_instance()
        config = IRConfig.get_instance()
        # presumably two years expressed in seconds -- confirm the unit of
        # the create timestamps
        search_time_span = 2 * 3600 * 24 * 365

        bug_id_name = config.get('bug_id_name')
        create_ts_name = config.get('bug_create_ts_name')
        product_name = config.get('bug_product_name')

        basic_collection = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'r')

        reports2scan = basic_collection.find({
            product_name: self.get_product(),
            create_ts_name: {'$gt': self.get_create_ts() - search_time_span},
            bug_id_name: {'$nin': self.__exclude_report_ids}})
        result = {}
        logger.println('Comparing with %d reports.' % (reports2scan.count()))

        # print(x) with a single argument is valid on Python 2 and 3 alike
        print(self.__summary_text)
        print(self.__description_text)

        for report in reports2scan:
            bug_id = report[bug_id_name]
            if bug_id == self.get_dummy_bug_id():
                continue
            # because we don't want to load stacktrace in case of
            #    self.__stacktrace being none, we create and fill the info
            #    of report manually
            other_report = IRReport("", "")
            other_report.__summary_tfidf, other_report.__description_tfidf = \
                    IRTFIDF.get_tfidf_of_bug(bug_id)
            # if self.__stacktrace is empty, we don't need to do this
            own_stacktrace = self.get_stacktrace()
            if own_stacktrace is not None and len(own_stacktrace) > 0:
                other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
            if other_report.__stacktrace is None:
                other_report.__stacktrace = []
            result[bug_id] = self.similarity_with(other_report)

        return result
Пример #14
0
    def get_documentcount(cls, term, field=None, documentcount_collection=None):
        """Look up in how many documents a term occurs.

        Args:
            term: str
            field: str, name of the field whose count is wanted. None asks
                for both counts.
            documentcount_collection: collection to query. It is opened
                here when None is given.

        Returns:
            if field is None: (int, int), (summary document count, description document count)
            else: int, the document count of corresponding field. 0 for a
                field that is neither the summary nor the description field.
        """

        if cls.__is_cache and term in cls.__cache_document_count:
            if field is None:
                return cls.__cache_document_count[term]
            from ir_config import IRConfig
            summary_name = IRConfig.get_instance().get('bug_summary_name')
            description_name = IRConfig.get_instance().get('bug_description_name')
            if field == summary_name:
                return cls.__cache_document_count[term][0]
            if field == description_name:
                return cls.__cache_document_count[term][1]
            return 0
        # not served from the cache: query the collection
        from ir_mongodb_helper import IRCollection
        from ir_config import IRConfig
        term_name = IRConfig.get_instance().get('bug_term_name')
        summary_name = IRConfig.get_instance().get('bug_summary_name')
        description_name = IRConfig.get_instance().get('bug_description_name')
        if documentcount_collection is None:
            documentcount_collection = IRCollection(
                'bug_db_name', 'bug_documentcount_collection_name', 'r')
        res = documentcount_collection.find({term_name: term})
        # a count missing from the record stays 0
        summary = description = 0
        if res.count() > 0:
            if summary_name in res[0]:
                summary = res[0][summary_name]
            if description_name in res[0]:
                description = res[0][description_name]
        if cls.__is_cache:
            cls.__cache_document_count[term] = (summary, description)
        # pick what the caller asked for
        if field is None:
            return summary, description
        if field == summary_name:
            return summary
        if field == description_name:
            return description
        return 0
Пример #15
0
    def get_iterator(cls, arg):
        """Query the text collection and hand back the resulting cursor.

        Args:
            arg: dict, condition the returned items must fulfill

        Returns:
            cursor
        """
        from ir_mongodb_helper import IRCollection
        # open the text collection read-only and query it right away
        return IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'r').find(arg)
Пример #16
0
    def get_iterator(cls, arg):
        """Query the text collection and hand back the resulting cursor.

        Args:
            arg: dict, condition the returned items must fulfill

        Returns:
            cursor
        """
        from ir_mongodb_helper import IRCollection
        # open the text collection read-only and query it right away
        return IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'r').find(arg)
Пример #17
0
 def get_iterator(cls, arg=None):
     """Get a cursor over the termcounts fulfilling arg.

     Args:
         arg: dict, condition. A missing or empty condition is replaced
             by {}.

     Returns:
         cursor
     """
     from ir_mongodb_helper import IRCollection
     condition = arg or {}
     return IRCollection(
         'bug_db_name', 'bug_termcount_collection_name', 'r').find(condition)
Пример #18
0
    def get_total_report_number(cls):
        """Return the number of reports, i.e. the size of the termcount
        collection.

        With caching enabled the number is read from the database only
        while no value has been remembered yet.

        Returns:
            int
        """
        if cls.__is_cache:
            remembered = cls.__total_report_number
            if remembered is not None:
                return remembered
        from ir_mongodb_helper import IRCollection
        report_number = IRCollection(
            'bug_db_name', 'bug_termcount_collection_name', 'r').count()
        if cls.__is_cache:
            cls.__total_report_number = report_number
        return report_number
Пример #19
0
    def get_total_report_number(cls):
        """Return the number of reports, i.e. the size of the termcount
        collection.

        With caching enabled the number is read from the database only
        while no value has been remembered yet.

        Returns:
            int
        """
        if cls.__is_cache:
            remembered = cls.__total_report_number
            if remembered is not None:
                return remembered
        from ir_mongodb_helper import IRCollection
        report_number = IRCollection(
            'bug_db_name', 'bug_termcount_collection_name', 'r').count()
        if cls.__is_cache:
            cls.__total_report_number = report_number
        return report_number
Пример #20
0
    def calculate_tfidf_for_report_termcount(cls, summary_termcount,
                                             description_termcount):
        """Calculate the TFIDF of the summary and description of one report.

        Both fields are calculated against the same document count
        collection and the same total number of reports.

        Args:
            summary_termcount: dict, {term -> termcount}
            description_termcount: dict, {term -> termcount}

        Returns:
            (dict, dict), (tfidf of summary, tfidf of description)
        """
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'r')
        report_total = cls.get_total_report_number()

        summary_field = IRConfig.get_instance().get('bug_summary_name')
        summary_tfidf = cls.calculate_tfidf(
            summary_termcount, summary_field, report_total, collection)

        description_field = IRConfig.get_instance().get(
            'bug_description_name')
        description_tfidf = cls.calculate_tfidf(
            description_termcount, description_field, report_total,
            collection)
        return summary_tfidf, description_tfidf
Пример #21
0
 def cache_all_data(cls):
     """Read the TFIDF of every bug into the in-memory cache.

     Enables caching, empties the cache and stores for each bug id a
     (summary TFIDF, description TFIDF) pair.
     """
     from ir_log import IRProgressBar
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection
     id_key = IRConfig.get_instance().get('bug_id_name')
     summary_key = IRConfig.get_instance().get('bug_summary_name')
     description_key = IRConfig.get_instance().get('bug_description_name')
     collection = IRCollection(
         'bug_db_name', 'bug_tfidf_collection_name', 'r')
     cls.set_is_cache(True)
     cls.__cache = {}

     def remember(record):
         pair = (record[summary_key], record[description_key])
         cls.__cache[record[id_key]] = pair

     IRProgressBar.execute_iteration_for_cursor(
         collection.find(), remember, "Caching TFIDF")
Пример #22
0
    def show_distribution_on_product_and_create_ts(cls):
        """Log, for each duplicate group, the span of the create times and
        the number of distinct products of its bugs.

        Bugs without a record in the basic collection are ignored.
        """
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        bug2group_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        basic_collection = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'r')
        bug_id_name = IRConfig.get_instance().get('bug_id_name')
        group_name = IRConfig.get_instance().get('bug_group_name')
        product_name = IRConfig.get_instance().get('bug_product_name')
        create_ts_name = IRConfig.get_instance().get('bug_create_ts_name')

        group_ids = bug2group_collection.distinct(group_name)
        progress_bar = IRProgressBar(len(group_ids), "group", False, 0, 1)
        for group_num, group_id in enumerate(group_ids, 1):
            progress_bar.set_value(group_num)
            # sentinels: any real timestamp replaces them
            earliest = 9999999999
            latest = -1000
            products = set()
            for bug in bug2group_collection.find({group_name: group_id}):
                basic = basic_collection.find({bug_id_name: bug[bug_id_name]})
                if basic.count() == 0:
                    continue
                ts = basic[0][create_ts_name]
                product = basic[0][product_name]
                if ts > latest:
                    latest = ts
                if ts < earliest:
                    earliest = ts
                products.add(product)
            message = 'ts span:%d;product number:%d' \
                    % (latest - earliest, len(products))
            IRLog.get_instance().println(message, 2)
Пример #23
0
    def test_parse_info_level0(self):
        """Parse the level0 test info and check that bug 102500 is absent
        from the duplicate collection afterwards."""
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_duplicate_group import IRDuplicateGroup

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRDuplicateGroup().parse_info_level0('../data/test/info_level0_test')

        # incomplete bugs must have been removed by the parsing
        duplicate_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        assert duplicate_collection is not None
        matches = duplicate_collection.find({'bug_id': 102500})
        assert matches.count() == 0

        IRLog.get_instance().stop_log()
Пример #24
0
    def get_bugs_in_group(cls, group_id):
        """Collect the ids of all bugs that belong to a duplicate group.

        Args:
            group_id: int

        Returns:
            [int], [bug_id]
        """

        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        collection = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        id_key = IRConfig.get_instance().get('bug_id_name')
        group_key = IRConfig.get_instance().get('bug_group_name')
        bug_ids = []
        for record in collection.find({group_key: group_id}):
            bug_ids.append(record[id_key])
        return bug_ids
Пример #25
0
    def test_parse_info_level0(self):
        """Parse the level0 test info and check that bug 102500 is absent
        from the duplicate collection afterwards."""
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_duplicate_group import IRDuplicateGroup

        IRLog.get_instance().start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRDuplicateGroup().parse_info_level0('../data/test/info_level0_test')

        # incomplete bugs must have been removed by the parsing
        duplicate_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        assert duplicate_collection is not None
        matches = duplicate_collection.find({'bug_id': 102500})
        assert matches.count() == 0

        IRLog.get_instance().stop_log()
Пример #26
0
    def cache_all_data(cls):
        """Read the TFIDF of every bug into the in-memory cache.

        Enables caching, empties the cache and stores for each bug id a
        (summary TFIDF, description TFIDF) pair.
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        id_key = IRConfig.get_instance().get('bug_id_name')
        summary_key = IRConfig.get_instance().get('bug_summary_name')
        description_key = IRConfig.get_instance().get('bug_description_name')
        collection = IRCollection(
            'bug_db_name', 'bug_tfidf_collection_name', 'r')
        cls.set_is_cache(True)
        cls.__cache = {}

        def remember(record):
            pair = (record[summary_key], record[description_key])
            cls.__cache[record[id_key]] = pair

        IRProgressBar.execute_iteration_for_cursor(
            collection.find(), remember, "Caching TFIDF")
Пример #27
0
    def get_bugs_in_group(cls, group_id):
        """Collect the ids of all bugs that belong to a duplicate group.

        Args:
            group_id: int

        Returns:
            [int], [bug_id]
        """

        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        collection = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        id_key = IRConfig.get_instance().get('bug_id_name')
        group_key = IRConfig.get_instance().get('bug_group_name')
        bug_ids = []
        for record in collection.find({group_key: group_id}):
            bug_ids.append(record[id_key])
        return bug_ids
Пример #28
0
    def get_basic_info_of_bug(cls, bug_id):
        """Get basic info of a bug from mongodb.

        Args:
            bug_id: int

        Returns:
            (int, str): (create_ts, product). (-1, '') when the bug has
            no record in the basic collection.
        """
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        create_ts_name = config.get('bug_create_ts_name')
        product_name = config.get('bug_product_name')
        basic_collection = IRCollection('bug_db_name',
                                        'bug_basic_collection_name', 'r')
        res = basic_collection.find({bug_id_name: bug_id})
        if res.count() > 0:
            # Index the cursor once and reuse the record for both fields.
            basic = res[0]
            return basic[create_ts_name], basic[product_name]
        return -1, ''
Пример #29
0
    def get_basic_info_of_bug(cls, bug_id):
        """Get basic info of a bug from mongodb.

        Args:
            bug_id: int

        Returns:
            (int, str): (create_ts, product). (-1, '') when the bug has
            no record in the basic collection.
        """
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        create_ts_name = config.get('bug_create_ts_name')
        product_name = config.get('bug_product_name')
        basic_collection = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'r')
        res = basic_collection.find({bug_id_name: bug_id})
        if res.count() > 0:
            # Index the cursor once and reuse the record for both fields.
            basic = res[0]
            return basic[create_ts_name], basic[product_name]
        return -1, ''
Пример #30
0
    def parse_dump_basic_file(cls, dump_filename=None):
        """Extract basic bug information from a mysql dump into mongodb.

        Every line of the dump is split into (bug_id, product, create_ts) by
        __extract_basic_from_dump_file_line__ and inserted into the basic
        collection, with bug_id and create_ts converted to int. An ascending
        index on the bug id is created after all lines have been inserted.

        NOTE: marked "not finished yet" by the original author.

        Args:
            dump_filename: str, filename of the dump file. If this parameter
                is not given, the filename is fetched from the config file
                (key 'bug_dump_basic_filename').
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        product_name = config.get('bug_product_name', 'product')
        create_ts_name = config.get('bug_create_ts_name', 'ts')

        collection = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'w')

        if dump_filename is None:
            dump_filename = config.get('bug_dump_basic_filename')

        def iter_for_line(line):
            # TODO here
            bug_id, product, ts = \
                cls.__extract_basic_from_dump_file_line__(line)
            collection.insert({
                bug_id_name: int(bug_id),
                product_name: product,
                create_ts_name: int(ts),
            })

        # The context manager closes the dump file even when a line fails to
        # parse or an insert raises.
        with open(dump_filename, 'r') as in_file:
            IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                                     'Parsing Dump Basic')
        collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
        collection.close()
Пример #31
0
 def batch_generate_term_count(cls):
     """Compute term counts for every bug text and store them in mongodb.

     Each record yielded by IRText.get_iterator({}) is passed through
     calculate_term_count; the resulting summary and description term
     counts are inserted, keyed by bug id, into the termcount collection,
     which is then indexed (ascending) on the bug id.
     """
     from ir_log import IRProgressBar
     from ir_text import IRText
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection

     config = IRConfig.get_instance()
     id_field = config.get('bug_id_name', 'bug_id')
     summary_field = config.get('bug_summary_name', 'summ')
     description_field = config.get('bug_description_name', 'desc')

     termcounts = IRCollection(
         'bug_db_name', 'bug_termcount_collection_name', 'w')

     def store_term_count(bug):
         summary_bow, description_bow = cls.calculate_term_count(
             bug[summary_field], bug[description_field])
         record = {
             id_field: bug[id_field],
             summary_field: summary_bow,
             description_field: description_bow,
         }
         termcounts.insert(record)

     IRProgressBar.execute_iteration_for_cursor(
         IRText.get_iterator({}), store_term_count,
         "From Text to Term Count")
     termcounts.create_index([(id_field, IRCollection.ASCENDING)])
     termcounts.close()
Пример #32
0
    def get_duplicate_group_information(cls, group_size_min, group_size_max):
        """Return the ids of duplicate groups whose size lies in a range.

        The query uses MongoDB's "$gt" / "$lt" operators, so both bounds are
        exclusive: a group is returned only when
        group_size_min < size < group_size_max.

        Args:
            group_size_min: int, exclusive lower bound on the group size.
            group_size_max: int, exclusive upper bound on the group size.

        Returns:
            [int], [group_id]
        """
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        duplicate_group_count_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_group_count_collection_name', 'r')
        config = IRConfig.get_instance()
        group_name = config.get('bug_group_name')
        group_size_name = config.get('bug_group_size')

        size_range = {"$gt": group_size_min, "$lt": group_size_max}
        result = duplicate_group_count_collection.find(
            {group_size_name: size_range})
        return [group[group_name] for group in result]
Пример #33
0
    def get_duplicate_group_information(cls, group_size_min, group_size_max):
        """Return the ids of duplicate groups whose size lies in a range.

        The query uses MongoDB's "$gt" / "$lt" operators, so both bounds are
        exclusive: a group is returned only when
        group_size_min < size < group_size_max.

        Args:
            group_size_min: int, exclusive lower bound on the group size.
            group_size_max: int, exclusive upper bound on the group size.

        Returns:
            [int], [group_id]
        """
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        duplicate_group_count_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_group_count_collection_name', 'r')
        config = IRConfig.get_instance()
        group_name = config.get('bug_group_name')
        group_size_name = config.get('bug_group_size')

        size_range = {"$gt": group_size_min, "$lt": group_size_max}
        result = duplicate_group_count_collection.find(
            {group_size_name: size_range})
        return [group[group_name] for group in result]
Пример #34
0
    def batch_generate_document_count(cls):
        """Batch calculate, per term, the number of documents containing it.

        Input is read from mongodb through IRTermCount.get_iterator({}).
        For every term one document
        {term, summary count, description count} is accumulated in memory
        and then written to the documentcount collection.

        The collection is indexed on the term field: the stored documents
        carry no bug id, so the previous index on the bug id field covered
        nothing.
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_term_count import IRTermCount

        config = IRConfig.get_instance()
        term_name = config.get('bug_term_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')

        # term -> document that will be stored for that term
        document_count = {}

        def iter_term_count(bug):
            # Summary and description are counted the same way, each into
            # its own field of the per-term document.
            for field in (summary_name, description_name):
                for term in bug[field]:
                    if term not in document_count:
                        document_count[term] = {term_name: term,
                                                summary_name: 0,
                                                description_name: 0}
                    document_count[term][field] += 1

        IRProgressBar.execute_iteration_for_cursor(
            IRTermCount.get_iterator({}), iter_term_count,
            "Counting Document Count")

        # Write to db
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'w')

        def write_to_mongo(term):
            documentcount_collection.insert(document_count[term])

        IRProgressBar.execute_iteration_for_dict(document_count,
                                                 write_to_mongo,
                                                 "Write to database")
        documentcount_collection.create_index(
            [(term_name, IRCollection.ASCENDING)])
        documentcount_collection.close()
Пример #35
0
    def get_group_of_bug(cls, bug_id):
        """Return the duplicate group a bug belongs to.

        Args:
            bug_id: int, id of the bug to look up.

        Returns:
            int, group_id of the first matching record, or None when the
            duplicate collection holds no record for bug_id.
        """
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        bug2group = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        config = IRConfig.get_instance()
        id_field = config.get('bug_id_name')
        group_field = config.get('bug_group_name')

        matches = bug2group.find({id_field: bug_id})
        if matches.count() != 0:
            return matches[0][group_field]
        return None
Пример #36
0
 def cache_all_data(cls):
     """Load every document count from mongodb into memory.

     Switches the cache flag on and stores, for each record of the
     documentcount collection, term -> (summary count, description count)
     in the class-level cache. A count missing from a record is stored
     as 0.
     """
     from ir_log import IRProgressBar
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection

     config = IRConfig.get_instance()
     summary_field = config.get('bug_summary_name')
     description_field = config.get('bug_description_name')
     term_field = config.get('bug_term_name')

     cls.__is_cache = True
     counts = IRCollection(
         'bug_db_name', 'bug_documentcount_collection_name', 'r')

     def remember(record):
         cls.__cache_document_count[record[term_field]] = (
             record.get(summary_field, 0),
             record.get(description_field, 0))

     IRProgressBar.execute_iteration_for_cursor(
         counts.find({}), remember, "Caching Document Count")
Пример #37
0
 def cache_all_data(cls):
     """Load the text data of every bug into memory.

     Switches caching on, resets both class-level caches and fills them
     from the text collection: bug id -> (summary, description) and
     bug id -> stacktrace.
     """
     from ir_log import IRProgressBar
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection

     config = IRConfig.get_instance()
     id_field = config.get('bug_id_name')
     summary_field = config.get('bug_summary_name')
     description_field = config.get('bug_description_name')
     stacktrace_field = config.get('bug_stacktrace_name')

     cls.set_is_cache(True)
     texts = IRCollection('bug_db_name', 'bug_text_collection_name', 'r')
     cls.__cache_summary_description = {}
     cls.__cache_stacktrace = {}

     def remember(bug):
         key = bug[id_field]
         cls.__cache_summary_description[key] = (
             bug[summary_field], bug[description_field])
         cls.__cache_stacktrace[key] = bug[stacktrace_field]

     IRProgressBar.execute_iteration_for_cursor(
         texts.find(), remember, 'Caching Text Data')
     texts.close()
Пример #38
0
    def parse_dump_file(cls, dump_filename=None):
        """Extract bug text from a mysql dump and insert it into mongodb.

        Every line of the dump is split into (bug_id, summary, description)
        and inserted into the text collection, with bug_id converted to int.
        An ascending index on the bug id is created after all lines have
        been inserted.

        Args:
            dump_filename: str, filename of the dump file. If this parameter
                is not given, the filename is fetched from the config file
                (key 'bug_dump_text_filename').
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        # get key name
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        summary_name = config.get('bug_summary_name', 'summ')
        description_name = config.get('bug_description_name', 'desc')
        # collection
        collection = IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'w')

        if dump_filename is None:
            dump_filename = config.get('bug_dump_text_filename')

        def iter_for_line(line):
            bug_id, summary, description = \
                cls.__extract_summary_and_description_from_dump_file_line(line)
            collection.insert({
                bug_id_name: int(bug_id),
                summary_name: summary,
                description_name: description,
            })

        # The context manager closes the dump file even when a line fails to
        # parse or an insert raises.
        with open(dump_filename, 'r') as in_file:
            IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                                     'Parsing Dump')
        collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
        collection.close()
Пример #39
0
    def get_group_of_bug(cls, bug_id):
        """Return the duplicate group a bug belongs to.

        Args:
            bug_id: int, id of the bug to look up.

        Returns:
            int, group_id of the first matching record, or None when the
            duplicate collection holds no record for bug_id.
        """
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        bug2group = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        config = IRConfig.get_instance()
        id_field = config.get('bug_id_name')
        group_field = config.get('bug_group_name')

        matches = bug2group.find({id_field: bug_id})
        if matches.count() != 0:
            return matches[0][group_field]
        return None
Пример #40
0
    def show_dict_compare(cls, dicta, dictb, field_name='summ', log_level=1):
        """Log a term-by-term comparison of two TFIDF dicts.

        One row is logged per term appearing in either dict, sorted by the
        product of the two TFIDF values in descending order. A term missing
        from a dict (or a dict that is None) contributes a TFIDF of 0.0.

        Args:
            dicta: dict or None, TFIDF (term -> value)
            dictb: dict or None, TFIDF (term -> value)
            field_name: str, field passed to IRDocumentCount.get_documentcount
                (summary or description).
            log_level: int, level used when logging the rows.
        """
        from ir_log import IRLog
        from ir_mongodb_helper import IRCollection
        from ir_document_count import IRDocumentCount

        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'r')

        # Union of the terms of both dicts.
        keys = set()
        for tfidf in (dicta, dictb):
            if tfidf is not None:
                for key in tfidf:
                    keys.add(key)

        # Rows: (term, product, tfidf a, tfidf b, document count, idf)
        product = []
        for key in keys:
            tfidf_a = 0.0
            tfidf_b = 0.0
            if dicta is not None and key in dicta:
                tfidf_a = dicta[key]
            if dictb is not None and key in dictb:
                tfidf_b = dictb[key]
            documentcount = IRDocumentCount.get_documentcount(
                key, field_name, documentcount_collection)
            idf = cls.get_idf(documentcount)
            product.append(
                (key, tfidf_a * tfidf_b, tfidf_a, tfidf_b, documentcount, idf))
        # key= replaces the Python-2-only cmp= argument; the resulting order
        # is the same (stable, descending by product).
        product.sort(key=lambda row: row[1], reverse=True)

        # print it out
        log = IRLog.get_instance()
        # NOTE(review): the header is logged without log_level, unlike the
        # rows below -- confirm this is intended.
        log.println('%16s\t%8s\t%8s\t%8s\t%8s\t%8s'
                    % ('term', 'tfidf a', 'tfidf b', 'doccount', 'idf', 'sim'))
        for item in product:
            log.println('%16s\t%8f\t%8f\t%8d\t%8f\t%8f'
                        % (item[0], item[2], item[3], item[4], item[5],
                           item[1]), log_level)
Пример #41
0
    def batch_generate_document_count(cls):
        """Batch calculate, per term, the number of documents containing it.

        Input is read from mongodb through IRTermCount.get_iterator({}).
        For every term one document
        {term, summary count, description count} is accumulated in memory
        and then written to the documentcount collection.

        The collection is indexed on the term field: the stored documents
        carry no bug id, so the previous index on the bug id field covered
        nothing.
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_term_count import IRTermCount

        config = IRConfig.get_instance()
        term_name = config.get('bug_term_name')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')

        # term -> document that will be stored for that term
        document_count = {}

        def iter_term_count(bug):
            # Summary and description are counted the same way, each into
            # its own field of the per-term document.
            for field in (summary_name, description_name):
                for term in bug[field]:
                    if term not in document_count:
                        document_count[term] = {
                            term_name: term,
                            summary_name: 0,
                            description_name: 0
                        }
                    document_count[term][field] += 1

        IRProgressBar.execute_iteration_for_cursor(
            IRTermCount.get_iterator({}), iter_term_count,
            "Counting Document Count")

        # Write to db
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'w')

        def write_to_mongo(term):
            documentcount_collection.insert(document_count[term])

        IRProgressBar.execute_iteration_for_dict(document_count,
                                                 write_to_mongo,
                                                 "Write to database")
        documentcount_collection.create_index([(term_name,
                                                IRCollection.ASCENDING)])
        documentcount_collection.close()
Пример #42
0
    def show_distribution_on_product_and_create_ts(cls):
        """Log, for each duplicate group, its time span and product count.

        For every distinct group id in the duplicate collection the bugs of
        that group are looked up in the basic collection (bugs without a
        basic record are skipped). One line holding the difference between
        the largest and smallest creation timestamp and the number of
        distinct products is logged per group.
        """
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        bug2group = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        basics = IRCollection('bug_db_name', 'bug_basic_collection_name', 'r')
        config = IRConfig.get_instance()
        id_field = config.get('bug_id_name')
        group_field = config.get('bug_group_name')
        product_field = config.get('bug_product_name')
        ts_field = config.get('bug_create_ts_name')

        group_ids = bug2group.distinct(group_field)
        progress_bar = IRProgressBar(len(group_ids), "group", False, 0, 1)
        for position, group_id in enumerate(group_ids, 1):
            progress_bar.set_value(position)
            # Sentinels: left untouched when no bug of the group has a
            # basic record.
            earliest, latest = 9999999999, -1000
            products = set()
            for bug in bug2group.find({group_field: group_id}):
                basic = basics.find({id_field: bug[id_field]})
                if basic.count() == 0:
                    continue
                ts = basic[0][ts_field]
                product = basic[0][product_field]
                latest = max(latest, ts)
                earliest = min(earliest, ts)
                products.add(product)
            message = 'ts span:%d;product number:%d' % (
                latest - earliest, len(products))
            IRLog.get_instance().println(message, 2)
Пример #43
0
    def test_parse_dump_dup_file(self):
        """Parse the test duplicate dump and log the bugs found in group 1."""
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_duplicate_group import IRDuplicateGroup

        log = IRLog.get_instance()
        log.start_log()
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        groups = IRDuplicateGroup()
        groups.parse_dump_dup_file('../data/test/dump_dup_file_test')

        # The duplicate collection must be openable after parsing.
        collection = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'r')
        assert collection is not None

        members = ' '.join(map(str, groups.get_bugs_in_group(1)))
        header = 'In dump-dup_file_test: Group %d has bugs: ' % (1)
        log.println(header + members)
        log.stop_log()
Пример #44
0
    def parse_dump_basic_file(cls, dump_filename=None):
        """Extract basic bug information from a mysql dump into mongodb.

        Every line of the dump is split into (bug_id, product, create_ts) by
        __extract_basic_from_dump_file_line__ and inserted into the basic
        collection, with bug_id and create_ts converted to int. An ascending
        index on the bug id is created after all lines have been inserted.

        NOTE: marked "not finished yet" by the original author.

        Args:
            dump_filename: str, filename of the dump file. If this parameter
                is not given, the filename is fetched from the config file
                (key 'bug_dump_basic_filename').
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        product_name = config.get('bug_product_name', 'product')
        create_ts_name = config.get('bug_create_ts_name', 'ts')

        collection = IRCollection('bug_db_name', 'bug_basic_collection_name',
                                  'w')

        if dump_filename is None:
            dump_filename = config.get('bug_dump_basic_filename')

        def iter_for_line(line):
            # TODO here
            bug_id, product, ts = cls.__extract_basic_from_dump_file_line__(
                line)

            collection.insert({
                bug_id_name: int(bug_id),
                product_name: product,
                create_ts_name: int(ts)
            })

        # The context manager closes the dump file even when a line fails to
        # parse or an insert raises.
        with open(dump_filename, 'r') as in_file:
            IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                                     'Parsing Dump Basic')
        collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
        collection.close()
Пример #45
0
    def parse_dump_file(cls, dump_filename=None):
        """Extract bug text from a mysql dump and insert it into mongodb.

        Every line of the dump is split into (bug_id, summary, description)
        and inserted into the text collection, with bug_id converted to int.
        An ascending index on the bug id is created after all lines have
        been inserted.

        Args:
            dump_filename: str, filename of the dump file. If this parameter
                is not given, the filename is fetched from the config file
                (key 'bug_dump_text_filename').
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        # get key name
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        summary_name = config.get('bug_summary_name', 'summ')
        description_name = config.get('bug_description_name', 'desc')
        # collection
        collection = IRCollection('bug_db_name', 'bug_text_collection_name',
                                  'w')

        if dump_filename is None:
            dump_filename = config.get('bug_dump_text_filename')

        def iter_for_line(line):
            bug_id, summary, description = \
                    cls.__extract_summary_and_description_from_dump_file_line(line)
            collection.insert({
                bug_id_name: int(bug_id),
                summary_name: summary,
                description_name: description
            })

        # The context manager closes the dump file even when a line fails to
        # parse or an insert raises.
        with open(dump_filename, 'r') as in_file:
            IRProgressBar.execute_iteration_for_file(in_file, iter_for_line,
                                                     'Parsing Dump')
        collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
        collection.close()
Пример #46
0
    def batch_generate_tfidf(cls):
        """Compute the TFIDF of every bug and store it in mongodb.

        IRDocumentCount.cache_all_data() is called first. Then, for each
        term-count record, calculate_tfidf is applied to the summary and
        the description (using the total record count and the configured
        'tfidf_algorithm') and the results are inserted, keyed by bug id,
        into the tfidf collection, which is indexed on the bug id.
        """
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_document_count import IRDocumentCount
        from ir_term_count import IRTermCount

        config = IRConfig.get_instance()
        id_field = config.get('bug_id_name')
        summary_field = config.get('bug_summary_name')
        description_field = config.get('bug_description_name')
        algorithm = config.get('tfidf_algorithm')

        IRDocumentCount.cache_all_data()
        tfidf_store = IRCollection(
            'bug_db_name', 'bug_tfidf_collection_name', 'w')
        term_counts = IRTermCount.get_iterator()
        total_bugs = term_counts.count()

        def store_tfidf(bug):
            # Summary first, then description, each against its own field.
            vectors = [
                cls.calculate_tfidf(bug[field], field, total_bugs, None,
                                    algorithm)
                for field in (summary_field, description_field)
            ]
            tfidf_store.insert({
                id_field: bug[id_field],
                summary_field: vectors[0],
                description_field: vectors[1]
            })

        IRProgressBar.execute_iteration_for_cursor(
            term_counts, store_tfidf, "Calculating TFIDF")
        tfidf_store.create_index([(id_field, IRCollection.ASCENDING)])
        tfidf_store.close()
Пример #47
0
    def __store_to_mongodb(cls, bug2group, group2bug):
        """Store duplicate group information into Mongodb.

        Writes one {bug id, group id} document per bug into the duplicate
        collection (indexed on each of the two fields), then one
        {group id, group size} document per group into the duplicate group
        count collection (indexed on the group id).

        Args:
            bug2group: dict, {bug_id -> group_id}
            group2bug: dict, {group_id -> [bug_id]}
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        group_name = config.get('bug_group_name')
        group_size_name = config.get('bug_group_size')

        # bug -> group collection
        duplicate_collection = IRCollection('bug_db_name',
                                            'bug_duplicate_collection_name',
                                            'w')

        def iter_bug_group(bug):
            duplicate_collection.insert({
                bug_id_name: bug,
                group_name: bug2group[bug]
            })

        IRProgressBar.execute_iteration_for_dict(bug2group, iter_bug_group,
                                                 "Store to db")
        duplicate_collection.create_index([(bug_id_name,
                                            IRCollection.ASCENDING)])
        duplicate_collection.create_index([(group_name,
                                            IRCollection.ASCENDING)])
        duplicate_collection.close()

        # duplicate group size collection
        duplicate_group_count_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_group_count_collection_name', 'w')

        def iter_group_bug(group):
            duplicate_group_count_collection.insert({
                group_name: group,
                group_size_name: len(group2bug[group])
            })

        IRProgressBar.execute_iteration_for_dict(group2bug, iter_group_bug,
                                                 'Store Index')
        duplicate_group_count_collection.create_index([
            (group_name, IRCollection.ASCENDING)
        ])
        duplicate_group_count_collection.close()
Пример #48
0
    def __store_to_mongodb(cls, bug2group, group2bug):
        """Store duplicate group information into Mongodb.

        Writes one {bug id, group id} document per bug into the duplicate
        collection (indexed on each of the two fields), then one
        {group id, group size} document per group into the duplicate group
        count collection (indexed on the group id).

        Args:
            bug2group: dict, {bug_id -> group_id}
            group2bug: dict, {group_id -> [bug_id]}
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name')
        group_name = config.get('bug_group_name')
        group_size_name = config.get('bug_group_size')

        # bug -> group collection
        duplicate_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_collection_name', 'w')

        def iter_bug_group(bug):
            duplicate_collection.insert({bug_id_name: bug,
                                         group_name: bug2group[bug]})

        IRProgressBar.execute_iteration_for_dict(bug2group, iter_bug_group,
                                                 "Store to db")
        duplicate_collection.create_index(
            [(bug_id_name, IRCollection.ASCENDING)])
        duplicate_collection.create_index(
            [(group_name, IRCollection.ASCENDING)])
        duplicate_collection.close()

        # duplicate group size collection
        duplicate_group_count_collection = IRCollection(
            'bug_db_name', 'bug_duplicate_group_count_collection_name',
            'w')

        def iter_group_bug(group):
            duplicate_group_count_collection.insert(
                {group_name: group,
                 group_size_name: len(group2bug[group])})

        IRProgressBar.execute_iteration_for_dict(group2bug, iter_group_bug,
                                                 'Store Index')
        duplicate_group_count_collection.create_index(
            [(group_name, IRCollection.ASCENDING)])
        duplicate_group_count_collection.close()
Пример #49
0
    def test_ir_collection(self):
        """Write a document through IRCollection and verify it via pymongo.

        Opens the helper collection in 'w' mode to insert one document,
        reopens it in 'r' mode, checks with a direct pymongo connection that
        the document is stored, then cleans the collection.
        """
        import pymongo
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection

        config = IRConfig.get_instance()
        config.load('../data/test/bug_test.cfg')
        db_key = 'bug_db_name'
        collection_key = 'bug_mongodb_helper_collection_name'

        # create empty collection and store one document
        writer = IRCollection(db_key, collection_key, 'w')
        assert None != writer
        writer.insert({'abc': 'abc'})
        writer.close()

        # access existing collection
        reader = IRCollection(db_key, collection_key, 'r')
        assert None != reader
        reader.close()

        # test result through a direct pymongo connection
        connection = pymongo.Connection(config.get('db_host'),
                                        config.get_int('db_port'))
        database = connection[config.get(db_key)]
        stored = database[config.get(collection_key)]
        assert stored.find({'abc': 'abc'}).count() > 0

        # leave the collection clean for later runs
        cleaner = IRCollection(db_key, collection_key, 'w')
        cleaner.clean()
        cleaner.close()
Пример #50
0
    def parse_info_level1(cls, info_level1_filename=None):
        """Extract text from an info level1 file and insert it into mongo db.

        Every line of the file is decoded with
        __extract_information_from_info_level1_line. A report is skipped
        when its resolution is None or "INCOMPLETE", or when is_drop_report
        returns true for its post-processed description. Every other report
        is inserted into two collections:
            - the text collection: bug id, summary, description, stacktrace
            - the basic collection: bug id, creation timestamp, product
        After the whole file has been processed, indexes are created on both
        collections.

        The input file and both collections are released even if parsing or
        indexing raises.

        Args:
            info_level1_filename: str, Filename of info level1. If this
                parameter is not given, bug_info_level1_filename will be
                fetched from config file.
        """

        import pymongo
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_gnome_st_tools import IRSTTools
        # NOTE(review): pymongo and IRSTTools are not referenced in this
        # method; the imports are kept in case they are relied on for side
        # effects -- confirm before removing.

        # get config (field names used as document keys)
        config = IRConfig.get_instance()
        bug_id_name = config.get('bug_id_name', 'bug_id')
        summary_name = config.get('bug_summary_name', 'summ')
        description_name = config.get('bug_description_name', 'desc')
        stacktrace_name = config.get('bug_stacktrace_name')
        create_ts_name = config.get('bug_create_ts_name')
        product_name = config.get('bug_product_name')
        community_name = config.get('community')
        if info_level1_filename is None:
            info_level1_filename = config.get('bug_info_level1_filename')

        # collections
        collection = IRCollection('bug_db_name', 'bug_text_collection_name',
                                  'w')
        collection_basic = IRCollection('bug_db_name',
                                        'bug_basic_collection_name', 'w')

        def func_each_line(line):
            bug_id, summary, description, resolution, create_ts, product = \
                    cls.__extract_information_from_info_level1_line(line)

            # only reports with a resolution other than INCOMPLETE are stored
            if resolution is None or resolution == "INCOMPLETE":
                return

            # post process description
            description, stacktrace = \
                    cls.extract_raw_description_info(description,
                                                     community_name)
            # drop the report if is_drop_report rejects its description
            if cls.is_drop_report(description):
                IRLog.get_instance().println('Drop report#=%d because it '\
                        'contains unrecognizable stacktrace.' % bug_id, 3)
                return

            collection.insert({
                bug_id_name: bug_id,
                summary_name: summary,
                description_name: description,
                stacktrace_name: stacktrace
            })
            collection_basic.insert({
                bug_id_name: bug_id,
                create_ts_name: create_ts,
                product_name: product
            })

        try:
            # load and insert text file; `with` closes it on any exit path
            with open(info_level1_filename, 'r') as in_file:
                IRProgressBar.execute_iteration_for_file(
                    in_file, func_each_line, "Parsing Infolevel 1")
            collection.create_index([(bug_id_name, IRCollection.ASCENDING)])
            collection_basic.create_index([
                (bug_id_name, IRCollection.ASCENDING),
                (create_ts_name, IRCollection.ASCENDING),
                (product_name, IRCollection.ASCENDING)])
        finally:
            # close both collections, in the original order, even when the
            # first close() itself fails
            try:
                collection.close()
            finally:
                collection_basic.close()
Пример #51
0
#!/usr/bin/python2.7
# Script: count the bug reports per product stored in the basic bug
# collection and walk the products from most to least frequent.
#
# Command line arguments, as read below:
#   argv[1]  passed to IRConfig.load()
#   argv[2]  optional prefix string (default '')
#   argv[3]  optional "surfix" string (default '')
#   argv[4]  optional integer threshold compared against the per-product
#            report count (default 100)

if __name__ == '__main__':
    import sys
  
    from ir_config import IRConfig
    from ir_text import IRText
    from ir_mongodb_helper import IRCollection

    config = IRConfig.get_instance()
    config.load(sys.argv[1])
    # name of the document field holding the product of a bug report
    product_name = config.get('bug_product_name')
    
    # product -> number of bug reports seen for that product
    products = dict()

    basic = IRCollection('bug_db_name', 'bug_basic_collection_name', 'r')
    # find(None): no filter is given, every document is iterated
    cursor = basic.find(None)
    for bug in cursor:
        product = bug[product_name]
        if product not in products:
            products[product] = 0
        products[product] += 1

    # Python 2 only: items() returns a list and list.sort accepts cmp=.
    # The (product, count) pairs are ordered by count, largest first.
    product_list = products.items()
    product_list.sort(cmp=lambda x,y:cmp(x[1],y[1]), reverse=True)

    # optional positional arguments with their defaults
    prefix = '' if sys.argv.__len__() < 3 else sys.argv[2]
    surfix = '' if sys.argv.__len__() < 4 else sys.argv[3]
    threshold = 100 if sys.argv.__len__() <5 else int(sys.argv[4])
    # NOTE(review): the excerpt is cut off after the condition below; the
    # handling of products under the threshold is not visible here.
    for product in product_list:
        if product[1] < threshold:
Пример #52
0
    def parse_info_level1(cls, info_level1_filename=None):
        """Extract text and insert into mongo db.

        Reads the info level1 file line by line. Each line is split into
        (bug_id, summary, description, resolution, create_ts, product) by
        __extract_information_from_info_level1_line. Lines whose resolution
        is None or "INCOMPLETE" are ignored. For the others the description
        is post-processed by extract_raw_description_info, which also yields
        the stacktrace; reports rejected by is_drop_report are logged and
        ignored. Accepted reports are inserted into the text collection and
        the basic collection, and both collections are indexed at the end.

        The file handle and the two collections are always closed, also
        when an exception interrupts the import.

        Args:
            info_level1_filename: str, Filename of info level1. If this
                parameter is not given, bug_info_level1_filename will be
                fetched from config file.
        """

        import pymongo
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_gnome_st_tools import IRSTTools
        # NOTE(review): pymongo and IRSTTools look unused in this method;
        # left in place until it is confirmed that nothing depends on the
        # imports happening here.

        # get config
        get_config = IRConfig.get_instance().get
        bug_id_name = get_config('bug_id_name', 'bug_id')
        summary_name = get_config('bug_summary_name', 'summ')
        description_name = get_config('bug_description_name', 'desc')
        stacktrace_name = get_config('bug_stacktrace_name')
        create_ts_name = get_config('bug_create_ts_name')
        product_name = get_config('bug_product_name')
        community_name = get_config('community')
        # fall back to the configured file when no filename is passed
        if info_level1_filename is None:
            info_level1_filename = get_config('bug_info_level1_filename')

        # collections
        collection = IRCollection(
            'bug_db_name', 'bug_text_collection_name', 'w')
        collection_basic = IRCollection(
            'bug_db_name', 'bug_basic_collection_name', 'w')

        def func_each_line(line):
            bug_id, summary, description, resolution, create_ts, product = \
                    cls.__extract_information_from_info_level1_line(line)

            if resolution is not None and resolution != "INCOMPLETE":
                # post process description
                description, stacktrace = \
                        cls.extract_raw_description_info(description,
                                                         community_name)
                # drop the report when is_drop_report flags the description
                if cls.is_drop_report(description):
                    IRLog.get_instance().println('Drop report#=%d because it '\
                            'contains unrecognizable stacktrace.' % bug_id, 3)
                    return

                collection.insert({bug_id_name: bug_id,
                                   summary_name: summary,
                                   description_name: description,
                                   stacktrace_name: stacktrace})
                collection_basic.insert({bug_id_name: bug_id,
                                         create_ts_name: create_ts,
                                         product_name: product})

        try:
            # load and insert text file
            in_file = open(info_level1_filename, 'r')
            try:
                IRProgressBar.execute_iteration_for_file(
                    in_file, func_each_line, "Parsing Infolevel 1")
            finally:
                # release the file handle even if a line fails to parse
                in_file.close()

            ascending = IRCollection.ASCENDING
            collection.create_index([(bug_id_name, ascending)])
            collection_basic.create_index([(bug_id_name, ascending),
                                           (create_ts_name, ascending),
                                           (product_name, ascending)])
        finally:
            # same close order as before; the inner finally guarantees the
            # second collection is closed when the first close() raises
            try:
                collection.close()
            finally:
                collection_basic.close()