def similarity_over_all(self):
    """Calculate similarity between this bug and every candidate report.

    Candidates are read from the basic bug collection and must:
      * belong to the same product as this report,
      * have a create timestamp strictly greater than this report's create
        timestamp minus the search time span, and
      * not be listed in this report's excluded report ids.
    The report whose id equals this report's dummy bug id is skipped.

    Returns:
        dict, {bug_id -> [score, summary_score, description_score,
                stacktrace_score]}, i.e. whatever similarity_with() returns
        for each candidate.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF

    logger = IRLog.get_instance()
    config = IRConfig.get_instance()

    # 2 * 365 days expressed in seconds (assumes create timestamps are
    # stored in seconds -- TODO confirm against the collection schema).
    search_time_span = 2 * 3600 * 24 * 365
    bug_id_name = config.get('bug_id_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')

    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    reports2scan = basic_collection.find({
        product_name: self.get_product(),
        create_ts_name: {'$gt': self.get_create_ts() - search_time_span},
        bug_id_name: {'$nin': self.__exclude_report_ids},
    })

    logger.println('Comparing with %d reports.' % (reports2scan.count()))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(self.__summary_text)
    print(self.__description_text)

    # Both values are the same for every candidate, so compute them once.
    dummy_bug_id = self.get_dummy_bug_id()
    compare_stacktrace = bool(self.get_stacktrace())

    result = {}
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == dummy_bug_id:
            continue
        # Build the other report by hand so that its stacktrace is only
        # loaded when this report has a stacktrace to compare it with.
        other_report = IRReport("", "")
        other_report.__summary_tfidf, other_report.__description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(bug_id)
        if compare_stacktrace:
            stacktrace = IRText.get_stacktrace_of_bug(bug_id)
            # A missing stacktrace is treated as an empty one.
            other_report.__stacktrace = \
                [] if stacktrace is None else stacktrace
        result[bug_id] = self.similarity_with(other_report)
    return result
def similarity_over_all(self):
    """Calculate similarity between this bug and every candidate report.

    Candidates are read from the basic bug collection and must:
      * belong to the same product as this report,
      * have a create timestamp strictly greater than this report's create
        timestamp minus the search time span, and
      * not be listed in this report's excluded report ids.
    The report whose id equals this report's dummy bug id is skipped.

    Returns:
        dict, {bug_id -> [score, summary_score, description_score,
                stacktrace_score]}, i.e. whatever similarity_with() returns
        for each candidate.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF

    logger = IRLog.get_instance()
    config = IRConfig.get_instance()

    # 2 * 365 days expressed in seconds (assumes create timestamps are
    # stored in seconds -- TODO confirm against the collection schema).
    search_time_span = 2 * 3600 * 24 * 365
    bug_id_name = config.get('bug_id_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')

    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    query = {
        product_name: self.get_product(),
        create_ts_name: {'$gt': self.get_create_ts() - search_time_span},
        bug_id_name: {'$nin': self.__exclude_report_ids},
    }
    reports2scan = basic_collection.find(query)

    logger.println('Comparing with %d reports.' % (reports2scan.count()))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(self.__summary_text)
    print(self.__description_text)

    # Both values are the same for every candidate, so compute them once.
    dummy_bug_id = self.get_dummy_bug_id()
    compare_stacktrace = bool(self.get_stacktrace())

    result = {}
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == dummy_bug_id:
            continue
        # Build the other report by hand so that its stacktrace is only
        # loaded when this report has a stacktrace to compare it with.
        other_report = IRReport("", "")
        other_report.__summary_tfidf, other_report.__description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(bug_id)
        if compare_stacktrace:
            stacktrace = IRText.get_stacktrace_of_bug(bug_id)
            # A missing stacktrace is treated as an empty one.
            other_report.__stacktrace = \
                [] if stacktrace is None else stacktrace
        result[bug_id] = self.similarity_with(other_report)
    return result
def test_show_dict_compare(self):
    """Run IRTFIDF.show_dict_compare on the tf-idf of bugs 100000 and 100200.

    Loads the test configuration, fetches the summary and description
    tf-idf of both bugs, and invokes show_dict_compare four times:
    summary and description of 100000 against 100200, then each of them
    against itself. Calls are made both with and without the optional
    field-name argument. The test makes no assertions; it passes when none
    of the calls raises.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF

    logger = IRLog.get_instance()
    logger.start_log()
    try:
        config = IRConfig.get_instance()
        config.load('../data/test/bug_test.cfg')
        summary_name = config.get('bug_summary_name')
        description_name = config.get('bug_description_name')

        summary_a, description_a = IRTFIDF.get_tfidf_of_bug(100000)
        summary_b, description_b = IRTFIDF.get_tfidf_of_bug(100200)

        logger.println('Summary 100000 vs 100200')
        IRTFIDF.show_dict_compare(summary_a, summary_b, summary_name)
        logger.println('Description 100000 vs 100200')
        IRTFIDF.show_dict_compare(description_a, description_b)
        logger.println('Summary 100000 vs 100000')
        IRTFIDF.show_dict_compare(summary_a, summary_a)
        logger.println('Description 100000 vs 100000')
        IRTFIDF.show_dict_compare(description_a, description_a,
                                  description_name)
    finally:
        # Pair start_log() with stop_log() even when a call above fails.
        logger.stop_log()
def test_get_tfidf_of_bug(self):
    """Fetch and log the summary/description tf-idf of bug 100000.

    Loads the test configuration, calls IRTFIDF.get_tfidf_of_bug(100000)
    and writes both results to the log. The test makes no assertions; it
    passes when none of the calls raises.

    NOTE(review): the project modules must be importable; an earlier
    revision appended '../bin/' to sys.path for this -- confirm how the
    test runner sets up the path.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF

    logger = IRLog.get_instance()
    logger.start_log()
    try:
        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary, description = IRTFIDF.get_tfidf_of_bug(100000)
        logger.println('Summary tfidf: %s' % (str(summary)))
        logger.println('Description tfidf: %s' % (str(description)))
    finally:
        # Stop the log even if loading the config or the lookup fails.
        logger.stop_log()
def get_summary_and_description_tfidf(self):
    """Return the summary and description tf-idf of this report.

    With a bug id set, the values come from IRTFIDF.get_tfidf_of_bug();
    they are served from, and stored into, the instance attributes only
    when caching is allowed. Without a bug id, the instance attributes are
    returned, after calling
    __update_summary_and_description_tfidf_from_termcount() if either of
    them is still None.

    Returns:
        list, [summary_tfidf, description_tfidf]
    """
    if self.__bug_id is not None:
        if self.__allow_cache:
            # Serve the cached pair only when both halves are present.
            if not (self.__summary_tfidf is None or
                    self.__description_tfidf is None):
                return [self.__summary_tfidf, self.__description_tfidf]
        from ir_tfidf import IRTFIDF
        summary_vec, description_vec = \
            IRTFIDF.get_tfidf_of_bug(self.__bug_id)
        if self.__allow_cache:
            self.__summary_tfidf = summary_vec
            self.__description_tfidf = description_vec
        return [summary_vec, description_vec]

    # No bug id: the values live on the instance only.
    if self.__summary_tfidf is None or self.__description_tfidf is None:
        self.__update_summary_and_description_tfidf_from_termcount()
    return [self.__summary_tfidf, self.__description_tfidf]