def test_tfidf_asm_similarity(self):
    """Check ``IRTFIDF.tfidf_asm_similarity`` on three small term vectors.

    Logging is started and the test configuration is loaded first.  The
    expectations are:

    * a vector compared with a superset of its terms scores 1.0;
    * a vector sharing one of its two terms with the other scores 0.5;
    * passing ``['ie']`` and ``100`` as the extra arguments lowers the
      score of the first comparison.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF

    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    browsers = {'firefox': 1, 'chrome': 1}
    browsers_plus_ie = {'firefox': 1, 'chrome': 1, 'ie': 1}
    firefox_and_systems = {'firefox': 1, 'windows': 1, 'linux': 1}
    tolerance = 0.0001

    superset_score = IRTFIDF.tfidf_asm_similarity(browsers, browsers_plus_ie)
    assert abs(1.0 - superset_score) < tolerance

    half_overlap_score = IRTFIDF.tfidf_asm_similarity(
        browsers, firefox_and_systems)
    assert abs(0.5 - half_overlap_score) < tolerance

    # NOTE(review): the 4th argument looks like a list of penalty terms and
    # the 5th like its weight -- confirm against IRTFIDF.
    unpenalised_score = IRTFIDF.tfidf_asm_similarity(
        browsers, browsers_plus_ie)
    penalised_score = IRTFIDF.tfidf_asm_similarity(
        browsers, browsers_plus_ie, None, ['ie'], 100)
    assert unpenalised_score > penalised_score
def similarity_with(self, other_report):
    """Score how similar this report is to ``other_report``.

    The summaries and descriptions are compared through ``IRTFIDF`` with
    the algorithm named by the ``tfidf_algorithm`` config entry
    (``'tfidf'`` or ``'bidf'``), and the stacktraces through
    ``IRSTTools.compare_stackinfo``.  The three partial similarities are
    then combined by the scoring method chosen by the ``scoring_strategy``
    config entry (``'weighted'``, ``'heuristic'`` or ``'distweighted'``).

    Args:
        other_report: the report to compare against; it must provide
            ``get_summary_and_description_tfidf()`` and
            ``get_stacktrace()``.

    Returns:
        [float, float, float, float]:
        [total score, summary, description, stacktrace]

    Raises:
        AssertionError: if ``tfidf_algorithm`` or ``scoring_strategy``
            holds an unsupported value.
    """
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    from ir_gnome_st_tools import IRSTTools

    config = IRConfig.get_instance()

    summary_tfidf_a, description_tfidf_a = \
        self.get_summary_and_description_tfidf()
    summary_tfidf_b, description_tfidf_b = \
        other_report.get_summary_and_description_tfidf()
    tfidf_algorithm = config.get('tfidf_algorithm')
    stacktrace_algorithm = config.get('stacktrace_algorithm')

    if tfidf_algorithm == 'tfidf':
        summary_similarity = IRTFIDF.tfidf_similarity(
            summary_tfidf_a, summary_tfidf_b)
        description_similarity = IRTFIDF.tfidf_similarity(
            description_tfidf_a, description_tfidf_b)
    elif tfidf_algorithm == 'bidf':
        summary_squared_length, description_squared_length = \
            self.get_summary_and_description_tfidf_squared_length()
        summary_similarity = IRTFIDF.tfidf_asm_similarity(
            summary_tfidf_a, summary_tfidf_b, summary_squared_length)
        # Only the description comparison receives the penalty terms.
        description_similarity = IRTFIDF.tfidf_asm_similarity(
            description_tfidf_a, description_tfidf_b,
            description_squared_length, self.__penalty_terms)
    else:
        # Fail here with a clear message instead of an UnboundLocalError
        # further down when the similarities are first used.
        raise AssertionError('invalid tfidf algorithm')

    # Only this report's own stacktrace is inspected: when it is missing,
    # empty, or its first entry is empty, the stacktrace component is fixed
    # at 1.0 and the other report's stacktrace is not consulted.
    if self.__stacktrace is None or \
            len(self.__stacktrace) == 0 or \
            len(self.__stacktrace[0]) == 0:
        stacktrace_similarity = 1.0
    else:
        stacktrace_similarity = IRSTTools.compare_stackinfo(
            self.get_stacktrace(), other_report.get_stacktrace(),
            stacktrace_algorithm)

    # 'heuristic' is handed to the lookup as its second argument
    # (presumably the fallback when the entry is absent -- confirm
    # against IRConfig.get).
    scoring_strategy = config.get('scoring_strategy', 'heuristic')
    if scoring_strategy == 'weighted':
        score = self.__weighted_scoring(
            summary_similarity, description_similarity,
            stacktrace_similarity)
    elif scoring_strategy == 'heuristic':
        score = self.__heuristic_scoring(
            summary_similarity, description_similarity,
            stacktrace_similarity)
    elif scoring_strategy == 'distweighted':
        score = self.__distweighted_scoring(
            summary_similarity, description_similarity,
            stacktrace_similarity)
    else:
        # Raised explicitly so the check also holds when assertions are
        # disabled with -O.
        raise AssertionError('invalid scoring strategy')

    return [score, summary_similarity, description_similarity,
            stacktrace_similarity]
def similarity_with(self, other_report):
    """Score how similar this report is to ``other_report``.

    The summaries and descriptions are compared through ``IRTFIDF`` with
    the algorithm named by the ``tfidf_algorithm`` config entry
    (``'tfidf'`` or ``'bidf'``), and the stacktraces through
    ``IRSTTools.compare_stackinfo``.  The three partial similarities are
    then combined by the scoring method chosen by the ``scoring_strategy``
    config entry (``'weighted'``, ``'heuristic'`` or ``'distweighted'``).

    Args:
        other_report: the report to compare against; it must provide
            ``get_summary_and_description_tfidf()`` and
            ``get_stacktrace()``.

    Returns:
        [float, float, float, float]:
        [total score, summary, description, stacktrace]

    Raises:
        AssertionError: if ``tfidf_algorithm`` or ``scoring_strategy``
            holds an unsupported value.
    """
    from ir_config import IRConfig
    from ir_tfidf import IRTFIDF
    from ir_gnome_st_tools import IRSTTools

    config = IRConfig.get_instance()

    summary_tfidf_a, description_tfidf_a = \
        self.get_summary_and_description_tfidf()
    summary_tfidf_b, description_tfidf_b = \
        other_report.get_summary_and_description_tfidf()
    tfidf_algorithm = config.get('tfidf_algorithm')
    stacktrace_algorithm = config.get('stacktrace_algorithm')

    if tfidf_algorithm == 'tfidf':
        summary_similarity = IRTFIDF.tfidf_similarity(
            summary_tfidf_a, summary_tfidf_b)
        description_similarity = IRTFIDF.tfidf_similarity(
            description_tfidf_a, description_tfidf_b)
    elif tfidf_algorithm == 'bidf':
        summary_squared_length, description_squared_length = \
            self.get_summary_and_description_tfidf_squared_length()
        summary_similarity = IRTFIDF.tfidf_asm_similarity(
            summary_tfidf_a, summary_tfidf_b, summary_squared_length)
        # Only the description comparison receives the penalty terms.
        description_similarity = IRTFIDF.tfidf_asm_similarity(
            description_tfidf_a, description_tfidf_b,
            description_squared_length, self.__penalty_terms)
    else:
        # Fail here with a clear message instead of an UnboundLocalError
        # further down when the similarities are first used.
        raise AssertionError('invalid tfidf algorithm')

    # Only this report's own stacktrace is inspected: when it is missing,
    # empty, or its first entry is empty, the stacktrace component is fixed
    # at 1.0 and the other report's stacktrace is not consulted.
    if self.__stacktrace is None or \
            len(self.__stacktrace) == 0 or \
            len(self.__stacktrace[0]) == 0:
        stacktrace_similarity = 1.0
    else:
        stacktrace_similarity = IRSTTools.compare_stackinfo(
            self.get_stacktrace(), other_report.get_stacktrace(),
            stacktrace_algorithm)

    # 'heuristic' is handed to the lookup as its second argument
    # (presumably the fallback when the entry is absent -- confirm
    # against IRConfig.get).
    scoring_strategy = config.get('scoring_strategy', 'heuristic')
    if scoring_strategy == 'weighted':
        score = self.__weighted_scoring(
            summary_similarity, description_similarity,
            stacktrace_similarity)
    elif scoring_strategy == 'heuristic':
        score = self.__heuristic_scoring(
            summary_similarity, description_similarity,
            stacktrace_similarity)
    elif scoring_strategy == 'distweighted':
        score = self.__distweighted_scoring(
            summary_similarity, description_similarity,
            stacktrace_similarity)
    else:
        # Raised explicitly so the check also holds when assertions are
        # disabled with -O.
        raise AssertionError('invalid scoring strategy')

    return [score, summary_similarity, description_similarity,
            stacktrace_similarity]