def __generate_single_bug(self, bug_id, drop_rate):
    """Build a report for a stored bug, optionally with part of its text dropped.

    Args:
        bug_id: int, id of the original bug whose data is loaded.
        drop_rate: float, 0.0 for not drop, 1.0 for totally drop. Rates that
            are not greater than 0.001 leave the text untouched.

    Returns:
        IRReport holding the (possibly reduced) summary and description
        together with the original stacktrace, bug id, creation timestamp
        and product.
    """
    from ir_text import IRText
    from ir_term_count import IRTermCount
    from ir_report import IRReport

    # Load the original text and basic info of the bug.
    summary, description = IRText.get_summary_and_description_of_bug(bug_id)
    create_ts, product = IRText.get_basic_info_of_bug(bug_id)

    should_drop = drop_rate > 0.001
    if should_drop:
        summary, description = IRTermCount.create_incomplete_report(
            summary, description, drop_rate)
        print(description)

    report = IRReport(summary, description)
    stacktrace = IRText.get_stacktrace_of_bug(bug_id)
    report.set_stacktrace(stacktrace)
    report.set_dummy_bug_id(bug_id)
    report.set_basic_info(create_ts, product)
    return report
def test_compare_stackinfo(self):
    """Log stacktrace comparison scores of the first bug against every bug.

    Loads the test configuration, parses '../data/test/stacktrace_test',
    then compares the first stored stacktrace with each stored stacktrace
    using both the 'weight' and the 'max' method and logs the two scores.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper
    from ir_gnome_st_tools import IRSTTools
    from ir_text import IRText
    from random import randint
    import pymongo

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRText.parse_info_level1('../data/test/stacktrace_test')

    connection = IRMongodbHelper.get_instance().get_connection()
    database = connection[IRConfig.get_instance().get('bug_db_name')]
    assert database is not None
    collection = database[
        IRConfig.get_instance().get('bug_text_collection_name')]
    assert collection is not None

    bugs = collection.find()
    total = collection.count()
    reference = bugs[0]["stacktrace"]
    for index in range(total):
        candidate = bugs[index]["stacktrace"]
        by_weight = IRSTTools.compare_stackinfo(reference, candidate, 'weight')
        by_max = IRSTTools.compare_stackinfo(reference, candidate, 'max')
        log.println('Weight: %f, Max: %f' % (by_weight, by_max))
    log.stop_log()
def test_filter(self):
    """Run IRSTTools.filter over the description of every stored bug.

    Loads the test configuration and parses
    '../data/test/info_level1_test' first so the text collection is filled.
    The filter results are not checked; the test only exercises the call.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper
    from ir_gnome_st_tools import IRSTTools
    from ir_text import IRText
    import pymongo

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    IRText.parse_info_level1('../data/test/info_level1_test')

    connection = IRMongodbHelper.get_instance().get_connection()
    database = connection[IRConfig.get_instance().get('bug_db_name')]
    assert database is not None
    collection = database[
        IRConfig.get_instance().get('bug_text_collection_name')]
    assert collection is not None

    # Maybe a bug here:
    # The test of filter (originally) depends on parse_info_level1
    # But parse_info_level1 seems to invoke filter...
    for bug in collection.find():
        # TODO: it's not correct. no stacktrace in desc
        desc, stack = IRSTTools.filter(bug["desc"])
    log.stop_log()
def __generate_single_bug(self, bug_id, drop_rate):
    """Create an IRReport from a stored bug, dropping text when requested.

    Args:
        bug_id: int, original bug id.
        drop_rate: float, 0.0 for not drop, 1.0 for totally drop. Nothing
            is dropped unless the rate exceeds 0.001.

    Returns:
        IRReport with the summary and description (reduced when dropping
        applied), and the stacktrace, dummy bug id, creation timestamp and
        product taken from the original bug.
    """
    from ir_text import IRText
    from ir_term_count import IRTermCount
    from ir_report import IRReport

    # Fetch the stored text and basic info for the bug.
    summary, description = IRText.get_summary_and_description_of_bug(bug_id)
    create_ts, product = IRText.get_basic_info_of_bug(bug_id)

    if drop_rate > 0.001:
        incomplete = IRTermCount.create_incomplete_report(
            summary, description, drop_rate)
        summary, description = incomplete
        print(description)

    generated = IRReport(summary, description)
    generated.set_stacktrace(IRText.get_stacktrace_of_bug(bug_id))
    generated.set_dummy_bug_id(bug_id)
    generated.set_basic_info(create_ts, product)
    return generated
def test_parse_info_level1(self):
    """Parse the level-1 info test file and check what lands in the database.

    Verifies the number of stored reports, the summary of bug 100000, and
    that no stored summary matches the regular expression used below.
    """
    from ir_log import IRLog
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    assert IRConfig.get_instance() is not None
    IRText.parse_info_level1('../data/test/info_level1_test')
    log.stop_log()

    connection = IRMongodbHelper.get_instance().get_connection()
    database = connection[IRConfig.get_instance().get('bug_db_name')]
    assert database is not None
    collection = database[
        IRConfig.get_instance().get('bug_text_collection_name')]
    assert collection is not None

    # Original author's note: in the test data, we have 1000 in total.
    # Within, 40 have no resolution, 154 are incomplete.
    assert collection.count() == 833

    first_match = collection.find({'bug_id': 100000})[0]
    assert first_match["summ"] == 'gnome is full of bugs ! (100000 currently)'

    pattern = '(>)|(<)|(")|(&apo)s|(&)'
    matches = collection.find({"summ": {'$regex': pattern}})
    assert matches.count() == 0
def test_parse_info_level1(self):
    """Check the text collection produced by IRText.parse_info_level1.

    After parsing '../data/test/info_level1_test' the test asserts the
    stored report count, the summary of bug 100000, and that the summary
    regular expression below matches no stored report.
    """
    from ir_log import IRLog
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper

    IRLog.get_instance().start_log()
    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')
    assert IRConfig.get_instance() is not None
    IRText.parse_info_level1('../data/test/info_level1_test')
    IRLog.get_instance().stop_log()

    connection = IRMongodbHelper.get_instance().get_connection()
    database = connection[IRConfig.get_instance().get('bug_db_name')]
    assert database is not None
    text_collection = database[
        IRConfig.get_instance().get('bug_text_collection_name')]
    assert text_collection is not None

    # Original author's note: in the test data, we have 1000 in total.
    # Within, 40 have no resolution, 154 are incomplete.
    assert text_collection.count() == 833

    expected_summary = 'gnome is full of bugs ! (100000 currently)'
    assert expected_summary == \
        text_collection.find({'bug_id': 100000})[0]["summ"]

    query = {"summ": {'$regex': '(>)|(<)|(")|(&apo)s|(&)'}}
    assert text_collection.find(query).count() == 0
def server_cache(msg, res):
    """Cache text, TF-IDF and document-count data, logging before and after.

    Args:
        msg: not used by this handler.
        res: not used by this handler.

    Returns:
        SIGNAL_CONTINUE.
    """
    from ir_log import IRLog
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount

    log = IRLog.get_instance()
    log.println('Server is caching data')
    # Same order as before: text first, then TF-IDF, then document counts.
    for cacher in (IRText, IRTFIDF, IRDocumentCount):
        cacher.cache_all_data()
    log.println('Server cached data')
    return SIGNAL_CONTINUE
def do_remove_bad_reports(cls, config_file):
    """Remove reports that IRText flags as droppable from the bug collections.

    Loads the configuration, walks over every stored report text, collects
    the ids of reports whose description IRText.is_drop_report accepts, and
    removes those ids from the text, tfidf and duplicate collections.

    Args:
        config_file: str, path of the configuration file to load.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    import ir_mongodb_helper
    from ir_text import IRText

    config = IRConfig.get_instance()
    config.load(config_file)
    bug_id_name = config.get('bug_id_name')
    bug_description_name = config.get('bug_description_name')
    text_cursor = IRText.get_iterator(None)
    remove_ids = []

    def iter_text(item):
        # Remember (and log) every report whose description is droppable.
        if IRText.is_drop_report(item[bug_description_name]):
            remove_ids.append(item[bug_id_name])
            IRLog.get_instance().println(
                'Remove report#=%d' % item[bug_id_name], 3)

    IRProgressBar.execute_iteration_for_cursor(text_cursor, iter_text)

    def remove_from_collection(collection_cfg_name):
        # Delete the collected ids from one collection and always close it.
        collection = ir_mongodb_helper.IRCollection(
            'bug_db_name', collection_cfg_name, 'a')
        try:
            # Query on the configured id field, the same field the ids were
            # read from, instead of a hard-coded 'bug_id'.
            collection.remove({bug_id_name: {'$in': remove_ids}})
        finally:
            collection.close()

    # Remove from all collections that store per-report data.
    for collection_cfg_name in ('bug_text_collection_name',
                                'bug_tfidf_collection_name',
                                'bug_duplicate_collection_name'):
        remove_from_collection(collection_cfg_name)
def test_get_stacktrace_text_of_bug(self):
    """Fetch the stacktrace text of bug 104400 and write it to the log."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_text import IRText

    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')

    text = IRText.get_stacktrace_text_of_bug(104400)
    message = 'stacktrace_text: %s' % (text)
    IRLog.get_instance().println(message)
def similarity_over_all(self):
    """Calculate similarity between this bug and the stored candidate bugs.

    Candidates come from the basic collection: same product as this report,
    created later than two years (2 * 365 days) before this report, and not
    in the excluded report ids. The report whose id equals this report's
    dummy bug id is skipped.

    Returns:
        dict, {bug_id -> result of similarity_with}; per the original
        docstring that result is [score, summary_score, description_score,
        stacktrace_score].
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF

    search_time_span = 2 * 3600 * 24 * 365
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')

    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    reports2scan = basic_collection.find({
        product_name: self.get_product(),
        create_ts_name: {'$gt': self.get_create_ts() - search_time_span},
        bug_id_name: {'$nin': self.__exclude_report_ids},
    })

    IRLog.get_instance().println(
        'Comparing with %d reports.' % (reports2scan.count()))
    print(self.__summary_text)
    print(self.__description_text)

    # Whether this report has a stacktrace does not change while scanning,
    # so decide once instead of calling get_stacktrace() twice per candidate
    # (it may fetch from the database on every call).
    own_stacktrace = self.get_stacktrace()
    compare_stacktrace = \
        own_stacktrace is not None and len(own_stacktrace) > 0

    result = {}
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == self.get_dummy_bug_id():
            continue
        # Because we don't want to load the stacktrace when this report has
        # none, we create the other report and fill in its data manually.
        other_report = IRReport("", "")
        other_report.__summary_tfidf, other_report.__description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(bug_id)
        if compare_stacktrace:
            other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
        if other_report.__stacktrace is None:
            other_report.__stacktrace = []
        result[bug_id] = self.similarity_with(other_report)
    return result
def batch_generate_term_count(cls):
    """Compute term counts for every stored bug text and store them.

    Each bug's summary and description are passed to
    cls.calculate_term_count and the two results are inserted into the
    term-count collection under the bug's id. An ascending index on the
    bug id field is created once all bugs are processed.
    """
    from ir_log import IRProgressBar
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    # Field names, with the defaults used when the config lacks them.
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name', 'bug_id')
    summary_name = config.get('bug_summary_name', 'summ')
    description_name = config.get('bug_description_name', 'desc')

    termcount_collection = IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'w')

    def store_term_count(bug):
        # Count terms of one bug and persist them keyed by its id.
        summary_bow, description_bow = cls.calculate_term_count(
            bug[summary_name], bug[description_name])
        document = {
            bug_id_name: bug[bug_id_name],
            summary_name: summary_bow,
            description_name: description_bow,
        }
        termcount_collection.insert(document)

    text_cursor = IRText.get_iterator({})
    IRProgressBar.execute_iteration_for_cursor(
        text_cursor, store_term_count, "From Text to Term Count")

    index_spec = [(bug_id_name, IRCollection.ASCENDING)]
    termcount_collection.create_index(index_spec)
    termcount_collection.close()
def test_get_summary_and_description_of_bug(self):
    """Fetch summary and description of bug 100000 and log both."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_text import IRText

    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    texts = IRText.get_summary_and_description_of_bug(100000)
    summary, description = texts

    log = IRLog.get_instance()
    log.println('summary: %s' % (summary))
    log.println('description: %s' % (description))
def test_parse_dump_file(self):
    """Parse the dump test file and check that 1000 reports were stored."""
    from ir_log import IRLog
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRMongodbHelper

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    assert IRConfig.get_instance() is not None
    IRText.parse_dump_file('../data/test/dump_file_test')
    log.stop_log()

    connection = IRMongodbHelper.get_instance().get_connection()
    database = connection[IRConfig.get_instance().get('bug_db_name')]
    assert database is not None
    collection = database[
        IRConfig.get_instance().get('bug_text_collection_name')]
    assert collection is not None
    assert collection.count() == 1000
def test_get_summary_and_description_of_bug(self):
    """Log the summary and the description stored for bug 100000."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_text import IRText

    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')

    summary, description = IRText.get_summary_and_description_of_bug(100000)
    summary_message = 'summary: %s' % (summary)
    IRLog.get_instance().println(summary_message)
    description_message = 'description: %s' % (description)
    IRLog.get_instance().println(description_message)
def similarity_over_all(self):
    """Calculate similarity between this bug and the stored candidate bugs.

    Candidates are read from the basic collection: reports of the same
    product, created later than two years (2 * 365 days) before this
    report, whose ids are not in the excluded report ids. The report whose
    id equals this report's dummy bug id is skipped.

    Returns:
        dict, {bug_id -> result of similarity_with}; per the original
        docstring that result is [score, summary_score, description_score,
        stacktrace_score].
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection
    from ir_text import IRText
    from ir_tfidf import IRTFIDF

    search_time_span = 2 * 3600 * 24 * 365
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name')
    create_ts_name = config.get('bug_create_ts_name')
    product_name = config.get('bug_product_name')

    basic_collection = IRCollection(
        'bug_db_name', 'bug_basic_collection_name', 'r')
    query = {
        product_name: self.get_product(),
        create_ts_name: {'$gt': self.get_create_ts() - search_time_span},
        bug_id_name: {'$nin': self.__exclude_report_ids},
    }
    reports2scan = basic_collection.find(query)

    IRLog.get_instance().println(
        'Comparing with %d reports.' % (reports2scan.count()))
    print(self.__summary_text)
    print(self.__description_text)

    # The presence of this report's own stacktrace is loop-invariant, so it
    # is checked once here rather than via two get_stacktrace() calls per
    # candidate (each call may fetch from the database).
    own_stacktrace = self.get_stacktrace()
    compare_stacktrace = \
        own_stacktrace is not None and len(own_stacktrace) > 0

    result = {}
    for report in reports2scan:
        bug_id = report[bug_id_name]
        if bug_id == self.get_dummy_bug_id():
            continue
        # Because we don't want to load the stacktrace when this report has
        # none, we create the other report and fill in its data manually.
        other_report = IRReport("", "")
        other_report.__summary_tfidf, other_report.__description_tfidf = \
            IRTFIDF.get_tfidf_of_bug(bug_id)
        if compare_stacktrace:
            other_report.__stacktrace = IRText.get_stacktrace_of_bug(bug_id)
        if other_report.__stacktrace is None:
            other_report.__stacktrace = []
        result[bug_id] = self.similarity_with(other_report)
    return result
def get_stacktrace(self):
    """Return the stacktrace of this report.

    Without a bug id the locally stored stacktrace is returned as is.
    With a bug id, a stored stacktrace is reused when caching is allowed;
    otherwise it is fetched through IRText.get_stacktrace_of_bug and, when
    caching is allowed, stored for later calls.
    """
    has_bug_id = self.__bug_id is not None
    if not has_bug_id:
        return self.__stacktrace

    if self.__allow_cache and self.__stacktrace is not None:
        return self.__stacktrace

    # Imported only when a fetch is really needed.
    from ir_text import IRText
    fetched = IRText.get_stacktrace_of_bug(self.__bug_id)
    if self.__allow_cache:
        self.__stacktrace = fetched
    return fetched
def get_basic_info(self):
    """Return the creation timestamp and product of this report.

    Without a bug id the locally stored values are returned. With a bug
    id, stored values are reused when caching is allowed and both are set;
    otherwise they are fetched through IRText.get_basic_info_of_bug and,
    when caching is allowed, stored for later calls.

    Returns:
        tuple, (create_ts, product)
    """
    if self.__bug_id is None:
        return self.__create_ts, self.__product

    cached = self.__allow_cache and \
        self.__create_ts is not None and \
        self.__product is not None
    if cached:
        return self.__create_ts, self.__product

    # Imported only when a fetch is really needed.
    from ir_text import IRText
    create_ts, product = IRText.get_basic_info_of_bug(self.__bug_id)
    if self.__allow_cache:
        self.__create_ts = create_ts
        self.__product = product
    return create_ts, product
def get_summary_and_description_text(self):
    """Return the summary text and description text of this report.

    Without a bug id the locally stored texts are returned. With a bug id,
    stored texts are reused when caching is allowed and both are set;
    otherwise they are fetched through
    IRText.get_summary_and_description_of_bug and, when caching is
    allowed, stored for later calls.

    Returns:
        [summary, description] as a list when stored texts are returned,
        (summary, description) as a tuple when they were just fetched.
    """
    if self.__bug_id is None:
        return [self.__summary_text, self.__description_text]

    cached = self.__allow_cache and \
        self.__summary_text is not None and \
        self.__description_text is not None
    if cached:
        return [self.__summary_text, self.__description_text]

    # Imported only when a fetch is really needed.
    from ir_text import IRText
    summary, description = \
        IRText.get_summary_and_description_of_bug(self.__bug_id)
    if self.__allow_cache:
        self.__summary_text = summary
        self.__description_text = description
    return summary, description
def do_test_over_file(self, filename):
    """Do test over the file and log averaged precision/recall figures.

    Every line of the file is turned into a report with
    IRReport.from_string. The report's similar and duplicate candidates
    are evaluated against its real duplicate group, and the averaged
    results are logged as two comma-separated lines (header and values).
    Cases whose real duplicate group is empty are not counted.

    Args:
        filename: str, the input file which generated by
            generate_incomplete_test_file.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_duplicate_group import IRDuplicateGroup
    from ir_text import IRText
    from ir_term_count import IRTermCount
    from ir_tfidf import IRTFIDF
    from ir_report import IRReport
    from ir_document_count import IRDocumentCount

    IRText.cache_all_data()
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()

    remove_self_bug_id = IRConfig.get_instance().get_bool(
        'remove_self_bug_id', True)
    sim_tot_precision = 0.0
    sim_tot_recall = 0.0
    sim_bi_tot_recall = 0.0
    sim_tot_size = 0
    dup_tot_precision = 0.0
    dup_tot_recall = 0.0
    dup_bi_tot_recall = 0.0
    dup_num = 0
    test_num = 0

    # The with-block closes the file even when a test case raises.
    with open(filename, 'r') as infile:
        for line in infile:
            IRLog.get_instance().println('----test----')
            test_num += 1
            # NOTE(review): the original called line.strip() and discarded
            # the result, so from_string has always received the raw line
            # (newline included). That is kept; confirm whether stripping
            # was intended before changing it.
            new_report = IRReport.from_string(line)
            # Only needed by the disabled debug output below.
            ori_report = IRReport(new_report.get_dummy_bug_id())
            #IRLog.get_instance().println('Summary')
            #IRTermCount.show_dict_compare(ori_report.get_summary_termcount(),
            #                              new_report.get_summary_termcount())
            #IRLog.get_instance().println('Description')
            #IRTermCount.show_dict_compare(ori_report.get_description_termcount(),
            #                              new_report.get_description_termcount())

            # do test for single
            similarities, duplicates = \
                new_report.similarities_and_duplicates()
            sim_ids = [sim[0] for sim in similarities]
            dup_ids = [dup[0] for dup in duplicates]
            IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
            IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))

            # evaluate sim
            sim_hit, sim_nothit, real_duplicates = \
                IRDuplicateGroup.is_in_same_duplicate_group(
                    new_report.get_dummy_bug_id(), sim_ids,
                    remove_self_bug_id)
            # some group contain only one
            if len(real_duplicates) == 0:
                test_num -= 1
                continue

            precision, recall = self.__report_result(
                new_report.get_dummy_bug_id(), sim_hit, sim_nothit,
                real_duplicates)
            sim_tot_precision += precision
            sim_tot_recall += recall
            sim_tot_size += len(sim_ids)
            sim_bi_tot_recall += 1 if recall > 0.0 else 0

            # evaluate dup
            if len(dup_ids) > 0:
                dup_num += 1
                dup_hit, dup_nothit, real_duplicates = \
                    IRDuplicateGroup.is_in_same_duplicate_group(
                        new_report.get_dummy_bug_id(), dup_ids,
                        remove_self_bug_id)
                precision, recall = self.__report_result(
                    new_report.get_dummy_bug_id(), dup_hit, dup_nothit,
                    real_duplicates)
                dup_tot_precision += precision
                dup_tot_recall += recall
                dup_bi_tot_recall += 1 if recall > 0.0 else 0

    # general conclusion
    if dup_num == 0:
        dup_num = 1.0
    # An empty file, or one where every case was skipped, leaves test_num at
    # zero; divide by 1 then so the averages are 0.0 instead of raising
    # ZeroDivisionError. The logged '#cases' value stays the real count.
    case_divisor = test_num if test_num > 0 else 1

    header = ['#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',
              '#dup', 'dup pre', 'dup rec', 'dup birec']
    values = [str(test_num),
              str(sim_tot_precision / case_divisor),
              str(sim_tot_recall / case_divisor),
              str(sim_bi_tot_recall / case_divisor),
              str(float(sim_tot_size) / case_divisor),
              str(dup_num),
              str(dup_tot_precision / dup_num),
              str(dup_tot_recall / dup_num),
              str(dup_bi_tot_recall / dup_num)]
    IRLog.get_instance().println(','.join(header))
    IRLog.get_instance().println(','.join(values))
def test_cache_all_data(self):
    """Load the test configuration and run IRText.cache_all_data."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_text import IRText

    config = IRConfig.get_instance()
    config.load('../data/test/bug_test.cfg')
    IRText.cache_all_data()
def do_test_over_file(self, filename):
    """Do test over the file and log averaged precision/recall figures.

    Each line of the file is parsed with IRReport.from_string. The parsed
    report's similar and duplicate candidates are evaluated against its
    real duplicate group, and the averaged results are logged as two
    comma-separated lines (header and values). Cases whose real duplicate
    group is empty are not counted.

    Args:
        filename: str, the input file which generated by
            generate_incomplete_test_file.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_duplicate_group import IRDuplicateGroup
    from ir_text import IRText
    from ir_term_count import IRTermCount
    from ir_tfidf import IRTFIDF
    from ir_report import IRReport
    from ir_document_count import IRDocumentCount

    IRText.cache_all_data()
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()

    remove_self_bug_id = IRConfig.get_instance().get_bool(
        'remove_self_bug_id', True)
    sim_tot_precision = 0.0
    sim_tot_recall = 0.0
    sim_bi_tot_recall = 0.0
    sim_tot_size = 0
    dup_tot_precision = 0.0
    dup_tot_recall = 0.0
    dup_bi_tot_recall = 0.0
    dup_num = 0
    test_num = 0

    # The with-block closes the file even when a test case raises.
    with open(filename, 'r') as infile:
        for line in infile:
            IRLog.get_instance().println('----test----')
            test_num += 1
            # NOTE(review): the original called line.strip() and discarded
            # the result, so from_string has always received the raw line
            # (newline included). That is kept; confirm whether stripping
            # was intended before changing it.
            new_report = IRReport.from_string(line)
            # Only needed by the disabled debug output below.
            ori_report = IRReport(new_report.get_dummy_bug_id())
            #IRLog.get_instance().println('Summary')
            #IRTermCount.show_dict_compare(ori_report.get_summary_termcount(),
            #                              new_report.get_summary_termcount())
            #IRLog.get_instance().println('Description')
            #IRTermCount.show_dict_compare(ori_report.get_description_termcount(),
            #                              new_report.get_description_termcount())

            # do test for single
            similarities, duplicates = \
                new_report.similarities_and_duplicates()
            sim_ids = [sim[0] for sim in similarities]
            dup_ids = [dup[0] for dup in duplicates]
            IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
            IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))

            # evaluate sim
            sim_hit, sim_nothit, real_duplicates = \
                IRDuplicateGroup.is_in_same_duplicate_group(
                    new_report.get_dummy_bug_id(), sim_ids,
                    remove_self_bug_id)
            # some group contain only one
            if len(real_duplicates) == 0:
                test_num -= 1
                continue

            precision, recall = self.__report_result(
                new_report.get_dummy_bug_id(), sim_hit, sim_nothit,
                real_duplicates)
            sim_tot_precision += precision
            sim_tot_recall += recall
            sim_tot_size += len(sim_ids)
            sim_bi_tot_recall += 1 if recall > 0.0 else 0

            # evaluate dup
            if len(dup_ids) > 0:
                dup_num += 1
                dup_hit, dup_nothit, real_duplicates = \
                    IRDuplicateGroup.is_in_same_duplicate_group(
                        new_report.get_dummy_bug_id(), dup_ids,
                        remove_self_bug_id)
                precision, recall = self.__report_result(
                    new_report.get_dummy_bug_id(), dup_hit, dup_nothit,
                    real_duplicates)
                dup_tot_precision += precision
                dup_tot_recall += recall
                dup_bi_tot_recall += 1 if recall > 0.0 else 0

    # general conclusion
    if dup_num == 0:
        dup_num = 1.0
    # An empty file, or one where every case was skipped, leaves test_num at
    # zero; divide by 1 then so the averages are 0.0 instead of raising
    # ZeroDivisionError. The logged '#cases' value stays the real count.
    case_divisor = test_num if test_num > 0 else 1

    header = ['#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',
              '#dup', 'dup pre', 'dup rec', 'dup birec']
    values = [str(test_num),
              str(sim_tot_precision / case_divisor),
              str(sim_tot_recall / case_divisor),
              str(sim_bi_tot_recall / case_divisor),
              str(float(sim_tot_size) / case_divisor),
              str(dup_num),
              str(dup_tot_precision / dup_num),
              str(dup_tot_recall / dup_num),
              str(dup_bi_tot_recall / dup_num)]
    IRLog.get_instance().println(','.join(header))
    IRLog.get_instance().println(','.join(values))
def iter_text(item):
    """Record the id of a report whose description is flagged as droppable.

    Relies on names from the enclosing scope: IRText, IRLog, remove_ids,
    bug_id_name and bug_description_name.

    Args:
        item: mapping indexed by the bug id and description field names.
    """
    if not IRText.is_drop_report(item[bug_description_name]):
        return
    remove_ids.append(item[bug_id_name])
    message = 'Remove report#=%d' % item[bug_id_name]
    IRLog.get_instance().println(message, 3)