def __init__(self, id, dispatcher):
    """Set up the session thread.

    Args:
        id: identifier kept for this session.
        dispatcher: object kept for later use by the session.
    """
    from ir_report import IRReport

    threading.Thread.__init__(self)
    self.__id = id
    # At most ten messages may wait in the queue at a time.
    inbox = Queue.Queue(maxsize=10)
    self.__msg_queue = inbox
    # The session starts from a report with empty summary and description.
    blank_report = IRReport('', '')
    self.__report = blank_report
    self.__dispatcher = dispatcher
def test_top_n_similarity_over_all(self):
    """Log the result of top_n_similarity_over_all(10) for bug 100000."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport

    logger = IRLog.get_instance()
    logger.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    top_similar = IRReport(100000).top_n_similarity_over_all(10)
    message = 'Bugs with top similarities with bug %d: %s' % (
        100000, str(top_similar))
    logger.println(message)

    logger.stop_log()
def test_binary_search_less(self):
    """Check binary_search_less on a descending list and on an empty list."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport

    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    report = IRReport('', '')

    # Values 10, 9, ..., 1 in descending order.
    descending = range(10, 0, -1)
    # (expected index, searched value) pairs, checked in this order.
    for expected, target in ((8, 3), (0, 11), (10, 1)):
        assert expected == report.binary_search_less(
            descending, lambda x: x, target)

    nothing = []
    assert -1 == report.binary_search_less(nothing, lambda x: x, 1)
def test_get_report_difference(self):
    """Diff two in-memory reports and check the differing terms."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport
    from ir_recommender import IRRecommender

    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    new_report = IRReport('apple for summary', 'linux description')
    sim_report = IRReport('apple of ghost crashed', 'description linux wow')
    difference = IRRecommender.get_report_difference(new_report, sim_report)
    (diff_sum, diff_desc) = difference

    log = IRLog.get_instance()
    log.println('New summary: %s' % (new_report.get_summary_text()))
    log.println('Sim summary: %s' % (sim_report.get_summary_text()))
    log.println('New description: %s' % (new_report.get_description_text()))
    log.println('Sim description: %s' % (sim_report.get_description_text()))
    log.println('Diff of summary: %s' % (diff_sum))
    log.println('Diff of description: %s' % (diff_desc))

    assert diff_sum == {'ghost', 'crash'}
    assert diff_desc == {'wow'}
def __generate_single_bug(self, bug_id, drop_rate):
    """Build a report from a stored bug, optionally with part of its text dropped.

    Args:
        bug_id: int, original bug id.
        drop_rate: float, 0.0 for not drop, 1.0 for totally drop.

    Returns:
        IRReport
    """
    from ir_text import IRText
    from ir_term_count import IRTermCount
    from ir_report import IRReport

    # Text and basic information of the stored bug.
    summary, description = IRText.get_summary_and_description_of_bug(bug_id)
    create_ts, product = IRText.get_basic_info_of_bug(bug_id)

    # Rates at or below 0.001 leave the text as it is.
    should_drop = drop_rate > 0.001
    if should_drop:
        incomplete = IRTermCount.create_incomplete_report(
            summary, description, drop_rate)
        summary, description = incomplete
        print(description)

    new_report = IRReport(summary, description)
    stacktrace = IRText.get_stacktrace_of_bug(bug_id)
    new_report.set_stacktrace(stacktrace)
    new_report.set_dummy_bug_id(bug_id)
    new_report.set_basic_info(create_ts, product)
    return new_report
def test_similarities_and_duplicates(self):
    """Log the similar and duplicate reports returned for bug 100000."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport

    def joined_ids(items):
        # The first element of every item is written out, comma separated.
        return ','.join([str(item[0]) for item in items])

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    report = IRReport(100000)
    similarities, duplicates = report.similarities_and_duplicates()

    log.println('Report %d' % (100000))
    log.println('%d Similar Reports: %s'
                % (len(similarities), joined_ids(similarities)))
    log.println('%d Duplicate Reports: %s'
                % (len(duplicates), joined_ids(duplicates)))
    log.stop_log()
def test_similarities_and_duplicates(self):
    """Log the similar and duplicate reports returned for bug 100000."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport

    def joined_ids(items):
        # The first element of every item is written out, comma separated.
        return ','.join([str(item[0]) for item in items])

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    report = IRReport(100000)
    similarities, duplicates = report.similarities_and_duplicates()

    log.println('Report %d' % (100000))
    log.println('%d Similar Reports: %s'
                % (len(similarities), joined_ids(similarities)))
    log.println('%d Duplicate Reports: %s'
                % (len(duplicates), joined_ids(duplicates)))
    log.stop_log()
def set_report_basic_info(report, msg):
    """Parse msg into a new report and carry state over from report.

    Args:
        report: existing report; its stacktrace, penalty terms, skip terms
            and excluded report ids are copied onto the new report.
        msg: str, serialized report; it is stripped and handed to
            IRReport.from_string.

    Returns:
        The newly parsed report with the copied values applied.
    """
    from ir_report import IRReport

    parsed = IRReport.from_string(msg.strip())

    stacktrace = report.get_stacktrace()
    parsed.set_stacktrace(stacktrace)

    penalty_terms = report.get_penalty_terms()
    parsed.set_penalty_terms(penalty_terms)

    skip_terms = report.get_skip_terms()
    parsed.set_skip_terms(skip_terms)

    excluded_ids = report.get_exclude_report_ids()
    parsed.set_exclude_report_ids(excluded_ids)

    return parsed
def __init__(self, id, dispatcher):
    """Set up the session thread.

    Args:
        id: identifier kept for this session.
        dispatcher: object kept for later use by the session.
    """
    from ir_report import IRReport

    threading.Thread.__init__(self)
    self.__id = id
    # At most ten messages may wait in the queue at a time.
    inbox = Queue.Queue(maxsize=10)
    self.__msg_queue = inbox
    # The session starts from a report with empty summary and description.
    blank_report = IRReport('', '')
    self.__report = blank_report
    self.__dispatcher = dispatcher
def __generate_single_bug(self, bug_id, drop_rate):
    """Build a report from a stored bug, optionally with part of its text dropped.

    Args:
        bug_id: int, original bug id.
        drop_rate: float, 0.0 for not drop, 1.0 for totally drop.

    Returns:
        IRReport
    """
    from ir_text import IRText
    from ir_term_count import IRTermCount
    from ir_report import IRReport

    # Text and basic information of the stored bug.
    summary, description = IRText.get_summary_and_description_of_bug(bug_id)
    create_ts, product = IRText.get_basic_info_of_bug(bug_id)

    # Rates at or below 0.001 leave the text as it is.
    should_drop = drop_rate > 0.001
    if should_drop:
        incomplete = IRTermCount.create_incomplete_report(
            summary, description, drop_rate)
        summary, description = incomplete
        print(description)

    new_report = IRReport(summary, description)
    stacktrace = IRText.get_stacktrace_of_bug(bug_id)
    new_report.set_stacktrace(stacktrace)
    new_report.set_dummy_bug_id(bug_id)
    new_report.set_basic_info(create_ts, product)
    return new_report
def get_report_from_test_file(cls, filename, bug_id):
    """Load the report with the given dummy bug id from a test file.

    Every line of the file is split on IRReport.separator and field 5 is read
    as the integer dummy bug id of that line.

    Args:
        filename: str, path of the test file.
        bug_id: int, dummy bug id to look for.

    Returns:
        The IRReport parsed from the first matching line (stripped), or None
        when no line matches.
    """
    from ir_report import IRReport

    # The context manager closes the file on every exit path, including the
    # early return below and any error raised while parsing a line.
    with open(filename, 'r') as infile:
        for line in infile:
            fields = line.split(IRReport.separator)
            if int(fields[5]) == bug_id:
                return IRReport.from_string(line.strip())
    return None
def compare(cls, bug_a, bug_b):
    """Print the calculation of two bugs (both in db) side by side."""
    from ir_report import IRReport

    report_a = IRReport(bug_a)
    report_b = IRReport(bug_b)
    title_a = 'indb' + str(bug_a)
    title_b = 'indb' + str(bug_b)

    # Text of each report, first bug first.
    for title, report in ((title_a, report_a), (title_b, report_b)):
        cls.print_text(title, report)

    # Term frequency, then tfidf, each comparing the two reports.
    for compare_and_print in (cls.compare_and_print_termcount,
                              cls.compare_and_print_tfidf):
        compare_and_print(title_a, report_a, title_b, report_b)

    # Similarity score between the two reports.
    cls.print_similarity_score(report_a, report_b)
def do_recommend_cmd(cls, cmd_text):
    """Parse a serialized report and run the recommender on it.

    Args:
        cmd_text: str, the text follows the standard format,
            create_ts;product;summary;raw_description
    """
    from ir_report import IRReport

    serialized = cmd_text.strip()
    cls.do_recommend(IRReport.from_string(serialized))
def test_get_report_differences(self):
    """Log the differences between one new report and several other reports."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport
    from ir_recommender import IRRecommender

    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    new_report = IRReport('apple for summary', 'linux description')

    # One report built from text, followed by two built from bug ids.
    sim_reports = [IRReport('apple of ghost crashed', 'description linux wow')]
    for bug_id in (100000, 100200):
        sim_reports.append(IRReport(bug_id))

    diffs = IRRecommender.get_all_reports_difference(new_report, sim_reports)
    for diff in diffs:
        summary_line = 'Diff of summary: %s' % (diff[0])
        IRLog.get_instance().println(summary_line)
        description_line = 'Diff of description: %s' % (diff[1])
        IRLog.get_instance().println(description_line)
def test_create_new_report(self):
    """Create a report from text and read its texts and tfidf values back."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    summary_text = 'Firefox crashed'
    description_text = ('When I was openning history folder, the f**king'
                        ' Firefox just crashed!')
    report = IRReport(summary_text, description_text)

    # The texts handed to the constructor must come back unchanged.
    assert summary_text == report.get_summary_text()
    assert description_text == report.get_description_text()

    # Only called; their results are not inspected here.
    report.get_summary_and_description_tfidf()
    report.get_summary_and_description_tfidf_squared_length()

    log.stop_log()
def test_binary_search_less(self):
    """Check binary_search_less on a descending list and on an empty list."""
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport

    IRConfig.get_instance().load('../data/test/bug_test.cfg')
    report = IRReport('', '')

    # Values 10, 9, ..., 1 in descending order.
    descending = range(10, 0, -1)
    # (expected index, searched value) pairs, checked in this order.
    for expected, target in ((8, 3), (0, 11), (10, 1)):
        assert expected == report.binary_search_less(
            descending, lambda x: x, target)

    nothing = []
    assert -1 == report.binary_search_less(nothing, lambda x: x, 1)
def test_similarity_with(self):
    """Log the similarity of bugs 100000 and 100200 under several settings.

    The score is logged for the 'tfidf' and 'bidf' algorithms (with freshly
    created reports each time), and then for the 'weighted', 'heuristic' and
    'distweighted' scoring strategies, reusing the reports created for
    'bidf'.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport

    bug_a, bug_b = 100000, 100200

    def log_similarity(label, report_a, report_b):
        # One shared message layout for every configuration.
        IRLog.get_instance().println(
            '%s similarity between %d and %d is %f'
            % (label, bug_a, bug_b, report_a.similarity_with(report_b)[0]))

    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    IRConfig.get_instance().set('tfidf_algorithm', 'tfidf')
    report_a = IRReport(bug_a)
    report_b = IRReport(bug_b)
    log_similarity('TFIDF', report_a, report_b)

    IRConfig.get_instance().set('tfidf_algorithm', 'bidf')
    report_a = IRReport(bug_a)
    report_b = IRReport(bug_b)
    log_similarity('Bidf', report_a, report_b)

    IRConfig.get_instance().set('scoring_strategy', 'weighted')
    IRConfig.get_instance().set('bug_summary_ratio', 0.25)
    IRConfig.get_instance().set('bug_description_ratio', 0.25)
    IRConfig.get_instance().set('bug_stacktrace_ratio', 0.5)
    log_similarity('Bidf (Weighted Scoring)', report_a, report_b)

    IRConfig.get_instance().set('scoring_strategy', 'heuristic')
    IRConfig.get_instance().set('bug_summary_ratio', 0.5)
    IRConfig.get_instance().set('bug_description_ratio', 0.5)
    log_similarity('Bidf (Heuristic Scoring)', report_a, report_b)

    IRConfig.get_instance().set('scoring_strategy', 'distweighted')
    IRConfig.get_instance().set('bug_summary_ratio', 0.5)
    IRConfig.get_instance().set('bug_description_ratio', 0.5)
    # This line used to repeat the 'Heuristic Scoring' label, which made the
    # last two log lines indistinguishable.
    log_similarity('Bidf (Distweighted Scoring)', report_a, report_b)
def compare_with_sim_file(cls, bug_a, filename, bug_b):
    """Warning: bug_a acts as bug in database, bug_b acts as new report."""
    from ir_sim_bug_evaluator import IRSimBugEvaluator
    from ir_report import IRReport

    title_a = "indb" + str(bug_a)
    title_b = "file" + str(bug_b)
    report_a = IRReport(bug_a)
    report_b = IRSimBugEvaluator.get_report_from_test_file(filename, bug_b)

    # Text of each report, database bug first.
    for title, report in ((title_a, report_a), (title_b, report_b)):
        cls.print_text(title, report)

    # Term frequency, then tfidf.
    for compare_and_print in (cls.compare_and_print_termcount,
                              cls.compare_and_print_tfidf):
        compare_and_print(title_a, report_a, title_b, report_b)

    # Similarity is scored from the file report towards the database report.
    cls.print_similarity_score(report_b, report_a)
def do_recommend(cls, new_report):
    """Recommend a key term and sentences for a new report.

    Steps, each timed and logged through IRLog:
      1. find the similar and duplicate reports of new_report;
      2. choose a term with get_term_by_simple_entropy from the differences
         between new_report and the similar reports;
      3. collect the sentences of the similar reports that contain the term;
      4. if there are more candidates than the configured
         'bug_sentence_number', select sentences via
         IRSentence.cluster_sentences; otherwise keep all of them.

    Args:
        new_report: the report to recommend for.

    Returns:
        A tuple (keyword, sentence_packs, duplicate_packs): the chosen term,
        a list of (bug id, sentence text) tuples, and a list with the result
        of get_report_text_by_bug_id for every duplicate report.
    """
    import time
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport
    from ir_sentence import IRSentence

    print 'DO_RECOMMEND', str(new_report.get_stacktrace())

    IRLog.get_instance().println('Finding similar reports')
    start_t = time.time()
    sim_bug_ids, dup_bug_ids = new_report.similarities_and_duplicates()
    similar_t = time.time()
    IRLog.get_instance().println('Found %d similar reports and %d duplicate reports in %fs' \
        % (sim_bug_ids.__len__(), dup_bug_ids.__len__(), similar_t - start_t))
    # Element 0 of every returned item is used as the bug id.
    sim_bugs = [IRReport(sim_bug_id[0]) for sim_bug_id in sim_bug_ids]
    duplicate_packs = [
        cls.get_report_text_by_bug_id(dup_bug_id[0])
        for dup_bug_id in dup_bug_ids
    ]
    IRLog.get_instance().println('Duplicate reports: %s' % ','.\
        join([str(dup_bug_id[0]) for dup_bug_id in dup_bug_ids]))
    IRLog.get_instance().println('Extracting key term')
    # deltas is indexed in parallel with sim_bugs (see the loop below).
    deltas = cls.get_all_reports_difference(new_report, sim_bugs)
    term = cls.get_term_by_simple_entropy(deltas, sim_bug_ids,
                                          new_report.get_penalty_terms())
    term_t = time.time()
    keyword = term
    IRLog.get_instance().println('Choose term: %s in %fs' \
        % (term, term_t - similar_t))
    # pick out candidate sentences
    max_sentences_number = 1000
    cur_sentences_number = 0
    IRLog.get_instance().println('Extracting sentences')
    candidate_sentences = []
    for index, delta in enumerate(deltas):
        # delta[1] is tested for the term; presumably the description
        # difference -- TODO confirm against get_all_reports_difference.
        if not term in delta[1]:
            continue
        # term in this report
        sentences = IRSentence.get_sentence_from_description(
            sim_bugs[index].get_description_text(),
            sim_bugs[index].get_bug_id())
        for sentence in sentences:
            if sentence.contain_term(term):
                candidate_sentences.append(sentence)
                cur_sentences_number += 1
        # The cap is checked once per report, after all of its matching
        # sentences were added, so the total can exceed max_sentences_number.
        if cur_sentences_number > max_sentences_number:
            break
    sent_t = time.time()
    IRLog.get_instance().println('Extracted %d sentences from %d reports in %fs' \
        % (candidate_sentences.__len__(), deltas.__len__(), sent_t - term_t))
    # cluster sentences
    IRLog.get_instance().println('Clustering sentences')
    selected_sentences_num = IRConfig.get_instance().get_int(
        'bug_sentence_number')
    if candidate_sentences.__len__() > selected_sentences_num:
        clusters, sentence_ids = IRSentence.cluster_sentences(
            candidate_sentences, selected_sentences_num)
    else:
        # Few enough candidates: every candidate index is selected.
        clusters = [x for x in xrange(candidate_sentences.__len__())]
        sentence_ids = clusters
    clust_t = time.time()
    IRLog.get_instance().println('Finished clustering in %fs' \
        % (clust_t - sent_t))
    # pick out the sentences nearest to centroid in each group
    #pick_group = set()
    #for index, cluster in enumerate(clusters):
    #    if cluster in pick_group:
    #        continue
    #    pick_group.add(cluster)
    #    if pick_group.__len__() == selected_sentences_num:
    #        break
    #    IRLog.get_instance().println("Recommend: %s" \
    #            % (candidate_sentences[index].get_text()))
    sentence_packs = []
    # NOTE(review): sentence_report_ids is filled below but never read or
    # returned in this function.
    sentence_report_ids = []
    for sentence_id in sentence_ids:
        IRLog.get_instance().println("Recommend (Report#: %d): %s" \
            % ( candidate_sentences[sentence_id].get_bug_id(),
                candidate_sentences[sentence_id].get_text()) )
        sentence_packs.append(
            (candidate_sentences[sentence_id].get_bug_id(),
             candidate_sentences[sentence_id].get_text()))
        sentence_report_ids.append(
            candidate_sentences[sentence_id].get_bug_id())
    IRLog.get_instance().println('Recommending finished in %fs' \
        % (time.time() - start_t))
    return keyword, sentence_packs, duplicate_packs
def start_shell(cls):
    """Start a shell that does recommending interactively.

    Caches the TF-IDF and document-count data, then reads commands from
    standard input until 'exit' is entered:

        new   create a report from a prompted time, product, summary and
              description
        do    run the recommender on the current report
        ls    print the current report
        ad    append text to the description of the current report
        ap    append penalty terms to the current report
        sd    set the dummy bug id of the current report
        help  show the help text

    'do', 'ls', 'ad', 'ap' and 'sd' log an error when no report has been
    created yet. Any other input logs an error and shows the help text.
    """
    from ir_log import IRLog
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    from ir_report import IRReport

    IRLog.get_instance().println("Starting Intereport...")
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    IRLog.get_instance().println("Intereport Started. Waiting for input")
    new_report = None
    while 1:
        cmd = raw_input("Input command:").strip()
        if cmd == 'exit':
            IRLog.get_instance().println('Exiting')
            break
        elif cmd == 'new':
            IRLog.get_instance().println('Creating New Report')
            import time
            cur_time = -1
            while cur_time < 0:
                raw_time = raw_input("Input Time (e.g., 2011-05-05): ")
                try:
                    cur_time = int(time.mktime(
                        time.strptime(raw_time, '%Y-%m-%d')))
                except (ValueError, OverflowError):
                    # Only a date that cannot be parsed or represented makes
                    # the prompt repeat. A bare except here also swallowed
                    # EOFError and KeyboardInterrupt, so the prompt could
                    # never be left once standard input was closed.
                    cur_time = -1
            product = raw_input("Input Product: ")
            summary = raw_input("Summary: ")
            raw_description = raw_input("Description:\n")
            new_report = IRReport.from_string(IRReport.separator.join([
                str(cur_time), product.lower(), summary, raw_description,
                '', '']))
            cls.__print_report(new_report)
        elif cmd == 'do':
            IRLog.get_instance().println('Do Recommending')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                cls.do_recommend(new_report)
        elif cmd == 'ls':
            IRLog.get_instance().println('Show Current Report')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                cls.__print_report(new_report)
        elif cmd == 'ad':
            IRLog.get_instance().println('Appending Description')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                append_description = raw_input("Append Description:\n")
                description = ' '.join([new_report.get_description_text(),
                                        append_description])
                # A new report is built from the extended description and
                # the values set below are copied from the old one.
                dummy_report = IRReport(new_report.get_summary_text(),
                                        description)
                dummy_report.set_stacktrace(new_report.get_stacktrace())
                dummy_report.set_basic_info(new_report.get_create_ts(),
                                            new_report.get_product())
                dummy_report.set_penalty_terms(
                    new_report.get_penalty_terms())
                dummy_report.set_dummy_bug_id(new_report.get_dummy_bug_id())
                new_report = dummy_report
                IRLog.get_instance().println('Description: %s' % description)
        elif cmd == 'ap':
            IRLog.get_instance().println('Appending Penalties')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                raw = []
                while len(raw) < 1:
                    raw = raw_input(
                        'Input Penalties (split by \',\'):').split(',')
                from ir_term_count import IRTermCount
                penalty = new_report.get_penalty_terms()
                if penalty is None:
                    penalty = []
                penalty += IRTermCount.do_stemming(raw)
                new_report.set_penalty_terms(penalty)
                print('%d %s' % (len(penalty), penalty))
                IRLog.get_instance().println('Penalties: %s' %
                                             (', '.join(penalty)))
        elif cmd == 'sd':
            IRLog.get_instance().println('Set Dummy Bug ID')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                bug_id = -1
                while bug_id <= 0:
                    raw_bug_id = raw_input('Dummy Bug ID: ')
                    try:
                        bug_id = int(raw_bug_id)
                    except ValueError:
                        # Not a number: ask again. EOF and Ctrl-C propagate.
                        bug_id = -1
                new_report.set_dummy_bug_id(bug_id)
                IRLog.get_instance().println('Dummy Bug ID: %d' % bug_id)
        elif cmd == 'help':
            cls.__show_help()
        else:
            IRLog.get_instance().println('Error! Unkown command: %s'
                                         % cmd)
            cls.__show_help()
    # end of while 1
    IRLog.get_instance().println("Bye")
def do_test_over_file(self, filename):
    """Do test over the file.

    For every serialized report in the file, the similar and duplicate
    reports are searched and compared with the real duplicate group of the
    report's dummy bug id. Reports whose real duplicate group is empty are
    not counted. At the end two comma-separated lines are logged: a header
    and the averaged precision/recall values.

    Args:
        filename: str, the input file which generated by
            generate_incomplete_test_file.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_duplicate_group import IRDuplicateGroup
    from ir_text import IRText
    from ir_term_count import IRTermCount
    from ir_tfidf import IRTFIDF
    from ir_report import IRReport
    from ir_document_count import IRDocumentCount

    IRText.cache_all_data()
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()

    remove_self_bug_id = IRConfig.get_instance().get_bool(
        'remove_self_bug_id', True)
    sim_tot_precision = 0.0
    sim_tot_recall = 0.0
    sim_bi_tot_recall = 0.0
    sim_tot_size = 0
    dup_tot_precision = 0.0
    dup_tot_recall = 0.0
    dup_bi_tot_recall = 0.0
    dup_num = 0
    test_num = 0

    # The context manager closes the file even when a report fails to parse
    # or to evaluate.
    with open(filename, 'r') as infile:
        for line in infile:
            IRLog.get_instance().println('----test----')
            test_num += 1
            # str.strip() returns a new string; its result used to be thrown
            # away, so the trailing newline reached IRReport.from_string.
            line = line.strip()
            new_report = IRReport.from_string(line)
            # Kept from the original code; only the disabled comparison
            # below reads it.
            ori_report = IRReport(new_report.get_dummy_bug_id())
            #IRLog.get_instance().println('Summary')
            #IRTermCount.show_dict_compare(ori_report.get_summary_termcount(),
            #                              new_report.get_summary_termcount())
            #IRLog.get_instance().println('Description')
            #IRTermCount.show_dict_compare(ori_report.get_description_termcount(),
            #                              new_report.get_description_termcount())

            # do test for single
            similarities, duplicates = \
                new_report.similarities_and_duplicates()
            sim_ids = [sim[0] for sim in similarities]
            dup_ids = [dup[0] for dup in duplicates]
            IRLog.get_instance().println('Sim ids: %s' % str(sim_ids))
            IRLog.get_instance().println('Dup ids: %s' % str(dup_ids))

            # evaluate sim
            sim_hit, sim_nothit, real_duplicates = \
                IRDuplicateGroup.is_in_same_duplicate_group(
                    new_report.get_dummy_bug_id(), sim_ids,
                    remove_self_bug_id)
            # some group contain only one
            if len(real_duplicates) == 0:
                test_num -= 1
                continue

            precision, recall = self.__report_result(
                new_report.get_dummy_bug_id(), sim_hit, sim_nothit,
                real_duplicates)
            sim_tot_precision += precision
            sim_tot_recall += recall
            sim_tot_size += len(sim_ids)
            sim_bi_tot_recall += 1 if recall > 0.0 else 0

            if len(dup_ids) > 0:
                dup_num += 1
                dup_hit, dup_nothit, real_duplicates = \
                    IRDuplicateGroup.is_in_same_duplicate_group(
                        new_report.get_dummy_bug_id(), dup_ids,
                        remove_self_bug_id)
                precision, recall = self.__report_result(
                    new_report.get_dummy_bug_id(), dup_hit, dup_nothit,
                    real_duplicates)
                dup_tot_precision += precision
                dup_tot_recall += recall
                dup_bi_tot_recall += 1 if recall > 0.0 else 0

    # general conclusion
    # An empty file, or one where every report was skipped, leaves test_num
    # at 0; dividing by 1 then reports zero averages instead of raising
    # ZeroDivisionError.
    sim_cases = test_num if test_num else 1
    # Unchanged from the original: with no duplicate case the count is
    # replaced by 1.0 and is logged as such.
    if dup_num == 0:
        dup_num = 1.0
    IRLog.get_instance().println(','.join([
        '#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',
        '#dup', 'dup pre', 'dup rec', 'dup birec']))
    IRLog.get_instance().println(','.join([
        str(test_num),
        str(sim_tot_precision / sim_cases),
        str(sim_tot_recall / sim_cases),
        str(sim_bi_tot_recall / sim_cases),
        str(float(sim_tot_size) / sim_cases),
        str(dup_num),
        str(dup_tot_precision / dup_num),
        str(dup_tot_recall / dup_num),
        str(dup_bi_tot_recall / dup_num)]))
def get_report_text_by_bug_id(cls, id):
    """Return (id, summary text, description text) of the report built from id."""
    from ir_report import IRReport

    texts = IRReport(id).get_summary_and_description_text()
    summary_text, description_text = texts
    return (id, summary_text, description_text)
test_file = sys.argv[3] bug_id = int(sys.argv[4]) from ir_sim_bug_evaluator import IRSimBugEvaluator new_report = IRSimBugEvaluator.get_report_from_test_file( test_file, bug_id) if new_report is None: IRLog.get_instance().println('Error! Cannot find report %d in %s' % \ (bug_id, test_file)) else: if sys.argv.__len__() > 5: from ir_term_count import IRTermCount penalty_terms_raw = sys.argv[4].split(',') penalty_terms = set(IRTermCount.do_stemming(penalty_terms_raw)) IRLog.get_instance().println('%d penalty terms: %s:' \ % (penalty_terms.__len__(), ','.join(penalty_terms))) new_report.set_penalty_terms(penalty_terms) elif mode == 'text': text = sys.argv[3] new_report = IRReport.from_string(text) elif mode == 'inte': IRRecommender.start_shell() exit() else: IRLog.get_instance().println('Error! Known mode %s' % mode) from ir_tfidf import IRTFIDF from ir_document_count import IRDocumentCount IRTFIDF.cache_all_data() IRDocumentCount.cache_all_data() IRRecommender.do_recommend(new_report) IRLog.get_instance().stop_log()
class IRSession(threading.Thread):
    """A session: a thread that owns one report and handles its messages.

    Messages are dicts handed over with enqueue(). For each message, run()
    applies every SET_COMMANDS entry to replace the session report, runs
    every CTL_COMMANDS entry, adds the report information to the response
    pack and sends it through the message's connection.
    """

    # Timeout handed to Queue.get() while waiting for the next message;
    # None blocks until a message arrives.
    TIMEOUT = None

    def __init__(self, id, dispatcher):
        """Create the session.

        Args:
            id: identifier of this session; given back to
                dispatcher.remove_session() when the session ends.
            dispatcher: object whose remove_session(id) is called when
                run() finishes.
        """
        from ir_report import IRReport
        threading.Thread.__init__(self)
        self.__id = id
        self.__msg_queue = Queue.Queue(maxsize=10)
        self.__report = IRReport('', '')
        self.__dispatcher = dispatcher

    def enqueue(self, msgpack):
        """Queue a message; blocks while ten messages are already waiting."""
        self.__msg_queue.put(msgpack)

    def run(self):
        """Handle queued messages until a command ends the session or the
        wait for the next message times out, then unregister the session."""
        from ir_log import IRLog
        session_state = STATE_ALIVE
        try:
            while session_state == STATE_ALIVE:
                try:
                    # TIMEOUT was declared but never used, which left the
                    # Queue.Empty branch unreachable. With the default None
                    # this still blocks until a message arrives.
                    msgpack = self.__msg_queue.get(True, self.TIMEOUT)
                except Queue.Empty:
                    IRLog.get_instance().println(
                        'Session %d time out' % self.__id, 2)
                    break
                # do something to msgpack
                conn = msgpack['connection']
                respack = msgpack['respack']
                respack[SESSION_ID] = msgpack[SESSION_ID]
                # set phase
                for key, value in msgpack.items():
                    if key in SET_COMMANDS:
                        self.__report = SET_COMMANDS[key](self.__report,
                                                          value)
                # do phase
                signal = SIGNAL_CONTINUE
                for key in msgpack:
                    if key in CTL_COMMANDS:
                        signal = CTL_COMMANDS[key](self.__report, respack)
                if signal == SIGNAL_BREAK:
                    session_state = STATE_EXPIRED
                self.__pack_report_info(respack)
                IRLog.get_instance().println('Send message: %s'
                                             % str(respack))
                conn.send(str(respack))
        finally:
            # Unregister even when handling a message raised; otherwise the
            # dispatcher would keep a session whose thread is gone.
            self.__dispatcher.remove_session(self.__id)

    def __pack_report_info(self, respack):
        """Copy the report fields and three configuration values into respack."""
        respack[FEEDBACK_PRODUCT] = self.__report.get_product()
        respack[FEEDBACK_CREATE_TS] = self.__report.get_create_ts()
        respack[FEEDBACK_SUMMARY] = self.__report.get_summary_text()
        respack[FEEDBACK_DESCRIPTION] = self.__report.get_description_text()
        respack[FEEDBACK_PENALTY] = self.__report.get_penalty_terms()
        respack[FEEDBACK_SKIP] = self.__report.get_skip_terms()
        respack[FEEDBACK_IGNORE] = self.__report.get_exclude_report_ids()
        from ir_config import IRConfig
        respack[FEEDBACK_MAX_SENTENCES] = IRConfig.get_instance().get_int(
            'bug_sentence_number')
        respack[FEEDBACK_MAX_DUPLICATES] = IRConfig.get_instance().get_int(
            'bug_duplicate_number')
        respack[FEEDBACK_REPORT_LINK] = IRConfig.get_instance().get(
            'bugzilla_report_link')
class IRSession(threading.Thread):
    """A session: a thread that owns one report and handles its messages.

    Messages are dicts handed over with enqueue(). For each message, run()
    applies every SET_COMMANDS entry to replace the session report, runs
    every CTL_COMMANDS entry, adds the report information to the response
    pack and sends it through the message's connection.
    """

    # Timeout handed to Queue.get() while waiting for the next message;
    # None blocks until a message arrives.
    TIMEOUT = None

    def __init__(self, id, dispatcher):
        """Create the session.

        Args:
            id: identifier of this session; given back to
                dispatcher.remove_session() when the session ends.
            dispatcher: object whose remove_session(id) is called when
                run() finishes.
        """
        from ir_report import IRReport
        threading.Thread.__init__(self)
        self.__id = id
        self.__msg_queue = Queue.Queue(maxsize=10)
        self.__report = IRReport('', '')
        self.__dispatcher = dispatcher

    def enqueue(self, msgpack):
        """Queue a message; blocks while ten messages are already waiting."""
        self.__msg_queue.put(msgpack)

    def run(self):
        """Handle queued messages until a command ends the session or the
        wait for the next message times out, then unregister the session."""
        from ir_log import IRLog
        session_state = STATE_ALIVE
        try:
            while session_state == STATE_ALIVE:
                try:
                    # TIMEOUT was declared but never used, which left the
                    # Queue.Empty branch unreachable. With the default None
                    # this still blocks until a message arrives.
                    msgpack = self.__msg_queue.get(True, self.TIMEOUT)
                except Queue.Empty:
                    IRLog.get_instance().println(
                        'Session %d time out' % self.__id, 2)
                    break
                # do something to msgpack
                conn = msgpack['connection']
                respack = msgpack['respack']
                respack[SESSION_ID] = msgpack[SESSION_ID]
                # set phase
                for key, value in msgpack.items():
                    if key in SET_COMMANDS:
                        self.__report = SET_COMMANDS[key](self.__report,
                                                          value)
                # do phase
                signal = SIGNAL_CONTINUE
                for key in msgpack:
                    if key in CTL_COMMANDS:
                        signal = CTL_COMMANDS[key](self.__report, respack)
                if signal == SIGNAL_BREAK:
                    session_state = STATE_EXPIRED
                self.__pack_report_info(respack)
                IRLog.get_instance().println('Send message: %s'
                                             % str(respack))
                conn.send(str(respack))
        finally:
            # Unregister even when handling a message raised; otherwise the
            # dispatcher would keep a session whose thread is gone.
            self.__dispatcher.remove_session(self.__id)

    def __pack_report_info(self, respack):
        """Copy the report fields and three configuration values into respack."""
        respack[FEEDBACK_PRODUCT] = self.__report.get_product()
        respack[FEEDBACK_CREATE_TS] = self.__report.get_create_ts()
        respack[FEEDBACK_SUMMARY] = self.__report.get_summary_text()
        respack[FEEDBACK_DESCRIPTION] = self.__report.get_description_text()
        respack[FEEDBACK_PENALTY] = self.__report.get_penalty_terms()
        respack[FEEDBACK_SKIP] = self.__report.get_skip_terms()
        respack[FEEDBACK_IGNORE] = self.__report.get_exclude_report_ids()
        from ir_config import IRConfig
        respack[FEEDBACK_MAX_SENTENCES] = IRConfig.get_instance().get_int(
            'bug_sentence_number')
        respack[FEEDBACK_MAX_DUPLICATES] = IRConfig.get_instance().get_int(
            'bug_duplicate_number')
        respack[FEEDBACK_REPORT_LINK] = IRConfig.get_instance().get(
            'bugzilla_report_link')
def set_report_info(report, msg):
    """Build a report from a serialized message.

    Args:
        report: the current report; it is not read here.
        msg: str, serialized report; it is stripped before parsing.

    Returns:
        The report produced by IRReport.from_string.
    """
    from ir_report import IRReport

    serialized = msg.strip()
    return IRReport.from_string(serialized)
# Build the report to analyse according to the selected mode:
#   file  <test_file> <bug_id> [penalty,terms]  report read from a test file
#   text  <serialized report>                    report parsed from argv
#   inte                                         interactive shell, then exit
# new_report stays None when no report could be built, so that the
# recommender is not run on a missing report.
new_report = None
if mode == 'file':
    test_file = sys.argv[3]
    bug_id = int(sys.argv[4])
    from ir_sim_bug_evaluator import IRSimBugEvaluator
    new_report = IRSimBugEvaluator.get_report_from_test_file(test_file,
                                                             bug_id)
    if new_report is None:
        IRLog.get_instance().println('Error! Cannot find report %d in %s'
                                     % (bug_id, test_file))
    elif len(sys.argv) > 5:
        from ir_term_count import IRTermCount
        # The optional penalty terms follow the bug id, i.e. argv[5];
        # argv[4] is the bug id itself and was read here by mistake.
        penalty_terms_raw = sys.argv[5].split(',')
        penalty_terms = set(IRTermCount.do_stemming(penalty_terms_raw))
        IRLog.get_instance().println('%d penalty terms: %s:'
                                     % (len(penalty_terms),
                                        ','.join(penalty_terms)))
        new_report.set_penalty_terms(penalty_terms)
elif mode == 'text':
    text = sys.argv[3]
    new_report = IRReport.from_string(text)
elif mode == 'inte':
    IRRecommender.start_shell()
    exit()
else:
    IRLog.get_instance().println('Error! Unknown mode %s' % mode)

if new_report is not None:
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    IRRecommender.do_recommend(new_report)
IRLog.get_instance().stop_log()
def get_report_text_by_bug_id(cls, id):
    """Return (id, summary text, description text) of the report built from id."""
    from ir_report import IRReport

    texts = IRReport(id).get_summary_and_description_text()
    summary_text, description_text = texts
    return (id, summary_text, description_text)
def test_create_new_report_from_string(self):
    """Serialize a report with to_string and check from_string restores it."""
    from nose.tools import eq_
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport
    from ir_term_count import IRTermCount

    log = IRLog.get_instance()
    log.start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    summary_text = 'Firefox crashed'
    description_text = ('When I was openning history folder, the f**king'
                        ' Firefox just crashed!\n')
    report = IRReport(summary_text, description_text)
    report.set_basic_info(12345, 'core')
    penalty_terms = IRTermCount.do_stemming(['ie', 'explore'])
    report.set_penalty_terms(penalty_terms)
    report.set_exclude_report_ids([100100])
    report.set_dummy_bug_id(12345)
    skip_terms = IRTermCount.do_stemming(['new', 'please'])
    report.set_skip_terms(skip_terms)

    # save to text
    text = report.to_string()
    log.println('Serialized report: %s' % (text))

    # load from text
    new_report = IRReport.from_string(text)
    assert new_report.get_summary_text() == report.get_summary_text()
    # Descriptions are compared with surrounding whitespace removed.
    restored_description = new_report.get_description_text().strip()
    eq_(restored_description, report.get_description_text().strip())
    assert new_report.get_create_ts() == report.get_create_ts()
    assert new_report.get_product() == report.get_product()
    assert new_report.get_dummy_bug_id() == report.get_dummy_bug_id()
    assert new_report.get_penalty_terms() == report.get_penalty_terms()
    assert (new_report.get_exclude_report_ids()
            == report.get_exclude_report_ids())
    eq_(new_report.get_skip_terms(), report.get_skip_terms())

    log.stop_log()
def start_shell(cls):
    """Start a shell that does recommending interactively.

    Caches the TF-IDF and document-count data, then reads commands from
    standard input until 'exit' is entered:

        new   create a report from a prompted time, product, summary and
              description
        do    run the recommender on the current report
        ls    print the current report
        ad    append text to the description of the current report
        ap    append penalty terms to the current report
        sd    set the dummy bug id of the current report
        help  show the help text

    'do', 'ls', 'ad', 'ap' and 'sd' log an error when no report has been
    created yet. Any other input logs an error and shows the help text.
    """
    from ir_log import IRLog
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    from ir_report import IRReport

    IRLog.get_instance().println("Starting Intereport...")
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()
    IRLog.get_instance().println("Intereport Started. Waiting for input")
    new_report = None
    while 1:
        cmd = raw_input("Input command:").strip()
        if cmd == 'exit':
            IRLog.get_instance().println('Exiting')
            break
        elif cmd == 'new':
            IRLog.get_instance().println('Creating New Report')
            import time
            cur_time = -1
            while cur_time < 0:
                raw_time = raw_input("Input Time (e.g., 2011-05-05): ")
                try:
                    cur_time = int(time.mktime(
                        time.strptime(raw_time, '%Y-%m-%d')))
                except (ValueError, OverflowError):
                    # Only a date that cannot be parsed or represented makes
                    # the prompt repeat. A bare except here also swallowed
                    # EOFError and KeyboardInterrupt, so the prompt could
                    # never be left once standard input was closed.
                    cur_time = -1
            product = raw_input("Input Product: ")
            summary = raw_input("Summary: ")
            raw_description = raw_input("Description:\n")
            new_report = IRReport.from_string(IRReport.separator.join([
                str(cur_time), product.lower(), summary, raw_description,
                '', '']))
            cls.__print_report(new_report)
        elif cmd == 'do':
            IRLog.get_instance().println('Do Recommending')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                cls.do_recommend(new_report)
        elif cmd == 'ls':
            IRLog.get_instance().println('Show Current Report')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                cls.__print_report(new_report)
        elif cmd == 'ad':
            IRLog.get_instance().println('Appending Description')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                append_description = raw_input("Append Description:\n")
                description = ' '.join([new_report.get_description_text(),
                                        append_description])
                # A new report is built from the extended description and
                # the values set below are copied from the old one.
                dummy_report = IRReport(new_report.get_summary_text(),
                                        description)
                dummy_report.set_stacktrace(new_report.get_stacktrace())
                dummy_report.set_basic_info(new_report.get_create_ts(),
                                            new_report.get_product())
                dummy_report.set_penalty_terms(
                    new_report.get_penalty_terms())
                dummy_report.set_dummy_bug_id(new_report.get_dummy_bug_id())
                new_report = dummy_report
                IRLog.get_instance().println('Description: %s' % description)
        elif cmd == 'ap':
            IRLog.get_instance().println('Appending Penalties')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                raw = []
                while len(raw) < 1:
                    raw = raw_input(
                        'Input Penalties (split by \',\'):').split(',')
                from ir_term_count import IRTermCount
                penalty = new_report.get_penalty_terms()
                if penalty is None:
                    penalty = []
                penalty += IRTermCount.do_stemming(raw)
                new_report.set_penalty_terms(penalty)
                print('%d %s' % (len(penalty), penalty))
                IRLog.get_instance().println('Penalties: %s' %
                                             (', '.join(penalty)))
        elif cmd == 'sd':
            IRLog.get_instance().println('Set Dummy Bug ID')
            if new_report is None:
                IRLog.get_instance().println('Error! Please create '
                                             'report first.')
            else:
                bug_id = -1
                while bug_id <= 0:
                    raw_bug_id = raw_input('Dummy Bug ID: ')
                    try:
                        bug_id = int(raw_bug_id)
                    except ValueError:
                        # Not a number: ask again. EOF and Ctrl-C propagate.
                        bug_id = -1
                new_report.set_dummy_bug_id(bug_id)
                IRLog.get_instance().println('Dummy Bug ID: %d' % bug_id)
        elif cmd == 'help':
            cls.__show_help()
        else:
            IRLog.get_instance().println('Error! Unkown command: %s'
                                         % cmd)
            cls.__show_help()
    # end of while 1
    IRLog.get_instance().println("Bye")
def do_test_over_file(self, filename):
    """Do test over the file.

    Each non-empty line of the file is parsed into a report, its similar
    and duplicate reports are computed, and both id lists are scored
    against the report's duplicate group. Reports whose duplicate group
    yields no real duplicates are not counted. A CSV header line and a CSV
    line of averaged results are written to the log at the end.

    Args:
        filename: str, the input file which generated by
            generate_incomplete_test_file.
    """
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_duplicate_group import IRDuplicateGroup
    from ir_text import IRText
    from ir_tfidf import IRTFIDF
    from ir_report import IRReport
    from ir_document_count import IRDocumentCount

    log = IRLog.get_instance()
    IRText.cache_all_data()
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()

    remove_self_bug_id = IRConfig.get_instance().get_bool(
        'remove_self_bug_id', True)

    sim_tot_precision = 0.0
    sim_tot_recall = 0.0
    sim_bi_tot_recall = 0.0
    sim_tot_size = 0
    dup_tot_precision = 0.0
    dup_tot_recall = 0.0
    dup_bi_tot_recall = 0.0
    dup_num = 0
    test_num = 0

    # The context manager closes the file even if a report fails to parse.
    with open(filename, 'r') as infile:
        for line in infile:
            # Drop only the line terminator; the field separator is not
            # known here, so other whitespace is left untouched.
            line = line.rstrip('\r\n')
            if not line:
                continue
            log.println('----test----')
            new_report = IRReport.from_string(line)
            bug_id = new_report.get_dummy_bug_id()

            # do test for single
            similarities, duplicates = \
                new_report.similarities_and_duplicates()
            sim_ids = [sim[0] for sim in similarities]
            dup_ids = [dup[0] for dup in duplicates]
            log.println('Sim ids: %s' % str(sim_ids))
            log.println('Dup ids: %s' % str(dup_ids))

            # evaluate sim
            sim_hit, sim_nothit, real_duplicates = \
                IRDuplicateGroup.is_in_same_duplicate_group(
                    bug_id, sim_ids, remove_self_bug_id)
            # some group contain only one: nothing to score against
            if len(real_duplicates) == 0:
                continue
            test_num += 1

            precision, recall = self.__report_result(
                bug_id, sim_hit, sim_nothit, real_duplicates)
            sim_tot_precision += precision
            sim_tot_recall += recall
            sim_tot_size += len(sim_ids)
            sim_bi_tot_recall += 1 if recall > 0.0 else 0

            # evaluate dup, only when some duplicates were reported
            if len(dup_ids) > 0:
                dup_num += 1
                dup_hit, dup_nothit, real_duplicates = \
                    IRDuplicateGroup.is_in_same_duplicate_group(
                        bug_id, dup_ids, remove_self_bug_id)
                precision, recall = self.__report_result(
                    bug_id, dup_hit, dup_nothit, real_duplicates)
                dup_tot_precision += precision
                dup_tot_recall += recall
                dup_bi_tot_recall += 1 if recall > 0.0 else 0

    # general conclusion
    # Separate divisors keep the reported counts exact and avoid a
    # ZeroDivisionError when no case (or no duplicate case) was scored.
    sim_div = float(test_num) if test_num else 1.0
    dup_div = float(dup_num) if dup_num else 1.0
    log.println(','.join([
        '#cases', 'sim pre', 'sim rec', 'sim birec', 'sim size',
        '#dup', 'dup pre', 'dup rec', 'dup birec']))
    log.println(','.join([
        str(test_num),
        str(sim_tot_precision / sim_div),
        str(sim_tot_recall / sim_div),
        str(sim_bi_tot_recall / sim_div),
        str(float(sim_tot_size) / sim_div),
        str(dup_num),
        str(dup_tot_precision / dup_div),
        str(dup_tot_recall / dup_div),
        str(dup_bi_tot_recall / dup_div)]))
def test_create_new_report_from_string(self):
    """Round-trip a report through to_string/from_string.

    A report is built from text, configured through its setters,
    serialized with to_string and rebuilt with from_string; every getter
    checked below must return the same value on both reports (description
    text is compared after stripping surrounding whitespace).
    """
    from nose.tools import eq_
    from ir_log import IRLog
    from ir_config import IRConfig
    from ir_report import IRReport
    from ir_term_count import IRTermCount

    IRLog.get_instance().start_log()
    IRConfig.get_instance().load('../data/test/bug_test.cfg')

    # Build and configure the reference report.
    original = IRReport(
        'Firefox crashed',
        'When I was openning history folder, the f**king'
        ' Firefox just crashed!\n')
    original.set_basic_info(12345, 'core')
    penalty_terms = IRTermCount.do_stemming(['ie', 'explore'])
    original.set_penalty_terms(penalty_terms)
    original.set_exclude_report_ids([100100])
    original.set_dummy_bug_id(12345)
    skip_terms = IRTermCount.do_stemming(['new', 'please'])
    original.set_skip_terms(skip_terms)

    # Serialize, log the serialized form, and load it back.
    serialized = original.to_string()
    IRLog.get_instance().println('Serialized report: %s' % serialized)
    restored = IRReport.from_string(serialized)

    assert restored.get_summary_text() == original.get_summary_text()
    eq_(restored.get_description_text().strip(),
        original.get_description_text().strip())
    # The remaining fields must match exactly.
    for getter in ('get_create_ts',
                   'get_product',
                   'get_dummy_bug_id',
                   'get_penalty_terms',
                   'get_exclude_report_ids'):
        assert getattr(restored, getter)() == getattr(original, getter)()
    eq_(restored.get_skip_terms(), original.get_skip_terms())

    IRLog.get_instance().stop_log()