def batch_generate_term_count(cls):
    """Generate term counts for the stored text records and save them.

    Every record yielded by ``IRText.get_iterator({})`` is passed through
    ``cls.calculate_term_count`` (summary and description together), and
    one document per bug is inserted into the term-count collection. The
    inserted document reuses the configured bug id, summary and description
    field names as its keys. After the scan an ascending index on the bug
    id field is created.

    The term-count collection is closed even when the scan, an insert or
    the index creation raises, so the handle is never leaked.
    """
    from ir_log import IRProgressBar
    from ir_text import IRText
    from ir_config import IRConfig
    from ir_mongodb_helper import IRCollection

    # Field names are configurable; fall back to the short default names.
    config = IRConfig.get_instance()
    bug_id_name = config.get('bug_id_name', 'bug_id')
    summary_name = config.get('bug_summary_name', 'summ')
    description_name = config.get('bug_description_name', 'desc')

    # NOTE(review): mode 'w' presumably opens the collection for
    # (over)writing -- confirm against IRCollection.
    termcount_collection = IRCollection(
        'bug_db_name', 'bug_termcount_collection_name', 'w')

    def iter_text(bug):
        # Convert one text record into its term-count document.
        summary_bow, description_bow = cls.calculate_term_count(
            bug[summary_name], bug[description_name])
        termcount_collection.insert({
            bug_id_name: bug[bug_id_name],
            summary_name: summary_bow,
            description_name: description_bow,
        })

    try:
        IRProgressBar.execute_iteration_for_cursor(
            IRText.get_iterator({}), iter_text, "From Text to Term Count")
        # Index is built after the inserts, keyed on the bug id field.
        termcount_collection.create_index(
            [(bug_id_name, IRCollection.ASCENDING)])
    finally:
        # Release the collection handle on both success and failure.
        termcount_collection.close()
def do_remove_bad_reports(cls, config_file):
    """Remove reports flagged by ``IRText.is_drop_report`` from the database.

    Loads ``config_file`` into the shared ``IRConfig`` instance, scans the
    records returned by ``IRText.get_iterator(None)``, and collects the id
    of every report whose description makes ``IRText.is_drop_report``
    return a true value. Those ids are then deleted from the text, tf-idf
    and duplicate collections.

    Args:
        config_file: Configuration source, passed unchanged to
            ``IRConfig.load``.
    """
    from ir_log import IRLog
    from ir_log import IRProgressBar
    from ir_config import IRConfig
    import ir_mongodb_helper
    from ir_text import IRText

    config = IRConfig.get_instance()
    config.load(config_file)
    bug_id_name = config.get('bug_id_name')
    bug_description_name = config.get('bug_description_name')

    text_cursor = IRText.get_iterator(None)
    remove_ids = []

    def iter_text(item):
        # Remember (and log) each report that should be dropped.
        if IRText.is_drop_report(item[bug_description_name]):
            remove_ids.append(item[bug_id_name])
            IRLog.get_instance().println(
                'Remove report#=%d' % item[bug_id_name], 3)

    IRProgressBar.execute_iteration_for_cursor(text_cursor, iter_text)

    def remove_from_collection(collection_cfg_name):
        # Delete every collected id from one configured collection.
        collection = ir_mongodb_helper.IRCollection(
            'bug_db_name', collection_cfg_name, 'a')
        try:
            # Query on the configured id field (the same one the ids were
            # read from), not a hard-coded 'bug_id', so a custom
            # 'bug_id_name' setting still matches the stored documents.
            collection.remove({bug_id_name: {'$in': remove_ids}})
        finally:
            # Close the handle even if the removal fails.
            collection.close()

    # Remove from every collection that stores per-report documents.
    remove_from_collection('bug_text_collection_name')
    remove_from_collection('bug_tfidf_collection_name')
    remove_from_collection('bug_duplicate_collection_name')