예제 #1
0
 def batch_generate_term_count(cls):
     """Generate term counts for every text record and store them.

     Iterates over the records returned by ``IRText.get_iterator({})``,
     computes the term counts of each record's summary and description with
     ``cls.calculate_term_count`` and inserts one document per record into
     the collection opened via
     ``IRCollection('bug_db_name', 'bug_termcount_collection_name', 'w')``.
     Once every record has been processed, an ascending index on the bug id
     field is created.

     The collection is always closed, even when iteration, insertion or
     index creation raises; the exception is then propagated to the caller.
     """
     from ir_log import IRProgressBar
     from ir_text import IRText
     from ir_config import IRConfig
     from ir_mongodb_helper import IRCollection

     # Field names are configurable; the second argument is the fallback
     # used when the configuration does not define the key.
     config = IRConfig.get_instance()
     bug_id_name = config.get('bug_id_name', 'bug_id')
     summary_name = config.get('bug_summary_name', 'summ')
     description_name = config.get('bug_description_name', 'desc')

     termcount_collection = IRCollection(
         'bug_db_name', 'bug_termcount_collection_name', 'w')

     def iter_text(bug):
         # Called once per record: store the two term counts under the
         # same field names the source record uses.
         summary_bow, description_bow = cls.calculate_term_count(
             bug[summary_name], bug[description_name])
         termcount_collection.insert({
             bug_id_name: bug[bug_id_name],
             summary_name: summary_bow,
             description_name: description_bow})

     try:
         IRProgressBar.execute_iteration_for_cursor(
             IRText.get_iterator({}), iter_text, "From Text to Term Count")
         # Index only after all inserts have been issued.
         termcount_collection.create_index(
             [(bug_id_name, IRCollection.ASCENDING)])
     finally:
         # Release the collection handle on failure as well as on success.
         termcount_collection.close()
예제 #2
0
    def do_remove_bad_reports(cls, config_file):
        """Remove reports flagged by ``IRText.is_drop_report`` from the database.

        Loads ``config_file`` into the ``IRConfig`` instance, scans the records
        returned by ``IRText.get_iterator(None)`` and collects the id of every
        record whose description makes ``IRText.is_drop_report`` return a
        truthy value. Those ids are then removed from the collections named by
        the ``bug_text_collection_name``, ``bug_tfidf_collection_name`` and
        ``bug_duplicate_collection_name`` settings.

        Args:
            config_file: Configuration source passed to ``IRConfig.load``.
        """
        from ir_log import IRLog
        from ir_log import IRProgressBar
        from ir_config import IRConfig
        import ir_mongodb_helper
        from ir_text import IRText

        config = IRConfig.get_instance()
        config.load(config_file)
        bug_id_name = config.get('bug_id_name')
        bug_description_name = config.get('bug_description_name')
        text_cursor = IRText.get_iterator(None)
        remove_ids = []

        def iter_text(item):
            # Remember (and log) every report that should be dropped.
            if IRText.is_drop_report(item[bug_description_name]):
                remove_ids.append(item[bug_id_name])
                IRLog.get_instance().println(
                    'Remove report#=%d' % item[bug_id_name], 3)

        IRProgressBar.execute_iteration_for_cursor(text_cursor, iter_text)

        def remove_from_collection(collection_cfg_name):
            collection = ir_mongodb_helper.IRCollection(
                'bug_db_name', collection_cfg_name, 'a')
            try:
                # Match on the configured id field (the same one the ids were
                # read from) rather than a hard-coded 'bug_id', so a custom
                # ``bug_id_name`` setting does not silently match nothing.
                collection.remove({bug_id_name: {'$in': remove_ids}})
            finally:
                # Always release the collection, even if the removal fails.
                collection.close()

        # Remove the flagged ids from every collection that stores reports.
        for collection_cfg_name in ('bug_text_collection_name',
                                    'bug_tfidf_collection_name',
                                    'bug_duplicate_collection_name'):
            remove_from_collection(collection_cfg_name)