Python IRTermCount 예제들, ir_term_count.IRTermCount Python 예제들

예제 #1

0

파일 보기

파일: test_ir_report.py 프로젝트: LeonXJ/Intereport

    def test_create_new_report_from_string(self):
        """Round-trip an IRReport through to_string() / from_string().

        Builds a report with basic info, penalty terms, excluded report ids,
        a dummy bug id and skip terms, serializes it to text, parses the text
        back into a new report and checks that every getter returns the same
        value on both reports.
        """
        from nose.tools import eq_
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport
        from ir_term_count import IRTermCount

        IRLog.get_instance().start_log()
        try:
            IRConfig.get_instance().load('../data/test/bug_test.cfg')
            summary_text = 'Firefox crashed'
            description_text = 'When I was openning history folder, the f**king' \
                    ' Firefox just crashed!\n'
            report = IRReport(summary_text, description_text)
            report.set_basic_info(12345, 'core')
            report.set_penalty_terms(
                IRTermCount.do_stemming(['ie', 'explore']))
            report.set_exclude_report_ids([100100])
            report.set_dummy_bug_id(12345)
            report.set_skip_terms(IRTermCount.do_stemming(['new', 'please']))
            # save to text
            text = report.to_string()
            IRLog.get_instance().println('Serialized report: %s' % (text))
            # load from text
            new_report = IRReport.from_string(text)

            # eq_ is used throughout so a failure reports both values and the
            # checks are not stripped when Python runs with -O.
            eq_(new_report.get_summary_text(), report.get_summary_text())
            # surrounding whitespace of the description is not compared
            eq_(new_report.get_description_text().strip(),
                report.get_description_text().strip())
            eq_(new_report.get_create_ts(), report.get_create_ts())
            eq_(new_report.get_product(), report.get_product())
            eq_(new_report.get_dummy_bug_id(), report.get_dummy_bug_id())
            eq_(new_report.get_penalty_terms(), report.get_penalty_terms())
            eq_(new_report.get_exclude_report_ids(),
                report.get_exclude_report_ids())
            eq_(new_report.get_skip_terms(), report.get_skip_terms())
        finally:
            # Stop the log even when a check above fails, so a failing test
            # does not leave the log started.
            IRLog.get_instance().stop_log()

예제 #2

0

파일 보기

파일: test_ir_report.py 프로젝트: LeonXJ/Intereport

    def test_create_new_report_from_string(self):
        """Check that IRReport.from_string() restores what to_string() wrote.

        A report is populated through its setters, serialized with
        to_string(), re-created with IRReport.from_string(), and the getters
        of the original and the re-created report are compared one by one.
        """
        from nose.tools import eq_
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_report import IRReport
        from ir_term_count import IRTermCount

        IRLog.get_instance().start_log()
        # try/finally guarantees stop_log() runs even if loading the config
        # or one of the comparisons below raises.
        try:
            IRConfig.get_instance().load('../data/test/bug_test.cfg')
            summary_text = 'Firefox crashed'
            description_text = 'When I was openning history folder, the f**king' \
                    ' Firefox just crashed!\n'
            report = IRReport(summary_text, description_text)
            report.set_basic_info(12345, 'core')
            report.set_penalty_terms(
                IRTermCount.do_stemming(['ie', 'explore']))
            report.set_exclude_report_ids([100100])
            report.set_dummy_bug_id(12345)
            report.set_skip_terms(IRTermCount.do_stemming(['new', 'please']))
            # save to text
            text = report.to_string()
            IRLog.get_instance().println('Serialized report: %s' % (text))
            # load from text
            new_report = IRReport.from_string(text)

            eq_(new_report.get_summary_text(), report.get_summary_text())
            # the description is compared with surrounding whitespace removed
            eq_(new_report.get_description_text().strip(),
                report.get_description_text().strip())
            # the remaining getters must match exactly
            for getter_name in ('get_create_ts', 'get_product',
                                'get_dummy_bug_id', 'get_penalty_terms',
                                'get_exclude_report_ids', 'get_skip_terms'):
                eq_(getattr(new_report, getter_name)(),
                    getattr(report, getter_name)())
        finally:
            IRLog.get_instance().stop_log()

예제 #3

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_get_termcount_of_bug(self):
        """Fetch the term counts of bug 100000 and log them.

        Calls IRTermCount.get_termcount_of_bug(100000), checks that both the
        summary and the description results are not None, and prints each of
        them through show_dict_compare against an empty dict.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary, description = IRTermCount.get_termcount_of_bug(100000)
        # Identity test: "None != x" would go through the object's own
        # comparison methods instead of checking for the None singleton.
        assert summary is not None
        assert description is not None
        IRLog.get_instance().println('Summary')
        IRTermCount.show_dict_compare(summary, {})
        IRLog.get_instance().println('Description')
        IRTermCount.show_dict_compare(description, {})

예제 #4

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_get_termcount_of_bug(self):
        """Check that bug 100000 has summary and description term counts.

        The two values returned by IRTermCount.get_termcount_of_bug(100000)
        must both be something other than None; each one is then logged via
        show_dict_compare with an empty dict as the second argument.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary, description = IRTermCount.get_termcount_of_bug(100000)
        # "is not None" checks identity with the None singleton and does not
        # depend on how the returned objects implement != .
        assert summary is not None
        assert description is not None
        for label, termcount in (('Summary', summary),
                                 ('Description', description)):
            IRLog.get_instance().println(label)
            IRTermCount.show_dict_compare(termcount, {})

예제 #5

0

파일 보기

 def compare_and_print_termcount(cls, title_a, report_a, title_b, report_b):
     """Log the term counts of two reports next to each other.

     For the summary and then the description, a header line carrying both
     titles is written through IRLog, followed by the output of
     IRTermCount.show_dict_compare for the two term count dicts.

     Args:
         title_a: label printed for report_a.
         report_a: object providing get_summary_and_description_termcount().
         title_b: label printed for report_b.
         report_b: object providing get_summary_and_description_termcount().
     """
     from ir_log import IRLog
     from ir_term_count import IRTermCount
     summary_a, description_a = \
             report_a.get_summary_and_description_termcount()
     summary_b, description_b = \
             report_b.get_summary_and_description_termcount()
     sections = (
         ('[Termcount][Summary][%s][%s]', summary_a, summary_b),
         ('[Termcount][Description][%s][%s]', description_a, description_b),
     )
     for header_format, termcount_a, termcount_b in sections:
         IRLog.get_instance().println(header_format % (title_a, title_b))
         IRTermCount.show_dict_compare(termcount_a, termcount_b)

예제 #6

0

파일 보기

파일: ir_debugger.py 프로젝트: LeonXJ/Intereport

 def compare_and_print_termcount(cls, title_a, report_a,
                                 title_b, report_b):
     """Print a term count comparison between report_a and report_b.

     Writes a '[Termcount][Summary]' header and the summary comparison,
     then a '[Termcount][Description]' header and the description
     comparison. Headers go through IRLog, comparisons through
     IRTermCount.show_dict_compare.

     Args:
         title_a: label used for report_a in the header lines.
         report_a: first report; must offer
             get_summary_and_description_termcount().
         title_b: label used for report_b in the header lines.
         report_b: second report; same requirement as report_a.
     """
     from ir_log import IRLog
     from ir_term_count import IRTermCount
     termcounts_a = report_a.get_summary_and_description_termcount()
     summary_a, description_a = termcounts_a
     termcounts_b = report_b.get_summary_and_description_termcount()
     summary_b, description_b = termcounts_b

     titles = (title_a, title_b)

     summary_header = '[Termcount][Summary][%s][%s]' % titles
     IRLog.get_instance().println(summary_header)
     IRTermCount.show_dict_compare(summary_a, summary_b)

     description_header = '[Termcount][Description][%s][%s]' % titles
     IRLog.get_instance().println(description_header)
     IRTermCount.show_dict_compare(description_a, description_b)

예제 #7

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_batch_report_term_count(self):
        """Run the batch term count generation and compare collection sizes.

        After IRTermCount.batch_generate_term_count() the text collection
        and the termcount collection of the configured bug database must
        report the same count().
        """
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRTermCount.batch_generate_term_count()
        # simple size test
        connection = IRMongodbHelper.get_instance().get_connection()
        database_name = IRConfig.get_instance().get('bug_db_name')
        database = connection[database_name]
        text_name = IRConfig.get_instance().get('bug_text_collection_name')
        texts = database[text_name]
        termcount_name = \
            IRConfig.get_instance().get('bug_termcount_collection_name')
        termcounts = database[termcount_name]
        assert texts.count() == termcounts.count()

예제 #8

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_batch_report_term_count(self):
        """Size check for IRTermCount.batch_generate_term_count().

        Generates the term counts, then looks up the text and termcount
        collections named in the config and asserts that count() is equal
        for both.
        """
        from ir_config import IRConfig
        from ir_mongodb_helper import IRMongodbHelper
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        IRTermCount.batch_generate_term_count()
        # simple size test
        con = IRMongodbHelper.get_instance().get_connection()
        db = con[IRConfig.get_instance().get('bug_db_name')]
        collections = [
            db[IRConfig.get_instance().get(config_key)]
            for config_key in ('bug_text_collection_name',
                               'bug_termcount_collection_name')
        ]
        text_collection, termcount_collection = collections
        assert text_collection.count() == termcount_collection.count()

예제 #9

0

파일 보기

파일: ir_sim_bug_evaluator.py 프로젝트: LeonXJ/Intereport

    def __generate_single_bug(self, bug_id, drop_rate):
        """Generate an incomplete bug report text.

        Loads the summary, description, creation timestamp, product and
        stacktrace of an existing bug through IRText and wraps them in a new
        IRReport. When drop_rate is above 0.001 the summary and description
        are first replaced by the result of
        IRTermCount.create_incomplete_report, and the resulting description
        is printed.

        Args:
            bug_id: int, original bug id.
            drop_rate: float, 0.0 for not drop, 1.0 for totally drop.

        Returns:
            IRReport
        """
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_report import IRReport

        # get description and summary
        summary, description = IRText.get_summary_and_description_of_bug(bug_id)
        create_ts, product = IRText.get_basic_info_of_bug(bug_id)
        # rates at or below 0.001 are treated as "do not drop anything"
        if drop_rate > 0.001:
            summary, description = \
                IRTermCount.create_incomplete_report(summary, description, drop_rate)
            # Parenthesized so the line parses both as the Python 2 print
            # statement and as the Python 3 print function; with a single
            # argument the output is the same.
            print(description)
        new_report = IRReport(summary, description)
        new_report.set_stacktrace(IRText.get_stacktrace_of_bug(bug_id))
        new_report.set_dummy_bug_id(bug_id)
        new_report.set_basic_info(create_ts, product)
        return new_report

예제 #10

0

파일 보기

파일: ir_recommender.py 프로젝트: LeonXJ/Intereport

    def get_report_difference(cls, new_report, similar_report):
        """Get the term differences between a new report and a similar one.

        The summary and description term counts of the two reports are
        compared with __get_dict_difference. The description difference is
        then reduced by the summary terms of the new report, by its skip
        terms, and by the stemmed product name when the new report has a
        product.

        Args:
            new_report: IRReport, The new report.
            similar_report: IRReport, The similar report.

        Returns:
            (set, set), (diff of summary, diff of description)
        """

        new_termcounts = new_report.get_summary_and_description_termcount()
        new_summary_termcount, new_description_termcount = new_termcounts

        similar_termcounts = \
            similar_report.get_summary_and_description_termcount()
        sim_summary_termcount, sim_description_termcount = similar_termcounts

        summary_difference = cls.__get_dict_difference(
            new_summary_termcount, sim_summary_termcount)
        description_difference = cls.__get_dict_difference(
            new_description_termcount, sim_description_termcount)

        # terms already present in the new summary are not recommended
        summary_terms = set(new_summary_termcount.keys())
        description_difference -= summary_terms
        # neither are the terms the new report asks to skip
        skipped_terms = set(new_report.get_skip_terms())
        description_difference -= skipped_terms

        # and product should not be recommended
        from ir_term_count import IRTermCount
        product = new_report.get_product()
        if product is not None:
            stemmed_product = IRTermCount.do_stemming([product])[0]
            # discard() removes the term when present and is a no-op otherwise
            description_difference.discard(stemmed_product)
        return summary_difference, description_difference

예제 #11

0

파일 보기

파일: ir_recommender.py 프로젝트: LeonXJ/Intereport

    def get_report_difference(cls, new_report, similar_report):
        """Compute the summary and description differences of two reports.

        Both differences come from __get_dict_difference applied to the
        term counts of new_report and similar_report. Before returning, the
        description difference loses every term of the new summary, every
        skip term of the new report and, if the new report names a product,
        the stemmed form of that product.

        Args:
            new_report: IRReport, The new report.
            similar_report: IRReport, The similar report.

        Returns:
            (set, set), (diff of summary, diff of description)
        """

        new_summary_termcount, new_description_termcount = \
                new_report.get_summary_and_description_termcount()

        sim_summary_termcount, sim_description_termcount = \
                similar_report.get_summary_and_description_termcount()

        diff_summary = cls.__get_dict_difference(
            new_summary_termcount, sim_summary_termcount)
        diff_description = cls.__get_dict_difference(
            new_description_termcount, sim_description_termcount)

        # Drop, in this order, the terms of the new summary and the skip
        # terms of the new report from the description difference.
        diff_description -= set(new_summary_termcount.keys())
        diff_description -= set(new_report.get_skip_terms())

        # and product should not be recommended
        from ir_term_count import IRTermCount
        product = new_report.get_product()
        if product is None:
            return diff_summary, diff_description

        product_term = IRTermCount.do_stemming([product])[0]
        diff_description.discard(product_term)
        return diff_summary, diff_description

예제 #12

0

파일 보기

파일: ir_tfidf.py 프로젝트: LeonXJ/Intereport

    def batch_generate_tfidf(cls):
        """Batch calculate TFIDF.

        Iterates over every record of IRTermCount.get_iterator(), computes
        the summary and description TFIDF with cls.calculate_tfidf and
        inserts one document per record into the collection configured as
        'bug_tfidf_collection_name'. Afterwards an ascending index on the
        bug id field is created and the collection is closed.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_document_count import IRDocumentCount
        from ir_term_count import IRTermCount
        # get config
        bug_id_name, summary_name, description_name, tfidf_algorithm = [
            IRConfig.get_instance().get(config_key)
            for config_key in ('bug_id_name', 'bug_summary_name',
                               'bug_description_name', 'tfidf_algorithm')
        ]
        # prepare collections
        IRDocumentCount.cache_all_data()
        tfidf_collection = IRCollection(
            'bug_db_name', 'bug_tfidf_collection_name', 'w')
        # batch calculate tfidf
        termcount_iterator = IRTermCount.get_iterator()
        bug_count = termcount_iterator.count()

        def store_bug_tfidf(bug):
            # summary first, then description
            field_tfidf = [
                cls.calculate_tfidf(bug[field_name], field_name, bug_count,
                                    None, tfidf_algorithm)
                for field_name in (summary_name, description_name)
            ]
            tfidf_collection.insert({bug_id_name: bug[bug_id_name],
                                     summary_name: field_tfidf[0],
                                     description_name: field_tfidf[1]})

        IRProgressBar.execute_iteration_for_cursor(
            termcount_iterator, store_bug_tfidf, "Calculating TFIDF")
        index_spec = [(bug_id_name, IRCollection.ASCENDING)]
        tfidf_collection.create_index(index_spec)
        tfidf_collection.close()

예제 #13

0

파일 보기

파일: ir_sim_bug_evaluator.py 프로젝트: LeonXJ/Intereport

    def __generate_single_bug(self, bug_id, drop_rate):
        """Generate an incomplete bug report text.

        The texts, basic info and stacktrace of the bug are read through
        IRText. If drop_rate exceeds 0.001, the summary and description are
        replaced by the output of IRTermCount.create_incomplete_report and
        the new description is printed. The values are then packed into a
        fresh IRReport whose dummy bug id is bug_id.

        Args:
            bug_id: int, original bug id.
            drop_rate: float, 0.0 for not drop, 1.0 for totally drop.

        Returns:
            IRReport
        """
        from ir_text import IRText
        from ir_term_count import IRTermCount
        from ir_report import IRReport

        # get description and summary
        summary, description = IRText.get_summary_and_description_of_bug(
            bug_id)
        create_ts, product = IRText.get_basic_info_of_bug(bug_id)
        # a rate of 0.001 or less leaves the texts untouched
        if drop_rate > 0.001:
            summary, description = \
                IRTermCount.create_incomplete_report(summary, description, drop_rate)
            # The bare "print description" statement only parses on
            # Python 2; the parenthesized single-argument form parses on
            # Python 2 and 3 and prints the same text.
            print(description)
        new_report = IRReport(summary, description)
        new_report.set_stacktrace(IRText.get_stacktrace_of_bug(bug_id))
        new_report.set_dummy_bug_id(bug_id)
        new_report.set_basic_info(create_ts, product)
        return new_report

예제 #14

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_create_incomplete_report(self):
        """Log an original report next to its incomplete version.

        Computes the term counts of a fixed summary/description pair, builds
        an incomplete report from the same texts (third argument 0.4),
        computes its term counts too, and prints both texts and both pairs
        of bag-of-words dicts through IRLog. Nothing is asserted; the test
        only exercises the calls.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary = 'This is a test of calculation for single report term count.'
        description = 'This is the description of the test report. Just a test.'
        summary_BoW, description_BoW = \
            IRTermCount.calculate_term_count(summary, description)
        inc_summary, inc_description = \
            IRTermCount.create_incomplete_report(summary, description, 0.4)
        inc_summary_bow, inc_description_bow = \
            IRTermCount.calculate_term_count(inc_summary, inc_description)

        # texts first, then the heading of the comparison tables
        for line in ('Original Summary: %s' % (summary),
                     'Original Description: %s' % (description),
                     'Incomplete Summary: %s' % (inc_summary),
                     'Incomplete Description: %s' % (inc_description),
                     'Compare original BoW with incomplete BoW'):
            IRLog.get_instance().println(line)

        # one table per field: column header, then the dict comparison
        for field_title, original_bow, incomplete_bow in (
                ('Summary', summary_BoW, inc_summary_bow),
                ('Description', description_BoW, inc_description_bow)):
            IRLog.get_instance().println('%16s\t%8s\t%8s' %
                                         (field_title, 'Ori', 'Inc'))
            IRTermCount.show_dict_compare(original_bow, incomplete_bow)

예제 #15

0

파일 보기

파일: ir_report.py 프로젝트: LeonXJ/Intereport

 def __update_summary_and_description_termcount_from_text(self):
     """Fill in missing term counts from the report's own texts.

     The term counts are computed from the summary and description text
     with IRTermCount.calculate_term_count. Each of the two stored term
     counts is only replaced when it is currently None.
     """
     from ir_term_count import IRTermCount
     texts = self.get_summary_and_description_text()
     summary_text, description_text = texts
     computed = IRTermCount.calculate_term_count(summary_text,
                                                 description_text)
     computed_summary, computed_description = computed
     # an already stored term count is never overwritten
     if self.__summary_termcount is None:
         self.__summary_termcount = computed_summary
     if self.__description_termcount is None:
         self.__description_termcount = computed_description

예제 #16

0

파일 보기

파일: ir_report.py 프로젝트: LeonXJ/Intereport

 def __update_summary_and_description_termcount_from_text(self):
     """Derive the term counts from the texts where none is stored yet.

     Runs IRTermCount.calculate_term_count on the summary and description
     text of this report and stores each result only if the matching
     attribute still holds None.
     """
     from ir_term_count import IRTermCount
     summary_text, description_text = \
         self.get_summary_and_description_text()
     fresh_summary, fresh_description = IRTermCount.calculate_term_count(
         summary_text, description_text)
     summary_missing = self.__summary_termcount is None
     if summary_missing:
         self.__summary_termcount = fresh_summary
     description_missing = self.__description_termcount is None
     if description_missing:
         self.__description_termcount = fresh_description

예제 #17

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_tokenization(self):
        """Check IRTermCount.do_tokenization on two sample inputs.

        'mouse-down' must come back as a single token and
        'set_background_color()' as the token without the parentheses.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount
        from nose.tools import assert_equals

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        # (input text, expected token list)
        cases = [
            ('mouse-down', ['mouse-down']),
            ('set_background_color()', ['set_background_color']),
        ]
        for text, expected_tokens in cases:
            assert_equals(expected_tokens, IRTermCount.do_tokenization(text))

예제 #18

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_tokenization(self):
        """Tokenize two sample strings and compare with the expected tokens.

        Inputs and expectations are kept in two parallel lists and walked
        pairwise.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount
        from nose.tools import assert_equals

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        tests = ['mouse-down', 'set_background_color()']
        expects = [['mouse-down'], ['set_background_color']]
        for test, expected in zip(tests, expects):
            tokens = IRTermCount.do_tokenization(test)
            assert_equals(expected, tokens)

예제 #19

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_single_report_term_count(self):
        """Check two entries of the term counts of a fixed report.

        The summary term count must hold 'calcul' once and the description
        term count must hold 'test' twice.
        """
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary = 'This is a test of calculation for single report term count.'
        description = 'This is the description of the test report. Just a test.'
        term_counts = IRTermCount.calculate_term_count(summary, description)
        summary_bow, description_bow = term_counts
        calcul_count = summary_bow['calcul']
        assert calcul_count == 1
        test_count = description_bow['test']
        assert test_count == 2

예제 #20

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_single_report_term_count(self):
        """Verify calculate_term_count on one hand-written report.

        Expected: 'calcul' is counted once in the summary and 'test' twice
        in the description.
        """
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary_text = \
            'This is a test of calculation for single report term count.'
        description_text = \
            'This is the description of the test report. Just a test.'
        summary_counts, description_counts = \
            IRTermCount.calculate_term_count(summary_text, description_text)
        assert 1 == summary_counts['calcul']
        assert 2 == description_counts['test']

예제 #21

0

파일 보기

    def batch_generate_document_count(cls):
        """Batch calculate term count over documents.
        Input is from mongodb, termcount collection.

        For every term found in the records of IRTermCount.get_iterator({}),
        counts in how many records it appears in the summary field and in
        how many it appears in the description field. One document per term
        is then written to the collection configured as
        'bug_documentcount_collection_name', and an ascending index on the
        term field is created.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_term_count import IRTermCount

        term_name = IRConfig.get_instance().get('bug_term_name')
        summary_name = IRConfig.get_instance().get('bug_summary_name')
        description_name = IRConfig.get_instance().get('bug_description_name')
        # Calculate document count and stored in document_count
        # (maps term -> the document that will be written for that term)
        document_count = {}

        def count_terms(terms, field_name):
            # Add one to the field_name counter of every term in terms,
            # creating the zeroed entry on first sight of a term.
            for term in terms:
                if term not in document_count:
                    document_count[term] = {
                        term_name: term,
                        summary_name: 0,
                        description_name: 0
                    }
                document_count[term][field_name] += 1

        def iter_term_count(bug):
            count_terms(bug[summary_name], summary_name)
            count_terms(bug[description_name], description_name)

        IRProgressBar.execute_iteration_for_cursor(
            IRTermCount.get_iterator({}), iter_term_count,
            "Counting Document Count")
        # Write to db
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'w')

        def write_to_mongo(term):
            documentcount_collection.insert(document_count[term])

        IRProgressBar.execute_iteration_for_dict(document_count,
                                                 write_to_mongo,
                                                 "Write to database")
        # The documents written above carry the term field but no bug id
        # field, so the index is built on the term field; an index on the
        # bug id field would cover no document of this collection.
        documentcount_collection.create_index([(term_name,
                                                IRCollection.ASCENDING)])
        documentcount_collection.close()

예제 #22

0

파일 보기

파일: ir_query.py 프로젝트: LeonXJ/Intereport

    def query(cls, summary, description, top_n):
        """Look up the entries most similar to a summary/description pair.

        The texts are converted to term counts with
        IRTermCount.calculate_term_count, the term counts to TFIDF with
        IRTFIDF.calculate_tfidf_for_report_termcount, and the TFIDF values
        are handed to IRTFIDF.get_top_n_similarity_over_all together with
        top_n.

        Args:
            summary: summary text of the query.
            description: description text of the query.
            top_n: passed through to get_top_n_similarity_over_all.

        Returns:
            Whatever IRTFIDF.get_top_n_similarity_over_all returns.
        """

        from ir_term_count import IRTermCount
        from ir_tfidf import IRTFIDF
        term_counts = IRTermCount.calculate_term_count(summary, description)
        summary_bow, description_bow = term_counts
        tfidf_values = IRTFIDF.calculate_tfidf_for_report_termcount(
            summary_bow, description_bow)
        summary_tfidf, description_tfidf = tfidf_values
        return IRTFIDF.get_top_n_similarity_over_all(
            summary_tfidf, description_tfidf, top_n)

예제 #23

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_stemming(self):
        """Log how each configured stemmer stems a few sample words.

        For every word, the 'stemmer' config value is set to 'porter',
        'lancaster' and 'snowball' in turn, the word is stemmed with
        IRTermCount.do_stemming, and one line listing all three results is
        written through IRLog. Nothing is asserted.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        words = ['discrimination', 'disgusting', 'visualization', 'configuration']
        stemmer_names = ['porter', 'lancaster', 'snowball']

        def stem_with(stemmer_name, word):
            # switch the stemmer, stem the word, label the result
            IRConfig.get_instance().set('stemmer', stemmer_name)
            stemmed = IRTermCount.do_stemming([word])
            return ':'.join([stemmer_name, stemmed[0]])

        for word in words:
            labelled = [stem_with(name, word) for name in stemmer_names]
            IRLog.get_instance().println('%s > %s' % (word, ', '.join(labelled)))

예제 #24

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_stemming(self):
        """Print the porter, lancaster and snowball stems of sample words.

        Each stemmer is selected by writing its name to the 'stemmer'
        config value before IRTermCount.do_stemming is called. The output
        line has the form '<word> > <stemmer>:<stem>, ...'. There are no
        assertions.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        stemmers = ('porter', 'lancaster', 'snowball')
        for sample in ('discrimination', 'disgusting', 'visualization',
                       'configuration'):
            results = []
            for stemmer in stemmers:
                IRConfig.get_instance().set('stemmer', stemmer)
                first_stem = IRTermCount.do_stemming([sample])[0]
                results.append(':'.join((stemmer, first_stem)))
            summary_line = '%s > %s' % (sample, ', '.join(results))
            IRLog.get_instance().println(summary_line)

예제 #25

0

파일 보기

파일: ir_report.py 프로젝트: LeonXJ/Intereport

 def get_summary_and_description_termcount(self):
     """Return the summary and description term counts of this report.

     Without a bug id the term counts held by the report are returned as
     a list, after computing them from the text if either one is None.
     With a bug id the held term counts are returned as a list only when
     caching is allowed and both are set; otherwise they are fetched with
     IRTermCount.get_termcount_of_bug, stored when caching is allowed,
     and returned as a tuple.

     Returns:
         [summary termcount, description termcount] or the equivalent
         tuple, depending on the path taken.
     """
     if self.__bug_id is not None:
         cache_usable = self.__allow_cache and not (
             self.__summary_termcount is None or
             self.__description_termcount is None)
         if not cache_usable:
             from ir_term_count import IRTermCount
             fetched = IRTermCount.get_termcount_of_bug(self.__bug_id)
             summary, description = fetched
             if self.__allow_cache:
                 self.__summary_termcount, self.__description_termcount = \
                     summary, description
             return summary, description
     elif not (self.__summary_termcount is not None and
               self.__description_termcount is not None):
         self.__update_summary_and_description_termcount_from_text()
     return [self.__summary_termcount, self.__description_termcount]

예제 #26

0

파일 보기

파일: ir_report.py 프로젝트: LeonXJ/Intereport

 def get_summary_and_description_termcount(self):
     """Get the term counts of the summary and the description.

     Reports without a bug id use their own term counts, computing them
     from the text first when one of them is None. Reports with a bug id
     use their own term counts only if caching is allowed and both are
     present; otherwise IRTermCount.get_termcount_of_bug is asked and,
     when caching is allowed, its result is remembered.

     Returns:
         A list of the two held term counts, or a tuple of the two freshly
         fetched term counts.
     """
     if self.__bug_id is None:
         incomplete = self.__summary_termcount is None or \
             self.__description_termcount is None
         if incomplete:
             self.__update_summary_and_description_termcount_from_text()
         return [self.__summary_termcount, self.__description_termcount]

     if self.__allow_cache:
         if self.__summary_termcount is not None and \
                 self.__description_termcount is not None:
             return [self.__summary_termcount, self.__description_termcount]

     from ir_term_count import IRTermCount
     summary, description = IRTermCount.get_termcount_of_bug(self.__bug_id)
     if self.__allow_cache:
         self.__summary_termcount = summary
         self.__description_termcount = description
     return summary, description

예제 #27

0

파일 보기

    def batch_generate_tfidf(cls):
        """Batch calculate TFIDF.

        Walks the cursor returned by IRTermCount.get_iterator(); for each
        record the summary and description TFIDF are computed with
        cls.calculate_tfidf and stored, next to the bug id, in the
        collection configured as 'bug_tfidf_collection_name'. The
        collection gets an ascending index on the bug id field and is
        closed at the end.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_document_count import IRDocumentCount
        from ir_term_count import IRTermCount
        # get config
        bug_id_name = IRConfig.get_instance().get('bug_id_name')
        summary_name = IRConfig.get_instance().get('bug_summary_name')
        description_name = IRConfig.get_instance().get('bug_description_name')
        tfidf_algorithm = IRConfig.get_instance().get('tfidf_algorithm')
        # prepare collections
        IRDocumentCount.cache_all_data()
        output_collection = IRCollection('bug_db_name',
                                         'bug_tfidf_collection_name', 'w')
        # batch calculate tfidf
        cursor = IRTermCount.get_iterator()
        total_bugs = cursor.count()

        def write_tfidf_of(bug):
            # compute both values before the output document is assembled
            summary_value = cls.calculate_tfidf(
                bug[summary_name], summary_name, total_bugs, None,
                tfidf_algorithm)
            description_value = cls.calculate_tfidf(
                bug[description_name], description_name, total_bugs, None,
                tfidf_algorithm)
            document = {
                bug_id_name: bug[bug_id_name],
                summary_name: summary_value,
                description_name: description_value
            }
            output_collection.insert(document)

        IRProgressBar.execute_iteration_for_cursor(cursor, write_tfidf_of,
                                                   "Calculating TFIDF")
        output_collection.create_index(
            [(bug_id_name, IRCollection.ASCENDING)])
        output_collection.close()

예제 #28

0

파일 보기

파일: ir_document_count.py 프로젝트: LeonXJ/Intereport

    def batch_generate_document_count(cls):
        """Batch calculate term count over documents.
        Input is from mongodb, termcount collection.

        Every record of IRTermCount.get_iterator({}) adds one to the
        summary counter of each term in its summary field and one to the
        description counter of each term in its description field. The
        per-term documents are written to the collection configured as
        'bug_documentcount_collection_name', which is then indexed
        (ascending) on the term field and closed.
        """

        from ir_log import IRProgressBar
        from ir_config import IRConfig
        from ir_mongodb_helper import IRCollection
        from ir_term_count import IRTermCount

        term_name = IRConfig.get_instance().get('bug_term_name')
        summary_name = IRConfig.get_instance().get('bug_summary_name')
        description_name = IRConfig.get_instance().get('bug_description_name')
        # Calculate document count and stored in document_count
        document_count = {}

        def entry_for(term):
            # Return the document of term, creating it with zeroed counters
            # the first time the term is seen.
            if term not in document_count:
                document_count[term] = {term_name: term, summary_name: 0,
                                        description_name: 0}
            return document_count[term]

        def iter_term_count(bug):
            for field_name in (summary_name, description_name):
                for term in bug[field_name]:
                    entry_for(term)[field_name] += 1

        IRProgressBar.execute_iteration_for_cursor(IRTermCount.get_iterator({}),
                                                   iter_term_count, "Counting Document Count")
        # Write to db
        documentcount_collection = IRCollection(
            'bug_db_name', 'bug_documentcount_collection_name', 'w')

        def write_to_mongo(term):
            documentcount_collection.insert(document_count[term])

        IRProgressBar.execute_iteration_for_dict(document_count, write_to_mongo,
                                                 "Write to database")
        # Index the term field: the documents inserted above contain the
        # term, summary and description fields only, so an index on the bug
        # id field would not cover any of them.
        documentcount_collection.create_index([(term_name, IRCollection.ASCENDING)])
        documentcount_collection.close()

예제 #29

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_create_incomplete_report(self):
        """Print a report and its incomplete variant for visual comparison.

        The term counts of the original texts and of the texts returned by
        IRTermCount.create_incomplete_report (called with 0.4 as third
        argument) are computed and written to the log together with the
        texts themselves. The test contains no assertions.
        """
        from ir_log import IRLog
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        def log_line(text):
            # every line goes through the IRLog instance
            IRLog.get_instance().println(text)

        IRConfig.get_instance().load('../data/test/bug_test.cfg')
        summary = 'This is a test of calculation for single report term count.'
        description = 'This is the description of the test report. Just a test.'
        original_counts = IRTermCount.calculate_term_count(summary, description)
        summary_BoW, description_BoW = original_counts
        incomplete_texts = \
            IRTermCount.create_incomplete_report(summary, description, 0.4)
        inc_summary, inc_description = incomplete_texts
        incomplete_counts = \
            IRTermCount.calculate_term_count(inc_summary, inc_description)
        inc_summary_bow, inc_description_bow = incomplete_counts

        log_line('Original Summary: %s' % (summary))
        log_line('Original Description: %s' % (description))
        log_line('Incomplete Summary: %s' % (inc_summary))
        log_line('Incomplete Description: %s' % (inc_description))
        log_line('Compare original BoW with incomplete BoW')

        column_format = '%16s\t%8s\t%8s'
        log_line(column_format % ('Summary', 'Ori', 'Inc'))
        IRTermCount.show_dict_compare(summary_BoW, inc_summary_bow)
        log_line(column_format % ('Description', 'Ori', 'Inc'))
        IRTermCount.show_dict_compare(description_BoW, inc_description_bow)

예제 #30

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_cache_all(self):
        """Load the test configuration, then cache all term-count data."""
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        # The configuration is loaded before caching is requested.
        config = IRConfig.get_instance()
        config.load('../data/test/bug_test.cfg')

        IRTermCount.cache_all_data()

예제 #31

0

파일 보기

파일: test_ir_term_count.py 프로젝트: LeonXJ/Intereport

    def test_cache_all(self):
        """Load the test configuration, then cache all term-count data."""
        from ir_config import IRConfig
        from ir_term_count import IRTermCount

        # The configuration is loaded before caching is requested.
        config = IRConfig.get_instance()
        config.load('../data/test/bug_test.cfg')

        IRTermCount.cache_all_data()

예제 #32

0

파일 보기

파일: ir_recommender.py 프로젝트: LeonXJ/Intereport

    def start_shell(cls):
        """Run an interactive shell that builds a report and recommends on it.

        TF-IDF and document-count data are cached first; commands are then
        read from standard input until ``exit`` is entered:

        * ``new``  -- create a report from a date, product, summary and
          description typed by the user
        * ``do``   -- run ``cls.do_recommend`` on the current report
        * ``ls``   -- print the current report
        * ``ad``   -- append text to the current report's description
        * ``ap``   -- append comma-separated penalty terms (after stemming)
        * ``sd``   -- set the dummy bug id (a positive integer)
        * ``help`` -- show the help text (also shown for unknown commands)

        ``do``, ``ls``, ``ad``, ``ap`` and ``sd`` report an error when no
        report has been created with ``new`` yet.
        """
        import time

        from ir_log import IRLog
        from ir_tfidf import IRTFIDF
        from ir_document_count import IRDocumentCount
        from ir_report import IRReport

        IRLog.get_instance().println("Starting Intereport...")
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()
        IRLog.get_instance().println("Intereport Started. Waiting for input")

        new_report = None
        while 1:
            cmd = raw_input("Input command:").strip()
            if cmd == 'exit':
                IRLog.get_instance().println('Exiting')
                break
            elif cmd == 'new':
                IRLog.get_instance().println('Creating New Report')
                cur_time = -1
                while cur_time < 0:
                    try:
                        cur_time = int(time.mktime(time.strptime(
                            raw_input("Input Time (e.g., 2011-05-05): "),
                            '%Y-%m-%d')))
                    except (ValueError, OverflowError):
                        # Malformed or out-of-range date: ask again.  Only
                        # these errors are retried so that Ctrl-C or a closed
                        # stdin still terminates the shell instead of looping.
                        cur_time = -1
                product = raw_input("Input Product: ")
                summary = raw_input("Summary: ")
                raw_description = raw_input("Description:\n")
                # The two trailing empty fields are left blank on purpose;
                # their meaning is defined by IRReport.from_string.
                new_report = IRReport.from_string(IRReport.separator.join([
                    str(cur_time), product.lower(), summary, raw_description,
                    '', '']))
                cls.__print_report(new_report)
            elif cmd == 'do':
                IRLog.get_instance().println('Do Recommending')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    cls.do_recommend(new_report)
            elif cmd == 'ls':
                IRLog.get_instance().println('Show Current Report')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    cls.__print_report(new_report)
            elif cmd == 'ad':
                IRLog.get_instance().println('Appending Description')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    append_description = raw_input("Append Description:\n")
                    description = ' '.join([new_report.get_description_text(),
                                            append_description])
                    # Build a fresh report with the longer description and
                    # copy the remaining fields over from the current one.
                    dummy_report = IRReport(new_report.get_summary_text(),
                                            description)
                    dummy_report.set_stacktrace(new_report.get_stacktrace())
                    dummy_report.set_basic_info(new_report.get_create_ts(),
                                                new_report.get_product())
                    dummy_report.set_penalty_terms(new_report.get_penalty_terms())
                    dummy_report.set_dummy_bug_id(new_report.get_dummy_bug_id())
                    new_report = dummy_report
                    IRLog.get_instance().println('Description: %s' % description)
            elif cmd == 'ap':
                IRLog.get_instance().println('Appending Penalties')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    raw = []
                    while not raw:
                        entered = raw_input('Input Penalties (split by \',\'):')
                        # ''.split(',') yields [''], so blank entries have to
                        # be dropped explicitly for the prompt to repeat on
                        # empty input.
                        raw = [term for term in entered.split(',')
                               if term.strip()]
                    from ir_term_count import IRTermCount
                    penalty = new_report.get_penalty_terms()
                    if penalty is None:
                        penalty = []
                    penalty += IRTermCount.do_stemming(raw)
                    new_report.set_penalty_terms(penalty)
                    IRLog.get_instance().println('Penalties: %s' % \
                                                     (', '.join(penalty)))
            elif cmd == 'sd':
                IRLog.get_instance().println('Set Dummy Bug ID')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    bug_id = -1
                    while bug_id <= 0:
                        try:
                            bug_id = int(raw_input('Dummy Bug ID: '))
                        except ValueError:
                            # Not an integer: ask again.  Ctrl-C / EOF are
                            # deliberately not caught here.
                            bug_id = -1
                    new_report.set_dummy_bug_id(bug_id)
                    IRLog.get_instance().println('Dummy Bug ID: %d' % bug_id)
            elif cmd == 'help':
                cls.__show_help()
            else:
                IRLog.get_instance().println('Error! Unknown command: %s' \
                                                % cmd)
                cls.__show_help()
        # end of while 1
        IRLog.get_instance().println("Bye")

예제 #33

0

파일 보기

파일: ir_recommender.py 프로젝트: LeonXJ/Intereport

    # Dispatch on the run mode given as the second command-line argument:
    #   file -- load report <argv[4]> from test file <argv[3]>, optionally
    #           with comma-separated penalty terms in <argv[5]>
    #   text -- build the report from the serialized string in <argv[3]>
    #   inte -- start the interactive shell and exit afterwards
    mode = sys.argv[2]

    new_report = None
    if mode == 'file':
        test_file = sys.argv[3]
        bug_id = int(sys.argv[4])
        from ir_sim_bug_evaluator import IRSimBugEvaluator
        new_report = IRSimBugEvaluator.get_report_from_test_file(test_file, bug_id)
        if new_report is None:
            IRLog.get_instance().println('Error! Cannot find report %d in %s' % \
                    (bug_id, test_file))
        elif len(sys.argv) > 5:
            from ir_term_count import IRTermCount
            # The penalty terms are the argument after the bug id; argv[4]
            # is the bug id itself and must not be parsed as terms.
            penalty_terms_raw = sys.argv[5].split(',')
            penalty_terms = set(IRTermCount.do_stemming(penalty_terms_raw))
            IRLog.get_instance().println('%d penalty terms: %s:' \
                % (len(penalty_terms), ','.join(penalty_terms)))
            new_report.set_penalty_terms(penalty_terms)
    elif mode == 'text':
        text = sys.argv[3]
        new_report = IRReport.from_string(text)
    elif mode == 'inte':
        IRRecommender.start_shell()
        exit()
    else:
        IRLog.get_instance().println('Error! Unknown mode %s' % mode)
    # Warm the caches before continuing.
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()

예제 #34

0

파일 보기

 def get_termcount(self):
     """Return the term count of this object's text, computing it once.

     The first call stores the result of ``IRTermCount.get_bow`` for
     ``self.get_text()``; later calls return the stored value.
     """
     # Fast path: the value has already been computed.
     if self.__termcount is not None:
         return self.__termcount

     from ir_term_count import IRTermCount
     text = self.get_text()
     self.__termcount = IRTermCount.get_bow(text, True)
     return self.__termcount

예제 #35

0

파일 보기

파일: ir_recommender.py 프로젝트: LeonXJ/Intereport

    def start_shell(cls):
        """Run an interactive shell that builds a report and recommends on it.

        TF-IDF and document-count data are cached first; commands are then
        read from standard input until ``exit`` is entered:

        * ``new``  -- create a report from a date, product, summary and
          description typed by the user
        * ``do``   -- run ``cls.do_recommend`` on the current report
        * ``ls``   -- print the current report
        * ``ad``   -- append text to the current report's description
        * ``ap``   -- append comma-separated penalty terms (after stemming)
        * ``sd``   -- set the dummy bug id (a positive integer)
        * ``help`` -- show the help text (also shown for unknown commands)

        ``do``, ``ls``, ``ad``, ``ap`` and ``sd`` report an error when no
        report has been created with ``new`` yet.
        """
        import time

        from ir_log import IRLog
        from ir_tfidf import IRTFIDF
        from ir_document_count import IRDocumentCount
        from ir_report import IRReport

        IRLog.get_instance().println("Starting Intereport...")
        IRTFIDF.cache_all_data()
        IRDocumentCount.cache_all_data()
        IRLog.get_instance().println("Intereport Started. Waiting for input")

        new_report = None
        while 1:
            cmd = raw_input("Input command:").strip()
            if cmd == 'exit':
                IRLog.get_instance().println('Exiting')
                break
            elif cmd == 'new':
                IRLog.get_instance().println('Creating New Report')
                cur_time = -1
                while cur_time < 0:
                    try:
                        cur_time = int(
                            time.mktime(
                                time.strptime(
                                    raw_input(
                                        "Input Time (e.g., 2011-05-05): "),
                                    '%Y-%m-%d')))
                    except (ValueError, OverflowError):
                        # Malformed or out-of-range date: ask again.  Only
                        # these errors are retried so that Ctrl-C or a closed
                        # stdin still terminates the shell instead of looping.
                        cur_time = -1
                product = raw_input("Input Product: ")
                summary = raw_input("Summary: ")
                raw_description = raw_input("Description:\n")
                # The two trailing empty fields are left blank on purpose;
                # their meaning is defined by IRReport.from_string.
                new_report = IRReport.from_string(
                    IRReport.separator.join([
                        str(cur_time),
                        product.lower(), summary, raw_description, '', ''
                    ]))
                cls.__print_report(new_report)
            elif cmd == 'do':
                IRLog.get_instance().println('Do Recommending')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    cls.do_recommend(new_report)
            elif cmd == 'ls':
                IRLog.get_instance().println('Show Current Report')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    cls.__print_report(new_report)
            elif cmd == 'ad':
                IRLog.get_instance().println('Appending Description')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    append_description = raw_input("Append Description:\n")
                    description = ' '.join([
                        new_report.get_description_text(), append_description
                    ])
                    # Build a fresh report with the longer description and
                    # copy the remaining fields over from the current one.
                    dummy_report = IRReport(new_report.get_summary_text(),
                                            description)
                    dummy_report.set_stacktrace(new_report.get_stacktrace())
                    dummy_report.set_basic_info(new_report.get_create_ts(),
                                                new_report.get_product())
                    dummy_report.set_penalty_terms(
                        new_report.get_penalty_terms())
                    dummy_report.set_dummy_bug_id(
                        new_report.get_dummy_bug_id())
                    new_report = dummy_report
                    IRLog.get_instance().println('Description: %s' %
                                                 description)
            elif cmd == 'ap':
                IRLog.get_instance().println('Appending Penalties')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    raw = []
                    while not raw:
                        entered = raw_input(
                            'Input Penalties (split by \',\'):')
                        # ''.split(',') yields [''], so blank entries have to
                        # be dropped explicitly for the prompt to repeat on
                        # empty input.
                        raw = [term for term in entered.split(',')
                               if term.strip()]
                    from ir_term_count import IRTermCount
                    penalty = new_report.get_penalty_terms()
                    if penalty is None:
                        penalty = []
                    penalty += IRTermCount.do_stemming(raw)
                    new_report.set_penalty_terms(penalty)
                    IRLog.get_instance().println('Penalties: %s' % \
                                                     (', '.join(penalty)))
            elif cmd == 'sd':
                IRLog.get_instance().println('Set Dummy Bug ID')
                if new_report is None:
                    IRLog.get_instance().println('Error! Please create '
                                                 'report first.')
                else:
                    bug_id = -1
                    while bug_id <= 0:
                        try:
                            bug_id = int(raw_input('Dummy Bug ID: '))
                        except ValueError:
                            # Not an integer: ask again.  Ctrl-C / EOF are
                            # deliberately not caught here.
                            bug_id = -1
                    new_report.set_dummy_bug_id(bug_id)
                    IRLog.get_instance().println('Dummy Bug ID: %d' % bug_id)
            elif cmd == 'help':
                cls.__show_help()
            else:
                IRLog.get_instance().println('Error! Unknown command: %s' \
                                                % cmd)
                cls.__show_help()
        # end of while 1
        IRLog.get_instance().println("Bye")

예제 #36

0

파일 보기

파일: ir_recommender.py 프로젝트: LeonXJ/Intereport

    # Dispatch on the run mode:
    #   file -- load report <argv[4]> from test file <argv[3]>, optionally
    #           with comma-separated penalty terms in <argv[5]>
    #   text -- build the report from the serialized string in <argv[3]>
    #   inte -- start the interactive shell and exit afterwards
    new_report = None
    if mode == 'file':
        test_file = sys.argv[3]
        bug_id = int(sys.argv[4])
        from ir_sim_bug_evaluator import IRSimBugEvaluator
        new_report = IRSimBugEvaluator.get_report_from_test_file(
            test_file, bug_id)
        if new_report is None:
            IRLog.get_instance().println('Error! Cannot find report %d in %s' % \
                    (bug_id, test_file))
        elif len(sys.argv) > 5:
            from ir_term_count import IRTermCount
            # The penalty terms are the argument after the bug id; argv[4]
            # is the bug id itself and must not be parsed as terms.
            penalty_terms_raw = sys.argv[5].split(',')
            penalty_terms = set(IRTermCount.do_stemming(penalty_terms_raw))
            IRLog.get_instance().println('%d penalty terms: %s:' \
                % (len(penalty_terms), ','.join(penalty_terms)))
            new_report.set_penalty_terms(penalty_terms)
    elif mode == 'text':
        text = sys.argv[3]
        new_report = IRReport.from_string(text)
    elif mode == 'inte':
        IRRecommender.start_shell()
        exit()
    else:
        IRLog.get_instance().println('Error! Unknown mode %s' % mode)
    # Warm the caches before continuing.
    from ir_tfidf import IRTFIDF
    from ir_document_count import IRDocumentCount
    IRTFIDF.cache_all_data()
    IRDocumentCount.cache_all_data()