Пример #1
0
class WordExtractWithStopwordTestCase(unittest.TestCase):
    """Smoke test of TF-IDF scoring with a stop-word list given to the engine.

    The test makes no assertions: it prints the computed scores and fails
    only if the engine raises.
    """

    def setUp(self):
        """Build the engine with STOPLIST, load its record file, set the sample data."""
        # segment_function is presumably the mmseg-based segmenter (per the
        # original note) -- confirm against its definition.
        self.we = WeightEngine(segment_function, stopwordList=STOPLIST)
        self.we.load_record("Hugerecord_save.dat")
        # Term whose individual score is looked up in the sample text below.
        self.term = u"基础"
        self.text = u"""
你说我在休假期间,大半夜哄完孩子来看代码也不容易。为什么要恶心我呢?
貌似GIT练习的强度不太够了,应该根据大家反馈搞一个新版本出来。不过今天要说的是注释的写法。
大家可以参看这个链接:http://www.kernel.org/pub/linux/kernel/v3.0/ChangeLog-3.3.3

要求带人的同学仔细看看,并且回复邮件说明我们和上面的之间的差距。这些同学有@克毅 @雄飞 @贺贺。

其他人仔细阅读,有心得的回复邮件参与讨论;没有心得的等我回去挨收拾吧。

再强调一遍,GIT是我们研发工作的基础,就好比我们说的话一样,都要说普通话,基础一样才能沟通交流。否则别的都是白扯。"""

    def tearDown(self):
        """No per-test cleanup is performed."""

    def teststop_words(self):
        # Show the score table for the whole sample text first ...
        show_dict(self.we.tf_idf_dict(self.text))
        # ... then the score of the single term of interest.
        term_score = self.we.tf_idf(self.term, self.text)
        # Single-argument parenthesised print: same output as the Python 2
        # print statement, and also valid syntax on Python 3.
        print("%s %s" % (self.term, term_score))
Пример #2
0
class WordExtractTestCase2(unittest.TestCase):
    """Exercise WeightEngine TF-IDF scoring without a stop-word list.

    Apart from ``test_stop_words_filter`` these are smoke tests: they print
    the computed scores and fail only if the engine raises.
    """

    def setUp(self):
        """Build the engine, load its record file and set the sample text/term."""
        # segment_function is presumably the mmseg-based segmenter (per the
        # original note) -- confirm against its definition.
        self.we = WeightEngine(segment_function)
        self.we.load_record("Hugerecord_save.dat")
        self.text = u"这里要说的是,转义虽然对前端展示的时候带来了好处,但是确带来了数据的不一致性。"
        self.term = u"这里"

    def tearDown(self):
        """No per-test cleanup is performed."""

    def test_extract_tags(self):
        """Print the TF-IDF table of the sample text and the score of ``self.term``.

        TODO: find a way to assert on these results instead of printing them.
        """
        scores = self.we.tf_idf_dict(self.text)
        show_dict(scores)
        term_score = self.we.tf_idf(self.term, self.text)
        # Single-argument parenthesised print: same output as the Python 2
        # print statement, and also valid syntax on Python 3.
        print("%s %s" % (self.term, term_score))

    def test_stop_words_filter(self):
        """Filtering ``aa`` out of ``[aa, bb]`` must leave exactly ``[bb]``."""
        words = [u"aa", u"bb"]
        # Compare the kept words themselves.  Wrapping the call in a list
        # literal would nest a list result ([[u"bb"]]) and could never equal
        # [u"bb"]; list() also accepts a generator or other iterable.
        # NOTE(review): assumes stop_words_filter returns an iterable of the
        # kept words -- confirm against its definition.
        self.assertEqual(list(stop_words_filter(words, [u"aa"])), [u"bb"])

    def test_integrated_tf_idf(self):
        """Print the ranking produced by every TF/DF weighting combination.

        For each pair of a term-frequency and a document-frequency function
        of the engine, the sample text is scored and sorted, the full ranking
        is shown with ``showTable``, and the five leading words are collected
        into one summary table printed at the end with ``col_printtable``.
        """
        tf_functions = [self.we.tf, self.we.log_tf, self.we.a_tf,
                        self.we.b_tf, self.we.L_tf]
        df_functions = [self.we.n_df, self.we.idf, self.we.prob_idf]
        summary_rows = []

        for tf in tf_functions:
            for df in df_functions:
                ranked = sort_dict(self.we.tf_idf_dict(self.text, tf, df))
                head = "%s %s:" % (tf.__name__, df.__name__)
                # Split the (word, weight) pairs into columns once and reuse
                # them for both the detailed table and the summary row.
                words = [entry[0] for entry in ranked]
                weights = [entry[1] for entry in ranked]
                showTable(words, weights, title_name=head)
                summary_rows.append([head] + words[:5])
        col_printtable(summary_rows)