예제 #1
0
    def get_frac_and_str_tokens_in_gs(gs_txt):
        """
        For each block, determine which and what fraction of its tokens
        also appear in the gold standard text ``gs_txt`` (content or
        comments).

        Relies on closure variables from the enclosing scope:
        ``tokenizer``, ``check_inclusion``, ``blocks``,
        ``all_blocks_tokens``, ``all_blocks_tokens_block_id``, and
        ``num_blocks_tokens``.

        Args:
            gs_txt (str): gold standard text to tokenize and compare against.

        Returns:
            Tuple[List[float], List[str]]: per-block fraction of tokens found
            in the gold standard, and those matched tokens joined by spaces.
        """
        gs_tokens = tokenizer(gs_txt)

        # one boolean per token in all_blocks_tokens: is it in the gold standard?
        tokens_in_gs = check_inclusion(all_blocks_tokens, gs_tokens)
        num_blocks_tokens_in_gs = [0] * len(blocks)
        blocks_tokens_in_gs_tokens = [[] for _ in range(len(blocks))]
        for token, token_in_gs, block_id in zip(all_blocks_tokens,
                                                tokens_in_gs,
                                                all_blocks_tokens_block_id):
            # truthiness check instead of `is True` (PEP 8); any truthy flag counts
            if token_in_gs:
                num_blocks_tokens_in_gs[block_id] += 1
                blocks_tokens_in_gs_tokens[block_id].append(token)

        blocks_tokens_strs_in_gs = [
            ' '.join(matched_tokens)
            for matched_tokens in blocks_tokens_in_gs_tokens
        ]
        # NOTE(review): assumes every block has at least one token —
        # a zero-token block would raise ZeroDivisionError here; confirm upstream.
        frac_blocks_tokens_in_gs = [
            n_in_gs / n_total
            for n_in_gs, n_total in zip(num_blocks_tokens_in_gs,
                                        num_blocks_tokens)
        ]

        return (frac_blocks_tokens_in_gs, blocks_tokens_strs_in_gs)
예제 #2
0
파일: test_lcs.py 프로젝트: vck/dragnet
 def test_check_inclusion(self):
     """Tokens shared with the reference sequence are flagged True, in order."""
     inc = check_inclusion(
         ["some", "words", "here", "the", "football"],
         ["he", "said", "words", "kick", "the", "football"])
     # Bug fix: assertTrue(inc, [...]) treated the expected list as the
     # failure *message* and only checked that `inc` was truthy.
     # assertEqual actually compares the result to the expected flags.
     self.assertEqual(inc, [False, True, False, True, True])
예제 #3
0
    def evaluate(self, training_docs, test_docs):
        """
        Train ``self.model`` on ``training_docs``, then compute token-level
        and LCS-based precision/recall/F1 over ``test_docs``.

        Logs each average at CRITICAL level and appends the averages to
        ``self.precisions``, ``self.recalls``, ``self.f1s`` and their
        ``lcs_``-prefixed counterparts.

        Args:
            training_docs: documents passed to ``self.model.train``.
            test_docs: documents to extract content from and score.
        """
        def harmonic_f1(p, r):
            # 2pr/(p+r), defined as 0.0 when p + r == 0.  The original
            # form 2/(1/p+1/r) raised ZeroDivisionError whenever a
            # precision of 0.0 was recorded (which happens below when
            # no words are extracted).
            return 2.0 * p * r / (p + r) if (p + r) else 0.0

        self.model.train(training_docs)

        precisions = []
        recalls = []
        lcs_precisions = []
        lcs_recalls = []

        for doc in test_docs:
            # TODO: ugly -- the model API should expose a single prediction path
            if self.model.predict_classes():
                classes = self.model.predict(doc)
                extracted_content = doc.extract_article(classes)
            else:
                extracted_content = self.model.predict(doc)
            main_content = doc.get_main_content()

            # NOTE(review): .encode('utf-8').split() yields bytes tokens on
            # Python 3 -- presumably this code targets Python 2; verify.
            e_list = extracted_content.encode('utf-8').split()
            m_list = main_content.encode('utf-8').split()

            # token-level (bag-of-words) comparison
            e_words = set(e_list)
            m_words = set(m_list)
            common_words = e_words.intersection(m_words)
            if not common_words:
                # sentinel keeps |common| == 1 so recall is small but nonzero
                common_words = set(['CANNOT_BELIEVE_THIS'])
                log('WARN: no word predicted accurately for %s' % doc, WARNING)

            if len(e_words):
                precisions.append(1.0 * len(common_words) / len(e_words))
            else:
                precisions.append(0.0)
            # guard against an empty gold standard (previously a ZeroDivisionError)
            recalls.append(1.0 * len(common_words) / len(m_words)
                           if m_words else 0.0)

            # longest-common-subsequence comparison
            flags = check_inclusion(e_list, m_list)
            n_common = sum(flags)
            if n_common == 0:
                n_common = 1
                log('WARN: no common sequence extracted for %s' % doc, WARNING)

            # guard empty extraction / gold standard (previously ZeroDivisionError)
            lcs_precisions.append(1.0 * n_common / len(e_list)
                                  if e_list else 0.0)
            lcs_recalls.append(1.0 * n_common / len(m_list)
                               if m_list else 0.0)

        f_measures = [harmonic_f1(p, r) for p, r in zip(precisions, recalls)]
        avg_prec = Evaluator.average(precisions)
        avg_rec = Evaluator.average(recalls)
        avg_f1 = Evaluator.average(f_measures)
        log('Average prec: %.4f' % avg_prec, CRITICAL)
        log('Average rec:  %.4f' % avg_rec, CRITICAL)
        log('Average F1:   %.4f' % avg_f1, CRITICAL)

        lcs_f_measures = [harmonic_f1(p, r)
                          for p, r in zip(lcs_precisions, lcs_recalls)]
        avg_lcs_prec = Evaluator.average(lcs_precisions)
        avg_lcs_rec = Evaluator.average(lcs_recalls)
        avg_lcs_f1 = Evaluator.average(lcs_f_measures)
        log('Average LCS prec: %.4f' % avg_lcs_prec, CRITICAL)
        log('Average LCS rec:  %.4f' % avg_lcs_rec, CRITICAL)
        log('Average LCS F1:   %.4f' % avg_lcs_f1, CRITICAL)

        self.precisions.append(avg_prec)
        self.recalls.append(avg_rec)
        self.f1s.append(avg_f1)
        self.lcs_precisions.append(avg_lcs_prec)
        self.lcs_recalls.append(avg_lcs_rec)
        self.lcs_f1s.append(avg_lcs_f1)
예제 #4
0
def test_check_inclusion():
    """check_inclusion marks exactly the tokens shared with the reference."""
    extracted = ["some", "words", "here", "the", "football"]
    reference = ["he", "said", "words", "kick", "the", "football"]
    expected = [False, True, False, True, True]
    assert check_inclusion(extracted, reference) == expected
예제 #5
0
 def test_inclusion(self):
     """check_inclusion flags shared tokens against the reference sequence."""
     inc = check_inclusion(["some", "words", "here", "the", "football"],
                           ["he", "said", "words", "kick", "the", "football"])
     # Bug fix: assertTrue(inc, [...]) used the expected list as the failure
     # message and only asserted truthiness; assertEqual performs the
     # intended element-wise comparison.
     self.assertEqual(inc, [False, True, False, True, True])