def get_frac_and_str_tokens_in_gs(gs_txt):
    """For each block, determine which tokens also appear in the gold
    standard text ``gs_txt`` (content or comments) and what fraction of
    the block's tokens they represent.

    Args:
        gs_txt: gold-standard text, tokenized with the enclosing scope's
            ``tokenizer``.

    Returns:
        Tuple of (List[float], List[str]): per-block fraction of tokens
        found in the gold standard, and the matching tokens joined with
        spaces.
    """
    gs_tokens = tokenizer(gs_txt)
    inclusion_flags = check_inclusion(all_blocks_tokens, gs_tokens)

    # Tally matches per block, keeping matched tokens in original order.
    match_counts = [0] * len(blocks)
    matched_tokens = [[] for _ in blocks]
    for tok, included, blk_id in zip(all_blocks_tokens, inclusion_flags,
                                     all_blocks_tokens_block_id):
        if included is True:
            match_counts[blk_id] += 1
            matched_tokens[blk_id].append(tok)

    matched_strs = [' '.join(toks) for toks in matched_tokens]
    fracs = [
        n_matched / n_total
        for n_matched, n_total in zip(match_counts, num_blocks_tokens)
    ]
    return (fracs, matched_strs)
def test_check_inclusion(self):
    """check_inclusion marks which tokens of the first list occur,
    in order, within the second list."""
    inc = check_inclusion(
        ["some", "words", "here", "the", "football"],
        ["he", "said", "words", "kick", "the", "football"])
    # BUG FIX: assertTrue(inc, expected) only checked that `inc` was
    # truthy — the second argument of assertTrue is the failure
    # *message*, not a value to compare against. Use assertEqual so the
    # expected flag list is actually verified.
    self.assertEqual(inc, [False, True, False, True, True])
def evaluate(self, training_docs, test_docs):
    """Train the model on ``training_docs`` and score it on ``test_docs``.

    Computes token-level precision/recall/F1 and common-subsequence (LCS)
    precision/recall/F1 per document, logs the averages, and appends the
    averages to the instance's running metric lists
    (``self.precisions``, ``self.recalls``, ``self.f1s`` and the
    ``lcs_*`` counterparts).
    """

    def f1(p, r):
        # Harmonic mean, defined as 0.0 when both inputs are 0.
        # BUG FIX: the previous 2/(1/p+1/r) form raised
        # ZeroDivisionError whenever a 0.0 precision/recall had been
        # recorded; 2pr/(p+r) is algebraically identical for p, r > 0.
        return 2.0 * p * r / (p + r) if (p + r) else 0.0

    self.model.train(training_docs)
    precisions = []
    recalls = []
    lcs_precisions = []
    lcs_recalls = []
    for doc in test_docs:
        # TODO: ugly — the model API differs depending on whether it
        # predicts per-block classes or the extracted text directly.
        if self.model.predict_classes():
            classes = self.model.predict(doc)
            extracted_content = doc.extract_article(classes)
        else:
            extracted_content = self.model.predict(doc)
        main_content = doc.get_main_content()
        # NOTE(review): tokens are compared as UTF-8 byte strings; both
        # sides are encoded the same way, so comparisons stay consistent.
        e_list = extracted_content.encode('utf-8').split()
        m_list = main_content.encode('utf-8').split()

        # do token-level comparison
        e_words = set(e_list)
        m_words = set(m_list)
        common_words = e_words.intersection(m_words)
        if not common_words:
            # Floor the hit count at one fake token so precision/recall
            # are never both zero for a document with real content.
            common_words = set(['CANNOT_BELIEVE_THIS'])
            log('WARN: no word predicted accurately for %s' % doc, WARNING)
        if len(e_words):
            precisions.append(1.0 * len(common_words) / len(e_words))
        else:
            precisions.append(0.0)
        # BUG FIX: guard against an empty gold-standard document, which
        # previously raised ZeroDivisionError.
        recalls.append(
            1.0 * len(common_words) / len(m_words) if m_words else 0.0)

        # do common subsequence comparison
        flags = check_inclusion(e_list, m_list)
        n_common = sum(flags)
        if n_common == 0:
            n_common = 1
            log('WARN: no common sequence extracted for %s' % doc, WARNING)
        # BUG FIX: empty extracted/main token lists previously crashed
        # these divisions.
        lcs_precisions.append(
            1.0 * n_common / len(e_list) if e_list else 0.0)
        lcs_recalls.append(
            1.0 * n_common / len(m_list) if m_list else 0.0)

    f_measures = [f1(p, r) for p, r in zip(precisions, recalls)]
    avg_prec = Evaluator.average(precisions)
    avg_rec = Evaluator.average(recalls)
    avg_f1 = Evaluator.average(f_measures)
    log('Average prec: %.4f' % avg_prec, CRITICAL)
    log('Average rec: %.4f' % avg_rec, CRITICAL)
    log('Average F1: %.4f' % avg_f1, CRITICAL)

    lcs_f_measures = [f1(p, r) for p, r in zip(lcs_precisions, lcs_recalls)]
    avg_lcs_prec = Evaluator.average(lcs_precisions)
    avg_lcs_rec = Evaluator.average(lcs_recalls)
    avg_lcs_f1 = Evaluator.average(lcs_f_measures)
    log('Average LCS prec: %.4f' % avg_lcs_prec, CRITICAL)
    log('Average LCS rec: %.4f' % avg_lcs_rec, CRITICAL)
    log('Average LCS F1: %.4f' % avg_lcs_f1, CRITICAL)

    self.precisions.append(avg_prec)
    self.recalls.append(avg_rec)
    self.f1s.append(avg_f1)
    self.lcs_precisions.append(avg_lcs_prec)
    self.lcs_recalls.append(avg_lcs_rec)
    self.lcs_f1s.append(avg_lcs_f1)
def test_check_inclusion():
    """check_inclusion flags which tokens of the first list occur,
    in order, within the second list."""
    needles = ["some", "words", "here", "the", "football"]
    haystack = ["he", "said", "words", "kick", "the", "football"]
    expected = [False, True, False, True, True]
    assert check_inclusion(needles, haystack) == expected
def test_inclusion(self):
    """check_inclusion marks which tokens of the first list occur,
    in order, within the second list."""
    inc = check_inclusion(
        ["some", "words", "here", "the", "football"],
        ["he", "said", "words", "kick", "the", "football"])
    # BUG FIX: assertTrue(inc, expected) only checked that `inc` was
    # truthy — the second argument of assertTrue is the failure
    # *message*, not a value to compare against. Use assertEqual so the
    # expected flag list is actually verified.
    self.assertEqual(inc, [False, True, False, True, True])