def highlight_text(args, parser):
    """Outputs the result of highlighting a text.

    When ``args.ngrams`` is set, an n-gram highlight report is generated
    from the supplied n-gram files (one label per file, optionally with
    a file of n-grams passed as ``minus_ngrams``); otherwise a results
    highlight report is generated from ``args.results``.

    :param args: parsed command-line arguments; this function reads
        ``output``, ``ngrams``, ``label``, ``minus_ngrams``,
        ``base_name`` and ``results`` (and passes the whole object to
        ``utils.get_tokenizer`` and ``utils.get_corpus``)
    :param parser: parser used for reporting problems:
        ``parser.exit(status=3, ...)`` is called if the output
        directory already exists, and ``parser.error(...)`` if the
        number of labels does not match the number of n-gram files

    """
    tokenizer = utils.get_tokenizer(args)
    corpus = utils.get_corpus(args)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        parser.exit(status=3, message='Output directory already exists, '
                    'aborting.\n')
    # Validate the arguments *before* creating the output directory.
    # Otherwise a label/n-gram mismatch leaves an empty directory
    # behind, and the corrected re-run is rejected by the check above.
    if args.ngrams and (args.label is None or
                        len(args.label) != len(args.ngrams)):
        parser.error('There must be as many labels as there are files '
                     'of n-grams')
    # exist_ok tolerates the directory appearing between the check above
    # and this call.
    os.makedirs(output_dir, exist_ok=True)
    if args.ngrams:
        report = tacl.NgramHighlightReport(corpus, tokenizer)
        # One collection of n-grams per input file, in the same order
        # as args.label.
        ngrams = [utils.get_ngrams(ngram_file) for ngram_file in args.ngrams]
        minus_ngrams = []
        if args.minus_ngrams:
            minus_ngrams = utils.get_ngrams(args.minus_ngrams)
        report.generate(args.output, args.base_name, ngrams, args.label,
                        minus_ngrams)
    else:
        report = tacl.ResultsHighlightReport(corpus, tokenizer)
        report.generate(args.output, args.base_name, args.results)
def test_prepare_text_cbeta(self):
    # _prepare_text, using the fixture tokenizer, is expected to wrap
    # each token (including the bracketed '[火*因]') in its own <span>
    # and to keep the whitespace run '\n\n ' as-is. The '<' in the
    # source does not appear in the expected output.
    source = '無[火*因]是<物即\n\n 同如'
    wanted = ''.join([
        '<span>無</span>',
        '<span>[火*因]</span>',
        '<span>是</span>',
        '<span>物</span>',
        '<span>即</span>',
        '\n\n ',
        '<span>同</span>',
        '<span>如</span>',
    ])
    highlighter = tacl.NgramHighlightReport(None, self._tokenizer)
    produced = highlighter._prepare_text(source)
    self.assertEqual(produced, wanted)
def test_prepare_text_pagel(self):
    # Same check as for the default tokenizer, but with a tokenizer
    # built from the Pagel pattern and joiner constants: each token is
    # expected in its own <span>, with the spaces, the '|' and the
    # newlines between tokens left in place.
    pagel_tokenizer = tacl.Tokenizer(
        tacl.constants.TOKENIZER_PATTERN_PAGEL,
        tacl.constants.TOKENIZER_JOINER_PAGEL,
    )
    highlighter = tacl.NgramHighlightReport(None, pagel_tokenizer)
    source = "'dzin dang | snang\n \nba'i"
    wanted = ''.join([
        "<span>'dzin</span>",
        " ",
        "<span>dang</span>",
        " | ",
        "<span>snang</span>",
        "\n \n",
        "<span>ba'i</span>",
    ])
    produced = highlighter._prepare_text(source)
    self.assertEqual(produced, wanted)
def test_highlight(self):
    # With the third argument True, the spans making up the n-gram
    # '無[火*因]是' are expected to gain class="highlight1", even though
    # a '。' sits between two of them in the prepared text.
    prepared = ''.join([
        '<span>火</span>',
        '<span>無</span>',
        '<span>[火*因]</span>',
        '。',
        '<span>是</span>',
        '<span>故</span>',
        '<span>顯</span>',
        '<span>物</span>',
    ])
    wanted = ''.join([
        '<span>火</span>',
        '<span class="highlight1">無</span>',
        '<span class="highlight1">[火*因]</span>',
        '。',
        '<span class="highlight1">是</span>',
        '<span>故</span>',
        '<span>顯</span>',
        '<span>物</span>',
    ])
    highlighter = tacl.NgramHighlightReport(None, self._tokenizer)
    produced = highlighter._highlight(prepared, ['無[火*因]是'], True)
    self.assertEqual(produced, wanted)
def test_highlight_minus(self):
    # With the third argument False and the n-gram '火顯', the '火'
    # directly before '顯' is expected to lose its highlight class,
    # while every other span (including the other highlighted '火'
    # occurrences) stays exactly as it was.
    already_highlighted = ''.join([
        '<span class="highlight2">火</span>',
        '<span class="highlight">無</span>',
        '<span class="highlight1">[火*因]</span>',
        '。',
        '<span class="highlight1">是</span>',
        '<span>故</span>',
        '<span class="highlight2">火</span>',
        '<span>顯</span>',
        '<span>物</span>',
        '<span class="highlight2">火</span>',
    ])
    wanted = ''.join([
        '<span class="highlight2">火</span>',
        '<span class="highlight">無</span>',
        '<span class="highlight1">[火*因]</span>',
        '。',
        '<span class="highlight1">是</span>',
        '<span>故</span>',
        '<span>火</span>',
        '<span>顯</span>',
        '<span>物</span>',
        '<span class="highlight2">火</span>',
    ])
    highlighter = tacl.NgramHighlightReport(None, self._tokenizer)
    produced = highlighter._highlight(already_highlighted, ['火顯'], False)
    self.assertEqual(produced, wanted)