Пример #1
0
def highlight_text(args, parser):
    """Generate a highlight report in a newly created output directory.

    If ``args.ngrams`` is set, a ``tacl.NgramHighlightReport`` is
    generated from the n-grams loaded from each listed file, with one
    label per file; ``args.minus_ngrams``, when set, supplies a further
    collection of n-grams that is passed to the report alongside
    them. Otherwise a ``tacl.ResultsHighlightReport`` is generated from
    ``args.results``.

    The output directory is created only after every argument check
    has passed, so a rejected invocation does not leave behind an empty
    directory that would make the corrected re-run abort with "Output
    directory already exists".

    :param args: parsed command-line arguments; this function reads
        ``output``, ``base_name``, ``ngrams``, ``label``,
        ``minus_ngrams`` and ``results`` (the ``utils`` helpers receive
        the whole object)
    :param parser: object providing ``exit(status=, message=)`` and
        ``error(message)``; presumably an ``argparse.ArgumentParser``

    """
    tokenizer = utils.get_tokenizer(args)
    corpus = utils.get_corpus(args)
    output_dir = os.path.abspath(args.output)
    if os.path.exists(output_dir):
        parser.exit(status=3,
                    message='Output directory already exists, '
                    'aborting.\n')
    # Validate before touching the filesystem: parser.error() terminates
    # the program, and a directory created beforehand would be orphaned.
    if args.ngrams and (
            args.label is None or len(args.label) != len(args.ngrams)):
        parser.error('There must be as many labels as there are files '
                     'of n-grams')
    # exist_ok tolerates the directory appearing between the check above
    # and this call.
    os.makedirs(output_dir, exist_ok=True)
    if args.ngrams:
        report = tacl.NgramHighlightReport(corpus, tokenizer)
        ngrams = [utils.get_ngrams(ngram_file) for ngram_file in args.ngrams]
        minus_ngrams = []
        if args.minus_ngrams:
            minus_ngrams = utils.get_ngrams(args.minus_ngrams)
        report.generate(args.output, args.base_name, ngrams, args.label,
                        minus_ngrams)
    else:
        report = tacl.ResultsHighlightReport(corpus, tokenizer)
        report.generate(args.output, args.base_name, args.results)
Пример #2
0
 def test_prepare_text_cbeta(self):
     # Each token, including the bracketed "[火*因]" group, is expected
     # to come back wrapped in its own <span>. The "<" from the input
     # has no counterpart in the expected output, whereas the blank
     # line and the indentation are carried over unchanged.
     highlighter = tacl.NgramHighlightReport(None, self._tokenizer)
     source = '無[火*因]是<物即\n\n    同如'
     wanted = ''.join([
         '<span>無</span><span>[火*因]</span><span>是</span><span>物</span>',
         '<span>即</span>\n\n    <span>同</span><span>如</span>',
     ])
     prepared = highlighter._prepare_text(source)
     self.assertEqual(prepared, wanted)
Пример #3
0
 def test_prepare_text_pagel(self):
     # Uses a tokenizer built from the Pagel pattern and joiner
     # constants. The expected output wraps each word (apostrophes
     # included) in a <span>, while the spaces, the "|" and the
     # newlines stay outside the spans exactly as in the input.
     pagel_tokenizer = tacl.Tokenizer(
         tacl.constants.TOKENIZER_PATTERN_PAGEL,
         tacl.constants.TOKENIZER_JOINER_PAGEL)
     highlighter = tacl.NgramHighlightReport(None, pagel_tokenizer)
     source = "'dzin dang | snang\n \nba'i"
     wanted = ''.join((
         "<span>'dzin</span> <span>dang</span> | <span>snang</span>\n \n",
         "<span>ba'i</span>",
     ))
     prepared = highlighter._prepare_text(source)
     self.assertEqual(prepared, wanted)
Пример #4
0
 def test_highlight(self):
     # The n-gram 無[火*因]是 matches across the "。" that sits between
     # two spans. The three matching spans are expected to gain
     # class="highlight1"; the "。" itself and every other span must
     # be left as they were.
     # NOTE(review): the third argument to _highlight (True here)
     # presumably selects adding highlighting -- confirm against the
     # method's signature.
     highlighter = tacl.NgramHighlightReport(None, self._tokenizer)
     source = ''.join([
         '<span>火</span><span>無</span><span>[火*因]</span>。',
         '<span>是</span><span>故</span><span>顯</span><span>物</span>',
     ])
     wanted = ''.join([
         '<span>火</span><span class="highlight1">無</span>',
         '<span class="highlight1">[火*因]</span>。',
         '<span class="highlight1">是</span><span>故</span><span>顯</span>',
         '<span>物</span>',
     ])
     highlighted = highlighter._highlight(source, ['無[火*因]是'], True)
     self.assertEqual(highlighted, wanted)
Пример #5
0
 def test_highlight_minus(self):
     # The input holds three 火 spans carrying class="highlight2".
     # Only the one immediately followed by 顯 is part of the minus
     # n-gram 火顯, so only that span is expected to lose its class
     # attribute; all surrounding markup must be untouched.
     # NOTE(review): the third argument to _highlight (False here)
     # presumably selects removing highlighting -- confirm against
     # the method's signature.
     unchanged_head = (
         '<span class="highlight2">火</span><span class="highlight">無</span>'
         '<span class="highlight1">[火*因]</span>。<span class="highlight1">'
         '是</span><span>故</span>')
     unchanged_tail = (
         '<span>顯</span><span>物</span><span class="highlight2">火</span>')
     source = (
         unchanged_head + '<span class="highlight2">火</span>'
         + unchanged_tail)
     wanted = unchanged_head + '<span>火</span>' + unchanged_tail
     highlighter = tacl.NgramHighlightReport(None, self._tokenizer)
     result = highlighter._highlight(source, ['火顯'], False)
     self.assertEqual(result, wanted)