예제 #1
0
def test_filterpunc():
    """filterpunc drops punctuation-only tokens, keeping word tokens in order."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello\nWorld\n!\nI\n.\nnot')
        outcome = cli.invoke(filterpunc, [in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', 'I', 'not'])
예제 #2
0
def test_words2ngrams():
    """words2ngrams with -n 3 emits every consecutive trigram of the tokens."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello\nWorld\n!\nI\nlove\ngo\n.')
        outcome = cli.invoke(words2ngrams, ['-n', 3, in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello World !', 'World ! I', '! I love', 'I love go'])
예제 #3
0
def test_sentences():
    """text2sentences splits running text into one sentence per line."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello World! I love go.')
        outcome = cli.invoke(text2sentences, [in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello World!', 'I love go.'])
예제 #4
0
def test_text2words():
    """text2words tokenizes text into one word (or punctuation) per line."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello World!\nI.\nnot sure where to go')
        outcome = cli.invoke(text2words, [in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', '!', 'I.', 'not', 'sure',
                         'where', 'to', 'go'])
예제 #5
0
def test_punc():
    """text2punc keeps only the punctuation tokens from the input."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello\nWorld\n!\nI\nlove,\ngo\n.')
        outcome = cli.invoke(text2punc, [in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'), ['!', ',', '.'])
예제 #6
0
def test_filterpunc():
    """filterpunc removes punctuation-only tokens and preserves word order."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file, 'Hello\nWorld\n!\nI\n.\nnot')
        outcome = cli.invoke(filterpunc, [source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', 'I', 'not'])
예제 #7
0
def test_sentences():
    """text2sentences emits each detected sentence on its own line."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file, 'Hello World! I love go.')
        outcome = cli.invoke(text2sentences, [source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello World!', 'I love go.'])
예제 #8
0
def test_punc():
    """text2punc filters the token stream down to punctuation only."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file, 'Hello\nWorld\n!\nI\nlove,\ngo\n.')
        outcome = cli.invoke(text2punc, [source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'), ['!', ',', '.'])
예제 #9
0
def test_uppercase():
    """tokens2upper upper-cases every token; punctuation passes through."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello\nWorld\n!\nI\n.\nnoooo\n')
        outcome = cli.invoke(tokens2upper, [in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['HELLO', 'WORLD', '!', 'I', '.', 'NOOOO'])
예제 #10
0
def test_punc_multifile():
    """text2punc over several files yields punctuation from each file in order."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_paths = ['in.txt', 'in2.txt']
        create_multifile_output(in_paths,
                                ['Hello\nWorld\n!\nI\nlove,\ngo\n.',
                                 'Goodbye World!\n I...\n know everything\'s about you?'])
        outcome = cli.invoke(text2punc, in_paths)
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['!', ',', '.', '!', '...', "'", '?'])
예제 #11
0
def test_filterwords():
    """filterwords --language english removes English stopwords from the stream."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.')
        outcome = cli.invoke(filterwords, ['--language', 'english', in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', '!', 'crook', '.'])
예제 #12
0
def test_uppercase():
    """tokens2upper converts each token to upper case."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file, 'Hello\nWorld\n!\nI\n.\nnoooo\n')
        outcome = cli.invoke(tokens2upper, [source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['HELLO', 'WORLD', '!', 'I', '.', 'NOOOO'])
예제 #13
0
def test_lowercase():
    """lowercase converts each token to lower case; punctuation is unchanged."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello\nWorld\n!\nI\n.\nnoooo\n')
        outcome = cli.invoke(lowercase, [in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['hello', 'world', '!', 'i', '.', 'noooo'])
예제 #14
0
def test_text2words():
    """text2words splits text into word and punctuation tokens, one per line."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file, 'Hello World!\nI.\nnot sure where to go')
        outcome = cli.invoke(text2words, [source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', '!', 'I.', 'not', 'sure',
                         'where', 'to', 'go'])
예제 #15
0
def test_filterwords():
    """filterwords with the english stopword list drops common words."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file, 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.')
        outcome = cli.invoke(filterwords,
                             ['--language', 'english', source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', '!', 'crook', '.'])
예제 #16
0
def test_nonewlines():
    """nonewlines joins all tokens into one space-separated output line."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello\nWorld\n!\nI\nam\nin.\n')
        outcome = cli.invoke(nonewlines, [in_path])
        lines = outcome.output.split('\n')
        assert outcome.exit_code == 0
        # One content line plus the empty string after the trailing newline.
        assert len(lines) == 2
        compare_results(lines, ['Hello World ! I am in.'])
예제 #17
0
def test_nonewlines_multifile():
    """nonewlines over multiple files collapses everything onto one line."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_paths = ['in.txt', 'in2.txt']
        create_multifile_output(in_paths,
                                ['Hello\nWorld\n!\nI\nam\nin.',
                                 'What are you\na creature\nof mystery'])
        outcome = cli.invoke(nonewlines, in_paths)
        lines = outcome.output.split('\n')
        assert outcome.exit_code == 0
        # One content line plus the empty string after the trailing newline.
        assert len(lines) == 2
        compare_results(lines,
                        ['Hello World ! I am in. What are you a creature of mystery'])
예제 #18
0
def test_nonewlines():
    """nonewlines merges the token stream into a single line of text."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file, 'Hello\nWorld\n!\nI\nam\nin.\n')
        outcome = cli.invoke(nonewlines, [source_file])
        lines = outcome.output.split('\n')
        assert outcome.exit_code == 0
        # Exactly one content line plus the trailing empty string.
        assert len(lines) == 2
        compare_results(lines, ['Hello World ! I am in.'])
예제 #19
0
def test_words2ngrams():
    """words2ngrams -n 3 produces all sliding-window trigrams."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file, 'Hello\nWorld\n!\nI\nlove\ngo\n.')
        outcome = cli.invoke(words2ngrams, ['-n', 3, source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello World !', 'World ! I', '! I love', 'I love go'])
예제 #20
0
def test_pos_tokens():
    """tokens2pos emits token,POS-tag pairs, one per input token."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path,
                             'Hello\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou')
        outcome = cli.invoke(tokens2pos, [in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello,NNP', 'world,NN', '!,.', 'I,PRP', 'love,VBP',
                         'this,DT', 'world,NN', 'and,CC', 'love,VB', 'you,PRP'])
예제 #21
0
def test_nonewlines_multifile():
    """nonewlines concatenates tokens from all input files onto one line."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_files = ['in.txt', 'in2.txt']
        create_multifile_output(source_files,
                                ['Hello\nWorld\n!\nI\nam\nin.',
                                 'What are you\na creature\nof mystery'])
        outcome = cli.invoke(nonewlines, source_files)
        lines = outcome.output.split('\n')
        assert outcome.exit_code == 0
        # One content line plus the empty string after the trailing newline.
        assert len(lines) == 2
        compare_results(lines,
                        ['Hello World ! I am in. What are you a creature of mystery'])
예제 #22
0
def test_punc_multifile():
    """text2punc over two files concatenates the punctuation of each."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_files = ['in.txt', 'in2.txt']
        create_multifile_output(source_files,
                                ['Hello\nWorld\n!\nI\nlove,\ngo\n.',
                                 'Goodbye World!\n I...\n know everything\'s about you?'])
        outcome = cli.invoke(text2punc, source_files)
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['!', ',', '.', '!', '...', "'", '?'])
예제 #23
0
def test_text2words_multifile():
    """text2words over two files yields each file's word tokens in order."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_paths = ['in.txt', 'in2.txt']
        create_multifile_output(in_paths,
                                ('Hello World!\nI.\nnot sure where to go',
                                 'Goodbye World!\n I.\n know everything about you'))
        outcome = cli.invoke(text2words, in_paths)
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', '!', 'I.', 'not', 'sure', 'where',
                         'to', 'go', 'Goodbye', 'World', '!', 'I.', 'know',
                         'everything', 'about', 'you'])
예제 #24
0
def test_count_tokens():
    """tokens2counts emits token,count pairs; sorted to compare order-free."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path,
                             'Hello,\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou')
        expected = sorted(['love,2', 'world,2', 'and,1', 'I,1', 'you,1',
                           'this,1', '\"Hello,\",1', '!,1', ''])
        outcome = cli.invoke(tokens2counts, [in_path])
        actual = sorted(outcome.output.split('\n'))
        assert outcome.exit_code == 0
        compare_results(actual, expected)
예제 #25
0
def test_pos_tokens():
    """tokens2pos tags each token, outputting token,tag per line."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file,
                             'Hello\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou')
        outcome = cli.invoke(tokens2pos, [source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello,NNP', 'world,NN', '!,.', 'I,PRP', 'love,VBP',
                         'this,DT', 'world,NN', 'and,CC', 'love,VB', 'you,PRP'])
예제 #26
0
def test_count_tokens():
    """tokens2counts counts token frequencies; both sides sorted before compare."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file,
                             'Hello,\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou')
        expected = sorted(['love,2', 'world,2', 'and,1', 'I,1', 'you,1',
                           'this,1', '\"Hello,\",1', '!,1', ''])
        outcome = cli.invoke(tokens2counts, [source_file])
        actual = sorted(outcome.output.split('\n'))
        assert outcome.exit_code == 0
        compare_results(actual, expected)
예제 #27
0
def test_text2words_multifile():
    """text2words concatenates word tokens from every input file in order."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_files = ['in.txt', 'in2.txt']
        create_multifile_output(source_files,
                                ('Hello World!\nI.\nnot sure where to go',
                                 'Goodbye World!\n I.\n know everything about you'))
        outcome = cli.invoke(text2words, source_files)
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', '!', 'I.', 'not', 'sure', 'where',
                         'to', 'go', 'Goodbye', 'World', '!', 'I.', 'know',
                         'everything', 'about', 'you'])
예제 #28
0
def test_filterwords_custom():
    """filterwords --custom filters tokens listed in a user stopword file.

    Expected output drops 'Hello' even though the stopword is 'hello' —
    the filter apparently matches case-insensitively (per the fixture data).
    """
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        stopword_path = 'custom.txt'
        create_single_output(in_path, 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.')
        create_single_output(stopword_path, 'hello\n')
        outcome = cli.invoke(filterwords, ['--custom', 'custom.txt', in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['World', '!', 'crook', '.'])
예제 #29
0
def test_filterwords_custom():
    """filterwords --custom removes tokens from the given stopword file.

    Note: 'hello' in the stopword file removes 'Hello' from the output,
    so matching appears case-insensitive (per the fixture data).
    """
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        stopword_file = 'custom.txt'
        create_single_output(source_file, 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.')
        create_single_output(stopword_file, 'hello\n')
        outcome = cli.invoke(filterwords,
                             ['--custom', 'custom.txt', source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['World', '!', 'crook', '.'])
예제 #30
0
def test_filterlengths():
    """filterlengths drops tokens shorter than the minimum length."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        in_path = 'in.txt'
        create_single_output(in_path, 'Hello\nWorld\n!\nI\n.\nnot\nwin\n')

        # Default minimum (3): only 3+-character tokens survive.
        outcome = cli.invoke(filterlengths, [in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', 'not', 'win'])

        # Explicit minimum length of 4.
        outcome = cli.invoke(filterlengths, ['-m', '4', in_path])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World'])
예제 #31
0
def test_filterlengths():
    """filterlengths removes tokens below the minimum-length threshold."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        source_file = 'in.txt'
        create_single_output(source_file, 'Hello\nWorld\n!\nI\n.\nnot\nwin\n')

        # Default threshold (3 characters).
        outcome = cli.invoke(filterlengths, [source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', 'not', 'win'])

        # Raised threshold via -m 4.
        outcome = cli.invoke(filterlengths, ['-m', '4', source_file])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World'])
예제 #32
0
def test_annotate_settings():
    """Annotate an HTML snippet and check settings/rules are honored.

    Two candidate links are supplied ('Paris' and 'Paris Hilton'), but the
    expected RESULT contains only one applied anchor: the final 'Paris'
    inside the last paragraph. The filter_by_attribute rule (type ==
    'person') together with the anchor pattern appears to drive which
    candidates are materialized — NOTE(review): exact filtering semantics
    come from the anchorman-style `annotate` API; confirm against its docs.
    """

    # Input document: three paragraphs mentioning 'Paris Hilton' and 'Paris'.
    TEXT = """<div><p>Paris Hilton wasn't going to let a bit of snowfall ruin her trip to New York City.</p><p>The Stars Are Blind singer, who was born in the Big Apple, was snapped Friday boarding a vehicle amid snowy conditions to check out the on-goings as fashion fever overtakes the city that never sleeps for New York Fashion Week.</p><p>The 35-year-old socialite matched well with a black and red leather jacket in Paris.</p></div>"""

    # Expected output: identical text, with only the final 'Paris' wrapped
    # in the anchor markup configured below.
    RESULT = """<div><p>Paris Hilton wasn\'t going to let a bit of snowfall ruin her trip to New York City.</p><p>The Stars Are Blind singer, who was born in the Big Apple, was snapped Friday boarding a vehicle amid snowy conditions to check out the on-goings as fashion fever overtakes the city that never sleeps for New York Fashion Week.</p><p>The 35-year-old socialite matched well with a black and red leather jacket in <a class="anchorman" lemma="Paris" type="location">Paris</a>.</p></div>"""

    # Candidate annotations: each entry maps surface form -> attributes
    # that feed the {lemma}/{type} slots of the anchor pattern.
    links = [{
        u'Paris': {
            'lemma': u'Paris',
            'type': 'location'
        }
    }, {
        u'Paris Hilton': {
            'lemma': u'Paris Hilton',
            'type': 'person'
        }
    }]

    cfg = get_config()

    # Markup template used for every applied link.
    cfg['markup'] = {
        'anchor_pattern':
        '<a class="anchorman" lemma="{lemma}" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor'
    }

    rules = {
        'return_applied_links': True,
        # apply high score candidates first
        'sort_by_item_value': {
            'key': 'score',
            'default': 0
        },
        # Alternative rule variants kept for reference (not active):
        # 'replaces_per_element': {
        #     'number': 1,
        #     'key': 'lemma'
        # },
        # 'replaces_at_all': 5, #self.max_links,
        # not available 'longest_match_first': False,
        # 'replaces': {
        #     'by_attribute': {
        #         'key': 'type',
        #         # 'value_per_unit': 1
        #         'value_overall': 2 #self.max_per_etype
        #     }
        # },
        'items_per_unit': 4,  #self.links_per_paragraph,
        'filter_by_attribute': {
            'attributes': [{
                'key': 'type',
                'value': 'person'
            }]
        }
    }

    settings = {
        # "log_level": "DEBUG",
        "return_applied_links": True,
        # "forbidden_areas": {
        #     "tags": ["img", "a"],
        # "classes": ["first", "p--heading-3"]
        # }
    }

    # Merge test-specific settings/rules into the base configuration.
    cfg['settings'].update(settings)
    cfg['rules'].update(rules)

    annotated, applied, rest = annotate(TEXT, links, config=cfg)

    from tests.utils import compare_results

    # Collapse runs of spaces in the expectation only — presumably the
    # annotator normalizes whitespace; TODO confirm 'annotated' is
    # normalized the same way, otherwise this is asymmetric.
    RESULT = re.sub(" +", " ", RESULT)
    compare_results(annotated, RESULT)
    assert annotated == RESULT