Example #1
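These tests were extracted without their imports or the shared 'morph'
fixture. A plausible preamble, assuming a pymorphy2-backed feature-extraction
library such as morphine (the module paths below are an assumption, not
confirmed by this listing):

import pytest
import pymorphy2

from morphine import features  # assumed import path
from morphine.feature_extractor import FeatureExtractor  # assumed import path


@pytest.fixture(scope='session')
def morph():
    # pymorphy2's analyzer is expensive to build, so share one per session
    return pymorphy2.MorphAnalyzer()
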
def test_pattern(morph):
    fe = FeatureExtractor(
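        # token-level features come first; the second list holds "global"
        # features that can see the whole sequence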
        [features.token_lower],
        [
            features.sentence_start,
            features.sentence_end,
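            # emits the previous token's token_lower as 'token_lower[i-1]'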
            features.Pattern([-1, 'token_lower']),
        ],
    )
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    assert fe.transform_single(sent, parsed) == [
        {'token_lower': 'летят', 'sentence_start': 1.0},
        {'token_lower': 'гуси', 'token_lower[i-1]': 'летят'},
        {'token_lower': 'на', 'token_lower[i-1]': 'гуси'},
        {'token_lower': 'юг', 'sentence_end': 1.0, 'token_lower[i-1]': 'на'},
    ]
Example #2
def test_pattern_callable_complex(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]

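    # Pattern callables may also take the feature dict built so far;
    # the callable's __name__ ('not_title') becomes the feature name.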
    def not_title(token, parses, feature_dict):
        return not feature_dict.get('title', False)

    fe = FeatureExtractor([], [
        features.Pattern(
            [0, lambda token, parses: token.istitle(), 'title'],
        ),
        features.Pattern(
            [-1, 'title'],
            [ 0, 'title'],
        ),
        features.Pattern(
            [-1, not_title],
            [ 0, not_title],
            [+1, not_title],
        )
    ])
    assert fe.transform_single(sent, parsed) == [
        {'title': True},
        {'title': False,
         'title[i-1]/title[i]': 'True/False',
         'not_title[i-1]/not_title[i]/not_title[i+1]': 'False/True/True'},
        {'title': False,
         'title[i-1]/title[i]': 'False/False',
         'not_title[i-1]/not_title[i]/not_title[i+1]': 'True/True/True'},
        {'title': False,
         'title[i-1]/title[i]': 'False/False'},
    ]
Example #3
def test_token_features(morph):
    fe = FeatureExtractor(
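        # bias is a constant feature (always 1) attached to every token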
        [features.bias, features.token_lower],
    )
    tokens = 'Летят гуси'.split()
    parsed = [morph.parse(t) for t in tokens]
    assert fe.transform_single(tokens, parsed) == [
        {'bias': 1, 'token_lower': 'летят'},
        {'bias': 1, 'token_lower': 'гуси'},
    ]
Example #4
def test_pattern_cartesian(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    fe = FeatureExtractor(
        [features.token_lower, features.Grammeme(threshold=0.1)],
        [
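            # Grammeme is dict-valued; the pattern combines the two positions'
            # keys pairwise (a cartesian product such as 'VERB/NOUN')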
            features.Pattern([-1, 'Grammeme'], [0, 'Grammeme']),
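            # Drop removes the raw Grammeme feature from the final dicts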
            features.Drop('Grammeme')
        ],
    )
    xseq = fe.transform_single(sent, parsed)
    assert xseq[0] == {'token_lower': 'летят'}
    assert sorted(xseq[1].keys()) == sorted(
        ['Grammeme[i-1]/Grammeme[i]', 'token_lower'])
    assert xseq[1]['Grammeme[i-1]/Grammeme[i]']['VERB/NOUN'] == 1.0
Example #5
def test_pattern_callable(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    fe = FeatureExtractor([], [
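        # the callable receives (token, parses); the trailing string names
        # the resulting feature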
        features.Pattern(
            [0, lambda token, parses: token.istitle(), 'title'],
        ),
    ])
    assert fe.transform_single(sent, parsed) == [
        {'title': True},
        {'title': False},
        {'title': False},
        {'title': False},
    ]
Example #6
def test_pattern_bigram_with_dict(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    fe = FeatureExtractor(
        [features.token_lower, features.Grammeme(threshold=0.1)],
        [
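            # pairing a dict-valued feature with a string-valued one yields a
            # dict keyed by the string value, mapping to the grammeme weights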
            features.Pattern([-1, 'Grammeme'], [0, 'token_lower']),
            features.Pattern([-1, 'token_lower'], [0, 'Grammeme']),
        ],
    )
    xseq = fe.transform_single(sent, parsed)
    assert sorted(xseq[1].keys()) == sorted([
        'Grammeme',
        'Grammeme[i-1]/token_lower[i]',
        'token_lower',
        'token_lower[i-1]/Grammeme[i]',
    ])
    assert xseq[1]['Grammeme[i-1]/token_lower[i]'] == {
        'гуси': xseq[0]['Grammeme']
    }
    assert xseq[1]['token_lower[i-1]/Grammeme[i]'] == {
        'летят': xseq[1]['Grammeme']
    }
Example #7
def test_pattern_kwargs(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    fe = FeatureExtractor(
        [features.token_lower],
        [
            features.sentence_start,
            features.sentence_end,
            features.Pattern(
                [+1, 'token_lower'],
                [-1, 'sentence_start'],
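                # name= overrides the auto-generated feature name;
                # index_low/index_high extend the pattern to the sequence
                # edges, with '?' standing in for out-of-range positions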
                name='low+1 BOS-1',
                index_low=0,
                index_high=0,
            ),
        ],
    )
    assert fe.transform_single(sent, parsed) == [
        {'token_lower': 'летят', 'sentence_start': 1.0, 'low+1 BOS-1': 'гуси/?'},
        {'token_lower': 'гуси', 'low+1 BOS-1': 'на/1.0'},
        {'token_lower': 'на', 'low+1 BOS-1': 'юг/0.0'},
        {'token_lower': 'юг', 'sentence_end': 1.0, 'low+1 BOS-1': '?/0.0'},
    ]
Example #8
def test_token_and_global_features(morph):
    sent = 'Летят гуси на юг'.split()
    parsed = [morph.parse(t) for t in sent]
    fe = FeatureExtractor(
        [features.token_lower],
        [features.sentence_start, features.sentence_end],
    )
    assert fe.transform_single(sent, parsed) == [
        {'token_lower': 'летят', 'sentence_start': 1.0},
        {'token_lower': 'гуси'},
        {'token_lower': 'на'},
        {'token_lower': 'юг', 'sentence_end': 1.0},
    ]

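    # transform() and fit_transform() take an iterable of (tokens, parses)
    # pairs and return one list of feature dicts per pair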
    assert [fe.transform_single(sent, parsed)] == fe.transform(
        zip([sent], [parsed]))
    assert [fe.transform_single(sent, parsed)] == fe.fit_transform(
        zip([sent], [parsed]))

    sent = 'юг'.split()
    parsed = [morph.parse(t) for t in sent]
    assert fe.transform_single(sent, parsed) == [
        {'token_lower': 'юг', 'sentence_start': 1.0, 'sentence_end': 1.0},
    ]