Пример #1
0
def test():
    """Check LinearTagger's tags for the words 'чёрное', 'зеркало'.

    Word 0 is expected to be a full-form, positive-degree adjective and
    word 1 an inanimate noun; both are nominative, neuter and singular.
    The comparison itself is delegated to ``assert_tags_equal``.
    """
    # The tagger is constructed before the expected tags.
    tagger = LinearTagger()

    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        gender=Gender.NEUTER,
        number=Number.SINGULAR,
        variant=Variant.FULL,
    )
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.INANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.NEUTER,
        number=Number.SINGULAR,
    )

    # Pair every expected tag with the position of its word.
    expected = list(enumerate((adjective, noun)))
    words = ['чёрное', 'зеркало']

    assert_tags_equal(tagger=tagger, expected=expected, words=words)
Пример #2
0
def test():
    """Check CRFTagger's tags for the words 'настоящий', 'детектив'.

    Word 0 is expected to be a full-form, positive-degree adjective and
    word 1 an animate noun; both are nominative, masculine and singular.
    The comparison itself is delegated to ``assert_tags_equal``.
    """
    # The tagger is constructed before the expected tags.
    tagger = CRFTagger()

    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        gender=Gender.MASCULINE,
        number=Number.SINGULAR,
        variant=Variant.FULL,
    )
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.ANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.MASCULINE,
        number=Number.SINGULAR,
    )

    # Pair every expected tag with the position of its word.
    expected = list(enumerate((adjective, noun)))
    words = ['настоящий', 'детектив']

    assert_tags_equal(tagger=tagger, expected=expected, words=words)
Пример #3
0
def test():
    """Check RNNTagger's tags for the words 'необычные', 'дела'.

    Word 0 is expected to be a full-form, positive-degree, nominative
    plural adjective (no gender is set on its tag) and word 1 an
    inanimate, nominative, neuter, plural noun. The comparison itself is
    delegated to ``assert_tags_equal``.
    """
    # The tagger is constructed before the expected tags.
    tagger = RNNTagger()

    adjective = Tag(
        pos=PartOfSpeech.ADJECTIVE,
        case=Case.NOMINATIVE,
        degree=Degree.POSITIVE,
        number=Number.PLURAL,
        variant=Variant.FULL,
    )
    noun = Tag(
        pos=PartOfSpeech.NOUN,
        animacy=Animacy.INANIMATE,
        case=Case.NOMINATIVE,
        gender=Gender.NEUTER,
        number=Number.PLURAL,
    )

    # Pair every expected tag with the position of its word.
    expected = list(enumerate((adjective, noun)))
    words = ['необычные', 'дела']

    assert_tags_equal(tagger=tagger, expected=expected, words=words)
Пример #4
0
def test_tag():
    """Analyze ['hello'] with a ConstantTagger configured for 'hello'.

    The analysis is expected to consist of a single Morph whose word and
    lemma are both 'hello' and whose tag is the tagger's NOUN tag.
    """
    noun = Tag(pos=PartOfSpeech.NOUN)

    # Expected result first, then the taggers under test.
    expected = [Morph(word='hello', lemma='hello', tag=noun)]
    taggers = [ConstantTagger(word='hello', tag=noun)]

    _assert_analyzed_equal(expected=expected, taggers=taggers, text=['hello'])
Пример #5
0
def test_tag_partially():
    """Analyze ['hello', 'world'] with a ConstantTagger set up for 'world'.

    'hello' is expected to come back with the ``_UNKNOWN`` tag, while
    'world' is expected to carry the tagger's ADJECTIVE tag; each lemma
    equals its word.
    """
    adjective = Tag(pos=PartOfSpeech.ADJECTIVE)

    # Expected result first, then the taggers under test.
    hello = Morph(word='hello', lemma='hello', tag=_UNKNOWN)
    world = Morph(word='world', lemma='world', tag=adjective)
    taggers = [ConstantTagger(word='world', tag=adjective)]

    _assert_analyzed_equal(
        expected=[hello, world],
        taggers=taggers,
        text=['hello', 'world'],
    )
Пример #6
0
def get_tag(parse: pymorphy2.analyzer.Parse) -> Tag:
    """Build a ``Tag`` from a pymorphy2 parse.

    Every field of the tag is obtained by passing ``parse`` to the
    corresponding ``get_*`` helper.

    Args:
        parse: The pymorphy2 parse to convert.

    Returns:
        A ``Tag`` populated from the helpers' results.
    """
    # Helpers are invoked in the order the fields are listed.
    grammemes = {
        'pos': get_part_of_speech(parse),
        'animacy': get_animacy(parse),
        'aspect': get_aspect(parse),
        'case': get_case(parse),
        'degree': get_degree(parse),
        'gender': get_gender(parse),
        'mood': get_mood(parse),
        'number': get_number(parse),
        'person': get_person(parse),
        'tense': get_tense(parse),
        'verbform': get_verbform(parse),
        'voice': get_voice(parse),
    }
    return Tag(**grammemes)
Пример #7
0
import re
from typing import Iterator

from maru.grammeme import PartOfSpeech
from maru.grammeme.numform import NumericalForm
from maru.tag import Tag
from maru.tagger.abstract import ITagger, Tagged
from maru.types import Text, Indices

# Token pattern with one named group per numerical form: a real number
# (digits, '.' or ',', digits) or an integer (digits only). Both alternatives
# must reach the end of the token.
# Raw f-strings are required so that ``\d`` reaches the regex engine as-is
# instead of being treated as an (invalid) string escape sequence.
_REGEX = re.compile(rf'(?P<{NumericalForm.REAL}>\d+[.,]\d+$)|'
                    rf'(?P<{NumericalForm.INTEGER}>\d+$)')

# Tags shared by every token of the corresponding numerical form.
_INTEGER = Tag(pos=PartOfSpeech.NUMERICAL, numform=NumericalForm.INTEGER)
_REAL = Tag(pos=PartOfSpeech.NUMERICAL, numform=NumericalForm.REAL)


class NumericalTagger(ITagger):
    """Tagger that yields numerical tags for tokens matched by ``_REGEX``."""

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for every given index whose token matches.

        Tokens that do not match ``_REGEX`` produce nothing. A match through
        the REAL group yields ``_REAL``; any other match yields ``_INTEGER``.
        """
        for position in indices:
            found = _REGEX.match(text[position])
            if found is None:
                continue

            if found.lastgroup == NumericalForm.REAL:
                yield position, _REAL
            else:
                yield position, _INTEGER
Пример #8
0
import pytest

from maru.grammeme import PartOfSpeech
from maru.tag import Tag
from maru.tagger.punctuation import PunctuationTagger
from tests.tagger.base import TaggerTest

# Tag with the PUNCTUATION part of speech; used as the expected tag in the
# parametrized cases of this module.
_PUNCTUATION = Tag(pos=PartOfSpeech.PUNCTUATION)


@pytest.fixture(scope='session', name='tagger')
def create_tagger():
    """Session-scoped ``tagger`` fixture providing a PunctuationTagger."""
    punctuation_tagger = PunctuationTagger()
    return punctuation_tagger


@pytest.mark.parametrize(
    'test',
    [
        TaggerTest(
            words=['!', '@', '.....,'],
            tags=[(0, _PUNCTUATION), (1, _PUNCTUATION), (2, _PUNCTUATION)],
        ),
        TaggerTest(
            words=['?!', '"', ':', ';'],
            tags=[
                (0, _PUNCTUATION),
                (1, _PUNCTUATION),
                (2, _PUNCTUATION),
                (3, _PUNCTUATION),
            ],
        ),
Пример #9
0
def test():
    """PymorphyLemmatizer must lemmatize 'мыло' to 'мыть' given a VERB tag."""
    lemmatizer = PymorphyLemmatizer()
    verb = Tag(pos=PartOfSpeech.VERB)

    lemma = lemmatizer.lemmatize('мыло', verb)

    assert lemma == 'мыть'
Пример #10
0
@pytest.fixture(scope='session', name='tagger')
def create_tagger():
    """Session-scoped ``tagger`` fixture providing an RNNTagger."""
    rnn_tagger = RNNTagger()
    return rnn_tagger


@pytest.mark.parametrize(
    'test',
    [
        TaggerTest(
            words=['необычные', 'дела'],
            tags=[
                (
                    0,
                    Tag(
                        pos=PartOfSpeech.ADJECTIVE,
                        case=Case.NOMINATIVE,
                        degree=Degree.POSITIVE,
                        number=Number.PLURAL,
                        variant=Variant.FULL,
                    ),
                ),
                (
                    1,
                    Tag(
                        pos=PartOfSpeech.NOUN,
                        animacy=Animacy.INANIMATE,
                        case=Case.NOMINATIVE,
                        gender=Gender.NEUTER,
                        number=Number.PLURAL,
                    ),
Пример #11
0
from typing import Sequence

from maru.grammeme import PartOfSpeech
from maru.lemmatizer import DummyLemmatizer
from maru.morph import Morph
from maru.analyzer import Analyzer
from maru.tag import Tag
from maru.tagger import ITagger
from maru.types import Text

from tests.stubs.tagger import ConstantTagger

# Tag with the UNKNOWN part of speech; used as the expected tag in the tests
# of this module.
_UNKNOWN = Tag(pos=PartOfSpeech.UNKNOWN)


def _assert_analyzed_equal(
        expected: Sequence[Morph],
        taggers: Sequence[ITagger],
        text: Text,
):
    """Assert that analyzing ``text`` with ``taggers`` yields ``expected``.

    An ``Analyzer`` is built from the given taggers and a
    ``DummyLemmatizer``; its output is materialized into a list and compared
    with ``expected``.
    """
    lemmatizer = DummyLemmatizer()
    actual = list(Analyzer(taggers, lemmatizer=lemmatizer).analyze(text))

    assert expected == actual


def test_unknown():
    _assert_analyzed_equal(
        expected=[
            Morph(
                word='hello',
                lemma='hello',
                tag=_UNKNOWN,
Пример #12
0
import re
from typing import Iterator

from maru.grammeme import PartOfSpeech
from maru.grammeme.numform import NumericalForm
from maru.tag import Tag
from maru.tagger.abstract import ITagger, Tagged
from maru.types import Indices, Text

# One named alternative per numerical form; the name of the group that
# matched identifies the form of the token.
# NOTE(review): unlike REAL and INTEGER, the RANGE alternative has no
# trailing ``$`` - confirm whether trailing characters are meant to be allowed.
_REGEX = re.compile(
    rf'(?P<{NumericalForm.REAL}>\d+[.,/]\d+$)|'
    rf'(?P<{NumericalForm.INTEGER}>\d+$)|'
    rf'(?P<{NumericalForm.RANGE}>\d+[‑–—−-]\d+)'
)
# Numerical tag for each supported form, keyed by the form itself.
_TAGS = {
    form: Tag(pos=PartOfSpeech.NUMERICAL, numform=form)
    for form in (
        NumericalForm.REAL,
        NumericalForm.INTEGER,
        NumericalForm.RANGE,
    )
}


class NumericalTagger(ITagger):
    """Tagger that yields numerical tags for tokens matched by ``_REGEX``."""

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for every given index whose token matches.

        Tokens that do not match ``_REGEX`` produce nothing. For a match, the
        name of the last matched group is converted to a ``NumericalForm``
        and the tag is looked up in ``_TAGS``.
        """
        for position in indices:
            found = _REGEX.match(text[position])
            if found is None:
                continue

            numform = NumericalForm(found.lastgroup)
            yield position, _TAGS[numform]