def test_type_errors():
    """Reading ``match.fact`` must raise ``TypeError`` for both grammars.

    Each grammar interprets the token ``'1'`` with ``custom(int)`` and then
    wraps the enclosing rule in a second interpretation: the fact attribute
    ``F.a`` in the first case, ``custom(str)`` in the second.
    """
    def expect_type_error(outer):
        # Build the grammar around the given outer interpretation and make
        # sure that resolving the fact fails with TypeError.
        inner = eq('1').interpretation(custom(int))
        grammar = rule('a', inner).interpretation(outer)
        match = Parser(grammar).match('a 1')
        with pytest.raises(TypeError):
            match.fact

    F = fact('F', ['a'])
    expect_type_error(F.a)

    F = fact('F', ['a'])
    expect_type_error(custom(str))
Exemplo n.º 2
0
def get_rules():
    """Build the grammar used to recognise a city mention.

    Returns:
        An ``or_`` rule with a single alternative that itself accepts either
        an optional city-word pipeline followed by a ``GEO`` token, or the
        three-token sequence ``санкт`` ``-`` ``петербург``.
    """
    # A token matching gram('Geox') that is not one of the word forms
    # excluded explicitly below.
    GEO = rule(
        and_(
            gram('Geox'),
            not_(
                or_(
                    eq('артема'),
                    eq('фармана'),
                    eq('оскол'),
                    eq('мунарева'),
                )),
        ))

    # NOTE(review): 'Нижний' and 'новгород' are separate pipeline entries
    # next to 'город' - confirm this is intended.
    CITY = morph_pipeline(['город', 'Нижний', 'новгород'])

    CITY_PITER = rule(eq('санкт'), eq('-'), eq('петербург'))

    COMPLICATED_CITY = or_(rule(CITY.optional(), GEO), CITY_PITER)

    # Kept as a one-alternative or_ so the returned object is the same as
    # before the unused definitions were removed.
    FINAL_CITY = or_(COMPLICATED_CITY)
    return FINAL_CITY
Exemplo n.º 3
0
def test_type_errors():
    """Check that incompatible interpretation chains fail with ``TypeError``.

    The error is expected when ``match.fact`` is read, for a rule whose
    ``'1'`` token is interpreted by ``custom(int)`` and whose outer
    interpretation is either ``F.a`` or ``custom(str)``.
    """
    # Case 1: outer interpretation is a fact attribute.
    F = fact('F', ['a'])
    digit = eq('1').interpretation(custom(int))
    attribute_rule = rule('a', digit).interpretation(F.a)
    attribute_match = Parser(attribute_rule).match('a 1')
    with pytest.raises(TypeError):
        attribute_match.fact

    # Case 2: outer interpretation is another custom normalizer.
    F = fact('F', ['a'])
    digit = eq('1').interpretation(custom(int))
    custom_rule = rule('a', digit).interpretation(custom(str))
    custom_match = Parser(custom_rule).match('a 1')
    with pytest.raises(TypeError):
        custom_match.fact
Exemplo n.º 4
0
def test_repeatable():
    """A repeatable attribute collects every interpreted token into a list."""
    F = fact('F', [attribute('a').repeatable()])
    first = eq('a').interpretation(F.a)
    second = eq('b').interpretation(F.a)
    grammar = rule(first, second).interpretation(F)
    record = Parser(grammar).match('a b').fact
    assert record == F(a=['a', 'b'])
    assert record.spans == [(0, 1), (2, 3)]
    assert record.as_json == {'a': ['a', 'b']}
Exemplo n.º 5
0
def test_merge_facts():
    """Two sub-rules interpreted as ``F`` are merged into one ``F`` record."""
    F = fact('F', ['a', 'b'])
    left = rule(eq('a').interpretation(F.a)).interpretation(F)
    right = rule(eq('b').interpretation(F.b)).interpretation(F)
    grammar = rule(left, right).interpretation(F)
    record = Parser(grammar).match('a b').fact
    assert record == F(a='a', b='b')
    assert record.spans == [(0, 1), (2, 3)]
    assert record.as_json == {'a': 'a', 'b': 'b'}
Exemplo n.º 6
0
def get_rules():
    """Build the grammar used to recognise a street mention.

    Returns:
        An ``or_`` rule combining the multi-token street patterns
        (``COMPLICATED_STREETS``) with the simpler ones (``SIMPLE_STREETS``).
    """
    INT = type('INT')
    NOUN = gram('NOUN')
    ADJF = gram('ADJF')
    GEO = gram('Geox')
    PREP = gram('PREP')
    CONJ = gram('CONJ')

    # A Name token that is neither a preposition nor a Geox token.
    NAME = rule(and_(gram('Name'), not_(PREP), not_(GEO)))

    NOUN_NOT_CONJ = rule(and_(NOUN, not_(CONJ)))

    STREET_SUFFIXS = morph_pipeline([
        'улица', 'тракт', 'бульвар', 'проспект', 'микрорайон', 'проезд',
        'шоссе', 'парк'
    ])

    # Suffix words that are matched after an adjective (see
    # SPECIAL_SIMPLE_STREETS_WITH_STREET_SUFFIX below).
    SPECIAL_STREET_SUFFIXS = morph_pipeline(['шоссе', 'тракт'])

    # Street names matched directly, without any suffix word.
    # NOTE(review): 'зеленые аллеи' appears twice in this list.
    SIMPLE_STREETS_FROM_ARRAY = morph_pipeline([
        'краснопресненская', 'республике', 'маршала захарова', 'доватора',
        'мичурина', 'зеленые аллеи', 'бехтеева', 'октябрьская',
        'новогиреевская', 'югорская', 'артема', 'парковая', 'зеленые аллеи',
        'алтуфьевское', 'горького', 'Кавказский', 'хамовнический вал',
        'Кусковская', 'марьинский парк', 'московская', 'береговая',
        'антонова овсиенко', 'школьная', 'юнтоловский', 'гагарина'
    ])

    # Any token except 'дом', 'квартира', an INT token or a conjunction.
    NOUN_NOT_APPART = rule(not_(or_(eq('дом'), eq('квартира'), INT, CONJ)))

    COMPLICATED_STREETS = or_(
        rule(STREET_SUFFIXS, INT, NOUN, NOUN),
        rule(STREET_SUFFIXS, INT, ADJF, NOUN),
        rule(STREET_SUFFIXS, NOUN_NOT_CONJ, NOUN_NOT_APPART, NAME.optional()),
        rule(NAME, NOUN_NOT_APPART),
        rule(ADJF, NAME),
        rule(STREET_SUFFIXS, ADJF, NOUN_NOT_APPART),
        rule(STREET_SUFFIXS, CONJ, NOUN, NOUN),
    )

    SIMPLE_STREETS_WITH_STREET_SUFFIX = rule(STREET_SUFFIXS, NOUN_NOT_APPART)
    SPECIAL_SIMPLE_STREETS_WITH_STREET_SUFFIX = rule(ADJF,
                                                     SPECIAL_STREET_SUFFIXS)

    SIMPLE_STREETS = or_(SPECIAL_SIMPLE_STREETS_WITH_STREET_SUFFIX,
                         SIMPLE_STREETS_WITH_STREET_SUFFIX,
                         SIMPLE_STREETS_FROM_ARRAY)

    FINAL_STREET = or_(COMPLICATED_STREETS, SIMPLE_STREETS)

    return FINAL_STREET
def test_repeatable():
    """Two tokens mapped to one repeatable attribute are stored as a list."""
    repeatable_a = attribute('a').repeatable()
    F = fact('F', [repeatable_a])
    parts = [eq(value).interpretation(F.a) for value in ('a', 'b')]
    parser = Parser(rule(*parts).interpretation(F))
    record = parser.match('a b').fact
    assert record == F(a=['a', 'b'])
    assert record.spans == [(0, 1), (2, 3)]
    assert record.as_json == {'a': ['a', 'b']}
Exemplo n.º 8
0
def req_preposition(preposition: str = None):
    """Build a predicate constraining which preposition may be matched.

    Args:
        preposition: text of the required preposition. ``None`` (the
            default) and the literal string ``"None"`` both mean that no
            preposition is required.

    Returns:
        ``y.empty()`` when no preposition is required. Otherwise a predicate
        accepting either a token that matches ``gram("PREP")`` and equals
        ``preposition``, or any token that does not match ``gram("PREP")``.
    """
    # The default value is the None object, so it has to be treated the same
    # way as the "None" string; previously a default call fell through and
    # built yp.eq(None).
    if preposition is None or preposition == "None":
        return y.empty()
    return y.or_(
        y.and_(yp.gram("PREP"), yp.eq(preposition)),
        y.not_(yp.gram("PREP")),
    )
def test_merge_facts():
    """Facts produced by two ``F``-interpreted sub-rules merge into one."""
    F = fact('F', ['a', 'b'])

    def single(value, target):
        # One-token rule that fills ``target`` and is itself read as F.
        return rule(eq(value).interpretation(target)).interpretation(F)

    A = single('a', F.a)
    B = single('b', F.b)
    combined = rule(A, B).interpretation(F)
    record = Parser(combined).match('a b').fact
    assert record == F(a='a', b='b')
    assert record.spans == [(0, 1), (2, 3)]
    assert record.as_json == {'a': 'a', 'b': 'b'}
Exemplo n.º 10
0
def test_predicate_attribute():
    """A predicate interpreted as ``F.a`` fills attribute ``a`` of ``F``."""
    F = fact('F', ['a'])
    token = eq('a').interpretation(F.a)
    grammar = rule(token).interpretation(F)
    record = Parser(grammar).match('a').fact
    assert record == F(a='a')
    assert record.spans == [(0, 1)]
    assert record.as_json == {'a': 'a'}
Exemplo n.º 11
0
def test_insted_attributes():
    """An attribute wrapped by another attribute ends up in the outer one.

    The token is interpreted as ``F.a`` and the enclosing rule as ``F.b``;
    the resulting record has ``b='a'`` and ``a=None``.
    """
    F = fact('F', ['a', 'b'])
    inner = rule(eq('a').interpretation(F.a))
    grammar = inner.interpretation(F.b).interpretation(F)
    record = Parser(grammar).match('a').fact
    assert record == F(a=None, b='a')
    assert record.spans == [(0, 1)]
    assert record.as_json == {'b': 'a'}
Exemplo n.º 12
0
def get_rules():
    """Build the grammar used to recognise a house / building number.

    Returns:
        An ``or_`` rule with a single alternative: one or more ``UNIT``
        groups, each being an optional house word followed by a number
        with optional trailing parts.
    """
    INT = type('INT')
    ADJF = gram('ADJF')

    # Words that may introduce a unit number.
    HOUSE = morph_pipeline(['дом', 'корпус', 'квартира', 'строение', 'ст'])

    # Any token that is not an adjective.
    HOUSE_NOT = rule(and_(not_(ADJF)))
    # NOTE(review): the first two entries look identical but are distinct
    # string literals in the original (presumably Latin and Cyrillic "a").
    HOUSE1 = morph_pipeline(['a', 'а', '/', 'б'])

    # An INT token other than '3', then optional letter/slash, optional
    # non-adjective token and optional second INT.
    UNIT1 = or_(
        rule(and_(INT, not_(eq('3'))), HOUSE1.optional(), HOUSE_NOT.optional(),
             INT.optional()))

    UNIT = or_(rule(HOUSE.optional(), UNIT1))

    COMPLICATED_HOUSE = rule(UNIT.repeatable())

    # Kept as a one-alternative or_ so the returned object is the same as
    # before the unused definitions were removed.
    FINAL_HOUSE = or_(COMPLICATED_HOUSE)
    return FINAL_HOUSE
Exemplo n.º 13
0
def test_nested_facts():
    """A fact ``F`` can be stored as the value of an attribute of ``G``."""
    F = fact('F', ['a'])
    G = fact('G', ['b'])
    inner = rule(eq('a').interpretation(F.a)).interpretation(F)
    outer = inner.interpretation(G.b).interpretation(G)
    record = Parser(outer).match('a').fact
    assert record == G(b=F(a='a'))
    assert record.spans == [(0, 1)]
    assert record.as_json == {'b': {'a': 'a'}}
Exemplo n.º 14
0
def test_predicate_attribute():
    """Matching ``'a'`` yields ``F(a='a')`` with span ``(0, 1)``."""
    F = fact('F', ['a'])
    parser = Parser(rule(eq('a').interpretation(F.a)).interpretation(F))
    record = parser.match('a').fact
    expected = F(a='a')
    assert record == expected
    assert record.spans == [(0, 1)]
    assert record.as_json == {'a': 'a'}
Exemplo n.º 15
0
def get_hyperonyms(main_word):
    """Print the token values of every pattern match for ``main_word``.

    The Wikipedia summary of ``main_word`` is fetched, de-accented, stripped
    of parenthesised fragments and lower-cased, then scanned with a grammar
    built around the de-accented word. The function only prints (the cleaned
    summary first, then one list of token values per match) and returns
    ``None``.

    Args:
        main_word: the word whose summary is fetched and searched.
    """
    HYPONYM = eq(utils.deaccent(main_word))
    RULE = or_(rule(HYPONYM, ATAKJE, START, MID, END), rule(HYPONYM, MID, END),
               rule(START_S, END, KAK, HYPONYM), rule(END, INCLUDING, HYPONYM))
    parser = Parser(RULE)
    text = utils.deaccent(wikipedia.summary(main_word))
    print(text)
    # Drop parenthesised fragments (non-greedy, single line each).
    text = re.sub(r'\(.+?\)', '', text)
    text = text.lower().replace('* сергии радонежскии* ', '')
    # The match index was never used, so iterate over the matches directly.
    for match in parser.findall(text.lower()):
        print([token.value for token in match.tokens])
Exemplo n.º 16
0
def test_insted_attributes():
    """The outer attribute ``F.b`` receives the value; ``F.a`` stays ``None``."""
    F = fact('F', ['a', 'b'])
    token = eq('a').interpretation(F.a)
    as_b = rule(token).interpretation(F.b)
    parser = Parser(as_b.interpretation(F))
    record = parser.match('a').fact
    assert record == F(a=None, b='a')
    assert record.spans == [(0, 1)]
    assert record.as_json == {'b': 'a'}
Exemplo n.º 17
0
def _abbreviate(word: str, abbrs: List[str], opt=False):
    """Build a rule matching ``word`` or any of its abbreviations.

    Args:
        word: the full word; matched through ``normalized(word)`` and used
            as the constant interpretation of every alternative.
        abbrs: abbreviations of ``word``. They are split with ``partition``
            on whether they contain ``'-'`` (presumably a
            more_itertools-style ``partition`` yielding the non-matching
            items first - confirm against the import).
        opt: when true, the resulting rule is made optional.

    Returns:
        An ``or_`` of three alternatives - the full word, a dashed
        abbreviation matched token by token, or a plain abbreviation with an
        optional trailing dot - interpreted as the constant ``word``.
    """
    plain_abbrs, dashed_abbrs = partition(lambda abbr: '-' in abbr, abbrs)

    def dashed_rule(abbr):
        # 'a-b' becomes the caseless token sequence 'a', '-', 'b'.
        pieces = intersperse('-', abbr.split('-'))
        return rule(*map(caseless, pieces))

    full_form = rule(normalized(word))
    dashed_form = rule(or_(*(dashed_rule(abbr) for abbr in dashed_abbrs)))
    dotted_form = rule(
        or_(*(caseless(abbr) for abbr in plain_abbrs)),
        eq('.').optional(),
    )

    combined = or_(full_form, dashed_form, dotted_form)
    combined = combined.interpretation(interpretation.const(word))

    if opt:
        return combined.optional()
    return combined
Exemplo n.º 18
0
def test_nested_facts():
    """Interpreting an ``F`` rule as ``G.b`` nests the ``F`` fact inside ``G``."""
    F = fact('F', ['a'])
    G = fact('G', ['b'])
    grammar = rule(eq('a').interpretation(F.a))
    # Apply the remaining interpretations from the innermost outwards.
    for step in (F, G.b, G):
        grammar = grammar.interpretation(step)
    record = Parser(grammar).match('a').fact
    assert record == G(b=F(a='a'))
    assert record.spans == [(0, 1)]
    assert record.as_json == {'b': {'a': 'a'}}
Exemplo n.º 19
0
        pidxs = json.loads(prestr(val[2]))
        concp = [el.split(",")[0] for el in json.loads(prestr(val[3]))]
        idx2syns.update(dict(zip(pidxs, concp)))
    except:
        print(prestr(val[2]))
        print(prestr(val[3]))

# In[ ]:

# In[4]:

# START: an optional ADJF or NOUN token followed by a NOUN token.
START = rule(
    or_(rule(gram('ADJF')), rule(gram('NOUN'))).optional(), gram('NOUN'))

# START_S: either of the literal tokens 'такой' / 'такие' ("such").
START_S = or_(
    eq('такой'),
    eq('такие'),
)

# KAK: the literal token 'как' ("as" / "like").
KAK = eq('как')
INCLUDING = or_(
    or_(
        eq('в'),
        eq('том'),
        eq('числе'),
    ),
    eq('включающий'),
    or_(
        eq('включающий'),
        eq('в'),
        eq('себя'),
Exemplo n.º 20
0
# coding: utf-8
from __future__ import unicode_literals

from yargy import (rule, and_, or_, fact)
from yargy.predicates import (eq, in_, gram, normalized, caseless)

# Fact with two attributes: the amount and the currency.
Money = fact('Money', ['amount', 'currency'])

# Currency words / symbols.
EURO = normalized('евро')

DOLLARS = or_(normalized('доллар'), eq('$'))

# Either the normalized word 'рубль', or the abbreviation 'руб' / 'р'
# (case-insensitive) with an optional trailing dot.
RUBLES = or_(rule(normalized('рубль')),
             rule(or_(caseless('руб'), caseless('р')),
                  eq('.').optional()))

# Any of the currencies above, stored in Money.currency.
CURRENCY = or_(rule(EURO), rule(DOLLARS),
               RUBLES).interpretation(Money.currency)

# NOTE(review): built with gram('INT'), whereas a token-type predicate is
# the usual way to match integers - confirm this matches as intended.
INT = gram('INT')

# One to three INT tokens in a row, or INT groups separated by '.'.
AMOUNT_ = or_(
    rule(INT),
    rule(INT, INT),
    rule(INT, INT, INT),
    rule(INT, '.', INT),
    rule(INT, '.', INT, '.', INT),
)

# An amount followed by ',' or '.' and one more INT token.
# NOTE(review): the name looks like a truncated "FRACTION_AMOUNT"; it is a
# module-level name, so check for users before renaming.
FRACTION_AMOUN = rule(AMOUNT_, in_({',', '.'}), INT)
Exemplo n.º 21
0
from yargy import rule, and_, or_
from yargy.interpretation import (fact, const, attribute)
from yargy.predicates import (eq, length_eq, in_, in_caseless, type,
                              normalized, caseless, dictionary)

# Fact with a single attribute 'part'.
Part = fact('Part', ['part'])

# Money fact. The second argument of attribute(...) is presumably the
# default value of that attribute - confirm against the yargy API.
Money = fact('Money', [
    'integer_min',
    attribute('integer_max', -1),
    attribute('currency', '-'),
    attribute('multiplier', -1),
    attribute('period', '-')
])

# Basic token predicates: a literal dot and an INT-typed token.
DOT = eq('.')
INT = type('INT')

########
#
#   CURRENCY
#
##########

# Euro spellings / symbol / code, all interpreted as the constant 'EUR'.
EURO = or_(normalized('евро'), normalized('euro'), eq('€'),
           caseless('EUR')).interpretation(const('EUR'))

# Dollar spellings / symbol / code, all interpreted as the constant 'USD'.
DOLLARS = or_(normalized('доллар'), normalized('дол'), normalized('dollar'),
              eq('$'), caseless('USD')).interpretation(const('USD'))

RUBLES = or_(
                max.amount *= 1000
        if not min.currency:
            min.currency = max.currency
        # if (min.currency is not None) and (min.currency != 'RUB') and (max.currency is not None):
        #     max.currency
        elif min.currency != max.currency:
            min.currency = max.currency
        # для рублевых вилок типа 150-250 без указания тысяч домножаем на тысячу
        if (max.amount < 1000) and (min.amount < 1000) and (max.currency
                                                            == 'RUB'):
            min.amount *= 1000
            max.amount *= 1000
        return dsl.Range(min, max)


# Basic token predicates: a literal dot and an INT-typed token.
DOT = eq('.')
INT = type('INT')

########
#
#   CURRENCY
#
##########

# EURO = or_(
#     normalized('евро'),
#     #in_(['€', 'EUR'])
#     eq('€'),
#     #eq('EUR')
# ).interpretation(
#     const(dsl.EURO)
Exemplo n.º 23
0
    rule(caseless('м'), '.'),
    rule(normalized('метро')),
)

# Quote characters accepted on either side of a station title; the angle
# quotes are added separately for the left and right side.
# NOTE(review): in_() receives a plain string here, so membership is
# presumably tested against that string - confirm with the in_ predicate.
__quotes = "„“”‚‘’'\""
LEFT_QUOTE = in_("«" + __quotes)
RIGHT_QUOTE = in_("»" + __quotes)

# A single station: optional station / metro words, optional opening quote,
# the title (its value stored in Station.name), an optional numeral part
# (stored in Station.num) and an optional closing quote.
STATION = rule(
    STATION_WORD.optional(),
    METRO_WORD.optional(),
    LEFT_QUOTE.optional(),
    STATION_TITLE.interpretation(
        meaning.custom(lambda p: p.value)).interpretation(Station.name),
    rule(
        eq('-').optional(),
        LIST_OF_NUMERALS.interpretation(Station.num),
    ).optional(),
    RIGHT_QUOTE.optional(),
).interpretation(Station)

# One station followed by any number of "<separator> station" pairs. Each
# station is collected into Array.element, and the final custom step
# returns that element attribute instead of the Array fact itself.
LIST_OF_STATIONS = rule(
    STATION.means(Array.element),
    rule(
        in_caseless('и,-'),
        STATION.means(Array.element),
    ).repeatable().optional(),
).interpretation(Array).interpretation(meaning.custom(lambda p: p.element))

FROM_STATION_TO_STATION = rule(
    or_(caseless('с'), caseless('со')),
Exemplo n.º 24
0
    'девяносто': 90,
    'сто': 100,
    'двести': 200,
    'триста': 300,
    'четыреста': 400,
    'пятьсот': 500,
    'шестьсот': 600,
    'семьсот': 700,
    'восемьсот': 800,
    'девятьсот': 900,
    'тысяча': 10**3,
    'миллион': 10**6,
    'миллиард': 10**9,
    'триллион': 10**12,
}
# Basic token predicates: a literal dot and an INT-typed token.
DOT = eq('.')
INT = type('INT')
# Fraction words, interpreted as the constants 10**-3, 10**-2 and 10**-1.
THOUSANDTH = rule(caseless_pipeline(['тысячных', 'тысячная'])).interpretation(const(10**-3))
HUNDREDTH = rule(caseless_pipeline(['сотых', 'сотая'])).interpretation(const(10**-2))
TENTH = rule(caseless_pipeline(['десятых', 'десятая'])).interpretation(const(10**-1))
# Multiplier words and abbreviations. 'т' requires the trailing dot, while
# for 'тыс' and 'млн' the dot is optional.
THOUSAND = or_(
    rule(caseless('т'), DOT),
    rule(caseless('тыс'), DOT.optional()),
    rule(normalized('тысяча')),
    rule(normalized('тыща'))
).interpretation(const(10**3))
MILLION = or_(
    rule(caseless('млн'), DOT.optional()),
    rule(normalized('миллион'))
).interpretation(const(10**6))
MILLIARD = or_(
Exemplo n.º 25
0
    'август': 8,
    'сентябрь': 9,
    'октябрь': 10,
    'ноябрь': 11,
    'декабрь': 12,
}

# Month given by name: any key of MONTHS, stored normalized in Date.month.
MONTH_NAME = dictionary(MONTHS).interpretation(Date.month.normalized())

# Month and day given as numbers within the usual ranges.
MONTH = and_(gte(1), lte(12)).interpretation(Date.month)

DAY = and_(gte(1), lte(31)).interpretation(Date.day)

# Words / symbols that may follow a year.
# NOTE(review): normalized('г.') expects the letter and the dot as one
# token - confirm the tokenizer can produce such a token.
YEAR_WORD = or_(
    rule('г',
         eq('.').optional()),
    rule(normalized('г.'),
         eq('.').optional()),
    rule(normalized('год')),
    rule(normalized('гг')),
    rule(')'),
)
# Words / symbols that may precede a year.
# NOTE(review): the first three literals end with a space; confirm that a
# token can contain trailing whitespace, otherwise they never match. Also
# confirm whether the 'c' literal is Latin or Cyrillic.
YEAR_PREFIX = or_(
    rule('в '),
    rule('c '),
    rule(', '),
    rule('('),
)

YEAR_POSTFIX = or_(
    rule('е'),
Exemplo n.º 26
0

Range = fact('Range', ['min', 'max'])


class Range(Range, Normalizable):
    """``Range`` fact extended with a ``normalized`` view."""

    @property
    def normalized(self):
        """Return a ``dsl.Range`` built from the normalized bounds.

        When the normalized lower bound has no currency, it takes the
        currency of the normalized upper bound.
        """
        lower = self.min.normalized
        upper = self.max.normalized
        if not lower.currency:
            lower.currency = upper.currency
        return dsl.Range(lower, upper)


# Basic token predicates: a literal dot and an INT-typed token.
DOT = eq('.')
INT = type('INT')

########
#
#   CURRENCY
#
##########

# Euro word / symbol, interpreted as the constant dsl.EURO.
EURO = or_(normalized('евро'), eq('€')).interpretation(const(dsl.EURO))

# Dollar word / symbol, interpreted as the constant dsl.DOLLARS.
DOLLARS = or_(normalized('доллар'), eq('$')).interpretation(const(dsl.DOLLARS))

RUBLES = or_(
    rule(normalized('рубль')),
    rule(or_(caseless('руб'), caseless('р'), eq('₽')),
Exemplo n.º 27
0
# Numeric month in 1..12, converted with int() and stored in Date.month.
MONTH = and_(
    gte(1),
    lte(12)
).interpretation(
    Date.month.custom(int)
)

# Numeric day in 1..31, converted with int() and stored in Date.day.
DAY = and_(
    gte(1),
    lte(31)
).interpretation(
    Date.day.custom(int)
)

# The year marker: 'г' with an optional dot, or the normalized word 'год'.
YEAR_WORD = or_(
    rule('г', eq('.').optional()),
    rule(normalized('год'))
)

# Four-digit year in 1000..2100, converted with int() and stored in
# Date.year.
YEAR = and_(
    gte(1000),
    lte(2100)
).interpretation(
    Date.year.custom(int)
)

YEAR_SHORT = and_(
    length_eq(2),
    gte(0),
    lte(99)
).interpretation(
Exemplo n.º 28
0
)
# Address facts; each one stores a value together with its 'type'.
Street = fact(
    'Street',
    ['name', 'type']
)
Building = fact(
    'Building',
    ['number', 'type']
)
Room = fact(
    'Room',
    ['number', 'type']
)


# Literal punctuation tokens.
DASH = eq('-')
DOT = eq('.')

# Grammeme, token-type and capitalisation predicates used by the rules.
ADJF = gram('ADJF')
NOUN = gram('NOUN')
INT = type('INT')
TITLE = is_title()

# A number with an adjective-like ending: INT, an optional dash, then one
# of the listed endings (matched case-insensitively).
ANUM = rule(
    INT,
    DASH.optional(),
    in_caseless({
        'я', 'й', 'е',
        'ое', 'ая', 'ий', 'ой'
    })
)
Exemplo n.º 29
0
    'Range',
    ['min', 'max']
)


class Range(Range, Normalizable):
    """``Range`` fact extended with a ``normalized`` view."""

    @property
    def normalized(self):
        """Return a ``dsl.Range`` of the normalized ``min`` and ``max``.

        A lower bound without a currency inherits the currency of the upper
        bound before the range is built.
        """
        low = self.min.normalized
        high = self.max.normalized
        if not low.currency:
            low.currency = high.currency
        return dsl.Range(low, high)


# Basic token predicates: a literal dot and an INT-typed token.
DOT = eq('.')
INT = type('INT')


########
#
#   CURRENCY
#
##########


EURO = or_(
    normalized('евро'),
    eq('€')
).interpretation(
    const(dsl.EURO)
Exemplo n.º 30
0
    'июль': 7,
    'август': 8,
    'сентябрь': 9,
    'октябрь': 10,
    'ноябрь': 11,
    'декабрь': 12,
}

# Month given by name: the normalized word is mapped to its number through
# a MONTHS lookup and stored in Date.month.
MONTH_NAME = dictionary(MONTHS).interpretation(Date.month.normalized().custom(
    MONTHS.__getitem__))

# Numeric month / day within the usual ranges, converted with int().
MONTH = and_(gte(1), lte(12)).interpretation(Date.month.custom(int))

DAY = and_(gte(1), lte(31)).interpretation(Date.day.custom(int))

# The year marker: 'г' with an optional dot, or the normalized word 'год'.
YEAR_WORD = or_(rule('г', eq('.').optional()), rule(normalized('год')))

YEAR = and_(gte(1000), lte(2100)).interpretation(Date.year.custom(int))

# Two-digit year: 1900 is added to the parsed value.
YEAR_SHORT = and_(gte(0), lte(99)).interpretation(
    Date.year.custom(lambda _: 1900 + int(_)))

# Year used together with the era marker below; range 1..100000.
ERA_YEAR = and_(gte(1), lte(100000)).interpretation(Date.year.custom(int))

# "до н. э." / "до нашей эры" ("BC"): sets Date.current_era to False.
ERA_WORD = rule(
    eq('до'),
    or_(rule('н', eq('.'), 'э',
             eq('.').optional()),
        rule(normalized('наша'),
             normalized('эра')))).interpretation(Date.current_era.const(False))
Exemplo n.º 31
0
Arquivo: City.py Projeto: yazimut/NLP
from yargy import rule, and_, or_, not_
from yargy.predicates import eq, type as _type, normalized, custom
from yargy.pipelines import morph_pipeline
from yargy.interpretation import fact

# Fact describing a city mention: an optional prefix word and the title.
CityFact = fact('city', ['prefix', 'title'])

# Known city names, stored normalized in CityFact.title.
CityTitle = morph_pipeline({
    'липецк', 'сургут', 'нальчик', 'москва', 'санкт-петербург', 'питер',
    'нижний новгород', 'видное'
}).interpretation(CityFact.title.normalized())

# Optional word 'город' (stored in CityFact.prefix), the city title and an
# optional trailing ';'.
CityRule = rule(
    normalized('город').optional().interpretation(CityFact.prefix), CityTitle,
    eq(';').optional()).interpretation(CityFact)
Exemplo n.º 32
0
)
# Fact for a legal-code reference. Structural-unit attributes ('point',
# 'subpoint', 'part', ...) alternate with 'n0'..'n8' attributes, which
# presumably hold the corresponding numbers - confirm with the CODEX rule.
COD = fact(
      'Codex',
      ['n0', 'point', 'n1', 'subpoint', 'n2', 'part', 'n3', 'article', 'n4', 'par', 'n5',
       'subsection', 'n6', 'section', 'n7', 'chapter', 'n8', 'type', 'codex']
)

# Fact for a court mention.
COURT_ = fact(
    'Court',
    ['smth', 'type', 'court', 'rf']
)

# A number in 1..10000.
NUM = and_(gte(1), lte(10000))

# A number optionally followed by further numbers, each of which may be
# preceded by a dot (e.g. "1.2.3").
NUMBERS = rule(NUM,
               rule(eq('.').optional(), NUM).repeatable().optional())


CODEX = rule(
        or_(rule(normalized('пункт')),
            rule('п', eq('.').optional())
        ).repeatable().optional().interpretation(COD.point),

        NUMBERS.repeatable().optional().interpretation(COD.n1),

        or_(rule(normalized('подпункт')),
            rule('пп', eq('.').optional())
        ).repeatable().optional().interpretation(COD.subpoint),

        NUMBERS.repeatable().optional().interpretation(COD.n2),
Exemplo n.º 33
0
    'нотариальная контора',
    'букмекерская контора',
    'авиазавод',
    'автозавод',
    'винзавод',
    'подстанция',
    'гидроэлектростанция',
])

# Relation object shared by the adjectives below; gnc presumably stands for
# gender-number-case agreement - confirm with yargy.relations.
gnc = gnc_relation()
# One or more adjectives (plain or hyphenated compounds), each optionally
# followed by 'и' or a comma.
ADJF_PREFIX = rule(
    or_(
        rule(gram('ADJF').match(gnc)),  # e.g. "международное" (international)
        rule(  # e.g. "историко-просветительское" (historical-educational)
            true(),
            eq('-'),
            gram('ADJF').match(gnc),
        ),
    ),
    or_(caseless('и'), eq(',')).optional(),
).repeatable()

# Optional run of 'gent' tokens tied together by one case relation.
case = case_relation()
GENT_GROUP = rule(
    gram('gent').match(case)
).repeatable().optional()

QUOTED = rule(
    TYPE,
    in_(QUOTES),
    not_(in_(QUOTES)).repeatable(),
Exemplo n.º 34
0
            'data scientist', 'data engineer', 'engineer', 'analyst',
            'data analyst', 'data manager', 'scientist', 'researcher',
            "developer", "intern"
        ]), rule(dictionary(['DS', 'DE']), is_capitalized()),
        morph_pipeline(["аналитик", "разработчик",
                        "стажер"])).interpretation(Position.name.inflected()))

# Field of work, matched case-insensitively and stored in Position.field.
FIELD = rule(
    caseless_pipeline([
        'ML', 'DL', 'CV', 'computer vision', 'NLP', 'bi', 'machine learning',
        'deep learning', 'software', 'research', 'big data', 'python', 'c++',
        "scala", "java", 'ios', "android", 'devops', "backend", 'frontend'
    ]).interpretation(Position.field))

# "head of <area>": 'head' goes to Position.level, the area to
# Position.field.
HEAD = rule(
    caseless('head').interpretation(Position.level), eq('of'),
    caseless_pipeline(['analytics', 'predictive analytics',
                       'data science']).interpretation(Position.field))

# A position is either "[level] [field] [-] name" or a HEAD phrase.
POSITION = or_(
    rule(LEVEL.optional(), FIELD.optional(),
         eq('-').optional(), NAME), HEAD).interpretation(Position)


# TODO: an extract method with filtering is needed. For example, weed out possible false positives ("аналитика"),
#  filter by length (the more fields are filled in, the more preferable the match),
#  and emit "аналитик" only when nothing more specific was found
class PositionExtractor(Extractor):
    """Extractor initialised with the ``POSITION`` rule."""

    def __init__(self):
        # Hand the module-level POSITION grammar to the Extractor base class.
        super(PositionExtractor, self).__init__(POSITION)
Exemplo n.º 35
0
from yargy import rule, and_, not_, or_
from yargy.interpretation import fact
from yargy.predicates import gram, eq, type, in_
from yargy.relations import gnc_relation
from yargy.pipelines import morph_pipeline
from .data import NCONTRACT

# Basic token predicates.
INT = type('INT')
DOT = eq('.')
LEFT = eq('<')
# NOTE(review): in_() receives the string '>.', so membership is presumably
# tested against that string - confirm with the in_ predicate.
RIGHT = rule(in_('>.'))

# Fact holding the parts of a document date.
Datecont = fact('Datecont', ['day', 'month', 'year'])

# The word 'от' ("from" / "dated").
OT = rule(eq('от'))

# What may precede the date: the imported NCONTRACT rule or the word 'от'.
BEFOREDATE = or_(NCONTRACT, OT)

DAY = rule(INT).interpretation(Datecont.day.custom(int))

# Month as a name or as a number; unlike DAY and YEAR it is stored without
# an int() conversion.
MONTH = or_(
    morph_pipeline([
        'январь', 'февраль', 'март', 'апрель', 'май', 'июнь', 'июль', 'август',
        'сентябрь', 'октябрь', 'ноябрь', 'декабрь'
    ]), rule(INT)).interpretation(Datecont.month)

YEAR = rule(INT).interpretation(Datecont.year.custom(int))

# Rule for the document date
DATECONT = rule(BEFOREDATE, LEFT.optional(), DAY, RIGHT.optional(), MONTH,
                DOT.optional(), YEAR).interpretation(Datecont)
Exemplo n.º 36
0
# Time of day, optionally introduced by 'в'. Two alternatives:
#   1. an hour word / number (mapped through WORDS_HOUR_OF_A_DAY when
#      present there, then int()), an optional 'час' and an optional
#      AM/PM marker;
#   2. a strict "H:MM[:SS]" form, where matching ':' sets
#      DayTime.strict_format to True.
RULE_DAY_TIME = rule(
    rule("в").optional(),
    or_(
        rule(
            HOUR_OF_A_DAY.interpretation(DayTime.hour.normalized().custom(
                lambda val: int(WORDS_HOUR_OF_A_DAY.get(val, val)))),
            normalized("час").optional(),
            AM_PM.optional().interpretation(
                DayTime.am_pm.normalized().custom(normalize_am_pm))),
        rule(
            and_(
                gte(0),
                lte(23),
            ).interpretation(DayTime.hour.custom(int)),
            eq(":").interpretation(
                DayTime.strict_format.custom(lambda _: True)),
            and_(gte(0), lte(59)).interpretation(
                DayTime.minute.normalized().custom(int)),
            rule(
                ":",
                and_(gte(0), lte(59)).interpretation(DayTime.second.custom(
                    int))).optional()))).interpretation(DayTime)

# Relative day word (normalized through normalize_relative_day), optionally
# introduced by 'в' and optionally followed by a time of day.
RULE_RELATIVE_DAY = rule(
    rule("в").optional(),
    RELATIVE_DAY.interpretation(
        RelativeDay.relative_day.normalized().custom(normalize_relative_day)),
    RULE_DAY_TIME.optional().interpretation(
        RelativeDay.day_time)).interpretation(RelativeDay)

RULE_DAY_OF_THE_WEEK = rule(
Exemplo n.º 37
0
from imaplib import Months
from yargy import Parser, rule, and_, or_, not_
from yargy.pipelines import morph_pipeline
from yargy.interpretation import fact
from IPython.display import display
from yargy.predicates import gram, eq, lte, gte, in_, is_capitalized, dictionary, normalized, caseless,type as typ
from yargy.predicates.bank import tokenize

from yargy.tokenizer import INT, MorphTokenizer


# tokenizer = MorphTokenizer()

# INT-typed token (typ is the imported alias of the yargy type predicate).
NUMINT = typ('INT')
# Decimal separator: either '.' or ','.
DOT = or_(eq('.'), eq(','))
# A decimal number written as INT, separator, INT.
FLOAT = rule(
    NUMINT,
    DOT,
    NUMINT
)
# Any number: a bare integer or a decimal.
NUM = or_(
    rule(NUMINT),
    FLOAT
)

# Temperature

# Fact with attributes 'min', 'max' and 'singular'.
Temperature = fact(
    'Temperature',
    ['min', 'max', 'singular']
)
Exemplo n.º 38
0
    'апрель': 4,
    'май': 5,
    'июнь': 6,
    'июль': 7,
    'август': 8,
    'сентябрь': 9,
    'октябрь': 10,
    'ноябрь': 11,
    'декабрь': 12,
}

# Month given by name: any key of MONTHS, stored normalized in Date.month.
MONTH_NAME = dictionary(MONTHS).interpretation(Date.month.normalized())

# Numeric month / day within the usual ranges.
MONTH = and_(gte(1), lte(12)).interpretation(Date.month)

DAY = and_(gte(1), lte(31)).interpretation(Date.day)

# The year marker: 'г' with an optional dot, or the normalized word 'год'.
YEAR_WORD = or_(rule('г', eq('.').optional()), rule(normalized('год')))

# Full year in 1900..2100 and two-digit year in 0..99; both are stored in
# Date.year without any conversion.
YEAR = and_(gte(1900), lte(2100)).interpretation(Date.year)

YEAR_SHORT = and_(gte(0), lte(99)).interpretation(Date.year)

# Supported date layouts: "D.M.Y [г.]", "Y г.", "D month", "month Y [г.]"
# and "D month Y [г.]".
DATE = or_(
    rule(DAY, '.', MONTH, '.', or_(YEAR, YEAR_SHORT), YEAR_WORD.optional()),
    rule(YEAR, YEAR_WORD),
    rule(DAY, MONTH_NAME),
    rule(MONTH_NAME, YEAR, YEAR_WORD.optional()),
    rule(DAY, MONTH_NAME, YEAR, YEAR_WORD.optional()),
).interpretation(Date)
Exemplo n.º 39
0
    or_(
        or_(
            gram('PREP'),
            gram('Vpre'),
            gram('CONJ'),
            gram('PRCL'),
            gram('INTJ'),
        ),
        gram('POST'),
    ).optional())

# Optional run of 'gent' tokens tied together by one case relation.
case = case_relation()
GENT_GROUP = rule(gram('gent').match(case)).repeatable().optional()

# ADJF
# Optional separator between adjectives: 'и' or a comma.
ADJF_PREFIX_COUNTABLE = rule(or_(caseless('и'), eq(',')).optional(), )

# One or more tokens satisfying both the ADJF and the TITLE predicates.
ADJF_PREFIX_ADJF = and_(ADJF, TITLE).repeatable()

# One or more adjectives accepted by the EDUORG_DICT_REGEXP search.
# NOTE(review): types=(str) is just `str`, not a one-element tuple -
# confirm that custom() accepts a bare type here.
ADJF_NORM = rule(
    and_(ADJF, custom(lambda s: EDUORG_DICT_REGEXP.search(s),
                      types=(str)))).repeatable()

ADJF_PREFIX = rule(
    ADJF_PREFIX_ADJF,
    ADJF.optional(),  # e.g. "Киевском государственном университете" (Kyiv State University)
    ADJF_PREFIX_COUNTABLE).repeatable()
#
###

### 1-ST RING RULES