def req_deverbal(require_deverbal_noun: str = "?"):
    """Build the token predicate for the requested deverbal-noun policy.

    Args:
        require_deverbal_noun: One of three string flags:
            "1" -- accept only nouns found (case-insensitively) in
                   ``deverbal_nouns``;
            "0" -- accept only tokens tagged ``VERB`` or ``INFN``;
            "?" -- accept either of the above (default).

    Returns:
        Whatever ``y.and_`` / ``y.or_`` return for the selected policy.
        NOTE(review): ``y`` / ``yp`` look like aliases for ``yargy`` and
        ``yargy.predicates`` -- confirm against the module imports.

    Raises:
        ValueError: If the flag is not one of "1", "0" or "?".
    """
    # Reject unknown flags up front so no predicate is built needlessly.
    if require_deverbal_noun not in ("1", "0", "?"):
        raise ValueError("Incorrect deverbal status")

    if require_deverbal_noun == "0":
        # Strictly a regular verb.
        return y.or_(yp.gram("VERB"), yp.gram("INFN"))

    # Both remaining modes need the "noun listed as deverbal" predicate.
    deverbal_noun = y.and_(yp.gram("NOUN"), yp.in_caseless(deverbal_nouns))
    if require_deverbal_noun == "1":
        # Strictly a deverbal noun.
        return deverbal_noun

    # "?": anything -- deverbal noun or a regular verb form.
    return y.or_(deverbal_noun, yp.gram("VERB"), yp.gram("INFN"))
) DASH = eq('-') DOT = eq('.') ADJF = gram('ADJF') NOUN = gram('NOUN') INT = type('INT') TITLE = is_title() ANUM = rule( INT, DASH.optional(), in_caseless({ 'я', 'й', 'е', 'ое', 'ая', 'ий', 'ой' }) ) ######### # # STRANA # ########## # TODO COUNTRY_VALUE = dictionary({ 'россия', 'украина'
'сутки', 'смена', 'm', 'month', 'м' 'месяц', 'мес', 'y', 'year', 'г', 'год', } PERIOD = dictionary(PERIODS) PER = or_(eq('/'), in_caseless({'в', 'за', 'per'})) RATE = rule(PER, PERIOD.interpretation(Money.period)) MONEY = rule( or_( in_({ '•', ':', '`', '~', '*', '-', '–', '—', ';', '.', '(', 'от', 'from' }), type('RU'), type('LATIN'), ).optional(), CURRENCY.interpretation(Money.currency).optional(), eq('+').optional(), eq('*').optional(), INTEGER.interpretation(Money.integer_min.custom(normalize_integer)),
# Any of the magnitude rules; the match is stored in Money.multiplier.
MULTIPLIER = or_(
    MILLIARD,
    MILLION,
    THOUSAND,
).interpretation(Money.multiplier)


########
#
#   NUMERAL
#
#######


NUMR = or_(
    gram('NUMR'),
    # These two words are listed explicitly in addition to the NUMR tag, see
    # https://github.com/OpenCorpora/opencorpora/issues/818
    dictionary({'ноль', 'один'}),
)

# TODO: the fractional parts could be dropped to reduce false positives;
# they never occur in real salary ranges anyway.
# (Although one vacancy at Tampere University of Technology really did
# contain fractions.)
MODIFIER = in_caseless({'целых', 'сотых', 'десятых'})

# One building block of a numeral: a number-like token, a magnitude word
# or a currency marker.
PART = or_(
    rule(or_(INT, NUMR, MODIFIER)),
    MILLIARD,
    MILLION,
    THOUSAND,
    CURRENCY,
    COINS_CURRENCY,
)

# TODO: this could be adjusted so that phone numbers are not parsed.
# NOTE(review): in_ receives the plain string '()//' rather than a set --
# confirm the intended membership semantics.
BOUND = in_('()//')

# A run of PART items enclosed by a BOUND token on each side.
NUMERAL = rule(BOUND, PART.repeatable(), BOUND)


#######
#
#   AMOUNT
#
########
# Closing quote: the right guillemet or any character from __quotes.
RIGHT_QUOTE = in_("»" + __quotes)

# A single station mention: optional "station"/"metro" words, an optionally
# quoted title, and an optional list of numbers after the title.
STATION = rule(
    STATION_WORD.optional(),
    METRO_WORD.optional(),
    LEFT_QUOTE.optional(),
    # Take the matched title's ``value`` and store it as Station.name.
    STATION_TITLE
    .interpretation(meaning.custom(lambda title: title.value))
    .interpretation(Station.name),
    # Optional numeric suffix, possibly attached with a dash.
    rule(
        eq('-').optional(),
        LIST_OF_NUMERALS.interpretation(Station.num),
    ).optional(),
    RIGHT_QUOTE.optional(),
).interpretation(Station)

# One station followed by any number of "<separator> <station>" pairs.
# The Array fact is unwrapped to its ``element`` attribute.
LIST_OF_STATIONS = (
    rule(
        STATION.means(Array.element),
        rule(
            in_caseless('и,-'),
            STATION.means(Array.element),
        ).repeatable().optional(),
    )
    .interpretation(Array)
    .interpretation(meaning.custom(lambda found: found.element))
)

# "с/со <station> на <station>"; both stations are collected via Array.element.
FROM_STATION_TO_STATION = (
    rule(
        or_(caseless('с'), caseless('со')),
        STATION.means(Array.element),
        caseless('на'),
        # TODO: use LIST_OF_STATIONS here to support phrases such as
        # "со спасской на садовую и сенную" (one source, several targets).
        STATION.means(Array.element),
    )
    .interpretation(Array)
    .interpretation(meaning.custom(lambda found: found.element))
)
class Building(Building):
    """Building fact extended with a ``value`` accessor.

    ``value`` is produced by the module-level ``value`` helper for the
    'buildingName' key.
    """

    value = value('buildingName')


class AddrPart(AddrPart):
    """AddrPart fact that can convert itself to a ``natasha.obj.AddrPart``."""

    @property
    def obj(self):
        """Return ``natasha.obj.AddrPart`` built from the wrapped part."""
        # Imported lazily, exactly where it is needed.
        from natasha import obj

        wrapped = self.value
        return obj.AddrPart(wrapped.value, wrapped.type)


# Token of type INT.
# NOTE: ``type`` here is the name in scope in this module (it shadows the
# builtin) -- presumably yargy's ``type`` predicate.
INT = type('INT')

# A single Cyrillic letter from the explicit list below (case-insensitive).
LETTER = in_caseless(set('абвгдежзиклмнопрстуфхшщэюя'))

TYPE_CITY = dictionary({
    'город',
}).interpretation(City.typeCity)

STRUCTURE_TYPE = dictionary({
    'строение',
    'ст',
}).interpretation(Structure.structureType)

TYPE_APPART = dictionary({
    'квартира',
}).interpretation(Appart.typeAppart)

BUILDING_TYPE = dictionary({
    'дом',
    'шоссе',
    'проспект',
    'улица',
}).interpretation(Building.buildingType)

# A number optionally followed by one letter.
VALUE = rule(
    INT,
    LETTER.optional(),
)

# Separator characters: '/', '\' and '-'.
SEP = in_(r'/\-')
from yargy.tokenizer import QUOTES INT = type('INT') DOT = eq('.') ADJF = gram('ADJF') NOUN = gram('NOUN') TITLE = is_title() DASH = eq('-') SLASH = eq('/') QUOTE = in_(QUOTES) ANUM = rule( INT, DASH.optional(), in_caseless({ 'я', 'й', 'е', 'ое', 'ая', 'ий', 'ой' }) ) def value(key): @property def field(self): return getattr(self, key) return field OnlyNameStreet = fact( 'OnlyNameStreet', ['name'] ) class OnlyNameStreet(OnlyNameStreet):
# Any of the magnitude rules; the match is stored in Money.multiplier.
MULTIPLIER = or_(
    MILLIARD,
    MILLION,
    THOUSAND,
).interpretation(Money.multiplier)


########
#
#   NUMERAL
#
#######


NUMR = or_(
    gram('NUMR'),
    # These two words are listed explicitly in addition to the NUMR tag, see
    # https://github.com/OpenCorpora/opencorpora/issues/818
    dictionary({'ноль', 'один'}),
)

# Words that introduce or name a fractional part.
MODIFIER = in_caseless({'целых', 'сотых', 'десятых'})

# One building block of a numeral: a number-like token, a magnitude word
# or a currency marker.
PART = or_(
    rule(or_(INT, NUMR, MODIFIER)),
    MILLIARD,
    MILLION,
    THOUSAND,
    CURRENCY,
    COINS_CURRENCY,
)

# NOTE(review): in_ receives the plain string '()//' rather than a set --
# confirm the intended membership semantics.
BOUND = in_('()//')

# A run of PART items enclosed by a BOUND token on each side.
NUMERAL = rule(BOUND, PART.repeatable(), BOUND)


#######
#
#   AMOUNT
#
########
return obj.AddrPart(part.value, part.type) DASH = eq('-') DOT = eq('.') ADJF = gram('ADJF') NOUN = gram('NOUN') INT = type('INT') TITLE = is_title() ANUM = rule( INT, DASH.optional(), in_caseless({ 'я', 'й', 'е', 'ое', 'ая', 'ий', 'ой' }) ) ######### # # STRANA # ########## # TODO COUNTRY_VALUE = dictionary({ 'россия', 'украина'
from yargy import or_, rule
from yargy.interpretation import attribute, fact
import yargy.interpretation as meaning
from yargy.predicates import caseless, gram, in_caseless, normalized

from .station import FROM_STATION_TO_STATION, LIST_OF_STATIONS, STATION

# Fact describing a transfer; ``to`` holds the target station(s).
# NOTE(review): the default is a mutable list literal -- confirm yargy does
# not share it between fact instances.
Transfer = fact('Transfer', [attribute('to', default=[])])

# An optional adjective (e.g. "пешеходный", i.e. "pedestrian"), the word
# "переход" in any form, then optionally the stations involved.
TRANSFER = rule(
    gram('ADJF').optional(),
    normalized('переход'),
    or_(
        # "from <station> to <station>"
        FROM_STATION_TO_STATION.interpretation(Transfer.to),
        # optional preposition followed by a list of stations
        rule(
            or_(
                caseless('на'),
                caseless('между'),
                caseless('с'),
            ).optional(),
            LIST_OF_STATIONS.interpretation(Transfer.to),
        ),
    ).optional(),
).interpretation(Transfer)

# Fact pairing a station with the transfer mentioned right after it.
StationAndTransfer = fact('StationAndTransfer', ['station', 'transfer'])

# A station, optionally followed by a separator and a transfer; only the
# transfer's ``to`` value is kept in StationAndTransfer.transfer.
STATION_AND_TRANSFER = rule(
    STATION.interpretation(StationAndTransfer.station),
    rule(
        in_caseless('и,'),
        TRANSFER
        .interpretation(meaning.custom(lambda transfer: transfer.to))
        .interpretation(StationAndTransfer.transfer),
    ).optional(),
).interpretation(StationAndTransfer)
# ####### NUMR = or_( gram('NUMR'), # https://github.com/OpenCorpora/opencorpora/issues/818 dictionary({ 'ноль', 'один' }), ) MODIFIER = in_caseless({ 'целых', 'сотых', 'десятых' }) PART = or_( rule( or_( INT, NUMR, MODIFIER ) ), MILLIARD, MILLION, THOUSAND, CURRENCY,
# Fact accumulating every matched number in the repeatable ``values`` field.
Nums = fact('Nums', [attribute('values').repeatable()])

# Number words mapped to their integer values.
__literals = {
    'один': 1,
    'два': 2,
    'три': 3,
    'четыре': 4,
    'пять': 5,
    'шесть': 6,
    'семь': 7,
    'восемь': 8,
    'девять': 9,
}

# A number word; its normalized form is looked up in __literals.
LITERAL = dictionary(__literals).means(
    interp.normalized().custom(__literals.get)
)

# Separators allowed between the items of a list of numbers.
CONJ_NUMS = in_caseless('-и,')

# A single digit written as a token ("1" .. "9"), converted with int().
NUMERAL = or_(
    *(eq(str(digit)) for digit in __literals.values())
).means(interp.custom(int))

# Lists of digits, e.g. "вестибюль 1 и 2" ("vestibule 1 and 2").
# The result is the sorted list of distinct values.
LIST_OF_NUMERALS = (
    connect(NUMERAL.means(Nums.values), CONJ_NUMS)
    .means(Nums)
    .means(meaning.custom(lambda nums: sorted(set(nums.values))))
)

# Lists of number words, e.g. "первый и второй вестибюли"
# ("first and second vestibules").
# NOTE(review): that example uses ordinals while __literals holds cardinal
# words -- confirm the normalization actually maps one onto the other.
LIST_OF_LITERALS = (
    connect(LITERAL.means(Nums.values), CONJ_NUMS)
    .means(Nums)
    .means(meaning.custom(lambda nums: sorted(set(nums.values))))
)
# Part-of-speech predicates: preposition and conjunction.
PREP = gram('PREP')
CONJ = gram('CONJ')

# The construct below was written for the "критикалов" and "береговой"
# cases (translated from the original author's note).
SIMPLE_WILDCARD = rule(
    NOUN.repeatable().optional(),
    ADJF.optional(),
    INT.optional(),
)

TITLE = is_title()

# A number with an optional dash and one of the listed endings
# (e.g. "1-я", "2й").
ANUM = rule(
    INT,
    DASH.optional(),
    in_caseless({
        'я',
        'й',
        'е',
        'ое',
        'ая',
        'ий',
        'ой',
    }),
)

# # # # # # # # # Address # # # # # # # # # # #

Address = fact(
    'Address',
    [
        'City',
        'Street',
        'House',
        'Building',
        'Appartment',
    ],
)

# # # # # # # # # Cities # # # # # # # # # # # #
# First take ready-made lists of city names, then add a couple of my own.