POSELOK_WORDS,
POSELOK_NAME
).interpretation(
    Settlement
)


##############
#
#   ADDRESS PERSON
#
############


# A one-character token in title case; in MAYBE_FIO below it is always
# followed by a literal '.', i.e. it plays the role of a name initial.
ABBR = and_(
    length_eq(1),
    is_title()
)

# A token accepted by TITLE that is also morphologically tagged as a
# first name ('Name') or a surname ('Surn').
PART = and_(
    TITLE,
    or_(
        gram('Name'),
        gram('Surn')
    )
)

# Candidate person-name shapes: a TITLE token next to a name PART (either
# order), or an initial with a dot followed by a TITLE token.
MAYBE_FIO = or_(
    rule(TITLE, PART),
    rule(PART, TITLE),
    rule(ABBR, '.', TITLE),
)

# Year marker: the abbreviation 'г' with an optional trailing dot, or any
# token whose normal form is 'год' (Russian for "year").
YEAR_WORD = or_(
    rule('г', eq('.').optional()),
    rule(normalized('год'))
)

# Number in the inclusive range 1000..2100, stored in Date.year as an int.
YEAR = and_(
    gte(1000),
    lte(2100)
).interpretation(
    Date.year.custom(int)
)

# Two-character number in 0..99; stored in Date.year as 1900 + value,
# so every short year is placed in the 20th century.
YEAR_SHORT = and_(
    length_eq(2),
    gte(0),
    lte(99)
).interpretation(
    Date.year.custom(lambda _: 1900 + int(_))
)

# Number in the inclusive range 1..100000, stored in Date.year as an int.
# NOTE(review): presumably used together with ERA_WORD for era-qualified
# years -- the rule that combines them is not visible here.
ERA_YEAR = and_(
    gte(1),
    lte(100000)
).interpretation(
    Date.year.custom(int)
)

ERA_WORD = rule(
    eq('до'),
# Either multiplier word; the matched text is stored in Money.multiplier.
MULTIPLIER = or_(MILLION, THOUSAND).interpretation(Money.multiplier)


#######
#
#   AMOUNT
#
########


def normalize_integer(value):
    """Remove separator characters from a matched integer string.

    A single whitespace, dot or comma is dropped when it is followed by a
    non-space character, by the end of the string, or by one final
    whitespace character; the character that followed it is kept.

    Examples: '5,000' -> '5000', '1 000 000' -> '1000000'.

    :param value: text of the matched amount.
    :return: the cleaned text (a ``str``; no conversion to ``int`` is done).
    """
    # An earlier variant removed every run of separators instead:
    #   re.sub(r'[\s\.,]+', '', value)
    # Raw strings keep the pattern byte-identical while avoiding the
    # "invalid escape sequence" warning for \s, \., \S and \g.
    return re.sub(r'[\s\.,](\S|$|\s$)', r'\g<1>', value)


# Three-character INT token: one thousands group.
PART = and_(INT, length_eq(3))
# A single ',' or '.' used as a thousands separator.
SEP = in_(',.')
# Integer written as one token, as space-separated thousands groups, or as
# groups joined by SEP (up to three groups in every form).
INTEGER = or_(
    rule(INT),
    rule(INT, PART),
    rule(INT, PART, PART),
    rule(INT, SEP, PART),
    rule(INT, SEP, PART, SEP, PART),
)

# Known limitation. For the sample text
#   "*Вилка*: 150к-250к (примерно 50-100, 120-180, 180-250
#    Junior/Middle/Senior) gross + премия 20% годового дохода по KPI"
# (a salary range with per-grade sub-ranges) the grammar gives
# 150 -250, 50 - 100120, 180 - 180250: ',' is in SEP so that '5,000' can be
# matched, and yargy ignores spaces, so "100, 120" is read as one number.
# Dropping ',' from SEP would lose samples such as '5,000'.
}

# Token found in MONTHS; its normalized form is looked up in MONTHS
# (via MONTHS.__getitem__) and the result is stored in Date.month.
MONTH_NAME = dictionary(MONTHS).interpretation(Date.month.normalized().custom(
    MONTHS.__getitem__))

# Number in 1..12, stored in Date.month as an int.
MONTH = and_(gte(1), lte(12)).interpretation(Date.month.custom(int))

# Number in 1..31, stored in Date.day as an int.
DAY = and_(gte(1), lte(31)).interpretation(Date.day.custom(int))

# Any number >= 1.
# NOTE(review): the value is written to Date.day, same as DAY -- confirm
# that this is the intended target field.
COUNT = and_(gte(1), ).interpretation(Date.day.custom(int))

# Year marker: 'г' with an optional dot, or any form of the word 'год'
# (Russian for "year").
YEAR_WORD = or_(rule('г', eq('.').optional()), rule(normalized('год')))

# Number in 1000..2100, stored in Date.year as an int.
YEAR = and_(gte(1000), lte(2100)).interpretation(Date.year.custom(int))

# Two-character number in 0..99; stored in Date.year as 2000 + value.
YEAR_SHORT = and_(length_eq(2), gte(0), lte(99)).interpretation(
    Date.year.custom(lambda _: 2000 + int(_)))

# Number in 1..100000, stored in Date.year as an int.
ERA_YEAR = and_(gte(1), lte(100000)).interpretation(Date.year.custom(int))

# The word 'до' followed by either 'н.э' (final dot optional) or the
# normalized words 'наша' + 'эра'; a match sets Date.current_era to False.
ERA_WORD = rule(
    eq('до'),
    or_(rule('н', eq('.'), 'э', eq('.').optional()),
        rule(normalized('наша'),
             normalized('эра')))).interpretation(Date.current_era.const(False))

DATE = or_(
    rule(DAY, '.', MONTH, '.', or_(YEAR, YEAR_SHORT), YEAR_WORD.optional()),
    rule(DAY, '.', MONTH),
    rule(YEAR, YEAR_WORD),
    rule(DAY, MONTH_NAME),
    rule(MONTH_NAME, YEAR, YEAR_WORD.optional()),
# One or more PART tokens enclosed by BOUND on both sides.
# NOTE: this uses the PART that is in scope *before* the AMOUNT section
# below rebinds the name PART.
NUMERAL = rule(BOUND, PART.repeatable(), BOUND)


#######
#
#   AMOUNT
#
########


def normalize_integer(value):
    """Convert a matched integer string to ``int``.

    Every run of whitespace, dots and commas is removed before conversion,
    e.g. '1 000 000' -> 1000000 and '5,000' -> 5000.

    :param value: text of the matched amount.
    :return: the amount as an ``int``.
    :raises ValueError: if nothing numeric is left after stripping.
    """
    # Raw string: same pattern as before, without the invalid "\s" escape
    # that a plain string literal triggers a warning for.
    return int(re.sub(r'[\s.,]+', '', value))


# Three-character INT token: one thousands group.
PART = and_(INT, length_eq(3))
# A single ',' or '.' used as a thousands separator.
SEP = in_(',.')
# Integer written as one token, as space-separated thousands groups, or as
# groups joined by SEP; the match is stored in Money.integer as an int.
INTEGER = or_(
    rule(INT),
    rule(INT, PART),
    rule(INT, PART, PART),
    rule(INT, SEP, PART),
    rule(INT, SEP, PART, SEP, PART),
).interpretation(Money.integer.custom(normalize_integer))

# One- or two-character INT token, stored in Money.fraction as an int.
# NOTE(review): a one-digit fraction such as '5' is stored as 5, not 50 --
# confirm this is intended.
FRACTION = and_(INT, or_(length_eq(1),
                         length_eq(2))).interpretation(Money.fraction.custom(int))
# AMOUNT # ######## def normalize_integer(value): integer = re.sub('[\s.,]+', '', value) return int(integer) def normalize_fraction(value): fraction = value.ljust(2, '0') return int(fraction) PART = and_(INT, length_eq(3)) SEP = in_(',.') INTEGER = or_( rule(INT), rule(INT, PART), rule(INT, PART, PART), rule(INT, SEP, PART), rule(INT, SEP, PART, SEP, PART), ).interpretation(Money.integer.custom(normalize_integer)) FRACTION = and_(INT, or_(length_eq(1), length_eq(2))).interpretation( Money.fraction.custom(normalize_fraction)) AMOUNT = rule(
######## def normalize_integer(value): integer = re.sub('[\s.,]+', '', value) return int(integer) def normalize_fraction(value): fraction = value.ljust(2, '0') return int(fraction) PART = and_( INT, length_eq(3) ) SEP = in_(',.') INTEGER = or_( rule(INT), rule(INT, PART), rule(INT, PART, PART), rule(INT, SEP, PART), rule(INT, SEP, PART, SEP, PART), ).interpretation( Money.integer.custom(normalize_integer) ) FRACTION = and_(
POSELOK_WORDS,
POSELOK_NAME
).interpretation(
    Settlement
)


##############
#
#   ADDR PERSON
#
############


# A one-character token in title case; in MAYBE_FIO below it is always
# followed by a literal '.', i.e. it plays the role of a name initial.
ABBR = and_(
    length_eq(1),
    is_title()
)

# A token accepted by TITLE that is also morphologically tagged as a
# first name ('Name') or a surname ('Surn').
PART = and_(
    TITLE,
    or_(
        gram('Name'),
        gram('Surn')
    )
)

# Candidate person-name shapes: a TITLE token next to a name PART (either
# order), or an initial with a dot followed by a TITLE token.
MAYBE_FIO = or_(
    rule(TITLE, PART),
    rule(PART, TITLE),
    rule(ABBR, '.', TITLE),
rule(normalized('посёлок')),
rule(caseless('р'), DOT.optional(), caseless('п'), DOT.optional()),
rule(normalized('рабочий'),
     normalized('посёлок'))).interpretation(
    Settlement.type.const('посёлок'))

# The settlement's name part; the match is stored in Settlement.name.
POSELOK_NAME = SETTLEMENT_NAME.interpretation(Settlement.name)

# Type words followed by the name, interpreted as one Settlement fact.
POSELOK = rule(POSELOK_WORDS, POSELOK_NAME).interpretation(Settlement)


##############
#
#   ADDRESS PERSON
#
############


# A one-character token in title case; in MAYBE_FIO below it is always
# followed by a literal '.', i.e. it plays the role of a name initial.
ABBR = and_(length_eq(1), is_title())

# A token accepted by TITLE that is also morphologically tagged as a
# first name ('Name') or a surname ('Surn').
PART = and_(TITLE, or_(gram('Name'), gram('Surn')))

# Candidate person-name shapes:
#   TITLE PART | PART TITLE        -- two words, one of them a known name
#   A. TITLE   | A. B. TITLE       -- one or two initials, then a word
#   TITLE A. B.                    -- a word, then two initials
MAYBE_FIO = or_(rule(TITLE, PART), rule(PART, TITLE), rule(ABBR, '.', TITLE),
                rule(ABBR, '.', ABBR, '.', TITLE),
                rule(TITLE, ABBR, '.', ABBR, '.'))

POSITION_WORDS_ = or_(
    rule(
        dictionary({
            'мичман',
            'геолог',
            'подводник',
            'краевед',
            'снайпер',
from yargy.predicates import gram, is_capitalized, dictionary, is_upper, length_eq
from docx import Document
from docx.shared import Inches
import list_header as lh

# Re-export the header list from the helper module under a short name.
list_header = lh.list_header


class CompetitionResult:
    """Plain record tying an FGOS value to a competition and a result."""

    def __init__(self, FGOS, competition, result):
        # Store the three constructor arguments unchanged.
        self.FGOS, self.competition, self.result = FGOS, competition, result


# An upper-case token that is two or three characters long.
FGOS = rule(
    and_(
        is_upper(),
        or_(length_eq(2), length_eq(3)),
    )
)

# Single-word rules built from small keyword dictionaries.
IsCodeFGOS = rule(dictionary({'код', 'ФГОС'}))
IsCompetitions = rule(dictionary({'компетенция', 'содержание'}))
IsResults = rule(dictionary({'результат'}))
TableIsCompetitionsAndResults = rule(
    dictionary({'результат', 'ФГОС', 'компетенция'})
)

# One parser per rule above.
parser_Table = Parser(TableIsCompetitionsAndResults)
parser_FGOS = Parser(IsCodeFGOS)
parser_Result = Parser(IsResults)
parser_Competition = Parser(IsCompetitions)
parser_code_FGOS = Parser(FGOS)

# from the text
# COMPONENTS # ########### IS_FIRST = dictionary(FIRST_DICT) MAYBE_FIRST = or_( and_( gram('Name'), not_(gram('Abbr')) # А. Леонидов ), dictionary(MAYBE_FIRST_DICT)) TITLE_FIRST = and_(or_(IS_FIRST, MAYBE_FIRST), is_capitalized()) TITLE_FIRST_ABBR = and_(length_eq(1), is_capitalized()) TITLE_MIDDLE = and_( gram('Patr'), not_(gram('Abbr')), # Фил О’Рейли -> "О" is Patr is_capitalized()) TITLE_MIDDLE_ABBR = and_(length_eq(1), is_capitalized()) IS_LAST = dictionary(LAST_DICT) MAYBE_LAST = or_(gram('Surn'), dictionary(MAYBE_LAST_DICT)) TITLE_LAST = and_(or_(IS_LAST, MAYBE_LAST), is_capitalized()) #########
# COMPONENTS # ########### IS_FIRST = dictionary(FIRST_DICT) MAYBE_FIRST = or_( and_( gram('Name'), not_(gram('Abbr')) # А. Леонидов ), dictionary(MAYBE_FIRST_DICT)) TITLE_FIRST = and_(or_(IS_FIRST, MAYBE_FIRST), is_title()) TITLE_FIRST_ABBR = and_(length_eq(1), is_title()) TITLE_MIDDLE = and_( gram('Patr'), not_(gram('Abbr')), # Фил О’Рейли -> "О" is Patr is_title()) TITLE_MIDDLE_ABBR = and_(length_eq(1), is_title()) IS_LAST = dictionary(LAST_DICT) MAYBE_LAST = or_(gram('Surn'), dictionary(MAYBE_LAST_DICT)) TITLE_LAST = and_(or_(IS_LAST, MAYBE_LAST), is_title()) #########
from yargy import Parser, rule, and_, or_ from yargy.predicates import gram, is_capitalized, dictionary, is_upper, length_eq from docx import Document from docx.shared import Inches class CompetitionResult(): def __init__(self, FGOS, competition, result): self.FGOS = FGOS self.competition = competition self.result = result FGOS = rule( and_( is_upper(), or_( length_eq(2), length_eq(3) ) ) ) IsCodeFGOS = rule( dictionary( {'код', 'ФГОС'}) ) IsCompetitions = rule(dictionary( { 'компетенция' } )) IsResults = rule(dictionary(
'Name',
[attribute('first', ''),
 attribute('middle', ''),
 attribute('last', '')])

# Literal dot, used after initials.
DOT = eq('.')

# Surname: capitalized token of type 'RU' constrained by length_grt(1)
# (presumably "longer than one character" -- helper not visible here);
# stored in Name.last after str.capitalize.
LAST = and_(type('RU'), is_capitalized(),
            length_grt(1)).interpretation(Name.last.custom(str.capitalize))

# First name written in full: tagged 'Name'; stored capitalized.
FIRST = and_(gram('Name'),
             length_grt(1)).interpretation(Name.first.custom(str.capitalize))

# Patronymic written in full: tagged 'Patr'; stored capitalized.
MIDDLE = and_(gram('Patr'),
              length_grt(1)).interpretation(Name.middle.custom(str.capitalize))

# Initial: one capitalized character.
ABBR = and_(length_eq(1), is_capitalized())

# The same initial predicate routed to Name.first / Name.middle,
# upper-cased on the way.
FIRST_ABBR = ABBR.interpretation(Name.first.custom(str.upper))
MIDDLE_ABBR = ABBR.interpretation(Name.middle.custom(str.upper))

# Fixed phrases matched verbatim (case-insensitively) as a whole Name,
# bypassing the token rules below.
unsubstantial = {'Бен Режеб Т.Б.К.'}
UNSUBSTANIAL = caseless_pipeline(unsubstantial).interpretation(Name)

# Accepted layouts of surname and initials.
NAME = or_(
    rule(UNSUBSTANIAL),
    rule(LAST, DOT, FIRST_ABBR, DOT, MIDDLE_ABBR, DOT),
    rule(LAST, FIRST_ABBR, DOT, MIDDLE_ABBR, DOT),
    rule(FIRST_ABBR, DOT, MIDDLE_ABBR, DOT, LAST),
    rule(LAST, FIRST_ABBR, DOT),
# Еще немного пытался, но потом решил просто делать то, что работает. Street = fact( 'Street', ['name', 'street_type'] ) # Сначала длеаю обрабочтик для названий улиц с именами # Сразу снизу слово для "подзаборынх академиков" WITH_NAME = or_( and_(ADJF, not_(APRO)), and_(NOUN, GEN) ) # Одна буковка, совсем одна SINGLE = length_eq(1) # Мои попытки сделать универсальный парсер, а именно "двойные стандарты". # То, что с заглавной буквы может быть названием улицы без ключевого слова. # То, что без регистра должно иметь хоть какой-то ключ. # Позже увидел тесты и забил на все это. PART_UPPER = and_( TITLE, or_( gram('Name'), gram('Surn') ) ) PART_NOCASE = or_( gram('Name'),