def test_pipeline():
    """Exercise pipeline, caseless_pipeline and morph_pipeline matching."""
    # Plain pipeline: either multiword key may precede the literal 'd'.
    grammar = rule(pipeline(['a b c', 'b c']), 'd')
    matcher = Parser(grammar)
    assert matcher.match('b c d')
    assert matcher.match('a b c d')

    # A repeatable pipeline consumes its key several times in a row.
    grammar = rule(pipeline(['a b']).repeatable(), 'c')
    matcher = Parser(grammar)
    assert matcher.match('a b a b c')

    # Caseless pipeline ignores the letter case of the key.
    grammar = rule(caseless_pipeline(['A B']), 'c')
    matcher = Parser(grammar)
    assert matcher.match('A b c')

    # Morph pipeline matches inflected forms of its keys.
    grammar = morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ])
    matcher = Parser(grammar)

    found = list(matcher.findall('текстом песни музыкальной группы'))
    assert len(found) == 1
    hit = found[0]
    assert [token.value for token in hit.tokens] == ['текстом', 'песни']

    found = list(matcher.findall('информационного материала под названием'))
    assert len(found) == 1
    hit = found[0]
    assert [token.value for token in hit.tokens] == ['информационного', 'материала']

    # A key containing punctuation still matches when the punctuation
    # is tokenized separately.
    grammar = morph_pipeline(['1 B.'])
    matcher = Parser(grammar)
    assert matcher.match('1 b .')
def test_activate():
    """Check that rule activation produces the expected BNF for both
    pipeline-based and predicate-based rules."""
    from yargy.pipelines import pipeline
    from yargy.predicates import gram
    from yargy.tokenizer import MorphTokenizer

    tokenizer = MorphTokenizer()

    # A pipeline rule activates into a single 'pipeline' production.
    source = pipeline(['a']).named('A')
    activated = source.activate(tokenizer)
    assert_bnf(activated, 'A -> pipeline')

    # A predicate rule keeps its predicate in the production body.
    source = rule(gram('NOUN')).named('A')
    activated = source.activate(tokenizer)
    assert_bnf(activated, "A -> gram('NOUN')")
def update_rules(name):
    """Build the socdem grammar for the given name keys.

    Returns a pair ``(SOCDEM_ELEMS, SOCDEM)``: a rule matching any single
    socdem component, and the full socdem rule interpreted into ``Socdem``.
    """
    # Match any of the supplied name keys; capture as Socdem.name.
    NAME = pipeline(name).interpretation(Socdem.name)

    # Any one socdem component on its own.
    SOCDEM_ELEMS = rule(or_(NAME, GENDER, date.DATE, AGE, LOCATION))

    # Birth date, captured into Socdem.date_of_birth.
    birth_date = date.DATE.interpretation(Socdem.date_of_birth)
    # Age and birth date may appear in either order; each is optional.
    age_and_birth = or_(
        rule(AGE.optional(), birth_date.optional()),
        rule(birth_date.optional(), AGE.optional()),
    )

    SOCDEM = rule(
        NAME,
        GENDER.optional(),
        age_and_birth,
        LOCATION.optional(),
    ).interpretation(Socdem)

    return SOCDEM_ELEMS, SOCDEM
from .helpers import TOKENIZER, ID_TOKENIZER, load_named_entities
from .education import EducationExtractor
from .workplace import WorkplaceExtractor
from .hobby import HobbyExtractor
from yargy.parser import Parser
from yargy.pipelines import pipeline, caseless_pipeline

# Pipelines matching the section headings of a resume-like text.
# pipeline() is case-sensitive; caseless_pipeline() ignores letter case.
EXP_TITLE = pipeline(['Опыт работы'])
EDU_TITLE = pipeline(['Образование'])
EXTRA_EDU_TITLE = caseless_pipeline(['Курсы', 'Сертификаты'])
HOBBY_TITLE = caseless_pipeline(['Хобби', 'Увлечения'])


def parse(text):
    """Locate section headings in *text* and prepare per-section token spans.

    NOTE(review): the body of this function is truncated in this view;
    documentation below covers only the visible portion.
    """
    named_entities = load_named_entities(text)
    # All four token views start as the full token list; the section-specific
    # ones are presumably narrowed later (beyond this view).
    exp_tokens = edu_tokens = hobby_tokens = tokens = list(TOKENIZER(text))
    extra_edu_tokens = []
    # ID_TOKENIZER lets Parser.find() run over the already-built token list.
    parser = Parser(EXP_TITLE, tokenizer=ID_TOKENIZER)
    exp_title = parser.find(tokens)
    parser = Parser(EDU_TITLE, tokenizer=ID_TOKENIZER)
    edu_title = parser.find(tokens)
    parser = Parser(HOBBY_TITLE, tokenizer=ID_TOKENIZER)
    hobby_title = parser.find(tokens)
    if exp_title:
        # ... (function body continues beyond this view)
from yargy import (rule, or_, Parser)
from yargy.predicates import (eq, gram)
from yargy.pipelines import (caseless_pipeline, pipeline)
from yargy.interpretation import (fact, attribute)
from .helpers import ID_TOKENIZER, select_span_tokens, show_matches

# A hobby record: one or more name tokens accumulated into 'name'.
Hobby = fact('Hobby', [attribute('name').repeatable()])

# Punctuation rules: hyphen covers the ASCII hyphen, em dash and en dash.
HYPHEN = rule(pipeline(['-', '—', '–']))
COLON = rule(eq(':'))
COMMA = rule(eq(','))
DOT = rule(eq('.'))

# Section heading: a title word followed by ':' or a dash.
TITLES = caseless_pipeline(['Хобби', 'Увлечения'])
TITLE = rule(TITLES, or_(COLON, HYPHEN))

# A hobby item: up to three nouns/adjectives, captured as Hobby.name.
ITEM = rule(or_(gram('NOUN'), gram('ADJF')).repeatable(max=3)).interpretation(Hobby.name)

# Any element that may occur inside the hobby section.
HOBBY_ITEMS = rule(or_(TITLE, ITEM, COMMA, DOT))

# Full section: title, then items each terminated by ',' or '.'.
HOBBIES = rule(
    TITLE,
    rule(ITEM, or_(COMMA, DOT)).repeatable(),
).interpretation(Hobby)


class HobbyExtractor:
    # ... (class body continues beyond this view)
from yargy.pipelines import (caseless_pipeline, pipeline)
from yargy.interpretation import (fact)
from .helpers import load_lines, load_named_entities, select_span_tokens, ID_TOKENIZER

# A work-history record. NOTE(review): 'os', 'rule' and 'eq' are used below
# but imported outside this view — presumably earlier in the file.
Workplace = fact('Workplace', ['period', 'org_name', 'occupation'])

"""
Dicts
"""
# Occupation dictionary is loaded from a plain-text file shipped next to
# this module.
FOLDER = os.path.dirname(__file__)
DICTS_FOLDER = os.path.join(FOLDER, 'dicts')
OCCUPATIONS = load_lines(os.path.join(DICTS_FOLDER, 'occupations.txt'))

"""
"""
# Hyphen covers the ASCII hyphen, em dash and en dash.
HYPHEN = rule(pipeline(['-', '—', '–']))
COMMA = eq(',')

# Russian month names (nominative) to month numbers.
# NOTE(review): the literal is truncated in this view; the closing brace
# lies beyond it.
MONTHS = {
    'январь': 1,
    'февраль': 2,
    'март': 3,
    'апрель': 4,
    'май': 5,
    'июнь': 6,
    'июль': 7,
    'август': 8,
    'сентябрь': 9,
    'октябрь': 10,
    'ноябрь': 11,
    'декабрь': 12