def test_pipeline_key():
    """Pipeline matches normalize back to the dictionary key form."""
    from yargy.pipelines import morph_pipeline

    pipeline = morph_pipeline(['закрытое общество', 'завод'])
    F = fact('F', ['a'])

    RULE = pipeline.interpretation(F.a.normalized()).interpretation(F)
    match = Parser(RULE).match('закрытом обществе')
    assert match.fact == F(a='закрытое общество')

    RULE = pipeline.interpretation(normalized())
    match = Parser(RULE).match('заводе')
    assert match.fact == 'завод'
def show_matches(rule, *lines):
    """Run *rule* over every line and render the match spans as markup."""
    parser = Parser(rule)
    for text in lines:
        show_markup(text, [match.span for match in parser.findall(text)])
def test_type_errors():
    """Nesting custom() output inside another interpretation raises TypeError."""
    F = fact('F', ['a'])
    RULE = rule(
        'a',
        eq('1').interpretation(custom(int)),
    ).interpretation(F.a)
    match = Parser(RULE).match('a 1')
    with pytest.raises(TypeError):
        match.fact

    F = fact('F', ['a'])
    RULE = rule(
        'a',
        eq('1').interpretation(custom(int)),
    ).interpretation(custom(str))
    match = Parser(RULE).match('a 1')
    with pytest.raises(TypeError):
        match.fact
def __init__(self, logger=None, env='local'):
    """Set up logging, a morph tokenizer and the OGRN parsers.

    A rotating-file DEBUG logger is created unless an external logger
    is supplied by the caller.
    """
    self.env = env
    if logger is not None:
        self.logger = logger
    else:
        self.logger = logging.getLogger("OGRNExtractor")
        self.logger.setLevel(logging.DEBUG)
        handler = RotatingFileHandler(
            "ogrn_extractor.log",
            mode='a',
            encoding='utf-8',
            backupCount=5,
            maxBytes=1 * 1024 * 1024,  # rotate at 1 MiB
        )
        handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(handler)

    self.tokenizer = MorphTokenizer()
    # An OGRN mention is a keyword followed by an integer token.
    OGRN = morph_pipeline([
        'огрн',
        'основной государственный регистрационный номер',
        'огрнип'
    ])
    INT = type('INT')
    self.full_ogrn_parser = Parser(rule(OGRN, INT))
    self.ogrn_num_parser = Parser(rule(INT))
def test_inflected_custom():
    """inflected() chained with custom() maps the normal form through a dict."""
    MONTHS = {'январь': 1}
    RULE = rule('январе').interpretation(
        inflected({'nomn', 'sing'}).custom(MONTHS.get)
    )
    assert Parser(RULE).match('январе').fact == 1
def __init__(self, names=None, version_numbers=None, version_names=None,
             consoles=None):
    """Build a parser recognising a game name plus optional version/console.

    Defect fixed: all four parameters used shared mutable default lists
    (``names: list = []`` etc.), which are created once at definition time
    and aliased across every call; replaced with ``None`` sentinels.

    :param names: surface forms of the game name; the first entry is used
        as the canonical name constant.
    :param version_numbers: optional version-number phrases.
    :param version_names: optional version-name phrases.
    :param consoles: optional console phrases.
    """
    names = [] if names is None else names
    version_numbers = [] if version_numbers is None else version_numbers
    version_names = [] if version_names is None else version_names
    consoles = [] if consoles is None else consoles

    rules = rule(
        morph_pipeline(names).interpretation(self.__game.name.const(names[0])),
        morph_pipeline(version_numbers).interpretation(self.__game.version_number).optional(),
        morph_pipeline(version_names).interpretation(self.__game.version_name).optional(),
        morph_pipeline(consoles).interpretation(self.__game.console).optional(),
    )
    game = or_(rules).interpretation(self.__game)
    self.parser = Parser(game)
def test_samples(rules: Union[NamedRule, List[NamedRule]],
                 texts: List[str],
                 num: int = 20,
                 seed=None,
                 markup=None,
                 fact=False):
    """Sample up to *num* texts, run each rule over them, display the markup.

    Fixes: ``seed`` was annotated ``int`` while defaulting to ``None``
    (annotation removed); the two chained ``isinstance`` checks are
    collapsed into one tuple form.

    :param rules: a single named rule or a list/tuple of them.
    :param texts: candidate texts; a random sample of *num* is taken.
    :param seed: optional random seed for reproducible sampling.
    :param markup: markup renderer; defaults to BoxLabelMarkup.
    :param fact: when True, also display each match's fact.
    """
    from random import seed as set_seed, sample

    set_seed(seed)
    # Sample only when there are enough texts; otherwise shrink num.
    if len(texts) < num:
        num = len(texts)
    else:
        texts = sample(texts, num)

    if not isinstance(rules, (list, tuple)):
        rules = [rules]

    results: Dict[int, Dict[int, List]] = defaultdict(dict)
    for rule_idx, rule in enumerate(rules):
        parser = Parser(rule)
        for text_idx in range(num):
            results[text_idx][rule_idx] = list(parser.findall(texts[text_idx]))

    for text_idx, rule_matches in results.items():
        spans = [
            (m.span[0], m.span[1], str(rules[rule_idx].name))
            for rule_idx, matches in rule_matches.items()
            for m in matches
        ]
        show_markup(texts[text_idx], spans, markup or BoxLabelMarkup)
        if fact:
            for matches in rule_matches.values():
                for m in matches:
                    display(m.fact)
def test(rule, *lines, tree=False, facts=False):
    """Run *rule* over each line and render matched vs. expected spans.

    Each line is either a plain string (no expected matches) or a tuple
    ``(text, substr, ...)`` whose substrings mark the expected spans.

    Fix: the span-membership lambda took a parameter named ``set``,
    shadowing the builtin; replaced with a named helper.
    """
    def is_at(span, spans):
        # True when `span` equals any span in `spans`.
        return any(span == s for s in spans)

    parser = Parser(rule)
    for line in lines:
        if isinstance(line, str):
            text, expected = line, []
        else:
            text = line[0]
            expected = [find(text, substr) for substr in line[1:]]
        matches = list(sorted(parser.findall(text), key=lambda _: _.span))
        matched_spans = [_.span for _ in matches]
        # Blue = expected hit, red = unexpected hit, grey = expected miss.
        spans = [
            (s[0], s[1], '#aec7e8' if is_at(s, expected) else '#ff9896')
            for s in matched_spans
        ] + [
            (s[0], s[1], '#ccc')
            for s in expected
            if not is_at((s[0], s[1]), matched_spans)
        ]
        show_markup(text, [s for s in spans if s[0] < s[1]], LineMarkup)
        if matches:
            for _ in matches:
                if tree:
                    # NOTE(review): this always displays the FIRST match's
                    # tree on every iteration; `_.tree` looks intended —
                    # behavior kept as-is, confirm with the author.
                    display(matches[0].tree.as_dot)
                if facts:
                    display(_.fact)
def predict(self, input):
    """Fill an Address with city, street, building and appartment parts
    parsed from *input*; missing parts are left unset."""
    address = Address()

    # City
    found = list(Parser(CityFilter).findall(input))
    if found:
        parsed = found[0].fact
        address.city = (parsed.title, parsed.prefix)

    # Street
    found = list(Parser(StreetFilter).findall(input))
    if found:
        parsed = found[0].fact
        address.street = (parsed.title, parsed.prefix)

    # Building
    found = list(Parser(BuildingFilter).findall(input))
    if found:
        parsed = found[0].fact
        address.building = (parsed.house, parsed.corpus, parsed.structure)

    # Appartment
    found = list(Parser(AppartmentFilter).findall(input))
    if found:
        address.appartment = found[0].fact.appartment

    return address
def test_name():
    """First/last name agree through gnc_relation and are inflected."""
    Name = fact('Name', ['first', 'last'])
    gnc = gnc_relation()
    FIRST = gram('Name').interpretation(Name.first.inflected()).match(gnc)
    LAST = gram('Surn').interpretation(Name.last.inflected()).match(gnc)
    parser = Parser(rule(FIRST, LAST).interpretation(Name))

    assert parser.match('саше иванову').fact == Name(first='саша', last='иванов')
    assert parser.match('сашу иванову').fact == Name(first='саша', last='иванова')
    # Disagreeing gender/case must not match at all.
    assert not parser.match('сашу ивановой')
def test_person():
    """Position + full name parse into a nested Person fact."""
    Name = fact('Name', ['first', 'last'])
    Person = fact('Person', ['position', 'name'])

    LAST = and_(gram('Surn'), not_(gram('Abbr')))
    FIRST = and_(gram('Name'), not_(gram('Abbr')))
    POSITION = morph_pipeline(['управляющий директор', 'вице-мэр'])

    gnc = gnc_relation()
    NAME = rule(
        FIRST.interpretation(Name.first).match(gnc),
        LAST.interpretation(Name.last).match(gnc),
    ).interpretation(Name)
    PERSON = rule(
        POSITION.interpretation(Person.position).match(gnc),
        NAME.interpretation(Person.name),
    ).interpretation(Person)

    match = Parser(PERSON).match('управляющий директор Иван Ульянов')
    assert match
    assert match.fact == Person(
        position='управляющий директор',
        name=Name(first='Иван', last='Ульянов'),
    )
def test_inflected_custom_attribute():
    """F.a.inflected().custom() stores the mapped value on the fact."""
    F = fact('F', ['a'])
    MONTHS = {'январь': 1}
    RULE = rule('январе').interpretation(
        F.a.inflected({'nomn', 'sing'}).custom(MONTHS.get)
    ).interpretation(F)
    assert Parser(RULE).match('январе').fact == F(a=1)
def test_normalized_custom_attribute():
    """F.a.normalized().custom() lemmatizes then maps the value."""
    F = fact('F', ['a'])
    MONTHS = {'январь': 1}
    RULE = rule('январе').interpretation(
        F.a.normalized().custom(MONTHS.get)
    ).interpretation(F)
    assert Parser(RULE).match('январе').fact == F(a=1)
def __init__(self, rule, morph):
    """Initialize the parser with a morph-aware tokenizer.

    `morph` wraps a pymorphy subclass that adds check_gram/normalized
    methods and caches its parse calls.
    """
    analyzer = MorphAnalyzer(morph)
    YargyParser.__init__(self, rule, tokenizer=MorphTokenizer(morph=analyzer))
class OGRNExtractor:
    """Extract OGRN / OGRNIP registration numbers from free text.

    A full match is an OGRN keyword followed by an integer token; the
    integer part is then re-extracted and deduplicated.
    """

    def __init__(self, logger=None, env='local'):
        """Set up logging (a rotating-file DEBUG logger unless one is
        supplied), a tokenizer and the two parsers."""
        self.env = env
        if logger is None:
            self.logger = logging.getLogger("OGRNExtractor")
            self.logger.setLevel(logging.DEBUG)
            handler = RotatingFileHandler(
                "ogrn_extractor.log",
                mode='a',
                encoding='utf-8',
                backupCount=5,
                maxBytes=1 * 1024 * 1024,  # rotate at 1 MiB
            )
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)
        else:
            self.logger = logger
        self.tokenizer = MorphTokenizer()
        OGRN = morph_pipeline([
            'огрн',
            'основной государственный регистрационный номер',
            'огрнип'
        ])
        INT = type('INT')
        self.full_ogrn_parser = Parser(rule(OGRN, INT))
        self.ogrn_num_parser = Parser(rule(INT))

    def preprocess(self, line):
        """Flatten newlines and decode HTML-escaped double quotes.

        Defect fixed: the second replace's literal was garbled by an
        HTML-entity round-trip (it appeared as a bare run of quotes, which
        is not valid Python); reconstructed as the ``&quot;`` -> '"'
        replacement. NOTE(review): confirm against the source repository.
        """
        return line.replace("\n", " ").replace("&quot;", "\"")

    def extract(self, line):
        """Return the unique OGRN number strings found in *line*."""
        line = self.preprocess(line)
        result = []
        for span in (m.span for m in self.full_ogrn_parser.findall(line)):
            fragment = line[span.start:span.stop]
            # Pull just the integer out of each keyword+number fragment.
            for int_span in (m.span for m in self.ogrn_num_parser.findall(fragment)):
                result.append(fragment[int_span.start:int_span.stop])
        # Deduplicate; order is unspecified (matches the original code).
        return list(set(result))

    def show_tokens(self, line):
        """Tokenize *line* after the same preprocessing as extract()."""
        return list(self.tokenizer(self.preprocess(line)))
def test_const():
    """const() yields the fixed value regardless of the matched text."""
    RULE = rule('a').interpretation(const(1))
    assert Parser(RULE).match('a').fact == 1
def test_inflected():
    """inflected() re-inflects the match into the requested grammemes."""
    RULE = rule('московским').interpretation(inflected({'nomn', 'femn'}))
    assert Parser(RULE).match('московским').fact == 'московская'
def test_normalized():
    """normalized() reduces the match to its dictionary (normal) form.

    Defect fixed: the function was named ``text_normalized``, so pytest
    never collected or ran it; renamed with the ``test_`` prefix.
    """
    RULE = rule('московским').interpretation(normalized())
    assert Parser(RULE).match('московским').fact == 'московский'
def test_rule_custom():
    """custom() applies a callable to the whole matched span."""
    RULE = rule('3', '.', '14').interpretation(custom(float))
    assert Parser(RULE).match('3.14').fact == 3.14
def test_attribute_custom_custom():
    """Chained custom() calls on an attribute compose left to right."""
    F = fact('F', 'a')
    MAPPING = {'a': 1}
    RULE = rule('A').interpretation(
        F.a.custom(str.lower).custom(MAPPING.get)
    ).interpretation(F)
    assert Parser(RULE).match('A').fact == F(a=1)
def test_attribute_custom():
    """custom() on an attribute converts the value before fact assembly."""
    F = fact('F', 'a')
    RULE = rule('1').interpretation(F.a.custom(int)).interpretation(F)
    record = Parser(RULE).match('1').fact
    assert record == F(a=1)
    assert record.spans == [(0, 1)]
    assert record.as_json == {'a': 1}
def test_tagger():
    """A custom tagger restricts which tokens the tag() rule can cover."""
    text = 'a b c d e f g'
    parser = Parser(tag('I').repeatable(), tagger=MyTagger())
    spans = (match.span for match in parser.findall(text))
    assert [text[start:stop] for start, stop in spans] == ['b c', 'e f']
def test_attribute_normalized():
    """normalized() on an attribute stores the lemma in the fact."""
    F = fact('F', 'a')
    RULE = rule('январе').interpretation(F.a.normalized()).interpretation(F)
    record = Parser(RULE).match('январе').fact
    assert record == F(a='январь')
    assert record.spans == [(0, 6)]
    assert record.as_json == {'a': 'январь'}
def test_rule_attribute():
    """Interpreting a whole rule as an attribute keeps the raw span text."""
    F = fact('F', ['a'])
    RULE = rule('a', 'A').interpretation(F.a).interpretation(F)
    record = Parser(RULE).match('a A').fact
    assert record == F(a='a A')
    assert record.spans == [(0, 5)]
    assert record.as_json == {'a': 'a A'}
def find_factors(factor_types):
    """Scan the module-level `text` for each factor group.

    Appends each match's span to the module-level `factors_span` list and
    the 1-based group index to the module-level `factors` list when the
    group produced any match.
    """
    for index, factor_group in enumerate(factor_types, start=1):
        hits = []
        parser = Parser(morph_pipeline(factor_group))
        for match in parser.findall(text):
            hits.append(' '.join(token.value for token in match.tokens))
            factors_span.append(match.span)
        if hits:
            factors.append(index)
def test_constant_attribute():
    """const() pins an attribute to a fixed value inside a money rule."""
    MONEY_RULE = rule(
        gram('INT').interpretation(Money.count),
        dictionary({'тысяча'}).interpretation(Money.base.const(10**3)),
        dictionary({'рубль', 'доллар'}).interpretation(Money.currency),
    ).interpretation(Money)
    matches = list(Parser(MONEY_RULE).match('1 тысяча рублей'))
    assert matches[0].fact == Money(count=1, base=1000, currency='рублей')
def test_rule_custom_custom():
    """Chained custom() calls on a rule compose left to right."""
    MAPPING = {'a': 1}
    RULE = rule('A').interpretation(custom(str.lower).custom(MAPPING.get))
    assert Parser(RULE).match('A').fact == 1
def test_attribute_const():
    """const() on an attribute yields the constant value directly."""
    F = fact('F', 'a')
    RULE = rule('январь').interpretation(F.a.const(1))
    assert Parser(RULE).match('январь').fact == 1
def test_attribute():
    """An attribute interpretation alone yields the matched text."""
    F = fact('F', 'a')
    RULE = rule('a').interpretation(F.a)
    assert Parser(RULE).match('a').fact == 'a'
def test_insted_attributes():
    """An outer attribute interpretation overrides the inner capture."""
    F = fact('F', ['a', 'b'])
    RULE = rule(
        eq('a').interpretation(F.a)
    ).interpretation(F.b).interpretation(F)
    record = Parser(RULE).match('a').fact
    assert record == F(a=None, b='a')
    assert record.spans == [(0, 1)]
    assert record.as_json == {'b': 'a'}
def test_attribute_inflected():
    """inflected() on an attribute stores the re-inflected form."""
    F = fact('F', 'a')
    RULE = rule('январе').interpretation(
        F.a.inflected({'nomn', 'plur'})
    ).interpretation(F)
    record = Parser(RULE).match('январе').fact
    assert record == F(a='январи')
    assert record.spans == [(0, 6)]
    assert record.as_json == {'a': 'январи'}
def test_repeatable():
    """A repeatable attribute accumulates every captured value."""
    F = fact('F', [attribute('a').repeatable()])
    RULE = rule(
        eq('a').interpretation(F.a),
        eq('b').interpretation(F.a),
    ).interpretation(F)
    record = Parser(RULE).match('a b').fact
    assert record == F(a=['a', 'b'])
    assert record.spans == [(0, 1), (2, 3)]
    assert record.as_json == {'a': ['a', 'b']}
def test_predicate_attribute():
    """A predicate interpreted as an attribute fills the fact field."""
    F = fact('F', ['a'])
    RULE = rule(eq('a').interpretation(F.a)).interpretation(F)
    record = Parser(RULE).match('a').fact
    assert record == F(a='a')
    assert record.spans == [(0, 1)]
    assert record.as_json == {'a': 'a'}
def test_inflected_custom():
    """custom() after inflected() receives the re-inflected value."""
    MONTHS = {'январь': 1}
    parser = Parser(rule('январе').interpretation(
        inflected({'nomn', 'sing'}).custom(MONTHS.get)))
    match = parser.match('январе')
    assert match.fact == 1
def test_tagger():
    """Only the token runs tagged 'I' by MyTagger are matched."""
    text = 'a b c d e f g'
    parser = Parser(tag('I').repeatable(), tagger=MyTagger())
    found = [match.span for match in parser.findall(text)]
    assert [text[a:b] for a, b in found] == ['b c', 'e f']
def test_normalized_custom():
    """custom() after normalized() receives the lemma."""
    MONTHS = {'январь': 1}
    RULE = rule('январе').interpretation(normalized().custom(MONTHS.get))
    assert Parser(RULE).match('январе').fact == 1
def test_rule_attribute_custom():
    """custom() applied after an attribute interpretation converts it."""
    F = fact('F', ['a'])
    RULE = rule('1').interpretation(F.a).interpretation(custom(int))
    assert Parser(RULE).match('1').fact == 1
def test_insted_attributes():
    """The outer F.b interpretation wins over the inner F.a capture."""
    F = fact('F', ['a', 'b'])
    inner = eq('a').interpretation(F.a)
    parser = Parser(rule(inner).interpretation(F.b).interpretation(F))
    record = parser.match('a').fact
    assert record == F(a=None, b='a')
    assert record.spans == [(0, 1)]
    assert record.as_json == {'b': 'a'}
def test_attribute_custom_custom():
    """Two chained custom() transforms on one attribute compose in order."""
    F = fact('F', 'a')
    MAPPING = {'a': 1}
    chain = F.a.custom(str.lower).custom(MAPPING.get)
    parser = Parser(rule('A').interpretation(chain).interpretation(F))
    record = parser.match('A').fact
    assert record == F(a=1)
def test_repeatable():
    """Values captured by a repeatable attribute are collected in order."""
    F = fact('F', [attribute('a').repeatable()])
    parser = Parser(rule(
        eq('a').interpretation(F.a),
        eq('b').interpretation(F.a),
    ).interpretation(F))
    record = parser.match('a b').fact
    assert record == F(a=['a', 'b'])
    assert record.spans == [(0, 1), (2, 3)]
    assert record.as_json == {'a': ['a', 'b']}
def test_attribute_inflected():
    """'январе' re-inflects to nominative plural 'январи' on the fact."""
    F = fact('F', 'a')
    parser = Parser(rule('январе').interpretation(
        F.a.inflected({'nomn', 'plur'})).interpretation(F))
    record = parser.match('январе').fact
    assert record == F(a='январи')
    assert record.spans == [(0, 6)]
    assert record.as_json == {'a': 'январи'}
def test_attribute_normalized():
    """'январе' is stored as its lemma 'январь' on the fact."""
    F = fact('F', 'a')
    parser = Parser(rule('январе').interpretation(
        F.a.normalized()).interpretation(F))
    record = parser.match('январе').fact
    assert record == F(a='январь')
    assert record.spans == [(0, 6)]
    assert record.as_json == {'a': 'январь'}
def test_attribute_custom():
    """The string '1' becomes int 1 via F.a.custom(int)."""
    F = fact('F', 'a')
    parser = Parser(rule('1').interpretation(
        F.a.custom(int)).interpretation(F))
    record = parser.match('1').fact
    assert record == F(a=1)
    assert record.spans == [(0, 1)]
    assert record.as_json == {'a': 1}
def test_normalized_custom_attribute():
    """normalized().custom() on an attribute lemmatizes then maps."""
    F = fact('F', ['a'])
    MONTHS = {'январь': 1}
    parser = Parser(rule('январе').interpretation(
        F.a.normalized().custom(MONTHS.get)).interpretation(F))
    assert parser.match('январе').fact == F(a=1)
def test_inflected_custom_attribute():
    """inflected().custom() on an attribute re-inflects then maps."""
    F = fact('F', ['a'])
    MONTHS = {'январь': 1}
    parser = Parser(rule('январе').interpretation(
        F.a.inflected({'nomn', 'sing'}).custom(MONTHS.get)).interpretation(F))
    assert parser.match('январе').fact == F(a=1)
def test_merge_facts():
    """Two partial facts produced by sub-rules merge into one record."""
    F = fact('F', ['a', 'b'])
    A = rule(eq('a').interpretation(F.a)).interpretation(F)
    B = rule(eq('b').interpretation(F.b)).interpretation(F)
    parser = Parser(rule(A, B).interpretation(F))
    record = parser.match('a b').fact
    assert record == F(a='a', b='b')
    assert record.spans == [(0, 1), (2, 3)]
    assert record.as_json == {'a': 'a', 'b': 'b'}
class Extractor(object):
    """Callable wrapper: normalize the text, run the parser, wrap matches."""

    def __init__(self, rule, tokenizer=TOKENIZER, tagger=None):
        # Build the parser once; reused for every call.
        self.parser = Parser(rule, tokenizer=tokenizer, tagger=tagger)

    def __call__(self, text):
        """Return a Matches wrapper over all matches in the normalized text."""
        normalized = normalize_text(text)
        return Matches(normalized, self.parser.findall(normalized))
def test_nested_facts():
    """A fact assigned to another fact's attribute nests in the result."""
    F = fact('F', ['a'])
    G = fact('G', ['b'])
    RULE = rule(
        eq('a').interpretation(F.a)
    ).interpretation(F).interpretation(G.b).interpretation(G)
    record = Parser(RULE).match('a').fact
    assert record == G(b=F(a='a'))
    assert record.spans == [(0, 1)]
    assert record.as_json == {'b': {'a': 'a'}}
def __init__(self, rule, tokenizer=TOKENIZER, tagger=None):
    # Build the underlying yargy Parser once with the given rule, the
    # module-level default tokenizer, and an optional tagger.
    self.parser = Parser(rule, tokenizer=tokenizer, tagger=tagger)
def test_pipeline():
    """pipeline / caseless_pipeline / morph_pipeline matching behaviour."""
    # Exact-form pipeline.
    parser = Parser(rule(pipeline(['a b c', 'b c']), 'd'))
    assert parser.match('b c d')
    assert parser.match('a b c d')

    # Repeatable pipeline entry.
    parser = Parser(rule(pipeline(['a b']).repeatable(), 'c'))
    assert parser.match('a b a b c')

    # Case-insensitive pipeline.
    parser = Parser(rule(caseless_pipeline(['A B']), 'c'))
    assert parser.match('A b c')

    # Morphological pipeline prefers the longest entry.
    parser = Parser(morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ]))
    matches = list(parser.findall('텍текстом песни музыкальной группы'.replace('텍', '')))
    assert len(matches) == 1
    assert [token.value for token in matches[0].tokens] == ['текстом', 'песни']

    matches = list(parser.findall('информационного материала под названием'))
    assert len(matches) == 1
    assert [token.value for token in matches[0].tokens] == \
        ['информационного', 'материала']

    assert Parser(morph_pipeline(['1 B.'])).match('1 b .')