def sections(self, segment='all'):
    """Scan the docx tables for a section-header column and delegate to
    ``self.lectures`` on the table/column where one is found.

    :param segment: which header to look for — '<all>', '<themes>',
        '<lectures>', '<practices>' or '<srs>'.
    """
    # One parser per section kind; the rules come from the instance.
    themes = Parser(self.section_rule)
    lectures = Parser(self.lectures_rule)
    practices = Parser(self.pract_rule)
    srs = Parser(self.srs_rule)
    found = False
    for table in self.docxdoc.tables:
        for column in table.columns:
            for cell in column.cells:
                index = 0  # theme-match count within this cell
                if segment == 'all' or segment == 'themes':
                    cell_search_themes = themes.findall(cell.text)
                    for each in cell_search_themes:
                        index += 1
                        # more than two theme matches marks the section column
                        if index > 2:
                            return self.lectures(table, column)
                        if segment != 'all':
                            found = True
                            print("this is theme")
                            break
                if segment == 'all' or segment == 'lectures':
                    cell_search_lectures = lectures.findall(cell.text)
                    for each in cell_search_lectures:
                        # first lectures match wins
                        return self.lectures(table, column)
                        # NOTE(review): everything below the return above is
                        # unreachable dead code; kept byte-identical.
                        if segment != 'all':
                            found = True
                            print("ЛЕКЦИИ")
                            break
                if segment == 'all' or segment == 'practices':
                    cell_search_practices = practices.findall(cell.text)
                    for each in cell_search_practices:
                        return self.lectures(table, column)
                        # NOTE(review): unreachable dead code, as above.
                        if segment != 'all':
                            found = True
                            print("практика")
                            break
                if segment == 'all' or segment == 'srs':
                    cell_search_srs = srs.findall(cell.text)
                    for each in cell_search_srs:
                        return self.lectures(table, column)
                        # NOTE(review): unreachable; the message looks
                        # copy-pasted from the practices branch.
                        if segment != 'all':
                            found = True
                            print("практика")
                            break
                if found:
                    break
            if found:
                break
def test_pipeline_key():
    """A morph pipeline normalizes its match both as a fact attribute and bare."""
    from yargy import or_
    from yargy.pipelines import morph_pipeline

    pipeline = morph_pipeline(['закрытое общество', 'завод'])

    # Pipeline value routed into a fact attribute.
    F = fact('F', ['a'])
    grammar = pipeline.interpretation(F.a.normalized()).interpretation(F)
    record = Parser(grammar).match('закрытом обществе').fact
    assert record == F(a='закрытое общество')

    # Pipeline value used directly, no fact wrapper.
    grammar = pipeline.interpretation(normalized())
    value = Parser(grammar).match('заводе').fact
    assert value == 'завод'
def __init__(self, logger=None, env='local'):
    """Create the extractor: configure logging and build the OGRN parsers.

    :param logger: optional pre-configured logger; when None, a rotating
        file logger named "OGRNExtractor" is created.
    :param env: environment label, stored as-is.
    """
    self.env = env
    if logger is not None:
        self.logger = logger
    else:
        self.logger = logging.getLogger("OGRNExtractor")
        self.logger.setLevel(logging.DEBUG)
        handler = RotatingFileHandler(
            "ogrn_extractor.log", mode='a', encoding='utf-8',
            backupCount=5, maxBytes=1 * 1024 * 1024)
        handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(handler)

    self.tokenizer = MorphTokenizer()

    # Grammar: an OGRN keyword followed by an integer token.
    OGRN = morph_pipeline([
        'огрн',
        'основной государственный регистрационный номер',
        'огрнип'
    ])
    # presumably the yargy token-type predicate imported as ``type`` —
    # TODO confirm against the file's imports
    INT = type('INT')
    self.full_ogrn_parser = Parser(rule(OGRN, INT))
    self.ogrn_num_parser = Parser(rule(INT))
def test_type_errors():
    """Conflicting custom() result types raise TypeError on fact access."""
    F = fact('F', ['a'])

    # int custom inside, fact attribute outside.
    attr_rule = rule(
        'a',
        eq('1').interpretation(custom(int))
    ).interpretation(F.a)
    match = Parser(attr_rule).match('a 1')
    with pytest.raises(TypeError):
        match.fact

    F = fact('F', ['a'])
    # int custom inside, str custom outside.
    top_rule = rule(
        'a',
        eq('1').interpretation(custom(int))
    ).interpretation(custom(str))
    match = Parser(top_rule).match('a 1')
    with pytest.raises(TypeError):
        match.fact
def predict(self, input):
    """Extract address components from a raw address string.

    :param input: raw address text to parse.
    :return: an ``Address`` with city, street, building and appartment
        filled in for whichever filters matched (first match wins).
    """
    # NOTE(review): the parameter name shadows the builtin ``input``;
    # kept unchanged for backward compatibility with existing callers.
    address = Address()

    # City: (title, prefix) from the first match, if any.
    matches = list(Parser(CityFilter).findall(input))
    if matches:  # idiom fix: was `if (len(matches)):`
        found = matches[0].fact
        address.city = (found.title, found.prefix)

    # Street: (title, prefix).
    matches = list(Parser(StreetFilter).findall(input))
    if matches:
        found = matches[0].fact
        address.street = (found.title, found.prefix)

    # Building: (house, corpus, structure).
    matches = list(Parser(BuildingFilter).findall(input))
    if matches:
        found = matches[0].fact
        address.building = (found.house, found.corpus, found.structure)

    # Appartment number.
    matches = list(Parser(AppartmentFilter).findall(input))
    if matches:
        found = matches[0].fact
        address.appartment = found.appartment

    return address
def test_pipeline():
    """Exercise plain, repeatable, caseless and morphological pipelines."""
    parser = Parser(rule(pipeline(['a b c', 'b c']), 'd'))
    assert parser.match('b c d')
    assert parser.match('a b c d')

    # repeatable(): the phrase may occur several times in a row
    parser = Parser(rule(pipeline(['a b']).repeatable(), 'c'))
    assert parser.match('a b a b c')

    # caseless matching
    parser = Parser(rule(caseless_pipeline(['A B']), 'c'))
    assert parser.match('A b c')

    # morphological pipeline: longest inflected phrase wins
    parser = Parser(morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ]))

    matches = list(parser.findall('текстом песни музыкальной группы'))
    assert len(matches) == 1
    assert [token.value for token in matches[0].tokens] == ['текстом', 'песни']

    matches = list(parser.findall('информационного материала под названием'))
    assert len(matches) == 1
    assert [token.value for token in matches[0].tokens] == ['информационного', 'материала']

    # punctuation inside a pipeline phrase is tokenized separately
    parser = Parser(morph_pipeline(['1 B.']))
    assert parser.match('1 b .')
def find(tokens):
    """Two-pass hobby extraction.

    First pass narrows the token stream to HOBBY_ITEMS spans; the second
    pass parses HOBBIES over only those tokens.
    """
    item_parser = Parser(HOBBY_ITEMS, tokenizer=ID_TOKENIZER)
    item_spans = [match.span for match in item_parser.findall(tokens)]
    narrowed = list(select_span_tokens(tokens, item_spans))

    hobby_parser = Parser(HOBBIES, tokenizer=ID_TOKENIZER)
    return list(hobby_parser.findall(narrowed))
def find(self, tokens):
    """Two-pass workplace extraction.

    First pass narrows the token stream to WORKPLACE_ELEM spans; the
    second pass parses WORKPLACE over only those tokens.
    """
    elem_parser = Parser(self.WORKPLACE_ELEM, tokenizer=ID_TOKENIZER)
    elem_spans = [match.span for match in elem_parser.findall(tokens)]
    narrowed = list(select_span_tokens(tokens, elem_spans))

    workplace_parser = Parser(self.WORKPLACE, tokenizer=ID_TOKENIZER)
    return list(workplace_parser.findall(narrowed))
def __init__(self, logger=None, env='local'):
    """Create the ads extractor: logging, text tools and the ADS grammar.

    :param logger: optional pre-configured logger; when None, a rotating
        file logger named "AdsExtractor" is created.
    :param env: environment label, stored as-is.
    """
    self.env = env
    if logger is not None:
        self.logger = logger
    else:
        self.logger = logging.getLogger("AdsExtractor")
        self.logger.setLevel(logging.DEBUG)
        handler = RotatingFileHandler(
            "ads_extractor.log", mode='a', encoding='utf-8',
            backupCount=5, maxBytes=1 * 1024 * 1024)
        handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        self.logger.addHandler(handler)

    self.texttools = texttools.TextTools(self.logger)
    self.tokenizer = MorphTokenizer()
    self.morph = pymorphy2.MorphAnalyzer()

    # Phrases whose presence should suppress an ads detection.
    EXCLUDE = morph_pipeline([
        'без',
        'не',
        'вправе отказаться',
        'может отказаться',
        'услуга'
    ])
    AGREEMENT = morph_pipeline(['соглашаться с получением'])
    SUBJECT = morph_pipeline(['рассылка', 'предложение'])
    KIND = morph_pipeline(['рекламный'])
    SPECIALS = morph_pipeline(['рекламныя цель'])

    # KIND+SUBJECT in either order, or a special/agreement phrase.
    ADS = or_(
        rule(KIND, SUBJECT),
        rule(SUBJECT, KIND),
        or_(SPECIALS, AGREEMENT)
    )
    self.ads_parser = Parser(ADS)
    self.exclude_parser = Parser(rule(EXCLUDE))
def find_feature(feature, RULE, RULE2, space=(40, 40)):
    """Set ``dict_symp[feature]`` when RULE2 matches near a RULE match.

    Searches the module-level ``text`` for RULE; around the end of the
    last match it cuts a character window and searches it for RULE2.
    ``dict_symp[feature]`` becomes 1 on a confirming match, 0 otherwise;
    when RULE never matches, the key is deliberately left unset
    (original behavior).

    :param feature: key written into the module-level ``dict_symp``.
    :param RULE: yargy rule for the primary match.
    :param RULE2: yargy rule for the confirming match inside the window.
    :param space: (chars_before, chars_after) window around the match end.
        Default changed from the mutable ``[40, 40]`` to an equivalent
        tuple to avoid the shared-mutable-default pitfall.
    """
    # NOTE(review): relies on module-level globals ``text`` and
    # ``dict_symp`` — confirm they are defined before this is called.
    primary = list(Parser(RULE).findall(text))
    if not primary:
        return

    # Window around the end of the *last* primary match.
    end = primary[-1].span[1]
    window = text[end - space[0]:end + space[1]]

    confirming = list(Parser(RULE2).findall(window))
    dict_symp[feature] = 1 if confirming else 0
def __init__(self, names=None, version_numbers=None, version_names=None,
             consoles=None):
    """Build a yargy parser for one game.

    The grammar matches a game name followed by optional version number,
    version name and console parts.

    :param names: non-empty list of name variants; the first entry is
        used as the canonical (const) name.
    :param version_numbers: optional list of version-number phrases.
    :param version_names: optional list of version-name phrases.
    :param consoles: optional list of console phrases.
    """
    # Bug fix: the former defaults were shared mutable [] literals
    # (classic mutable-default-argument pitfall); None sentinels are
    # normalized to fresh empty lists here instead.
    names = [] if names is None else names
    version_numbers = [] if version_numbers is None else version_numbers
    version_names = [] if version_names is None else version_names
    consoles = [] if consoles is None else consoles

    full_rule = rule(
        morph_pipeline(names).interpretation(self.__game.name.const(names[0])),
        morph_pipeline(version_numbers).interpretation(self.__game.version_number).optional(),
        morph_pipeline(version_names).interpretation(self.__game.version_name).optional(),
        morph_pipeline(consoles).interpretation(self.__game.console).optional())
    game = or_(full_rule).interpretation(self.__game)
    self.parser = Parser(game)
def test_inflected_custom():
    """inflected() output can be piped through custom()."""
    MONTHS = {'январь': 1}
    month_rule = rule('январе').interpretation(
        inflected({'nomn', 'sing'}).custom(MONTHS.get))
    assert Parser(month_rule).match('январе').fact == 1
def test_person():
    """Position + first/last name with gender-number-case agreement."""
    Name = fact('Name', ['first', 'last'])
    Person = fact('Person', ['position', 'name'])

    # Name words that are not abbreviations.
    LAST = and_(gram('Surn'), not_(gram('Abbr')))
    FIRST = and_(gram('Name'), not_(gram('Abbr')))
    POSITION = morph_pipeline(['управляющий директор', 'вице-мэр'])

    gnc = gnc_relation()
    NAME = rule(
        FIRST.interpretation(Name.first).match(gnc),
        LAST.interpretation(Name.last).match(gnc)
    ).interpretation(Name)
    PERSON = rule(
        POSITION.interpretation(Person.position).match(gnc),
        NAME.interpretation(Person.name)
    ).interpretation(Person)

    match = Parser(PERSON).match('управляющий директор Иван Ульянов')
    assert match
    assert match.fact == Person(
        position='управляющий директор',
        name=Name(first='Иван', last='Ульянов'))
def show_matches(rule, *lines):
    """Render each line with the spans matched by *rule* highlighted."""
    parser = Parser(rule)
    for text in lines:
        spans = [match.span for match in parser.findall(text)]
        show_markup(text, spans)
def test_name():
    """Inflected first/last names must agree via gnc_relation."""
    Name = fact('Name', ['first', 'last'])
    gnc = gnc_relation()

    FIRST = gram('Name').interpretation(Name.first.inflected()).match(gnc)
    LAST = gram('Surn').interpretation(Name.last.inflected()).match(gnc)
    parser = Parser(rule(FIRST, LAST).interpretation(Name))

    assert parser.match('саше иванову').fact == Name(first='саша', last='иванов')
    assert parser.match('сашу иванову').fact == Name(first='саша', last='иванова')
    # disagreeing case endings must not match
    assert not parser.match('сашу ивановой')
def get_all_collocation(lines, word):
    """Find all collocations of *word* with the word that follows it.

    :param lines: list of str — lines to scan.
    :param word: str — the word to search for (matched by normal form).
    :raises TypeError: if *lines* is not a list, *word* is not a str,
        or any element of *lines* is not a str.
    :return: list of normalized two-word collocations.
    """
    if not isinstance(lines, list) or not isinstance(word, str):
        raise TypeError
    # *word* followed by a token that is neither punctuation nor "other".
    gr = rule(normalized(word),
              and_(not_(yargy_type('PUNCT')), not_(yargy_type('OTHER'))))
    # Perf fix: the original rebuilt Parser(gr) for every line; one
    # parser serves all lines.
    parser = Parser(gr)
    result_list = []
    for line in lines:
        if not isinstance(line, str):
            raise TypeError
        for match in parser.findall(line):
            result_list.append(' '.join(
                Normalizer.normalise(token.value) for token in match.tokens))
    return result_list
def test_samples(rules: Union[NamedRule, List[NamedRule]], texts: List[str], num: int = 20, seed: int = None, markup=None, fact=False):
    """Run each rule over a (possibly sampled) set of texts and render matches.

    Takes at most *num* texts (random sample when there are more), runs
    every rule over each, then shows the labelled spans per text; when
    ``fact`` is true, also displays every extracted fact.
    """
    from random import seed as sed, sample

    sed(seed)
    if len(texts) < num:
        num = len(texts)
    else:
        texts = sample(texts, num)

    if not isinstance(rules, (list, tuple)):
        rules = [rules]

    # text index -> rule index -> list of matches
    results: Dict[int, Dict[int, List]] = defaultdict(dict)
    for r_idx, one_rule in enumerate(rules):
        one_parser = Parser(one_rule)
        for t_idx in range(num):
            results[t_idx][r_idx] = list(one_parser.findall(texts[t_idx]))

    for t_idx, per_rule in results.items():
        labelled_spans = [
            (m.span[0], m.span[1], str(rules[r_idx].name))
            for r_idx, found in per_rule.items()
            for m in found
        ]
        show_markup(texts[t_idx], labelled_spans, markup or BoxLabelMarkup)
        if fact:
            for found in per_rule.values():
                for m in found:
                    display(m.fact)
def test(rule, *lines, tree=False, facts=False):
    """Match *rule* against each line and render expected vs. actual spans.

    Each element of *lines* is either a plain string (no expected
    matches) or a tuple ``(text, substr, ...)`` whose substrings mark
    the expected spans. Spans render blue when expected and found, red
    when found but unexpected, grey when expected but missed.

    :param tree: display each match's parse tree.
    :param facts: display each match's extracted fact.
    """
    is_at = lambda span, set: any((span == s) for s in set)
    parser = Parser(rule)
    for line in lines:
        if isinstance(line, str):
            text, expected = line, []
        else:
            text = line[0]
            expected = [find(text, substr) for substr in line[1:]]
        matches = list(sorted(parser.findall(text), key=lambda _: _.span))
        matched_spans = [_.span for _ in matches]
        # blue = expected & found, red = found only, grey = expected only
        spans = [(s[0], s[1], '#aec7e8' if is_at(s, expected) else '#ff9896')
                 for s in matched_spans] \
            + [(s[0], s[1], '#ccc')
               for s in expected if not is_at((s[0], s[1]), matched_spans)]
        show_markup(text, [s for s in spans if s[0] < s[1]], LineMarkup)
        if matches:
            for _ in matches:
                if tree:
                    # Bug fix: previously displayed matches[0].tree on
                    # every iteration instead of each match's own tree.
                    display(_.tree.as_dot)
                if facts:
                    display(_.fact)
def test_inflected_custom_attribute():
    """Attribute interpretation chains inflected() then custom()."""
    F = fact('F', ['a'])
    MONTHS = {'январь': 1}
    month_rule = rule('январе').interpretation(
        F.a.inflected({'nomn', 'sing'}).custom(MONTHS.get)).interpretation(F)
    assert Parser(month_rule).match('январе').fact == F(a=1)
def test_normalized_custom_attribute():
    """Attribute interpretation chains normalized() then custom()."""
    F = fact('F', ['a'])
    MONTHS = {'январь': 1}
    month_rule = rule('январе').interpretation(
        F.a.normalized().custom(MONTHS.get)).interpretation(F)
    assert Parser(month_rule).match('январе').fact == F(a=1)
def test_inflected():
    """inflected() re-inflects the match to the requested grammemes."""
    parser = Parser(
        rule('московским').interpretation(inflected({'nomn', 'femn'})))
    assert parser.match('московским').fact == 'московская'
def test_const():
    """const() replaces the match value with a fixed constant."""
    parser = Parser(rule('a').interpretation(const(1)))
    assert parser.match('a').fact == 1
def test_rule_attribute():
    """A whole-rule attribute keeps raw text, its span and the json form."""
    F = fact('F', ['a'])
    parser = Parser(rule('a', 'A').interpretation(F.a).interpretation(F))
    record = parser.match('a A').fact
    assert record == F(a='a A')
    assert record.spans == [(0, 5)]
    assert record.as_json == {'a': 'a A'}
def test_attribute_normalized():
    """normalized() on an attribute stores the dictionary form."""
    F = fact('F', 'a')
    parser = Parser(
        rule('январе').interpretation(F.a.normalized()).interpretation(F))
    record = parser.match('январе').fact
    assert record == F(a='январь')
    assert record.spans == [(0, 6)]
    assert record.as_json == {'a': 'январь'}
def test_normalized():
    """normalized() yields the dictionary form of the matched token.

    Bug fix: the function was named ``text_normalized`` (transposed
    prefix), so pytest never collected or ran it.
    """
    RULE = rule(
        'московским'
    ).interpretation(
        normalized()
    )
    parser = Parser(RULE)
    match = parser.match('московским')
    assert match.fact == 'московский'
def test_attribute_custom():
    """custom(int) converts the attribute value to an int."""
    F = fact('F', 'a')
    parser = Parser(rule('1').interpretation(F.a.custom(int)).interpretation(F))
    record = parser.match('1').fact
    assert record == F(a=1)
    assert record.spans == [(0, 1)]
    assert record.as_json == {'a': 1}
def test_tagger():
    """A repeatable tag rule driven by a custom tagger finds tagged runs."""
    text = 'a b c d e f g'
    parser = Parser(tag('I').repeatable(), tagger=MyTagger())
    found_spans = [match.span for match in parser.findall(text)]
    substrings = [text[start:stop] for start, stop in found_spans]
    assert substrings == ['b c', 'e f']
def test_attribute_custom_custom():
    """Two chained custom() steps apply left to right."""
    F = fact('F', 'a')
    MAPPING = {'a': 1}
    parser = Parser(rule('A').interpretation(
        F.a.custom(str.lower).custom(MAPPING.get)).interpretation(F))
    assert parser.match('A').fact == F(a=1)
def test_rule_custom():
    """custom(float) applies to the whole multi-token match text."""
    parser = Parser(rule('3', '.', '14').interpretation(custom(float)))
    assert parser.match('3.14').fact == 3.14
def test_constant_attribute():
    # Money fact: count, base multiplier (const 10**3 for 'тысяча'), currency.
    MONEY_RULE = rule(
        gram('INT').interpretation(Money.count),
        dictionary({'тысяча'}).interpretation(Money.base.const(10**3)),
        dictionary({'рубль', 'доллар'}).interpretation(Money.currency),
    ).interpretation(Money)
    parser = Parser(MONEY_RULE)
    # NOTE(review): every other test in this file treats Parser.match as
    # returning a single match; wrapping it in list() here looks like it
    # was meant to be parser.findall — confirm against the yargy API.
    matches = list(parser.match('1 тысяча рублей'))
    assert matches[0].fact == Money(count=1, base=1000, currency='рублей')