Пример #1
0
    def sections(self, segment='all'):
        """Scan the document's tables for the column holding the requested
        section and delegate to ``self.lectures`` for it.

        :param segment: which section to look for; one of
            ``'all'``, ``'themes'``, ``'lectures'``, ``'practices'``, ``'srs'``.
        :return: result of ``self.lectures(table, column)`` for the first
            matching column, or ``None`` when nothing matches.

        NOTE(review): in the original code every statement after the
        ``return`` inside each branch (the ``found`` flag, the prints and the
        ``break``s) was unreachable dead code; it has been removed here
        without changing behavior — the method always returned on the first
        matching cell.
        """
        themes = Parser(self.section_rule)
        lectures = Parser(self.lectures_rule)
        practices = Parser(self.pract_rule)
        srs = Parser(self.srs_rule)

        for table in self.docxdoc.tables:
            for column in table.columns:
                for cell in column.cells:
                    if segment in ('all', 'themes'):
                        # A theme column is recognized only when a single
                        # cell yields more than two theme-rule matches.
                        if len(list(themes.findall(cell.text))) > 2:
                            return self.lectures(table, column)

                    if segment in ('all', 'lectures'):
                        # First lecture-rule match in any cell wins.
                        if next(iter(lectures.findall(cell.text)), None):
                            return self.lectures(table, column)

                    if segment in ('all', 'practices'):
                        if next(iter(practices.findall(cell.text)), None):
                            return self.lectures(table, column)

                    if segment in ('all', 'srs'):
                        if next(iter(srs.findall(cell.text)), None):
                            return self.lectures(table, column)
Пример #2
0
def test_pipeline_key():
    """A morph pipeline match can feed both a fact attribute and a plain value."""
    from yargy import or_
    from yargy.pipelines import morph_pipeline

    pipeline = morph_pipeline([
        'закрытое общество',
        'завод'
    ])

    F = fact('F', ['a'])

    # Pipeline match interpreted into a fact attribute, normalized.
    attr_rule = pipeline.interpretation(F.a.normalized()).interpretation(F)
    record = Parser(attr_rule).match('закрытом обществе').fact
    assert record == F(a='закрытое общество')

    # Pipeline match interpreted directly as a normalized string.
    plain_rule = pipeline.interpretation(normalized())
    value = Parser(plain_rule).match('заводе').fact
    assert value == 'завод'
    def __init__(self, logger=None, env='local'):
        """Set up logging, the tokenizer and the OGRN yargy parsers.

        :param logger: existing logger to reuse; when None a rotating
            file logger named "OGRNExtractor" is created.
        :param env: environment tag, stored as-is.
        """
        self.env = env

        if logger is not None:
            self.logger = logger
        else:
            log = logging.getLogger("OGRNExtractor")
            log.setLevel(logging.DEBUG)
            file_handler = RotatingFileHandler(
                "ogrn_extractor.log",
                mode='a',
                encoding='utf-8',
                backupCount=5,
                maxBytes=1 * 1024 * 1024,
            )
            file_handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
            log.addHandler(file_handler)
            self.logger = log

        self.tokenizer = MorphTokenizer()

        # "ОГРН" keyword (any morphological form) followed by an integer token.
        OGRN = morph_pipeline([
            'огрн', 'основной государственный регистрационный номер', 'огрнип'
        ])
        INT = type('INT')
        OGRN_NUMBER = rule(OGRN, INT)

        self.full_ogrn_parser = Parser(OGRN_NUMBER)
        self.ogrn_num_parser = Parser(rule(INT))
Пример #4
0
def test_type_errors():
    """Conflicting interpretation conversions raise TypeError on fact access."""
    # int() on the inner token, then the whole rule into a fact attribute.
    F = fact('F', ['a'])
    attr_conflict = rule(
        'a',
        eq('1').interpretation(custom(int)),
    ).interpretation(F.a)
    match = Parser(attr_conflict).match('a 1')
    with pytest.raises(TypeError):
        match.fact

    # int() on the inner token, then str() over the whole rule.
    F = fact('F', ['a'])
    rule_conflict = rule(
        'a',
        eq('1').interpretation(custom(int)),
    ).interpretation(custom(str))
    match = Parser(rule_conflict).match('a 1')
    with pytest.raises(TypeError):
        match.fact
Пример #5
0
    def predict(self, input):
        """Parse an address string into an ``Address`` with city, street,
        building and appartment fields filled from the first match of each
        corresponding yargy filter.

        :param input: raw address text.  NOTE(review): the name shadows the
            ``input`` builtin; kept unchanged for interface compatibility.
        :return: ``Address`` instance; fields stay at their defaults when
            the corresponding filter finds no match.
        """
        address = Address()

        def first_fact(grammar):
            # Fact of the first match of `grammar` in the text, or None.
            for match in Parser(grammar).findall(input):
                return match.fact
            return None

        city = first_fact(CityFilter)
        if city is not None:
            address.city = (city.title, city.prefix)

        street = first_fact(StreetFilter)
        if street is not None:
            address.street = (street.title, street.prefix)

        building = first_fact(BuildingFilter)
        if building is not None:
            address.building = (building.house, building.corpus, building.structure)

        appartment = first_fact(AppartmentFilter)
        if appartment is not None:
            address.appartment = appartment.appartment

        return address
Пример #6
0
def test_pipeline():
    """Exact, repeatable, caseless and morphological pipelines."""
    parser = Parser(rule(pipeline(['a b c', 'b c']), 'd'))
    assert parser.match('b c d')
    assert parser.match('a b c d')

    parser = Parser(rule(pipeline(['a b']).repeatable(), 'c'))
    assert parser.match('a b a b c')

    parser = Parser(rule(caseless_pipeline(['A B']), 'c'))
    assert parser.match('A b c')

    # Longest-phrase preference: "текст песни" beats plain "текст".
    parser = Parser(morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ]))

    matches = list(parser.findall('текстом песни музыкальной группы'))
    assert len(matches) == 1
    assert [token.value for token in matches[0].tokens] == ['текстом', 'песни']

    matches = list(parser.findall('информационного материала под названием'))
    assert len(matches) == 1
    assert [token.value for token in matches[0].tokens] == \
        ['информационного', 'материала']

    parser = Parser(morph_pipeline(['1 B.']))
    assert parser.match('1 b .')
Пример #7
0
    def find(tokens):
        """Two-pass hobby extraction over a pre-tokenized stream."""
        # Pass 1: narrow the stream to the spans matched by HOBBY_ITEMS.
        item_parser = Parser(HOBBY_ITEMS, tokenizer=ID_TOKENIZER)
        item_spans = [match.span for match in item_parser.findall(tokens)]
        narrowed = list(select_span_tokens(tokens, item_spans))
        # print([_.value for _ in narrowed])

        # Pass 2: run the full HOBBIES grammar over the narrowed stream.
        hobby_parser = Parser(HOBBIES, tokenizer=ID_TOKENIZER)
        return list(hobby_parser.findall(narrowed))
Пример #8
0
    def find(self, tokens):
        """Two-pass workplace extraction over a pre-tokenized stream."""
        # Pass 1: keep only the spans matched by the element-level grammar.
        elem_parser = Parser(self.WORKPLACE_ELEM, tokenizer=ID_TOKENIZER)
        elem_spans = [match.span for match in elem_parser.findall(tokens)]
        narrowed = list(select_span_tokens(tokens, elem_spans))
        # print([_.value for _ in narrowed])

        # Pass 2: run the full workplace grammar over the narrowed stream.
        full_parser = Parser(self.WORKPLACE, tokenizer=ID_TOKENIZER)
        return list(full_parser.findall(narrowed))
Пример #9
0
    def __init__(self, logger=None, env='local'):
        """Set up logging, text tools, tokenizers and the ads-detection parsers.

        :param logger: existing logger to reuse; when None a rotating
            file logger named "AdsExtractor" is created.
        :param env: environment tag, stored as-is.
        """
        self.env = env

        if logger is not None:
            self.logger = logger
        else:
            log = logging.getLogger("AdsExtractor")
            log.setLevel(logging.DEBUG)
            file_handler = RotatingFileHandler(
                "ads_extractor.log", mode='a', encoding='utf-8', backupCount=5,
                maxBytes=1 * 1024 * 1024)
            file_handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
            log.addHandler(file_handler)
            self.logger = log

        self.texttools = texttools.TextTools(self.logger)

        self.tokenizer = MorphTokenizer()
        self.morph = pymorphy2.MorphAnalyzer()

        # Phrases whose presence should suppress an "ads" hit.
        EXCLUDE = morph_pipeline([
            'без',
            'не',
            'вправе отказаться',
            'может отказаться',
            'услуга'
        ])

        AGREEMENT = morph_pipeline([
            'соглашаться с получением'
        ])

        SUBJECT = morph_pipeline([
            'рассылка',
            'предложение'
        ])

        KIND = morph_pipeline([
            'рекламный'
        ])

        # NOTE(review): 'рекламныя' looks like a typo for 'рекламная';
        # confirm the pipeline still matches the intended phrase before fixing.
        SPECIALS = morph_pipeline([
            'рекламныя цель'
        ])

        # Ads mention: kind+subject in either order, or a special phrase.
        ADS = or_(
            rule(KIND, SUBJECT),
            rule(SUBJECT, KIND),
            or_(SPECIALS, AGREEMENT)
        )

        self.ads_parser = Parser(ADS)
        self.exclude_parser = Parser(rule(EXCLUDE))
Пример #10
0
 def find_feature(feature, RULE, RULE2, space=(40, 40)):
     """Set ``dict_symp[feature]`` to 1 when RULE2 matches near the last
     occurrence of RULE in the module-level ``text``, 0 when it does not,
     and leave it untouched when RULE itself never matches.

     :param feature: key written into the module-level ``dict_symp``.
     :param RULE: primary yargy rule searched in ``text``.
     :param RULE2: secondary rule searched in a window around RULE's match.
     :param space: (chars before, chars after) the END of the last RULE
         match; default changed from the mutable ``[40, 40]`` list to an
         equivalent tuple (shared-mutable-default pitfall), indexing
         behavior is identical.
     """
     parser = Parser(RULE)
     primary = [(m.span, [t.value for t in m.tokens])
                for m in parser.findall(text)]
     if primary:
         # Original code relied on the leaked loop variable: only the LAST
         # match's span defines the search window.
         last_end = primary[-1][0][1]
         window = text[last_end - space[0]:last_end + space[1]]
         secondary = [(m.span, [t.value for t in m.tokens])
                      for m in Parser(RULE2).findall(window)]
         dict_symp[feature] = 1 if secondary else 0
Пример #11
0
 def __init__(self, names: list = None, version_numbers: list = None, version_names: list = None, consoles: list = None):
     """Build a yargy parser for one game: required name plus optional
     version number, version name and console pipelines.

     Defaults changed from mutable ``[]`` literals to ``None`` sentinels
     (shared-mutable-default pitfall); an empty/omitted ``names`` still
     fails at ``names[0]`` exactly as before.

     :param names: game name variants; ``names[0]`` is the canonical name.
     :param version_numbers: version-number phrase variants.
     :param version_names: version-name phrase variants.
     :param consoles: console phrase variants.
     """
     names = [] if names is None else names
     version_numbers = [] if version_numbers is None else version_numbers
     version_names = [] if version_names is None else version_names
     consoles = [] if consoles is None else consoles

     rules = rule(morph_pipeline(names).interpretation(self.__game.name.const(names[0])),
                  morph_pipeline(version_numbers).interpretation(self.__game.version_number).optional(),
                  morph_pipeline(version_names).interpretation(self.__game.version_name).optional(),
                  morph_pipeline(consoles).interpretation(self.__game.console).optional())
     game = or_(rules).interpretation(self.__game)
     self.parser = Parser(game)
Пример #12
0
def test_inflected_custom():
    """Inflect to nominative singular, then map the lemma to its number."""
    MONTHS = {'январь': 1}
    grammar = rule('январе').interpretation(
        inflected({'nomn', 'sing'}).custom(MONTHS.get)
    )
    assert Parser(grammar).match('январе').fact == 1
Пример #13
0
def test_person():
    """Position phrase plus gnc-agreeing first and last name into Person."""
    Name = fact('Name', ['first', 'last'])
    Person = fact('Person', ['position', 'name'])

    # Surname / first name tokens, abbreviations excluded.
    surname = and_(gram('Surn'), not_(gram('Abbr')))
    first_name = and_(gram('Name'), not_(gram('Abbr')))

    POSITION = morph_pipeline(['управляющий директор', 'вице-мэр'])

    gnc = gnc_relation()
    NAME = rule(
        first_name.interpretation(Name.first).match(gnc),
        surname.interpretation(Name.last).match(gnc),
    ).interpretation(Name)

    PERSON = rule(
        POSITION.interpretation(Person.position).match(gnc),
        NAME.interpretation(Person.name),
    ).interpretation(Person)

    match = Parser(PERSON).match('управляющий директор Иван Ульянов')
    assert match
    assert match.fact == Person(
        position='управляющий директор',
        name=Name(first='Иван', last='Ульянов'),
    )
Пример #14
0
def show_matches(rule, *lines):
    """Render every match of `rule` as highlighted spans in each line."""
    parser = Parser(rule)
    for text in lines:
        spans = [match.span for match in parser.findall(text)]
        show_markup(text, spans)
Пример #15
0
def test_name():
    """gnc agreement between first/last name; both stored inflected."""
    Name = fact('Name', ['first', 'last'])

    gnc = gnc_relation()

    first = gram('Name').interpretation(Name.first.inflected()).match(gnc)
    last = gram('Surn').interpretation(Name.last.inflected()).match(gnc)
    parser = Parser(rule(first, last).interpretation(Name))

    # Masculine dative: both parts lemmatized to masculine nominative.
    assert parser.match('саше иванову').fact == Name(first='саша', last='иванов')
    # Feminine accusative: surname keeps feminine form.
    assert parser.match('сашу иванову').fact == Name(first='саша', last='иванова')
    # Disagreeing case/gender must not match at all.
    assert not parser.match('сашу ивановой')
Пример #16
0
def get_all_collocation(lines, word):
    """
    Function for finding all collocations of word and any word after it.

    :param lines: list of string
        Lines for processing.

    :param word: str
        Word for searching.

    :raises TypeError:
        If ``lines`` is not a list, ``word`` is not a str, or any element
        of ``lines`` is not a str.

    :return:
        List of all valid collocations (each normalized and space-joined).
    """
    if not isinstance(lines, list) or not isinstance(word, str):
        raise TypeError('lines must be a list and word must be a str')

    # `word` (normalized) followed by any non-punctuation, non-OTHER token.
    gr = rule(normalized(word),
              and_(not_(yargy_type('PUNCT')), not_(yargy_type('OTHER'))))

    # Build the parser once instead of once per line (was rebuilt in the loop).
    parser = Parser(gr)

    result_list = []
    for line in lines:
        if not isinstance(line, str):
            raise TypeError('every element of lines must be a str')
        for match in parser.findall(line):
            result_list.append(' '.join(
                Normalizer.normalise(token.value) for token in match.tokens))

    return result_list
Пример #17
0
def test_samples(rules: Union[NamedRule, List[NamedRule]],
                 texts: List[str],
                 num: int = 20,
                 seed: int = None,
                 markup=None,
                 fact=False):
    """Run rule(s) over a random sample of texts and render labeled matches.

    :param rules: one rule or a list/tuple of rules to apply.
    :param texts: candidate texts; at most `num` are sampled.
    :param num: sample size (capped at len(texts)).
    :param seed: RNG seed for reproducible sampling.
    :param markup: markup class for show_markup; BoxLabelMarkup by default.
    :param fact: when True, also display each match's fact.
    """
    from random import seed as sed, sample

    sed(seed)
    if len(texts) < num:
        num = len(texts)
    else:
        texts = sample(texts, num)

    if not (isinstance(rules, list) or isinstance(rules, tuple)):
        rules = [rules]

    # results[text_idx][rule_idx] -> list of matches for that text/rule pair.
    results: Dict[int, Dict[int, List]] = defaultdict(dict)
    for rule_idx, one_rule in enumerate(rules):
        parser = Parser(one_rule)
        for text_idx in range(num):
            results[text_idx][rule_idx] = list(parser.findall(texts[text_idx]))

    for text_idx, rule_matches in results.items():
        spans = []
        for rule_idx, matches in rule_matches.items():
            label = str(rules[rule_idx].name)
            spans.extend((m.span[0], m.span[1], label) for m in matches)

        show_markup(texts[text_idx], spans, markup or BoxLabelMarkup)

        if fact:
            for matches in rule_matches.values():
                for m in matches:
                    display(m.fact)
Пример #18
0
def test(rule, *lines, tree=False, facts=False):
    """Run `rule` over sample lines and visualize matched vs expected spans.

    Each element of `lines` is either a plain string (no expectations) or a
    sequence whose first item is the text and the rest are substrings the
    rule is expected to match.  Matched-and-expected spans render blue,
    unexpected matches red, missed expectations grey.
    """
    # True when `span` coincides exactly with one of the spans in `set`.
    is_at = lambda span, set: any((span == s) for s in set)
    parser = Parser(rule)

    for line in lines:
        if isinstance(line, str):
            text, expected = line, []
        else:
            text = line[0]
            # `find` resolves each expected substring to its span in `text`.
            expected = [find(text, substr) for substr in line[1:]]

        matches = list(sorted(parser.findall(text), key=lambda _: _.span))
        # display(matches)
        matched_spans = [_.span for _ in matches]
        # Blue #aec7e8: matched and expected; red #ff9896: matched but not
        # expected; grey #ccc: expected but not matched.
        spans = [(s[0], s[1], '#aec7e8' if is_at(s, expected) else '#ff9896') for s in matched_spans] \
                + [(s[0], s[1], '#ccc') for s in expected if not is_at((s[0], s[1]), matched_spans)]

        # Zero-width spans are dropped before rendering.
        show_markup(text, [s for s in spans if s[0] < s[1]], LineMarkup)

        if matches:
            for _ in matches:
                if tree:
                    # NOTE(review): displays matches[0]'s tree on every
                    # iteration — possibly meant to be `_.tree`; confirm.
                    display(matches[0].tree.as_dot)
                if facts:
                    display(_.fact)
Пример #19
0
def test_inflected_custom_attribute():
    """Attribute chain: inflect -> dict lookup -> fact construction."""
    F = fact('F', ['a'])
    MONTHS = {'январь': 1}
    month_rule = rule('январе').interpretation(
        F.a.inflected({'nomn', 'sing'}).custom(MONTHS.get)
    ).interpretation(F)
    assert Parser(month_rule).match('январе').fact == F(a=1)
Пример #20
0
def test_normalized_custom_attribute():
    """Attribute chain: normalize -> dict lookup -> fact construction."""
    F = fact('F', ['a'])
    MONTHS = {'январь': 1}
    month_rule = rule('январе').interpretation(
        F.a.normalized().custom(MONTHS.get)
    ).interpretation(F)
    assert Parser(month_rule).match('январе').fact == F(a=1)
Пример #21
0
def test_inflected():
    """inflected() re-inflects the match into the requested grammemes."""
    grammar = rule('московским').interpretation(inflected({'nomn', 'femn'}))
    assert Parser(grammar).match('московским').fact == 'московская'
Пример #22
0
def test_const():
    """const() ignores the matched text and yields the fixed value."""
    grammar = rule('a').interpretation(const(1))
    assert Parser(grammar).match('a').fact == 1
Пример #23
0
def test_rule_attribute():
    """Whole-rule text becomes the attribute; spans and json follow suit."""
    F = fact('F', ['a'])
    parser = Parser(rule('a', 'A').interpretation(F.a).interpretation(F))
    record = parser.match('a   A').fact
    # Inner whitespace collapses to single spaces in the attribute value.
    assert record == F(a='a A')
    assert record.spans == [(0, 5)]
    assert record.as_json == {'a': 'a A'}
Пример #24
0
def test_attribute_normalized():
    """normalized() stores the lemma in the fact attribute."""
    F = fact('F', 'a')
    parser = Parser(
        rule('январе').interpretation(F.a.normalized()).interpretation(F))
    record = parser.match('январе').fact
    assert record == F(a='январь')
    assert record.spans == [(0, 6)]
    assert record.as_json == {'a': 'январь'}
Пример #25
0
def text_normalized():
    # NOTE(review): the name looks like a typo for `test_normalized` —
    # as written pytest will not collect this function; confirm and rename.
    """normalized() over a rule yields the lemma of the matched text."""
    RULE = rule(
        'московским'
    ).interpretation(
        normalized()
    )
    parser = Parser(RULE)
    match = parser.match('московским')
    assert match.fact == 'московский'
Пример #26
0
def test_attribute_custom():
    """custom(int) converts the matched token before storing it."""
    F = fact('F', 'a')
    parser = Parser(rule('1').interpretation(F.a.custom(int)).interpretation(F))
    record = parser.match('1').fact
    assert record == F(a=1)
    assert record.spans == [(0, 1)]
    assert record.as_json == {'a': 1}
Пример #27
0
def test_tagger():
    """Tagger-driven matching: only runs of tokens tagged 'I' match."""
    text = 'a b c d e f g'
    parser = Parser(tag('I').repeatable(), tagger=MyTagger())

    spans = [match.span for match in parser.findall(text)]
    extracted = [text[start:stop] for start, stop in spans]
    assert extracted == ['b c', 'e f']
Пример #28
0
def test_attribute_custom_custom():
    """Chained custom(): lowercase first, then dictionary lookup."""
    F = fact('F', 'a')
    MAPPING = {'a': 1}
    chained = rule('A').interpretation(
        F.a.custom(str.lower).custom(MAPPING.get)
    ).interpretation(F)
    assert Parser(chained).match('A').fact == F(a=1)
Пример #29
0
def test_rule_custom():
    """custom(float) applies to the joined text of the whole rule."""
    grammar = rule('3', '.', '14').interpretation(custom(float))
    assert Parser(grammar).match('3.14').fact == 3.14
Пример #30
0
def test_constant_attribute():
    """Money: integer count, constant base from 'тысяча', currency word."""
    money_rule = rule(
        gram('INT').interpretation(Money.count),
        dictionary({'тысяча'}).interpretation(Money.base.const(10**3)),
        dictionary({'рубль', 'доллар'}).interpretation(Money.currency),
    ).interpretation(Money)

    matches = list(Parser(money_rule).match('1 тысяча рублей'))
    assert matches[0].fact == Money(count=1, base=1000, currency='рублей')