Example #1
def test_pipeline():
    RULE = rule(pipeline(['a b c', 'b c']), 'd')
    parser = Parser(RULE)
    assert parser.match('b c d')
    assert parser.match('a b c d')

    RULE = rule(pipeline(['a b']).repeatable(), 'c')
    parser = Parser(RULE)
    assert parser.match('a b a b c')

    RULE = rule(caseless_pipeline(['A B']), 'c')
    parser = Parser(RULE)
    assert parser.match('A b c')

    RULE = morph_pipeline([
        'текст',
        'текст песни',
        'материал',
        'информационный материал',
    ])
    parser = Parser(RULE)
    matches = list(parser.findall('текстом песни музыкальной группы'))
    assert len(matches) == 1
    match = matches[0]
    assert [_.value for _ in match.tokens] == ['текстом', 'песни']

    matches = list(parser.findall('информационного материала под названием'))
    assert len(matches) == 1
    match = matches[0]
    assert [_.value for _ in match.tokens] == ['информационного', 'материала']

    RULE = morph_pipeline(['1 B.'])
    parser = Parser(RULE)
    assert parser.match('1 b .')
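
The test above omits its imports. Assuming the standard yargy entry points, the preamble it relies on would be roughly:

# Assumed imports (not shown in the original snippet).
from yargy import Parser, rule
from yargy.pipelines import pipeline, caseless_pipeline, morph_pipeline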
Example #2
def update_rules(orgnames):

    ORGANIZATION = caseless_pipeline(orgnames).interpretation(
        Workplace.org_name)

    WORKPLACE_ELEM = rule(or_(PERIOD, ORGANIZATION, OCCUPATION))

    WORKPLACE = rule(
        PERIOD,
        or_(rule(ORGANIZATION, OCCUPATION.optional()),
            rule(ORGANIZATION.optional(), OCCUPATION),
            rule(OCCUPATION, ORGANIZATION.optional()),
            rule(OCCUPATION.optional(),
                 ORGANIZATION))).interpretation(Workplace)

    return WORKPLACE_ELEM, WORKPLACE
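
PERIOD and OCCUPATION here are module-level rules defined elsewhere in the same project (Example #10 shows them). A hypothetical call, with placeholder organization names, might look like:

# Hypothetical usage; the organization names are placeholders, and PERIOD /
# OCCUPATION must already be defined at module level (see Example #10).
from yargy import Parser

WORKPLACE_ELEM, WORKPLACE = update_rules(['Яндекс', 'СберБанк'])
parser = Parser(WORKPLACE)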
Example #4
from yargy import (rule, or_)
from yargy.interpretation import (fact)
from yargy.predicates import (dictionary, is_capitalized, eq, caseless)
from yargy.pipelines import caseless_pipeline, morph_pipeline

from natasha.extractors import Extractor

Position = fact('position', ['level', 'field', 'name'])

LEVEL = rule(
    caseless_pipeline([
        'junior', 'middle', 'senior', 'lead', 'chief', 'head', 'team lead',
        "старший", "младший", "руководитель направления"
    ]).interpretation(Position.level))

# TODO: handle slang variants ('датасаентолог', 'датасатанист', etc.); most likely this will require regex parsing
NAME = rule(
    or_(
        caseless_pipeline([
            'data scientist', 'data engineer', 'engineer', 'analyst',
            'data analyst', 'data manager', 'scientist', 'researcher',
            "developer", "intern"
        ]), rule(dictionary(['DS', 'DE']), is_capitalized()),
        morph_pipeline(["аналитик", "разработчик",
                        "стажер"])).interpretation(Position.name.inflected()))

FIELD = rule(
    caseless_pipeline([
        'ML', 'DL', 'CV', 'computer vision', 'NLP', 'bi', 'machine learning',
        'deep learning', 'software', 'research', 'big data', 'python', 'c++',
        "scala", "java", 'ios', "android", 'devops', "backend", 'frontend'
DASH = eq('-')

RANGE_MONEY = rule(CURRENCY.optional(), AMOUNT,
                   CURRENCY.optional()).interpretation(Money)

RANGE_MIN = rule(eq('от').optional(), RANGE_MONEY.interpretation(Range.min))

RANGE_MAX = rule(
    # eq('до').optional(),
    RANGE_MONEY.interpretation(Range.max))
# TODO: not interpreted yet
TAXATION = rule(
    caseless_pipeline([
        'чистыми', "грязными", "до налогов", "после налогов", "на руки",
        "gross", "гросс", 'net', "нетто", "до НДФЛ", "после НДФЛ",
        "до вычета НДФЛ", "после вычета НДФЛ"
    ]))
FORK = rule(dictionary({'fork', 'Вилка', 'ЗП', 'Оклад'}), eq(':').optional())
RANGE = rule(
    FORK.optional(),
    RANGE_MIN,
    or_(DASH, eq('до')),  # this used to be DASH.optional(),
    RANGE_MAX,
    TAXATION.interpretation(Range.taxation).optional()).interpretation(Range)


def parse_money_emojis(message: dict):
    big_money_emojis = {
        "moneyparrot", "moneys", "moneybag", "money_with_wings",
        "printing-money", "money_mouth_face",
    }  # closing brace assumed; the rest of this function is truncated in the original snippet
Example #6
    'двести': 200,
    'триста': 300,
    'четыреста': 400,
    'пятьсот': 500,
    'шестьсот': 600,
    'семьсот': 700,
    'восемьсот': 800,
    'девятьсот': 900,
    'тысяча': 10**3,
    'миллион': 10**6,
    'миллиард': 10**9,
    'триллион': 10**12,
}
DOT = eq('.')
INT = type('INT')
THOUSANDTH = rule(caseless_pipeline(['тысячных', 'тысячная'])).interpretation(const(10**-3))
HUNDREDTH = rule(caseless_pipeline(['сотых', 'сотая'])).interpretation(const(10**-2))
TENTH = rule(caseless_pipeline(['десятых', 'десятая'])).interpretation(const(10**-1))
THOUSAND = or_(
    rule(caseless('т'), DOT),
    rule(caseless('тыс'), DOT.optional()),
    rule(normalized('тысяча')),
    rule(normalized('тыща'))
).interpretation(const(10**3))
MILLION = or_(
    rule(caseless('млн'), DOT.optional()),
    rule(normalized('миллион'))
).interpretation(const(10**6))
MILLIARD = or_(
    rule(caseless('млрд'), DOT.optional()),
    rule(normalized('миллиард'))
).interpretation(const(10**9))  # closing assumed by analogy with THOUSAND and MILLION; truncated in the original snippet
Example #7
from .helpers import TOKENIZER, ID_TOKENIZER, load_named_entities
from .education import EducationExtractor
from .workplace import WorkplaceExtractor
from .hobby import HobbyExtractor

from yargy.parser import Parser
from yargy.pipelines import pipeline, caseless_pipeline


EXP_TITLE = pipeline(['Опыт работы'])
EDU_TITLE = pipeline(['Образование'])
EXTRA_EDU_TITLE = caseless_pipeline(['Курсы', 'Сертификаты'])
HOBBY_TITLE = caseless_pipeline(['Хобби', 'Увлечения'])


def parse(text):

    named_entities = load_named_entities(text)
    exp_tokens = edu_tokens = hobby_tokens = tokens = list(TOKENIZER(text))
    extra_edu_tokens = []

    parser = Parser(EXP_TITLE, tokenizer=ID_TOKENIZER)
    exp_title = parser.find(tokens)

    parser = Parser(EDU_TITLE, tokenizer=ID_TOKENIZER)
    edu_title = parser.find(tokens)

    parser = Parser(HOBBY_TITLE, tokenizer=ID_TOKENIZER)
    hobby_title = parser.find(tokens)

    if exp_title:
Example #8
from yargy import (rule, or_, Parser)

from yargy.predicates import (eq, gram)
from yargy.pipelines import (caseless_pipeline, pipeline)
from yargy.interpretation import (fact, attribute)

from .helpers import ID_TOKENIZER, select_span_tokens, show_matches

Hobby = fact('Hobby', [attribute('name').repeatable()])

HYPHEN = rule(pipeline(['-', '—', '–']))
COLON = rule(eq(':'))
COMMA = rule(eq(','))
DOT = rule(eq('.'))

TITLES = caseless_pipeline(['Хобби', 'Увлечения'])

TITLE = rule(TITLES, or_(COLON, HYPHEN))

ITEM = rule(or_(gram('NOUN'),
                gram('ADJF')).repeatable(max=3)).interpretation(Hobby.name)

HOBBY_ITEMS = rule(or_(TITLE, ITEM, COMMA, DOT))

HOBBIES = rule(
    TITLE,
    rule(ITEM, or_(COMMA, DOT)).repeatable(),
).interpretation(Hobby)


class HobbyExtractor:
    def __init__(self, filename, university):

        self.filename = filename
        self.university = university

        self.rpd_task_and_goals = morph_pipeline([
            'цели и задачи', 'цели освоения', 'задачи освоения', 'аннотация',
            'краткое содержание', 'краткое описание'
        ])

        self.rpd_education_result = morph_pipeline(
            ['планируемый результат обучение', 'компетенции'])

        self.rpd_discipline_link = morph_pipeline(
            ['место учебный дисциплина', 'место дисциплины'])

        self.rpd_discipline_structure = caseless_pipeline(
            ['содержание дисциплины', 'структура дисциплины'])

        self.rpd_lecture_theme = morph_pipeline(['лекции'])

        self.rpd_practice_theme = morph_pipeline([
            'практические занятия', 'семинар', 'семинарские занятия',
            'лабораторные работы'
        ])

        self.rpd_selfwork_theme = morph_pipeline([
            'самостоятельная работа обучающихся по дисциплине',
            'самостоятельная работа студентов', 'домашняя работа'
        ])

        self.rpd_education_zyn = rule(dictionary({'Знать', 'Уметь',
                                                  'Владеть'}))
        self.section_rule = rule(
            dictionary({"раздел", "тема", "дисциплина", "наименование"}))

        self.prd_lectures = rule(
            morph_pipeline([
                'тема лекций', 'содержание занятий',
                'содержание лекционного занятия'
            ]))

        self.prd_practices = rule(
            morph_pipeline(
                ['наименование', 'содержание практического занятия', 'тема']))

        self.rpd_srs = rule(
            morph_pipeline([
                'СРС', 'содержание занятий', 'содержание задания', 'тема СРО',
                'тема СРС'
            ]))
        self.rpd_name = rule(
            morph_pipeline([
                'рабочая программа дисциплины', 'дисциплина',
                'программа дисциплины'
            ]))
        self.table_rpd_name = rule(dictionary({'дисциплина'}))

        self.rpd_lectures_optional = rule(morph_pipeline(['содержание']))
        self.rpd_practices_optional = rule(
            morph_pipeline(['содержание', 'cодержание практического занятия']))
        self.rpd_srs_optional = rule(
            morph_pipeline(['содержание', 'содержание задания']))

        self.documentText = dict()
        self.docs_headers = list()
        self.fullText = list()

        parser_RPD_task_and_goals = Parser(self.rpd_task_and_goals)
        parser_RPD_education_result = Parser(self.rpd_education_result)
        parser_RPD_discipline_link = Parser(self.rpd_discipline_link)
        parser_PRD_discipline_structure = Parser(self.rpd_discipline_structure)
        parser_PRD_lecture_theme = Parser(self.rpd_lecture_theme)
        parser_RPD_practice_theme = Parser(self.rpd_practice_theme)
        parser_RPD_selfwork_theme = Parser(self.rpd_selfwork_theme)
        parser_PRD_zyn_result = Parser(self.rpd_education_zyn)
        parser_PRD_themes = Parser(self.section_rule)
        parser_PRD_lectures = Parser(self.prd_lectures)
        parser_PRD_practices = Parser(self.prd_practices)
        parser_RPD_srs = Parser(self.rpd_srs)
        parser_RPD_name = Parser(self.rpd_name)
        self.parser_table_RPD_name = Parser(self.table_rpd_name)
        parser_RPD_lectures_desc = Parser(self.rpd_lectures_optional)
        parser_RPD_practices_desc = Parser(self.rpd_practices_optional)
        parser_RPD_srs_desc = Parser(self.rpd_srs_optional)

        self.get_rpd_text(filename)
        self.documentText['университет'] = self.university
        self.documentText['название дисциплины'] = self.get_rpd_name(
            parser_RPD_name)

        self.documentText[
            'направление подготовки'] = self.get_direction_of_preparation()

        self.documentText['цели и задачи'] = "".join(
            self.find_boundries(parser_RPD_task_and_goals))

        self.documentText['результаты обучения'] = self.find_boundries(
            parser_RPD_education_result)
        fgos_table = ""
        flag = True
        if self.documentText['результаты обучения'] is not None:
            for item in self.documentText['результаты обучения']:
                if "Таблица: " in item:
                    fgos_table = item[8:]
                    self.documentText['результаты обучения'] = item
            if fgos_table == "":
                fgos_table = self.documentText['результаты обучения']
                flag = False
        self.documentText['ЗУН'] = self.get_zyn_results(
            fgos_table, parser_PRD_zyn_result, flag)
        temp = ""
        for key, value in self.documentText['ЗУН'].items():
            temp += key + " "
            for item in value:
                temp += "".join(item) + " "
        self.documentText['ЗУН'] = temp.replace("~", "")

        self.documentText['компетенции'] = self.search_place_fgos(
            "".join(fgos_table))
        temp = ""
        for key, value in self.documentText['компетенции'].items():
            temp += key + " " + value
        self.documentText['компетенции'] = temp

        self.documentText['результаты обучения'] = "".join(
            self.documentText['результаты обучения']).replace("~",
                                                              '\t').replace(
                                                                  "@", '\n')

        self.documentText['связь дисциплины'] = "".join(
            self.find_boundries(parser_RPD_discipline_link)).replace(
                "Таблица: ", "").replace("~", "\t").replace("@", "\n")

        self.documentText['структура дисциплины'] = self.find_boundries(
            parser_PRD_discipline_structure)

        discipline_themes_table = ""
        for item in self.documentText['структура дисциплины']:
            if "Таблица: " in item:
                discipline_themes_table = item
                break
        self.documentText['структура дисциплины'] = "".join(
            self.documentText['структура дисциплины']).replace(
                "Таблица: ", '').replace("~", '\t').replace("@", "\n")

        self.documentText['темы структуры дисципилны'] = "".join(
            self.convert_string_to_table(discipline_themes_table[8:],
                                         parser_PRD_themes))

        self.documentText['лекции'] = self.find_boundries(
            parser_PRD_lecture_theme)
        if self.documentText['лекции'] is not None:
            discipline_lectures_table = ""
            for item in self.documentText['лекции']:
                if "Таблица: " in item:
                    discipline_lectures_table = item
                    break
            self.documentText['темы лекций'] = "".join(
                self.convert_string_to_table(discipline_lectures_table[8:],
                                             parser_PRD_lectures))
            self.documentText['описание лекций'] = "".join(
                self.convert_string_to_table(discipline_lectures_table[8:],
                                             parser_RPD_lectures_desc))
        self.documentText['лекции'] = "".join(
            self.documentText['лекции']).replace("Таблица: ", '').replace(
                "~", '\t').replace("@", '\n')

        self.documentText['практики'] = self.find_boundries(
            parser_RPD_practice_theme)
        if self.documentText['практики'] is not None:
            discipline_practises_table = ""
            for item in self.documentText['практики']:
                if "Таблица: " in item:
                    discipline_practises_table = item
                    break

            self.documentText['темы практик'] = "".join(
                self.convert_string_to_table(discipline_practises_table[8:],
                                             parser_PRD_practices))
            self.documentText['описание практик'] = ""
            # self.convert_string_to_table(discipline_lectures_table[8:],parser_RPD_practices_desc)
            self.documentText['практики'] = "".join(
                self.documentText['практики']).replace(
                    "Таблица: ", '').replace("~", '\t').replace("@", '\n')

        self.documentText['СРС'] = self.find_boundries(
            parser_RPD_selfwork_theme)
        if self.documentText['СРС'] is not None:
            discipline_srs_table = ""
            for item in self.documentText['СРС']:
                if "Таблица: " in item:
                    discipline_srs_table = item
                    break

            self.documentText['темы СРС'] = "".join(
                self.convert_string_to_table(discipline_srs_table[8:],
                                             parser_RPD_srs))
            self.documentText['описание СРС'] = ""
            # self.convert_string_to_table(discipline_srs_table[8:], parser_RPD_srs_desc)
            self.documentText['СРС'] = "".join(
                self.documentText['СРС']).replace("Таблица: ", '').replace(
                    "~", '\t').replace("@", '\n')
Example #10
YEAR = and_(gte(1900), lte(2100)).interpretation(Date.year.custom(int))

DATE = rule(MONTH_NAME, YEAR).interpretation(Date)

Work_period = fact('Work_period', ['from_date', 'to_date'])

FROM_DATE = DATE.interpretation(Work_period.from_date)
TO_DATE = DATE.interpretation(Work_period.to_date)

PERIOD = rule(
    FROM_DATE, HYPHEN.optional(),
    or_(TO_DATE, pipeline([
        'н.в.', 'настоящее время', 'работает сейчас'
    ]))).interpretation(Work_period).interpretation(Workplace.period)

OCC_NAMES = caseless_pipeline(OCCUPATIONS)

OCCUPATION = rule(
    or_(
        rule(OCC_NAMES),
        rule(OCC_NAMES, COMMA,
             not_(eq(OCC_NAMES)).repeatable(max=3),
             OCC_NAMES))).interpretation(Workplace.occupation)


def update_rules(orgnames):

    ORGANIZATION = caseless_pipeline(orgnames).interpretation(
        Workplace.org_name)

    WORKPLACE_ELEM = rule(or_(PERIOD, ORGANIZATION, OCCUPATION))
Example #11
    'max_load_index', 'season', 'spikes'
])
Vendor = fact('Vendor', ['id', 'name'])

# HELPERS
SEP = in_({'-', '/', '|', ':', ';', '.'})
NUM = type_('INT')
INT = NUM.interpretation(interp.custom(int))
FLOAT = rule(NUM.repeatable(), in_({',', '.'}), NUM,
             NUM.optional()).interpretation(interp.custom(to_float))

# TIRE_VENDORS
VENDORS_NAME, VENDORS_ID = get_vendor_dict(tires_vendors_path)
VENDOR = rule(
    caseless_pipeline(VENDORS_NAME).interpretation(
        Vendor.name.normalized().custom(
            VENDORS_NAME.get))).interpretation(Vendor)

# TIRE_HELPERS
DIAMETER_WITH_LETTER = rule(NUM, or_(eq('С'), eq('C')).optional())
STRUCTURE = or_(
    rule(or_(INT, FLOAT), SEP, or_(INT, FLOAT), SEP,
         or_(INT, DIAMETER_WITH_LETTER)),
    rule(or_(INT, FLOAT), SEP, or_(INT, FLOAT), eq('R'),
         or_(INT, DIAMETER_WITH_LETTER)))

# TIRE_WIDTH
WIDTH_PIPELINE = morph_pipeline(['шир', 'ширина', 'width', 'wid'])
WIDTH = or_(
    rule(WIDTH_PIPELINE, SEP,
         or_(INT, FLOAT).interpretation(Tire.width), SEP),
Example #12
from .helpers import load_lines, ID_TOKENIZER, select_span_tokens

Date = fact('Date', ['month', 'year'])

Education = fact('Education', ['year', 'name', 'specialization'])
"""
Creating dicts
"""
FOLDER = os.path.dirname(__file__)

UNI_NAMES_GEN = load_lines(os.path.join(FOLDER, 'dicts/VUZY.txt'))
UNI_NAMES = set(UNI_NAMES_GEN)

SPECIALIZATIONS = load_lines(os.path.join(FOLDER, 'dicts/specs_only.txt'))

SPECIALIZATION = caseless_pipeline(SPECIALIZATIONS).interpretation(
    Education.specialization)
"""
"""

YEAR = and_(gte(1900), lte(2100)).interpretation(
    Date.year.custom(int)).interpretation(Education.year)

UNI_NAME_RULE = rule(
    or_(rule(or_(eq('Филиал'), eq('филиал')), morph_pipeline(UNI_NAMES)),
        caseless_pipeline(UNI_NAMES))).interpretation(Education.name)

ANON_COURSE_NAME = rule(
    and_(
        not_(eq('имени')),
        not_(eq('.')),
    ).repeatable(max=5)).interpretation(Education.name)
Example #13
from yargy.interpretation import fact, attribute
from yargy.pipelines import caseless_pipeline

Type = fact('Type', [attribute('name', 'Практическое занятие')])

TYPES = {
    'лаб': 'Лабораторная',
    'лаб.': 'Лабораторная',
    'пр.з.': 'Практическое занятие',
    'пр. з.': 'Практическое занятие',
    'лек.': 'Лекция',
    'лек': 'Лекция',
    'лекц.': 'Лекция',
    'лекции': 'Лекция'
}

TYPE = caseless_pipeline(TYPES).interpretation(Type.name.custom(
    TYPES.get)).interpretation(Type)
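
A hypothetical quick check of the rule above; the exact value handed to TYPES.get depends on yargy's token normalization, so the expected output is an assumption:

# Hypothetical usage (assumes a lower-case input so the token matches a TYPES key).
from yargy import Parser

parser = Parser(TYPE)
match = parser.find('лекции по дискретной математике')
print(match.fact.name)  # expected: 'Лекция'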
Example #14
            length_grt(1)).interpretation(Name.last.custom(str.capitalize))

FIRST = and_(gram('Name'),
             length_grt(1)).interpretation(Name.first.custom(str.capitalize))

MIDDLE = and_(gram('Patr'),
              length_grt(1)).interpretation(Name.middle.custom(str.capitalize))
ABBR = and_(length_eq(1), is_capitalized())

FIRST_ABBR = ABBR.interpretation(Name.first.custom(str.upper))

MIDDLE_ABBR = ABBR.interpretation(Name.middle.custom(str.upper))

unsubstantial = {'Бен Режеб Т.Б.К.'}

UNSUBSTANIAL = caseless_pipeline(unsubstantial).interpretation(Name)

NAME = or_(
    rule(UNSUBSTANIAL),
    rule(LAST, DOT, FIRST_ABBR, DOT, MIDDLE_ABBR, DOT),
    rule(LAST, FIRST_ABBR, DOT, MIDDLE_ABBR, DOT),
    rule(FIRST_ABBR, DOT, MIDDLE_ABBR, DOT, LAST),
    rule(LAST, FIRST_ABBR, DOT),
    rule(FIRST_ABBR, DOT, LAST),
    rule(FIRST, MIDDLE, LAST),
    rule(LAST, FIRST, MIDDLE),
    rule(FIRST, MIDDLE),
).interpretation(Name)
"""
degrees = {
    'ст.преп.': 'старший преподаватель',
Example #15
subjectsPath = os.path.join(os.path.dirname(__file__), 'subjects')

subjects = {}
for subj in set(open(subjectsPath,
                     'r').read().lower().strip().split('\n')) - {''}:
    subj = subj.strip()
    a = subj.split('\\')
    if len(a) == 2:
        subj = a[1].strip()
        subjects[a[0].strip()] = subj
    subjects[subj] = subj

if __name__ == '__main__':
    """
    while True:
        s = input().lower()
        for i in subjects:
            i = i.lower()
            l = i.split()
            if len(l) == len(s) and all([s[i] == l[i][0] for i in range(len(s))]): print(i)
    """
    for i in subjects:
        if i == subjects[i]: print(i)
        else: print(i, '\\', subjects[i])

SUBJECT = rule(
    caseless_pipeline(subjects).interpretation(
        Subject.name.normalized().custom(
            subjects.get))).interpretation(Subject)
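
All of the snippets above follow the same pattern: build a pipeline, optionally wrap it in rule(), and hand it to Parser. A minimal self-contained sketch of that pattern (not taken from any of the repositories above):

# Minimal sketch of the caseless_pipeline pattern; standard yargy imports only.
from yargy import Parser, rule
from yargy.pipelines import caseless_pipeline

RULE = rule(caseless_pipeline(['machine learning', 'ML']))
parser = Parser(RULE)
match = parser.find('Курс по Machine Learning для начинающих')
print([token.value for token in match.tokens])  # expected: ['Machine', 'Learning']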