def get_second_rules():
    """Build the yargy rule matching (repeatable) surnames.

    Combines a grammeme-based predicate (singularia tantum + animate,
    not a first name or patronymic) with an explicit whitelist of
    surnames the morphology misses.

    Returns:
        A yargy rule matching one or more surname tokens in a row.
    """
    ANIM = gram('anim')
    SGTM = gram('Sgtm')
    PATR = gram('Patr')
    NAME = gram('Name')

    # Grammeme-based surname candidate; excludes known false positives.
    SURNAME_CONST = rule(
        and_(
            SGTM,
            ANIM,
            not_(NAME),
            not_(PATR),
            not_(eq('по')),
            not_(eq('ленина')),
            not_(eq('ульянова')),
        )
    )

    # Whitelisted surnames the grammeme predicate does not catch.
    # (Deduplicated: the original listed 'удовенко' twice; unused locals
    # RU/INT/NONE/NOUN/ADJF/GENT/FEMN/CONJ/PREP were removed.)
    KNOWN_SURNAMES = [
        'Иванов', 'левченко', 'эйхвальд', 'зимина', 'хитарьян',
        'моторин', 'рукавишников', 'деткино', 'буланцев', 'багров',
        'шерл', 'белоцерковский', 'степанов', 'шляхов', 'моисеев',
        'пузанков', 'попиченко', 'сергеев', 'удовенко', 'тютин',
    ]
    SURNAME = or_(SURNAME_CONST, *[rule(eq(word)) for word in KNOWN_SURNAMES])

    # A full surname span is one or more surname tokens in a row.
    return or_(rule(SURNAME.repeatable()))
def req_animacy(animacy: str = "любой"):
    """Return a yargy predicate constraining token animacy.

    Args:
        animacy: "любой" (any), "одуш." (animate) or "неодуш." (inanimate).

    Returns:
        A yargy predicate; pronouns (NPRO) and adjectives (ADJF) always pass.

    Raises:
        ValueError: for any other `animacy` value.
    """
    if animacy == "любой":
        return yp.true()
    elif animacy == "одуш.":
        # Animate: not marked inanimate, or explicitly marked animate.
        return y.or_(
            y.not_(yp.gram("inan")),
            yp.gram("anim"),
            yp.gram("NPRO"),
            yp.gram("ADJF"),
        )
    elif animacy == "неодуш.":
        # Inanimate: mirror of the animate branch. Bug fix: the original
        # OR'ed in gram("anim") here, so the "inanimate" branch accepted
        # animate nouns as well; replaced with not_(gram("anim")).
        return y.or_(
            yp.gram("inan"),
            y.not_(yp.gram("anim")),
            yp.gram("NPRO"),
            yp.gram("ADJF"),
        )
    else:
        raise ValueError("Incorrect Animacy Type")
def __init__(self, logger = None, env = 'local'):
    # Purpose: configure logging, tokenization/morphology tools and build the
    # yargy parsers used to detect advertising-consent clauses in text.
    #
    # Args:
    #   logger: externally supplied logger; when None a rotating file logger
    #           is created ("ads_extractor.log", 1 MB per file, 5 backups).
    #   env: deployment environment label (e.g. 'local'); stored as-is.
    self.env = env
    if logger is None:
        self.logger = logging.getLogger("AdsExtractor")
        self.logger.setLevel(logging.DEBUG)
        handler = RotatingFileHandler("ads_extractor.log", mode='a', encoding='utf-8', backupCount=5, maxBytes=1 * 1024 * 1024)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
    else:
        self.logger = logger
    self.texttools = texttools.TextTools(self.logger)
    self.tokenizer = MorphTokenizer()
    self.morph = pymorphy2.MorphAnalyzer()
    # Phrases whose presence should suppress an ads match (opt-outs/negations).
    EXCLUDE = morph_pipeline([
        'без', 'не', 'вправе отказаться', 'может отказаться', 'услуга'
    ])
    AGREEMENT = morph_pipeline([
        'соглашаться с получением'
    ])
    SUBJECT = morph_pipeline([
        'рассылка', 'предложение'
    ])
    KIND = morph_pipeline([
        'рекламный'
    ])
    # NOTE(review): 'рекламныя цель' looks like a typo of 'рекламная цель' —
    # it is a matched runtime string, so confirm the intended surface form
    # before changing it.
    SPECIALS = morph_pipeline([
        'рекламныя цель'
    ])
    # An "ads" mention is KIND+SUBJECT in either order, or a special phrase,
    # or an explicit agreement phrase.
    ADS = or_(
        rule(KIND, SUBJECT),
        rule(SUBJECT, KIND),
        or_(SPECIALS, AGREEMENT)
    )
    self.ads_parser = Parser(ADS)
    self.exclude_parser = Parser(rule(EXCLUDE))
def get_rules():
    """Build the yargy rule matching street mentions (simple and complex).

    Returns:
        A yargy rule: complicated multi-token streets first, then simple
        suffix-based streets and a hard-coded street list.
    """
    INT = type('INT')
    NOUN = gram('NOUN')
    ADJF = gram('ADJF')
    GEO = gram('Geox')
    PREP = gram('PREP')
    CONJ = gram('CONJ')

    # A proper name that is neither a preposition nor a geographic name.
    NAME = rule(and_(gram('Name'), not_(PREP), not_(GEO)))
    NOUN_NOT_CONJ = rule(and_(NOUN, not_(CONJ)))

    STREET_SUFFIXS = morph_pipeline([
        'улица', 'тракт', 'бульвар', 'проспект', 'микрорайон',
        'проезд', 'шоссе', 'парк',
    ])
    SPECIAL_STREET_SUFFIXS = morph_pipeline(['шоссе', 'тракт'])

    # Hard-coded street names. ('зеленые аллеи' was listed twice in the
    # original and is deduplicated; unused RU/NONE/EXCEPTIONAL_STREET_CONST
    # locals were removed.)
    SIMPLE_STREETS_FROM_ARRAY = morph_pipeline([
        'краснопресненская', 'республике', 'маршала захарова', 'доватора',
        'мичурина', 'зеленые аллеи', 'бехтеева', 'октябрьская',
        'новогиреевская', 'югорская', 'артема', 'парковая',
        'алтуфьевское', 'горького', 'Кавказский', 'хамовнический вал',
        'Кусковская', 'марьинский парк', 'московская', 'береговая',
        'антонова овсиенко', 'школьная', 'юнтоловский', 'гагарина',
    ])

    # A token that cannot begin the house/apartment part of an address.
    NOUN_NOT_APPART = rule(not_(or_(eq('дом'), eq('квартира'), INT, CONJ)))

    COMPLICATED_STREETS = or_(
        rule(STREET_SUFFIXS, INT, NOUN, NOUN),
        rule(STREET_SUFFIXS, INT, ADJF, NOUN),
        rule(STREET_SUFFIXS, NOUN_NOT_CONJ, NOUN_NOT_APPART, NAME.optional()),
        rule(NAME, NOUN_NOT_APPART),
        rule(ADJF, NAME),
        rule(STREET_SUFFIXS, ADJF, NOUN_NOT_APPART),
        rule(STREET_SUFFIXS, CONJ, NOUN, NOUN))

    SIMPLE_STREETS_WITH_STREET_SUFFIX = rule(STREET_SUFFIXS, NOUN_NOT_APPART)
    SPECIAL_SIMPLE_STREETS_WITH_STREET_SUFFIX = rule(ADJF, SPECIAL_STREET_SUFFIXS)
    SIMPLE_STREETS = or_(SPECIAL_SIMPLE_STREETS_WITH_STREET_SUFFIX,
                         SIMPLE_STREETS_WITH_STREET_SUFFIX,
                         SIMPLE_STREETS_FROM_ARRAY)

    return or_(COMPLICATED_STREETS, SIMPLE_STREETS)
def req_argument():
    """Predicate for a valid argument token: a noun or pronoun that is not
    a preposition, conjunction, particle, interjection or adjective."""
    banned_pos = ("PREP", "CONJ", "PRCL", "INTJ", "ADJF")
    not_banned = y.not_(y.or_(*(yp.gram(tag) for tag in banned_pos)))
    nominal = y.or_(yp.gram("NOUN"), yp.gram("NPRO"))
    return y.and_(not_banned, nominal)
def req_deverbal(require_deverbal_noun: str = "?"):
    """Predicate selecting deverbal nouns, plain verbs, or either.

    Args:
        require_deverbal_noun: "1" -> only deverbal nouns, "0" -> only
            verbs/infinitives, "?" -> either.

    Raises:
        ValueError: for any other value.
    """
    deverbal = y.and_(yp.gram("NOUN"), yp.in_caseless(deverbal_nouns))
    if require_deverbal_noun == "1":
        return deverbal
    if require_deverbal_noun == "0":
        return y.or_(yp.gram("VERB"), yp.gram("INFN"))
    if require_deverbal_noun == "?":
        return y.or_(deverbal, yp.gram("VERB"), yp.gram("INFN"))
    raise ValueError("Incorrect deverbal status")
def get_first_rules():
    """Build the yargy rule matching (repeatable) first names.

    A first-name token carries the 'Name' grammeme, is animate, is not a
    singularia-tantum word, and is not in the explicit stop list.

    Returns:
        A yargy rule matching one or more first-name tokens in a row.
    """
    ANIM = gram('anim')
    SGTM = gram('Sgtm')
    NAME = gram('Name')

    # Tokens the morphology tags as names but which are not first names here.
    # (Unused locals RU/INT/NONE/NOUN/ADJF/GENT/CONJ/PATR/PREP were removed.)
    STATE = or_(
        eq('моторин'), eq('юрок'), eq('вакула'), eq('эйхвальд'),
        eq('иммуно'), eq('из'), eq('славы'), eq('хайбулаев'),
        eq('михална'), eq('валиде'), eq('шиян'), eq('сим'),
        eq('мазитов'), eq('хамидов'),
    )

    NAME_CONST = rule(and_(NAME, ANIM, not_(SGTM), not_(STATE)))

    # One or more first-name tokens in a row.
    return or_(rule(NAME_CONST.repeatable()))
def req_preposition(preposition: str = None):
    """Predicate for an optionally required preposition.

    Args:
        preposition: the required preposition surface form, or None /
            "None" when no specific preposition is required.

    Returns:
        y.empty() when no preposition is required; otherwise a predicate
        accepting either the required preposition itself or any
        non-preposition token.
    """
    # Bug fix: the default value is the None object, but the original only
    # compared against the string "None", so calling with the default fell
    # through to yp.eq(None). Treat both spellings as "no preposition".
    if preposition is None or preposition == "None":
        return y.empty()
    return y.or_(
        y.and_(yp.gram("PREP"), yp.eq(preposition)),
        y.not_(yp.gram("PREP")),
    )
def _abbreviate(word: str, abbrs: List[str], opt=False):
    """Build a rule matching *word* or any of its abbreviations, with every
    match interpreted back as the canonical *word*.

    Args:
        word: canonical full form; matched via its normalized form.
        abbrs: abbreviation spellings; entries containing '-' are matched as
            dash-separated caseless token sequences, the rest as caseless
            tokens with an optional trailing dot.
        opt: when True, the resulting rule is made optional.
    """
    # Split abbreviations by the dash predicate.
    # NOTE(review): assumes more_itertools.partition semantics (falsey items
    # first, i.e. plain abbrs then dashed ones) — confirm against the import.
    abbrs, dashed = partition(lambda abbr: '-' in abbr, abbrs)
    # 'a-b' -> rule(caseless('a'), caseless('-'), caseless('b'))
    dashed = map(
        lambda a: rule(*map(caseless, intersperse('-', a.split('-')))),
        dashed)
    original_word = rule(normalized(word))
    dashed_sequence = rule(or_(*dashed))
    # Plain abbreviation optionally followed by a dot (e.g. 'ул' / 'ул.').
    abbr_with_dot = rule(
        or_(*map(caseless, abbrs)),
        eq('.').optional(),
    )
    # Whatever alternative matched, interpret it as the canonical word.
    result = or_(original_word, dashed_sequence, abbr_with_dot) \
        .interpretation(interpretation.const(word))
    return result.optional() if opt else result
def update_rules(orgnames):
    # Rebuild workplace-extraction rules for the given organization names.
    #
    # Args:
    #   orgnames: iterable of organization-name strings for the caseless
    #             pipeline; matches are captured as Workplace.org_name.
    #
    # Returns:
    #   (WORKPLACE_ELEM, WORKPLACE): the element rule (any one of period /
    #   organization / occupation) and the full rule (PERIOD followed by
    #   organization and/or occupation in any order), the latter
    #   interpreted as a Workplace fact.
    ORGANIZATION = caseless_pipeline(orgnames).interpretation(
        Workplace.org_name)
    WORKPLACE_ELEM = rule(or_(PERIOD, ORGANIZATION, OCCUPATION))
    WORKPLACE = rule(
        PERIOD,
        or_(rule(ORGANIZATION, OCCUPATION.optional()),
            rule(ORGANIZATION.optional(), OCCUPATION),
            rule(OCCUPATION, ORGANIZATION.optional()),
            rule(OCCUPATION.optional(), ORGANIZATION))).interpretation(Workplace)
    return WORKPLACE_ELEM, WORKPLACE
def update_rules(name):
    # Rebuild socio-demographic extraction rules around the given
    # person-name pipeline input.
    #
    # Returns:
    #   (SOCDEM_ELEMS, SOCDEM): the element rule (any single component) and
    #   the full sequence NAME [GENDER] [AGE / birth DATE in either order]
    #   [LOCATION], interpreted as a Socdem fact.
    NAME = pipeline(name).interpretation(Socdem.name)
    SOCDEM_ELEMS = rule(or_(NAME, GENDER, date.DATE, AGE, LOCATION))
    SOCDEM = rule(
        NAME,
        GENDER.optional(),
        # Age and date of birth may appear in either order; both optional.
        or_(
            rule(AGE.optional(),
                 date.DATE.interpretation(Socdem.date_of_birth).optional()),
            rule(
                date.DATE.interpretation(Socdem.date_of_birth).optional(),
                AGE.optional()),
        ),
        LOCATION.optional()).interpretation(Socdem)
    return SOCDEM_ELEMS, SOCDEM
def __init__(self, names: list = None, version_numbers: list = None,
             version_names: list = None, consoles: list = None):
    """Build a parser for one game: its names, versions and consoles.

    Args:
        names: game-name spellings; must be non-empty — the first entry
            becomes the canonical name constant.
        version_numbers: optional version-number spellings.
        version_names: optional version-name spellings.
        consoles: optional console-name spellings.

    Raises:
        ValueError: if no game names are given (previously a bare
            IndexError from names[0]).
    """
    # Bug fix: the original used mutable default arguments ([]), and the
    # empty default crashed on names[0]. Use None sentinels instead.
    names = names or []
    version_numbers = version_numbers or []
    version_names = version_names or []
    consoles = consoles or []
    if not names:
        raise ValueError("names must contain at least one spelling")
    rules = rule(
        morph_pipeline(names).interpretation(self.__game.name.const(names[0])),
        morph_pipeline(version_numbers).interpretation(self.__game.version_number).optional(),
        morph_pipeline(version_names).interpretation(self.__game.version_name).optional(),
        morph_pipeline(consoles).interpretation(self.__game.console).optional())
    game = or_(rules).interpretation(self.__game)
    self.parser = Parser(game)
def yargy_smart_home(msg):
    """Extract smart-home commands from *msg*.

    Returns:
        A list of dicts with normalized action ('Действие'),
        object ('Объект') and place ('Место') for each command found.
    """
    Do = fact('Entity', ['action', 'object', 'place'])
    Actions = dictionary({'Включи', 'Отключи', 'Выключи'})
    Objects = dictionary(
        {'Лампочку', 'Свет', 'Розетку', 'Видеокамеру', 'Камеру'})
    Prep = dictionary({'в', 'на'})
    Place = dictionary({
        'Гостевой', 'Ванной', 'спальной', 'спальне', 'холле', 'коридоре',
        'кухне'
    })
    # Action-object-place, or place-first order; the word 'комната' after a
    # place name is optional. (The unused ObjectsList rule and Room set
    # from the original were removed.)
    ActionPhrase = or_(
        rule(Actions.interpretation(Do.action.normalized()),
             Objects.interpretation(Do.object.normalized()),
             Prep.optional(),
             Place.interpretation(Do.place.normalized()),
             rule(normalized('комната')).optional()),
        rule(Actions.interpretation(Do.action.normalized()),
             Objects.interpretation(Do.object.normalized()),
             Prep.optional(),
             Place.interpretation(Do.place.normalized())),
        rule(Prep.optional(),
             Place.interpretation(Do.place.normalized()),
             rule(normalized('комната')).optional(),
             Actions.interpretation(Do.action.normalized()),
             Objects.interpretation(
                 Do.object.normalized()))).interpretation(Do)
    res = []
    parser = Parser(ActionPhrase)
    for match in parser.findall(msg):
        res.append({
            'Действие': match.fact.action,
            'Объект': match.fact.object,
            'Место': match.fact.place,
        })
    return res
def get_rules():
    """Build the yargy rule matching the house/building-number part of an
    address (e.g. 'дом 12а', 'корпус 3/1').

    Returns:
        A yargy rule matching one or more house units in a row.
    """
    INT = type('INT')
    ADJF = gram('ADJF')

    # House-part marker words.
    HOUSE = morph_pipeline(['дом', 'корпус', 'квартира', 'строение', 'ст'])
    # Building letters / separators appearing after the number.
    HOUSE1 = morph_pipeline(['a', 'а', '/', 'б'])
    # Any non-adjective token (guards the tail of a house number).
    HOUSE_NOT = rule(and_(not_(ADJF)))

    # A number (except bare '3'), optionally followed by a letter/separator,
    # a non-adjective token and a second number.
    # (Dead locals from the original — RU/NONE/NOUN/CONJ/NAME/PREP/NPRO,
    # GEO, NAME_OR_NOUN, CITY_EXEP and the unused DOUBLED rule — removed.)
    UNIT1 = or_(
        rule(and_(INT, not_(eq('3'))), HOUSE1.optional(), HOUSE_NOT.optional(),
             INT.optional()))
    UNIT = or_(rule(HOUSE.optional(), UNIT1))

    # One or more house units in a row.
    return or_(rule(UNIT.repeatable()))
def yargy_get_genre(msg):
    """Return film-genre mentions found in *msg*, inflected to base form."""
    Genre = fact('Genre', ['genre'])
    genre_names = dictionary({
        'ужасы', 'ужастики', 'мелодрама', 'комедия', 'боевик', 'триллер',
        'мультик', 'мультфильм', 'драма'
    })
    # Optional marker word before or after the genre itself.
    genre_markers = or_(rule(normalized('жанр')), rule(normalized('раздел')))
    phrase = or_(
        rule(genre_names, genre_markers.optional()),
        rule(genre_markers.optional(), genre_names),
    ).interpretation(Genre.genre.inflected()).interpretation(Genre)
    return [match.fact.genre for match in Parser(phrase).findall(msg)]
def req_predicate(word: str = "?", predicate_type: str = "глаг"):
    """Predicate for the clause predicate (verb and/or noun), optionally
    restricted to one word or a '|'-separated list of words.

    Args:
        word: "?" for any word, a single word, or words joined with '|'.
        predicate_type: "глаг" (verb), "сущ" (noun) or "любой" (either).

    Raises:
        ValueError: when predicate_type is not one of the three options.
    """
    pos_options = {
        "глаг": (yp.gram("VERB"), yp.gram("INFN")),
        "сущ": (yp.gram("INFN"), yp.gram("NOUN")),
        "любой": (yp.gram("VERB"), yp.gram("INFN"), yp.gram("NOUN")),
    }
    if predicate_type not in pos_options:
        raise ValueError("predicate_type must be глаг or сущ or любой")
    predicate = y.or_(*pos_options[predicate_type])
    if word == "?":
        return predicate
    # Restrict the predicate to the given word(s).
    word_preds = [yp.normalized(w) for w in word.split("|")]
    scope = word_preds[0] if len(word_preds) == 1 else y.or_(*word_preds)
    return y.and_(scope, predicate)
def get_hyperonyms(main_word):
    """Find and print hyperonym-pattern matches for *main_word* in its
    Wikipedia summary (deaccented, parentheticals stripped, lowercased)."""
    target = eq(utils.deaccent(main_word))
    # Four surface patterns placing the hyponym relative to its hyperonyms.
    pattern = or_(
        rule(target, ATAKJE, START, MID, END),
        rule(target, MID, END),
        rule(START_S, END, KAK, target),
        rule(END, INCLUDING, target),
    )
    summary = utils.deaccent(wikipedia.summary(main_word))
    print(summary)
    summary = re.sub(r'\(.+?\)', '', summary)
    summary = summary.lower().replace('* сергии радонежскии* ', '')
    for match in Parser(pattern).findall(summary.lower()):
        print([token.value for token in match.tokens])
def test_predicate():
    """The predicate accepts 'московский'-like forms and non-feminine nouns."""
    tokenizer = MorphTokenizer()
    pred = or_(normalized('московским'),
               and_(gram('NOUN'), not_(gram('femn')))).activate(tokenizer)
    assert [pred(t) for t in tokenizer('московский зоопарк')] == [True, True]
    assert [pred(t) for t in tokenizer('московская погода')] == [True, False]
def yargy_get_channel(msg):
    """Return the canonical TV-channel names mentioned in *msg*."""
    Channel = fact('Channel', ['name'])
    # Canonical channel spellings (local name fixes the CNANNELS typo).
    channels = {
        'Первый', 'Россия', 'ТВЦ', 'НТВ', 'ТНТ', 'СТС', 'Культура', 'Дождь',
        'Спас'
    }
    channel_name = dictionary(channels)
    channel_words = or_(rule(normalized('канал')),
                        rule(normalized('программа')))
    phrase = or_(
        rule(channel_words, channel_name),
        rule(channel_name, channel_words.optional())).interpretation(
            Channel.name.inflected()).interpretation(Channel)
    found = []
    for match in Parser(phrase).findall(msg):
        # Map the inflected match text back to its canonical spelling.
        found.extend(c for c in channels if c.lower() in match.fact.name)
    return found
def make_rule_from_station(title: str) -> Rule:
    """Build a rule matching a station title (word-by-word, through known
    abbreviations and registered synonyms) and interpreting the match back
    to the canonical restored title.

    Digits '1'/'2' are stripped from the title first (line-number suffixes),
    and the title is lowercased.
    """
    title = title.replace('1', '').replace('2', '').lower().strip()
    phrase = []
    for token in title.split(' '):
        # Known abbreviations match via their dedicated rule; other words
        # match by normalized form and are interpreted as the literal token.
        word = Abbrs.get(token) if Abbrs.is_abbr(token) \
            else normalized(token).interpretation(meaning.const(token))
        phrase.append(word.interpretation(Array.element))
    # Join the matched word list and restore the canonical title string.
    phrase = rule(*phrase).means(Array).interpretation(
        meaning.custom(lambda p: Restore.get(' '.join(p.element)))).means(
            StationTitle.value)
    if Synonyms.has(title):
        # Alternative spelling: a registered synonym of this title.
        synonym = Synonyms.get(title).interpretation(
            meaning.custom(lambda p: Restore.get(p))).means(StationTitle.value)
        return or_(synonym, phrase)
    return phrase
def test_predicate():
    """Same predicate check, activated through an explicit Context."""
    tokenizer = MorphTokenizer()
    pred = or_(
        normalized('московским'),
        and_(gram('NOUN'), not_(gram('femn'))),
    ).activate(Context(tokenizer))
    for text, expected in [('московский зоопарк', [True, True]),
                           ('московская погода', [True, False])]:
        assert [pred(t) for t in tokenizer(text)] == expected
def __init__(self, logger=None, env='local'):
    # Purpose: configure logging and build yargy parsers that detect
    # "personal data shared with third parties" clauses.
    #
    # Args:
    #   logger: externally supplied logger; when None a rotating file logger
    #           is created ("thirdparty_extractor.log", 1 MB x 5 backups).
    #   env: deployment environment label; stored as-is.
    self.env = env
    if logger is None:
        self.logger = logging.getLogger("ThirdPartyExtractor")
        self.logger.setLevel(logging.DEBUG)
        handler = RotatingFileHandler("thirdparty_extractor.log", mode='a', encoding='utf-8', backupCount=5, maxBytes=1 * 1024 * 1024)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
    else:
        self.logger = logger
    self.texttools = texttools.TextTools(self.logger)
    self.tokenizer = MorphTokenizer()
    self.morph = pymorphy2.MorphAnalyzer()
    # Negation phrase that suppresses a third-party match.
    EXCLUDE = morph_pipeline(['не передавать'])
    SUBJECT = morph_pipeline(
        ['передача третьим лицам', 'поручать аффилированным лицам'])
    # NOTE(review): every entry below is commented out, so this is an EMPTY
    # morph_pipeline — rule(SPECIALS) can never match, and an empty pipeline
    # may be a latent construction error in some yargy versions. Confirm
    # whether SPECIALS should be removed or repopulated.
    SPECIALS = morph_pipeline([
        # 'рекламныя цель',
        # 'получение сообщений',
        # 'рассылка',
        # 'предложение услуг',
        # 'продвижение товаров',
        # 'продвижение услуг'
    ])
    ADS = or_(rule(SUBJECT), rule(SPECIALS))
    self.thirdp_parser = Parser(ADS)
    self.exclude_parser = Parser(rule(EXCLUDE))
def get_mid_rules():
    """Build the yargy rule matching (repeatable) patronymic tokens.

    (Unused locals RU/INT/NONE/NOUN/ADJF/CONJ/NAME/PREP from the original
    were removed; only the 'Patr' grammeme is actually used.)

    Returns:
        A yargy rule matching one or more patronymics in a row.
    """
    PATR = gram('Patr')
    PATRONYMIC = rule(PATR)
    # One or more patronymics in a row.
    return or_(rule(PATRONYMIC.repeatable()))
MONTH = and_( gte(1), lte(12) ).interpretation( Date.month.custom(int) ) DAY = and_( gte(1), lte(31) ).interpretation( Date.day.custom(int) ) YEAR_WORD = or_( rule('г', eq('.').optional()), rule(normalized('год')) ) YEAR = and_( gte(1000), lte(2100) ).interpretation( Date.year.custom(int) ) YEAR_SHORT = and_( length_eq(2), gte(0), lte(99) ).interpretation( Date.year.custom(lambda _: 1900 + int(_))
# TODO COUNTRY_VALUE = dictionary({ 'россия', 'украина' }) ABBR_COUNTRY_VALUE = in_caseless({ 'рф' }) COUNTRY = or_( COUNTRY_VALUE, ABBR_COUNTRY_VALUE ).interpretation( Country.name ).interpretation( Country ) ############# # # FED OKRUGA # ############ FED_OKRUG_NAME = or_( rule( dictionary({
'президент', 'сопрезидент', 'вице-президент', 'экс-президент', 'председатель', 'руководитель', 'директор', 'глава', ]) GENT = gram('gent') WHERE = or_( rule(GENT), rule(GENT, GENT), rule(GENT, GENT, GENT), rule(GENT, GENT, GENT, GENT), rule(GENT, GENT, GENT, GENT, GENT), ) POSITION = or_( POSITION, rule(POSITION, WHERE) ).interpretation( Person.position ) NAME = NAME.interpretation( Person.name )
DOT = eq('.') INT = type('INT') ######## # # CURRENCY # ########## EURO = or_( normalized('евро'), eq('€') ).interpretation( const(dsl.EURO) ) DOLLARS = or_( normalized('доллар'), eq('$') ).interpretation( const(dsl.DOLLARS) ) RUBLES = or_( rule(normalized('рубль')), rule( or_( caseless('руб'),
FEDERAL_DISTRICT = rule( rule(caseless('северо'), '-').optional(), dictionary({ 'центральный', 'западный', 'южный', 'кавказский', 'приволжский', 'уральский', 'сибирский', 'дальневосточный', }).match(gnc), or_( rule( dictionary({'федеральный'}).match(gnc), dictionary({'округ'}).match(gnc), ), rule('ФО'), ), ).interpretation(Location.name.inflected()) gnc = gnc_relation() AUTONOMOUS_DISTRICT = rule( gram('ADJF').match(gnc).repeatable(), or_( rule( dictionary({'автономный'}).match(gnc), dictionary({'округ'}).match(gnc), ), rule('АО'),
ABBR = gram('Abbr')
SURN = gram('Surn')
# A first name: carries the 'Name' grammeme and is not an abbreviation.
NAME = and_(
    gram('Name'),
    not_(ABBR)
)
# A patronymic: carries the 'Patr' grammeme and is not an abbreviation.
PATR = and_(
    gram('Patr'),
    not_(ABBR)
)
# First-name token: must also satisfy the CRF-based name predicate plus one
# of the dictionary/morphology checks; captured inflected as Name.first.
# NOTE(review): NAME_CRF / IN_MAYBE_FIRST / IN_FIRST / TITLE / gnc / Name
# are defined elsewhere in this module — confirm before reuse.
FIRST = and_(
    NAME_CRF,
    or_(
        NAME,
        IN_MAYBE_FIRST,
        IN_FIRST
    )
).interpretation(
    Name.first.inflected()
).match(gnc)
# Abbreviated first name (an initial): a capitalized abbreviation token.
FIRST_ABBR = and_(
    ABBR,
    TITLE
).interpretation(
    Name.first
).match(gnc)
##########
'стартап', 'нотариальная контора', 'букмекерская контора', 'авиазавод', 'автозавод', 'винзавод', 'подстанция', 'гидроэлектростанция', ]) gnc = gnc_relation() ADJF_PREFIX = rule( or_( rule(gram('ADJF').match(gnc)), # международное rule( # историко-просветительское true(), eq('-'), gram('ADJF').match(gnc), ), ), or_(caseless('и'), eq(',')).optional(), ).repeatable() case = case_relation() GENT_GROUP = rule( gram('gent').match(case) ).repeatable().optional() QUOTED = rule( TYPE, in_(QUOTES), not_(in_(QUOTES)).repeatable(),