def run(self, key, ctx: Context):
    from jsonpath_ng import jsonpath, parse
    from sagas.nlu.inspector_wordnet import predicate
    from sagas.nlu.ruleset_procs import cached_chunks

    lang = ctx.lang
    domain_name = f"{self.domains}_domains"  # like: 'verb_domains'
    parsers = [parse(normal_path(expr)) for expr in self.paths]
    results = []
    engine = cf.engine(lang) if self.engine is None else self.engine
    chunks = cached_chunks(ctx.sents, lang, engine)
    for chunk in chunks[domain_name]:
        json_data = chunk
        for idx, parser in enumerate(parsers):
            word = '/'.join([match.value for match in parser.find(json_data)])
            pred_r = predicate(self.kind, word, lang, self.pos)
            logger.debug(f".. {word} is {self.kind}: {pred_r}")
            results.append(pred_r)
            if pred_r:
                ctx.add_result(self.name(), 'default',
                               f"{self.domains}:{self.paths[idx]}",
                               {'category': self.kind, 'pos': self.pos,
                                **word_values(word, lang)},
                               delivery_type='sentence')
    logger.debug(f"{results}")
    return any(results) if self.match_method == 'any' else all(results)
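# A minimal standalone sketch of the jsonpath extraction step in run() above,
# using a made-up chunk dict in place of one entry of chunks['verb_domains']
# (the real chunk layout comes from cached_chunks and may differ):
from jsonpath_ng import parse

chunk = {'lemma': 'want',
         'xcomp': [{'lemma': 'play', 'upos': 'VERB'},
                   {'lemma': 'music', 'upos': 'NOUN'}]}
parser = parse('xcomp[*].lemma')
# join all matched values, as run() does before calling predicate()
print('/'.join(match.value for match in parser.find(chunk)))  # -> play/music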
def vis_domains(sents, lang, domain=None, engine=None, all_subsents=False):
    """
    >>> from sagas.kit.analysis_kit import vis_domains
    >>> sents = 'What do you think about the war?'
    >>> lang = 'en'
    >>> domain = 'subj_domains'  # or 'verb_domains', 'aux_domains'
    >>> vis_domains(sents, lang, domain)

    :param sents:
    :param lang:
    :param domain:
    :param engine:
    :param all_subsents:
    :return:
    """
    from sagas.nlu.ruleset_procs import cached_chunks, get_main_domains
    from sagas.conf.conf import cf

    engine = cf.engine(lang) if engine is None else engine
    if domain is None:
        domain, domains = get_main_domains(sents, lang, engine)
    else:
        chunks = cached_chunks(sents, lang, engine)
        domains = chunks[domain]
    if len(domains) == 0:
        return None
    if not all_subsents:
        el = domains[0]
        return vis_domains_data(domain, el)
    return [vis_domains_data(domain, el) for el in domains]
def check_clause_sub(sents: Text, lang: Text, domain: Text, cla: Text,
                     rel: Text, cats: Union[Text, Set, List]):
    """
    >>> from sagas.nlu.inspector_clauses import check_clause_sub
    >>> check_clause_sub(sents, 'pt', 'verb_domains', 'obl', 'cop', {'be'})

    :param sents:
    :param lang:
    :param domain:
    :param cla:
    :param rel:
    :param cats:
    :return:
    """
    from sagas.nlu.uni_chunks import get_chunk
    from sagas.nlu.ruleset_procs import cached_chunks

    # e.g. cla='obl', rel='cop', cats={'be'}
    chunks = cached_chunks(sents, lang, cf.engine(lang))
    result = get_chunk(chunks, domain, cla,
                       lambda w: {'rel': w.dependency_relation,
                                  'pos': w.upos.lower(),
                                  'word': f"{w.text}/{w.lemma}"})
    word = next((w['word'] for w in result if w['rel'] == rel), None)
    if word:
        if isinstance(cats, str):
            return check_chain(cats, word, '*', lang)
        return any(check_chain(cat, word, '*', lang) for cat in cats)
    return False
def vis_doc(sents, lang):
    from sagas.nlu.ruleset_procs import cached_chunks
    from sagas.nlu.uni_remote_viz import display_doc_deps
    from sagas.conf.conf import cf

    chunks = cached_chunks(sents, lang, cf.engine(lang))
    return display_doc_deps(chunks['doc'], None)
def test_class_matcher():
    from sagas.nlu.uni_chunks import get_chunk
    from pampy import match, _
    from dataclasses import dataclass

    @dataclass
    class WordData:
        index: int
        rel: str
        pos: str
        word: str

    # "She denied being my mother."
    sents = 'Ela negou ser minha mãe.'
    lang = 'pt'
    domain = 'verb_domains'
    chunks = cached_chunks(sents, lang, cf.engine(lang))
    cla = 'obl'
    ana = get_chunk(chunks, domain, cla,
                    lambda w: WordData(index=w.index,
                                       rel=w.dependency_relation,
                                       pos=w.upos.lower(),
                                       word=f"{w.text}/{w.lemma}"))
    t_rs = []
    for word_data in ana:
        r = match(word_data,
                  WordData(_, _, 'aux', _), lambda *arg: f"aux: {arg[2]}",
                  WordData(_, 'obl', 'noun', _), lambda *arg: arg,
                  _, None)
        t_rs.append(r)
    assert t_rs == ['aux: ser/ser', None, (5, 'mãe/mãe')]
def run(self, key, ctx: Context):
    from sagas.nlu.ruleset_procs import list_words, cached_chunks, get_main_domains
    from sagas.conf.conf import cf

    logger.debug(f".. check against {key}")
    if key not in ctx.indexes:
        return False

    sents = ctx.sents
    lang = ctx.lang
    chunks = cached_chunks(sents, lang, cf.engine(lang))
    doc = chunks['doc']
    ents = get_entities(sents)
    prt = ctx.indexes[key]
    indexes = get_children_index(doc, prt)
    idx_ent = {el['index']: el['entity']
               for el in get_entity_mapping(sents, doc, ents)}
    children_ents = [(idx, idx_ent.get(idx, '_')) for idx in indexes]
    result = self.test_ent in {e[1] for e in children_ents}
    if result:
        ctx.add_result(self.name(), 'default', key, idx_ent)
    return result
def parse(data):
    if 'engine' not in data:
        data['engine'] = cf.engine(data['lang'])
    engine = data['engine']
    response = requests.post(f'{cf.servant(engine)}/verb_domains', json=data)
    if response.status_code == 200:
        return response.json()
    return None
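# Hypothetical usage of parse(), assuming a servant for 'en' is reachable;
# the field names below are illustrative, not a documented response schema:
rs = parse({'lang': 'en', 'sents': 'I want to play music.'})
if rs:
    for chunk in rs:
        print(chunk.get('word'), chunk.get('rel'))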
def get_domains(self, ctx: Context):
    from sagas.nlu.ruleset_procs import cached_chunks
    from sagas.conf.conf import cf

    chunks = cached_chunks(ctx.sents, ctx.lang, cf.engine(ctx.lang))
    domains = chunks[ctx.domain_type]
    return domains
def is_noun_desc(ctx: Context, domain):
    sents, lang = ctx.sents, ctx.lang
    chunks = cached_chunks(sents, lang, cf.engine(lang))
    domains = chunks[domain]
    el = domains[0]
    comps = [k for k, v in el.items() if isinstance(v, list)]
    logger.debug(f'.. {comps}')
    # a noun description: the head is a noun and every list-typed component
    # is a modifier ('amod', 'nmod', ...) or punctuation
    return el['upos'] == 'NOUN' and \
        all(c.endswith('mod') or c == 'punct' for c in comps)
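# A worked example of the predicate above, with a made-up domain dict: the
# only list-typed components are a modifier and punctuation, so the phrase
# counts as a noun description.
domain = {'upos': 'NOUN', 'word': 'dancer',
          'amod': [{'word': 'best'}], 'punct': [{'word': '.'}]}
comps = [k for k, v in domain.items() if isinstance(v, list)]
assert all(c.endswith('mod') or c == 'punct' for c in comps)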
def run(self, key, ctx: Context):
    from sagas.nlu.ruleset_procs import list_words, cached_chunks, get_main_domains
    from sagas.conf.conf import cf

    chunks = cached_chunks(ctx.sents, ctx.lang, cf.engine(ctx.lang))
    index = next((x[1] for x in ctx.domains if x[0] == self.part), -1)
    if index != -1:
        rs = self.collect_children(chunks, ctx.lang, index + 1)
        if rs:
            ctx.add_result(self.name(), 'default', self.part, rs)
            return True
    return False
def parse_comps(sents, source):
    sents = fix_sents(sents, source)
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    if len(resp['predicts']) > 0:
        rs = resp['predicts']
    else:
        rs = get_chunks(doc_jsonify)
    return rs
def analyse_domains(self, sents, lang, engine=None, domain=None):
    from sagas.nlu.ruleset_procs import cached_chunks, get_main_domains
    from sagas.conf.conf import cf

    engine = cf.engine(lang) if engine is None else engine
    if domain is None:
        domain, domains = get_main_domains(sents, lang, engine)
    else:
        chunks = cached_chunks(sents, lang, engine)
        domains = chunks[domain]
    return domains
def get_feats_map(sents, lang, domain, path):
    from sagas.nlu.ruleset_procs import cached_chunks

    domain_name = f'{domain}_domains' if domain != 'predicts' else domain
    chunks = cached_chunks(sents, lang, cf.engine(lang))
    parser = parse(feats_for_path(path))
    results = []
    for chunk in chunks[domain_name]:
        vals = [match.value for match in parser.find(chunk)]
        if vals:
            results.extend([feats_map(val) for val in vals])
    return results
def check_aux(self, sents, lang):
    """
    $ python -m sagas.nlu.predicts_cli check_aux 'what will be the weather in three days?' en

    :param sents:
    :param lang:
    :return:
    """
    data = {'lang': lang, 'sents': sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    domains = get_aux_domain(doc_jsonify)
    ps = PredictSamples()
    tc.emp('cyan', f"result: {ps.check_domains(domains, lang)}")
def check_rule(self, sents, lang, rule, engine=None):
    """
    $ nlu check_rule '彼のパソコンは便利じゃない。' ja \
        "subj('adj', ガ=kindof('artifact', 'n'))"

    :param sents:
    :param lang:
    :param rule:
    :param engine:
    :return:
    """
    from sagas.tool.dynamic_rules import DynamicRules

    data = {'lang': lang, 'sents': sents}
    DynamicRules().predict(data, rule, engine=engine or cf.engine(lang))
def check_subj(self, sents, lang):
    """
    $ python -m sagas.nlu.predicts_cli check_subj 'Яблоко - это здоровый фрукт.' ru

    :param sents:
    :param lang:
    :return:
    """
    data = {'lang': lang, 'sents': sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    domains = get_subj_domain(doc_jsonify)
    ps = PredictSamples()
    tc.emp('cyan', f"result: {ps.check_domains(domains, lang)}")
def parse_subj():
    lang = 'ru'
    # "An apple is a healthy fruit."
    sents = 'Яблоко - это здоровый фрукт.'
    st.write(sents)
    data = {'lang': lang, 'sents': sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    domains = get_subj_domain(doc_jsonify)
    testing(domains, 'ru')
    gv = display_doc_deps(doc_jsonify, resp)
    st.graphviz_chart(gv)
    st.write(domains)
def parse_aux():
    lang = 'en'
    sents = 'what will be the weather in three days?'
    st.write(sents)
    data = {'lang': lang, 'sents': sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    domains = get_aux_domain(doc_jsonify)
    testing(domains, 'en')
    # show the dependency-analysis graph
    gv = display_doc_deps(doc_jsonify, resp)
    st.graphviz_chart(gv)
    st.write(domains)
def parse_deps(text, lang, translit=None):
    text = fix_sents(text, lang)
    engine = cf.engine(lang)
    doc_jsonify, resp = dep_parse(text, lang, engine, ['predicts'])
    if doc_jsonify is not None:
        list_chunks(doc_jsonify, resp, lang, enable_contrast=True)
        g = display_doc_deps(doc_jsonify, resp, translit_lang=lang)
        st.graphviz_chart(g)
        if translit is not None:
            st.text(f"♤ {translit}")
        words = [word.text for word in doc_jsonify.words]
        tools.contrast(text, lang, word_map=words)
def get_source(sents, lang, domain_type=None) -> Observable:
    from sagas.nlu.ruleset_procs import cached_chunks, get_main_domains
    from sagas.conf.conf import cf
    import rx

    engine = cf.engine(lang)
    if domain_type is None:
        domain_type, domains = get_main_domains(sents, lang, engine)
    else:
        chunks = cached_chunks(sents, lang, engine)
        domains = chunks[domain_type]
    table_rs = []
    for ds in domains:
        flat_table(ds, '', table_rs)
    return rx.of(*table_rs)
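# For reference, the rx pattern used above in isolation: rx.of(*rows) wraps an
# already-materialized list in an Observable that replays each row to its
# subscribers (the rows here are stand-ins for flat_table output):
import rx

rows = [('nsubj', 'I'), ('obj', 'music')]
rx.of(*rows).subscribe(lambda row: print(row))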
def descrip(self, sents, lang, engine=None):
    """
    $ python -m sagas.nlu.anal_corpus descrip 'Karpet di kantor saya abu-abu.' id
    $ sid 'Celana ini bisa diperbesar.'

    :param sents:
    :param lang:
    :param engine:
    :return:
    """
    from sagas.nlu.anal import build_anal_tree, Doc, AnalNode
    from sagas.conf.conf import cf

    f = build_anal_tree(sents, lang, engine or cf.engine(lang))
    f.draw()
    model = f.model()
    model_info(model)
def ex_chunk(key: Text, cnt: Text, comp: Text, ctx: cla_meta_intf, clo):
    from sagas.nlu.uni_chunks import get_chunk
    from sagas.nlu.ruleset_procs import list_words, cached_chunks
    from sagas.conf.conf import cf

    # e.g. get_chunk(chunks, 'verb_domains', 'xcomp/obj', lambda w: w.upos)
    chunks = cached_chunks(ctx.sents, ctx.lang, cf.engine(ctx.lang))
    domain, path = key.split(':')
    result = get_chunk(chunks,
                       f'{domain}_domains' if domain != 'predicts' else domain,
                       path, clo=clo)
    logger.debug(f"extract chunk: {domain}, {path}, {result}")
    if len(result) > 0:
        ctx.add_result(extractor, comp, key, result)
        return True
    return False
def clip_parse(self, source, sents='', specified='default', do_test=False):
    """
    >> clip text: یک آبجو مى خواهم.
    $ nlu clip_parse fa
    $ engine='stanford' nluc ar
    $ nlu clip_parse fi 'Tuolla ylhäällä asuu vanha nainen.'
    $ nluc nl 'De vrouw heeft verschillende appels.'
    $ nluc id 'Ini adalah judul buku yang saya baca.' aux
    $ nluc fi 'Voiko täältä lainata aurinkovarjoa?' default True

    :param source:
    :return:
    """
    from sagas.nlu.uni_remote import dep_parse
    from sagas.nlu.common import get_from_clip
    from sagas.conf.conf import cf
    from sagas.nlu.uni_remote_viz import list_chunks
    from sagas.nlu.utils import fix_sents

    if sents == '':
        sents = get_from_clip()
        if sents.strip() == '':
            tc.info('no text available in clipboard.')
            return
    sents = fix_sents(sents, source)
    tc.info(sents)

    # parse the sentence, then display its chunks, domains and contrast translations
    engine = cf.engine(source)
    doc_jsonify, resp = dep_parse(sents, source, engine, ['predicts'])
    if doc_jsonify is None:
        raise Exception(f'Cannot parse sentence for lang {source}')
    list_chunks(doc_jsonify, resp, source, enable_contrast=True,
                specified=None if specified == 'default' else specified)
    words = [word.text for word in doc_jsonify.words]
    self.contrast(sents, source, word_map=words)

    # visual tree
    self.main_domains(sents, source, engine, False)

    # apply ruleset procs
    from sagas.nlu.inferencer import do_infers
    cli_cmd, pats = do_infers(sents, source)
    if do_test:
        for pat in pats:
            self.check_rule(sents, source, pat)
def print_sents(self, sents, lang, engine=None):
    """
    $ python -m sagas.nlu.ruleset_procs print_sents 'I want to play music.' en
    $ python -m sagas.nlu.ruleset_procs print_sents "クモは4つの右の目をしています。" ja corenlp

    :param sents:
    :param lang:
    :param engine:
    :return:
    """
    if engine is None:
        engine = cf.engine(lang)
    data = {'lang': lang, 'sents': sents, 'engine': engine}
    doc_jsonify, resp = parse_sents(data)
    rs = get_chunks(doc_jsonify)

    delim = '' if lang in non_spaces else ' '
    for serial, r in enumerate(rs):
        meta = build_meta(r, data)
        domains = r['domains']
        # group the domain tuples by relation; keys may repeat
        keys = {x[0] for x in domains}
        grp = lambda p, idx: [x[idx] for x in domains if x[0] == p]
        tokens = {x: grp(x, 2) for x in keys}
        words = {x: delim.join(grp(x, 2)) for x in keys}
        lemmas = {x: delim.join(grp(x, 3)) for x in keys}
        print('meta keys', meta.keys())
        print('tokens', tokens)
        print('words', meta['word'], words)
        print('lemmas', lemmas)

        ctx = Context(meta, domains)
        print('chunks', ctx._chunks)

    g = display_doc_deps(doc_jsonify, resp, translit_lang=lang)
    print(*[(w.index, w.text, w.governor,
             doc_jsonify.words[w.governor - 1].text)
            for w in doc_jsonify.words], sep='\n')
    tc.gv(g)
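# The grouping idiom from print_sents (and from Context below) in isolation,
# with made-up domains tuples of the assumed shape (rel, index, word, lemma):
domains = [('obj', 3, 'the', 'the'), ('obj', 4, 'music', 'music')]
keys = {x[0] for x in domains}
grp = lambda p, idx: [x[idx] for x in domains if x[0] == p]
print({x: grp(x, 2) for x in keys})            # {'obj': ['the', 'music']}
print({x: ' '.join(grp(x, 2)) for x in keys})  # {'obj': 'the music'}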
def doc(self, sents, lang, engine=None):
    """
    $ nlu doc 'これを作ってあげました。' ja analspa
    $ nlu doc '主FAX番号はありますか' ja analspa
    $ nlu doc '你在北京的公司的主要传真号码是什么' zh analz
    $ nlu doc '你在北京的公司的主要传真号码是什么' zh analspa
    $ nlu doc "Alex Smith was working at Acme Corp Inc." en spacy
    $ nlu doc 'this is a digital good' en

    :param sents:
    :param lang:
    :param engine:
    :return:
    """
    from sagas.nlu.ruleset_procs import parse_sents

    data = {'lang': lang, 'sents': sents, 'engine': engine or cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    pprint(doc_jsonify.as_json)
def run(self, key: Text, ctx: Context) -> bool:
    from sagas.nlu.predicts import predicate
    from sagas.nlu.operators import ud

    final_rs = []
    sents, lang = ctx.sents, ctx.lang
    chunks = cached_chunks(sents, lang, cf.engine(lang))
    domains = chunks[self.domain]
    for el in domains:
        # a checker looks like:
        #   ud.__text('will') >> [ud.nsubj('what'), ud.dc_cat('weather')]
        rs: List[Any] = predicate(el, self.checker, lang)
        result = all(r[0] for r in rs)
        final_rs.append(result)
        logger.debug(f'{[r[0] for r in rs]}, {result}')
    return any(final_rs)
def asserts(self, sents, lang='en'):
    """
    $ python -m sagas.nlu.ruleset_procs asserts 'I want to play music.' en

    :param sents:
    :param lang:
    :return:
    """
    import sagas.nlu.ruleset_fixtures as rf

    data = {'lang': lang, 'sents': sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    v_domains = get_verb_domain(doc_jsonify)
    host = create_host()
    for v in v_domains:
        r1 = host.assert_fact('verbs', v)
        pprint(r1)
def verb_domains(self, sents, lang='en', engine=None):
    """
    $ python -m sagas.tool.misc verb_domains 'Мы написали три книги за год.' ru
    $ python -m sagas.tool.misc verb_domains 'Ivan is the best dancer .' en
    $ python -m sagas.tool.misc verb_domains 'Ivan is the best dancer .' en spacy
    $ domains 'Die Aufnahmen begannen im November.' de
    $ domains '伊万是最好的舞者' zh ltp
    $ domains '现在是几点' zh ltp
    $ domains '现在是几点?' zh corenlp

    :param sents:
    :param lang:
    :param engine:
    :return:
    """
    data = {'lang': lang, 'sents': sents, 'engine': engine or cf.engine(lang)}
    get_verb_domains(data)
def has_pos_in_part(self, part: Text, pos: Union[list, str], ctx: Context):
    from sagas.nlu.uni_chunks import get_chunk
    from sagas.nlu.ruleset_procs import list_words, cached_chunks
    from sagas.conf.conf import cf

    chunks = cached_chunks(ctx.sents, ctx.lang, cf.engine(ctx.lang))
    domain, path = part.split(':')
    result = get_chunk(chunks,
                       f'{domain}_domains' if domain != 'predicts' else domain,
                       path,
                       lambda w: (w.upos.lower(), w.text))
    if isinstance(pos, str):
        pos = [pos]
    succ = False
    for el in result:
        if el[0] in pos:
            ctx.add_result(self.name(), f'has_pos_{"_or_".join(pos)}', part, el[1])
            succ = True
    return succ
def __init__(self, meta, domains, name=''):
    # each entry of domains is a tuple: (rel, index, word, lemma, chunk, feats)
    self.meta = meta
    self.name = name
    self.domains = domains
    self.domain_type = meta['domain_type']
    self.domain_name = self.domain_type.replace('_domains', '')
    self._chunks = [Chunk(x[0], x[4]) for x in domains]
    # all universal syntactic relations in this sentence
    self.rels = {x[0] for x in domains}
    self._stems = meta['stems']
    if len(self._stems) == 0:
        self._stems = [(x[0], x[4]) for x in domains]
    self._lang = meta['lang']
    self.delim = '' if self._lang in non_spaces else ' '
    self._sents = meta['sents'] if 'sents' in meta else ''
    self._engine = meta['engine'] if 'engine' in meta else cf.engine(self._lang)

    # group by relation to support repeated keys
    keys = {x[0] for x in domains}
    self.indexes = {x[0]: x[1] for x in domains}
    grp = lambda p, idx: [x[idx] for x in domains if x[0] == p]
    grp_join = lambda p, idx1, idx2: [f"{x[idx1]}/{x[idx2]}"
                                      for x in domains if x[0] == p]
    self.tokens = {x: grp_join(x, 2, 3) for x in keys}
    self.words = {x: self.delim.join(grp(x, 2)) for x in keys}
    self.lemmas = {x: self.delim.join(grp(x, 3)) for x in keys}
    self.feats = {x[0]: x[5] for x in domains}
    self._results = []