def display_doc_deps(doc_jsonify, resp=None, translit_lang=None):
    tc.emp('cyan', f"✁ dependency-graph. {'-' * 25}")
    cv = EnhancedViz(shape='egg', size='8,5', fontsize=20,
                     translit_lang=translit_lang)
    return cv.analyse_doc(doc_jsonify, None, console=False)
def nlu_parse(self, sents, lang='en'):
    """ NLU parse routines.
    $ python -m saai.saai_cli nlu_parse "Shenzhen ist das Silicon Valley für Hardware-Firmen" de
    $ python -m saai.saai_cli nlu_parse '附近有什么好吃的' zh
    $ python -m saai.saai_cli nlu_parse '六安市公安局裕安分局平桥派出所接到辖区居民戴某报警' zh
    $ python -m saai.saai_cli nlu_parse '一直打喷嚏怎么办' zh
    $ python -m saai.saai_cli nlu_parse "I was born in Beijing." en
    $ python -m saai.saai_cli nlu_parse "Я хочу поехать в москву" ru
    $ python -m saai.saai_cli nlu_parse "Jokowi pergi ke Singapura." id

    :param sents:
    :param lang:
    :return:
    """
    from sagas.conf.conf import cf
    from sagas.nlu.rasa_procs import invoke_nlu
    import json
    import sagas.tracker_fn as tc

    endpoint = cf.ensure('nlu_multilang_servant')
    print('.. with endpoint', endpoint)
    result = invoke_nlu(endpoint, lang, "current", sents)
    tc.emp('yellow', result)
    if result is not None and len(result) > 0:
        print(json.dumps(result, indent=4, ensure_ascii=False))
        intent = result["intent"]
        print('%s -> %f' % (intent['name'], intent['confidence']))
        entities = result['entities']
        print('entities ->', [ent['entity'] for ent in entities])
def process_intents(self, sents, lang, intents, do_action: bool):
    from sagas.nlu.ruleset_actions import ruleset_actions

    print('intents: ', intents)
    action_binds = ruleset_actions.get_intents()
    if self.verbose:
        pprint(action_binds)

    schedules = []
    for intent_item in intents:
        intent = intent_item['intent']
        acts = [ac['action'] for ac in action_binds if ac['intent'] == intent]
        tc.emp('green', f"action for intent {intent}: {acts}")
        if len(acts) > 0:
            schedules.append({
                'intent': intent,
                'action': acts,
                'sents': sents,
                'lang': lang,
                'object_type': intent_item['object_type'],
                'parameters': {},
            })
    if len(schedules) > 0:
        self.invoke_actions(schedules, do_action)
    else:
        tc.emp("yellow", 'no scheduled actions.')
def color_print(color: str, text):
    # from termcolor import colored
    if isinstance(text, list):
        for t in text:
            tc.emp(color, t)
    else:
        tc.emp(color, text)
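# Usage sketch for color_print (illustrative only): it accepts either a single
# string or a list of strings and forwards each item to tc.emp with the color.
#   color_print('green', 'done.')
#   color_print('yellow', ['step 1', 'step 2'])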
def corpus_search_bar():
    from sagas.corpus.searcher import CorpusSearcher, search_in_list
    from augmentor.viz_fun import parse_deps

    text = st.text_input("Input a sentence", "I eat rice")
    langs = select_langs()
    top_result = st.number_input('Insert a number', value=5)
    searcher = CorpusSearcher(
        model_file=f'{cf.conf_dir}/stack/spacy-2.2/data/embedded_corpus.pkl')
    relevant_quotes, relevant_chapters = searcher.search(
        text, ['text', 'chapter'], top_result)
    for q in range(top_result):
        # tc.emp('magenta', '>' + relevant_quotes[q])
        st.subheader('>' + relevant_quotes[q])
        tc.emp('green', relevant_chapters[q])
        if langs is not None:
            # search_in_list('I write a letter.', ['ja', 'fa', 'id'])
            results = search_in_list(relevant_quotes[q], langs)
            if st.checkbox(f"{q}. show result in lang {', '.join(langs)}"):
                st.write(results)
            sent_rs = parse_controls(results)
            if st.button(f"{q}. parse sents in lang {', '.join(langs)}"):
                for sent in sent_rs:
                    parse_deps(sent[0], sent[1], sent[2])
    searcher.end()
def retrieve(word, indicator, pos='*'):
    from sagas.nlu.synonyms import synonyms

    word_syn = synonyms.match(word, lang)
    # print(f".. subs {word}: {word_syn}")
    if word_syn is not None:
        rs = retrieve_word_info('get_synsets', word_syn, 'en', pos=pos)
    else:
        rs = retrieve_word_info('get_synsets', word, lang, pos=pos)
    if len(rs) > 0:
        mean = get_possible_mean(rs)
        if collect:
            resp.append({'word': word, 'indicator': indicator,
                         'spec': mean, 'comments': rs})
        else:
            comments = ', '.join(rs)[:25]
            # tc.info('♥ %s(%s): %s...' % (colored(word, 'magenta'), indicator, comments))
            tc.emp('magenta',
                   '♥ %s(%s, %s): %s...' % (word, indicator, mean, comments))
            resp.append('♥ %s(%s): %s...' % (word, indicator, comments))
        return True
    return False
def execute(self):
    '''
    The default implementation of the execute method.
    :return:
    '''
    if len(self.matched) > 0:
        matched_info = {k: len(v.results) for k, v in self.matched.items()}
        tc.emp('blue', f"♯ matched id rules: {matched_info}")
def display_root_predicate(doc_jsonify, resp):
    from sagas.nlu.ruleset_procs import root_predicate

    root_pred = root_predicate(doc_jsonify,
                               resp['predicts'] if 'predicts' in resp else [])
    if root_pred:
        tc.emp('yellow',
               f".. root predicate: {root_pred['index']}.{root_pred['lemma']}")
def match_agent(target, agent, verbose=False):
    import sagas.tracker_fn as tc

    rset = [(cond, target.match(cond)) for cond in agent.meta.cond]
    succ = all([r for c, r in rset])
    if verbose:
        tc.emp('green' if succ else 'white',
               '✔' if succ else '✘', agent.meta.ds.name, rset)
    return succ
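# Illustrative call for match_agent, with hypothetical objects: `target` only
# needs a match(cond) method and `agent.meta.cond` an iterable of conditions;
# the agent matches when every condition holds.
#   ok = match_agent(anal_target, weather_agent, verbose=True)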
def root_tree(self):
    from sagas.nlu.nlu_tools import vis_tree
    from sagas.nlu.ruleset_procs import cached_chunks

    chunks = cached_chunks(self.meta.sents,
                           source=self.meta.lang,
                           engine=self.meta.engine)
    tc.emp('cyan', f"✁ root tree {self.meta.engine} {'-' * 25}")
    ds = chunks['root_domains'][0]
    vis_tree(ds, self.meta.lang, trans=cf.is_enabled('trans_tree'))
def check_domains(self, domains, lang):
    final_rs = []
    for el in domains:
        tc.emp('yellow', f"`{el['lemma']}` >> *{el['dc']['lemma']}*")
        r1 = predicate(el,
                       ud.__text('will') >> [ud.nsubj('what'), ud.dc_cat('weather')],
                       lang)
        # r2 = predicate(el, ud.__cat('be') >> [ud.nsubj('what'), ud.dc_cat('animal/object')], lang)
        result = all([r[0] for r in r1])
        final_rs.append(result)
        tc.emp('green' if result else 'red', [r[0] for r in r1], result)
    return any(final_rs)
def _descriptors(results: List[Any], data: Dict[Text, Any]):
    from sagas.nlu.descriptor import Descriptor

    logger.debug(f"data keys: {list(data.keys())}")
    dsp = Descriptor()
    pats = dsp.build(results)
    tc.emp('cyan', f"✁ render patterns {len(pats)}. {'-' * 25}")
    for i, pat in enumerate(pats.values()):
        tc.emp('magenta', f"{i}. {pat}")
    if cf.is_enabled('print_detail'):
        pprint(list(dsp.value_map.values()) or '.. no rendered results')
def get_domains(sents, lang, engine='corenlp', options=None):
    """
    >>> from sagas.nlu.legacy.aiobj_kit import get_domains
    >>> get_domains('你有几台笔记本电脑?', 'zh', 'ltp')
    >>> get_domains('列出上周编辑的文件。', 'zh', 'ltp', DomainGetOptions(enable_predicts=True))

    :param sents:
    :param lang:
    :param engine:
    :param options:
    :return:
    """
    # from IPython.display import display
    if options is None:
        options = DomainGetOptions()
    pipelines = ['predicts'] if options.enable_predicts else []
    doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
    result_set = []
    if doc_jsonify is not None:
        tc.emp('cyan', resp)
        if resp is not None and 'predicts' in resp and len(resp['predicts']) > 0:
            rs = resp['predicts']
            # print(rs)
        else:
            # print(doc_jsonify.words_string())
            rs = get_chunks(doc_jsonify)

        if len(rs) > 0:
            if options.list_chunks:
                list_rs(rs, lang)
            if options.deps_graph:
                # display(display_doc_deps(doc_jsonify, resp))
                tc.gv(display_doc_deps(
                    doc_jsonify, resp,
                    translit_lang=lang if lang in ('ja', 'ko', 'zh', 'fa', 'ar', 'he') else None))
            # rs_represent(rs, data={'lang': lang, "sents": sents, 'engine': engine,
            #                        'pipelines': pipelines})
            data = {'lang': lang, "sents": sents, 'engine': engine,
                    'pipelines': pipelines}
            for r in rs:
                # fixture.print_table(r, False)
                # print(f"lemma: {r['lemma']}")
                # df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
                # display(df)
                domains = r['domains']
                common = {'lemma': r['lemma'], 'word': r['word'],
                          'stems': r['stems']}
                meta = {'rel': r['rel'], **common, **data}
                result_set.append((domains, meta))
        else:
            tc.emp('red', '.. no predefined chunk-patterns found.')
            tc.info(doc_jsonify.words_string())
            tc.info(doc_jsonify.dependencies_string())
    return result_set
def check_aux(self, sents, lang):
    """
    $ python -m sagas.nlu.predicts_cli check_aux 'what will be the weather in three days?' en

    :param sents:
    :param lang:
    :return:
    """
    data = {'lang': lang, "sents": sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    domains = get_aux_domain(doc_jsonify)
    ps = PredictSamples()
    tc.emp('cyan', f"result: {ps.check_domains(domains, lang)}")
def do_infers(text: Text, source: Text) -> (Text, List[Text]):
    infers = Inferencer(source)
    pats = infers.infer(text)

    # generate cli command
    shortcuts = {'ja': 'sj', 'zh': 'sz'}
    cli_head = shortcuts[source] if source in shortcuts else f"s{source}"
    cli_cmd = f"# $ {cli_head} '{text}'"
    tc.emp('white', cli_cmd)

    for pat in pats:
        tc.emp('yellow', pat)
    return cli_cmd, pats
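# Illustrative call for do_infers (Inferencer behaviour assumed): it prints the
# cli shortcut line (e.g. "# $ sj '...'") plus each inferred pattern and
# returns both.
#   cmd, patterns = do_infers('私は学生です。', 'ja')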
def check_subj(self, sents, lang):
    """
    $ python -m sagas.nlu.predicts_cli check_subj 'Яблоко - это здоровый фрукт.' ru

    :param sents:
    :param lang:
    :return:
    """
    data = {'lang': lang, "sents": sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    domains = get_subj_domain(doc_jsonify)
    ps = PredictSamples()
    tc.emp('cyan', f"result: {ps.check_domains(domains, lang)}")
def view_file(self, file, lang='en'):
    """
    $ python -m saai.nlu_data_procs view_file ./nlu_multilang/en/nlu_data.md
    $ python -m saai.nlu_data_procs view_file /pi/ws/knowledgebasebot/data/nlu.md

    :param file:
    :param lang:
    :return:
    """
    from pprint import pprint

    # files = ['./nlu_multilang/en/nlu_data.md']
    td = training_data_from_paths([file], language=lang)
    print('.. examples')
    # print(*[e.text for e in td.training_examples], sep='\n')
    print(*[(e.get("intent"), e.text) for e in td.training_examples], sep='\n')
    tc.emp('green', '.. intents')
    for intent in td.intents:
        tc.emp('yellow', f" - {intent}")
    tc.emp('green', '.. entities')
    print(td.entities)
    tc.emp('green', '.. lookup_tables')
    pprint(td.lookup_tables)
def proc_word(type_name, word, head, lang):
    from sagas.nlu.translator import translate_try

    res, _ = translate_try(word, source=lang, target=target_lang(lang))
    target = ''
    if head != '':
        res_t, _ = translate_try(head, source=lang, target=target_lang(lang),
                                 options={'disable_correct'})
        target = f" ⊙︿⊙ {res_t}({head}{translit_chunk(head, lang)})"
        # target = f" ⊙︿⊙ {res_t}({head})"
    result = f"[{type_name}]({word}{translit_chunk(word, lang)}) {res}{target}"
    tc.emp('magenta', result)
    return [result]
def slots(self, bucket: Text, user=None):
    """
    $ preqs: launch sagas-ai/bots/agent_dispatcher/Procfile_mod
    $ saai talk '/dump_info{"object":"rr"}' samlet_default
    $ python -m sagas.nlu.sinker_rasa_tracker slots default
    $ sj '新幹線で東京から大阪まで行きました。'
    $ python -m sagas.nlu.sinker_rasa_tracker slots transport

    :param bucket:
    :param user:
    :return:
    """
    import sagas.tracker_fn as tc

    values = rasa_tracker_store.slot_values(bucket, user)
    tc.emp('green', [f"{k}: {v}" for k, v in values.items() if v is not None])
def proc_children_column(partcol, textcol, lang, indent='\t'):
    from sagas.nlu.translator import translate_try

    result = []
    # print(partcol, textcol)
    for id, (name, r) in enumerate(zip(partcol, textcol)):
        if name not in ('punct', 'head_root'):
            # if len(r) > 1:
            # sent = ' '.join(r) if lang not in ('ja', 'zh') else ''.join(r)
            sent = join_text(r, lang)
            res, _ = translate_try(sent, source=lang, target=target_lang(lang),
                                   options={'disable_correct'})
            chunk = f"{indent}[{name}]({sent}{translit_chunk(sent, lang)}) {res}"
            result.append(chunk)
            tc.emp('cyan', chunk)
    return result
def list_rs(rs, lang):
    # from IPython.display import display
    from termcolor import colored

    tc.emp('cyan', f"✁ chunks. {'-' * 25}")
    for serial, r in enumerate(rs):
        df = sagas.to_df(r['domains'],
                         ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s/%s(%s)" % (r['head_word'], r['head'], r['head_pos'])
        else:
            cla = '_'
        tc.info(serial_numbers[serial], '%s(%s)' % (r['type'], r['lemma']), cla)
        # sagas.print_df(df)
        tc.dfs(trunc_cols(df))
        print_stem_chunks(r)
        list_synsets(r, lang)
def console_watch(sender, **kw):
    import datetime
    from sagas.nlu.nlu_tools import NluTools

    ds: ResultDataset = kw['dataset']
    meta: RequestMeta = kw['meta']
    print(f"****** watch {sender}")
    tc.emp('magenta', meta)

    tools = NluTools()
    if cf.is_enabled('print_tree'):
        tools.main_domains(meta.sents,
                           lang=meta.lang,
                           engine=meta.engine,
                           print_domains=False)
    return datetime.datetime.now()
def get_word_def(self, word, lang='en', pos='*'):
    """Print the word's definition and inheritance chains to the terminal.

    $ python -m sagas.nlu.nlu_cli get_word_def menina pt n
    $ python -m sagas.nlu.nlu_cli get_word_def cepillar es
    $ python -m sagas.nlu.nlu_cli get_word_def Krieg de
    $ def krieg de
    $ def iste tr v
    $ def 建筑 zh v

    :param word:
    :param lang:
    :param pos:
    :return:
    """
    from termcolor import colored

    resp = get_word_sets(word, lang, pos)
    if resp['result'] == 'success':
        sets: List[Any] = resp['data']
        if sets:
            for s in sets:
                print("%s -> %s" % (colored(s['name'], 'green'), s['definition']))
                for exa in s['examples']:
                    print('\t', exa)
                domains = s['domains']
                print('\t', domains)
                # print('\t', s['lemmas'])
                for key, les in s['lemmas'].items():
                    if les:
                        # print('\t', '[%s] %s' % (key, ', '.join('_' if w is None else w for w in les)))
                        print('\t', '[%s] %s' % (key, ', '.join(les)))
        else:
            tc.emp('red', f'no synsets for {word}.')

    if lang != 'en':
        print(colored('✁ --------------------------', 'red'))
        word_r = self.get_word_trans(word, lang, pos)
        if word_r:
            tc.emp('cyan', f"3. chains for {word_r}:")
            self.get_chains(word_r, 'en', pos=pos)
    print(colored('✁ --------------------------', 'red'))
    self.get_chains(word, lang, pos)
def model_info(model):
    """
    >>> from sagas.nlu.anal import build_anal_tree, Doc, AnalNode
    >>> from sagas.nlu.anal_corpus import model_info
    >>> f = build_anal_tree('We expect them to change their minds', 'en', 'stanza')
    >>> f.draw()
    root: expect (expect; , verb)
    |-- nsubj: We (we, pron)
    |-- obj: them (they, pron)
    |-- xcomp: change (change; , verb)
    |   |-- mark: to (to, part)
    |   +-- obj: minds (mind, noun)
    |       +-- nmod:poss: their (they, pron)
    +-- punct: . (., punct)
    >>> model = f.rels('xcomp')[0].model()
    >>> model_info(model)

    :param model:
    :return:
    """
    tc.emp('cyan', type(model).__name__, '-' * 10, '✁')
    target = model.target
    if target:
        tc.emp('cyan', '1. target:', target.spec(), target.axis, target.types)
    else:
        tc.emp('white', '1. no target.')
    # tc.emp('white', f.model())

    if isinstance(model, Behave):
        subj = model.subj.types if model.subj and not model.subj.is_pron() else '_'
        indicators = []
        if model.negative:
            indicators.append('not')
        if model.behave.pred_enable:
            indicators.append('enable')
        behave_ds = model.behave.types or model.behave.spec() or model.behave.axis
        tc.emp('white',
               f"2. {model.behave.lemma}[{','.join(indicators)}]: {behave_ds} ☜ {subj}")
    elif isinstance(model, Desc):
        tc.emp('white', f"2. desc: {model.desc.types or model.desc.spec('*')}")
def induce_pattern(self, pat: DomainToken, ds, enable_verbose=False) -> Text:
    if enable_verbose:
        tc.emp('magenta', pat)

    def gen_verb(ind='verb', prefix='behave'):
        spec = [d for d in ds if d['indicator'] == f'[{ind}]']
        if spec:
            ref = spec[0]['spec']
        else:
            ref = pat.translate.replace(' ', '_')
        return f"pat(5, name='{prefix}_{ref}').verb"

    def gen_root():
        spec = [d for d in ds if d['indicator'] == '[root]']
        if spec:
            ref = spec[0]['spec']
        else:
            ref = pat.translate.replace(' ', '_')
        return f"pat(5, name='ana_{ref}').root"

    def gen_cop():
        spec = [d for d in ds if d['indicator'] == 'head']
        if spec:
            ref = spec[0]['spec']
        else:
            ref = (pat.head_trans.replace(' ', '_')
                   if pat.lang != 'en' else pat.head.replace(' ', '_'))
        return f"pat(5, name='desc_{ref}').cop"

    domap = {
        'verb_domains': gen_verb,
        'aux_domains': gen_cop,
        'subj_domains': gen_cop,
        'root_domains': gen_root,
        'predicate': lambda: gen_verb('predicate', 'predict'),
    }
    return domap[pat.type]().lower()
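# Illustrative result for induce_pattern (assumed inputs): for a token of type
# 'verb_domains' whose descriptor list contains
# {'indicator': '[verb]', 'spec': 'consume'}, the induced text would be
# "pat(5, name='behave_consume').verb", lower-cased as returned via domap.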
def _(self, lang, text, *sents):
    tc.info(type(self).__name__, isinstance(self, Keeper), text, lang)
    # data = {'lang': lang, "sents": text, 'engine': 'corenlp', 'disable_predicts': False}
    # domains, meta = self.request_domains(data)
    # engine = 'ltp' if lang == 'zh' else 'corenlp'
    engine = cf.engine(lang)
    domain_set = get_domains(text, lang, engine)
    for domains, meta in domain_set:
        # print(f"{meta['lemma']} ({meta['phonetic']}, {meta['word']})")
        # print(f"{meta['lemma']}")
        # execute rulesets
        tc.info('rules', [r.name for r in self.rulesets])
        for i, ruleset in enumerate(self.rulesets):
            # print(colored(f"✁ {i}. {'-' * 25}", 'cyan'))
            tc.emp('cyan', f"✁ {i}. {'-' * 25}")
            rule_rs = ruleset(domains, meta, self, sents)
            display_result_df(rule_rs)

    if isinstance(self, Keeper):
        return self.callback(text)
    return None
def rs_repr(rs, data):
    from pprint import pprint
    from sagas.nlu.sinkers import Sinkers

    feats = []
    sinkers = Sinkers(data, rs[0]['type'])
    for serial, r in enumerate(rs):
        # common = {'lemma': r['lemma'], 'word': r['word'], 'stems': r['stems']}
        # meta = {'rel': r['rel'], **common, **data}
        meta = build_meta(r, data)
        lang = data['lang']
        # if lang in lang_specs:
        #     lang_specs[lang](meta, r['domains'])
        # else:
        #     tc.emp('red', f'no special patterns for lang {lang}')
        mod_rs = langspecs.check_langspec(lang, meta, r['domains'],
                                          type_name=r['type'])
        tc.emp('magenta', f"✁ {'_' * 40} {lang}.result")
        if len(mod_rs) > 0:
            sinkers.add_module_results(mod_rs)
            for mod, matched in mod_rs.items():
                matched_info = {k: len(v.results) for k, v in matched.items()}
                for ctx in matched.values():
                    # matched value type is Context
                    pprint(ctx.results)
                tc.emp('yellow', f"{mod} -> {matched_info}")
                feats.extend(matched.keys())

    tc.emp('green', f"features -> {feats}")
    sinkers.process_with_sinkers()
def __call__(self, domains, meta, ctx=None, param_sents=None):
    rule_rs = self.rules(domains, meta)
    # .. parts {'sbv': '你', 'vob': '电脑', 'wp': '?'}
    tc.info('.. parts', {k: v for k, v in rule_rs[0][3].lemmas.items()})

    if all([val[1] for val in rule_rs]):
        results = [el for r in rule_rs for el in r[3].results]
        # .. results
        # ('ins_rasa', 'vob', {'intent': 'how_many', 'confidence': 0.9721028208732605})
        if len(results) > 0:
            tc.info('.. results')
            tc.info([f"{r[0]}/{r[1]}" for r in results])
            # color_print('blue', json.dumps(results, indent=2, ensure_ascii=False))
            tc.emp('blue', results)

        # If kwargs is not empty, use the rulesets in kwargs to check param_sents,
        # put the resulting inspectors into the corresponding parameter names,
        # and pass them, together with the rules' results, as arguments to the executor.
        if len(self.parameters) > 0:
            tc.emp('red', 'parameters -> %s' % ', '.join(self.parameters.keys()))
            if param_sents is not None:
                tc.emp('yellow', '; '.join(param_sents))

        # .. matched: how_many_artifact
        if ctx is not None:
            self.executor(ctx)
        else:
            self.executor(self.name)
    return rule_rs
def check_langspec(self, lang: Text, meta: Dict[Text, Any], domains,
                   type_name: Text) -> Dict[Text, Any]:
    # lang = data['lang']
    mod_rs = {}
    if lang in self.lang_specs:
        # 1. prepare phase
        proc_comps = self.lang_specs[lang]
        for proc in proc_comps:
            proc.prepare(meta)  # the prepare method may modify the sents or other meta-info

        # 2. rule-evaluation phase
        doc, _ = parse_sents(meta)
        # from termcolor import colored
        tc.emp('cyan', f"✁ lang.spec for {lang}.{type_name} {'-' * 25}")
        for c in proc_comps:
            ci = c(meta, domains, doc=doc)
            exec_rules_by_type(ci, type_name)
            ci.execute()
            mod_rs[c.__name__] = ci.matched
    else:
        tc.emp('red', f'no special patterns for lang {lang}')
    return mod_rs
def vis_trees(trees: List[Dict[Text, Any]], word_info=True):
    from anytree.importer import DictImporter
    from anytree import RenderTree

    importer = DictImporter()
    for index, data in enumerate(trees):
        word = data['word']
        if word_info:
            pprint(word)
        tree_root = importer.import_(data['tree'])
        tree = RenderTree(tree_root)
        tc.emp('green',
               f"Display #{index} sememe tree: {word['en_word']}|{word['ch_word']}.{word['No']}")
        for pre, fill, node in tree:
            if node.role and node.role != 'None':
                cl = 'magenta'
                role = node.role
            else:
                cl = 'yellow'
                role = '✔'
            tc.emp(cl, "%s%s: %s" % (pre, role, node.name))
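# Illustrative input shape for vis_trees (assumed): each entry pairs word
# metadata with an anytree-importable dict whose nodes carry 'name' and 'role'.
#   vis_trees([{'word': {'en_word': 'apple', 'ch_word': '苹果', 'No': 1},
#               'tree': {'name': 'fruit', 'role': 'None', 'children': []}}])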