def vis_doc(sents, lang):
    from sagas.nlu.ruleset_procs import cached_chunks
    from sagas.nlu.uni_remote_viz import display_doc_deps
    from sagas.conf.conf import cf

    chunks = cached_chunks(sents, lang, cf.engine(lang))
    return display_doc_deps(chunks['doc'], None)
def get_domains(sents, lang, engine='corenlp', options=None):
    """Parse a sentence and collect (domains, meta) tuples for each matched chunk.

    >>> from sagas.nlu.legacy.aiobj_kit import get_domains
    >>> get_domains('你有几台笔记本电脑?', 'zh', 'ltp')  # "How many laptops do you have?"
    >>> get_domains('列出上周编辑的文件。', 'zh', 'ltp', DomainGetOptions(enable_predicts=True))  # "List the files edited last week."

    :param sents: sentence text
    :param lang: language code
    :param engine: parser engine, e.g. 'corenlp' or 'ltp'
    :param options: DomainGetOptions controlling predicts, chunk listing and graph output
    :return: list of (domains, meta) tuples
    """
    if options is None:
        options = DomainGetOptions()
    pipelines = ['predicts'] if options.enable_predicts else []
    doc_jsonify, resp = dep_parse(sents, lang, engine, pipelines)
    result_set = []
    if doc_jsonify is not None:
        tc.emp('cyan', resp)
        if resp is not None and 'predicts' in resp and len(resp['predicts']) > 0:
            rs = resp['predicts']
        else:
            rs = get_chunks(doc_jsonify)

        if len(rs) > 0:
            if options.list_chunks:
                list_rs(rs, lang)
            if options.deps_graph:
                tc.gv(display_doc_deps(doc_jsonify, resp,
                                       translit_lang=lang if lang in ('ja', 'ko', 'zh', 'fa', 'ar', 'he') else None))
            data = {'lang': lang, 'sents': sents, 'engine': engine, 'pipelines': pipelines}
            for r in rs:
                domains = r['domains']
                common = {'lemma': r['lemma'], 'word': r['word'], 'stems': r['stems']}
                meta = {'rel': r['rel'], **common, **data}
                result_set.append((domains, meta))
        else:
            tc.emp('red', '.. no predefined chunk-patterns found.')
            tc.info(doc_jsonify.words_string())
            tc.info(doc_jsonify.dependencies_string())
    return result_set
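# Hedged usage sketch (not part of the original module): shows how the (domains, meta)
# tuples returned by get_domains might be consumed. The sentence and the printed fields
# are illustrative assumptions; DomainGetOptions comes from the same module.
def demo_get_domains():
    rs = get_domains('what will be the weather in three days?', 'en', 'corenlp',
                     DomainGetOptions(enable_predicts=False))
    for domains, meta in rs:
        # each domain entry is a tuple like (rel, index, text, lemma, children, features)
        print(meta['rel'], meta['lemma'], [d[0] for d in domains])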
def parse_subj():
    lang = 'ru'
    sents = 'Яблоко - это здоровый фрукт.'  # "An apple is a healthy fruit."
    st.write(sents)

    data = {'lang': lang, 'sents': sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    domains = get_subj_domain(doc_jsonify)
    testing(domains, 'ru')

    # show the dependency graph and the extracted subject domains
    gv = display_doc_deps(doc_jsonify, resp)
    st.graphviz_chart(gv)
    st.write(domains)
def parse_aux():
    lang = 'en'
    sents = 'what will be the weather in three days?'
    st.write(sents)

    data = {'lang': lang, 'sents': sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    domains = get_aux_domain(doc_jsonify)
    testing(domains, 'en')

    # show the analysis graph
    gv = display_doc_deps(doc_jsonify, resp)
    st.graphviz_chart(gv)
    st.write(domains)
def parse_deps(text, lang, translit=None):
    text = fix_sents(text, lang)
    engine = cf.engine(lang)

    doc_jsonify, resp = dep_parse(text, lang, engine, ['predicts'])
    if doc_jsonify is not None:
        list_chunks(doc_jsonify, resp, lang, enable_contrast=True)
        g = display_doc_deps(doc_jsonify, resp, translit_lang=lang)
        st.graphviz_chart(g)
        if translit is not None:
            st.text(f"♤ {translit}")

        words = [word.text for word in doc_jsonify.words]
        tools.contrast(text, lang, word_map=words)
def print_sents(self, sents, lang, engine=None):
    """
    $ python -m sagas.nlu.ruleset_procs print_sents 'I want to play music.' en
    $ python -m sagas.nlu.ruleset_procs print_sents "クモは4つの右の目をしています。" ja corenlp

    :param sents: sentence text
    :param lang: language code
    :param engine: parser engine; defaults to the configured engine for the language
    :return:
    """
    if engine is None:
        engine = cf.engine(lang)
    data = {'lang': lang, 'sents': sents, 'engine': engine}
    doc_jsonify, resp = parse_sents(data)
    rs = get_chunks(doc_jsonify)

    # languages written without spaces are joined with an empty delimiter
    delim = '' if lang in non_spaces else ' '
    for serial, r in enumerate(rs):
        meta = build_meta(r, data)
        domains = r['domains']
        # each domain entry is a tuple: (rel, index, text, lemma, children, features)
        keys = {x[0] for x in domains}
        grp = lambda p, idx: [x[idx] for x in domains if x[0] == p]
        tokens = {x: grp(x, 2) for x in keys}
        words = {x: delim.join(grp(x, 2)) for x in keys}
        lemmas = {x: delim.join(grp(x, 3)) for x in keys}

        print('meta keys', meta.keys())
        print('tokens', tokens)
        print('words', meta['word'], words)
        print('lemmas', lemmas)

        ctx = Context(meta, domains)
        print('chunks', ctx._chunks)

    g = display_doc_deps(doc_jsonify, resp, translit_lang=lang)
    print(*[(w.index, w.text, w.governor, doc_jsonify.words[w.governor - 1].text)
            for w in doc_jsonify.words], sep='\n')
    tc.gv(g)
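# Hedged sketch (hand-written data, not from the original module): illustrates the
# grouping used in print_sents on a made-up domains list, where each entry is
# (rel, index, text, lemma, children, features).
def demo_group_domains():
    domains = [
        ('nsubj', 0, 'I', 'I', [], ['PRON']),
        ('xcomp', 3, 'play', 'play', [4], ['VERB']),
        ('xcomp', 4, 'music', 'music', [], ['NOUN']),
    ]
    keys = {x[0] for x in domains}
    grp = lambda p, idx: [x[idx] for x in domains if x[0] == p]
    words = {k: ' '.join(grp(k, 2)) for k in keys}
    lemmas = {k: ' '.join(grp(k, 3)) for k in keys}
    print(words)   # {'nsubj': 'I', 'xcomp': 'play music'}
    print(lemmas)  # {'nsubj': 'I', 'xcomp': 'play music'}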
def row_view(row):
    text = row[1]
    if display_translit and len(row) > 2:
        label = row[2]
    else:
        label = text

    if st.button(f"{label} ✁ {row[0]}"):
        text = fix_sents(text, lang)
        engine = get_engine(lang)

        doc_jsonify, resp = dep_parse(text, lang, engine, ['predicts'])
        if doc_jsonify is not None:
            list_chunks(doc_jsonify, resp, lang, enable_contrast=True)
            g = display_doc_deps(doc_jsonify, resp, translit_lang=lang)
            st.graphviz_chart(g)
            if len(row) > 2:
                st.text(f"♤ {row[2]}")

            words = [word.text for word in doc_jsonify.words]
            tools.contrast(text, lang, word_map=words)
def verbs(self, sents, lang='en', do_action=False):
    """
    WordNet matches for individual words are evaluated with a dedicated ruleset; each
    rule that matches writes into the state, e.g. spec_xcomp_obj='music', where 'music'
    corresponds to an object_type in the knowledgebase. The WordNet references of the
    relevant words are asserted into the ruleset one by one, yielding a state set. That
    state set is merged into the sentence structure and evaluated by the sents ruleset,
    which collects the resulting intents into the state. The intents are then iterated,
    and any intent with a triggerable action fires that action.

    $ python -m sagas.nlu.ruleset_procs verbs 'I want to play music.' en
    ```
    [{'ref': 'want/want', 'upos': 'verb'},
     {'ref_xcomp': 'play/play', 'upos': 'verb'},
     {'ref_xcomp_obj': 'music/music', 'upos': 'noun'}]
    ```
    $ python -m sagas.nlu.ruleset_procs verbs 'I want to play video.' en
    $ python -m sagas.nlu.ruleset_procs verbs 'I would like to play video.' en
    $ python -m sagas.nlu.ruleset_procs verbs "i'd like to play sound." en
    $ verbs 'I want to play music.' en True

    :param sents: sentence text
    :param lang: language code
    :param do_action: trigger the matched intent's action when True
    :return:
    """
    import sagas.nlu.ruleset_fixtures as rf

    data = {'lang': lang, 'sents': sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    v_domains = get_verb_domain(doc_jsonify)
    if self.verbose:
        tc.gv(display_doc_deps(doc_jsonify, resp))
        pprint(v_domains)
    json_utils.write_json_to_file('./out/v_domain.json', v_domains[0])

    # list words
    tc.emp('cyan', f"✁ list words. {'-' * 25}")
    intents = []
    host = create_host()
    for d in v_domains:
        tokens = list_words(d, lang, with_chains=True)
        if self.verbose:
            pprint(tokens)

        # specs evaluate: assert each token's wordnet chain into the 'chains' ruleset
        tc.emp('cyan', f"✁ specs evaluate. {'-' * 25}")
        r3 = {}
        for token in tokens:
            r3 = host.assert_fact('chains', token)
            pprint(r3)
        # the last result carries the accumulated state; drop the bookkeeping keys
        [r3.pop(key) for key in ['$s', 'id', 'sid']]
        tc.emp('red', f"specs state - {r3}")

        # sents evaluate: merge the spec state into the verb domain and assert it
        tc.emp('cyan', f"✁ sents evaluate. {'-' * 25}")
        sents_data = {**d, **r3}
        tc.emp('cyan', f" keys: {', '.join(sents_data.keys())}")
        result = host.assert_fact('sents', sents_data)
        tc.emp('red', f"sents state - {result}")
        if 'intents' in result:
            intents.extend(result['intents'])

    self.process_intents(sents, lang, intents, do_action)
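# Hedged sketch (illustrative assumptions: the owning class name and its constructor are
# not shown in this excerpt): how the verbs pipeline above might be driven from Python
# rather than the CLI.
def demo_verbs_pipeline():
    proc = RulesetProcs(verbose=True)  # assumed class exposing verbs() and process_intents()
    # parses the sentence, evaluates word-level 'chains' rules, then sentence-level
    # 'sents' rules, and finally triggers any matched intent action (do_action=True)
    proc.verbs('I want to play music.', 'en', do_action=True)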