def write_feed_json(self, file_name, out_file_name, lang='ja'):
    import json_utils

    # out_file_name = "./data/rss/user_video_286700005.json"
    feed_json = self.get_feed_json(file_name, lang)
    # print(json.dumps(feed_json, indent=2, ensure_ascii=False))
    json_utils.write_json_to_file(out_file_name, feed_json)
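# Every snippet in this section relies on the project-local json_utils helpers.
# Below is a minimal sketch of what that module is assumed to look like, inferred
# only from how write_json_to_file/read_json_file are called here; it is not the
# project's actual implementation.
import json

def write_json_to_file(file_name, data):
    """Serialize data as pretty-printed, UTF-8 friendly JSON (assumed behavior)."""
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

def read_json_file(file_name):
    """Load a JSON file written by write_json_to_file (assumed behavior)."""
    with open(file_name, encoding='utf-8') as f:
        return json.load(f)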
def practice(self, lang_tr='fr', use_latest=False):
    """
    $ python -m sagas.nlu.nlu_tools practice fr

    :param lang_tr:
    :param use_latest:
    :return:
    """
    from sagas.nlu.tts_utils import say_lang
    import json_utils

    lang = 'en'
    dataf = corpus_resources[lang_tr]
    rows = []
    if use_latest:
        rows = json_utils.read_json_file('./out/latest_%s.json' % lang_tr)
    else:
        rows = load_corpus(dataf)
        json_utils.write_json_to_file('./out/latest_%s.json' % lang_tr, rows.tolist())

    for r in rows:
        sents = str(r[0])
        tr_lang = str(r[1].strip())
        say_lang(sents, lang)
        print('♥', sents)
        say_lang(tr_lang, lang_tr)
        print('♡', tr_lang)
def extract_assocs(self):
    nums = self.driver.find_elements_by_class_name("words")
    # print(f"total sections {len(nums)}")
    texts = []
    for el in nums:
        # print(el.text)
        lines = el.text.split('\n')
        if (len(lines) % 2) != 0:
            lines = lines[1:]
        texts.extend(lines)
    print(f"total sents {len(texts)}")

    texts_count = len(texts)
    if (texts_count % 2) != 0:
        texts = texts[1:]
    text_assocs = convert_list(texts)
    print(f"current assocs {len(text_assocs)}")
    # for k, v in text_assocs.items():
    #     print(f"{k} -> {v}")
    json_s = json.dumps(text_assocs, indent=2, ensure_ascii=False)
    # print(json_s)

    self.put_assocs(text_assocs)
    print(f"total assocs {len(self.all_assocs)}")
    # print(json.dumps(self.all_assocs, indent=2, ensure_ascii=False))
    json_utils.write_json_to_file("./data/corpus/mondly_assocs.json", self.all_assocs)
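# A minimal sketch of what convert_list is assumed to do here: the scraped lines
# alternate between a source sentence and its translation (which is why both
# lists are trimmed to an even count above), so adjacent items are paired into a
# dict. This is inferred from the call site only, not the project's actual code.
def convert_list(texts):
    return {texts[i]: texts[i + 1] for i in range(0, len(texts) - 1, 2)}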
def build_words_map(self, to_clip=True):
    """
    $ python -m sagas.ru.ru_procs build_words_map
    $ build-ru

    :param to_clip:
    :return:
    """
    import clipboard
    import json_utils

    words_map = {}
    rev_map = {}
    rs = []
    all_words = self.get_all_words()
    for word in all_words:
        key = self.get_norm_key(word)
        if len(key) > 0:
            if key in words_map:
                words_map[key].append(word)
            else:
                words_map[key] = [word]
            rev_map[word] = key
            rs.append('%s %s' % (key, word))

    print('write to', self.target_file)
    json_utils.write_json_to_file(self.target_file, words_map)
    json_utils.write_json_to_file(self.target_file_rev, rev_map)
    if to_clip:
        print('copy to clipboard ...')
        clipboard.copy('\n'.join(rs))
    print('done')
def generate(self):
    """
    $ python -m sagas.nlu.treebanks generate

    :return:
    """
    df = self.treebanks_df()
    rs = json.loads(df.to_json(orient='records'))

    # find all models
    folders = io_utils.list_subdirectories(f'{cf.conf_dir}/ai/corenlp/')
    model_dirs = []
    suffix = '_models'
    for folder in folders:
        if folder.endswith(suffix):
            model_dirs.append(folder)
    # print(model_dirs)

    suffix_len = len(suffix)
    all_models = []
    for dir in model_dirs:
        model_name = os.path.basename(dir)[:-suffix_len]
        lang, meta = self.get_bank_meta(model_name, rs)
        model_idx = {
            'name': meta['LANGUAGE CODE'],
            'lang': lang,
            'model': model_name,
            'version': meta['VERSION'],
            'treebank': meta['TREEBANK'],
        }
        all_models.append(model_idx)

    all_models = sorted(all_models, key=lambda el: el['name'])
    target_file = './sagas/conf/treebanks.json'
    json_utils.write_json_to_file(target_file, all_models)
    print('write to', target_file)
def write_timestamps(self):
    """
    $ python -m saai.nlu_mod_procs write_timestamps

    :return:
    """
    last_timestamps = self.get_timestamps()
    print(last_timestamps)
    json_utils.write_json_to_file(self.timestamps_file, last_timestamps)
def create_resources_data(self):
    """
    $ python -m sagas.ofbiz.resources create_resources_data

    :return:
    """
    import json_utils

    props = self.get_all_properties()
    json_utils.write_json_to_file('~/pi/stack/data/labels/labels.json', properties_json(props))
def create_lang_feeds(self, tr='fr',
                      dataf=f"{cf.conf_dir}/ai/seq2seq/fra-eng-2019/fra.txt",
                      outf='./data/graph/fra_eng_feed.json',
                      parse_zh=False):
    """
    $ python -m sagas.graph.graph_manager create_lang_feeds
    $ python -m sagas.graph.graph_manager create_lang_feeds ja ~/pi/ai/seq2seq/jpn-eng-2019/jpn.txt ./data/graph/jpn_eng_feed.json

    :return:
    """
    from sagas.nlu.corpus_helper import filter_term, lines, divide_chunks
    import numpy
    import spacy

    logger.info('loading spacy english model ...')
    nlp_spacy = spacy.load('en_core_web_sm')

    print('loading corpus ...')
    pairs = lines(dataf)
    total = len(pairs)
    print('total', total)
    array = numpy.array(pairs)
    random_rows = numpy.random.randint(total, size=10)
    print('pickup', random_rows)
    print(array[random_rows, :])

    print('analyse ...')
    rows = array[random_rows, :]
    dataset = []
    for r in rows:
        sents = str(r[0])
        tr_lang = str(r[1].strip())
        props = {}
        props['sents@en'] = sents
        props['sents@' + tr] = tr_lang
        doc = nlp_spacy(sents)
        props['lemmas'] = get_lemmas(doc)
        verbs = get_verb_lemmas(doc)
        if len(verbs) > 0:
            props['verbs'] = ' '.join(verbs)
            props['verbs|count'] = len(verbs)
        put_entities(doc, props)
        put_chunks(doc, props)
        if parse_zh:
            # self.hanlp.put_deps(tr_lang, props)
            hanlp_c = self.hanlp()
            hanlp_c.put_deps(tr_lang, props)
        dataset.append(props)

    print(json.dumps(dataset, indent=2, ensure_ascii=False))
    print('write to json file %s ...' % outf)
    json_utils.write_json_to_file(outf, dataset)
    print('done.')
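# The lines() helper imported from sagas.nlu.corpus_helper is assumed here to read
# a Tatoeba-style tab-separated corpus (english<TAB>translation per line, as in
# fra.txt) into a list of [english, translation] pairs -- a sketch inferred from
# how the rows are indexed above (r[0] / r[1]), not the actual implementation.
def lines(dataf):
    with open(dataf, encoding='utf-8') as f:
        return [line.rstrip('\n').split('\t')[:2] for line in f if line.strip()]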
def write_recs(self, entity):
    """
    $ python -m sagas.ofbiz.entity_graph_relations write_recs 'Survey'
    $ python -m sagas.ofbiz.entity_graph_relations write_recs 'Example'

    :param entity:
    :return:
    """
    total = oc.delegator.findCountByCondition(entity, None, None, None)
    print('total records for %s: %d' % (entity, total))
    rs = proc_entity_recs(entity)
    # json_utils.write_json_to_file('./out/rs_Survey.json', rs)
    json_utils.write_json_to_file('./data/ofbiz/rs_%s.json' % entity, rs)
    print('done.')
def extract_verbs(self):
    import json_utils

    verbs = self.driver.find_elements_by_class_name("translation-verb")
    # print('total section', len(verbs))
    for v in verbs:
        # print(v.text)
        for t in v.text.split('\n'):
            self.put_verb(t)

    # print(verb_assocs)
    # print(jsonpickle.encode(verb_assocs))
    json_data = {k: list(v) for k, v in self.verb_assocs.items()}
    print(f"total verbs {len(self.verb_assocs)}")
    # print(json.dumps(json_data, indent=2, ensure_ascii=False))
    json_utils.write_json_to_file("./data/corpus/mondly_verbs.json", json_data)
def build_voc():
    voc_file = f'{cf.conf_dir}/langs/voc/ru-voc.json'
    voc_dicts = f'{cf.conf_dir}/langs/voc/ru-voc-dicts.json'
    words = json_utils.read_json_file(voc_file)
    all_dicts = build_dicts()

    print('filter by voc-words ...')
    skips = []
    rs = filter_set(all_dicts, words, 'ru', skips)
    json_utils.write_json_to_file(voc_dicts, rs)
    # print('done.')
    print('absents %d, see these words in file ru-voc-absents.json' % len(skips))
    json_utils.write_json_to_file(f'{cf.conf_dir}/langs/voc/ru-voc-absents.json', skips)
    return rs, skips
def build(self, feat_name, location_matcher):
    """
    $ python -m sagas.nlu.expanders build samples 'ProductEntityLabels.xml'

    :param feat_name:
    :param location_matcher:
    :return:
    """
    from os import path
    from sagas.ofbiz.resources import ResourceDigester

    outf = "./data/feats/%s.npy" % feat_name
    if path.exists(outf):
        print(f"Target file {outf} already exists, exit.")
        return

    rd = ResourceDigester(True)
    props = rd.get_all_properties()
    if location_matcher == '*':
        labels = [k for k, v in props.items()]
    else:
        labels = [k for k, v in props.items()
                  if v.location.endswith(location_matcher)]
    print('total', len(labels))

    arranger = []
    for label in labels:
        arranger.extend(pickups(label, props[label], ['de', 'fr']))
    print(len(arranger))

    bm = BertManager()
    doc_vecs = bm.bc.encode([row['value'] for row in arranger])
    print(len(doc_vecs), doc_vecs[:2])

    # feat_name = 'samples'
    save_docs(doc_vecs, outf)
    meta_outf = "./data/feats/%s.json" % feat_name
    json_utils.write_json_to_file(meta_outf, arranger)
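# save_docs() persists the BERT sentence vectors next to the JSON metadata; a
# minimal sketch consistent with the ".npy" target used above (an assumption
# about this helper, not its actual code):
import numpy as np

def save_docs(doc_vecs, outf):
    # numpy.save writes the array to outf; the path already carries the .npy suffix
    np.save(outf, np.asarray(doc_vecs))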
def verbs(self, sents, lang='en', do_action=False):
    """
    WordNet matching for single words is evaluated by a dedicated ruleset; each rule
    that matches writes into the state, e.g. spec_xcomp_obj='music', where music
    corresponds to an object_type in the knowledgebase.
    The WordNet references of the meaningful words are fed into the ruleset one by one,
    yielding a state set; this state set is merged into the sentence structure and
    evaluated by the sents ruleset.
    The sents ruleset collects the resulting intents into the state; the intents are
    then iterated, and any intent with a triggerable action fires that action.

    $ python -m sagas.nlu.ruleset_procs verbs 'I want to play music.' en
    ```
    [{'ref': 'want/want', 'upos': 'verb'},
     {'ref_xcomp': 'play/play', 'upos': 'verb'},
     {'ref_xcomp_obj': 'music/music', 'upos': 'noun'}]
    ```
    $ python -m sagas.nlu.ruleset_procs verbs 'I want to play video.' en
    $ python -m sagas.nlu.ruleset_procs verbs 'I would like to play video.' en
    $ python -m sagas.nlu.ruleset_procs verbs "i'd like to play sound." en
    $ verbs 'I want to play music.' en True

    :param sents:
    :param lang:
    :param do_action:
    :return:
    """
    import sagas.nlu.ruleset_fixtures as rf

    data = {'lang': lang, "sents": sents, 'engine': cf.engine(lang)}
    doc_jsonify, resp = parse_sents(data)
    v_domains = get_verb_domain(doc_jsonify)
    if self.verbose:
        tc.gv(display_doc_deps(doc_jsonify, resp))
        pprint(v_domains)
    json_utils.write_json_to_file('./out/v_domain.json', v_domains[0])

    # list words
    tc.emp('cyan', f"✁ list words. {'-' * 25}")
    intents = []
    host = create_host()
    for d in v_domains:
        tokens = list_words(d, lang, with_chains=True)
        if self.verbose:
            pprint(tokens)

        # specs evaluate
        tc.emp('cyan', f"✁ specs evaluate. {'-' * 25}")
        r3 = {}
        for token in tokens:
            r3 = host.assert_fact('chains', token)
            pprint(r3)
        # the last result is all state
        [r3.pop(key) for key in ['$s', 'id', 'sid']]
        tc.emp('red', f"specs state - {r3}")

        # sents evaluate
        tc.emp('cyan', f"✁ sents evaluate. {'-' * 25}")
        sents_data = {**d, **r3}
        tc.emp('cyan', f" keys: {', '.join(sents_data.keys())}")
        result = host.assert_fact('sents', sents_data)
        tc.emp('red', f"sents state - {result}")
        if 'intents' in result:
            intents.extend(result['intents'])

    self.process_intents(sents, lang, intents, do_action)
def save_voc(self):
    import json_utils

    all_words = self.get_all_words()
    json_utils.write_json_to_file(f'{cf.conf_dir}/langs/voc/ru-voc.json', list(all_words))