Example #1
    def write_feed_json(self, file_name, out_file_name, lang='ja'):
        import json_utils
        # out_file_name = "./data/rss/user_video_286700005.json"
        feed_json = self.get_feed_json(file_name, lang)

        # print(json.dumps(feed_json, indent=2, ensure_ascii=False))
        json_utils.write_json_to_file(out_file_name, feed_json)
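
Note: json_utils is a project-local helper that every example here relies on, but its source is not included. The sketch below is only an assumption inferred from the calls in these examples (read_json_file / write_json_to_file wrapping the standard json module), not the project's actual code:

import json
import os


def read_json_file(file_name):
    # Parse a JSON file and return the decoded object (assumed UTF-8).
    with open(os.path.expanduser(file_name), encoding='utf-8') as f:
        return json.load(f)


def write_json_to_file(file_name, data):
    # Serialize `data` as pretty-printed JSON, keeping non-ASCII text readable.
    with open(os.path.expanduser(file_name), 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

The expanduser call is included only so a path like '~/pi/stack/data/labels/labels.json' (Example #7) would resolve; whether the real helper does this is unknown.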
Example #2
    def practice(self, lang_tr='fr', use_latest=False):
        """
        $ python -m sagas.nlu.nlu_tools practice fr
        :param lang_tr:
        :param use_latest:
        :return:
        """
        from sagas.nlu.tts_utils import say_lang
        import json_utils

        lang = 'en'
        dataf = corpus_resources[lang_tr]
        rows = []
        if use_latest:
            rows = json_utils.read_json_file('./out/latest_%s.json' % lang_tr)
        else:
            rows = load_corpus(dataf)
            json_utils.write_json_to_file('./out/latest_%s.json' % lang_tr, rows.tolist())
        for r in rows:
            sents = str(r[0])
            tr_lang = str(r[1].strip())
            say_lang(sents, lang)
            print('♥', sents)
            say_lang(tr_lang, lang_tr)
            print('♡', tr_lang)
Example #3
    def extract_assocs(self):
        nums = self.driver.find_elements_by_class_name("words")
        # print(f"total sections {len(nums)}")
        texts = []
        for el in nums:
            # print(el.text)
            lines = el.text.split('\n')
            if (len(lines) % 2) != 0:
                lines = lines[1:]
            texts.extend(lines)
        print(f"total sents {len(texts)}")
        texts_count = len(texts)
        if (texts_count % 2) != 0:
            texts = texts[1:]
        text_assocs = convert_list(texts)
        print(f"current assocs {len(text_assocs)}")
        # for k, v in text_assocs.items():
        #     print(f"{k} -> {v}")
        json_s = json.dumps(text_assocs, indent=2, ensure_ascii=False)
        # print(json_s)
        self.put_assocs(text_assocs)
        print(f"total assocs {len(self.all_assocs)}")
        # print(json.dumps(self.all_assocs, indent=2, ensure_ascii=False))
        json_utils.write_json_to_file("./data/corpus/mondly_assocs.json",
                                      self.all_assocs)
Example #4
    def build_words_map(self, to_clip=True):
        """
        $ python -m sagas.ru.ru_procs build_words_map
        $ build-ru
        :param to_clip:
        :return:
        """
        import clipboard
        import json_utils
        words_map = {}
        rev_map = {}
        rs = []
        all_words = self.get_all_words()
        for word in all_words:
            key = self.get_norm_key(word)
            if len(key) > 0:
                if key in words_map:
                    words_map[key].append(word)
                else:
                    words_map[key] = [word]
                rev_map[word] = key
                rs.append('%s %s' % (key, word))

        print('write to', self.target_file)
        json_utils.write_json_to_file(self.target_file, words_map)
        json_utils.write_json_to_file(self.target_file_rev, rev_map)
        if to_clip:
            print('copy to clipboard ...')
            clipboard.copy('\n'.join(rs))
        print('done')
Example #5
    def generate(self):
        """
        $ python -m sagas.nlu.treebanks generate
        :return:
        """
        df = self.treebanks_df()
        rs = json.loads(df.to_json(orient='records'))

        # find all models
        folders = io_utils.list_subdirectories(f'{cf.conf_dir}/ai/corenlp/')
        model_dirs = []
        suffix = '_models'
        for folder in folders:
            if folder.endswith(suffix):
                model_dirs.append(folder)

        # print(model_dirs)
        suffix_len = len(suffix)
        all_models = []
        for model_dir in model_dirs:
            model_name = os.path.basename(model_dir)[:-suffix_len]
            lang, meta = self.get_bank_meta(model_name, rs)
            model_idx = {
                'name': meta['LANGUAGE CODE'],
                'lang': lang,
                'model': model_name,
                'version': meta['VERSION'],
                'treebank': meta['TREEBANK']
            }
            all_models.append(model_idx)
        all_models = sorted(all_models, key=lambda el: el['name'])
        target_file = './sagas/conf/treebanks.json'
        json_utils.write_json_to_file(target_file, all_models)
        print('write to', target_file)
Example #6
    def write_timestamps(self):
        """
        $ python -m saai.nlu_mod_procs write_timestamps
        :return:
        """
        last_timestamps = self.get_timestamps()
        print(last_timestamps)
        json_utils.write_json_to_file(self.timestamps_file, last_timestamps)
Example #7
    def create_resources_data(self):
        """
        $ python -m sagas.ofbiz.resources create_resources_data
        :return:
        """
        import json_utils
        props = self.get_all_properties()
        json_utils.write_json_to_file('~/pi/stack/data/labels/labels.json',
                                      properties_json(props))
Example #8
    def create_lang_feeds(
            self,
            tr='fr',
            dataf=f"{cf.conf_dir}/ai/seq2seq/fra-eng-2019/fra.txt",
            outf='./data/graph/fra_eng_feed.json',
            parse_zh=False):
        """
        $ python -m sagas.graph.graph_manager create_lang_feeds
        $ python -m sagas.graph.graph_manager create_lang_feeds ja ~/pi/ai/seq2seq/jpn-eng-2019/jpn.txt ./data/graph/jpn_eng_feed.json
        :return:
        """
        from sagas.nlu.corpus_helper import filter_term, lines, divide_chunks
        import numpy
        import spacy

        logger.info('loading spacy english model ...')
        nlp_spacy = spacy.load('en_core_web_sm')
        print('loading corpus ...')
        pairs = lines(dataf)
        total = len(pairs)
        print('total', total)
        array = numpy.array(pairs)
        random_rows = numpy.random.randint(total, size=10)
        print('pickup', random_rows)
        print(array[random_rows, :])

        print('analyse ...')
        rows = array[random_rows, :]
        dataset = []
        for r in rows:
            sents = str(r[0])
            tr_lang = str(r[1].strip())
            props = {}
            props['sents@en'] = sents
            props['sents@' + tr] = tr_lang

            doc = nlp_spacy(sents)
            props['lemmas'] = get_lemmas(doc)
            verbs = get_verb_lemmas(doc)
            if len(verbs) > 0:
                props['verbs'] = ' '.join(verbs)
                props['verbs|count'] = len(verbs)
            put_entities(doc, props)
            put_chunks(doc, props)

            if parse_zh:
                # self.hanlp.put_deps(tr_lang, props)
                hanlp_c = self.hanlp()
                hanlp_c.put_deps(tr_lang, props)

            dataset.append(props)

        print(json.dumps(dataset, indent=2, ensure_ascii=False))

        print('write to json file %s ...' % outf)
        json_utils.write_json_to_file(outf, dataset)
        print('done.')
Example #9
    def write_recs(self, entity):
        """
        $ python -m sagas.ofbiz.entity_graph_relations write_recs 'Survey'
        $ python -m sagas.ofbiz.entity_graph_relations write_recs 'Example'
        :param entity:
        :return:
        """
        total = oc.delegator.findCountByCondition(entity, None, None, None)
        print('total records for %s: %d' % (entity, total))

        rs = proc_entity_recs(entity)
        # json_utils.write_json_to_file('./out/rs_Survey.json', rs)
        json_utils.write_json_to_file('./data/ofbiz/rs_%s.json' % entity, rs)
        print('done.')
Example #10
    def extract_verbs(self):
        import json_utils
        verbs = self.driver.find_elements_by_class_name("translation-verb")
        # print('total section', len(verbs))
        for v in verbs:
            # print(v.text)
            for t in v.text.split('\n'):
                self.put_verb(t)
        # print(verb_assocs)
        # print(jsonpickle.encode(verb_assocs))
        json_data = {k: list(v) for k, v in self.verb_assocs.items()}
        print(f"total verbs {len(self.verb_assocs)}")
        # print(json.dumps(json_data,
        #                 indent=2, ensure_ascii=False))
        json_utils.write_json_to_file("./data/corpus/mondly_verbs.json", json_data)
Example #11
def build_voc():
    voc_file = f'{cf.conf_dir}/langs/voc/ru-voc.json'
    voc_dicts = f'{cf.conf_dir}/langs/voc/ru-voc-dicts.json'
    words = json_utils.read_json_file(voc_file)
    all_dicts = build_dicts()

    print('filter by voc-words ...')
    skips = []
    rs = filter_set(all_dicts, words, 'ru', skips)
    json_utils.write_json_to_file(voc_dicts, rs)
    # print('done.')
    print('absent: %d, see these words in ru-voc-absents.json' % len(skips))
    json_utils.write_json_to_file(f'{cf.conf_dir}/langs/voc/ru-voc-absents.json', skips)

    return rs, skips
Example #12
    def build(self, feat_name, location_matcher):
        """
        $ python -m sagas.nlu.expanders build samples 'ProductEntityLabels.xml'
        :param feat_name:
        :param location_matcher:
        :return:
        """
        from os import path
        from sagas.ofbiz.resources import ResourceDigester

        outf = "./data/feats/%s.npy" % feat_name
        if path.exists(outf):
            print(f"Target file {outf} has already exists, exit.")
            return

        rd = ResourceDigester(True)
        props = rd.get_all_properties()
        if location_matcher == '*':
            labels = list(props.keys())
        else:
            labels = [
                k for k, v in props.items()
                if v.location.endswith(location_matcher)
            ]

        print('total', len(labels))

        arranger = []
        for label in labels:
            arranger.extend(pickups(label, props[label], ['de', 'fr']))
        print(len(arranger))

        bm = BertManager()
        doc_vecs = bm.bc.encode([row['value'] for row in arranger])
        print(len(doc_vecs), doc_vecs[:2])

        # feat_name = 'samples'
        save_docs(doc_vecs, outf)
        meta_outf = "./data/feats/%s.json" % feat_name
        json_utils.write_json_to_file(meta_outf, arranger)
Example #13
    def verbs(self, sents, lang='en', do_action=False):
        """
        WordNet matches for individual words are evaluated with a dedicated ruleset; every rule that
        matches writes into the state, e.g. spec_xcomp_obj='music', where music corresponds to an
        object_type in the knowledgebase.
        The WordNet references of the relevant words are fed into the ruleset one by one, which yields
        a state set; this state set is merged into the sentence structure for the sents_ruleset to evaluate.
        The sents_ruleset collects multiple intents into the state; we iterate over the intents and, if an
        intent has a triggerable action, trigger that action.

        $ python -m sagas.nlu.ruleset_procs verbs 'I want to play music.' en
            ```
            [{'ref': 'want/want', 'upos': 'verb'},
             {'ref_xcomp': 'play/play', 'upos': 'verb'},
             {'ref_xcomp_obj': 'music/music', 'upos': 'noun'}]
            ```
        $ python -m sagas.nlu.ruleset_procs verbs 'I want to play video.' en
        $ python -m sagas.nlu.ruleset_procs verbs 'I would like to play video.' en
        $ python -m sagas.nlu.ruleset_procs verbs "i'd like to play sound." en
        $ verbs 'I want to play music.' en True

        :param sents:
        :param lang:
        :return:
        """
        import sagas.nlu.ruleset_fixtures as rf

        data = {'lang': lang, "sents": sents, 'engine': cf.engine(lang)}
        doc_jsonify, resp = parse_sents(data)
        v_domains = get_verb_domain(doc_jsonify)
        if self.verbose:
            tc.gv(display_doc_deps(doc_jsonify, resp))
            pprint(v_domains)
            json_utils.write_json_to_file('./out/v_domain.json', v_domains[0])

            # list words
            tc.emp('cyan', f"✁ list words. {'-' * 25}")

        intents = []
        host = create_host()
        for d in v_domains:
            tokens = list_words(d, lang, with_chains=True)
            if self.verbose:
                pprint(tokens)

            # specs evaluate
            tc.emp('cyan', f"✁ specs evaluate. {'-' * 25}")
            r3 = {}
            for token in tokens:
                r3 = host.assert_fact('chains', token)
                pprint(r3)  # the last result holds the accumulated state
            # drop bookkeeping keys before the sentence-level evaluation
            for key in ('$s', 'id', 'sid'):
                r3.pop(key, None)
            tc.emp('red', f"specs state - {r3}")

            # sents evaluate
            tc.emp('cyan', f"✁ sents evaluate. {'-' * 25}")
            sents_data = {**d, **r3}
            tc.emp('cyan', f"  keys: {', '.join(sents_data.keys())}")
            result = host.assert_fact('sents', sents_data)
            tc.emp('red', f"sents state - {result}")
            if 'intents' in result:
                intents.extend(result['intents'])

        self.process_intents(sents, lang, intents, do_action)
Example #14
    def save_voc(self):
        import json_utils
        all_words = self.get_all_words()
        json_utils.write_json_to_file(f'{cf.conf_dir}/langs/voc/ru-voc.json',
                                      list(all_words))