Example #1
def result_df(rs):
    from sagas.conf.conf import cf
    import sagas

    # print_not_matched=cf.is_enabled('print_not_matched')
    print_not_matched = True
    recs = []
    for r in rs:
        if not print_not_matched and not r[1]:
            pass
        else:
            recs.append(('✔' if r[1] else '✖', r[0]))
    return sagas.to_df(recs, ['match', 'options'])
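A minimal usage sketch (hypothetical input; each record is read above as an (option, matched) pair):

# hypothetical records: (option, matched) pairs, as consumed by result_df above
rs = [('lemma: be', True), ('pos: verb', False)]
df = result_df(rs)   # -> DataFrame with columns ['match', 'options']
print(df)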
Example #2
def simple():
    from pymongo import MongoClient
    import sagas

    # uri = 'mongodb://samlet.com/langs'
    uri = 'mongodb://localhost/langs'
    # uri = 'mongodb://192.168.0.101/langs'
    client = MongoClient(uri)
    db = client.get_default_database()
    print(db.name)

    rs = []
    for r in db.trans.find({'source': 'id'}):
        rs.append((r['text'], r['target']))
    sagas.print_df(sagas.to_df(rs, ['text', 'target']))
Example #3
def translations_df(trans, idx=0):
    import sagas
    # fill with default field names
    size = len(trans[idx][2][0])
    # print('size', size)
    listOfStrings = ['ext_' + str(i) for i in range(size)]
    if size==2:
        listOfStrings[0:2] = ['word', 'translations']
    elif size==4:
        listOfStrings[0:4] = ['word', 'translations', 'c', 'freq']
    else:
        listOfStrings[0:size] = ['word', 'translations',
                                 *[f'ex_{idx}' for idx in range(size-2)]]

    return sagas.to_df(trans[idx][2], listOfStrings)
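A usage sketch; the shape of trans is assumed from the indexing above (trans[idx][2] holds equally sized rows whose first two fields are the word and its translations):

# hypothetical translation group; rows have two fields, so the
# ['word', 'translations'] branch above is taken
trans = [('greeting', 'en', [('hello', ['hallo', 'guten Tag']),
                             ('world', ['Welt'])])]
df = translations_df(trans, idx=0)   # columns: ['word', 'translations']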
Example #4
    def word_info_df(self, word):
        import sagas
        import json_utils
        rev_map = json_utils.read_json_file(self.target_file_rev)
        words_map = json_utils.read_json_file(self.target_file)

        tuples = []
        ipa_xs = self.epi.xsampa_list(word)
        ipa = self.epi.transliterate(word)
        key = rev_map[word]
        tuples.append((word, ipa, ''.join(ipa_xs), key))
        for w in words_map[key]:
            ipa_xs = self.epi.xsampa_list(w)
            ipa = self.epi.transliterate(w)
            tuples.append((w, ipa, ''.join(ipa_xs), key))
        return sagas.to_df(tuples, ['word', 'ipa', 'xsampa_list', 'key'])
Example #5
def list_contrast(rs, lang):
    import sagas

    result = []
    for serial, r in enumerate(rs):
        type_name = r['type']
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        rec = {
            'type': type_name,
            'word': r['word'],
            'head': r['head'] if 'head' in r else ''
        }
        rec['domains'] = make_map(df['rel'], df['children'])
        rec['synsets'] = list_synsets(r, lang, True)
        result.append(rec)
    return result
Example #6
 def print_table(self, rs):
     import sagas
     # from IPython.display import display
     # df_set=[]
     for r in rs:
         for k, v in r.items():
             if k != 'domains':
                 logging.debug('%s=%s' % (k, v))
         df = sagas.to_df(
             r['domains'],
             ['rel', 'index', 'text', 'lemma', 'children', 'features'])
         # df_set.append(df)
         # if console:
         #     sagas.print_df(df)
         # else:
         #     display(df)
         tc.dfs(df)
Example #7
def rs_summary(rs, console=True):
    from sagas.tool.misc import print_stem_chunks
    from IPython.display import display
    import sagas
    for serial, r in enumerate(rs):
        df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s(%s)" % (r['head'], r['head_pos'])
        else:
            cla = '_'
        print('%s(%s)' % (r['type'], r['lemma']), cla)
        # sagas.print_df(df)
        if not console:
            display(df)
        else:
            sagas.print_df(df)
        print_stem_chunks(r)
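For orientation, a hypothetical record with the field layout implied by the column list above; only the DataFrame part is exercised, since the keys expected by print_stem_chunks are not shown here:

import sagas

# hypothetical record; 'domains' rows match the six columns listed above
r = {'type': 'verb_domains', 'lemma': 'bear', 'head': 'born', 'head_pos': 'VERB',
     'domains': [['nsubj:pass', 2, 'Obama', 'Obama', ['Barack'], ['Number=Sing']]]}
df = sagas.to_df(r['domains'],
                 ['rel', 'index', 'text', 'lemma', 'children', 'features'])
sagas.print_df(df)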
Example #8
    def train(self, lang_col, corpus_file, model_file):
        """
        $ python -m sagas.corpus.index_trainer train id '~/pi/ai/seq2seq/ind-eng/ind.txt' '~/pi/data/bert/embedded_id.pkl'
        :param corpus_file:
        :return:
        """
        from sagas.train.parallel_corpus import load_corpus, take_samples
        import sagas
        items = load_corpus(corpus_file)
        corpus_df = sagas.to_df(items, ['en', lang_col])
        print('.. head rows')
        print(corpus_df.head())

        searcher = CorpusSearcher(model_file=model_file)
        print(f'.. training {corpus_file}')
        searcher.train(corpus_df, 'en')
        print('done.')
Example #9
def choose_lang_and_corpus():

    language = st.sidebar.selectbox(
        'Which language do you choose?',
         list(all_labels.keys()))

    cur_lang=all_labels[language]
    corpus=[f for f in glob.glob(f'{corpus_prefix}/*_{cur_lang}_*.txt')]
    df=sagas.to_df(corpus, ['file'])

    cur_file = st.sidebar.selectbox(
        'Which corpus do you choose?',
         df['file'],
        format_func=lambda k: k.replace(corpus_prefix+'/', '').replace('.txt', ''),
    )
    st.sidebar.text(f"Current: {cur_lang}, {cur_file}")
    return cur_lang, cur_file
Example #10
 def ents(self, sents, lang='en', simple=True):
     """
     $ python -m sagas.nlu.spacy_procs ents 'New York'
     $ python -m sagas.nlu.spacy_procs ents 'I am from China'
     $ python -m sagas.nlu.spacy_procs ents "Ada Lovelace was born in London"
     :param sents:
     :param lang:
     :return:
     """
     import sagas
     rs = []
     doc = self.spacy_doc(sents, lang, simple=simple)
     for ent in doc.ents:
         rs.append((ent.text, ent.start_char, ent.end_char, ent.label_,
                    ent.kb_id_))
     r = sagas.to_df(rs, ['word', 'start', 'end', 'entity', 'kb'])
     sagas.print_df(r)
Example #11
def intents_tool_panel(lang):
    from sagas.tool.intents_tool import intents_tool
    import sagas

    item = intents_tool.get_chapters()
    chapter = st.selectbox('which chapter to modify', item['chapters'])
    field = f'lang_{lang}' if lang != 'en' else 'text'
    text_list = get_records(lang, chapter, field)
    opts = [t[0] for t in text_list if t[2] == '']

    # intent modify function only available when the lang==en
    if lang == 'en' and len(opts) > 0:
        sents = st.selectbox('which sentence to modify', opts)
        entry = next(t for t in text_list if t[0] == sents)
        st.markdown(f"{entry[0]} `{entry[2]}`")

        sel_intents = st.multiselect('choose or input an intent',
                                     list(get_all_intents()))
        st.write(sel_intents)
        text_intent = st.text_input(
            "intent", sel_intents[0] if len(sel_intents) > 0 else '')
        if text_intent.strip() != '':
            # sel_intents.append(text_intent)
            target_intent = text_intent.strip()
        elif len(sel_intents) > 0:
            target_intent = sel_intents[0]
        else:
            target_intent = None

        if target_intent is not None:
            if st.button("store"):
                st.write(f'.. store {target_intent}')
                intents_tool.set_intent_by_text(sents, target_intent)

                # refresh list
                text_list = get_records(lang, chapter, field)

    # for entry in text_list:
    #     st.markdown(f"{entry[0]} `{entry[1]}`")
    st.table(
        sagas.to_df(text_list,
                    columns=[
                        f'text_{lang}',
                        'text_en' if lang != 'en' else 'location', 'intent'
                    ]))
Example #12
def list_rs(rs, lang):
    # from IPython.display import display
    from termcolor import colored
    tc.emp('cyan', f"✁ chunks. {'-' * 25}")
    for serial, r in enumerate(rs):
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s/%s(%s)" % (r['head_word'], r['head'], r['head_pos'])
        else:
            cla = '_'
        tc.info(serial_numbers[serial], '%s(%s)' % (r['type'], r['lemma']),
                cla)
        # sagas.print_df(df)
        tc.dfs(trunc_cols(df))
        print_stem_chunks(r)
        list_synsets(r, lang)
Example #13
def universal_viz(intp, sents):
    from sagas.nlu.uni_parser import get_chunks
    from sagas.tool.misc import print_stem_chunks
    import sagas

    doc = intp(sents)
    doc.build_dependencies()
    # print(doc.dependencies)
    rs = get_chunks(doc)
    # print(rs)
    for r in rs:
        df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        tc.info('%s(%s)' % (r['type'], r['lemma']))
        tc.dfs(df)
        # display(df)
        print_stem_chunks(r)

    cv = EnhancedViz(shape='egg', size='8,5', fontsize=20)
    return cv.analyse_doc(doc, None)
Example #14
    def nltk_locales(self):
        """
        $ python -m sagas.nlu.locales nltk_locales
        :return:
        """
        from nltk.corpus import wordnet as wn
        from iso639 import languages
        import sagas
        langs = wn.langs()
        print(len(langs), sorted(langs))
        rs = []
        excepts = ['qcn']
        for lang in langs:
            if lang not in excepts:
                loc = languages.get(part3=lang)
                rs.append((loc.part3, loc.macro, loc.name))

        df = sagas.to_df(rs, ['code', 'macro', 'name'])
        sagas.print_df(df)
Example #15
    def treebanks_df(self):
        """
        $ python -m sagas.nlu.treebanks treebanks_df to-string
        :return:
        """
        import sagas

        # prepare the languages table
        langs = []
        lang_tab = []
        for bank in treebanks_defs[1:]:
            parts = bank.split('\t')
            if parts[0].strip() != '':
                langs.append(parts[0])
            lang_tab.append(parts)

        cols = treebanks_defs[0].split('\t')
        df = sagas.to_df(lang_tab, cols)
        return df
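treebanks_defs is not shown; a plausible stand-in, assumed from the parsing above (tab-separated lines, header first, empty first column for continuation rows):

# hypothetical contents of treebanks_defs; names and word counts are made up
treebanks_defs = [
    'Language\tTreebank\tWords',
    'English\tEWT\t254830',
    '\tGUM\t113441',   # continuation row: same language as the row above
]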
Example #16
 def extract_df(self, text):
     """
     extract_df('Россия, Вологодская обл. г. Череповец, пр.Победы 93 б')
     :param text:
     :return:
     """
     import sagas
     result = []
     for extractor in self.extractors:
         matches = extractor(text)
         if len(matches) > 0:
             ex_name = type(extractor).__name__.replace('Extractor',
                                                        '').lower()
             for match in matches:
                 start, stop = match.span
                 result.append(
                     (text[start:stop], start, stop, ex_name, match.fact))
     return sagas.to_df(result,
                        ['word', 'start', 'stop', 'extractor', 'fact'])
Example #17
    def list_chunk_entities(self, sents, lang='en'):
        """
        $ python -m sagas.nlu.chunk_entities list_chunk_entities 'Apple is looking at buying U.K. startup for $1 billion.'
        $ python -m sagas.nlu.chunk_entities list_chunk_entities "Where's the president?"
        $ python -m sagas.nlu.chunk_entities list_chunk_entities "διαμένω στη Νέα Υόρκη" el

        :param sents:
        :return:
        """
        import sagas

        doc = self.core_nlp(sents)
        doc_s = doc.sentences[0]

        tokens = tokenize(sents, doc_s)
        for tok in tokens:
            print(tok.index, '\t', tok.word, tok.word_offset, tok.positions)
        ent_pos = self.entity_positions(sents, lang)
        print(ent_pos)
        # process spans and overlaps
        chunks = []
        r = self.get_verb_domain(doc.sentences[0])
        # r = self.get_chunks(doc.sentences[0])
        if len(r) > 0:
            for el in r[0]['domains']:
                span_id = el[0]
                span_pos = el[4]
                start_mark = tokens[span_pos[0] - 1]
                end_mark = tokens[span_pos[-1] - 1]
                word_range = [
                    start_mark.positions['start'], end_mark.positions['end']
                ]
                entities = get_included_entities(word_range, ent_pos)
                chunks.append((span_id, span_pos, word_range,
                               sents[word_range[0]:word_range[1]],
                               [ent['entity'] for ent in entities]))
            df = sagas.to_df(
                chunks,
                ['rel', 'positions', 'range', 'chunk text', 'entities'])
            sagas.print_df(df[['rel', 'chunk text', 'entities']])
        else:
            # print("no chunks.")
            print("no verbs.")
Example #18
def parse_sentence(sentence, filters):
    import sagas
    words = ltp.segmentor.segment(sentence)
    postags = ltp.postagger.postag(words)
    arcs = ltp.parser.parse(words, postags)
    roles = ltp.labeller.label(words, postags, arcs)
    netags = ltp.recognizer.recognize(words, postags)

    root = ''
    root_idx = 0
    collector = []
    verbs = []
    for i in range(len(words)):
        rel = arcs[i].relation
        pos = postags[i]
        if rel == 'HED':
            root = words[i]
            root_idx = i
        if pos == 'v':
            verbs.append(words[i])

    # print('root', root, root_idx)
    collector.append(('root', root))

    rs = []
    for i in range(len(words)):
        print("%s --> %s|%s|%s|%s" % (words[int(arcs[i].head) - 1], words[i], \
                                      arcs[i].relation, postags[i], netags[i]))
        pos = postags[i]
        dep_idx = int(arcs[i].head) - 1
        head = words[dep_idx]
        rel = arcs[i].relation
        rs.append((head, words[i], \
                   rel, pos, netags[i]))
        if dep_idx == root_idx and in_filters(rel, filters):
            collector.append((rel.lower(), words[i]))

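    # column labels (Chinese): arc head, dependent, dependency relation, POS, named entity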
    df = sagas.to_df(rs, ['弧头', '弧尾', '依存关系', '词性', '命名实体'])
    return df, collector, verbs
Example #19
            def do_infers(ctx: Context, ds, filters):
                from sagas.nlu.inspectors_dataset import get_interrogative
                if 'head' in r:
                    # $ se 'you are dead'  # true
                    # $ spt 'Com licença, onde é o banheiro?'  # false
                    logger.debug(
                        f"head: {r['head']}, filter: {'head' in filters}")
                    rep = get_interrogative(r['head'], self.lang)
                    if rep:
                        pats.append((5, f"interr_root('{rep}')"))

                df = sagas.to_df(
                    r['domains'],
                    ['rel', 'index', 'text', 'lemma', 'children', 'features'])
                pat = self.proc_word(type_name, r['word'],
                                     r['head'] if 'head' in r else '',
                                     r['index'], r, self.lang)
                pat['rels'] = [sub[0] for sub in r['domains']]
                pat['stems'] = self.stem_chunks(r)
                pat['ctx'] = ctx

                domain = DomainToken(**pat)
                logger.debug(
                    f".. proc word {r['word']}, "
                    f"verb in filter ({'[verb]' in filters}), "
                    f"predicate in filter ({'[predicate]' in filters}), "
                    f"stems: {domain.stems}")
                if '[verb]' not in filters and '[predicate]' not in filters:
                    self.induce_domain_from_exts(domain, 'verb', pats)

                pat_r = self.induce_pattern(domain, ds, verbose)
                parts = self.proc_children_column(df, self.lang)
                for part in parts:
                    # logger.debug(f"{part.name}: {part.word}")
                    if part.name not in filters:
                        part.domain = domain
                        self.induce_part(part, pats, type_name, verbose)
                # display_synsets(f"[{theme}]", meta, r, data['lang'])
                return pat_r
Example #20
    def disp_by_offset(self, lang, offset, pos='n'):
        """
        $ python -m sagas.nlu.omw_extended disp_by_offset ru 9918554
        $ python -m sagas.nlu.omw_extended disp_by_offset de 9918554
        :param offset:
        :return:
        """
        import sagas

        offset = str(offset)
        id = '%s-%s' % (offset.zfill(8), pos)
        rs = []
        print('search for', id)
        if lang in langsets:
            data = self.load_dicts(lang)
            for row in data:
                if row[0] == id:
                    rs.append((row[0], row[2]))
            df = sagas.to_df(rs, ['id', 'word'])
            sagas.print_df(df)
        else:
            print('no data.')
Example #21
    def verb_domains(self, sents, lang='en'):
        """
        $ python -m sagas.nlu.uni_parser verb_domains "Barack Obama was born in Hawaii." en
        # 我有一只阿比西尼亚猫 ("I have an Abyssinian cat.")
        $ python -m sagas.nlu.uni_parser verb_domains "I have an Abyssinian cat." en

        $ python -m sagas.nlu.uni_parser verb_domains 'Что ты обычно ешь на ужин?' ru
        $ python -m sagas.nlu.uni_parser verb_domains 'Die Zeitschrift erscheint monatlich.' de

        # Test multiple verbs (verbs in subordinate clauses are filtered out):
        $ python -m sagas.nlu.uni_parser verb_domains 'Tu as choisi laquelle tu vas manger ?' fr
        :param sents:
        :param lang:
        :return:
        """
        from sagas.nlu.corenlp_helper import get_nlp
        serial_numbers = '❶❷❸❹❺❻❼❽❾❿'
        nlp = get_nlp(lang)
        doc = nlp(sents)
        # Analyze the dependency relations bottom-up; this can be used to collect
        # the child nodes of given relations, e.g. 'nsubj:pass' and 'obl' here.
        # word.governor is the parent of the current word.
        sent = doc.sentences[0]
        rs = get_verb_domain(sent)
        # r=rs[0]
        for num, r in enumerate(rs):
            # print(json.dumps(r, indent=2, ensure_ascii=False))
            print(serial_numbers[num], '-' * 50)
            # print(r['verb'], r['index'])
            print(r['word'], r['index'])
            # df=sagas.to_df(r[0]['domains'], ['rel', 'index', 'text', 'children'])
            df = sagas.to_df(
                r['domains'],
                ['rel', 'index', 'text', 'lemma', 'children', 'features'])
            sagas.print_df(df)
            for stem in r['stems']:
                if stem[0] == 'obj':
                    print('object ->', ' '.join(stem[1]))
Example #22
 def all_voices(self, lang=None):
     """
     $ python -m sagas.nlu.nlu_tools all_voices
     $ nlu all-voices ru
     :return:
     """
     import pyttsx3
     import sagas
     engine = pyttsx3.init()
     voices: collections.Iterable = engine.getProperty('voices')
     rs=[]
     for voice in voices:
         if lang is not None:
             if voice.languages[0].startswith(lang):
                 print(voice)
         else:
             print(voice, voice.id, voice.languages[0])
             rs.append((voice.id.replace('com.apple.speech.synthesis.',''),
                        voice.name,
                        voice.languages,
                        voice.gender
                        ))
     rs=sorted(rs, key=lambda el: el[2][0])
     sagas.print_df(sagas.to_df(rs, ['id', 'name', 'lang', 'gender']))
Example #23
def get_role_defs():
    desc_rs = []
    for desc in descs.split('\n')[1:]:
        desc_rs.append(desc.split('\t'))
    return sagas.to_df(desc_rs, ['mark', 'description'])
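descs is not shown; a plausible stand-in, assumed from the parsing above (newline-separated, tab-delimited, first line skipped as a header):

# hypothetical contents of descs; marks follow common semantic-role labels
descs = """mark\tdescription
A0\tagent of the predicate
A1\tpatient of the predicate
TMP\ttemporal modifier"""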
Example #24
 def entities_df(self, doc):
     import sagas
     rs = []
     for ent in doc.ents:
         rs.append((ent.text, ent.start_char, ent.end_char, ent.label_))
     return sagas.to_df(rs, ['word', 'start', 'end', 'entity'])
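The doc argument matches spaCy's Doc interface (ents with start_char/end_char/label_); a standalone sketch of the same idea, assuming the en_core_web_sm model is installed:

import spacy
import sagas

nlp = spacy.load('en_core_web_sm')
doc = nlp('Ada Lovelace was born in London')
rs = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(sagas.to_df(rs, ['word', 'start', 'end', 'entity']))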
Example #25
    def analyse(self, sentence, verbose=False, show_roles=True):
        """
        $ python -m sagas.zh.ltp_procs analyse '中国进出口银行与中国银行加强合作。'
        $ python -m sagas.zh.ltp_procs analyse '国务院总理李克强调研上海外高桥时提出,支持上海积极探索新机制。'
        $ ltp '我是学生'
        :param sentence:
        :return:
        """
        from colorama import Fore, Back, Style
        from tabulate import tabulate
        import sagas

        pd.set_option('display.unicode.ambiguous_as_wide', True)
        pd.set_option('display.unicode.east_asian_width', True)

        # words = self.segmentor.segment(sentence)
        # postags = self.postagger.postag(words)
        # arcs = self.parser.parse(words, postags)
        # roles = self.labeller.label(words, postags, arcs)
        # netags = self.recognizer.recognize(words, postags)
        words, postags, arcs, roles, netags = self.parse(sentence)

        # roles
        print('❶ roles for', " ".join(words))
        # roles = self.labeller.label(words, postags, arcs)

        predicts = extract_predicates(words,
                                      roles,
                                      postags,
                                      arcs,
                                      verbose=False)
        print(predicts)

        # dep-parse
        if not show_roles:
            print('❷ dep parse')
            rs = []
            for i in range(len(words)):
                if verbose:
                    print("%s --> %s|%s|%s|%s" % (words[int(arcs[i].head) - 1], words[i], \
                                                  arcs[i].relation, postags[i], netags[i]))
                rs.append((words[int(arcs[i].head) - 1], words[i], \
                           arcs[i].relation, postags[i], netags[i]))
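            # column labels (Chinese): arc head, dependent, dependency relation, POS, named entity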
            df = sagas.to_df(rs, ['弧头', '弧尾', '依存关系', '词性', '命名实体'])
            df['命名实体'] = Fore.RED + Style.BRIGHT + df['命名实体'].astype(
                str) + Style.RESET_ALL
            # print(df)
            print(tabulate(df, headers='keys', tablefmt='psql'))

        else:
            print('❷ tokens')

            # arrange roles to a column
            # https://github.com/HIT-SCIR/pyltp/issues/152
            srl_as_tag_matrix = [['*'] * len(roles) for _ in sentence]
            for predicate_id, role in enumerate(roles):
                srl_as_tag_matrix[role.index][predicate_id] = '(V*)'
                for arg in role.arguments:
                    srl_as_tag_matrix[
                        arg.range.start][predicate_id] = '(%s*' % arg.name
                    srl_as_tag_matrix[arg.range.end][predicate_id] += ')'

            rs = []
            for id, (word, pos, arc, ne, role) in enumerate(
                    zip(words, postags, arcs, netags, srl_as_tag_matrix)):
                rs.append(([
                    str(id), word, pos,
                    str(arc.head), arc.relation, ne, ', '.join(role)
                ]))
            sagas.print_rs(rs,
                           ['id', 'word', 'pos', 'head', 'rel', 'ne', 'role'])
Example #26
default_labels = ["Dutch", "Persian", "Afrikaans"]
langs = st.sidebar.multiselect("Available langs", list(all_labels.keys()),
                               default_labels)
sel_langs = {all_labels[l] for l in langs}


def is_sel(f):
    for l in sel_langs:
        if f"_{l}_" in f:
            return True
    return False


corpus = [f for f in glob.glob('*.txt') if is_sel(f)]
df = sagas.to_df(corpus, ['file'])

option = st.sidebar.selectbox('Which corpus do you choose?', df['file'])

cur_lang = option[3:5]
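# Streamlit "magic": the bare tuple on the next line is rendered in the app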
'Current corpus:', option, f", language code: {cur_lang}", f", available lang: {langs}"

# text_raw=''''''.split('►')
text_raw = io_utils.read_file(option).split('►')

rows = []
for t in text_raw:
    # st.write(t)
    rows.append([
        l for l in t.split('\n')
        if l.strip() != '' and not l.startswith('#') and not l.startswith('⊕')
    ])
Example #27
def get_dep_defs():
    def_rs = []
    for dep in dep_defs:
        def_rs.append(dep.split('\t'))
    return sagas.to_df(def_rs, ['type', 'tag', 'desc', 'example'])
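dep_defs is not shown; a plausible stand-in, assumed from the four columns above (tab-separated type, tag, description, example):

# hypothetical entries for dep_defs; tags are standard UD relation names
dep_defs = [
    'core\tnsubj\tnominal subject\tthe dog barks -> dog',
    'core\tobj\tdirect object\tread the book -> book',
]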
Example #28
    def analyse_doc(self, sentence, node_maps=None, console=True):
        from sagas.nlu.uni_intf import sub_comps
        import unicodedata

        segs = []
        # omit {word.feats}
        if console:
            tc.info(*[f'index: {word.index}\ttext: {word.text+" "}\tlemma: {word.lemma}\tupos: {word.upos}\txpos: {word.xpos}' for word in sentence.words], sep='\n')
        else:
            # from IPython.display import display
            import sagas
            df=sagas.to_df([(word.index, word.text, word.lemma, word.upos, word.xpos, word.entity) for word in sentence.words],
                           ['index', 'text', 'lemma', 'upos', 'xpos', 'entity'])
            tc.dfs(df)

        def translit_chunk(chunk:str, lang):
            from sagas.nlu.transliterations import translits
            # if upos=='PUNCT':
            #     return chunk
            if chunk.strip() in (',','.',';','?','!'):
                return chunk
            # if lang in ('ko', 'ja', 'fa', 'hi', 'ar'):
            if translits.is_available_lang(lang):
                if sa_env.runtime!='default':
                    return word.text+'\n'+translits.translit(chunk, lang)
                return translits.translit(chunk, lang)
            return chunk

        if node_maps is None:
            node_maps = {}
            for word in sentence.words:
                pos_attrs=f"({word.upos.lower()}, {word.xpos.lower()})"
                node_text=word.text if self.translit_lang is None or word.upos=='PUNCT' \
                    else translit_chunk(word.text, self.translit_lang)
                # node_text=unicodedata.normalize('NFKC', node_text) if word.upos=='PUNCT' else node_text
                norm=lambda t: unicodedata.normalize('NFKC', t).encode('ascii', 'ignore').decode("utf-8")
                node_text = norm(node_text) if word.upos == 'PUNCT' else node_text
                if node_text=='':
                    node_text='_'
                # verbose
                if word.text!=node_text:
                    print('# ', f"{word.text} -> {node_text}")
                node_maps[word.text] = node_text if not self.enable_node_pos else f"{node_text}\\n{pos_attrs}"

                # self.f.attr(color='black')
        prop_sets = {'VERB': lambda f: f.attr('node', style='filled', color='lightgrey'),
                     'PRON': lambda f: f.attr('node', style='dashed', color='red'),
                     'AUX': lambda f: f.attr('node', style='dashed', color='green'),
                     'NOUN': lambda f: f.attr('node', style='solid', color='blue'),
                     }
        # sentence = doc.sentences[0]
        for word in sentence.words:
            rel = word.dependency_relation
            if rel in sub_comps:
                if word.upos == 'VERB':
                    self.f.attr('node', style='filled', color='antiquewhite')
                elif word.upos in prop_sets:
                    prop_sets[word.upos](self.f)
                else:
                    self.default_node()

            # for all languages
            elif rel.endswith('comp'):
                self.f.attr('node', style='filled', color='antiquewhite')
            elif word.upos in prop_sets:
                prop_sets[word.upos](self.f)
            else:
                self.default_node()

            head = ''
            if word.governor == 0:
                head = '_root_'
            else:
                head_word = sentence.words[word.governor - 1]
                head = head_word.text
            # print(f"{word.text} -> {rel}, {word.governor}, {head}")
            self.f.node(node_maps[word.text])
            segs.append(node_maps[word.text])

        # self.f.node_attr.update(color='black')
        self.default_node()
        self.print_dependencies(sentence, segs, node_maps)
        return self.f
Example #29
def rs_represent(rs: List[Any], data: Dict[Text, Any], return_df=False):
    import sagas
    from sagas.nlu.rules import verb_patterns, aux_patterns, subj_patterns, predict_patterns
    from sagas.nlu.rules_lang_spec import langspecs
    from sagas.nlu.nlu_cli import NluCli
    from sagas.nlu.sinkers import Sinkers

    df_set = []
    result = []
    sinkers = Sinkers(data, rs[0]['type'])
    for serial, r in enumerate(rs):
        type_name = r['type']
        meta = build_meta(r, data)
        if type_name == 'verb_domains':
            theme = '[verb]'
            tc.info(
                serial_numbers[serial],
                theme,
                # r['lemma'], r['index'],
                f"{r['word']}/{r['lemma']}, pos: {r['upos']}/{r['xpos']}, idx: {r['index']}",
                '(%s, %s)' % (r['rel'], r['governor']))
            # meta = {'rel': r['rel'], **common, **data}
            verb_patterns(meta, r['domains'])
        elif type_name == 'aux_domains':
            theme = '[aux]'
            # 'rel': word.dependency_relation, 'governor': word.governor, 'head': dc.text
            delegator = '☇' if not r['delegator'] else '☌'
            tc.info(serial_numbers[serial], theme, r['lemma'], r['rel'],
                    delegator, "%s(%s)" % (r['head'], r['head_pos']))
            # verb_patterns(r['domains'])
            # meta = {'pos': r['head_pos'], 'head': r['head'], **common, **data}
            aux_patterns(meta, r['domains'])
        elif type_name == 'subj_domains':
            theme = '[subj]'
            tc.info(serial_numbers[serial], theme, r['lemma'], r['rel'], '☇',
                    f"{r['head']}")
            # verb_patterns(r['domains'])
            # meta = {'pos': r['head_pos'], 'head': r['head'], **common, **data}
            subj_patterns(meta, r['domains'])
        elif type_name == 'predicate':
            theme = '[predicates]'
            tc.info(serial_numbers[serial], theme,
                    f"{r['lemma']} ({r['phonetic']}, {r['word']})")
            # meta = {'rel': r['rel'], **common, **data}
            predict_patterns(meta, r['domains'])
        elif type_name == 'root_domains':
            theme = '[root]'
            tc.info(
                serial_numbers[serial], theme,
                f"{r['word']}/{r['lemma']}, pos: {r['upos']}/{r['xpos']}, idx: {r['index']}",
                '(%s, %s)' % (r['rel'], r['governor']))
            # meta = {'rel': r['rel'], **common, **data}
            # verb_patterns(meta, r['domains'])
            # check_langspec(data['lang'], meta, r['domains'], type_name)
        else:
            # meta = {}
            raise Exception(
                'Cannot process specific type: {}'.format(type_name))

        # process language special rules
        logger.debug(f"meta keys {meta.keys()}")
        mod_rs = langspecs.check_langspec(data['lang'], meta, r['domains'],
                                          type_name)
        sinkers.add_module_results(mod_rs)

        # df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'children', 'features'])
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        df_set.append(df)

        if not return_df:

            result.extend(
                proc_word(type_name, r['word'],
                          r['head_word'] if 'head_word' in r else '',
                          data['lang']))
            result.extend(
                proc_children_column(df['rel'], df['children'], data['lang']))

            # print('.......')
            # where 1 is the axis number (0 for rows and 1 for columns.)
            # df = df.drop('children', 1)
            # df['children'] = df['children'].apply(lambda x: ', '.join(x)[:15] + "..")
            # df['features'] = df['features'].apply(lambda x: ', '.join(x)[:15] + "..")
            trunc_cols(df)
            tc.dfs(df)
            print_stem_chunks(r)

            if print_def:
                NluCli().get_word_def(r['lemma'], data['lang'])
            if print_synsets:
                r = display_synsets(theme, meta, r, data['lang'])
                result.extend(r)

    sinkers.process_with_sinkers()
    return result, df_set