def result_df(rs):
    from sagas.conf.conf import cf
    import sagas

    # print_not_matched=cf.is_enabled('print_not_matched')
    print_not_matched = True
    recs = []
    for r in rs:
        if not print_not_matched and not r[1]:
            pass
        else:
            recs.append(('✔' if r[1] else '✖', r[0]))
    return sagas.to_df(recs, ['match', 'options'])
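# A minimal usage sketch for result_df (hypothetical data): each item in rs is
# assumed to be an (option, matched) pair, as consumed above.
#
#   rs = [('option-a', True), ('option-b', False)]
#   sagas.print_df(result_df(rs))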
def simple():
    from pymongo import MongoClient
    import sagas

    # uri = 'mongodb://samlet.com/langs'
    uri = 'mongodb://localhost/langs'
    # uri = 'mongodb://192.168.0.101/langs'
    client = MongoClient(uri)
    db = client.get_default_database()
    print(db.name)
    rs = []
    for r in db.trans.find({'source': 'id'}):
        rs.append((r['text'], r['target']))
    sagas.print_df(sagas.to_df(rs, ['text', 'target']))
def translations_df(trans, idx=0):
    import sagas

    # fill with default field names
    size = len(trans[idx][2][0])
    # print('size', size)
    listOfStrings = ['ext_' + str(i) for i in range(size)]
    if size == 2:
        listOfStrings[0:2] = ['word', 'translations']
    elif size == 4:
        listOfStrings[0:4] = ['word', 'translations', 'c', 'freq']
    else:
        listOfStrings[0:size] = ['word', 'translations',
                                 *[f'ex_{i}' for i in range(size - 2)]]
    return sagas.to_df(trans[idx][2], listOfStrings)
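# Usage sketch for translations_df (hypothetical data): the function only reads
# trans[idx][2], which is assumed to hold the rows, each row starting with
# (word, translations, ...).
#
#   trans = [('gato', 'es', [('gato', 'cat'), ('gata', 'female cat')])]
#   df = translations_df(trans)   # yields columns ['word', 'translations']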
def word_info_df(self, word):
    import sagas
    import json_utils

    rev_map = json_utils.read_json_file(self.target_file_rev)
    words_map = json_utils.read_json_file(self.target_file)
    tuples = []
    ipa_xs = self.epi.xsampa_list(word)
    ipa = self.epi.transliterate(word)
    key = rev_map[word]
    tuples.append((word, ipa, ''.join(ipa_xs), key))
    for w in words_map[key]:
        ipa_xs = self.epi.xsampa_list(w)
        ipa = self.epi.transliterate(w)
        tuples.append((w, ipa, ''.join(ipa_xs), key))
    return sagas.to_df(tuples, ['word', 'ipa', 'xsampa_list', 'key'])
def list_contrast(rs, lang):
    result = []
    for serial, r in enumerate(rs):
        type_name = r['type']
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        rec = {
            'type': type_name,
            'word': r['word'],
            'head': r['head'] if 'head' in r else ''
        }
        rec['domains'] = make_map(df['rel'], df['children'])
        rec['synsets'] = list_synsets(r, lang, True)
        result.append(rec)
    return result
def print_table(self, rs):
    import sagas
    # from IPython.display import display

    # df_set=[]
    for r in rs:
        for k, v in r.items():
            if k != 'domains':
                logging.debug('%s=%s' % (k, v))
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        # df_set.append(df)
        # if console:
        #     sagas.print_df(df)
        # else:
        #     display(df)
        tc.dfs(df)
def rs_summary(rs, console=True):
    from sagas.tool.misc import print_stem_chunks
    from IPython.display import display
    import sagas

    for serial, r in enumerate(rs):
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s(%s)" % (r['head'], r['head_pos'])
        else:
            cla = '_'
        print('%s(%s)' % (r['type'], r['lemma']), cla)
        # sagas.print_df(df)
        if not console:
            display(df)
        else:
            sagas.print_df(df)
        print_stem_chunks(r)
def train(self, lang_col, corpus_file, model_file):
    """
    $ python -m sagas.corpus.index_trainer train id '~/pi/ai/seq2seq/ind-eng/ind.txt' '~/pi/data/bert/embedded_id.pkl'

    :param corpus_file:
    :return:
    """
    from sagas.train.parallel_corpus import load_corpus, take_samples
    import sagas

    items = load_corpus(corpus_file)
    corpus_df = sagas.to_df(items, ['en', lang_col])
    print('.. head rows')
    print(corpus_df.head())
    searcher = CorpusSearcher(model_file=model_file)
    print(f'.. training {corpus_file}')
    searcher.train(corpus_df, 'en')
    print('done.')
def choose_lang_and_corpus():
    language = st.sidebar.selectbox(
        'Which language do you choose?',
        list(all_labels.keys()))
    cur_lang = all_labels[language]
    corpus = [f for f in glob.glob(f'{corpus_prefix}/*_{cur_lang}_*.txt')]
    df = sagas.to_df(corpus, ['file'])
    cur_file = st.sidebar.selectbox(
        'Which corpus do you choose?',
        df['file'],
        format_func=lambda k: k.replace(corpus_prefix + '/', '').replace('.txt', ''),
    )
    st.sidebar.text(f"Current: {cur_lang}, {cur_file}")
    return cur_lang, cur_file
def ents(self, sents, lang='en', simple=True):
    """
    $ python -m sagas.nlu.spacy_procs ents 'New York'
    $ python -m sagas.nlu.spacy_procs ents 'I am from China'
    $ python -m sagas.nlu.spacy_procs ents "Ada Lovelace was born in London"

    :param sents:
    :param lang:
    :return:
    """
    import sagas

    rs = []
    doc = self.spacy_doc(sents, lang, simple=simple)
    for ent in doc.ents:
        rs.append((ent.text, ent.start_char, ent.end_char, ent.label_, ent.kb_id_))
    r = sagas.to_df(rs, ['word', 'start', 'end', 'entity', 'kb'])
    sagas.print_df(r)
def intents_tool_panel(lang):
    from sagas.tool.intents_tool import intents_tool
    import sagas

    item = intents_tool.get_chapters()
    chapter = st.selectbox('which chapter to modify', item['chapters'])
    field = f'lang_{lang}' if lang != 'en' else 'text'
    text_list = get_records(lang, chapter, field)
    opts = [t[0] for t in text_list if t[2] == '']
    # the intent-modify function is only available when lang == 'en'
    if lang == 'en' and len(opts) > 0:
        sents = st.selectbox('which sentence to modify', opts)
        entry = next(t for t in text_list if t[0] == sents)
        st.markdown(f"{entry[0]} `{entry[2]}`")
        sel_intents = st.multiselect('choose or input an intent', list(get_all_intents()))
        st.write(sel_intents)
        text_intent = st.text_input(
            "intent", sel_intents[0] if len(sel_intents) > 0 else '')
        if text_intent.strip() != '':
            # sel_intents.append(text_intent)
            target_intent = text_intent.strip()
        elif len(sel_intents) > 0:
            target_intent = sel_intents[0]
        else:
            target_intent = None
        if target_intent is not None:
            if st.button("store"):
                st.write(f'.. store {target_intent}')
                intents_tool.set_intent_by_text(sents, target_intent)
                # refresh list
                text_list = get_records(lang, chapter, field)
    # for entry in text_list:
    #     st.markdown(f"{entry[0]} `{entry[1]}`")
    st.table(
        sagas.to_df(text_list,
                    columns=[
                        f'text_{lang}',
                        'text_en' if lang != 'en' else 'location',
                        'intent'
                    ]))
def list_rs(rs, lang):
    # from IPython.display import display
    from termcolor import colored

    tc.emp('cyan', f"✁ chunks. {'-' * 25}")
    for serial, r in enumerate(rs):
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        if 'head' in r:
            cla = "%s/%s(%s)" % (r['head_word'], r['head'], r['head_pos'])
        else:
            cla = '_'
        tc.info(serial_numbers[serial], '%s(%s)' % (r['type'], r['lemma']), cla)
        # sagas.print_df(df)
        tc.dfs(trunc_cols(df))
        print_stem_chunks(r)
        list_synsets(r, lang)
def universal_viz(intp, sents):
    from sagas.nlu.uni_parser import get_chunks
    from sagas.tool.misc import print_stem_chunks
    import sagas

    doc = intp(sents)
    doc.build_dependencies()
    # print(doc.dependencies)
    rs = get_chunks(doc)
    # print(rs)
    for r in rs:
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        tc.info('%s(%s)' % (r['type'], r['lemma']))
        tc.dfs(df)
        # display(df)
        print_stem_chunks(r)
    cv = EnhancedViz(shape='egg', size='8,5', fontsize=20)
    return cv.analyse_doc(doc, None)
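# Hedged usage sketch for universal_viz: any callable that maps a sentence to a
# parsed doc works as `intp`; using get_nlp here mirrors the helper used in
# verb_domains further below and is only illustrative.
#
#   from sagas.nlu.corenlp_helper import get_nlp
#   f = universal_viz(get_nlp('en'), 'Barack Obama was born in Hawaii.')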
def nltk_locales(self):
    """
    $ python -m sagas.nlu.locales nltk_locales

    :return:
    """
    from nltk.corpus import wordnet as wn
    from iso639 import languages
    import sagas

    langs = wn.langs()
    print(len(langs), sorted(langs))
    rs = []
    excepts = ['qcn']
    for lang in langs:
        if lang not in excepts:
            loc = languages.get(part3=lang)
            rs.append((loc.part3, loc.macro, loc.name))
    df = sagas.to_df(rs, ['code', 'macro', 'name'])
    sagas.print_df(df)
def treebanks_df(self):
    """
    $ python -m sagas.nlu.treebanks treebanks_df to-string

    :return:
    """
    import sagas

    # prepare the languages table
    langs = []
    lang_tab = []
    for bank in treebanks_defs[1:]:
        parts = bank.split('\t')
        if parts[0].strip() != '':
            langs.append(parts[0])
            lang_tab.append(parts)
    cols = treebanks_defs[0].split('\t')
    df = sagas.to_df(lang_tab, cols)
    return df
def extract_df(self, text):
    """
    extract_df('Россия, Вологодская обл. г. Череповец, пр.Победы 93 б')

    :param text:
    :return:
    """
    import sagas

    result = []
    for extractor in self.extractors:
        matches = extractor(text)
        if len(matches) > 0:
            ex_name = type(extractor).__name__.replace('Extractor', '').lower()
            for match in matches:
                start, stop = match.span
                result.append(
                    (text[start:stop], start, stop, ex_name, match.fact))
    return sagas.to_df(result, ['word', 'start', 'stop', 'extractor', 'fact'])
def list_chunk_entities(self, sents, lang='en'):
    """
    $ python -m sagas.nlu.chunk_entities list_chunk_entities 'Apple is looking at buying U.K. startup for $1 billion.'
    $ python -m sagas.nlu.chunk_entities list_chunk_entities "Where's the president?"
    $ python -m sagas.nlu.chunk_entities list_chunk_entities "διαμένω στη Νέα Υόρκη" el

    :param sents:
    :return:
    """
    import sagas

    doc = self.core_nlp(sents)
    doc_s = doc.sentences[0]
    tokens = tokenize(sents, doc_s)
    for tok in tokens:
        print(tok.index, '\t', tok.word, tok.word_offset, tok.positions)
    ent_pos = self.entity_positions(sents, lang)
    print(ent_pos)

    # process spans and overlaps
    chunks = []
    r = self.get_verb_domain(doc.sentences[0])
    # r = self.get_chunks(doc.sentences[0])
    if len(r) > 0:
        for el in r[0]['domains']:
            span_id = el[0]
            span_pos = el[4]
            start_mark = tokens[span_pos[0] - 1]
            end_mark = tokens[span_pos[-1] - 1]
            word_range = [
                start_mark.positions['start'],
                end_mark.positions['end']
            ]
            entities = get_included_entities(word_range, ent_pos)
            chunks.append((span_id, span_pos, word_range,
                           sents[word_range[0]:word_range[1]],
                           [ent['entity'] for ent in entities]))
        df = sagas.to_df(
            chunks, ['rel', 'positions', 'range', 'chunk text', 'entities'])
        sagas.print_df(df[['rel', 'chunk text', 'entities']])
    else:
        # print("no chunks.")
        print("no verbs.")
def parse_sentence(sentence, filters):
    import sagas

    words = ltp.segmentor.segment(sentence)
    postags = ltp.postagger.postag(words)
    arcs = ltp.parser.parse(words, postags)
    roles = ltp.labeller.label(words, postags, arcs)
    netags = ltp.recognizer.recognize(words, postags)

    root = ''
    root_idx = 0
    collector = []
    verbs = []
    for i in range(len(words)):
        rel = arcs[i].relation
        pos = postags[i]
        if rel == 'HED':
            root = words[i]
            root_idx = i
        if pos == 'v':
            verbs.append(words[i])
    # print('root', root, root_idx)
    collector.append(('root', root))

    rs = []
    for i in range(len(words)):
        print("%s --> %s|%s|%s|%s" % (words[int(arcs[i].head) - 1], words[i],
                                      arcs[i].relation, postags[i], netags[i]))
        pos = postags[i]
        dep_idx = int(arcs[i].head) - 1
        head = words[dep_idx]
        rel = arcs[i].relation
        rs.append((head, words[i], rel, pos, netags[i]))
        if dep_idx == root_idx and in_filters(rel, filters):
            collector.append((rel.lower(), words[i]))
    # columns: arc head, dependent, dependency relation, POS, named entity
    df = sagas.to_df(rs, ['弧头', '弧尾', '依存关系', '词性', '命名实体'])
    return df, collector, verbs
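# Hedged usage sketch: parse_sentence relies on a module-level `ltp` holder with
# initialized pyltp components and on the in_filters helper; the filter values
# below are illustrative LTP dependency relations only.
#
#   df, collector, verbs = parse_sentence('我是学生', ['SBV', 'VOB'])
#   print(collector, verbs)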
def do_infers(ctx: Context, ds, filters):
    # note: this closure relies on names bound in the enclosing scope
    # (self, r, pats, type_name, verbose).
    from sagas.nlu.inspectors_dataset import get_interrogative

    if 'head' in r:
        # $ se 'you are dead'                       # true
        # $ spt 'Com licença, onde é o banheiro?'   # false
        logger.debug(f"head: {r['head']}, filter: {'head' in filters}")
        rep = get_interrogative(r['head'], self.lang)
        if rep:
            pats.append((5, f"interr_root('{rep}')"))

    df = sagas.to_df(
        r['domains'],
        ['rel', 'index', 'text', 'lemma', 'children', 'features'])
    pat = self.proc_word(type_name, r['word'],
                         r['head'] if 'head' in r else '',
                         r['index'], r, self.lang)
    pat['rels'] = [sub[0] for sub in r['domains']]
    pat['stems'] = self.stem_chunks(r)
    pat['ctx'] = ctx
    domain = DomainToken(**pat)
    logger.debug(
        f".. proc word {r['word']}, "
        f"verb in filter ({'[verb]' in filters}), "
        f"predicate in filter ({'[predicate]' in filters}), "
        f"stems: {domain.stems}")
    if '[verb]' not in filters and '[predicate]' not in filters:
        self.induce_domain_from_exts(domain, 'verb', pats)
    pat_r = self.induce_pattern(domain, ds, verbose)
    parts = self.proc_children_column(df, self.lang)
    for part in parts:
        # logger.debug(f"{part.name}: {part.word}")
        if part.name not in filters:
            part.domain = domain
            self.induce_part(part, pats, type_name, verbose)
    # display_synsets(f"[{theme}]", meta, r, data['lang'])
    return pat_r
def disp_by_offset(self, lang, offset, pos='n'):
    """
    $ python -m sagas.nlu.omw_extended disp_by_offset ru 9918554
    $ python -m sagas.nlu.omw_extended disp_by_offset de 9918554

    :param offset:
    :return:
    """
    import sagas

    offset = str(offset)
    id = '%s-%s' % (offset.zfill(8), pos)
    rs = []
    print('search for', id)
    if lang in langsets:
        data = self.load_dicts(lang)
        for row in data:
            if row[0] == id:
                rs.append((row[0], row[2]))
        df = sagas.to_df(rs, ['id', 'word'])
        sagas.print_df(df)
    else:
        print('no data.')
def verb_domains(self, sents, lang='en'):
    """
    $ python -m sagas.nlu.uni_parser verb_domains "Barack Obama was born in Hawaii." en
    # Chinese equivalent: 我有一只阿比西尼亚猫
    $ python -m sagas.nlu.uni_parser verb_domains "I have an Abyssinian cat." en
    $ python -m sagas.nlu.uni_parser verb_domains 'Что ты обычно ешь на ужин?' ru
    $ python -m sagas.nlu.uni_parser verb_domains 'Die Zeitschrift erscheint monatlich.' de
    # test multiple verbs (filtering out verbs of subordinate clauses):
    $ python -m sagas.nlu.uni_parser verb_domains 'Tu as choisi laquelle tu vas manger ?' fr

    :param sents:
    :param lang:
    :return:
    """
    from sagas.nlu.corenlp_helper import get_nlp

    serial_numbers = '❶❷❸❹❺❻❼❽❾❿'
    nlp = get_nlp(lang)
    doc = nlp(sents)
    # Analyse the dependency relations bottom-up; this can be used to extract the
    # child-node sets of specific relations, e.g. 'nsubj:pass' and 'obl' here.
    # word.governor is the parent of the current word.
    sent = doc.sentences[0]
    rs = get_verb_domain(sent)
    # r = rs[0]
    for num, r in enumerate(rs):
        # print(json.dumps(r, indent=2, ensure_ascii=False))
        print(serial_numbers[num], '-' * 50)
        # print(r['verb'], r['index'])
        print(r['word'], r['index'])
        # df = sagas.to_df(r[0]['domains'], ['rel', 'index', 'text', 'children'])
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        sagas.print_df(df)
        for stem in r['stems']:
            if stem[0] == 'obj':
                print('object ->', ' '.join(stem[1]))
def all_voices(self, lang=None):
    """
    $ python -m sagas.nlu.nlu_tools all_voices
    $ nlu all-voices ru

    :return:
    """
    import pyttsx3
    import sagas

    engine = pyttsx3.init()
    voices: collections.Iterable = engine.getProperty('voices')
    rs = []
    for voice in voices:
        if lang is not None:
            if voice.languages[0].startswith(lang):
                print(voice)
        else:
            print(voice, voice.id, voice.languages[0])
        rs.append((voice.id.replace('com.apple.speech.synthesis.', ''),
                   voice.name, voice.languages, voice.gender))
    rs = sorted(rs, key=lambda el: el[2][0])
    sagas.print_df(sagas.to_df(rs, ['id', 'name', 'lang', 'gender']))
def get_role_defs():
    desc_rs = []
    for desc in descs.split('\n')[1:]:
        desc_rs.append(desc.split('\t'))
    return sagas.to_df(desc_rs, ['mark', 'description'])
def entities_df(self, doc):
    import sagas

    rs = []
    for ent in doc.ents:
        rs.append((ent.text, ent.start_char, ent.end_char, ent.label_))
    return sagas.to_df(rs, ['word', 'start', 'end', 'entity'])
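# Usage sketch for entities_df, where `proc` stands for an instance of the
# enclosing class (a hypothetical name); the spaCy model name is illustrative
# and assumes that model is installed:
#
#   import spacy
#   nlp = spacy.load('en_core_web_sm')
#   df = proc.entities_df(nlp('Ada Lovelace was born in London'))
#   sagas.print_df(df)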
def analyse(self, sentence, verbose=False, show_roles=True):
    """
    $ python -m sagas.zh.ltp_procs analyse '中国进出口银行与中国银行加强合作。'
    $ python -m sagas.zh.ltp_procs analyse '国务院总理李克强调研上海外高桥时提出,支持上海积极探索新机制。'
    $ ltp '我是学生'

    :param sentence:
    :return:
    """
    from colorama import Fore, Back, Style
    from tabulate import tabulate
    import sagas

    pd.set_option('display.unicode.ambiguous_as_wide', True)
    pd.set_option('display.unicode.east_asian_width', True)

    # words = self.segmentor.segment(sentence)
    # postags = self.postagger.postag(words)
    # arcs = self.parser.parse(words, postags)
    # roles = self.labeller.label(words, postags, arcs)
    # netags = self.recognizer.recognize(words, postags)
    words, postags, arcs, roles, netags = self.parse(sentence)

    # roles
    print('❶ roles for', " ".join(words))
    # roles = self.labeller.label(words, postags, arcs)
    predicts = extract_predicates(words, roles, postags, arcs, verbose=False)
    print(predicts)

    # dep-parse
    if not show_roles:
        print('❷ dep parse')
        rs = []
        for i in range(len(words)):
            if verbose:
                print("%s --> %s|%s|%s|%s" % (words[int(arcs[i].head) - 1], words[i],
                                              arcs[i].relation, postags[i], netags[i]))
            rs.append((words[int(arcs[i].head) - 1], words[i],
                       arcs[i].relation, postags[i], netags[i]))
        # columns: arc head, dependent, dependency relation, POS, named entity
        df = sagas.to_df(rs, ['弧头', '弧尾', '依存关系', '词性', '命名实体'])
        df['命名实体'] = Fore.RED + Style.BRIGHT + df['命名实体'].astype(str) + Style.RESET_ALL
        # print(df)
        print(tabulate(df, headers='keys', tablefmt='psql'))
    else:
        print('❷ tokens')
        # arrange roles into a column
        # https://github.com/HIT-SCIR/pyltp/issues/152
        srl_as_tag_matrix = [['*'] * len(roles) for _ in sentence]
        for predicate_id, role in enumerate(roles):
            srl_as_tag_matrix[role.index][predicate_id] = '(V*)'
            for arg in role.arguments:
                srl_as_tag_matrix[arg.range.start][predicate_id] = '(%s*' % arg.name
                srl_as_tag_matrix[arg.range.end][predicate_id] += ')'
        rs = []
        for id, (word, pos, arc, ne, role) in enumerate(
                zip(words, postags, arcs, netags, srl_as_tag_matrix)):
            rs.append([
                str(id), word, pos,
                str(arc.head), arc.relation, ne, ', '.join(role)
            ])
        sagas.print_rs(rs, ['id', 'word', 'pos', 'head', 'rel', 'ne', 'role'])
default_labels = ["Dutch", "Persian", "Afrikaans"]
langs = st.sidebar.multiselect("Available langs",
                               list(all_labels.keys()),
                               default_labels)
sel_langs = {all_labels[l] for l in langs}


def is_sel(f):
    for l in sel_langs:
        if f"_{l}_" in f:
            return True
    return False


corpus = [f for f in glob.glob('*.txt') if is_sel(f)]
df = sagas.to_df(corpus, ['file'])
option = st.sidebar.selectbox('Which corpus do you choose?', df['file'])
cur_lang = option[3:5]
'Current corpus:', option, f", language code: {cur_lang}", f", available lang: {langs}"

# text_raw=''''''.split('►')
text_raw = io_utils.read_file(option).split('►')
rows = []
for t in text_raw:
    # st.write(t)
    rows.append([
        l for l in t.split('\n')
        if l.strip() != '' and not l.startswith('#') and not l.startswith('⊕')
    ])
def get_dep_defs():
    def_rs = []
    for dep in dep_defs:
        def_rs.append(dep.split('\t'))
    return sagas.to_df(def_rs, ['type', 'tag', 'desc', 'example'])
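# Assumed shape of dep_defs (illustrative only): tab-separated definition rows,
# one per dependency type, matching the four columns used above.
#
#   dep_defs = [
#       'core\tnsubj\tnominal subject\tthe dog barks',
#       'core\tobj\tobject\tshe read the book',
#   ]
#   sagas.print_df(get_dep_defs())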
def analyse_doc(self, sentence, node_maps=None, console=True):
    from sagas.nlu.uni_intf import sub_comps
    import unicodedata

    segs = []
    # omit {word.feats}
    if console:
        tc.info(*[f'index: {word.index}\ttext: {word.text + " "}\tlemma: {word.lemma}\tupos: {word.upos}\txpos: {word.xpos}'
                  for word in sentence.words], sep='\n')
    else:
        # from IPython.display import display
        import sagas
        df = sagas.to_df([(word.index, word.text, word.lemma, word.upos, word.xpos, word.entity)
                          for word in sentence.words],
                         ['index', 'text', 'lemma', 'upos', 'xpos', 'entity'])
        tc.dfs(df)

    def translit_chunk(chunk: str, lang):
        from sagas.nlu.transliterations import translits

        # if upos == 'PUNCT':
        #     return chunk
        if chunk.strip() in (',', '.', ';', '?', '!'):
            return chunk
        # if lang in ('ko', 'ja', 'fa', 'hi', 'ar'):
        if translits.is_available_lang(lang):
            if sa_env.runtime != 'default':
                return word.text + '\n' + translits.translit(chunk, lang)
            return translits.translit(chunk, lang)
        return chunk

    if node_maps is None:
        node_maps = {}
        for word in sentence.words:
            pos_attrs = f"({word.upos.lower()}, {word.xpos.lower()})"
            node_text = word.text if self.translit_lang is None or word.upos == 'PUNCT' \
                else translit_chunk(word.text, self.translit_lang)
            # node_text = unicodedata.normalize('NFKC', node_text) if word.upos == 'PUNCT' else node_text
            norm = lambda t: unicodedata.normalize('NFKC', t).encode('ascii', 'ignore').decode("utf-8")
            node_text = norm(node_text) if word.upos == 'PUNCT' else node_text
            if node_text == '':
                node_text = '_'
            # verbose
            if word.text != node_text:
                print('# ', f"{word.text} -> {node_text}")
            node_maps[word.text] = node_text if not self.enable_node_pos else f"{node_text}\\n{pos_attrs}"

    # self.f.attr(color='black')
    prop_sets = {
        'VERB': lambda f: f.attr('node', style='filled', color='lightgrey'),
        'PRON': lambda f: f.attr('node', style='dashed', color='red'),
        'AUX': lambda f: f.attr('node', style='dashed', color='green'),
        'NOUN': lambda f: f.attr('node', style='solid', color='blue'),
    }

    # sentence = doc.sentences[0]
    for word in sentence.words:
        rel = word.dependency_relation
        if rel in sub_comps:
            if word.upos == 'VERB':
                self.f.attr('node', style='filled', color='antiquewhite')
            elif word.upos in prop_sets:
                prop_sets[word.upos](self.f)
            else:
                self.default_node()
        # for all languages
        elif rel.endswith('comp'):
            self.f.attr('node', style='filled', color='antiquewhite')
        elif word.upos in prop_sets:
            prop_sets[word.upos](self.f)
        else:
            self.default_node()

        head = ''
        if word.governor == 0:
            head = '_root_'
        else:
            head_word = sentence.words[word.governor - 1]
            head = head_word.text
        # print(f"{word.text} -> {rel}, {word.governor}, {head}")
        self.f.node(node_maps[word.text])
        segs.append(node_maps[word.text])

    # self.f.node_attr.update(color='black')
    self.default_node()
    self.print_dependencies(sentence, segs, node_maps)
    return self.f
def rs_represent(rs: List[Any], data: Dict[Text, Any], return_df=False):
    import sagas
    from sagas.nlu.rules import verb_patterns, aux_patterns, subj_patterns, predict_patterns
    from sagas.nlu.rules_lang_spec import langspecs
    from sagas.nlu.nlu_cli import NluCli
    from sagas.nlu.sinkers import Sinkers

    df_set = []
    result = []
    sinkers = Sinkers(data, rs[0]['type'])
    for serial, r in enumerate(rs):
        type_name = r['type']
        meta = build_meta(r, data)
        if type_name == 'verb_domains':
            theme = '[verb]'
            tc.info(
                serial_numbers[serial], theme,
                # r['lemma'], r['index'],
                f"{r['word']}/{r['lemma']}, pos: {r['upos']}/{r['xpos']}, idx: {r['index']}",
                '(%s, %s)' % (r['rel'], r['governor']))
            # meta = {'rel': r['rel'], **common, **data}
            verb_patterns(meta, r['domains'])
        elif type_name == 'aux_domains':
            theme = '[aux]'
            # 'rel': word.dependency_relation, 'governor': word.governor, 'head': dc.text
            delegator = '☇' if not r['delegator'] else '☌'
            tc.info(serial_numbers[serial], theme, r['lemma'], r['rel'],
                    delegator, "%s(%s)" % (r['head'], r['head_pos']))
            # verb_patterns(r['domains'])
            # meta = {'pos': r['head_pos'], 'head': r['head'], **common, **data}
            aux_patterns(meta, r['domains'])
        elif type_name == 'subj_domains':
            theme = '[subj]'
            tc.info(serial_numbers[serial], theme, r['lemma'], r['rel'],
                    '☇', f"{r['head']}")
            # verb_patterns(r['domains'])
            # meta = {'pos': r['head_pos'], 'head': r['head'], **common, **data}
            subj_patterns(meta, r['domains'])
        elif type_name == 'predicate':
            theme = '[predicates]'
            tc.info(serial_numbers[serial], theme,
                    f"{r['lemma']} ({r['phonetic']}, {r['word']})")
            # meta = {'rel': r['rel'], **common, **data}
            predict_patterns(meta, r['domains'])
        elif type_name == 'root_domains':
            theme = '[root]'
            tc.info(
                serial_numbers[serial], theme,
                f"{r['word']}/{r['lemma']}, pos: {r['upos']}/{r['xpos']}, idx: {r['index']}",
                '(%s, %s)' % (r['rel'], r['governor']))
            # meta = {'rel': r['rel'], **common, **data}
            # verb_patterns(meta, r['domains'])
            # check_langspec(data['lang'], meta, r['domains'], type_name)
        else:
            # meta = {}
            raise Exception(
                'Cannot process specific type: {}'.format(type_name))

        # process language-specific rules
        logger.debug(f"meta keys {meta.keys()}")
        mod_rs = langspecs.check_langspec(data['lang'], meta, r['domains'], type_name)
        sinkers.add_module_results(mod_rs)

        # df = sagas.to_df(r['domains'], ['rel', 'index', 'text', 'children', 'features'])
        df = sagas.to_df(
            r['domains'],
            ['rel', 'index', 'text', 'lemma', 'children', 'features'])
        df_set.append(df)
        if not return_df:
            result.extend(
                proc_word(type_name, r['word'],
                          r['head_word'] if 'head_word' in r else '',
                          data['lang']))
            result.extend(
                proc_children_column(df['rel'], df['children'], data['lang']))
            # print('.......')
            # where 1 is the axis number (0 for rows and 1 for columns)
            # df = df.drop('children', 1)
            # df['children'] = df['children'].apply(lambda x: ', '.join(x)[:15] + "..")
            # df['features'] = df['features'].apply(lambda x: ', '.join(x)[:15] + "..")
            trunc_cols(df)
            tc.dfs(df)
            print_stem_chunks(r)
            if print_def:
                NluCli().get_word_def(r['lemma'], data['lang'])
            if print_synsets:
                r = display_synsets(theme, meta, r, data['lang'])
                result.extend(r)
    sinkers.process_with_sinkers()
    return result, df_set