def arr_to_sentence(arr: np.ndarray, vocab: MorphVocab) -> nlp.Sentence:
    """Decode a morpheme-level id array back into a sentence.

    ``arr`` is indexed with three axes; ``arr.shape[2]`` is the width of one
    morpheme row. The array is flattened to one row per morpheme and every row
    is read with this column layout:

    ===== =====================================================
    col   meaning (as used below)
    ===== =====================================================
    0-2   form / lemma / tag ids (``vocab.forms/lemmas/tags``)
    3-8   gender, number, person, tense, binyan, polarity ids
          (all looked up in ``vocab.feats``)
    9     token id (``vocab.tokens``)
    10    sentence index (not needed here)
    11-12 token index, analysis index
    13    gold flag (non-zero means gold)
    14    morpheme index
    15    morpheme type id (looked up in ``_mtypes``)
    ===== =====================================================

    Rows whose form id is 0 are skipped (presumably padding -- confirm
    against the encoder), and token index 0 is left out of the result.

    :param arr: 3-D id array as described above.
    :param vocab: vocabulary providing the id -> value lookups.
    :return: sentence holding the tokens, the full lattice and the gold
        lattice (the analyses flagged as gold).
    """
    morphemes = {}
    a2m = defaultdict(list)
    t2a = defaultdict(set)
    t2t = {}
    # ndarray.reshape infers the row count from -1. np.resize does not:
    # it rejects negative dimensions on current NumPy and, on old releases,
    # silently dropped the last row.
    morpheme_rows = arr.reshape(-1, arr.shape[2])
    for morpheme_arr in morpheme_rows:
        if morpheme_arr[0] == 0:
            continue
        form = vocab.forms[morpheme_arr[0]]
        lemma = vocab.lemmas[morpheme_arr[1]]
        tag = vocab.tags[morpheme_arr[2]]
        gender = vocab.feats[morpheme_arr[3]]
        number = vocab.feats[morpheme_arr[4]]
        person = vocab.feats[morpheme_arr[5]]
        tense = vocab.feats[morpheme_arr[6]]
        binyan = vocab.feats[morpheme_arr[7]]
        polarity = vocab.feats[morpheme_arr[8]]
        token = vocab.tokens[morpheme_arr[9]]
        token_idx = morpheme_arr[11]
        analysis_idx = morpheme_arr[12]
        is_gold = morpheme_arr[13]
        morpheme_idx = morpheme_arr[14]
        mtype = _mtypes[morpheme_arr[15]]
        # Each feature value may pack several '|'-separated features;
        # '_' marks "no value" and is dropped.
        feats = morph.Features.create(
            [ff
             for f in [gender, number, person, tense, binyan, polarity]
             for ff in f.split("|")
             if ff != '_'])
        m = morph.Morpheme(form, lemma, tag, feats)
        morphemes[(token_idx, analysis_idx, morpheme_idx)] = (mtype, m, is_gold)
        a2m[(token_idx, analysis_idx)].append(morpheme_idx)
        t2a[token_idx].add(analysis_idx)
        t2t[token_idx] = token

    lattice = morph.Lattice()
    gold_lattice = morph.Lattice()
    tokens = []
    for token_idx in t2a:
        if token_idx == 0:
            continue
        token_analyses = []
        gold_token_analyses = []
        for analysis_idx in sorted(t2a[token_idx]):
            prefixes, hosts, suffixes = [], [], []
            is_gold_analysis = False
            for morpheme_idx in a2m[(token_idx, analysis_idx)]:
                mtype, m, is_gold = morphemes[(token_idx, analysis_idx, morpheme_idx)]
                if mtype == 'pref':
                    prefixes.append(m)
                elif mtype == 'suff':
                    suffixes.append(m)
                else:
                    hosts.append(m)
                # Overwritten per morpheme: the last morpheme's flag decides.
                is_gold_analysis = is_gold != 0
            a = morph.Analysis(prefixes, hosts, suffixes)
            token_analyses.append(a)
            if is_gold_analysis:
                gold_token_analyses.append(a)
        lattice[token_idx] = token_analyses
        gold_lattice[token_idx] = gold_token_analyses
        tokens.append(t2t[token_idx])
    return nlp.Sentence(tokens, lattice, gold_lattice)
def arr_to_sentence(arr: np.ndarray, vocab: TokenVocab) -> nlp.Sentence:
    """Decode an analysis-level id array back into a sentence.

    Every row of ``arr`` describes one analysis. Columns 0-3, 4-7 and 8-11
    hold the prefix, host and suffix ids (forms, lemmas, tags, feats), each
    resolved through the matching ``vocab`` table into parallel sequences.
    Column 12 is the token id, column 14 the token index and column 16 the
    gold flag. Decoding stops at the first row whose host-tag id (column 6)
    is 0 -- presumably the ``<PAD>`` host tag; confirm against the vocab.

    :param arr: iterable of analysis rows laid out as above.
    :param vocab: vocabulary providing the id -> value lookups.
    :return: sentence with tokens ordered by token index, the lattice of all
        analyses and the lattice of gold-flagged analyses.
    """
    token_by_idx = {}
    analyses_by_token = defaultdict(list)
    gold_by_token = defaultdict(list)

    for row in arr:
        # Lookups are done before the padding test, exactly as the ids appear.
        pref_parts = (vocab.pref_forms[row[0]], vocab.pref_lemmas[row[1]],
                      vocab.pref_tags[row[2]], vocab.pref_feats[row[3]])
        host_parts = (vocab.host_forms[row[4]], vocab.host_lemmas[row[5]],
                      vocab.host_tags[row[6]], vocab.host_feats[row[7]])
        suff_parts = (vocab.suff_forms[row[8]], vocab.suff_lemmas[row[9]],
                      vocab.suff_tags[row[10]], vocab.suff_feats[row[11]])
        token = vocab.tokens[row[12]]
        token_idx = row[14]
        is_gold = row[16]

        if row[6] == 0:
            break

        token_by_idx[token_idx] = token
        # The stored feats are handed to Morpheme unchanged.
        prefixes = [morph.Morpheme(*fields) for fields in zip(*pref_parts)]
        hosts = [morph.Morpheme(*fields) for fields in zip(*host_parts)]
        suffixes = [morph.Morpheme(*fields) for fields in zip(*suff_parts)]

        analysis = morph.Analysis(prefixes, hosts, suffixes)
        analyses_by_token[token_idx].append(analysis)
        if is_gold:
            gold_by_token[token_idx].append(analysis)

    tokens = [token_by_idx[idx] for idx in sorted(token_by_idx)]

    lattice = morph.Lattice()
    for idx, analyses in analyses_by_token.items():
        lattice[idx] = analyses

    gold_lattice = morph.Lattice()
    for idx, analyses in gold_by_token.items():
        gold_lattice[idx] = analyses

    return nlp.Sentence(tokens, lattice, gold_lattice)
def _dataframe_to_sentence(sent_df: pd.DataFrame) -> nlp.Sentence:
    """Rebuild a sentence from a data frame holding one analysis per row group.

    The first row of ``sent_df`` is skipped. The remaining rows are grouped
    by ``token_idx`` and, within a token, by ``analysis_idx``. For each
    analysis the first row's ``pref_*``, ``host_*`` and ``suff_*`` cells
    (``forms``, ``lemmas``, ``tags``, ``feats``) are read as parallel
    sequences and zipped into morphemes; feature strings are split on ``|``
    and ``_`` entries are dropped.

    :param sent_df: sentence frame with the columns ``token``,
        ``token_idx``, ``analysis_idx``, ``is_gold`` and the twelve
        ``pref_/host_/suff_`` columns named above.
    :return: sentence with the tokens in ``token_idx`` order, a lattice of
        all analyses per token index and a gold lattice with the analyses
        whose ``is_gold`` is truthy (tokens without one get no gold entry).
    """

    def _morphemes(analysis_df, kind):
        """Build the morphemes stored in the ``<kind>_*`` cells of an analysis."""
        forms = analysis_df[kind + '_forms'].iloc[0]
        lemmas = analysis_df[kind + '_lemmas'].iloc[0]
        tags = analysis_df[kind + '_tags'].iloc[0]
        feat_strs = analysis_df[kind + '_feats'].iloc[0]
        built = []
        for form, lemma, tag, fstr in zip(forms, lemmas, tags, feat_strs):
            feats = morph.Features.create([f for f in fstr.split("|") if f != '_'])
            built.append(morph.Morpheme(form, lemma, tag, feats))
        return built

    # Group keys are unique, so ordering by the key alone is sufficient and
    # never needs to compare the grouped frames themselves.
    token_groups = sorted(sent_df.iloc[1:].groupby(sent_df.token_idx),
                          key=lambda group: group[0])
    tokens = [token_df.iloc[0].token for _, token_df in token_groups]

    # Accumulate per token index. These mappings must not be rebound inside
    # the loop, because they are what the lattices are built from afterwards.
    token_analyses = defaultdict(list)
    token_gold_analyses = defaultdict(list)
    for token_idx, token_df in token_groups:
        for _, analysis_df in token_df.groupby(sent_df.analysis_idx):
            prefixes = _morphemes(analysis_df, 'pref')
            hosts = _morphemes(analysis_df, 'host')
            suffixes = _morphemes(analysis_df, 'suff')
            analysis = morph.Analysis(prefixes, hosts, suffixes)
            token_analyses[token_idx].append(analysis)
            if analysis_df['is_gold'].iloc[0]:
                token_gold_analyses[token_idx].append(analysis)

    lattice = morph.Lattice(token_analyses)
    gold_lattice = morph.Lattice(token_gold_analyses)
    return nlp.Sentence(tokens, lattice, gold_lattice)
def _reduce_normalized_gold_analysis(gold: morph.Analysis, normalized_gold: list):
    """Collapse the normalized variants of a gold analysis into one analysis.

    * A single variant is returned as is.
    * If the features computed by ``_get_norm_analysis_features`` are fewer
      than those of the first gold host, ``gold`` is returned unchanged.
    * Otherwise a single-host analysis is built that keeps the prefixes and
      suffixes of ``gold``.

    :param gold: the original gold analysis.
    :param normalized_gold: list of normalized candidate analyses.
    :return: one of the inputs, or a new ``morph.Analysis`` with one host.
    """
    if len(normalized_gold) == 1:
        return normalized_gold[0]

    merged_feats = _get_norm_analysis_features(normalized_gold, gold)
    if len(merged_feats) < len(gold.hosts[0].feats):
        return gold

    # Distinct host values across all candidates. When more than one form or
    # tag exists, the element taken below depends on set iteration order.
    distinct_forms = list({form for cand in normalized_gold for form in cand.host_forms})
    distinct_lemmas = list({lemma for cand in normalized_gold for lemma in cand.host_lemmas})
    distinct_tags = list({tag for cand in normalized_gold for tag in cand.host_tags})

    host_form = distinct_forms[0]
    if not distinct_lemmas:
        host_lemma = None
    elif len(distinct_lemmas) == 1:
        host_lemma = distinct_lemmas[0]
    else:
        # Candidates disagree on the lemma: fall back to the gold one.
        host_lemma = gold.host_lemmas[-1]
    host_tag = distinct_tags[0]

    host = morph.Morpheme(host_form, host_lemma, host_tag, merged_feats)
    return morph.Analysis(gold.prefixes, [host], gold.suffixes)
def _create_analysis(self, path: list) -> morph.Analysis:
    """Assemble a ``morph.Analysis`` from the elements ("edges") of ``path``.

    Each edge is offered to ``_create_pref_morpheme`` first; when that
    yields a morpheme the edge is consumed by it. Otherwise the edge is
    offered to ``_create_host_morpheme`` and ``_create_suff_morpheme``, and
    every non-``None`` result is collected.

    :param path: list of edges, visited in order.
    :return: analysis built from the collected prefixes, hosts and suffixes.
    """
    prefixes = []
    hosts = []
    suffixes = []
    for edge in path:
        p = self._create_pref_morpheme(edge)
        if p is not None:
            # A prefix-type morpheme found after a host has already been
            # collected is stored with the hosts rather than the prefixes.
            if hosts:
                hosts.append(p)
            else:
                prefixes.append(p)
            # The edge is fully handled; skip the host/suffix attempts.
            continue
        h = self._create_host_morpheme(edge)
        if h is not None:
            hosts.append(h)
        # The suffix attempt runs whether or not a host was produced.
        s = self._create_suff_morpheme(edge)
        if s is not None:
            suffixes.append(s)
    return morph.Analysis(prefixes, hosts, suffixes)
def _dataframe_to_sentence(sent_df: pd.DataFrame) -> nlp.Sentence:
    """Rebuild a sentence from a data frame holding one morpheme per row group.

    The first row of ``sent_df`` is skipped. The rest is grouped by
    ``token_idx``, then ``analysis_idx``, then ``morpheme_idx``; the first
    row of each morpheme group supplies ``mtype``, ``form``, ``lemma``,
    ``tag`` and the six feature columns. Feature values are split on ``|``
    and ``_`` entries are dropped. ``mtype`` ``'pref'`` / ``'suff'`` select
    the prefix / suffix list; anything else is a host.

    :param sent_df: sentence frame with the columns used above plus
        ``token`` and ``is_gold``.
    :return: sentence with tokens in ``token_idx`` order, the lattice of all
        analyses and the gold lattice (analyses whose ``is_gold`` is truthy;
        every token gets an entry, possibly an empty list).
    """
    feature_columns = ('gender', 'number', 'person', 'tense', 'binyan', 'polarity')

    def first_value(frame, column):
        # Value of ``column`` in the first row of ``frame``.
        return frame[column].iloc[0]

    token_groups = sent_df.iloc[1:].groupby(sent_df.token_idx)
    tokens = [token_df.iloc[0].token for _, token_df in sorted(token_groups)]

    lattice = morph.Lattice()
    gold_lattice = morph.Lattice()
    for token_idx, token_df in sorted(token_groups):
        analyses, gold_analyses = [], []
        for _, analysis_df in token_df.groupby(sent_df.analysis_idx):
            hosts = []
            affixes = {'pref': [], 'suff': []}
            for _, morpheme_df in analysis_df.groupby(sent_df.morpheme_idx):
                mtype = first_value(morpheme_df, 'mtype')
                form = first_value(morpheme_df, 'form')
                lemma = first_value(morpheme_df, 'lemma')
                tag = first_value(morpheme_df, 'tag')
                raw_values = [first_value(morpheme_df, column) for column in feature_columns]
                feats = morph.Features.create(
                    [name for value in raw_values for name in value.split("|") if name != '_'])
                morpheme = morph.Morpheme(form, lemma, tag, feats)
                # Unknown morpheme types fall through to the host list.
                affixes.get(mtype, hosts).append(morpheme)
            analysis = morph.Analysis(affixes['pref'], hosts, affixes['suff'])
            analyses.append(analysis)
            if first_value(analysis_df, 'is_gold'):
                gold_analyses.append(analysis)
        lattice[token_idx] = analyses
        gold_lattice[token_idx] = gold_analyses
    return nlp.Sentence(tokens, lattice, gold_lattice)
def ptrnet_ma(tokens: list, lex: Lexicon, model: PtrNetModel) -> nlp.Sentence:
    """Build the analysis lattice for ``tokens`` and register it with ``model``.

    The lattice comes from ``ma.lattice(tokens, lex)``. Because no gold
    analysis is known, the gold lattice holds one empty analysis per token
    id. The sentence is then passed to ``model.vocab.update``; when that
    reports new forms or new lemmas, their word vectors are fetched with
    ``model_ft.get_word_vectors`` (in sorted order) and handed to the
    model's form / lemma embedding update.

    :param tokens: list of tokens of the sentence.
    :param lex: lexicon used to produce the lattice.
    :param model: model whose vocabulary and embeddings are updated in place.
    :return: the sentence built from tokens, lattice and placeholder gold lattice.
    """

    def vectors_for(words):
        # Float tensor of the word vectors for ``words``, on the module device.
        matrix = model_ft.get_word_vectors(home_path, sorted(words))
        return torch.tensor(matrix, dtype=torch.float, device=device)

    lattice = ma.lattice(tokens, lex)
    gold_lattice = morph.Lattice()
    for token_id in lattice:
        gold_lattice[token_id] = [morph.Analysis([], [], [])]

    sentence = nlp.Sentence(tokens, lattice, gold_lattice)
    _, unseen_forms, unseen_lemmas = model.vocab.update(sentence)

    if unseen_forms:
        form_vectors = vectors_for(unseen_forms)
        model.emb.update_form_emb_(form_vectors)
    if unseen_lemmas:
        lemma_vectors = vectors_for(unseen_lemmas)
        model.emb.update_lemma_emb_(lemma_vectors)
    return sentence
def append_sos_analysis(sent_idx: int):
    """Append the ``'<SOS>'`` placeholder analysis for sentence ``sent_idx``.

    The analysis has no prefixes or suffixes and a single host morpheme whose
    form, lemma and tag are all ``'<SOS>'`` with empty features. It is passed
    to ``append_analysis`` with the positional values ``0, 0`` before it and
    ``1, '<SOS>'`` after it (presumably token index, analysis index, gold
    flag and token -- confirm against ``append_analysis``).

    :param sent_idx: index of the sentence the analysis belongs to.
    """
    sos = '<SOS>'
    sos_morpheme = morph.Morpheme(sos, sos, sos, morph.EMPTY_FEATURES)
    sos_analysis = morph.Analysis([], [sos_morpheme], [])
    append_analysis(sent_idx, 0, 0, sos_analysis, 1, sos)