def arr_to_sentence(arr: np.ndarray, vocab: MorphVocab) -> nlp.Sentence:
    morphemes = {}
    a2m = defaultdict(list)
    t2a = defaultdict(set)
    t2t = {}
    # Flatten the 3-D sentence array into one row per morpheme (last axis holds the 16 fields).
    analysis_arr = arr.reshape(-1, arr.shape[2])
    for morpheme_arr in analysis_arr:
        if morpheme_arr[0] == 0:
            continue
        form = vocab.forms[morpheme_arr[0]]
        lemma = vocab.lemmas[morpheme_arr[1]]
        tag = vocab.tags[morpheme_arr[2]]
        gender = vocab.feats[morpheme_arr[3]]
        number = vocab.feats[morpheme_arr[4]]
        person = vocab.feats[morpheme_arr[5]]
        tense = vocab.feats[morpheme_arr[6]]
        binyan = vocab.feats[morpheme_arr[7]]
        polarity = vocab.feats[morpheme_arr[8]]
        token = vocab.tokens[morpheme_arr[9]]
        sent_idx = morpheme_arr[10]
        token_idx = morpheme_arr[11]
        analysis_idx = morpheme_arr[12]
        is_gold = morpheme_arr[13]
        morpheme_idx = morpheme_arr[14]
        mtype = _mtypes[morpheme_arr[15]]
        feats = morph.Features.create([ff for f in [gender, number, person, tense, binyan, polarity]
                                       for ff in f.split("|") if ff != '_'])
        m = morph.Morpheme(form, lemma, tag, feats)
        morphemes[(token_idx, analysis_idx, morpheme_idx)] = (mtype, m, is_gold)
        a2m[(token_idx, analysis_idx)].append(morpheme_idx)
        t2a[token_idx].add(analysis_idx)
        t2t[token_idx] = token
    lattice = morph.Lattice()
    gold_lattice = morph.Lattice()
    tokens = []
    for token_idx in t2a:
        if token_idx == 0:
            continue
        token_analyses = []
        gold_token_analyses = []
        for analysis_idx in sorted(t2a[token_idx]):
            prefixes, hosts, suffixes = [], [], []
            is_gold_analysis = False
            for morpheme_idx in a2m[(token_idx, analysis_idx)]:
                mtype, m, is_gold = morphemes[(token_idx, analysis_idx, morpheme_idx)]
                if mtype == 'pref':
                    prefixes.append(m)
                elif mtype == 'suff':
                    suffixes.append(m)
                else:
                    hosts.append(m)
                is_gold_analysis = is_gold != 0
            a = morph.Analysis(prefixes, hosts, suffixes)
            token_analyses.append(a)
            if is_gold_analysis:
                gold_token_analyses.append(a)
        lattice[token_idx] = token_analyses
        gold_lattice[token_idx] = gold_token_analyses
        tokens.append(t2t[token_idx])
    return nlp.Sentence(tokens, lattice, gold_lattice)
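# Hypothetical usage sketch (not in the original source): decode one sentence back
# from its morpheme-level array. Assumes `sample_arr` is a 3-D numpy array whose
# last axis carries the 16 fields read above (form, lemma, tag, the six feature
# columns, token, sent/token/analysis indices, is_gold, morpheme index, morpheme
# type) and that `morph_vocab` is a populated MorphVocab; both names are placeholders.
decoded = arr_to_sentence(sample_arr, morph_vocab)
print(decoded.tokens)
for token_idx in decoded.lattice:
    print(token_idx, len(decoded.lattice[token_idx]), 'analyses')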
def arr_to_sentence(arr: np.ndarray, vocab: TokenVocab) -> nlp.Sentence:
    tokens = {}
    token_analyses = defaultdict(list)
    token_gold_analyses = defaultdict(list)
    for analysis_arr in arr:
        # A zero host-tag id marks the padded tail of the sentence array - stop decoding.
        # if analysis_arr[6] == vocab.host_tag2id[tuple('<PAD>')]:
        if analysis_arr[6] == 0:
            break
        pref_forms = vocab.pref_forms[analysis_arr[0]]
        pref_lemmas = vocab.pref_lemmas[analysis_arr[1]]
        pref_tags = vocab.pref_tags[analysis_arr[2]]
        pref_feats = vocab.pref_feats[analysis_arr[3]]
        host_forms = vocab.host_forms[analysis_arr[4]]
        host_lemmas = vocab.host_lemmas[analysis_arr[5]]
        host_tags = vocab.host_tags[analysis_arr[6]]
        host_feats = vocab.host_feats[analysis_arr[7]]
        suff_forms = vocab.suff_forms[analysis_arr[8]]
        suff_lemmas = vocab.suff_lemmas[analysis_arr[9]]
        suff_tags = vocab.suff_tags[analysis_arr[10]]
        suff_feats = vocab.suff_feats[analysis_arr[11]]
        token = vocab.tokens[analysis_arr[12]]
        # sent_idx = analysis_arr[13]
        token_idx = analysis_arr[14]
        # analysis_idx = analysis_arr[15]
        is_gold = analysis_arr[16]
        tokens[token_idx] = token
        prefixes, hosts, suffixes = [], [], []
        for form, lemma, tag, feats in zip(pref_forms, pref_lemmas, pref_tags, pref_feats):
            m = morph.Morpheme(form, lemma, tag, feats)
            prefixes.append(m)
        for form, lemma, tag, feats in zip(host_forms, host_lemmas, host_tags, host_feats):
            m = morph.Morpheme(form, lemma, tag, feats)
            hosts.append(m)
        for form, lemma, tag, feats in zip(suff_forms, suff_lemmas, suff_tags, suff_feats):
            m = morph.Morpheme(form, lemma, tag, feats)
            suffixes.append(m)
        analysis = morph.Analysis(prefixes, hosts, suffixes)
        token_analyses[token_idx].append(analysis)
        if is_gold:
            token_gold_analyses[token_idx].append(analysis)
    tokens = [tokens[token_id] for token_id in sorted(tokens)]
    lattice = morph.Lattice()
    for token_id in token_analyses:
        lattice[token_id] = token_analyses[token_id]
    gold_lattice = morph.Lattice()
    for token_id in token_gold_analyses:
        gold_lattice[token_id] = token_gold_analyses[token_id]
    return nlp.Sentence(tokens, lattice, gold_lattice)
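# Hypothetical usage sketch (not in the original source): the token-level variant
# expects a 2-D array with 17 columns per analysis row (prefix/host/suffix form,
# lemma, tag and feature ids, then token, sent/token/analysis indices and is_gold).
# `token_arr` and `token_vocab` are placeholder names.
decoded = arr_to_sentence(token_arr, token_vocab)
print(decoded.tokens)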
def _create_sentence(self, lexicon: lex.Lexicon, tokens: list, md_lattice: conllx.LatticeGraph) -> nlp.Sentence:
    lex_entries = [lexicon.entry(token) for token in tokens]
    lattice = morph.Lattice()
    for i, entry in enumerate(lex_entries):
        lattice[i + 1] = entry.analyses
    gold_lattice = morph.Lattice()
    for tid, token in enumerate(tokens):
        token_id = tid + 1
        gold_analysis = []
        for path in md_lattice._token_paths[token_id]:
            gold_analysis.append(self._create_analysis(path))
        if len(gold_analysis) != 1:
            raise ValueError("token gold analysis: {}".format(gold_analysis))
        gold_lattice[token_id] = gold_analysis
    return nlp.Sentence(tokens, lattice, gold_lattice)
def _infuse(data_set_name: str, sentences: list) -> list:
    infused_sentences = []
    total_infused_token_lattices = 0
    total_infused_sentence_lattices = 0
    for sent_index, sent in enumerate(sentences):
        sentence_infused = False
        infused_lattice = morph.Lattice()
        for token_index in sent.gold_lattice:
            infused_lattice[token_index] = sent.lattice[token_index].copy()
            gold_analysis = sent.analysis(token_index)
            found = False
            for analysis in sent.lattice[token_index]:
                if morph.analysis_equals_no_lemma(gold_analysis, analysis, []):
                    found = True
                    break
            if not found:
                sentence_infused = True
                total_infused_token_lattices += 1
                print('Infusing {} sent_index {} token_index {}: {}'.format(
                    data_set_name, sent_index, token_index, gold_analysis))
                infused_lattice[token_index].append(gold_analysis)
        if sentence_infused:
            total_infused_sentence_lattices += 1
        infused_sent = nlp.Sentence(sent.tokens, infused_lattice, sent.gold_lattice)
        infused_sentences.append(infused_sent)
    print("Total {} infused token lattices = {}".format(data_set_name, total_infused_token_lattices))
    print("Total {} infused sentence lattices = {}".format(data_set_name, total_infused_sentence_lattices))
    return infused_sentences
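# Hypothetical usage sketch (not in the original source): infusion is typically
# applied to the training split so that every gold analysis is guaranteed to appear
# in its token's lexicon lattice. `train_sentences` is a placeholder for a list of
# nlp.Sentence objects built from the lexicon and the gold lattices.
train_sentences = _infuse('train', train_sentences)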
def lattice(tokens: list, lex: Lexicon) -> morph.Lattice:
    lex_entries = [lex.entry(token) for token in tokens]
    lex_lattice = morph.Lattice()
    for token_id, entry in enumerate(lex_entries, start=1):
        lex_lattice[token_id] = entry.analyses
    return lex_lattice
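# Hypothetical usage sketch (not in the original source): build a lexicon-based
# morphological lattice for a raw token sequence. `heb_lexicon` is a placeholder
# for a loaded Lexicon instance; token ids in the lattice are 1-based.
tokens = ['עשרות', 'אנשים', 'מגיעים']
tok_lattice = lattice(tokens, heb_lexicon)
for token_id in tok_lattice:
    print(token_id, tokens[token_id - 1], len(tok_lattice[token_id]), 'analyses')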
def _dataframe_to_sentence(sent_df: pd.DataFrame) -> nlp.Sentence:
    # Row 0 is a padding row; group the remaining rows by token.
    token_gb = sent_df.iloc[1:].groupby(sent_df.token_idx)
    tokens = [tg[1].iloc[0].token for tg in sorted(token_gb)]
    token_analyses = defaultdict(list)
    token_gold_analyses = defaultdict(list)
    for tg in sorted(token_gb):
        token_idx = tg[0]
        analysis_gb = tg[1].groupby(sent_df.analysis_idx)
        for ag in analysis_gb:
            pref_forms = ag[1]['pref_forms'].iloc[0]
            pref_lemmas = ag[1]['pref_lemmas'].iloc[0]
            pref_tags = ag[1]['pref_tags'].iloc[0]
            pref_feats = ag[1]['pref_feats'].iloc[0]
            host_forms = ag[1]['host_forms'].iloc[0]
            host_lemmas = ag[1]['host_lemmas'].iloc[0]
            host_tags = ag[1]['host_tags'].iloc[0]
            host_feats = ag[1]['host_feats'].iloc[0]
            suff_forms = ag[1]['suff_forms'].iloc[0]
            suff_lemmas = ag[1]['suff_lemmas'].iloc[0]
            suff_tags = ag[1]['suff_tags'].iloc[0]
            suff_feats = ag[1]['suff_feats'].iloc[0]
            prefixes, hosts, suffixes = [], [], []
            for form, lemma, tag, fstr in zip(pref_forms, pref_lemmas, pref_tags, pref_feats):
                feats = morph.Features.create([f for f in fstr.split("|") if f != '_'])
                m = morph.Morpheme(form, lemma, tag, feats)
                prefixes.append(m)
            for form, lemma, tag, fstr in zip(host_forms, host_lemmas, host_tags, host_feats):
                feats = morph.Features.create([f for f in fstr.split("|") if f != '_'])
                m = morph.Morpheme(form, lemma, tag, feats)
                hosts.append(m)
            for form, lemma, tag, fstr in zip(suff_forms, suff_lemmas, suff_tags, suff_feats):
                feats = morph.Features.create([f for f in fstr.split("|") if f != '_'])
                m = morph.Morpheme(form, lemma, tag, feats)
                suffixes.append(m)
            analysis = morph.Analysis(prefixes, hosts, suffixes)
            token_analyses[token_idx].append(analysis)
            is_gold = ag[1]['is_gold'].iloc[0]
            if is_gold:
                token_gold_analyses[token_idx].append(analysis)
    lattice = morph.Lattice(token_analyses)
    gold_lattice = morph.Lattice(token_gold_analyses)
    return nlp.Sentence(tokens, lattice, gold_lattice)
def _dataframe_to_sentence(sent_df: pd.DataFrame) -> nlp.Sentence:
    # Row 0 is a padding row; group the remaining rows by token.
    token_gb = sent_df.iloc[1:].groupby(sent_df.token_idx)
    tokens = [tg[1].iloc[0].token for tg in sorted(token_gb)]
    lattice = morph.Lattice()
    gold_lattice = morph.Lattice()
    for tg in sorted(token_gb):
        token_idx = tg[0]
        token_analyses = []
        gold_token_analyses = []
        analysis_gb = tg[1].groupby(sent_df.analysis_idx)
        for ag in analysis_gb:
            prefixes, hosts, suffixes = [], [], []
            morpheme_gb = ag[1].groupby(sent_df.morpheme_idx)
            for mg in morpheme_gb:
                mtype = mg[1]['mtype'].iloc[0]
                form = mg[1]['form'].iloc[0]
                lemma = mg[1]['lemma'].iloc[0]
                tag = mg[1]['tag'].iloc[0]
                gender = mg[1]['gender'].iloc[0]
                number = mg[1]['number'].iloc[0]
                person = mg[1]['person'].iloc[0]
                tense = mg[1]['tense'].iloc[0]
                binyan = mg[1]['binyan'].iloc[0]
                polarity = mg[1]['polarity'].iloc[0]
                feats = morph.Features.create([ff for f in [gender, number, person, tense, binyan, polarity]
                                               for ff in f.split("|") if ff != '_'])
                m = morph.Morpheme(form, lemma, tag, feats)
                if mtype == 'pref':
                    prefixes.append(m)
                elif mtype == 'suff':
                    suffixes.append(m)
                else:
                    hosts.append(m)
            a = morph.Analysis(prefixes, hosts, suffixes)
            token_analyses.append(a)
            is_gold = ag[1]['is_gold'].iloc[0]
            if is_gold:
                gold_token_analyses.append(a)
        lattice[token_idx] = token_analyses
        gold_lattice[token_idx] = gold_token_analyses
    return nlp.Sentence(tokens, lattice, gold_lattice)
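# Hypothetical usage sketch (not in the original source): the morpheme-level
# dataframe variant expects one row per morpheme with at least these columns:
# token, token_idx, analysis_idx, morpheme_idx, mtype, form, lemma, tag, gender,
# number, person, tense, binyan, polarity, is_gold (row 0 is skipped as padding).
# `sent_df` here is a placeholder for such a dataframe.
sentence = _dataframe_to_sentence(sent_df)
print(sentence.tokens)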
def _normalize(sentences: list) -> list:
    normalized_sentences = []
    for sent in sentences:
        normalized_gold_lattice = morph.Lattice()
        for token_index in sent.lattice:
            token_lattice = sent.lattice[token_index]
            gold = sent.gold_lattice[token_index][0]
            normalized_gold = _map_normalized_gold_analysis(token_lattice, gold)
            normalized_gold = _reduce_normalized_gold_analysis(gold, normalized_gold)
            normalized_gold_lattice[token_index] = [normalized_gold]
        normalized_sent = nlp.Sentence(sent.tokens, sent.lattice, normalized_gold_lattice)
        normalized_sentences.append(normalized_sent)
    return normalized_sentences
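# Hypothetical usage sketch (not in the original source): normalization rewrites
# each token's gold analysis via the module's mapping/reduction helpers against the
# token's lexicon lattice, so it is typically run after the lattices are built (and,
# for the training split, after infusion). `dev_sentences` is a placeholder.
dev_sentences = _normalize(dev_sentences)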
def ptrnet_ma(tokens: list, lex: Lexicon, model: PtrNetModel) -> nlp.Sentence:
    lattice = ma.lattice(tokens, lex)
    # No gold annotation is available at inference time - use an empty gold analysis per token.
    gold_lattice = morph.Lattice()
    for token_id in lattice:
        gold_lattice[token_id] = [morph.Analysis([], [], [])]
    sentence = nlp.Sentence(tokens, lattice, gold_lattice)
    # Extend the model vocabulary and embedding matrices with any out-of-vocabulary forms/lemmas.
    new_tokens, new_forms, new_lemmas = model.vocab.update(sentence)
    if new_forms:
        new_form_matrix = model_ft.get_word_vectors(home_path, sorted(new_forms))
        new_form_matrix = torch.tensor(new_form_matrix, dtype=torch.float, device=device)
        model.emb.update_form_emb_(new_form_matrix)
    if new_lemmas:
        new_lemma_matrix = model_ft.get_word_vectors(home_path, sorted(new_lemmas))
        new_lemma_matrix = torch.tensor(new_lemma_matrix, dtype=torch.float, device=device)
        model.emb.update_lemma_emb_(new_lemma_matrix)
    return sentence
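# Hypothetical usage sketch (not in the original source): run lexicon-based
# morphological analysis for a new input and let the pointer-network model's
# vocabulary and embeddings grow to cover unseen forms and lemmas before decoding.
# `heb_lexicon` and `ptrnet_model` are placeholder names.
tokens = ['עשרות', 'אנשים', 'מגיעים']
sentence = ptrnet_ma(tokens, heb_lexicon, ptrnet_model)
print(len(sentence.tokens), 'token lattices ready for disambiguation')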