Пример #1
0
def arr_to_sentence(arr: np.ndarray, vocab: MorphVocab) -> nlp.Sentence:
    """Rebuild an ``nlp.Sentence`` from a morpheme-level id array.

    Args:
        arr: 3-D array whose axis 2 holds the per-morpheme fields
            (presumably token x morpheme slot x field -- TODO confirm the
            meaning of the first two axes). It is flattened to one row per
            morpheme. Columns read from each row: 0 form id, 1 lemma id,
            2 tag id, 3-8 feature ids (gender, number, person, tense,
            binyan, polarity), 9 token id, 11 token index, 12 analysis
            index, 13 gold flag, 14 morpheme index, 15 morpheme-type id.
        vocab: Vocabulary providing the ``forms``, ``lemmas``, ``tags``,
            ``feats`` and ``tokens`` lookups.

    Returns:
        A sentence built from the token list, the lattice of all analyses
        and the lattice restricted to gold analyses.
    """
    morphemes = {}
    a2m = defaultdict(list)   # (token_idx, analysis_idx) -> morpheme indices
    t2a = defaultdict(set)    # token_idx -> analysis indices
    t2t = {}                  # token_idx -> token string
    # reshape (not np.resize) is the call that understands the -1
    # placeholder; np.resize rejects negative dimensions on recent NumPy.
    analysis_arr = arr.reshape(-1, arr.shape[2])
    for morpheme_arr in analysis_arr:
        # Rows whose form id is 0 are skipped (presumably padding).
        if morpheme_arr[0] == 0:
            continue
        form = vocab.forms[morpheme_arr[0]]
        lemma = vocab.lemmas[morpheme_arr[1]]
        tag = vocab.tags[morpheme_arr[2]]
        gender = vocab.feats[morpheme_arr[3]]
        number = vocab.feats[morpheme_arr[4]]
        person = vocab.feats[morpheme_arr[5]]
        tense = vocab.feats[morpheme_arr[6]]
        binyan = vocab.feats[morpheme_arr[7]]
        polarity = vocab.feats[morpheme_arr[8]]
        token = vocab.tokens[morpheme_arr[9]]
        token_idx = morpheme_arr[11]
        analysis_idx = morpheme_arr[12]
        is_gold = morpheme_arr[13]
        morpheme_idx = morpheme_arr[14]
        mtype = _mtypes[morpheme_arr[15]]
        # Each feature value is a '|'-separated string; '_' entries are dropped.
        feats = morph.Features.create([
            ff
            for f in [gender, number, person, tense, binyan, polarity]
            for ff in f.split("|")
            if ff != '_'
        ])
        m = morph.Morpheme(form, lemma, tag, feats)
        morphemes[(token_idx, analysis_idx, morpheme_idx)] = (mtype, m, is_gold)
        a2m[(token_idx, analysis_idx)].append(morpheme_idx)
        t2a[token_idx].add(analysis_idx)
        t2t[token_idx] = token
    lattice = morph.Lattice()
    gold_lattice = morph.Lattice()
    tokens = []
    for token_idx in t2a:
        # Token index 0 is not emitted (presumably a sentence-start
        # placeholder -- TODO confirm).
        if token_idx == 0:
            continue
        token_analyses = []
        gold_token_analyses = []
        for analysis_idx in sorted(t2a[token_idx]):
            prefixes, hosts, suffixes = [], [], []
            is_gold_analysis = False
            for morpheme_idx in a2m[(token_idx, analysis_idx)]:
                mtype, m, is_gold = morphemes[(token_idx, analysis_idx, morpheme_idx)]
                if mtype == 'pref':
                    prefixes.append(m)
                elif mtype == 'suff':
                    suffixes.append(m)
                else:
                    hosts.append(m)
                # The gold flag of the last morpheme decides for the analysis.
                is_gold_analysis = is_gold != 0
            a = morph.Analysis(prefixes, hosts, suffixes)
            token_analyses.append(a)
            if is_gold_analysis:
                gold_token_analyses.append(a)
        lattice[token_idx] = token_analyses
        gold_lattice[token_idx] = gold_token_analyses
        tokens.append(t2t[token_idx])
    return nlp.Sentence(tokens, lattice, gold_lattice)
Пример #2
0
def arr_to_sentence(arr: np.ndarray, vocab: TokenVocab) -> nlp.Sentence:
    """Rebuild an ``nlp.Sentence`` from an analysis-level id array.

    Every row of ``arr`` describes one analysis. Columns 0-3 index the
    prefix form/lemma/tag/feature sequences in ``vocab``, columns 4-7 the
    host sequences, columns 8-11 the suffix sequences, column 12 the token,
    column 14 the token index and column 16 the gold flag. Reading stops at
    the first row whose host-tag id (column 6) is 0.

    Returns:
        A sentence made of the tokens ordered by token index, the lattice
        of all analyses and the lattice of gold analyses.
    """
    def build_morphemes(forms, lemmas, tags, feats):
        # One Morpheme per aligned (form, lemma, tag, feats) quadruple.
        return [
            morph.Morpheme(*quad)
            for quad in zip(forms, lemmas, tags, feats)
        ]

    token_by_idx = {}
    analyses_by_token = defaultdict(list)
    gold_by_token = defaultdict(list)
    for row in arr:
        # Vocabulary lookups are done before the stop check on column 6.
        pref_parts = (
            vocab.pref_forms[row[0]],
            vocab.pref_lemmas[row[1]],
            vocab.pref_tags[row[2]],
            vocab.pref_feats[row[3]],
        )
        host_parts = (
            vocab.host_forms[row[4]],
            vocab.host_lemmas[row[5]],
            vocab.host_tags[row[6]],
            vocab.host_feats[row[7]],
        )
        suff_parts = (
            vocab.suff_forms[row[8]],
            vocab.suff_lemmas[row[9]],
            vocab.suff_tags[row[10]],
            vocab.suff_feats[row[11]],
        )
        token = vocab.tokens[row[12]]
        token_idx = row[14]
        gold_flag = row[16]
        if row[6] == 0:
            break
        token_by_idx[token_idx] = token
        analysis = morph.Analysis(
            build_morphemes(*pref_parts),
            build_morphemes(*host_parts),
            build_morphemes(*suff_parts),
        )
        analyses_by_token[token_idx].append(analysis)
        if gold_flag:
            gold_by_token[token_idx].append(analysis)

    ordered_tokens = [token_by_idx[idx] for idx in sorted(token_by_idx)]
    lattice = morph.Lattice()
    for idx, analyses in analyses_by_token.items():
        lattice[idx] = analyses
    gold_lattice = morph.Lattice()
    for idx, analyses in gold_by_token.items():
        gold_lattice[idx] = analyses
    return nlp.Sentence(ordered_tokens, lattice, gold_lattice)
Пример #3
0
def _dataframe_to_sentence(sent_df: pd.DataFrame) -> nlp.Sentence:
    """Rebuild an ``nlp.Sentence`` from a per-analysis sentence data frame.

    Args:
        sent_df: Frame with the columns ``token_idx``, ``analysis_idx``,
            ``token``, ``is_gold`` and, for each of ``pref``/``host``/
            ``suff``, the sequence-valued columns ``*_forms``, ``*_lemmas``,
            ``*_tags`` and ``*_feats``. The first row is skipped
            (presumably a sentence-start placeholder -- TODO confirm).

    Returns:
        A sentence built from the tokens ordered by token index, the
        lattice of all analyses and the lattice of gold analyses. A token
        without any gold analysis gets no entry in the gold lattice.
    """
    def build_morphemes(forms, lemmas, tags, feat_strs):
        # Each feature string is '|'-separated; '_' entries are dropped.
        built = []
        for form, lemma, tag, fstr in zip(forms, lemmas, tags, feat_strs):
            feats = morph.Features.create([f for f in fstr.split("|") if f != '_'])
            built.append(morph.Morpheme(form, lemma, tag, feats))
        return built

    token_groups = sorted(sent_df.iloc[1:].groupby(sent_df.token_idx))
    tokens = [tg[1].iloc[0].token for tg in token_groups]
    # Keep the per-token mappings as dicts for the whole loop; rebinding
    # them to lists per token made the token_idx-keyed appends fail and
    # left the gold mapping empty.
    token_analyses = defaultdict(list)
    token_gold_analyses = defaultdict(list)
    for token_idx, token_df in token_groups:
        for _, analysis_df in token_df.groupby(sent_df.analysis_idx):
            parts = []
            for kind in ('pref', 'host', 'suff'):
                # Only the first row of an analysis group is consulted.
                columns = [
                    analysis_df[kind + '_' + field].iloc[0]
                    for field in ('forms', 'lemmas', 'tags', 'feats')
                ]
                parts.append(build_morphemes(*columns))
            prefixes, hosts, suffixes = parts
            analysis = morph.Analysis(prefixes, hosts, suffixes)
            token_analyses[token_idx].append(analysis)
            if analysis_df['is_gold'].iloc[0]:
                token_gold_analyses[token_idx].append(analysis)
    lattice = morph.Lattice(token_analyses)
    gold_lattice = morph.Lattice(token_gold_analyses)
    return nlp.Sentence(tokens, lattice, gold_lattice)
Пример #4
0
def _reduce_normalized_gold_analysis(gold: morph.Analysis, normalized_gold: list):
    """Collapse several normalized gold analyses into a single analysis.

    Args:
        gold: The original gold analysis.
        normalized_gold: Candidate normalized analyses derived from ``gold``.

    Returns:
        The only candidate when there is exactly one; ``gold`` itself when
        the features computed by ``_get_norm_analysis_features`` are fewer
        than those of ``gold``'s first host; otherwise a new analysis that
        keeps ``gold``'s prefixes and suffixes and has one merged host
        morpheme.
    """
    if len(normalized_gold) == 1:
        return normalized_gold[0]
    f = _get_norm_analysis_features(normalized_gold, gold)
    if len(f) < len(gold.hosts[0].feats):
        return gold
    # De-duplicate in first-seen order. Going through a set made the chosen
    # form/tag depend on hash iteration order, which can change from run to
    # run for strings.
    forms = list(dict.fromkeys(
        form for a in normalized_gold for form in a.host_forms))
    lemmas = list(dict.fromkeys(
        lemma for a in normalized_gold for lemma in a.host_lemmas))
    tags = list(dict.fromkeys(
        tag for a in normalized_gold for tag in a.host_tags))
    form = forms[0]
    if len(lemmas) == 1:
        lemma = lemmas[0]
    elif len(lemmas) == 0:
        lemma = None
    else:
        # Candidates disagree on the lemma: fall back to gold's last host lemma.
        lemma = gold.host_lemmas[-1]
    tag = tags[0]
    m = morph.Morpheme(form, lemma, tag, f)
    return morph.Analysis(gold.prefixes, [m], gold.suffixes)
Пример #5
0
 def _create_analysis(self, path: list) -> morph.Analysis:
     """Turn a lattice path into an analysis of prefixes, hosts and suffixes.

     For every edge the prefix factory is tried first. An edge that yields
     a prefix morpheme contributes nothing else: the morpheme goes to the
     prefixes while no host has been collected yet, and to the hosts
     afterwards. Any other edge may contribute a host morpheme, a suffix
     morpheme, or both.
     """
     pref_morphemes, host_morphemes, suff_morphemes = [], [], []
     for edge in path:
         pref = self._create_pref_morpheme(edge)
         if pref is None:
             host = self._create_host_morpheme(edge)
             if host is not None:
                 host_morphemes.append(host)
             suff = self._create_suff_morpheme(edge)
             if suff is not None:
                 suff_morphemes.append(suff)
         else:
             # After the first host, prefix-type morphemes count as hosts.
             target = host_morphemes if host_morphemes else pref_morphemes
             target.append(pref)
     return morph.Analysis(pref_morphemes, host_morphemes, suff_morphemes)
Пример #6
0
def _dataframe_to_sentence(sent_df: pd.DataFrame) -> nlp.Sentence:
    """Rebuild an ``nlp.Sentence`` from a per-morpheme sentence data frame.

    The frame is grouped by ``token_idx``, then ``analysis_idx``, then
    ``morpheme_idx``; the first row of ``sent_df`` is left out. Each
    morpheme group supplies ``mtype``, ``form``, ``lemma``, ``tag`` and six
    '|'-separated feature columns, from which '_' entries are dropped. An
    analysis is gold when the ``is_gold`` value of its first row is truthy.

    Returns:
        A sentence built from the tokens ordered by token index, the
        lattice of all analyses and the lattice of gold analyses.
    """
    feature_columns = ['gender', 'number', 'person', 'tense', 'binyan', 'polarity']
    token_groups = sorted(sent_df.iloc[1:].groupby(sent_df.token_idx))
    tokens = [token_df.iloc[0].token for _, token_df in token_groups]
    lattice = morph.Lattice()
    gold_lattice = morph.Lattice()
    for token_idx, token_df in token_groups:
        all_analyses, gold_analyses = [], []
        for _, analysis_df in token_df.groupby(sent_df.analysis_idx):
            prefixes, hosts, suffixes = [], [], []
            for _, morpheme_df in analysis_df.groupby(sent_df.morpheme_idx):
                mtype = morpheme_df['mtype'].iloc[0]
                form = morpheme_df['form'].iloc[0]
                lemma = morpheme_df['lemma'].iloc[0]
                tag = morpheme_df['tag'].iloc[0]
                raw_feats = [morpheme_df[col].iloc[0] for col in feature_columns]
                feat_items = [
                    item
                    for raw in raw_feats
                    for item in raw.split("|")
                    if item != '_'
                ]
                morpheme = morph.Morpheme(
                    form, lemma, tag, morph.Features.create(feat_items))
                # Anything that is neither 'pref' nor 'suff' is a host.
                if mtype == 'pref':
                    bucket = prefixes
                elif mtype == 'suff':
                    bucket = suffixes
                else:
                    bucket = hosts
                bucket.append(morpheme)
            analysis = morph.Analysis(prefixes, hosts, suffixes)
            all_analyses.append(analysis)
            if analysis_df['is_gold'].iloc[0]:
                gold_analyses.append(analysis)
        lattice[token_idx] = all_analyses
        gold_lattice[token_idx] = gold_analyses
    return nlp.Sentence(tokens, lattice, gold_lattice)
Пример #7
0
def ptrnet_ma(tokens: list, lex: Lexicon, model: PtrNetModel) -> nlp.Sentence:
    """Build a sentence for ``tokens`` and register its new vocabulary.

    The lattice comes from ``ma.lattice(tokens, lex)``; the gold lattice
    holds a single empty analysis for every token id of that lattice. The
    model vocabulary is then updated with the sentence, and for any forms
    or lemmas the update reports as new, word vectors are fetched through
    the module-level ``model_ft``/``home_path`` and handed to the model's
    embedding layer as float tensors on the module-level ``device``.

    Returns:
        The constructed sentence.
    """
    def word_matrix(words):
        # Vectors are requested for the words in sorted order.
        vectors = model_ft.get_word_vectors(home_path, sorted(words))
        return torch.tensor(vectors, dtype=torch.float, device=device)

    lattice = ma.lattice(tokens, lex)
    gold_lattice = morph.Lattice()
    for token_id in lattice:
        gold_lattice[token_id] = [morph.Analysis([], [], [])]
    sentence = nlp.Sentence(tokens, lattice, gold_lattice)

    # The first element of the update result (new tokens) is not needed here.
    _, added_forms, added_lemmas = model.vocab.update(sentence)
    if added_forms:
        model.emb.update_form_emb_(word_matrix(added_forms))
    if added_lemmas:
        model.emb.update_lemma_emb_(word_matrix(added_lemmas))
    return sentence
Пример #8
0
 def append_sos_analysis(sent_idx: int):
     """Append the '<SOS>' placeholder analysis for sentence ``sent_idx``.

     The analysis has a single host morpheme whose form, lemma and tag are
     all '<SOS>' and whose features are ``morph.EMPTY_FEATURES``.
     """
     sos = '<SOS>'
     sos_analysis = morph.Analysis(
         [], [morph.Morpheme(sos, sos, sos, morph.EMPTY_FEATURES)], [])
     # Positional arguments after sent_idx are presumably token index 0,
     # analysis index 0, the analysis, gold flag 1 and the token text --
     # TODO confirm against append_analysis.
     append_analysis(sent_idx, 0, 0, sos_analysis, 1, sos)