示例#1
0
def arr_to_sentence(arr: np.ndarray, vocab: MorphVocab) -> nlp.Sentence:
    """Decode a morpheme-level index array back into an ``nlp.Sentence``.

    ``arr`` is flattened to rows of ``arr.shape[2]`` columns, one row per
    morpheme. Columns 0-9 are indices into ``vocab`` (form, lemma, tag, six
    feature slots, token); columns 10-14 are read as raw values (sentence
    index, token index, analysis index, gold flag, morpheme index); column 15
    indexes ``_mtypes``.

    Rows whose form index is 0 are skipped (presumably padding - confirm
    against the encoder), and so is token index 0.

    :param arr: integer array with at least three axes; only ``arr.shape[2]``
        is used as the row width.
    :param vocab: vocabulary providing ``forms``, ``lemmas``, ``tags``,
        ``feats`` and ``tokens`` lookups.
    :return: sentence holding the tokens, the full lattice and the gold lattice.
    """
    morphemes = {}
    a2m = defaultdict(list)   # (token_idx, analysis_idx) -> morpheme indices, in row order
    t2a = defaultdict(set)    # token_idx -> analysis indices
    t2t = {}                  # token_idx -> token string
    # np.resize has no "-1 means infer" semantics (recent NumPy raises
    # ValueError on negative sizes); reshape is the call that flattens the
    # leading axes into rows.
    analysis_arr = np.reshape(arr, (-1, arr.shape[2]))
    for morpheme_arr in analysis_arr:
        if morpheme_arr[0] == 0:
            continue
        form = vocab.forms[morpheme_arr[0]]
        lemma = vocab.lemmas[morpheme_arr[1]]
        tag = vocab.tags[morpheme_arr[2]]
        # Columns 3-8: gender, number, person, tense, binyan, polarity.
        feat_values = [vocab.feats[morpheme_arr[col]] for col in range(3, 9)]
        token = vocab.tokens[morpheme_arr[9]]
        token_idx = morpheme_arr[11]
        analysis_idx = morpheme_arr[12]
        is_gold = morpheme_arr[13]
        morpheme_idx = morpheme_arr[14]
        mtype = _mtypes[morpheme_arr[15]]
        # Each feature slot may hold several '|'-separated values; '_' marks "no value".
        feats = morph.Features.create([ff for f in feat_values for ff in f.split("|") if ff != '_'])
        m = morph.Morpheme(form, lemma, tag, feats)
        morphemes[(token_idx, analysis_idx, morpheme_idx)] = (mtype, m, is_gold)
        a2m[(token_idx, analysis_idx)].append(morpheme_idx)
        t2a[token_idx].add(analysis_idx)
        t2t[token_idx] = token
    lattice = morph.Lattice()
    gold_lattice = morph.Lattice()
    tokens = []
    # Tokens are emitted in first-seen order of their token index.
    for token_idx in t2a:
        if token_idx == 0:
            continue
        token_analyses = []
        gold_token_analyses = []
        for analysis_idx in sorted(t2a[token_idx]):
            prefixes, hosts, suffixes = [], [], []
            is_gold_analysis = False
            for morpheme_idx in a2m[(token_idx, analysis_idx)]:
                mtype, m, is_gold = morphemes[(token_idx, analysis_idx, morpheme_idx)]
                if mtype == 'pref':
                    prefixes.append(m)
                elif mtype == 'suff':
                    suffixes.append(m)
                else:
                    hosts.append(m)
                # The gold flag is stored per morpheme row; the last morpheme's value decides.
                is_gold_analysis = is_gold != 0
            a = morph.Analysis(prefixes, hosts, suffixes)
            token_analyses.append(a)
            if is_gold_analysis:
                gold_token_analyses.append(a)
        lattice[token_idx] = token_analyses
        gold_lattice[token_idx] = gold_token_analyses
        tokens.append(t2t[token_idx])
    return nlp.Sentence(tokens, lattice, gold_lattice)
示例#2
0
def arr_to_sentence(arr: np.ndarray, vocab: TokenVocab) -> nlp.Sentence:
    """Decode an analysis-level index array into an ``nlp.Sentence``.

    Every row of ``arr`` describes one analysis. Columns 0-3, 4-7 and 8-11
    index the prefix, host and suffix (forms, lemmas, tags, feats) tables of
    ``vocab``; column 12 indexes the token table, column 14 is the token
    index and column 16 the gold flag. Decoding stops at the first row whose
    host-tag index (column 6) is 0 (presumably the padding tag - confirm
    against the vocabulary).

    :param arr: iterable of integer rows with at least 17 columns.
    :param vocab: vocabulary exposing the per-slot lookup tables.
    :return: sentence with tokens ordered by token index, the full lattice
        and the lattice of gold-flagged analyses.
    """
    surface_by_idx = {}
    analyses_by_idx = defaultdict(list)
    gold_by_idx = defaultdict(list)
    for row in arr:
        # Every lookup is performed before the stop check, as the row is laid out.
        pref_fields = (vocab.pref_forms[row[0]], vocab.pref_lemmas[row[1]],
                       vocab.pref_tags[row[2]], vocab.pref_feats[row[3]])
        host_fields = (vocab.host_forms[row[4]], vocab.host_lemmas[row[5]],
                       vocab.host_tags[row[6]], vocab.host_feats[row[7]])
        suff_fields = (vocab.suff_forms[row[8]], vocab.suff_lemmas[row[9]],
                       vocab.suff_tags[row[10]], vocab.suff_feats[row[11]])
        surface = vocab.tokens[row[12]]
        idx = row[14]
        gold_flag = row[16]
        if row[6] == 0:
            break
        surface_by_idx[idx] = surface
        # zip(*fields) walks the parallel form/lemma/tag/feats sequences morpheme by morpheme.
        analysis = morph.Analysis(
            [morph.Morpheme(*fields) for fields in zip(*pref_fields)],
            [morph.Morpheme(*fields) for fields in zip(*host_fields)],
            [morph.Morpheme(*fields) for fields in zip(*suff_fields)],
        )
        analyses_by_idx[idx].append(analysis)
        if gold_flag:
            gold_by_idx[idx].append(analysis)
    ordered_tokens = [surface_by_idx[idx] for idx in sorted(surface_by_idx)]
    lattice = morph.Lattice()
    for idx, analyses in analyses_by_idx.items():
        lattice[idx] = analyses
    gold_lattice = morph.Lattice()
    for idx, analyses in gold_by_idx.items():
        gold_lattice[idx] = analyses
    return nlp.Sentence(ordered_tokens, lattice, gold_lattice)
示例#3
0
 def _create_sentence(self, lexicon: lex.Lexicon, tokens: list, md_lattice: conllx.LatticeGraph) -> nlp.Sentence:
     """Build a sentence from lexicon analyses plus the gold lattice paths.

     The lattice maps each 1-based token id to the analyses of that token's
     lexicon entry. The gold lattice maps the same ids to the analyses created
     from ``md_lattice._token_paths``; exactly one gold path per token is
     required.

     :raises ValueError: if a token does not have exactly one gold analysis.
     """
     entries = [lexicon.entry(token) for token in tokens]
     lattice = morph.Lattice()
     for token_id, entry in enumerate(entries, start=1):
         lattice[token_id] = entry.analyses
     gold_lattice = morph.Lattice()
     for token_id, _ in enumerate(tokens, start=1):
         gold_analysis = [self._create_analysis(path) for path in md_lattice._token_paths[token_id]]
         if len(gold_analysis) != 1:
             raise ValueError("token gold analysis: {}".format(gold_analysis))
         gold_lattice[token_id] = gold_analysis
     return nlp.Sentence(tokens, lattice, gold_lattice)
示例#4
0
def _infuse(data_set_name: str, sentences: list) -> list:
    """Return copies of ``sentences`` whose lattices contain the gold analysis.

    For every token in a sentence's gold lattice, the token's analyses are
    copied; if none of them matches the gold analysis according to
    ``morph.analysis_equals_no_lemma``, the gold analysis is appended to the
    copy and the event is printed. Totals of infused tokens and of sentences
    with at least one infused token are printed at the end.

    :param data_set_name: label used only in the printed messages.
    :param sentences: sentences to process; they are not modified.
    :return: list of new ``nlp.Sentence`` objects, one per input sentence.
    """
    result = []
    infused_tokens = 0
    infused_sentences = 0
    for sent_index, sent in enumerate(sentences):
        tokens_before = infused_tokens
        infused_lattice = morph.Lattice()
        for token_index in sent.gold_lattice:
            infused_lattice[token_index] = sent.lattice[token_index].copy()
            gold_analysis = sent.analysis(token_index)
            if any(morph.analysis_equals_no_lemma(gold_analysis, candidate, [])
                   for candidate in sent.lattice[token_index]):
                continue
            infused_tokens += 1
            print('Infusing {} sent_index {} token_index {}: {}'.format(data_set_name, sent_index, token_index, gold_analysis))
            infused_lattice[token_index].append(gold_analysis)
        # The sentence counts once if any of its tokens needed infusion.
        if infused_tokens > tokens_before:
            infused_sentences += 1
        result.append(nlp.Sentence(sent.tokens, infused_lattice, sent.gold_lattice))
    print("Total {} infused token lattices = {}".format(data_set_name, infused_tokens))
    print("Total {} infused sentence lattices = {}".format(data_set_name, infused_sentences))
    return result
示例#5
0
文件: ma.py 项目: aseker00/meryl
def lattice(tokens: list, lex: Lexicon) -> morph.Lattice:
    """Build a lattice mapping 1-based token ids to their lexicon analyses.

    :param tokens: token strings, in sentence order.
    :param lex: lexicon whose ``entry(token)`` result exposes ``analyses``.
    :return: ``morph.Lattice`` with one item per token.
    """
    entries = [lex.entry(token) for token in tokens]
    result = morph.Lattice()
    for token_id, entry in enumerate(entries, start=1):
        result[token_id] = entry.analyses
    return result
示例#6
0
def _dataframe_to_sentence(sent_df: pd.DataFrame) -> nlp.Sentence:
    """Rebuild an ``nlp.Sentence`` from an analysis-level data frame.

    The first row of ``sent_df`` is skipped. The remaining rows are grouped
    by ``token_idx`` and then by ``analysis_idx``; the first row of each
    analysis group supplies the parallel ``{pref,host,suff}_{forms,lemmas,
    tags,feats}`` sequences and the ``is_gold`` flag.

    :param sent_df: frame with the columns named above plus ``token``.
    :return: sentence with tokens ordered by token index, a lattice of all
        analyses per token, and a lattice of the gold-flagged analyses.
    """
    # Sort once, by group key only, and reuse the result for tokens and lattices.
    token_groups = sorted(sent_df.iloc[1:].groupby(sent_df.token_idx), key=lambda tg: tg[0])
    tokens = [token_df.iloc[0].token for _, token_df in token_groups]
    # These two dicts are what the lattices are built from; they must not be
    # rebound to per-token lists inside the loop.
    token_analyses = defaultdict(list)
    token_gold_analyses = defaultdict(list)
    for token_idx, token_df in token_groups:
        for _, analysis_df in token_df.groupby(sent_df.analysis_idx):
            parts = []
            for kind in ('pref', 'host', 'suff'):
                forms = analysis_df[kind + '_forms'].iloc[0]
                lemmas = analysis_df[kind + '_lemmas'].iloc[0]
                tags = analysis_df[kind + '_tags'].iloc[0]
                feat_strs = analysis_df[kind + '_feats'].iloc[0]
                kind_morphemes = []
                for form, lemma, tag, fstr in zip(forms, lemmas, tags, feat_strs):
                    # '|' separates feature values; '_' marks "no value".
                    feats = morph.Features.create([f for f in fstr.split("|") if f != '_'])
                    kind_morphemes.append(morph.Morpheme(form, lemma, tag, feats))
                parts.append(kind_morphemes)
            prefixes, hosts, suffixes = parts
            analysis = morph.Analysis(prefixes, hosts, suffixes)
            token_analyses[token_idx].append(analysis)
            is_gold = analysis_df['is_gold'].iloc[0]
            if is_gold:
                token_gold_analyses[token_idx].append(analysis)
    lattice = morph.Lattice(token_analyses)
    gold_lattice = morph.Lattice(token_gold_analyses)
    return nlp.Sentence(tokens, lattice, gold_lattice)
示例#7
0
def _dataframe_to_sentence(sent_df: pd.DataFrame) -> nlp.Sentence:
    """Rebuild an ``nlp.Sentence`` from a morpheme-level data frame.

    The first row of ``sent_df`` is skipped. The remaining rows are grouped
    by ``token_idx``, then ``analysis_idx``, then ``morpheme_idx``. The first
    row of each morpheme group supplies the morpheme's type, form, lemma, tag
    and six feature columns. The first row of each analysis group supplies
    the ``is_gold`` flag.

    :param sent_df: frame with the columns named above plus ``token``.
    :return: sentence whose lattice and gold lattice both have one entry per
        token index (the gold entry may be an empty list).
    """
    feature_columns = ('gender', 'number', 'person', 'tense', 'binyan', 'polarity')
    ordered_groups = sorted(sent_df.iloc[1:].groupby(sent_df.token_idx))
    tokens = [group_df.iloc[0].token for _, group_df in ordered_groups]
    lattice = morph.Lattice()
    gold_lattice = morph.Lattice()
    for token_idx, token_df in ordered_groups:
        all_analyses, gold_analyses = [], []
        for _, analysis_df in token_df.groupby(sent_df.analysis_idx):
            hosts = []
            affixes = {'pref': [], 'suff': []}
            for _, morpheme_df in analysis_df.groupby(sent_df.morpheme_idx):
                mtype = morpheme_df['mtype'].iloc[0]
                form = morpheme_df['form'].iloc[0]
                lemma = morpheme_df['lemma'].iloc[0]
                tag = morpheme_df['tag'].iloc[0]
                raw_feats = [morpheme_df[column].iloc[0] for column in feature_columns]
                # '|' separates values within a feature slot; '_' marks "no value".
                feat_items = []
                for value in raw_feats:
                    feat_items.extend(item for item in value.split("|") if item != '_')
                morpheme = morph.Morpheme(form, lemma, tag, morph.Features.create(feat_items))
                # Anything that is neither prefix nor suffix is a host.
                affixes.get(mtype, hosts).append(morpheme)
            analysis = morph.Analysis(affixes['pref'], hosts, affixes['suff'])
            all_analyses.append(analysis)
            if analysis_df['is_gold'].iloc[0]:
                gold_analyses.append(analysis)
        lattice[token_idx] = all_analyses
        gold_lattice[token_idx] = gold_analyses
    return nlp.Sentence(tokens, lattice, gold_lattice)
示例#8
0
def _normalize(sentences: list) -> list:
    """Return copies of ``sentences`` with a normalized gold lattice.

    For each token index of a sentence's lattice, the first gold analysis is
    passed with the token's analyses to ``_map_normalized_gold_analysis``;
    the result is then passed with the original gold analysis to
    ``_reduce_normalized_gold_analysis``. The outcome becomes the token's
    single gold analysis. Tokens and the lattice itself are reused as-is.

    :param sentences: sentences to normalize; they are not modified.
    :return: list of new ``nlp.Sentence`` objects, one per input sentence.
    """
    result = []
    for sent in sentences:
        gold_lattice = morph.Lattice()
        for token_index in sent.lattice:
            candidates = sent.lattice[token_index]
            original_gold = sent.gold_lattice[token_index][0]
            mapped = _map_normalized_gold_analysis(candidates, original_gold)
            gold_lattice[token_index] = [_reduce_normalized_gold_analysis(original_gold, mapped)]
        result.append(nlp.Sentence(sent.tokens, sent.lattice, gold_lattice))
    return result
示例#9
0
def ptrnet_ma(tokens: list, lex: Lexicon, model: PtrNetModel) -> nlp.Sentence:
    """Build a sentence for ``tokens`` and register its new words with ``model``.

    The lattice comes from ``ma.lattice``; the gold lattice holds one empty
    ``morph.Analysis`` per token id as a placeholder. The sentence is passed
    to ``model.vocab.update``. For any new forms or lemmas it reports, word
    vectors are fetched via ``model_ft.get_word_vectors`` and handed to the
    matching embedding-update method of ``model.emb``.

    ``model_ft``, ``home_path`` and ``device`` are not parameters; they are
    resolved from the enclosing scope.

    :return: the constructed sentence.
    """
    def as_tensor(words):
        # Words are sorted before lookup, so vector rows follow sorted word order.
        vectors = model_ft.get_word_vectors(home_path, sorted(words))
        return torch.tensor(vectors, dtype=torch.float, device=device)

    lattice = ma.lattice(tokens, lex)
    gold_lattice = morph.Lattice()
    for token_id in lattice:
        gold_lattice[token_id] = [morph.Analysis([], [], [])]
    sentence = nlp.Sentence(tokens, lattice, gold_lattice)
    _, added_forms, added_lemmas = model.vocab.update(sentence)
    if added_forms:
        model.emb.update_form_emb_(as_tensor(added_forms))
    if added_lemmas:
        model.emb.update_lemma_emb_(as_tensor(added_lemmas))
    return sentence