Пример #1
0
def main():
    """Load (or build and cache) the BGU lexicon, the Hebrew treebank and the
    morph-level vocabulary, then print corpus statistics, vocabulary sizes and
    the sizes of the five morphological dataset partitions.
    """
    root = Path('.')
    tokens_idx = '01'
    lattices_idx = '10'
    # Treebank source files: token lists plus gold morphological lattices
    # for the train/dev/test splits.
    tb_files = {'train-hebtb.tokens': f'data/clean/spmrl/hebtb/train-hebtb-{tokens_idx}-tokens.txt',
                'train-hebtb-gold.lattices': f'data/clean/spmrl/hebtb/train-hebtb-{lattices_idx}-gold.lattices',
                'dev-hebtb.tokens': f'data/clean/spmrl/hebtb/dev-hebtb-{tokens_idx}-tokens.txt',
                'dev-hebtb-gold.lattices': f'data/clean/spmrl/hebtb/dev-hebtb-{lattices_idx}-gold.lattices',
                'test-hebtb.tokens': f'data/clean/spmrl/hebtb/test-hebtb-{tokens_idx}-tokens.txt',
                'test-hebtb-gold.lattices': f'data/clean/spmrl/hebtb/test-hebtb-{lattices_idx}-gold.lattices'}
    # Lexicon source files: raw prefix lexicon and the cleaned main lexicon.
    lex_files = {'pref-lex': 'data/raw/spmrl/bgulex/bgupreflex_withdef.utf8.hr',
                 'lex': 'data/clean/spmrl/bgulex/bgulex-03.hr'}
    lexicon_pickle = Path('data/processed/spmrl/bgulex.pickle')
    treebank_pickle = Path('data/processed/spmrl/hebtb.pickle')
    vocab_pickle = Path('data/processed/spmrl/hebtb-morph-vocab/vocab.pickle')

    # Load each cached artifact when its pickle exists; otherwise build it
    # from the source files and cache the result for subsequent runs.
    if lexicon_pickle.exists():
        lexicon = Lexicon.load(lexicon_pickle)
    else:
        lexicon = Lexicon(lex_files)
        lexicon.save(lexicon_pickle)
    if treebank_pickle.exists():
        treebank = Treebank.load(treebank_pickle)
    else:
        treebank = Treebank(lexicon, tb_files)
        treebank.save(treebank_pickle)

    print(f"Train sentences: {len(treebank.infused_train_sentences)}")
    print(f"Dev sentences: {len(treebank.infused_dev_sentences)}")
    print(f"Test sentences: {len(treebank.infused_test_sentences)}")

    # The vocabulary is built over all infused splits combined.
    all_sentences = (treebank.infused_train_sentences
                     + treebank.infused_dev_sentences
                     + treebank.infused_test_sentences)
    if vocab_pickle.exists():
        vocab = MorphVocab.load(vocab_pickle)
    else:
        vocab = MorphVocab(all_sentences)
        vocab.save(vocab_pickle)

    # Report every vocabulary table size (label matches the attribute name).
    for table in ('tokens', 'forms', 'lemmas', 'tags', 'feats'):
        print(f"Vocab {table}: {len(getattr(vocab, table))}")

    # Build all dataset partitions first, then print their sizes, so any
    # output produced while building is not interleaved with the size lines.
    partitions = [('train-inf', 'Train infused'),
                  ('dev-inf', 'Dev infused'),
                  ('test-inf', 'Test infused'),
                  ('dev-uninf', 'Dev uninfused'),
                  ('test-uninf', 'Test uninfused')]
    datasets = {part: get_morph_dataset_partition(part, root, vocab, treebank)
                for part, _ in partitions}
    for part, label in partitions:
        print(f"{label} dataset: {len(datasets[part])}")
Пример #2
0
def lattice(tokens: list, lex: Lexicon) -> morph.Lattice:
    """Build a morphological lattice for *tokens*.

    Maps each 1-based token position to the lexicon analyses returned by
    ``lex.entry`` for that token.
    """
    entries = [lex.entry(token) for token in tokens]
    result = morph.Lattice()
    for position, entry in enumerate(entries, start=1):
        result[position] = entry.analyses
    return result
Пример #3
0
 def _create_sentence(self, lexicon: lex.Lexicon, tokens: list, md_lattice: conllx.LatticeGraph) -> nlp.Sentence:
     """Assemble an ``nlp.Sentence`` from lexicon analyses and the gold MD lattice.

     Builds two lattices keyed by 1-based token position: one holding every
     lexicon analysis per token, and one holding the single gold analysis
     extracted from *md_lattice*.

     Raises ValueError when a token does not have exactly one gold analysis.
     """
     analysis_lattice = morph.Lattice()
     for position, token in enumerate(tokens, start=1):
         analysis_lattice[position] = lexicon.entry(token).analyses
     gold = morph.Lattice()
     for position, token in enumerate(tokens, start=1):
         # NOTE(review): reaches into md_lattice's private _token_paths —
         # presumably the per-token path index; a public accessor would be safer.
         analyses = [self._create_analysis(path)
                     for path in md_lattice._token_paths[position]]
         if len(analyses) != 1:
             raise ValueError("token gold analysis: {}".format(analyses))
         gold[position] = analyses
     return nlp.Sentence(tokens, analysis_lattice, gold)
Пример #4
0
    'dev-hebtb-gold.lattices':
    f'{home_path}/data/clean/spmrl/hebtb/dev-hebtb-{src_lattice_idx}-gold.lattices',
    'test-hebtb.tokens':
    f'{home_path}/data/clean/spmrl/hebtb/test-hebtb-{src_tokens_idx}-tokens.txt',
    'test-hebtb-gold.lattices':
    f'{home_path}/data/clean/spmrl/hebtb/test-hebtb-{src_lattice_idx}-gold.lattices'
}
# Lexicon source files: the raw prefix lexicon plus the cleaned main lexicon.
lex_files = {
    'pref-lex': 'data/raw/spmrl/bgulex/bgupreflex_withdef.utf8.hr',
    'lex': 'data/clean/spmrl/bgulex/bgulex-03.hr'
}
# Pre-built pickled artifacts (home_path is defined earlier in the file).
bgulex_file_path = Path(f'{home_path}/data/processed/spmrl/bgulex.pickle')
hebtb_file_path = Path(f'{home_path}/data/processed/spmrl/hebtb.pickle')
vocab_file_path = Path(
    f'{home_path}/data/processed/spmrl/hebtb-token-vocab/vocab.pickle')
# Unconditional loads: unlike a load-or-build pattern there is no fallback
# here, so these presumably fail if the pickles have not been created yet.
bgulex = Lexicon.load(bgulex_file_path)
hebtb = Treebank.load(hebtb_file_path)
# Named views over the treebank splits: infused train/dev/test plus
# uninfused dev/test variants.
hebtb_partition = {
    'train-inf': hebtb.infused_train_sentences,
    'dev-inf': hebtb.infused_dev_sentences,
    'test-inf': hebtb.infused_test_sentences,
    'dev-uninf': hebtb.uninfused_dev_sentences,
    'test-uninf': hebtb.uninfused_test_sentences
}
tb_vocab = TokenVocab.load(vocab_file_path)

# Data: token-level dataset partitions built from the vocab and treebank.
train_set = get_token_dataset_partition('train-inf', home_path, tb_vocab,
                                        hebtb)
dev_inf_set = get_token_dataset_partition('dev-inf', home_path, tb_vocab,
                                          hebtb)