def corpus_and_sequences():
    """Create a POS-tag corpus and read its train, dev and test splits.

    Each split is read through the same ``PostagCorpus`` instance with
    ``max_sent_len=10`` and ``max_nr_sent=1000``. File locations are
    resolved with ``data.find`` (helper defined outside this block).

    Returns:
        A 4-tuple ``(corpus, train_seq, dev_seq, test_seq)``.
    """
    corpus = pcc.PostagCorpus()
    read_limits = {'max_sent_len': 10, 'max_nr_sent': 1000}
    split_files = ('train-02-21.conll', 'dev-22.conll', 'test-23.conll')

    # The files are read strictly in train, dev, test order, all through the
    # one shared corpus object.
    train_seq, dev_seq, test_seq = [
        corpus.read_sequence_list_conll(data.find(filename), **read_limits)
        for filename in split_files
    ]
    return corpus, train_seq, dev_seq, test_seq
示例#2
0
import numpy as np
import lxmls.readers.pos_corpus as pcc
import os
import pickle

# Load the training split from the data directory two levels above this
# script, reading it with max_sent_len=15 and max_nr_sent=1000.
corpus = pcc.PostagCorpus()
data_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'data')
input_data = os.path.join(data_dir, 'train-02-21.conll')
train_seq = corpus.read_sequence_list_conll(
    input_data, max_sent_len=15, max_nr_sent=1000)

# Persist the word and tag dictionaries together as a single pickled tuple.
# The file is opened in binary mode because pickle.dump writes bytes (a
# text-mode handle raises TypeError on Python 3), and the `with` block
# guarantees the handle is flushed and closed once the dump is done.
with open('word_tag_dict.pkl', 'wb') as dict_file:
    pickle.dump((corpus.word_dict, corpus.tag_dict), dict_file)

# Write the training sequences to 'encoded.txt', one sequence per line.
# Every position becomes a "<word>_<tag>" token; tokens are space-separated.
with open('encoded.txt', 'w') as output:
    word_name = corpus.word_dict.get_label_name
    tag_name = corpus.tag_dict.get_label_name
    for seq in train_seq:
        tokens = []
        for pos in range(len(seq)):
            # seq.x holds the word ids and seq.y the tag ids; both are
            # turned back into their label names before joining.
            tokens.append('_'.join((word_name(seq.x[pos]),
                                    tag_name(seq.y[pos]))))
        output.write(' '.join(tokens) + '\n')