예제 #1
0
파일: corpus.py 프로젝트: estnltk/pfe
def as_t3corpus(orig_path, t3_path):
    '''Fill the corpus at t3_path with as_t3doc-converted documents from orig_path.

    Documents whose key is already present in the destination corpus when
    the function starts are left untouched; only the missing ones are
    converted and stored.
    '''
    source = PyCorpus(orig_path)
    target = PyCorpus(t3_path)
    # Writes are committed once, after the whole loop has finished.
    target.autocommit(False)

    # Snapshot of the keys the destination held before any conversion.
    already_converted = set(target.keys())
    for doc_key in source.keys():
        if doc_key in already_converted:
            continue
        converted = as_t3doc(source[doc_key])
        target[doc_key] = converted

    target.commit()

    source.close()
    target.close()
예제 #2
0
파일: corpus.py 프로젝트: estnltk/pfe
def as_treetagger_corpus(orig_path, dest_path, encoding='latin-1', language='english'):
    '''Convert every document of the corpus at orig_path and store it at dest_path.

    Each document is passed through as_treetagger_doc together with the
    given encoding and language, and the result is written to the
    destination corpus under the same document id. Ids that already exist
    in the destination are assigned unconditionally, i.e. overwritten.

    Parameters:
        orig_path: path of the corpus to read documents from.
        dest_path: path of the corpus to write converted documents to;
            must differ from orig_path.
        encoding: forwarded to as_treetagger_doc as its encoding keyword.
        language: forwarded to as_treetagger_doc as its language keyword.

    Raises:
        AssertionError: if orig_path equals dest_path (only while
            assertions are enabled).
    '''
    # The destination is dest_path; the source and destination must differ.
    assert (orig_path != dest_path)
    orig = PyCorpus(orig_path)
    dest = PyCorpus(dest_path)
    # Writes are committed once, after all documents have been converted.
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_treetagger_doc(orig[doc_id], encoding=encoding, language=language)
    dest.commit()
    orig.close()
    dest.close()
예제 #3
0
파일: corpus.py 프로젝트: estnltk/pfe
def as_eng_postagged_corpus(orig_path, eng_path):
    '''Store an as_eng_postagged_doc version of each orig_path document at eng_path.

    Uses nltk default tagger. The two paths must be different; every
    document id of the source corpus is written to the destination corpus,
    replacing any entry stored under the same id.
    '''
    assert orig_path != eng_path
    source = PyCorpus(orig_path)
    target = PyCorpus(eng_path)
    # Writes are committed once, after the loop has finished.
    target.autocommit(False)
    for doc_id in source.keys():
        tagged = as_eng_postagged_doc(source[doc_id])
        target[doc_id] = tagged
    target.commit()
    source.close()
    target.close()