def as_t3corpus(orig_path, t3_path):
    '''Fill the t3mesta corpus at t3_path from the corpus at orig_path.

    Every document of the source corpus is passed through as_t3doc and
    stored in the destination under the same key.  Keys that are already
    present in the destination when the function starts are left
    untouched, so only missing documents are converted.

    Autocommit is switched off on the destination; the new entries are
    committed once after the loop, then both corpora are closed.
    '''
    source = PyCorpus(orig_path)
    target = PyCorpus(t3_path)
    target.autocommit(False)

    # Snapshot of the destination keys taken before anything is written.
    already_present = set(target.keys())

    for doc_id in source.keys():
        if doc_id in already_present:
            continue
        target[doc_id] = as_t3doc(source[doc_id])

    target.commit()
    source.close()
    target.close()
def as_treetagger_corpus(orig_path, dest_path, encoding='latin-1',
                         language='english'):
    '''Write a TreeTagger-processed copy of the corpus at orig_path to dest_path.

    Each document of the source corpus is converted with as_treetagger_doc
    and stored in the destination corpus under the same document id.

    Parameters:
        orig_path: location of the source corpus.
        dest_path: location of the destination corpus; must differ from
            orig_path.
        encoding: forwarded unchanged to as_treetagger_doc.
        language: forwarded unchanged to as_treetagger_doc.

    Raises:
        AssertionError: if orig_path and dest_path compare equal.

    Autocommit is switched off on the destination; all documents are
    committed once after the loop, then both corpora are closed.
    '''
    # Source and destination must be different paths.  Both this check and
    # the PyCorpus call below use the dest_path parameter (the previous code
    # referred to a name that does not exist in this function).
    assert orig_path != dest_path
    orig = PyCorpus(orig_path)
    dest = PyCorpus(dest_path)
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_treetagger_doc(orig[doc_id], encoding=encoding,
                                         language=language)
    dest.commit()
    orig.close()
    dest.close()
def as_eng_postagged_corpus(orig_path, eng_path):
    '''Write a POS-tagged copy of the corpus at orig_path to eng_path.

    Uses nltk default tagger (through as_eng_postagged_doc).  Each source
    document is converted and stored in the destination corpus under the
    same document id.

    The two paths must differ; an AssertionError is raised otherwise.
    Autocommit is switched off on the destination; all documents are
    committed once after the loop, then both corpora are closed.
    '''
    assert orig_path != eng_path

    source = PyCorpus(orig_path)
    tagged = PyCorpus(eng_path)
    tagged.autocommit(False)

    for key in source.keys():
        document = source[key]
        tagged[key] = as_eng_postagged_doc(document)

    tagged.commit()
    source.close()
    tagged.close()