def as_treetagger_corpus(orig_path, dest_path, encoding='latin-1', language='english'):
    '''Convert the corpus at orig_path into a TreeTagger-tagged corpus at dest_path.

    encoding - character encoding passed on to the TreeTagger wrapper.
    language - language model passed on to the TreeTagger wrapper.
    '''
    # BUG FIX: the body referred to an undefined name `eng_path` (copy-paste
    # from as_eng_postagged_corpus); the parameter is `dest_path`.
    assert (orig_path != dest_path)
    orig = PyCorpus(orig_path)
    dest = PyCorpus(dest_path)
    dest.autocommit(False)  # batch all writes, single commit below
    for doc_id in orig.keys():
        dest[doc_id] = as_treetagger_doc(orig[doc_id],
                                         encoding=encoding,
                                         language=language)
    dest.commit()
    orig.close()
    dest.close()
def as_eng_postagged_corpus(orig_path, eng_path):
    '''POS-tag every document of the corpus at orig_path into a new
    corpus at eng_path, using the nltk default tagger.'''
    assert (orig_path != eng_path)
    source = PyCorpus(orig_path)
    tagged = PyCorpus(eng_path)
    # Disable per-write commits; everything is committed in one go.
    tagged.autocommit(False)
    for key in source.keys():
        tagged[key] = as_eng_postagged_doc(source[key])
    tagged.commit()
    source.close()
    tagged.close()
def crf_model_predict_mc(model_path, corpus, target_path, series_name, n):
    '''Multi core version of crf_model_predict.

    model_path  - path of the CRF model to predict with.
    corpus      - source corpus whose documents are tagged.
    target_path - path of the corpus the predictions are written to.
    series_name - name of the annotation series produced by crf_process.
    n           - number of processes to use.
    '''
    sys.stderr.write('Dividing documents between {0} processes.\n'.format(n))
    # Deal the documents round-robin into n buckets (modulo replaces the
    # original hand-rolled wrap-around counter; unused doc_ids list removed).
    id_lists = [[] for _ in range(n)]
    for pos, doc_id in enumerate(corpus.keys()):
        id_lists[pos % n].append(doc_id)
    sys.stderr.write('Launching processes.\n')
    dest_names = []
    processes = []
    for idx, ids in enumerate(id_lists):
        if len(ids) > 0:
            folder = tempfile.mkdtemp()
            src_name = os.path.join(folder, 'src.corpus')
            dest_name = os.path.join(folder, 'dest.corpus')
            # Write this bucket's documents into a temporary source corpus.
            tmp_corp = Corpus(src_name)
            tmp_corp.autocommit(False)
            for doc_id in ids:
                tmp_corp[doc_id] = corpus[doc_id]
            # BUG FIX: autocommit is off, so close() without commit() could
            # drop the data (every other function in this file commits
            # before closing).
            tmp_corp.commit()
            tmp_corp.close()
            # Start the worker process on the bucket.
            process = Process(target=crf_process,
                              args=(model_path, src_name, dest_name, series_name))
            process.start()
            sys.stderr.write('Process {0} launched\n'.format(idx))
            # Store the identificators.
            dest_names.append(dest_name)
            processes.append(process)
    for p in processes:
        p.join()
    sys.stderr.write('Processes finished!\n')
    # Concatenate temporary outputs into the target corpus.
    target_corp = Corpus(target_path)
    target_corp.autocommit(False)
    for dest_name in dest_names:
        tmp_corp = Corpus(dest_name)
        for doc_id in tmp_corp:
            target_corp[doc_id] = tmp_corp[doc_id]
        tmp_corp.close()
    # BUG FIX: same missing commit as above.
    target_corp.commit()
    target_corp.close()
    # BUG FIX: message was missing its trailing newline, unlike all the
    # other progress messages here.
    sys.stderr.write('Corpus {0} created\n'.format(target_path))
def as_t3corpus(orig_path, t3_path):
    '''Convert a corpus at orig_path to t3mesta corpus to t3_path.'''
    src = PyCorpus(orig_path)
    dst = PyCorpus(t3_path)
    dst.autocommit(False)
    # Snapshot existing keys so already-converted documents are skipped,
    # making the conversion resumable.
    already_done = set(dst.keys())
    for doc_id in src.keys():
        if doc_id in already_done:
            continue
        dst[doc_id] = as_t3doc(src[doc_id])
    dst.commit()
    src.close()
    dst.close()
def parse_plain_corpus(plainpath, corpuspath):
    '''Parse the plain-text file at plainpath into a corpus at corpuspath.

    Documents in the plain file are separated by blank lines; the first
    line of each document is its title, the remaining lines its contents.
    '''
    corpus = PyCorpus(corpuspath)
    # BUG FIX: the file handle was never closed.
    stream = codecs.open(plainpath, 'rb', 'utf-8')
    try:
        data = stream.read()
    finally:
        stream.close()
    # BUG FIX: the separator pattern was 's*?\r?\n\r?\n' -- the missing
    # backslash made it match a literal 's' before the blank line instead
    # of optional whitespace.
    docs = re.split(r'\s*?\r?\n\r?\n', data)
    data = None  # release the full text before parsing the pieces
    corpus.autocommit(False)
    for doc in docs:
        doc = doc.strip()
        if not doc:
            # Skip empty chunks (e.g. leading/trailing blank lines), which
            # would otherwise create a bogus document with an empty title.
            continue
        lines = re.split(r'\r?\n', doc)
        title = lines[0].strip()
        contents = '\n'.join(lines[1:]).strip()
        # The downstream parser expects a UTF-8 text stream.
        text_stream = cStringIO.StringIO(contents.encode('utf-8'))
        utf8_stream = codecs.getreader('utf-8')(text_stream)
        corpus[title] = parse_plain_doc_from_stream(utf8_stream)
    corpus.commit()
    corpus.close()