import codecs
import cPickle
import cStringIO
import math
import os
import re
import sys
import tempfile
from multiprocessing import Process

import xlrd.biffh
from pandas import DataFrame, ExcelFile, ExcelWriter, Series

# PyCorpus, Corpus, crf_predict and the document-level helpers
# (as_treetagger_doc, as_eng_postagged_doc, as_t3doc,
# parse_t3_doc_from_string, parse_plain_doc_from_stream) are assumed to be
# provided by this package's own modules.


def excel_to_corpus(excel_path, corpus_path):
    '''NB! Make sure to use the .xls file extension for Excel files.'''
    corpus = PyCorpus(corpus_path)
    excel = ExcelFile(excel_path)
    # As we do not know the number of sheets, we parse them one by one
    # until we get an error.
    idx = 0
    while True:
        try:
            df = excel.parse(str(idx))
            # Recreate information that was modified when exporting to xls:
            # NaN cells become None again, and 0/1 cells become booleans.
            new_df = dict()
            for col in df.columns:
                data = []
                for v in df[col]:
                    if type(v) == float and math.isnan(v):
                        data.append(None)
                    elif v == 0:
                        data.append(False)
                    elif v == 1:
                        data.append(True)
                    else:
                        data.append(v)
                new_df[col] = Series(data)
            corpus[str(idx)] = DataFrame(new_df)
        except xlrd.biffh.XLRDError:
            # no more sheets to parse
            break
        idx += 1
    corpus.close()
def corpus_to_excel(corpus_path, excel_path):
    '''NB! Make sure to use the .xls file extension for Excel files.'''
    corpus = PyCorpus(corpus_path)
    writer = ExcelWriter(excel_path)
    # write every document as a separate sheet named after its key
    for key in corpus:
        corpus[key].to_excel(writer, sheet_name=key)
    writer.save()
    corpus.close()
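# A minimal usage sketch of the Excel round-trip above; the file names are
# hypothetical. Both directions expect the .xls extension:
#
#   corpus_to_excel('documents.corpus', 'documents.xls')
#   excel_to_corpus('documents.xls', 'restored.corpus')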
def as_treetagger_corpus(orig_path, dest_path, encoding='latin-1', language='english'):
    assert orig_path != dest_path
    orig = PyCorpus(orig_path)
    dest = PyCorpus(dest_path)
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_treetagger_doc(orig[doc_id], encoding=encoding,
                                         language=language)
    dest.commit()
    orig.close()
    dest.close()
def boi_to_t3corpus(orig_path, t3_path):
    '''Parse a t3 corpus, where documents are separated with --.'''
    f = codecs.open(orig_path, 'rb', 'utf-8')
    contents = f.read()
    f.close()
    docs = re.split(r'--\r?\n\r?\n', contents)
    corpus = PyCorpus(t3_path)
    for i, doc in enumerate(docs):
        corpus[str(i + 1)] = parse_t3_doc_from_string(doc)
    corpus.close()
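# Sketch of the input format that boi_to_t3corpus expects (hypothetical
# contents; the token format itself is handled by parse_t3_doc_from_string).
# Each document ends with a line containing only -- followed by a blank line:
#
#   <token lines of document 1>
#   --
#
#   <token lines of document 2>
#   --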
def as_eng_postagged_corpus(orig_path, eng_path):
    '''Uses the default nltk tagger.'''
    assert orig_path != eng_path
    orig = PyCorpus(orig_path)
    dest = PyCorpus(eng_path)
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_eng_postagged_doc(orig[doc_id])
    dest.commit()
    orig.close()
    dest.close()
def crf_model_predict(model_path, corpus, target_path, series_name):
    f = open(model_path, 'rb')
    model, kwargs = cPickle.load(f)
    f.close()
    s = Corpus(target_path)
    for doc_id, predictions in crf_predict(model, corpus, **kwargs):
        doc = corpus[doc_id]
        doc[series_name] = predictions
        s[doc_id] = doc
        sys.stderr.write('Document {0} classified.\n'.format(doc_id))
    s.close()
def as_t3corpus(orig_path, t3_path):
    '''Convert the corpus at orig_path to a t3mesta corpus at t3_path.'''
    orig_corpus = PyCorpus(orig_path)
    dest_corpus = PyCorpus(t3_path)
    dest_corpus.autocommit(False)
    # skip documents that have already been converted
    dest_keys = set(dest_corpus.keys())
    for key in orig_corpus.keys():
        if key not in dest_keys:
            dest_corpus[key] = as_t3doc(orig_corpus[key])
    dest_corpus.commit()
    orig_corpus.close()
    dest_corpus.close()
def parse_plain_corpus(plainpath, corpuspath):
    corpus = PyCorpus(corpuspath)
    data = codecs.open(plainpath, 'rb', 'utf-8').read()
    # documents are separated by blank lines
    docs = re.split(r'\s*?\r?\n\r?\n', data)
    data = None
    corpus.autocommit(False)
    for doc in docs:
        lines = re.split(r'\r?\n', doc.strip())
        # the first line of each document is its title, the rest is content
        title = lines[0].strip()
        contents = '\n'.join(lines[1:]).strip()
        text_stream = cStringIO.StringIO(contents.encode('utf-8'))
        utf8_stream = codecs.getreader('utf-8')(text_stream)
        corpus[title] = parse_plain_doc_from_stream(utf8_stream)
    corpus.commit()
    corpus.close()
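# Sketch of the plain input format that parse_plain_corpus expects
# (hypothetical contents). Documents are separated by blank lines, and the
# first line of each document becomes its key in the corpus:
#
#   First document title
#   Body of the first document.
#
#   Second document title
#   Body of the second document.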
def crf_model_predict_mc(model_path, corpus, target_path, series_name, n):
    '''Multi-core version of crf_model_predict.

    n - number of processes to use.
    '''
    sys.stderr.write('Dividing documents between {0} processes.\n'.format(n))
    # deal the document ids round-robin into n lists
    doc_ids = list(corpus.keys())
    id_lists = [[] for _ in range(n)]
    idx = 0
    for doc_id in doc_ids:
        id_lists[idx].append(doc_id)
        idx += 1
        if idx >= n:
            idx = 0
    sys.stderr.write('Launching processes.\n')
    dest_names = []
    processes = []
    for idx, ids in enumerate(id_lists):
        if len(ids) > 0:
            folder = tempfile.mkdtemp()
            # write the slice of the corpus that this process will work on
            src_name = os.path.join(folder, 'src.corpus')
            dest_name = os.path.join(folder, 'dest.corpus')
            tmp_corp = Corpus(src_name)
            tmp_corp.autocommit(False)
            for doc_id in ids:
                tmp_corp[doc_id] = corpus[doc_id]
            tmp_corp.commit()
            tmp_corp.close()
            # start the worker process
            process = Process(target=crf_process,
                              args=(model_path, src_name, dest_name, series_name))
            process.start()
            sys.stderr.write('Process {0} launched.\n'.format(idx))
            # remember the output path and the process handle
            dest_names.append(dest_name)
            processes.append(process)
    for p in processes:
        p.join()
    sys.stderr.write('Processes finished!\n')
    # concatenate the temporary outputs into the target corpus
    target_corp = Corpus(target_path)
    target_corp.autocommit(False)
    for dest_name in dest_names:
        tmp_corp = Corpus(dest_name)
        for doc_id in tmp_corp:
            target_corp[doc_id] = tmp_corp[doc_id]
        tmp_corp.close()
    target_corp.commit()
    target_corp.close()
    sys.stderr.write('Corpus {0} created.\n'.format(target_path))
def crf_process(model_path, tmp_corpus_path, tmp_target_path, series_name):
    '''Worker entry point used by crf_model_predict_mc.'''
    tmp_corp = Corpus(tmp_corpus_path)
    crf_model_predict(model_path, tmp_corp, tmp_target_path, series_name)
    tmp_corp.close()
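# A minimal usage sketch for the multi-core predictor, assuming model.crf
# holds a cPickled (model, kwargs) pair; all names here are hypothetical:
#
#   corpus = Corpus('documents.corpus')
#   crf_model_predict_mc('model.crf', corpus, 'labelled.corpus',
#                        series_name='label', n=4)
#   corpus.close()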