Пример #1
0
def excel_to_corpus(excel_path, corpus_path):
    '''Import an Excel workbook into a PyCorpus.

    NB! Make sure to use .xls file extension for Excel files.

    Parameters
    ----------
    excel_path : str
        Path of the .xls workbook to read.
    corpus_path : str
        Path of the corpus to create/update.

    Sheets are expected to be named "0", "1", ... ; each sheet becomes one
    corpus document keyed by its index.
    '''
    corpus = PyCorpus(corpus_path)
    excel  = ExcelFile(excel_path)
    try:
        # As we do not know the number of sheets, parse consecutively
        # numbered sheets until xlrd reports a missing one.
        idx = 0
        while True:
            try:
                df = excel.parse(str(idx))
            except xlrd.biffh.XLRDError:
                break
            # Recreate information that was modified when exporting to xls:
            # NaN cells were originally None, 0/1 cells were booleans.
            new_df = dict()
            for col in df.columns:
                data = []
                for v in df[col]:
                    if isinstance(v, float) and math.isnan(v):
                        data.append(None)
                    elif v == 0:
                        data.append(False)
                    elif v == 1:
                        data.append(True)
                    else:
                        data.append(v)
                new_df[col] = Series(data)
            corpus[str(idx)] = DataFrame(new_df)
            idx += 1
    finally:
        # Close the corpus even if parsing fails (original leaked it).
        corpus.close()
Пример #2
0
def corpus_to_excel(corpus_path, excel_path):
    '''Export every document of a corpus into one Excel workbook,
    one sheet per document (the corpus key becomes the sheet name).

    NB! Make sure to use .xls file extension for Excel files.
    '''
    source = PyCorpus(corpus_path)
    sink = ExcelWriter(excel_path)
    for name in source:
        frame = source[name]
        frame.to_excel(sink, sheet_name=name)
    sink.save()
    source.close()
Пример #3
0
def as_treetagger_corpus(orig_path, dest_path, encoding='latin-1', language='english'):
    '''Annotate every document of the corpus at ``orig_path`` with
    TreeTagger and store the results in a new corpus at ``dest_path``.

    Parameters
    ----------
    orig_path : str
        Path of the source corpus; must differ from ``dest_path``.
    dest_path : str
        Path of the corpus that will receive the tagged documents.
    encoding, language : str
        Passed through to ``as_treetagger_doc``.
    '''
    # Bug fix: the original referenced an undefined name ``eng_path``
    # (copy-paste from as_eng_postagged_corpus) and raised NameError.
    assert (orig_path != dest_path)
    orig = PyCorpus(orig_path)
    dest = PyCorpus(dest_path)
    # Batch all writes into a single commit for speed.
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_treetagger_doc(orig[doc_id], encoding=encoding, language=language)
    dest.commit()
    orig.close()
    dest.close()
Пример #4
0
def boi_to_t3corpus(orig_path, t3_path):
    '''Parse a t3 corpus, where documents are separated with -- '''
    # ``with`` guarantees the handle is closed even if reading fails
    # (the original leaked it on error).
    with codecs.open(orig_path, 'rb', 'utf-8') as f:
        contents = f.read()
    # Documents are separated by a "--" line followed by a blank line;
    # tolerate both \n and \r\n line endings.
    docs = re.split('--\r?\n\r?\n', contents)
    corpus = PyCorpus(t3_path)
    for i, doc in enumerate(docs):
        # Corpus keys are 1-based document indices as strings.
        corpus[str(i+1)] = parse_t3_doc_from_string(doc)
    corpus.close()
Пример #5
0
def as_eng_postagged_corpus(orig_path, eng_path):
    '''Create a POS-tagged copy of the corpus, using nltk's default tagger.'''
    assert (orig_path != eng_path)
    source = PyCorpus(orig_path)
    target = PyCorpus(eng_path)
    # Defer commits so all documents are written in one transaction.
    target.autocommit(False)
    for key in source.keys():
        tagged = as_eng_postagged_doc(source[key])
        target[key] = tagged
    target.commit()
    source.close()
    target.close()
Пример #6
0
Файл: ner.py Проект: estnltk/pfe
def crf_model_predict(model_path, corpus, target_path, series_name):
    '''Run a pickled CRF model over ``corpus`` and store the annotated
    documents in a new corpus at ``target_path``.

    Parameters
    ----------
    model_path : str
        Path of a pickle containing a ``(model, kwargs)`` tuple, where
        ``kwargs`` are extra keyword arguments for ``crf_predict``.
    corpus
        Source corpus of documents to classify.
    target_path : str
        Path of the corpus that will receive annotated documents.
    series_name : str
        Name of the document series to store predictions under.
    '''
    # ``with`` closes the model file even if unpickling fails
    # (the original leaked the handle on error).
    with open(model_path, 'rb') as f:
        model, kwargs = cPickle.load(f)
    target = Corpus(target_path)
    for doc_id, predictions in crf_predict(model, corpus, **kwargs):
        doc = corpus[doc_id]
        doc[series_name] = predictions
        target[doc_id] = doc
        sys.stderr.write('Document {0} classified.\n'.format(doc_id))
    target.close()
Пример #7
0
def as_t3corpus(orig_path, t3_path):
    '''Convert the corpus at ``orig_path`` into a t3mesta corpus at
    ``t3_path``, skipping documents already present in the destination.'''
    src = PyCorpus(orig_path)
    dst = PyCorpus(t3_path)
    dst.autocommit(False)

    # Documents already converted in an earlier (partial) run are kept.
    already_done = set(dst.keys())
    for doc_id in src.keys():
        if doc_id in already_done:
            continue
        dst[doc_id] = as_t3doc(src[doc_id])

    dst.commit()

    src.close()
    dst.close()
Пример #8
0
def parse_plain_corpus(plainpath, corpuspath):
    '''Parse a plain-text corpus file into a PyCorpus.

    The file contains documents separated by blank lines; the first line
    of each document is its title (used as the corpus key), the rest is
    its contents.
    '''
    corpus = PyCorpus(corpuspath)
    # ``with`` closes the file even if reading fails (original leaked it).
    with codecs.open(plainpath, 'rb', 'utf-8') as f:
        data = f.read()
    # Bug fix: the original pattern 's*?\r?\n\r?\n' was missing a
    # backslash and stripped literal trailing "s" characters from
    # documents; \s*? absorbs trailing whitespace before the blank-line
    # separator instead.
    docs = re.split(r'\s*?\r?\n\r?\n', data)
    data = None  # release the full file contents early
    corpus.autocommit(False)
    for doc in docs:
        lines = re.split(r'\r?\n', doc.strip())
        title = lines[0].strip()
        contents = '\n'.join(lines[1:]).strip()
        # parse_plain_doc_from_stream expects a utf-8 text stream.
        text_stream = cStringIO.StringIO(contents.encode('utf-8'))
        utf8_stream = codecs.getreader('utf-8')(text_stream)
        corpus[title] = parse_plain_doc_from_stream(utf8_stream)
    corpus.commit()
    corpus.close()
Пример #9
0
Файл: ner.py Проект: estnltk/pfe
def crf_model_predict_mc(model_path, corpus, target_path, series_name, n):
    '''Multi core version of crf_model_predict.

    Splits the corpus into ``n`` roughly equal chunks, classifies each
    chunk in a separate process (see ``crf_process``), then merges the
    per-process results into a single corpus at ``target_path``.

    n - number of processes to use.
    '''
    sys.stderr.write('Dividing documents between {0} processes.\n'.format(n))
    # Distribute document ids round-robin over n buckets.
    # (The original also built an unused ``doc_ids`` list and kept the
    # bucket index by hand; enumerate + modulo does the same.)
    id_lists = [[] for _ in range(n)]
    for pos, doc_id in enumerate(corpus.keys()):
        id_lists[pos % n].append(doc_id)
    sys.stderr.write('Launching processes.\n')
    dest_names   = []
    processes    = []
    for idx, ids in enumerate(id_lists):
        if len(ids) > 0:
            # Each worker gets its own temp folder with a private
            # source/destination corpus pair.
            folder = tempfile.mkdtemp()
            src_name  = os.path.join(folder, 'src.corpus')
            dest_name = os.path.join(folder, 'dest.corpus')
            tmp_corp = Corpus(src_name)
            tmp_corp.autocommit(False)
            for doc_id in ids:
                tmp_corp[doc_id] = corpus[doc_id]
            tmp_corp.close()

            # start the worker process
            process = Process(target=crf_process,
                              args=(model_path,
                                    src_name,
                                    dest_name,
                                    series_name))
            process.start()
            sys.stderr.write('Process {0} launched\n'.format(idx))
            # store the identificators for the merge phase below
            dest_names.append(dest_name)
            processes.append(process)
    # Wait for all workers to finish before merging their outputs.
    for p in processes:
        p.join()
    sys.stderr.write('Processes finished!\n')

    # concatenate temporary outputs into the final corpus
    target_corp = Corpus(target_path)
    target_corp.autocommit(False)
    for dest_name in dest_names:
        tmp_corp = Corpus(dest_name)
        for doc_id in tmp_corp:
            target_corp[doc_id] = tmp_corp[doc_id]
        tmp_corp.close()
    target_corp.close()
    sys.stderr.write('Corpus {0} created'.format(target_path))
Пример #10
0
Файл: ner.py Проект: estnltk/pfe
def crf_process(model_path, tmp_corpus_path, tmp_target_path, series_name):
    '''Worker entry point: classify one temporary corpus chunk with the
    pickled CRF model and write the results to ``tmp_target_path``.'''
    chunk = Corpus(tmp_corpus_path)
    crf_model_predict(model_path, chunk, tmp_target_path, series_name)
    chunk.close()