def process_batch(cur, geo=False, fsw=False, stem=False):
    """Run text_process over every document yielded by a MongoDB cursor.

    Parameters:
        cur: cursor of tweet documents; each is assumed to carry a
            'created' field -- TODO confirm schema against the caller.
        geo: forwarded to text_process as geo= (geo extraction flag).
        fsw: forwarded to text_process as filter_sw= (stop-word filtering).
        stem: forwarded to text_process as stem= (stemming flag).

    Returns:
        (results, first_created, last_created): results is a list of dicts
        with 'words' (tokenized text), 'created_at' and 'geo' keys;
        first_created / last_created are the 'created' timestamps of the
        first and last documents seen.  Both are None when the cursor is
        empty (the original raised NameError in that case).
    """
    started = datetime.now()
    total = cur.count()  # upper bound, used only for the progress display
    results = []
    # Fix: define the boundary timestamps up front so an empty cursor
    # no longer triggers NameError at the return statement.
    first_created = None
    last_created = None
    for idx, doc in enumerate(cur, start=1):
        if first_created is None:
            first_created = doc['created']
        processed = text_process(doc, geo=geo, filter_sw=fsw, stem=stem)
        results.append({
            'words': processed[0].split(),
            'created_at': doc['created'],
            'geo': processed[1],
        })
        progress(idx, total, skip=100)
        last_created = doc['created']
    # Single-argument print form emits identical text under Python 2 and 3.
    print('\nretrieval and processing took %s' % (datetime.now() - started))
    return results, first_created, last_created
def process_batch(cur, geo=False, fsw=False, stem=False):
    """Process every document from a MongoDB cursor with text_process.

    NOTE(review): this is a verbatim duplicate of an earlier
    process_batch definition in this file; the later definition wins at
    import time.  Consider deleting one of the two.

    Parameters:
        cur: cursor of tweet documents, each expected to have a
            'created' field -- TODO confirm against callers.
        geo, fsw, stem: flags forwarded to text_process (geo=, filter_sw=,
            stem= respectively).

    Returns:
        (batch, start_ts, end_ts): batch is a list of dicts with 'words',
        'created_at' and 'geo' keys; start_ts / end_ts are the 'created'
        values of the first and last documents, or None when the cursor is
        empty (the original raised NameError on an empty cursor).
    """
    t0 = datetime.now()
    n_docs = cur.count()  # denominator for the progress readout only
    batch = []
    # Fix: initialize both timestamps so an empty cursor returns
    # (batch, None, None) instead of raising NameError.
    start_ts = None
    end_ts = None
    seen = 0
    for doc in cur:
        if seen == 0:
            start_ts = doc['created']
        processed = text_process(doc, geo=geo, filter_sw=fsw, stem=stem)
        batch.append({
            'words': processed[0].split(),
            'created_at': doc['created'],
            'geo': processed[1],
        })
        seen += 1
        progress(seen, n_docs, skip=100)
        end_ts = doc['created']
    # %-formatted single-argument print prints identically on Python 2 and 3.
    print('\nretrieval and processing took %s' % (datetime.now() - t0))
    return batch, start_ts, end_ts
# Build a plain-text corpus from every tweet collection except the three
# city-specific ones (SPB, EKB, Moscow): one processed tweet per line,
# appended to assets/tw_ht_corpus_2.txt.
p = MDB('tweets')
cols = p.client['tweets'].collection_names()
for excluded in ('SPB', 'EKB', 'Moscow'):
    cols.remove(excluded)  # raises ValueError if a collection is missing, as before
print(cols)

# First pass: count documents so the progress bar has a denominator.
total = sum(p.client['tweets'][c].find().count() for c in cols)
print('total: %d documents' % total)

i = 0
# Fix: the output file was opened with a bare open() and never closed;
# the with-block guarantees it is flushed and closed even on error.
with open('assets/tw_ht_corpus_2.txt', 'a') as f:
    for c in cols:
        for t in p.client['tweets'][c].find():
            try:
                dt = text_process(t)[0]
                progress(i, total)
                if dt:
                    f.write(dt + '\n')
            except Exception as e:
                # Deliberate best-effort: report the bad document and keep going.
                print(e)
            finally:
                i += 1