def process_batch(cur, geo=False, fsw=False, stem=False):
    """Run text_process over every document yielded by a MongoDB cursor.

    Parameters:
        cur: cursor of tweet documents; each is assumed to carry a
            'created' field -- TODO confirm schema against the caller.
        geo: forwarded to text_process as geo= (geo extraction flag).
        fsw: forwarded to text_process as filter_sw= (stop-word filtering).
        stem: forwarded to text_process as stem= (stemming flag).

    Returns:
        (results, first_created, last_created): results is a list of dicts
        with 'words' (tokenized text), 'created_at' and 'geo' keys;
        first_created / last_created are the 'created' timestamps of the
        first and last documents seen.  Both are None when the cursor is
        empty (the original raised NameError in that case).
    """
    started = datetime.now()
    total = cur.count()  # upper bound, used only for the progress display
    results = []
    # Fix: define the boundary timestamps up front so an empty cursor
    # no longer triggers NameError at the return statement.
    first_created = None
    last_created = None
    for idx, doc in enumerate(cur, start=1):
        if first_created is None:
            first_created = doc['created']
        processed = text_process(doc, geo=geo, filter_sw=fsw, stem=stem)
        results.append({
            'words': processed[0].split(),
            'created_at': doc['created'],
            'geo': processed[1],
        })
        progress(idx, total, skip=100)
        last_created = doc['created']
    # Single-argument print form emits identical text under Python 2 and 3.
    print('\nretrieval and processing took %s' % (datetime.now() - started))
    return results, first_created, last_created
def process_batch(cur, geo=False, fsw=False, stem=False):
    """Process every document from a MongoDB cursor with text_process.

    NOTE(review): this is a verbatim duplicate of an earlier
    process_batch definition in this file; the later definition wins at
    import time.  Consider deleting one of the two.

    Parameters:
        cur: cursor of tweet documents, each expected to have a
            'created' field -- TODO confirm against callers.
        geo, fsw, stem: flags forwarded to text_process (geo=, filter_sw=,
            stem= respectively).

    Returns:
        (batch, start_ts, end_ts): batch is a list of dicts with 'words',
        'created_at' and 'geo' keys; start_ts / end_ts are the 'created'
        values of the first and last documents, or None when the cursor is
        empty (the original raised NameError on an empty cursor).
    """
    t0 = datetime.now()
    n_docs = cur.count()  # denominator for the progress readout only
    batch = []
    # Fix: initialize both timestamps so an empty cursor returns
    # (batch, None, None) instead of raising NameError.
    start_ts = None
    end_ts = None
    seen = 0
    for doc in cur:
        if seen == 0:
            start_ts = doc['created']
        processed = text_process(doc, geo=geo, filter_sw=fsw, stem=stem)
        batch.append({
            'words': processed[0].split(),
            'created_at': doc['created'],
            'geo': processed[1],
        })
        seen += 1
        progress(seen, n_docs, skip=100)
        end_ts = doc['created']
    # %-formatted single-argument print prints identically on Python 2 and 3.
    print('\nretrieval and processing took %s' % (datetime.now() - t0))
    return batch, start_ts, end_ts
# Build a plain-text corpus from every tweet collection except the three
# city-specific ones (SPB, EKB, Moscow): one processed tweet per line,
# appended to assets/tw_ht_corpus_2.txt.
p = MDB('tweets')
cols = p.client['tweets'].collection_names()
for excluded in ('SPB', 'EKB', 'Moscow'):
    cols.remove(excluded)  # raises ValueError if a collection is missing, as before
print(cols)

# First pass: count documents so the progress bar has a denominator.
total = sum(p.client['tweets'][c].find().count() for c in cols)
print('total: %d documents' % total)

i = 0
# Fix: the output file was opened with a bare open() and never closed;
# the with-block guarantees it is flushed and closed even on error.
with open('assets/tw_ht_corpus_2.txt', 'a') as f:
    for c in cols:
        for t in p.client['tweets'][c].find():
            try:
                dt = text_process(t)[0]
                progress(i, total)
                if dt:
                    f.write(dt + '\n')
            except Exception as e:
                # Deliberate best-effort: report the bad document and keep going.
                print(e)
            finally:
                i += 1