Example #1
0
import logging
import json
from common import engine

log = logging.getLogger(__name__)
raw_table = engine.get_table('raw')
#status_table = engine.get_table('raw_dumped')

BATCH_SIZE = 10000


def dump_batches():
    """Dump one batch of raw tweets from the database to a JSON-lines file.

    Reads up to ``BATCH_SIZE`` rows from the ``raw`` table (ordered by id),
    writes their ``json`` payloads, one per line, to
    ``dumps/raw_<min_id>.json`` (named after the smallest id in the batch)
    and deletes the dumped rows, all within a single ``engine`` transaction.

    Returns:
        bool: True if a batch was dumped; False when fewer than
        ``BATCH_SIZE`` rows remain (in that case nothing is written
        or deleted).
    """
    if len(raw_table) < BATCH_SIZE:
        log.info("Not enough entries remaining.")
        return False
    data, min_id = [], None
    log.info("Fetching %s raw tweets...", BATCH_SIZE)
    engine.begin()
    for row in raw_table.find(_limit=BATCH_SIZE, order_by=['id']):
        if min_id is None:
            # First row of the id-ordered batch determines the file name.
            min_id = row['id']
        data.append(row['json'])
        raw_table.delete(id=row['id'])
    log.info("Saving file...")
    # Context manager guarantees the handle is closed even if the
    # write fails (the original left the file open on any exception).
    with open('dumps/raw_%s.json' % min_id, 'wb') as fh:
        fh.write('\n'.join(data).encode('utf-8'))
    engine.commit()
    return True
Example #2
0
import json
import logging
from common import engine
from dataset.freeze.format.fjson import JSONEncoder
from datetime import datetime, timedelta
import sqlalchemy.sql.expression as sql

log = logging.getLogger(__name__)
raw_tbl = engine.get_table('raw').table
hashtags_tbl = engine.get_table('hashtags').table
#status_table = engine.get_table('raw_dumped')

def dump_hashtag(tag):
    data = []

    status_tbl = engine['status'].table
    user_tbl = engine['user'].table
    q = status_tbl.join(user_tbl, user_tbl.c.id == status_tbl.c.user_id)
    q = q.join(hashtags_tbl, status_tbl.c.id == hashtags_tbl.c.status_id)
    q = sql.select([status_tbl, user_tbl], from_obj=q, use_labels=True)
    q = q.where(hashtags_tbl.c.text.ilike(tag))
    q = q.order_by(hashtags_tbl.c.status_id.asc())
    
    statuses = []
    for row in engine.query(q):
        data.append(row)
        #data.append(json.loads(row['raw_json']))
    #for json_file in os.listdir('dumps'):
    #    print json_file, len(statuses), len(data)
    #    #min_id = int(json_file.split('.', 1)[0].split('_', 1)[-1])
    #    fh = open('dumps/%s' % json_file, 'rb')