Example #1
def iterate_from_s3(game_id,
                    bucket,
                    logname,
                    start_time,
                    end_time,
                    verbose=True):
    assert start_time > 0

    # to protect against same-time collisions, create a unique fake "PID" for MongoDB row _ids
    sha = hashlib.sha1()
    sha.update(game_id)
    dig = sha.digest()
    fake_pid = (ord(dig[1]) << 8) | ord(dig[0])

    s3 = SpinS3.S3(SpinConfig.aws_key_file())
    last_id_time = -1
    id_serial = 0

    for t in xrange(86400 * (start_time // 86400), 86400 * (end_time // 86400),
                    86400):  # for each day
        y, m, d = SpinConfig.unix_to_cal(t)
        prefix = '%04d%02d/%s-%04d%02d%02d-%s' % (
            y, m, SpinConfig.game_id_long(override_game_id=game_id), y, m, d,
            logname)

        for entry in s3.list_bucket(bucket, prefix=prefix):
            filename = entry['name'].split('/')[-1]
            if verbose: print 'reading', filename

            if entry['name'].endswith('.zip'):
                tf = tempfile.NamedTemporaryFile(prefix=logname + '-' +
                                                 filename,
                                                 suffix='.zip')
                s3.get_file(bucket, entry['name'], tf.name)
                unzipper = subprocess.Popen(['unzip', '-q', '-p', tf.name],
                                            stdout=subprocess.PIPE)
            elif entry['name'].endswith('.gz'):
                tf = tempfile.NamedTemporaryFile(prefix=logname + '-' +
                                                 filename,
                                                 suffix='.gz')
                s3.get_file(bucket, entry['name'], tf.name)
                unzipper = subprocess.Popen(['gunzip', '-c', tf.name],
                                            stdout=subprocess.PIPE)
            else:
                raise Exception('unhandled file extension: ' + entry['name'])

            for line in unzipper.stdout.xreadlines():
                row = SpinJSON.loads(line)
                if row['time'] < start_time: continue  # skip ahead
                elif row['time'] >= end_time: break

                if '_id' not in row:
                    # synthesize a fake MongoDB row ID
                    if row['time'] != last_id_time:
                        last_id_time = row['time']
                        id_serial = 0
                    row['_id'] = SpinNoSQLId.creation_time_id(row['time'],
                                                              pid=fake_pid,
                                                              serial=id_serial)
                    assert SpinNoSQLId.is_valid(row['_id'])
                    id_serial += 1

                # note: there's a small chance this could end up duplicating an event at the boundary of an S3 import and MongoDB import
                if verbose: print row
                yield row
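The generator above walks the requested window one UTC day at a time, snapping both endpoints down to midnight with integer arithmetic before listing that day's log objects. A minimal standalone sketch of that day-boundary iteration (the timestamps are made-up example values):

start_time = 1514768400              # example: 2018-01-01 01:00 UTC
end_time = start_time + 2 * 86400
for day_start in xrange(86400 * (start_time // 86400),
                        86400 * (end_time // 86400),
                        86400):
    # day_start steps through the 00:00 UTC day boundaries in the window
    print day_start                  # prints 1514764800, then 1514851200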
Example #2
def do_upload(nosql_client, table, verbose, dry_run, keep_local):
    if not table['s3_name']: return
    msg_fd = sys.stderr if verbose else NullFD()
    s3_logs = SpinS3.S3(s3_key_file_for_logs)
    print >> msg_fd, '%s: upload' % (table['table_name'])

    tbl = nosql_client._table(table['table_name'])
    # find earliest timestamp
    first = list(tbl.find({}, {'time': 1}).sort([('time', 1)]).limit(1))
    if not first:
        print >> msg_fd, 'no records'
        return
    start_time = first[0]['time']

    # snap to day boundary
    start_time = 86400 * (start_time // 86400)
    today_start = 86400 * (time_now // 86400)

    # check each full UTC day from start_time, stopping before the current day
    while start_time < today_start:
        date_str = time.strftime('%Y%m%d', time.gmtime(start_time))
        year_month = date_str[:-2]
        obj_name = '%s/%s-%s-%s.json.%s' % (
            year_month, SpinConfig.game_id_long(), date_str, table['s3_name'],
            table['compression'])
        print >> msg_fd, '  checking %s/%s...' % (log_bucket, obj_name),
        msg_fd.flush()

        if s3_logs.exists(log_bucket, obj_name, has_read_permission=False):
            print >> msg_fd, 'already exists, skipping.'
        else:
            # upload one day's data
            print >> msg_fd, 'does not exist, dumping...'

            # spit out the entries to a flat file using SpinLog
            tf_name = '%s/%s-%s-%s.json' % (tempfile.gettempdir(),
                                            SpinConfig.game_id_long(),
                                            date_str, table['s3_name'])
            try:
                target = SpinLog.SimpleJSONLog(tf_name, buffer=-1)
                cursor = tbl.find({
                    'time': {
                        '$gte': start_time,
                        '$lt': start_time + 86400
                    }
                }).sort([('time', 1)])
                total = cursor.count()
                count = 0
                for row in cursor:
                    if '_id' in row:
                        if type(row['_id']) is bson.objectid.ObjectId:
                            row['_id'] = SpinNoSQL.NoSQLClient.decode_object_id(
                                row['_id'])
                    assert 'time' in row
                    t = row['time']
                    del row['time']
                    target.event(t, row)
                    count += 1
                    if count == 1 or count == total or (count % 1000) == 0:
                        print >> msg_fd, '\r    %d/%d %s dump' % (
                            count, total, table['table_name']),
                print >> msg_fd, 'finished'
                target.close()

                # compress the file
                obj_file_name = os.path.basename(obj_name)
                print >> msg_fd, '  compressing', os.path.basename(
                    tf_name), '->', os.path.basename(obj_file_name), '...',
                msg_fd.flush()
                assert table['compression'] == 'zip'
                save_cwd = os.getcwd()
                try:
                    os.chdir(os.path.dirname(tf_name))
                    args = [
                        '/usr/bin/zip', '-q',
                        os.path.basename(obj_file_name),
                        os.path.basename(tf_name)
                    ]
                    subprocess.check_call(args)
                    print >> msg_fd, 'done'

                    print >> msg_fd, '  uploading', obj_file_name, '->', log_bucket + ':' + obj_name, '...',
                    msg_fd.flush()
                    if not dry_run:
                        s3_logs.put_file(log_bucket, obj_name,
                                         os.path.basename(obj_file_name))
                finally:
                    safe_unlink(os.path.basename(obj_file_name))
                    os.chdir(save_cwd)
            finally:
                if keep_local:
                    print >> msg_fd, '  KEEPING', tf_name
                else:
                    safe_unlink(tf_name)

            print >> msg_fd, 'done'

        start_time += 86400
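For each eligible day, do_upload derives the S3 object name from the day's YYYYMMDD date string and groups objects under a YYYYMM prefix. A small sketch of that naming scheme (the game id, log name and compression suffix here are placeholder values):

import time

day_start = 86400 * (1514851200 // 86400)      # a UTC day boundary (2018-01-02)
date_str = time.strftime('%Y%m%d', time.gmtime(day_start))
obj_name = '%s/%s-%s-%s.json.%s' % (date_str[:-2], 'mygame', date_str,
                                    'metrics', 'zip')
print obj_name   # 201801/mygame-20180102-metrics.json.zip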
Example #3
def do_slave(task):
    date = task['date']
    game_id = task['game_id']
    verbose = task['verbose']
    dry_run = task['dry_run']
    commit_interval = task['commit_interval']

    start_time = SpinConfig.cal_to_unix((int(date[0:4]),int(date[4:6]),int(date[6:8])))
    end_time = start_time + 86400

    gamedata = SpinJSON.load(open(SpinConfig.gamedata_filename(override_game_id=game_id)))
    STORE = {}
    for sku in gamedata['store']['catalog']:
        get_store_items(STORE, sku)

    if verbose:
        print >> sys.stderr, 'converting date', date, 'start_time', start_time, 'end_time', end_time, '...'

    if not verbose: filterwarnings('ignore', category = MySQLdb.Warning)

    cfg = SpinConfig.get_mysql_config(game_id+'_upcache')
    con = MySQLdb.connect(*cfg['connect_args'], **cfg['connect_kwargs'])
    store_table = cfg['table_prefix']+game_id+'_store'

    s3 = SpinS3.S3(SpinConfig.aws_key_file())
    bucket = 'spinpunch-logs'

    batch = 0
    total = 0
    cur = con.cursor()

    for entry in s3.list_bucket(bucket, prefix='%s/%s-%s-metrics.json' % (date[0:6], SpinConfig.game_id_long(override_game_id=game_id), date)):
        filename = entry['name'].split('/')[-1]

        if verbose: print >> sys.stderr, 'reading', filename

        if entry['name'].endswith('.zip'):
            tf = tempfile.NamedTemporaryFile(prefix='old_metrics_to_mysql-'+filename, suffix='.zip')
            s3.get_file(bucket, entry['name'], tf.name)
            unzipper = subprocess.Popen(['unzip', '-q', '-p', tf.name],
                                        stdout = subprocess.PIPE)

        elif entry['name'].endswith('.gz'):
            fd = s3.get_open(bucket, entry['name'], allow_keepalive = False)
            unzipper = subprocess.Popen(['gunzip', '-c', '-'],
                                        stdin = fd.fileno(),
                                        stdout = subprocess.PIPE)

        else:
            raise Exception('unhandled file extension: ' + entry['name'])

        for line in unzipper.stdout.xreadlines():
            if '5120_buy_item' in line:
                #and ('item:token' in line):
                entry = SpinJSON.loads(line)
                if entry['event_name'] != '5120_buy_item': continue

                if 'price_currency' not in entry:
                    # old metric, need to fill in manually
                    if entry['items'][0]['spec'] in STORE:
                        entry['price_currency'] = 'item:token'
                        entry['price'] = STORE[entry['items'][0]['spec']]

                if verbose: print >> sys.stderr, SpinJSON.dumps(entry)

                if entry.get('price_currency','unknown') != 'item:token': continue


                if '_id' in entry:
                    entry_id = entry['_id']
                else:
                    id_generator.set_time(int(time.time()))
                    entry_id = id_generator.generate() # arbitrary

                assert len(entry['items']) == 1
                item = entry['items'][0]
                keyvals = [('_id', entry_id),
                           ('time', entry['time']),
                           ('user_id', entry['user_id']),
                           ('price', entry['price']),
                           ('currency', entry['price_currency']),
                           ('item', item['spec']),
                           ('stack', item.get('stack',1))]

                query = "INSERT INTO " + store_table + \
                            "("+', '.join(['`'+k+'`' for k,v in keyvals])+")"+ \
                            " VALUES ("+', '.join(['%s'] * len(keyvals)) +")"
                if dry_run:
                    print >> sys.stderr, query, [v for k,v in keyvals]
                else:
                    cur.execute(query, [v for k,v in keyvals])

                    batch += 1
                    total += 1
                    if commit_interval > 0 and batch >= commit_interval:
                        batch = 0
                        con.commit()
                        cur = con.cursor()
                        if verbose: print >> sys.stderr, total, 'inserted'

    if not dry_run:
        con.commit()
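The INSERT statement above is assembled from a list of (column, value) pairs, so the backquoted column list and the %s placeholder list always stay the same length, and the values are handed to cur.execute() for driver-side escaping. A minimal standalone sketch with a placeholder table name and values:

keyvals = [('_id', 'abc123'),
           ('time', 1514764800),
           ('user_id', 1234),
           ('price', 50),
           ('currency', 'item:token'),
           ('item', 'token'),
           ('stack', 1)]
query = "INSERT INTO example_store (" + \
        ', '.join(['`'+k+'`' for k, v in keyvals]) + \
        ") VALUES (" + ', '.join(['%s'] * len(keyvals)) + ")"
print query
# cur.execute(query, [v for k, v in keyvals])   # MySQLdb escapes the values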
Example #4
# set TMPDIR environment variable to a suitable location

import sys, os, getopt, time, tempfile, shutil
import SpinS3
import SpinUserDB
import SpinConfig
import SpinParallel
import SpinSingletonProcess
import subprocess
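As the comment at the top of this example notes, TMPDIR should point somewhere with enough free space for the temporary backup files. One way to do that from inside the script, using the modules already imported above (the path is only a placeholder):

os.environ.setdefault('TMPDIR', '/mnt/scratch')   # placeholder scratch location
tempfile.tempdir = None                           # make tempfile.gettempdir() re-read TMPDIR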

date_str = time.strftime('%Y%m%d', time.gmtime())

# autoconfigure based on config.json
game_id = SpinConfig.config['game_id']
backup_bucket = 'spinpunch-backups'
backup_obj_prefix = '%s-player-data-%s/' % (SpinConfig.game_id_long(),
                                            date_str)
s3_key_file_for_db = SpinConfig.aws_key_file()
s3_key_file_for_backups = SpinConfig.aws_key_file()


class NullFD(object):
    def write(self, stuff):
        pass

    def flush(self):  # some callers flush the message stream (see do_upload in Example #2)
        pass
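NullFD is a write-only sink: when verbose output is disabled, progress messages can be routed to it instead of sys.stderr, the same pattern do_upload uses in Example #2. A small usage sketch:

verbose = False
msg_fd = sys.stderr if verbose else NullFD()
print >> msg_fd, 'progress message'   # silently discarded when verbose is False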


def backup_s3_dir(title,
                  bucket_name,
                  prefix='',
                  ignore_errors=False,
                  verbose=False):