Example #1
    def mkrollups(self):
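        """Rebuild the clientrollups table: aggregate per-client metrics from
        events into a staging table, then swap it in as the live table."""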
        staging_table = 'clientrollups_staging'
        drop_table_if_exists(staging_table, self.pconn, self.pcur)
        megaquery = """
    CREATE TABLE {} AS
    SELECT
        client_id,
        {}
        from events t
        inner join visits using (visit_id)
        inner join visitors v using (visitor_id)
        inner join campaigns using (campaign_id)
        inner join clients cl using (client_id)
        WHERE visits.ip not in ({})
        AND campaigns.delete_dt is null
        GROUP BY client_id
        """.format(staging_table, self.metric_expressions(), OUR_IP_STRING)

        debug('Calculating client rollups')
        with self.pconn:
            self.pcur.execute(megaquery)
        debug('beginning deploy table on clientrollups')
        deploy_table('clientrollups', 'clientrollups_staging', 'clientrollups_old', self.pcur, self.pconn)

        self.updated = strftime('%x %X')

        debug('Done.')
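
Both this rollup and main() further down hand their freshly built staging table to a deploy_table helper whose definition isn't included in these examples. A minimal sketch of the usual swap pattern, matching the argument order of the calls (live table, staging table, old table, cursor, connection) and assuming Postgres/Redshift-style ALTER TABLE ... RENAME, might look like this:

def deploy_table(live_table, staging_table, old_table, cursor, connection):
    # Hypothetical sketch: keep the previous live table around as *_old for a
    # quick rollback, then promote the staging table to the live name.
    # Assumes the live table already exists; a first run would need to skip
    # the first rename.
    with connection:
        cursor.execute('DROP TABLE IF EXISTS {}'.format(old_table))
        cursor.execute('ALTER TABLE {} RENAME TO {}'.format(live_table, old_table))
        cursor.execute('ALTER TABLE {} RENAME TO {}'.format(staging_table, live_table))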
def load_users(connection, cursor, bucket_name, key_name, staging_table, final_table, access_key, secret_key):
	print "loading users"
	drop_table_if_exists(staging_table, connection, cursor)
	print "creating staging table"
	cursor.execute('create table {} (like {})'.format(staging_table, final_table))
	print "copying from s3 into staging table"
	copy_from_s3(connection, cursor, bucket_name, key_name, staging_table, access_key, secret_key)
	print "making room for new records"
	cursor.execute('delete from {} where fbid in (select distinct fbid from {})'.format(final_table, staging_table))
	print "inserting new records"
	cursor.execute('insert into {} select * from {}'.format(final_table, staging_table))
	connection.commit()
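
load_users and main() both defer the actual S3-to-Redshift load to a copy_from_s3 helper that isn't shown here. A minimal sketch, following the argument order of the call above and assuming the pipe-delimited CSVs these scripts produce (main() passes a source table name instead of an explicit bucket and key, so its variant presumably derives those from configuration):

def copy_from_s3(connection, cursor, bucket_name, key_name, table,
                 access_key, secret_key):
    # Hypothetical sketch: issue a Redshift COPY with key-based credentials.
    # The real helper may set additional COPY options (encoding, NULL handling,
    # MAXERROR, and so on).
    copy_sql = """
        COPY {table}
        FROM 's3://{bucket}/{key}'
        CREDENTIALS 'aws_access_key_id={access};aws_secret_access_key={secret}'
        DELIMITER '|'
    """.format(table=table, bucket=bucket_name, key=key_name,
               access=access_key, secret=secret_key)
    with connection:
        cursor.execute(copy_sql)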
def main(table):
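    """Copy one table from RDS (MySQL) into Redshift: dump it to a pipe-delimited
    CSV, upload the CSV to S3, COPY it into a staging table, swap the staging
    table into place, and log how long each stage took."""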
    start = time.time()

    # get the schema of the table
    columns = None
    csvtime = None
    runcsv = None
    try:
        with mysql.connect(**rds) as dbconn:
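            # the MySQL driver's connection context manager yields a cursor,
            # so dbconn can run queries directly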
            dbconn.execute("describe %s" % table)
            description = dbconn.fetchall()
            columns = create_query(description)
            write2csv(table, dbconn)
            csvtime = time.time()
            runcsv = csvtime - start
    except StandardError as e:
        logging.warning("error: {0}".format(e))
        # without the schema and CSV dump there is nothing to load, so stop here
        return

    redconn = psycopg2.connect(**redshift)
    redcursor = redconn.cursor()

    up2s3(table)
    ups3time = time.time()
    runs3 = ups3time - csvtime

    # create the staging table using the column definitions generated above
    staging_table = "{0}_staging".format(table)
    old_table = "{0}_old".format(table)
    drop_table_if_exists(staging_table, redconn, redcursor)

    logging.info("Creating table {}".format(staging_table))
    with redconn:
        redcursor.execute("CREATE TABLE {0} ({1})".format(staging_table, columns))

    # copy the file we just uploaded to S3 into Redshift
    access_key = aws["aws_access_key_id"]
    secret_key = aws["aws_secret_access_key"]
    try:
        copy_from_s3(redconn, redcursor, table, staging_table, access_key, secret_key)
    except psycopg2.DatabaseError:
        # step through the csv we are about to copy over and change the encodings to work properly with redshift
        logging.info("Error copying, assuming encoding errors and rewriting CSV...")

        with open("%s.csv" % table, "r") as csvfile:
            reader = csv.reader(csvfile, delimiter="|")
            with open("%s2.csv" % table, "wb") as csvfile2:
                writer = csv.writer(csvfile2, delimiter="|")
                for row in reader:
                    row = [col.decode("latin-1").encode("utf-8") for col in row]
                    writer.writerow(row)

        logging.info("Rewrite complete")
        os.remove("%s.csv" % table)
        os.system("mv {0}2.csv {0}.csv".format(table))
        up2s3(table)
        # brief pause so the re-uploaded object is visible to Redshift before retrying the COPY
        time.sleep(10)
        copy_from_s3(redconn, redcursor, table, staging_table, access_key, secret_key)

    copytime = time.time()
    runcopy = copytime - ups3time

    deploy_table(table, staging_table, old_table, redcursor, redconn)

    endtime = time.time()
    runswap = endtime - copytime
    runtotal = endtime - start
    logging.info("Successfully copied %s from RDS to S3 to Redshift" % table)

    events = [
        ("write csv", runcsv),
        ("write to s3", runs3),
        ("copy from s3 to redshift", runcopy),
        ("swap redshift tables", runswap),
        ("complete entire process", runtotal),
    ]
    logging.info("|".join("{0:.2f} seconds to {1}".format(duration, event) for event, duration in events))