Exemplo n.º 1
0
def start_gpdb(port, nproc):
    if (port is None) or (nproc == '24'):
        os.system('yes | gpstart')
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    else:
        call = 'yes | gpstart -d /gpsegs/gpdb-{}/master/gpseg-1'.format(nproc)
        os.system(call)
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000, port=port)
    return cxn
Exemplo n.º 2
0
def main(kwargs):
    opType = kwargs.get('opType')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    savestub = '' if (savestub is None) else savestub

    print 'Evaluating: {}'.format(opType)

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)

    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_reg': do_reg,
        'do_gnmf': do_gnmf,
        'do_robust': do_robust,
        'shape': shape,
        'cxn': cxn
    }
    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        cxn.execute('DROP TABLE IF EXISTS R2 CASCADE')
        cxn.execute(
            "SELECT MADLIB.matrix_random({},1,NULL,'uniform','R2',NULL)".
            format(shape[0]))
        cxn.execute('ALTER TABLE R2 RENAME COLUMN ROW TO ROW_NUM')
        call = 'do_robust(x_table_name, cxn)'

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))
    runTimes.ix[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print res
    runTimes.ix[:, 3:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Exemplo n.º 3
0
def doMatrixOp(kwargs):
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')

    savestub = '' if (savestub is None) else savestub
    try:
        tableStub = int(tableStub)
    except ValueError:
        pass

    Mname = 'M{}'.format(tableStub)
    Nname = 'N{}'.format(tableStub)
    wname = 'w{}'.format(tableStub)

    print 'Evaluating: {}'.format(opType)

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)
    shape = cxn.get_shape_dense('M{}'.format(tableStub))

    cleanup = []
    if (opType == 'SVD'):
        call = "svd('{}','svd','row_num',10, 10,'svd_summary')".format(
            Mname, shape[1])
        cleanup.append('svd_s')
        cleanup.append('svd_u')
        cleanup.append('svd_v')
        cleanup.append('svd_summary')
    else:
        raise NotImplementedError('Invalid Operation')

    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))

    sql_call = 'SELECT madlib.{}'.format(call)
    rows = shape[0]
    cols = shape[1]
    path = '../output/{}/madlib_{}_{}{}.txt'.format(outdir, mattype, opType,
                                                    int(nodes))
    runTimes.ix[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    madlib_timeout = ('../temp/madlib_punked_out.json', opType)
    res = cxn.time(sql_call, cleanup, madlib_timeout)
    if (res is None):
        print 'Timed Out'
        return
    runTimes.ix[:, 3:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Exemplo n.º 4
0
helpstr = 'Approximate fraction nonzero values. Delimited by spaces. Default "0.0001 0.001 0.01 0.1"'
parser.add_argument('--sparsity',
                    default='0.0001 0.001 0.01 0.1',
                    type=str,
                    help=helpstr)
args = parser.parse_args()

if not os.path.exists('manifest.txt'):
    with open('manifest.txt', 'w') as fh:
        fh.write('')

# start logging
start_make_logging()

os.system('hdfs dfs -mkdir -p /scratch')
cxn = SQLCxn(timeout=None, username='******', db='ubuntu')

sparse_gb = int(args.msize_sparse)
sparsity = map(lambda x: float(x), args.sparsity.split(' '))
for sr in sparsity:
    stub = '{}'.format(sr).replace('0.', '_')
    gb_stub = sparse_gb
    fmt = (stub, gb_stub)
    k = int(np.ceil((sparse_gb * 1e9) / float(8 * 100)))

    mpath_tall = os.path.abspath(
        '../output/M{}{}_sparse_tall.mtx'.format(*fmt))
    mpath_wide = os.path.abspath(
        '../output/M{}{}_sparse_wide.mtx'.format(*fmt))
    data.gen_data_sparse(k, 100, sr, 'M{}{}_sparse_tall'.format(*fmt),
                         mpath_tall)
Exemplo n.º 5
0
parser.add_argument(
    '--msize',
    default='2 4 8 16',
    type=str,
    help='Approximate size of matrix (GB in memory) to generate')
args = parser.parse_args()

if not os.path.exists('manifest.txt'):
    with open('manifest.txt', 'w') as fh:
        fh.write('')

# start logging
start_make_logging()

os.system('hdfs dfs -mkdir -p /scratch')
cxn = SQLCxn(timeout=None, username='******', db='ubuntu')

approx_gb = [.001]
if len(args.msize) > 0:
    approx_gb += [int(x) for x in args.msize.split()]

mtypes = ['tall', 'wide']
for mtype in mtypes:
    for gb in approx_gb:
        print mtype
        if gb < .1: gb_stub = '_test'
        else: gb_stub = '_small' if gb < 1 else gb

        if mtype == 'square':
            k = int(np.ceil(np.sqrt((gb * 1e9) / float(8))))
            m = k
Exemplo n.º 6
0
def doMatrixOp(kwargs):
    opType  = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')
    sr = kwargs.get('sr')

    try:
        tableStub = int(tableStub)
    except ValueError:
        pass

    Mname = 'M{}'.format(tableStub)
    Nname = 'M{}'.format(tableStub)
    wname = 'w{}'.format(tableStub)
    if opType == 'GMM':
        Nname = Mname.replace('wide','tall')

    print 'Evaluating: {}'.format(opType)

    colnames = ['nodes','sr','time1','time2','time3','time4','time5']
    runTimes = pd.DataFrame(np.zeros((1,len(colnames))))
    runTimes.columns = colnames
    runTimes['nodes'] = runTimes['nodes'].astype('O')
    runTimes['sr'] = runTimes['sr'].astype('O')

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    
    cleanup = []
    if (opType == 'TRANS'):
        call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
        cleanup.append('Mt')
    elif (opType == 'NORM'):
        call = "matrix_norm('{}',NULL,'fro')".format(Mname)
    elif (opType == 'GMM'):
        call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(Mname,Nname)
        cleanup.append('MN')
    elif (opType == 'MVM'):
        array_call = 'SELECT array_agg(random()) FROM generate_series(1,100)'
        call = "matrix_vec_mult('{}',NULL,({}))".format(Mname,array_call)
        cleanup.append('Mw')
    elif (opType == 'TSM'):
        call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(Mname)
        cleanup.append('MtM')
    elif (opType == 'ADD'):
        call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(Mname, Nname)
        cleanup.append('M_N')
    else:
        raise NotImplementedError('Invalid Operation')

    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))

    sql_call = 'SELECT madlib.{}'.format(call)
    fmt = (outdir, mattype, opType, nodes)
    path = '../output/{}/madlib_{}_{}{}.txt'.format(*fmt)
    res = cxn.time(sql_call, cleanup)
    if (res is None):
        print 'Timed Out'
        return

    runTimes.ix[:,'nodes'] = nodes
    runTimes.ix[:,'sr'] = sr
    runTimes.ix[:,2:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header = writeHeader, mode = 'a')
Exemplo n.º 7
0
def main():
    stub = sys.argv[1]
    cxn = SQLCxn(timeout=None, username='******', db='ubuntu')

    if not cxn.table_exists('adclick_clean_vectors_split'):
        shape = cxn.get_shape('adclick_clean{}_dense'.format(stub))
        stmt = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        cxn.execute(stmt)

    if not cxn.table_exists('adclick_clean_indepvars_long'):
        stmt = """
            CREATE TABLE adclick_clean_indepvars_long AS (
                SELECT row_num, ix AS col_num, indep_vars[ix] AS val
                  FROM (
                    SELECT *, GENERATE_SUBSCRIPTS(indep_vars, 1) AS ix
                      FROM adclick_clean_vectors_split
                  ) tmp
            ) DISTRIBUTED BY (row_num, col_num)
        """
        cxn.execute(stmt)

    if not cxn.table_exists('adclick_clean_y'):
        stmt = """
            CREATE TABLE adclick_clean_y AS (
                SELECT row_num, 1 AS col_num, y AS val
                  FROM adclick_clean_vectors_split
            ) DISTRIBUTED BY (row_num)
        """
        cxn.execute(stmt)
Exemplo n.º 8
0
def main(kwargs):
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu')
    shape = cxn.get_shape_dense('adclick_clean{}_dense'.format(stub))
    if not cxn.table_exists('adclick_clean_vectors_split'):
        stmt = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        cxn.execute(stmt)

    # need to do a bit of preprocessing
    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
            SELECT madlib.logregr_train('adclick_clean_vectors_split',
                                        'adclick_logit',
                                        'y', 'indep_vars', NULL,
                                        3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
            SELECT madlib.linregr_train('adclick_clean_vectors_split',
                                        'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        cxn.execute('DROP TABLE IF EXISTS adlick_prj')
        stmt = """
            CREATE TABLE adclick_clean_depvars AS (
                SELECT row_num, val[2:{}]::NUMERIC[] val
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        if not cxn.table_exists('adclick_clean_depvars'):
            cxn.execute(stmt)
        call = """
            SELECT madlib.pca_train('adclick_clean_depvars',
                                    'result_table',
                                    'row_num',
                                    5);
            SELECT madlib.pca_project('adclick_clean_depvars',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'residual_table',
                                      'result_summary_table')
        """
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]

    #shape = cxn.get_shape_dense('adclick_clean{}_dense'.format(stub))
    runTimes.ix[:, ['rows', 'cols']] = shape

    path = '../output/madlib_{}{}_dense.txt'.format(op_type, int(nodes))
    runTimes.ix[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    runTimes.ix[:, 3:] = res
    runTimes.to_csv(path, index=False)
Exemplo n.º 9
0
def main(argv):
    cxn = SQLCxn(timeout=None, username='******', db='ubuntu')
    coalesce_files(argv.stub, argv.sparse)
    process_madlib(argv.stub, argv.sparse, cxn)
Exemplo n.º 10
0
def main(kwargs):
    opType = kwargs.get('opType')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    savestub = '' if (savestub is None) else savestub

    print 'Evaluating: {}'.format(opType)

    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)

    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_reg': do_reg,
        'shape': shape,
        'cxn': cxn
    }
    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        do_reg(x_table_name, y_table_name, cxn)
        preproc = """
            DROP TABLE IF EXISTS Y_HAT;
            SELECT madlib.matrix_mult('{X}',NULL,'B',NULL,'Y_HAT');
            CREATE TABLE R2 AS (
                SELECT {y}.row_num, ARRAY[POW({y}.val[1]-y_hat.val[1],2)] val
                  FROM {y}
                 INNER JOIN y_hat ON {y}.row_num = y_hat.row_num
            ) DISTRIBUTED BY (row_num)
        """.format(X=x_table_name, y=y_table_name)
        cxn.execute(preproc)
        call = 'do_robust(x_table_name, cxn)'
    elif opType == 'pca':
        print 'Not Implemented'
        return

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))
    runTimes.ix[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print res
    runTimes.ix[:, 3:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Exemplo n.º 11
0
def main(kwargs):
    opType  = kwargs.get('opType')
    savestub = kwargs.get('savestub')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    savestub = '' if (savestub is None) else savestub

    print 'Evaluating: {}'.format(opType)

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = ['nodes','rows','cols','time1','time2','time3','time4','time5']
    runTimes = pd.DataFrame(np.zeros((1,len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)

    env = {'x_table_name': x_table_name,
           'y_table_name': y_table_name,
           'do_logit': do_logit,
           'do_gnmf': do_gnmf,
           'do_reg': do_reg,
           'do_robust': do_robust,
           'shape': shape,
           'cxn': cxn}
    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        #do_reg(x_table_name, y_table_name, cxn)
        #preproc = """
        #    DROP TABLE IF EXISTS Y_HAT;
        #    SELECT madlib.matrix_mult('{X}',NULL,'B',NULL,'Y_HAT');
        #    CREATE TABLE R2 AS (
        #        SELECT {y}.row_num, ARRAY[POW({y}.val[1]-y_hat.val[1],2)] val
        #          FROM {y}
        #         INNER JOIN y_hat ON {y}.row_num = y_hat.row_num
        #    ) DISTRIBUTED BY (row_num)
        #""".format(X=x_table_name, y=y_table_name)

        # We can just generate a vector of residuals on the fly
        # rather than computing them explicitly.
        cxn.execute('DROP TABLE IF EXISTS R2')
        cxn.randomMatrix(shape[0], 1, 'R2')
        call = 'do_robust(x_table_name, cxn)'

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_tall_{}{}.txt'.format(opType, int(nodes))
    runTimes.ix[:,['nodes','rows','cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    res
    runTimes.ix[:,3:] = res
    writeHeader = False if (os.path.exists(path)) else True
    runTimes.to_csv(path, index=False, header = writeHeader, mode = 'a')
Exemplo n.º 12
0
def doMatrixOp(kwargs):
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    fixedAxis = int(kwargs.get('fixedAxis'))
    nrow_scale = map(lambda x: int(x), kwargs['nrows'].split(' '))
    nproc = kwargs.get('nproc')

    port = GPDB_PORT_MAP[nproc] if nproc is not None else None

    if nproc is not None:
        cxn = start_gpdb(port, nproc)
        cxn.execute('DROP TABLE IF EXISTS M16_tall')
        atexit.register(stop_gpdb, nproc, cxn)
    else:
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = ['rows', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    if nproc is None:
        path = os.path.join('..', 'output',
                            'madlib_{}_{}.txt'.format(mattype, opType))
    else:
        path = os.path.join('..', 'output',
                            'madlib_cpu_{}_scale.txt'.format(opType))
    for nr in nrow_scale:
        nrow = fixedAxis if opType == 'GMM' else nr
        ncol = nr if opType == 'GMM' else fixedAxis
        print nrow
        print ncol
        Mname = 'M{}{}'.format(nrow, ncol)
        if not cxn.table_exists('M{}{}'.format(nrow, ncol)):
            cxn.randomMatrix(nrow, ncol, 'M{}{}'.format(nrow, ncol))
        if (opType == 'GMM'):
            if not cxn.table_exists('N{}{}'.format(ncol, nrow)):
                cxn.randomMatrix(ncol, nrow, 'N{}{}'.format(ncol, nrow))
            Nname = 'N{}{}'.format(ncol, nrow)
        elif (opType == 'ADD'):
            if not cxn.table_exists('N{}{}'.format(nrow, ncol)):
                cxn.randomMatrix(nrow, ncol, 'N{}{}'.format(nrow, ncol))
            Nname = 'N{}{}'.format(nrow, ncol)

        cleanup = []
        if (opType == 'TRANS'):
            call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
            cleanup.append('Mt')
        elif (opType == 'NORM'):
            call = "matrix_norm('{}',NULL,'fro')".format(Mname)
        elif (opType == 'GMM'):
            call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(
                Mname, Nname)
            cleanup.append('MN')
        elif (opType == 'MVM'):
            array_call = 'SELECT array_agg(random()) FROM generate_series(1,{})'.format(
                ncol)
            call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
        elif (opType == 'TSM'):
            call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(
                Mname)
            cleanup.append('MtM')
        elif (opType == 'ADD'):
            call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(
                Mname, Nname)
            cleanup.append('M_N')
        else:
            raise NotImplementedError('Invalid Operation')

        sql_call = 'SELECT madlib.{}'.format(call)
        runTimes.ix[:, 'rows'] = nr if nproc is None else nproc
        runTimes.ix[:, 1:] = cxn.time(sql_call, cleanup)
        writeHeader = False if (os.path.exists(path)) else True
        runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Exemplo n.º 13
0
def main(kwargs):
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    shape = cxn.get_shape('adclick_clean_1_sparse')
    if not cxn.table_exists('adclick_clean_1_vectors_sparse'):
        stmt = """
        CREATE TABLE adclick_clean_1_vectors_sparse AS (
            SELECT x.row_num, madlib.svec_cast_positions_float8arr(
               ARRAY_AGG(x.col_num), ARRAY_AGG(x.val), {}, 0.0
               ) AS indep_vars, y.val AS y
             FROM adclick_clean_1_sparse x
            INNER JOIN adclick_clean_y y ON x.row_num = y.row_num
            GROUP BY x.row_num, y.val
        ) DISTRIBUTED BY (row_num)
        """.format(shape[1])
        cxn.execute(stmt)

    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
            SELECT madlib.logregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_logit',
                                        'y', 'indep_vars', NULL,
                                        3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
            SELECT madlib.linregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        cxn.execute('DROP TABLE IF EXISTS adlick_prj')
        call = """
            SELECT madlib.pca_sparse_train('adclick_clean_1_sparse',
                                           'result_table',
                                           'row_num',
                                           'col_num',
                                           'val',
                                           '{0}',
                                           '{1}',
                                           5);
            SELECT madlib.pca_sparse_project('adclick_clean_1_sparse',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'col_num',
                                      'val',
                                      '{0}',
                                      '{1}',
                                      'residual_table',
                                      'result_summary_table')
        """.format(*shape)
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]

    runTimes.ix[:, ['rows', 'cols']] = shape

    path = '../output/madlib_{}{}_sparse.txt'.format(op_type, int(nodes))
    runTimes.ix[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    runTimes.ix[:, 3:] = res
    runTimes.to_csv(path, index=False)