Пример #1
0
def main():
    """Create the derived adclick tables that do not exist yet.

    The table-name suffix is read from ``sys.argv[1]`` and selects the
    source table ``adclick_clean<suffix>_dense``.  Three tables are built,
    in this order, each one only when ``table_exists`` reports it missing:

    * ``adclick_clean_vectors_split``
    * ``adclick_clean_indepvars_long``
    * ``adclick_clean_y``
    """
    suffix = sys.argv[1]
    conn = SQLCxn(timeout=None, username='******', db='ubuntu')

    def vectors_split_sql():
        # The shape lookup is deferred to here so it only happens when the
        # table really has to be created.
        dims = conn.get_shape('adclick_clean{}_dense'.format(suffix))
        return """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(dims[1], suffix)

    def indepvars_long_sql():
        return """
            CREATE TABLE adclick_clean_indepvars_long AS (
                SELECT row_num, ix AS col_num, indep_vars[ix] AS val
                  FROM (
                    SELECT *, GENERATE_SUBSCRIPTS(indep_vars, 1) AS ix
                      FROM adclick_clean_vectors_split
                  ) tmp
            ) DISTRIBUTED BY (row_num, col_num)
        """

    def response_sql():
        return """
            CREATE TABLE adclick_clean_y AS (
                SELECT row_num, 1 AS col_num, y AS val
                  FROM adclick_clean_vectors_split
            ) DISTRIBUTED BY (row_num)
        """

    # (table name, statement factory) pairs; order matters because the last
    # two tables select from the first one.
    pending = (
        ('adclick_clean_vectors_split', vectors_split_sql),
        ('adclick_clean_indepvars_long', indepvars_long_sql),
        ('adclick_clean_y', response_sql),
    )
    for table_name, build_sql in pending:
        if conn.table_exists(table_name):
            continue
        conn.execute(build_sql())
Пример #2
0
def main(kwargs):
    """Time one MADlib operation on the adclick tables and log the timings.

    Args:
        kwargs: mapping of options.  Keys read here:
            'opType' -- one of 'logit', 'gnmf', 'reg' or 'robust'.
            'nodes' -- node count; stored in the result row and passed
                through ``int()`` to build the output file name.
            'xTableName', 'yTableName' -- names of the input tables.

    Raises:
        ValueError: if 'opType' is missing or not a supported operation.

    The single result row (nodes, rows, cols, time1..time5) is appended to
    ``../output/madlib_adclick_<opType><nodes>.txt``.
    """
    opType = kwargs.get('opType')
    # Fail before touching the database: with an unknown opType no branch
    # below would bind `call`, and the error would only surface later as an
    # UnboundLocalError.
    supported = ('logit', 'gnmf', 'reg', 'robust')
    if opType not in supported:
        raise ValueError('Unsupported opType {!r}; expected one of: {}'.format(
            opType, ', '.join(supported)))

    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    # Parenthesised single-argument form prints identically on Python 2.
    print('Evaluating: {}'.format(opType))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    shape = cxn.get_shape(x_table_name)

    # Names referenced by the `call` / `cleanup` source strings below.
    # NOTE(review): presumably utils.timeOp evaluates those strings against
    # this mapping -- confirm in utils.
    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_reg': do_reg,
        'do_gnmf': do_gnmf,
        'do_robust': do_robust,
        'shape': shape,
        'cxn': cxn
    }
    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    else:  # 'robust'
        # Rebuild R2 as a random shape[0] x 1 matrix and rename its ROW
        # column to ROW_NUM before timing.
        cxn.execute('DROP TABLE IF EXISTS R2 CASCADE')
        cxn.execute(
            "SELECT MADLIB.matrix_random({},1,NULL,'uniform','R2',NULL)".
            format(shape[0]))
        cxn.execute('ALTER TABLE R2 RENAME COLUMN ROW TO ROW_NUM')
        call = 'do_robust(x_table_name, cxn)'

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print(res)
    # Columns from position 3 onward are time1..time5.
    runTimes.iloc[:, 3:] = res
    # Write the header only when the file is being created; later runs
    # append bare rows.
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Пример #3
0
def main(kwargs):
    """Time one MADlib operation on the "tall" tables and log the timings.

    Args:
        kwargs: mapping of options.  Keys read here:
            'opType' -- one of 'logit', 'gnmf', 'reg' or 'robust'.
            'nodes' -- node count; stored in the result row and passed
                through ``int()`` to build the output file name.
            'xTableName', 'yTableName' -- names of the input tables.

    Raises:
        ValueError: if 'opType' is missing or not a supported operation.

    The single result row (nodes, rows, cols, time1..time5) is appended to
    ``../output/madlib_tall_<opType><nodes>.txt``.
    """
    opType = kwargs.get('opType')
    # Fail before touching the database: with an unknown opType no branch
    # below would bind `call`, and the error would only surface later as an
    # UnboundLocalError.
    supported = ('logit', 'gnmf', 'reg', 'robust')
    if opType not in supported:
        raise ValueError('Unsupported opType {!r}; expected one of: {}'.format(
            opType, ', '.join(supported)))

    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    # Parenthesised single-argument form prints identically on Python 2.
    print('Evaluating: {}'.format(opType))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    shape = cxn.get_shape(x_table_name)

    # Names referenced by the `call` / `cleanup` source strings below.
    # NOTE(review): presumably utils.timeOp evaluates those strings against
    # this mapping -- confirm in utils.
    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_gnmf': do_gnmf,
        'do_reg': do_reg,
        'do_robust': do_robust,
        'shape': shape,
        'cxn': cxn
    }
    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    else:  # 'robust'
        # The residual vector R2 is generated on the fly via
        # cxn.randomMatrix instead of being computed explicitly from a
        # fitted regression (y - X*B, squared).
        cxn.execute('DROP TABLE IF EXISTS R2')
        cxn.randomMatrix(shape[0], 1, 'R2')
        call = 'do_robust(x_table_name, cxn)'

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_tall_{}{}.txt'.format(opType, int(nodes))
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    # Report the timings on stdout (previously a bare, no-op `res`
    # expression).
    print(res)
    # Columns from position 3 onward are time1..time5.
    runTimes.iloc[:, 3:] = res
    # Write the header only when the file is being created; later runs
    # append bare rows.
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Пример #4
0
def main(kwargs):
    """Time one MADlib operation on the adclick tables and log the timings.

    Args:
        kwargs: mapping of options.  Keys read here:
            'opType' -- one of 'logit', 'gnmf', 'reg', 'robust' or 'pca'
                ('pca' only prints 'Not Implemented' and returns).
            'nodes' -- node count; stored in the result row and passed
                through ``int()`` to build the output file name.
            'xTableName', 'yTableName' -- names of the input tables.

    Raises:
        ValueError: if 'opType' is missing or not one of the values above.

    The single result row (nodes, rows, cols, time1..time5) is appended to
    ``../output/madlib_adclick_<opType><nodes>.txt``.
    """
    opType = kwargs.get('opType')
    # Fail before touching the database: with an unknown opType no branch
    # below would bind `call`, and the error would only surface later as an
    # UnboundLocalError.
    supported = ('logit', 'gnmf', 'reg', 'robust', 'pca')
    if opType not in supported:
        raise ValueError('Unsupported opType {!r}; expected one of: {}'.format(
            opType, ', '.join(supported)))

    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    # Parenthesised single-argument form prints identically on Python 2.
    print('Evaluating: {}'.format(opType))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    shape = cxn.get_shape(x_table_name)

    # Names referenced by the `call` / `cleanup` source strings below.
    # NOTE(review): the 'gnmf' and 'robust' call strings name do_gnmf and
    # do_robust, which are not in this mapping.  If utils.timeOp evaluates
    # the string against `env` only, those two operations will fail with a
    # NameError -- confirm and add the entries if so.
    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_reg': do_reg,
        'shape': shape,
        'cxn': cxn
    }
    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # Run the regression first (outside the timed call), then build R2
        # from the squared difference between y and Y_HAT = X * B.
        do_reg(x_table_name, y_table_name, cxn)
        preproc = """
            DROP TABLE IF EXISTS Y_HAT;
            SELECT madlib.matrix_mult('{X}',NULL,'B',NULL,'Y_HAT');
            CREATE TABLE R2 AS (
                SELECT {y}.row_num, ARRAY[POW({y}.val[1]-y_hat.val[1],2)] val
                  FROM {y}
                 INNER JOIN y_hat ON {y}.row_num = y_hat.row_num
            ) DISTRIBUTED BY (row_num)
        """.format(X=x_table_name, y=y_table_name)
        cxn.execute(preproc)
        call = 'do_robust(x_table_name, cxn)'
    else:  # 'pca'
        print('Not Implemented')
        return

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print(res)
    # Columns from position 3 onward are time1..time5.
    runTimes.iloc[:, 3:] = res
    # Write the header only when the file is being created; later runs
    # append bare rows.
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Пример #5
0
def main(kwargs):
    """Time a MADlib routine on the sparse adclick data and write the timings.

    Args:
        kwargs: mapping with the required keys
            'opType' -- one of 'logit', 'reg' or 'pca'.
            'nodes' -- node count; stored in the result row and passed
                through ``int()`` to build the output file name.
            'stub' -- must be present but is not otherwise used here.

    Raises:
        KeyError: if any of the required keys is missing.
        ValueError: if 'opType' is not a supported operation.

    The result row (nodes, rows, cols, time1..time5) is written to
    ``../output/madlib_<opType><nodes>_sparse.txt`` using to_csv's default
    write mode, so an existing file is replaced.
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    # Not used below; the lookup is kept so a missing key still raises
    # KeyError as it always has.
    stub = kwargs['stub']

    # Fail before touching the database: with an unknown op_type neither
    # `call` nor `cleanup` would be bound further down.
    supported = ('logit', 'reg', 'pca')
    if op_type not in supported:
        raise ValueError('Unsupported opType {!r}; expected one of: {}'.format(
            op_type, ', '.join(supported)))

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    shape = cxn.get_shape('adclick_clean_1_sparse')
    # Build the sparse-vector training table once; later runs reuse it.
    if not cxn.table_exists('adclick_clean_1_vectors_sparse'):
        stmt = """
        CREATE TABLE adclick_clean_1_vectors_sparse AS (
            SELECT x.row_num, madlib.svec_cast_positions_float8arr(
               ARRAY_AGG(x.col_num), ARRAY_AGG(x.val), {}, 0.0
               ) AS indep_vars, y.val AS y
             FROM adclick_clean_1_sparse x
            INNER JOIN adclick_clean_y y ON x.row_num = y.row_num
            GROUP BY x.row_num, y.val
        ) DISTRIBUTED BY (row_num)
        """.format(shape[1])
        cxn.execute(stmt)

    # Each branch first drops leftover output tables, then defines the SQL
    # to time (`call`) and the tables handed to cxn.time as `cleanup`.
    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
            SELECT madlib.logregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_logit',
                                        'y', 'indep_vars', NULL,
                                        3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
            SELECT madlib.linregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    else:  # 'pca'
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        # The projection output table is 'adclick_prj' (see `call` and
        # `cleanup`); the drop previously targeted the misspelled
        # 'adlick_prj' and so never removed a stale table.
        cxn.execute('DROP TABLE IF EXISTS adclick_prj')
        call = """
            SELECT madlib.pca_sparse_train('adclick_clean_1_sparse',
                                           'result_table',
                                           'row_num',
                                           'col_num',
                                           'val',
                                           '{0}',
                                           '{1}',
                                           5);
            SELECT madlib.pca_sparse_project('adclick_clean_1_sparse',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'col_num',
                                      'val',
                                      '{0}',
                                      '{1}',
                                      'residual_table',
                                      'result_summary_table')
        """.format(*shape)
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]

    runTimes.loc[:, ['rows', 'cols']] = shape

    path = '../output/madlib_{}{}_sparse.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    # Columns from position 3 onward are time1..time5.
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)