Пример #1
0
def main():
    """Create the derived adclick tables that do not exist yet.

    The first command-line argument is the suffix inserted into the source
    table name ``adclick_clean{suffix}_dense``.  Three tables are considered,
    in this order, and each one is built only when it is missing:

    * ``adclick_clean_vectors_split`` -- ``row_num``, the first element of
      ``val`` cast to INTEGER as ``y``, and the remaining elements as the
      NUMERIC array ``indep_vars``.
    * ``adclick_clean_indepvars_long`` -- one ``(row_num, col_num, val)`` row
      per element of ``indep_vars``.
    * ``adclick_clean_y`` -- ``y`` in the same layout with ``col_num`` fixed
      to 1.
    """
    suffix = sys.argv[1]
    db = SQLCxn(timeout=None, username='******', db='ubuntu')

    def split_sql():
        # The source table's shape is looked up only when this statement is
        # actually needed; its second entry bounds the array slice.
        dims = db.get_shape('adclick_clean{}_dense'.format(suffix))
        template = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """
        return template.format(dims[1], suffix)

    def long_sql():
        return """
            CREATE TABLE adclick_clean_indepvars_long AS (
                SELECT row_num, ix AS col_num, indep_vars[ix] AS val
                  FROM (
                    SELECT *, GENERATE_SUBSCRIPTS(indep_vars, 1) AS ix
                      FROM adclick_clean_vectors_split
                  ) tmp
            ) DISTRIBUTED BY (row_num, col_num)
        """

    def y_sql():
        return """
            CREATE TABLE adclick_clean_y AS (
                SELECT row_num, 1 AS col_num, y AS val
                  FROM adclick_clean_vectors_split
            ) DISTRIBUTED BY (row_num)
        """

    # Order matters: the last two tables select from the first one.
    builders = (
        ('adclick_clean_vectors_split', split_sql),
        ('adclick_clean_indepvars_long', long_sql),
        ('adclick_clean_y', y_sql),
    )
    for table, build_sql in builders:
        if db.table_exists(table):
            continue
        db.execute(build_sql())
Пример #2
0
    # NOTE(review): this is the body of an enclosing scope that is not visible
    # here; `fmt`, `k`, `sr`, `sparse_gb`, `cxn`, `data` and `utils` come from
    # it.

    # Absolute output paths for the tall and wide sparse .mtx files.
    mpath_tall = os.path.abspath(
        '../output/M{}{}_sparse_tall.mtx'.format(*fmt))
    mpath_wide = os.path.abspath(
        '../output/M{}{}_sparse_wide.mtx'.format(*fmt))
    # Tall is requested as k x 100 and wide as 100 x k; `sr` is presumably the
    # sparsity ratio -- confirm against data.gen_data_sparse.
    data.gen_data_sparse(k, 100, sr, 'M{}{}_sparse_tall'.format(*fmt),
                         mpath_tall)
    data.gen_data_sparse(100, k, sr, 'M{}{}_sparse_wide'.format(*fmt),
                         mpath_wide)
    # Companion y file requested as k x 1; the meaning of the trailing
    # `k, True` arguments is not visible here -- TODO confirm.
    data.gen_data_disk('../output/y{}_sparse.csv'.format(sparse_gb), k, 1, k,
                       True)
    # Expose the tall M matrix under an N-prefixed name through a view.
    stmt = """
        CREATE VIEW N{0}{1}_sparse_tall AS (
            SELECT * FROM M{0}{1}_sparse_tall
        )
    """.format(*fmt)
    # Only create the view when nothing with that name exists yet.
    if not cxn.table_exists('N{}{}_sparse_tall'.format(*fmt)):
        cxn.execute(stmt)

    # Mirror the same M -> N aliasing for the on-disk .mtx file; presumably
    # creates a link only if the target is absent -- verify in utils.
    utils.link_if_not('../output/M{}{}_sparse_tall.mtx'.format(*fmt),
                      '../output/N{}{}_sparse_tall.mtx'.format(*fmt))

# Hand the y CSV to the connection under the table name y{sparse_gb}_sparse
# (presumably loads it into the database -- confirm in SQLCxn).
cxn.load_dense_matrix('../output/y{}_sparse.csv'.format(sparse_gb),
                      'y{}_sparse'.format(sparse_gb))

# List ../output, dropping '.gitignore' and every entry whose name contains
# '.log' or '.mtd', then prefix each remaining name with the directory.
paths = os.listdir('../output')
paths = filter(
    lambda x: (x != '.gitignore') and ('.log' not in x) and ('.mtd' not in x),
    paths)
paths = map(lambda x: os.path.join('../output', x), paths)

with open('manifest.txt') as fh:
Пример #3
0
            m = k
            rows = 2**12
        elif mtype == 'tall':
            k = int(np.ceil((gb * 1e9) / float(8 * 100)))
            m = 100
            rows = 2**14
        elif mtype == 'wide':
            k = 100
            m = int(np.ceil((gb * 1e9) / float(8 * 100)))
            rows = 1

        # NOTE(review): loop-body fragment; `mtype`, `gb_stub`, `k`, `m`,
        # `rows`, `cxn`, `data` and `utils` come from the enclosing scope,
        # which is not visible here.
        stub = '_' + mtype
        fmt = (gb_stub, stub)
        # Request the k x m matrix CSV; `rows` is presumably a chunk size for
        # the writer -- confirm against data.gen_data_disk.
        data.gen_data_disk('../output/M{}{}.csv'.format(*fmt), k, m, rows)
        # Only the 'wide' matrix is also created inside the database, and only
        # when no table of that name exists yet.
        if ((mtype == 'wide')
                and (not cxn.table_exists('M{}{}'.format(*fmt)))):
            print 'CREATING MATRIX: M{}{}'.format(*fmt)
            cxn.randomMatrix(k, m, 'M{}{}'.format(*fmt))
        # Everything below applies to the 'tall' matrix only.
        if mtype != 'tall':
            continue
        # NOTE(review): mpath is not used anywhere in the visible code.
        mpath = os.path.abspath('../output/M{}{}_sparse.mtx'.format(*fmt))
        # Companion y file requested as k x 1; the meaning of the trailing
        # True is not visible here -- TODO confirm.
        data.gen_data_disk('../output/y{}{}.csv'.format(*fmt), k, 1, rows,
                           True)
        # Alias the M CSV and its .mtd companion under N-prefixed names;
        # presumably creates a link only if the target is absent -- verify in
        # utils.
        utils.link_if_not('../output/M{}{}.csv'.format(*fmt),
                          '../output/N{}{}.csv'.format(*fmt))
        utils.link_if_not('../output/M{}{}.csv.mtd'.format(*fmt),
                          '../output/N{}{}.csv.mtd'.format(*fmt))

paths = os.listdir('../output')
paths = filter(
    lambda x: (x != '.gitignore') and ('.log' not in x) and ('.mtd' not in x),
Пример #4
0
def main(kwargs):
    """Time one MADlib routine on the dense adclick table and save the result.

    Args:
        kwargs: Mapping with the keys

            * ``'opType'`` -- ``'logit'``, ``'reg'`` or ``'pca'``.
            * ``'nodes'`` -- node count; stored in the output row and, after
              ``int()``, embedded in the output file name.
            * ``'stub'`` -- suffix inserted into the source table name
              ``adclick_clean{stub}_dense``.

    Raises:
        NotImplementedError: If ``opType`` is not one of the supported values.

    The values returned by ``cxn.time`` are written, together with the node
    count and the source table's shape, to
    ``../output/madlib_{opType}{nodes}_dense.txt``.
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    cxn = SQLCxn(username='******', db='ubuntu')
    shape = cxn.get_shape_dense('adclick_clean{}_dense'.format(stub))
    if not cxn.table_exists('adclick_clean_vectors_split'):
        # Split each dense row into the label (first element) and the
        # remaining elements as the feature array.
        stmt = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        cxn.execute(stmt)

    # Pick the statement to time and the tables it leaves behind.
    if op_type == 'logit':
        call = """
            SELECT madlib.logregr_train('adclick_clean_vectors_split',
                                        'adclick_logit',
                                        'y', 'indep_vars', NULL,
                                        3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        call = """
            SELECT madlib.linregr_train('adclick_clean_vectors_split',
                                        'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        # PCA runs on the feature columns only, so materialise them once.
        stmt = """
            CREATE TABLE adclick_clean_depvars AS (
                SELECT row_num, val[2:{}]::NUMERIC[] val
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        if not cxn.table_exists('adclick_clean_depvars'):
            cxn.execute(stmt)
        call = """
            SELECT madlib.pca_train('adclick_clean_depvars',
                                    'result_table',
                                    'row_num',
                                    5);
            SELECT madlib.pca_project('adclick_clean_depvars',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'residual_table',
                                      'result_summary_table')
        """
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]
    else:
        # Without this branch an unknown opType surfaced later as an unbound
        # `call`/`cleanup` NameError.
        raise NotImplementedError('Invalid Operation')

    # Remove stale outputs of an earlier run.  Driving the drops from
    # `cleanup` keeps both lists in sync; the hand-written list previously
    # dropped the misspelled 'adlick_prj' and left 'adclick_prj' in place.
    for table in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(table))

    # .loc/.iloc replace the .ix indexer, which pandas 1.0 removed.
    runTimes.loc[:, ['rows', 'cols']] = shape

    path = '../output/madlib_{}{}_dense.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    # Columns from position 3 onward are time1..time5.
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)
Пример #5
0
def doMatrixOp(kwargs):
    """Time one MADlib matrix operation for each requested matrix size.

    Args:
        kwargs: Mapping with the keys

            * ``'opType'`` -- ``'TRANS'``, ``'NORM'``, ``'GMM'``, ``'MVM'``,
              ``'TSM'`` or ``'ADD'``.
            * ``'mattype'`` -- label used in the output file name when
              ``'nproc'`` is absent.
            * ``'fixedAxis'`` -- size of the axis that is not scaled.
            * ``'nrows'`` -- space-separated integer sizes for the scaled
              axis.
            * ``'nproc'`` -- optional.  When present the connection comes
              from ``start_gpdb`` (with ``stop_gpdb`` registered to run at
              exit) and results go to the CPU-scaling output file.

    Raises:
        NotImplementedError: If ``opType`` is not a supported operation.

    One row per size is appended to the output file under ``../output``; the
    header is written only when the file does not exist yet.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    fixedAxis = int(kwargs.get('fixedAxis'))
    nrow_scale = [int(x) for x in kwargs['nrows'].split(' ')]
    nproc = kwargs.get('nproc')

    if nproc is not None:
        cxn = start_gpdb(GPDB_PORT_MAP[nproc], nproc)
        cxn.execute('DROP TABLE IF EXISTS M16_tall')
        atexit.register(stop_gpdb, nproc, cxn)
    else:
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = ['rows', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    if nproc is None:
        path = os.path.join('..', 'output',
                            'madlib_{}_{}.txt'.format(mattype, opType))
    else:
        path = os.path.join('..', 'output',
                            'madlib_cpu_{}_scale.txt'.format(opType))

    for nr in nrow_scale:
        # GMM scales the column count of M; every other operation scales the
        # row count.
        nrow = fixedAxis if opType == 'GMM' else nr
        ncol = nr if opType == 'GMM' else fixedAxis
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(nrow)
        print(ncol)

        Mname = 'M{}{}'.format(nrow, ncol)
        # (name, rows, cols) of the second operand, for operations needing one.
        second = None
        cleanup = []

        # Build the call first so an unsupported opType is rejected before
        # any random matrix is generated.
        if opType == 'TRANS':
            call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
            cleanup.append('Mt')
        elif opType == 'NORM':
            call = "matrix_norm('{}',NULL,'fro')".format(Mname)
        elif opType == 'GMM':
            second = ('N{}{}'.format(ncol, nrow), ncol, nrow)
            call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(
                Mname, second[0])
            cleanup.append('MN')
        elif opType == 'MVM':
            array_call = 'SELECT array_agg(random()) FROM generate_series(1,{})'.format(
                ncol)
            call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
        elif opType == 'TSM':
            call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(
                Mname)
            cleanup.append('MtM')
        elif opType == 'ADD':
            second = ('N{}{}'.format(nrow, ncol), nrow, ncol)
            call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(
                Mname, second[0])
            cleanup.append('M_N')
        else:
            raise NotImplementedError('Invalid Operation')

        # Create the operand tables only when they are missing.
        if not cxn.table_exists(Mname):
            cxn.randomMatrix(nrow, ncol, Mname)
        if second is not None:
            Nname, n_rows, n_cols = second
            if not cxn.table_exists(Nname):
                cxn.randomMatrix(n_rows, n_cols, Nname)

        sql_call = 'SELECT madlib.{}'.format(call)
        # .loc/.iloc replace the .ix indexer, which pandas 1.0 removed.
        # The 'rows' column holds nproc instead of the size in CPU-scaling
        # runs.
        runTimes.loc[:, 'rows'] = nr if nproc is None else nproc
        runTimes.iloc[:, 1:] = cxn.time(sql_call, cleanup)
        writeHeader = not os.path.exists(path)
        runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Пример #6
0
def main(kwargs):
    """Time one MADlib routine on the sparse adclick table and save the result.

    Args:
        kwargs: Mapping with the keys

            * ``'opType'`` -- ``'logit'``, ``'reg'`` or ``'pca'``.
            * ``'nodes'`` -- node count; stored in the output row and, after
              ``int()``, embedded in the output file name.
            * ``'stub'`` -- read but not otherwise used by this function.

    Raises:
        NotImplementedError: If ``opType`` is not one of the supported values.

    The values returned by ``cxn.time`` are written, together with the node
    count and the shape of ``adclick_clean_1_sparse``, to
    ``../output/madlib_{opType}{nodes}_sparse.txt``.
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    shape = cxn.get_shape('adclick_clean_1_sparse')
    if not cxn.table_exists('adclick_clean_1_vectors_sparse'):
        # Collapse the (row, col, val) triples into one sparse vector per row
        # and attach the label from adclick_clean_y.
        stmt = """
        CREATE TABLE adclick_clean_1_vectors_sparse AS (
            SELECT x.row_num, madlib.svec_cast_positions_float8arr(
               ARRAY_AGG(x.col_num), ARRAY_AGG(x.val), {}, 0.0
               ) AS indep_vars, y.val AS y
             FROM adclick_clean_1_sparse x
            INNER JOIN adclick_clean_y y ON x.row_num = y.row_num
            GROUP BY x.row_num, y.val
        ) DISTRIBUTED BY (row_num)
        """.format(shape[1])
        cxn.execute(stmt)

    # Pick the statement to time and the tables it leaves behind.
    if op_type == 'logit':
        call = """
            SELECT madlib.logregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_logit',
                                        'y', 'indep_vars', NULL,
                                        3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        call = """
            SELECT madlib.linregr_train('adclick_clean_1_vectors_sparse',
                                        'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        call = """
            SELECT madlib.pca_sparse_train('adclick_clean_1_sparse',
                                           'result_table',
                                           'row_num',
                                           'col_num',
                                           'val',
                                           '{0}',
                                           '{1}',
                                           5);
            SELECT madlib.pca_sparse_project('adclick_clean_1_sparse',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'col_num',
                                      'val',
                                      '{0}',
                                      '{1}',
                                      'residual_table',
                                      'result_summary_table')
        """.format(*shape)
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]
    else:
        # Without this branch an unknown opType surfaced later as an unbound
        # `call`/`cleanup` NameError.
        raise NotImplementedError('Invalid Operation')

    # Remove stale outputs of an earlier run.  Driving the drops from
    # `cleanup` keeps both lists in sync; the hand-written list previously
    # dropped the misspelled 'adlick_prj' and left 'adclick_prj' in place.
    for table in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(table))

    # .loc/.iloc replace the .ix indexer, which pandas 1.0 removed.
    runTimes.loc[:, ['rows', 'cols']] = shape

    path = '../output/madlib_{}{}_sparse.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    # Columns from position 3 onward are time1..time5.
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)