Пример #1
0
def main(kwargs):
    """Time one MADlib workload and append the timings to a results file.

    The workload is described as an expression string and handed to
    ``utils.timeOp`` together with the environment it should be evaluated in
    and an optional cleanup expression. One row
    (nodes, rows, cols, time1..time5) is appended to
    ``../output/madlib_tall_<opType><nodes>.txt``; the header is written only
    when that file does not exist yet.

    Args:
        kwargs: dict-like of run options. Keys read here:
            opType: workload to time; one of 'logit', 'gnmf', 'reg',
                'robust'.
            nodes: node count; stored in the output row and, through
                ``int()``, embedded in the output file name.
            xTableName: table whose shape is queried and which is passed to
                every workload.
            yTableName: second table, passed to the 'logit' and 'reg'
                workloads.
            ('savestub' was read but never used and is now ignored.)

    Raises:
        NotImplementedError: if ``opType`` is not a supported workload. This
            is checked before the database connection is opened.
    """
    opType = kwargs.get('opType')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    # Expression strings timed by utils.timeOp; the names inside them are
    # resolved against `env` below.
    # NOTE(review): the literal 10 is passed straight through to do_gnmf --
    # presumably the factorization rank; confirm against do_gnmf.
    calls = {
        'logit': 'do_logit(x_table_name, y_table_name, shape, cxn)',
        'gnmf': 'do_gnmf(x_table_name, shape, 10, cxn)',
        'reg': 'do_reg(x_table_name, y_table_name, cxn)',
        'robust': 'do_robust(x_table_name, cxn)',
    }
    # Fail fast: previously an unknown opType only surfaced later as a
    # NameError on `call`, after the connection had already been opened.
    if opType not in calls:
        raise NotImplementedError('Invalid Operation')
    call = calls[opType]

    # Parenthesised single-argument form prints identically on Python 2.
    print('Evaluating: {}'.format(opType))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)

    env = {'x_table_name': x_table_name,
           'y_table_name': y_table_name,
           'do_logit': do_logit,
           'do_gnmf': do_gnmf,
           'do_reg': do_reg,
           'do_robust': do_robust,
           'shape': shape,
           'cxn': cxn}

    cleanup = None
    if opType == 'reg':
        # Drops the tables named here after the timed call.
        # NOTE(review): this expression relies on map() being eager
        # (Python 2); under Python 3 the DROPs would never execute.
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # Instead of fitting a regression and materialising squared
        # residuals, a random column vector R2 is generated to stand in for
        # them.
        cxn.execute('DROP TABLE IF EXISTS R2')
        cxn.randomMatrix(shape[0], 1, 'R2')

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_tall_{}{}.txt'.format(opType, int(nodes))

    # .loc/.iloc replace DataFrame.ix, which was removed in pandas 1.0.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    runTimes.iloc[:, 3:] = utils.timeOp(call, env, cleanup)

    # Results are appended, so the header is emitted only for a new file.
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Пример #2
0
        elif mtype == 'tall':
            k = int(np.ceil((gb * 1e9) / float(8 * 100)))
            m = 100
            rows = 2**14
        elif mtype == 'wide':
            k = 100
            m = int(np.ceil((gb * 1e9) / float(8 * 100)))
            rows = 1

        stub = '_' + mtype
        fmt = (gb_stub, stub)
        data.gen_data_disk('../output/M{}{}.csv'.format(*fmt), k, m, rows)
        if ((mtype == 'wide')
                and (not cxn.table_exists('M{}{}'.format(*fmt)))):
            print 'CREATING MATRIX: M{}{}'.format(*fmt)
            cxn.randomMatrix(k, m, 'M{}{}'.format(*fmt))
        if mtype != 'tall':
            continue
        mpath = os.path.abspath('../output/M{}{}_sparse.mtx'.format(*fmt))
        data.gen_data_disk('../output/y{}{}.csv'.format(*fmt), k, 1, rows,
                           True)
        utils.link_if_not('../output/M{}{}.csv'.format(*fmt),
                          '../output/N{}{}.csv'.format(*fmt))
        utils.link_if_not('../output/M{}{}.csv.mtd'.format(*fmt),
                          '../output/N{}{}.csv.mtd'.format(*fmt))

# Full '../output/<name>' paths for every directory entry except the
# .gitignore placeholder and any entry whose name contains '.log' or '.mtd'.
# A generator expression inside list() keeps the loop variable out of module
# scope while still yielding a list.
paths = list(
    os.path.join('../output', entry)
    for entry in os.listdir('../output')
    if not (entry == '.gitignore' or '.log' in entry or '.mtd' in entry)
)
Пример #3
0
def doMatrixOp(kwargs):
    """Time a MADlib matrix primitive over a range of matrix sizes.

    For every size in ``nrows`` the input table(s) are created if missing,
    the operation is timed through ``cxn.time`` and one row
    (rows, time1..time5) is appended to a results file under ``../output``.
    The header is written only when that file does not exist yet.

    Args:
        kwargs: dict-like of run options. Keys read here:
            opType: one of 'TRANS', 'NORM', 'GMM', 'MVM', 'TSM', 'ADD'.
            mattype: used only in the output file name when ``nproc`` is
                not given.
            fixedAxis: int-convertible size of the dimension held constant
                (the column count of M, or its row count for 'GMM').
            nrows: whitespace-separated integers; the sizes to sweep over.
            nproc: optional. When given, a database is started through
                ``start_gpdb`` on ``GPDB_PORT_MAP[nproc]``, its shutdown is
                registered with ``atexit``, and ``nproc`` is what gets
                written to the 'rows' column.

    Raises:
        NotImplementedError: if ``opType`` is not a supported operation.
            This is checked before any database is started or table created.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    fixedAxis = int(kwargs.get('fixedAxis'))
    # split() without an argument tolerates repeated or trailing whitespace.
    nrow_scale = [int(token) for token in kwargs['nrows'].split()]
    nproc = kwargs.get('nproc')

    # opType -> (MADlib call template, tables handed to cxn.time as cleanup).
    # One table serves as both the list of valid operations and the SQL to
    # run for each of them.
    templates = {
        'TRANS': ("matrix_trans('{M}',NULL,'Mt',NULL)", ['Mt']),
        'NORM': ("matrix_norm('{M}',NULL,'fro')", []),
        'GMM': ("matrix_mult('{M}',NULL,'{N}',NULL,'MN',NULL)", ['MN']),
        'MVM': ("matrix_vec_mult('{M}',NULL,({vec}))", []),
        'TSM': ("matrix_mult('{M}','trans=True','{M}',NULL,'MtM',NULL)",
                ['MtM']),
        'ADD': ("matrix_add('{M}',NULL,'{N}',NULL,'M_N',NULL)", ['M_N']),
    }
    # Fail fast: previously an invalid opType was rejected only inside the
    # loop, after the database was up and the first matrix had been created.
    if opType not in templates:
        raise NotImplementedError('Invalid Operation')
    template, temp_tables = templates[opType]

    if nproc is not None:
        cxn = start_gpdb(GPDB_PORT_MAP[nproc], nproc)
        cxn.execute('DROP TABLE IF EXISTS M16_tall')
        atexit.register(stop_gpdb, nproc, cxn)
        path = os.path.join('..', 'output',
                            'madlib_cpu_{}_scale.txt'.format(opType))
    else:
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
        path = os.path.join('..', 'output',
                            'madlib_{}_{}.txt'.format(mattype, opType))

    colnames = ['rows', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    for nr in nrow_scale:
        # For GMM the swept size is the shared inner dimension; for every
        # other operation it is the row count.
        if opType == 'GMM':
            nrow, ncol = fixedAxis, nr
        else:
            nrow, ncol = nr, fixedAxis
        # Parenthesised single-argument form prints identically on Python 2.
        print(nrow)
        print(ncol)

        # NOTE(review): the dimensions are concatenated without a separator,
        # so distinct shapes can map to the same table name (1x11 and 11x1
        # are both 'M111') and an existing table of the other shape would be
        # reused. Names are kept as-is because other scripts may depend on
        # them.
        Mname = 'M{}{}'.format(nrow, ncol)
        if not cxn.table_exists(Mname):
            cxn.randomMatrix(nrow, ncol, Mname)

        # Second operand: transposed shape for GMM, same shape for ADD.
        Nname = None
        if opType == 'GMM':
            n_shape = (ncol, nrow)
        elif opType == 'ADD':
            n_shape = (nrow, ncol)
        else:
            n_shape = None
        if n_shape is not None:
            Nname = 'N{}{}'.format(*n_shape)
            if not cxn.table_exists(Nname):
                cxn.randomMatrix(n_shape[0], n_shape[1], Nname)

        # Sub-select that builds a random vector of length ncol; only the
        # MVM template references it.
        array_call = ('SELECT array_agg(random()) '
                      'FROM generate_series(1,{})'.format(ncol))
        call = template.format(M=Mname, N=Nname, vec=array_call)
        # Fresh list per iteration, as before, in case cxn.time mutates it.
        cleanup = list(temp_tables)

        sql_call = 'SELECT madlib.{}'.format(call)
        # .loc/.iloc replace DataFrame.ix, which was removed in pandas 1.0.
        runTimes.loc[:, 'rows'] = nr if nproc is None else nproc
        runTimes.iloc[:, 1:] = cxn.time(sql_call, cleanup)
        # Results are appended, so the header is emitted only for a new file.
        writeHeader = not os.path.exists(path)
        runTimes.to_csv(path, index=False, header=writeHeader, mode='a')