Exemplo n.º 1
0
def doMatrixOp(kwargs):
    """Time the MADlib SVD call on a dense matrix table and append the result.

    Args:
        kwargs: dict of run options. Keys read here:
            'opType'    -- operation to evaluate; only 'SVD' is supported.
            'mattype'   -- label embedded in the output file name.
            'tableStub' -- suffix of the matrix table name ('M<tableStub>');
                           converted to int when it parses as one.
            'nodes'     -- node count; stored in the result row and used
                           (via int()) in the output file name.
            'outdir'    -- subdirectory of ../output that receives the file.

    Returns:
        None. One row (nodes, rows, cols, time1..time5) is appended to
        '../output/<outdir>/madlib_<mattype>_<opType><nodes>.txt'. Nothing is
        written when the timing call returns None.

    Raises:
        NotImplementedError: if opType is not 'SVD'. This is raised before
            any database connection is opened.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')

    # Numeric stubs are normalised through int(); non-numeric stubs are
    # used verbatim in the table name.
    try:
        tableStub = int(tableStub)
    except ValueError:
        pass

    Mname = 'M{}'.format(tableStub)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Evaluating: {}'.format(opType))

    # Reject unsupported operations before touching the database.
    if opType != 'SVD':
        raise NotImplementedError('Invalid Operation')

    call = "svd('{}','svd','row_num',10, 10,'svd_summary')".format(Mname)
    # Tables the SVD call is expected to leave behind.
    cleanup = ['svd_s', 'svd_u', 'svd_v', 'svd_summary']

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)
    shape = cxn.get_shape_dense(Mname)

    # Drop leftovers from earlier runs before issuing the call.
    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))

    sql_call = 'SELECT madlib.{}'.format(call)
    rows = shape[0]
    cols = shape[1]
    path = '../output/{}/madlib_{}_{}{}.txt'.format(outdir, mattype, opType,
                                                    int(nodes))
    # .loc/.iloc replace the removed DataFrame.ix indexer.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    # NOTE(review): the tuple is handed to cxn.time as its third argument;
    # presumably a (file, key) pair used to record timeouts -- confirm
    # against SQLCxn.
    madlib_timeout = ('../temp/madlib_punked_out.json', opType)
    res = cxn.time(sql_call, cleanup, madlib_timeout)
    if res is None:
        print('Timed Out')
        return
    runTimes.iloc[:, 3:] = res
    # Emit the CSV header only when the file is being created.
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
Exemplo n.º 2
0
def main(kwargs):
    """Time one MADlib ML routine on the adclick dense table and save timings.

    Args:
        kwargs: dict with the required keys
            'opType' -- 'logit' (madlib.logregr_train), 'reg'
                        (madlib.linregr_train) or 'pca' (madlib.pca_train
                        followed by madlib.pca_project).
            'nodes'  -- node count; stored in the result row and used
                        (via int()) in the output file name.
            'stub'   -- infix of the source table name
                        ('adclick_clean<stub>_dense').

    Returns:
        None. A single row (nodes, rows, cols, time1..time5) is written to
        '../output/madlib_<opType><nodes>_dense.txt', replacing any
        existing file.

    Raises:
        KeyError: if a required key is missing from kwargs.
        NotImplementedError: if opType is not one of the supported values.
            This is raised before any database connection is opened.
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']

    # Reject unknown operations up front rather than after the database
    # has already been modified.
    if op_type not in ('logit', 'reg', 'pca'):
        raise NotImplementedError('Invalid Operation')

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    cxn = SQLCxn(username='******', db='ubuntu')
    shape = cxn.get_shape_dense('adclick_clean{}_dense'.format(stub))

    # Split the dense vector into a label column (first element) and the
    # remaining elements as the independent variables; built once and reused.
    if not cxn.table_exists('adclick_clean_vectors_split'):
        stmt = """
            CREATE TABLE adclick_clean_vectors_split AS (
                SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        cxn.execute(stmt)

    if op_type == 'logit':
        call = """
            SELECT madlib.logregr_train('adclick_clean_vectors_split',
                                        'adclick_logit',
                                        'y', 'indep_vars', NULL,
                                        3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        call = """
            SELECT madlib.linregr_train('adclick_clean_vectors_split',
                                        'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    else:  # op_type == 'pca'
        # PCA runs on the feature columns only, without the label.
        if not cxn.table_exists('adclick_clean_depvars'):
            stmt = """
            CREATE TABLE adclick_clean_depvars AS (
                SELECT row_num, val[2:{}]::NUMERIC[] val
                  FROM adclick_clean{}_dense
            ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
            cxn.execute(stmt)
        call = """
            SELECT madlib.pca_train('adclick_clean_depvars',
                                    'result_table',
                                    'row_num',
                                    5);
            SELECT madlib.pca_project('adclick_clean_depvars',
                                      'result_table',
                                      'adclick_prj',
                                      'row_num',
                                      'residual_table',
                                      'result_summary_table')
        """
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]

    # Drop leftovers from earlier runs. Using the cleanup list keeps the
    # dropped names identical to the tables the call produces.
    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))

    # .loc/.iloc replace the removed DataFrame.ix indexer.
    runTimes.loc[:, ['rows', 'cols']] = shape

    path = '../output/madlib_{}{}_dense.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)
Exemplo n.º 3
0
def doMatrixOp(kwargs):
    """Time one MADlib dense matrix operation and append the result to a file.

    Args:
        kwargs: dict of run options. Keys read here:
            'opType'    -- one of 'TRANS' (matrix_trans), 'NORM'
                           (matrix_norm, 'fro'), 'GMM' (matrix_mult of M
                           with its 'tall' counterpart), 'MVM'
                           (matrix_vec_mult with a random vector), 'TSM'
                           (matrix_mult of M transposed with M) or 'ADD'
                           (matrix_add of M and N).
            'mattype'   -- label embedded in the output file name.
            'tableStub' -- suffix of the table names ('M<tableStub>',
                           'N<tableStub>'); converted to int when it
                           parses as one.
            'nodes'     -- node count; stored in the result row and used
                           (via int()) in the output file name.
            'outdir'    -- subdirectory of ../output that receives the file.

    Returns:
        None. One row (nodes, rows, cols, time1..time5) is appended to
        '../output/<outdir>/madlib_<mattype>_<opType><nodes>.txt'. Nothing is
        written when the timing call returns None.

    Raises:
        NotImplementedError: if opType is not a supported operation. This is
            raised before any database connection is opened.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')

    # Numeric stubs are normalised through int(); non-numeric stubs are
    # used verbatim in the table names.
    try:
        tableStub = int(tableStub)
    except ValueError:
        pass

    Mname = 'M{}'.format(tableStub)
    Nname = 'N{}'.format(tableStub)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('Evaluating: {}'.format(opType))

    # Reject unsupported operations before touching the database.
    supported = ('TRANS', 'NORM', 'GMM', 'MVM', 'TSM', 'ADD')
    if opType not in supported:
        raise NotImplementedError('Invalid Operation')

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))), columns=colnames)

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    shape = cxn.get_shape_dense(Mname)

    # Output tables created by the chosen operation; dropped before the run
    # and handed to cxn.time.
    cleanup = []
    if opType == 'TRANS':
        call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
        cleanup.append('Mt')
    elif opType == 'NORM':
        call = "matrix_norm('{}',NULL,'fro')".format(Mname)
    elif opType == 'GMM':
        # The right-hand operand is the table whose name has 'wide'
        # swapped for 'tall'.
        Nname = Mname.replace('wide', 'tall')
        call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(
            Mname, Nname)
        cleanup.append('MN')
    elif opType == 'MVM':
        # Random vector with one entry per matrix column, generated in SQL.
        array_call = 'SELECT array_agg(random()) FROM generate_series(1,{})'.format(
            shape[1])
        call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
    elif opType == 'TSM':
        call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(
            Mname)
        cleanup.append('MtM')
    elif opType == 'ADD':
        call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(
            Mname, Nname)
        cleanup.append('M_N')
    else:
        # Unreachable while `supported` matches the branches above; kept as
        # a guard in case the two drift apart.
        raise NotImplementedError('Invalid Operation')

    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))

    sql_call = 'SELECT madlib.{}'.format(call)
    rows = shape[0]
    cols = shape[1]
    path = '../output/{}/madlib_{}_{}{}.txt'.format(outdir, mattype, opType,
                                                    int(nodes))
    # .loc/.iloc replace the removed DataFrame.ix indexer.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    # NOTE(review): the tuple is handed to cxn.time as its third argument;
    # presumably a (file, key) pair used to record timeouts -- confirm
    # against SQLCxn.
    madlib_timeout = ('../temp/madlib_punked_out.json', opType)
    res = cxn.time(sql_call, cleanup, madlib_timeout)
    if res is None:
        print('Timed Out')
        return
    runTimes.iloc[:, 3:] = res
    # Emit the CSV header only when the file is being created.
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')