def doMatrixOp(kwargs):
    """Time a MADlib matrix operation (currently only SVD) and log results.

    Expected kwargs: opType, mattype, tableStub, savestub, nodes, outdir.
    Runs the operation five times via cxn.time() and appends one CSV row
    (nodes, rows, cols, time1..time5) to ../output/<outdir>/.
    Returns None; prints 'Timed Out' and exits early if the call times out.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')
    # table stubs may be numeric ('16') or symbolic ('_wide'); keep either
    try:
        tableStub = int(tableStub)
    except ValueError:
        pass
    Mname = 'M{}'.format(tableStub)
    print('Evaluating: {}'.format(opType))
    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames
    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)
    shape = cxn.get_shape_dense(Mname)
    cleanup = []
    if opType == 'SVD':
        # NOTE(review): the format string has a single placeholder, so the
        # second argument (shape[1]) is silently ignored — presumably one of
        # the hard-coded 10s was meant to be shape[1]; behavior kept as-is.
        call = "svd('{}','svd','row_num',10, 10,'svd_summary')".format(
            Mname, shape[1])
        cleanup.extend(['svd_s', 'svd_u', 'svd_v', 'svd_summary'])
    else:
        raise NotImplementedError('Invalid Operation')
    # drop any leftovers from a previous run before timing
    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))
    sql_call = 'SELECT madlib.{}'.format(call)
    rows = shape[0]
    cols = shape[1]
    path = '../output/{}/madlib_{}_{}{}.txt'.format(
        outdir, mattype, opType, int(nodes))
    # .loc/.iloc replace the long-deprecated .ix (removed in pandas 1.0)
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    madlib_timeout = ('../temp/madlib_punked_out.json', opType)
    res = cxn.time(sql_call, cleanup, madlib_timeout)
    if res is None:
        print('Timed Out')
        return
    runTimes.iloc[:, 3:] = res
    # only write the CSV header the first time this file is created
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def doMatrixOp(kwargs):
    """Time a MADlib dense matrix operator and append timings to a CSV.

    Expected kwargs: opType (TRANS|NORM|GMM|MVM|TSM|ADD), mattype,
    tableStub, nodes, outdir, sr (sparsity/scale tag recorded in output).
    Appends one row (nodes, sr, time1..time5) to ../output/<outdir>/.
    Returns None; prints 'Timed Out' and exits early on timeout.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    tableStub = kwargs.get('tableStub')
    nodes = kwargs.get('nodes')
    outdir = kwargs.get('outdir')
    sr = kwargs.get('sr')
    # table stubs may be numeric or symbolic; keep either form
    try:
        tableStub = int(tableStub)
    except ValueError:
        pass
    Mname = 'M{}'.format(tableStub)
    # NOTE(review): Nname deliberately mirrors Mname here (ADD adds the
    # matrix to itself); for GMM the operand is the 'tall' variant —
    # confirm this matches the table-naming scheme used at load time.
    Nname = 'M{}'.format(tableStub)
    if opType == 'GMM':
        Nname = Mname.replace('wide', 'tall')
    print('Evaluating: {}'.format(opType))
    colnames = ['nodes', 'sr', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames
    # nodes/sr may be non-numeric labels, so store those columns as objects
    runTimes['nodes'] = runTimes['nodes'].astype('O')
    runTimes['sr'] = runTimes['sr'].astype('O')
    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    cleanup = []
    if opType == 'TRANS':
        call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
        cleanup.append('Mt')
    elif opType == 'NORM':
        call = "matrix_norm('{}',NULL,'fro')".format(Mname)
    elif opType == 'GMM':
        call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(
            Mname, Nname)
        cleanup.append('MN')
    elif opType == 'MVM':
        array_call = 'SELECT array_agg(random()) FROM generate_series(1,100)'
        call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
        cleanup.append('Mw')
    elif opType == 'TSM':
        call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(
            Mname)
        cleanup.append('MtM')
    elif opType == 'ADD':
        call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(
            Mname, Nname)
        cleanup.append('M_N')
    else:
        raise NotImplementedError('Invalid Operation')
    # drop leftovers from a previous run before timing
    for obj in cleanup:
        cxn.execute('DROP TABLE IF EXISTS {}'.format(obj))
    sql_call = 'SELECT madlib.{}'.format(call)
    fmt = (outdir, mattype, opType, nodes)
    path = '../output/{}/madlib_{}_{}{}.txt'.format(*fmt)
    res = cxn.time(sql_call, cleanup)
    if res is None:
        print('Timed Out')
        return
    # .loc/.iloc replace the long-deprecated .ix (removed in pandas 1.0)
    runTimes.loc[:, 'nodes'] = nodes
    runTimes.loc[:, 'sr'] = sr
    runTimes.iloc[:, 2:] = res
    # only write the CSV header the first time this file is created
    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def doMatrixOp(kwargs):
    """Benchmark a MADlib matrix operator while scaling one matrix axis.

    Expected kwargs: opType (TRANS|NORM|GMM|MVM|TSM|ADD), mattype,
    fixedAxis (int), nrows (space-separated ints for the scaled axis),
    and optionally nproc (GPDB segment count for CPU-scaling runs — when
    set, an ad-hoc GPDB instance is started and registered for teardown).
    Appends one CSV row per size (rows, time1..time5) under ../output/.
    """
    opType = kwargs.get('opType')
    mattype = kwargs.get('mattype')
    fixedAxis = int(kwargs.get('fixedAxis'))
    # list comprehension is eager on both Python 2 and 3 (py3 map() is lazy)
    nrow_scale = [int(x) for x in kwargs['nrows'].split(' ')]
    nproc = kwargs.get('nproc')
    port = GPDB_PORT_MAP[nproc] if nproc is not None else None
    if nproc is not None:
        cxn = start_gpdb(port, nproc)
        cxn.execute('DROP TABLE IF EXISTS M16_tall')
        # guarantee the ad-hoc GPDB instance is stopped on interpreter exit
        atexit.register(stop_gpdb, nproc, cxn)
    else:
        cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    colnames = ['rows', 'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames
    if nproc is None:
        path = os.path.join(
            '..', 'output', 'madlib_{}_{}.txt'.format(mattype, opType))
    else:
        path = os.path.join(
            '..', 'output', 'madlib_cpu_{}_scale.txt'.format(opType))
    for nr in nrow_scale:
        # GMM scales the column count; every other op scales the row count
        nrow = fixedAxis if opType == 'GMM' else nr
        ncol = nr if opType == 'GMM' else fixedAxis
        print(nrow)
        print(ncol)
        Mname = 'M{}{}'.format(nrow, ncol)
        # materialize operand tables on demand so re-runs are cheap
        if not cxn.table_exists(Mname):
            cxn.randomMatrix(nrow, ncol, Mname)
        if opType == 'GMM':
            # GMM needs a conformable (transposed-shape) right operand
            if not cxn.table_exists('N{}{}'.format(ncol, nrow)):
                cxn.randomMatrix(ncol, nrow, 'N{}{}'.format(ncol, nrow))
            Nname = 'N{}{}'.format(ncol, nrow)
        elif opType == 'ADD':
            if not cxn.table_exists('N{}{}'.format(nrow, ncol)):
                cxn.randomMatrix(nrow, ncol, 'N{}{}'.format(nrow, ncol))
            Nname = 'N{}{}'.format(nrow, ncol)
        cleanup = []
        if opType == 'TRANS':
            call = "matrix_trans('{}',NULL,'Mt',NULL)".format(Mname)
            cleanup.append('Mt')
        elif opType == 'NORM':
            call = "matrix_norm('{}',NULL,'fro')".format(Mname)
        elif opType == 'GMM':
            call = "matrix_mult('{}',NULL,'{}',NULL,'MN',NULL)".format(
                Mname, Nname)
            cleanup.append('MN')
        elif opType == 'MVM':
            array_call = 'SELECT array_agg(random()) FROM generate_series(1,{})'.format(
                ncol)
            call = "matrix_vec_mult('{}',NULL,({}))".format(Mname, array_call)
        elif opType == 'TSM':
            call = "matrix_mult('{0}','trans=True','{0}',NULL,'MtM',NULL)".format(
                Mname)
            cleanup.append('MtM')
        elif opType == 'ADD':
            call = "matrix_add('{}',NULL,'{}',NULL,'M_N',NULL)".format(
                Mname, Nname)
            cleanup.append('M_N')
        else:
            raise NotImplementedError('Invalid Operation')
        sql_call = 'SELECT madlib.{}'.format(call)
        # for CPU-scaling runs the 'rows' column records the segment count
        runTimes.loc[:, 'rows'] = nr if nproc is None else nproc
        runTimes.iloc[:, 1:] = cxn.time(sql_call, cleanup)
        # only write the CSV header the first time this file is created
        writeHeader = not os.path.exists(path)
        runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def main(kwargs):
    """Time a MADlib ML pipeline (logit | reg | pca) on dense adclick data.

    Expected kwargs: opType, nodes, stub. Creates the preprocessed input
    table on first use, runs the chosen pipeline five times via cxn.time(),
    and writes one CSV row (nodes, rows, cols, time1..time5) to ../output/.
    Raises NotImplementedError for an unknown opType.
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']
    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames
    cxn = SQLCxn(username='******', db='ubuntu')
    shape = cxn.get_shape_dense('adclick_clean{}_dense'.format(stub))
    # need to do a bit of preprocessing: split the dense value array into a
    # label column (y = val[1]) and a feature-vector column (indep_vars)
    if not cxn.table_exists('adclick_clean_vectors_split'):
        stmt = """
        CREATE TABLE adclick_clean_vectors_split AS (
            SELECT row_num, val[1]::INTEGER y, val[2:{}]::NUMERIC[] indep_vars
              FROM adclick_clean{}_dense
        ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        cxn.execute(stmt)
    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
        SELECT madlib.logregr_train('adclick_clean_vectors_split',
            'adclick_logit', 'y', 'indep_vars', NULL, 3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
        SELECT madlib.linregr_train('adclick_clean_vectors_split',
            'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        # BUG FIX: was 'adlick_prj' (typo) — the actual projection table
        # 'adclick_prj' was never dropped before pca_project ran
        cxn.execute('DROP TABLE IF EXISTS adclick_prj')
        stmt = """
        CREATE TABLE adclick_clean_depvars AS (
            SELECT row_num, val[2:{}]::NUMERIC[] val
              FROM adclick_clean{}_dense
        ) DISTRIBUTED BY (row_num)
        """.format(shape[1], stub)
        if not cxn.table_exists('adclick_clean_depvars'):
            cxn.execute(stmt)
        call = """
        SELECT madlib.pca_train('adclick_clean_depvars',
            'result_table', 'row_num', 5);
        SELECT madlib.pca_project('adclick_clean_depvars',
            'result_table', 'adclick_prj', 'row_num',
            'residual_table', 'result_summary_table')
        """
        cleanup = ['result_table', 'result_table_mean', 'residual_table',
                   'result_summary_table', 'adclick_prj']
    else:
        # fail loudly rather than hitting a NameError on `call` below
        raise NotImplementedError('Invalid Operation')
    # .loc/.iloc replace the long-deprecated .ix (removed in pandas 1.0)
    runTimes.loc[:, ['rows', 'cols']] = shape
    path = '../output/madlib_{}{}_dense.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)
def main(kwargs):
    """Time a MADlib ML pipeline (logit | reg | pca) on sparse adclick data.

    Expected kwargs: opType, nodes, stub. Builds the svec feature table on
    first use, runs the chosen pipeline five times via cxn.time(), and
    writes one CSV row (nodes, rows, cols, time1..time5) to ../output/.
    Raises NotImplementedError for an unknown opType.
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']
    stub = kwargs['stub']
    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames
    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    shape = cxn.get_shape('adclick_clean_1_sparse')
    # build a sparse-vector (svec) feature table joined to the labels,
    # padding missing positions with 0.0 up to the full column count
    if not cxn.table_exists('adclick_clean_1_vectors_sparse'):
        stmt = """
        CREATE TABLE adclick_clean_1_vectors_sparse AS (
            SELECT x.row_num,
                   madlib.svec_cast_positions_float8arr(
                       ARRAY_AGG(x.col_num), ARRAY_AGG(x.val), {}, 0.0
                   ) AS indep_vars,
                   y.val AS y
              FROM adclick_clean_1_sparse x
             INNER JOIN adclick_clean_y y ON x.row_num = y.row_num
             GROUP BY x.row_num, y.val
        ) DISTRIBUTED BY (row_num)
        """.format(shape[1])
        cxn.execute(stmt)
    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
        SELECT madlib.logregr_train('adclick_clean_1_vectors_sparse',
            'adclick_logit', 'y', 'indep_vars', NULL, 3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
        SELECT madlib.linregr_train('adclick_clean_1_vectors_sparse',
            'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    elif op_type == 'pca':
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        # BUG FIX: was 'adlick_prj' (typo) — the actual projection table
        # 'adclick_prj' was never dropped before pca_sparse_project ran
        cxn.execute('DROP TABLE IF EXISTS adclick_prj')
        call = """
        SELECT madlib.pca_sparse_train('adclick_clean_1_sparse',
            'result_table', 'row_num', 'col_num', 'val',
            '{0}', '{1}', 5);
        SELECT madlib.pca_sparse_project('adclick_clean_1_sparse',
            'result_table', 'adclick_prj', 'row_num', 'col_num', 'val',
            '{0}', '{1}', 'residual_table', 'result_summary_table')
        """.format(*shape)
        cleanup = ['result_table', 'result_table_mean', 'residual_table',
                   'result_summary_table', 'adclick_prj']
    else:
        # fail loudly rather than hitting a NameError on `call` below
        raise NotImplementedError('Invalid Operation')
    # .loc/.iloc replace the long-deprecated .ix (removed in pandas 1.0)
    runTimes.loc[:, ['rows', 'cols']] = shape
    path = '../output/madlib_{}{}_sparse.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)