def main():
    """Create the derived adclick tables that do not exist yet.

    Reads the table-name stub from the first command-line argument and, for
    each of the three derived tables below, issues a CREATE TABLE ... AS
    statement only when ``cxn.table_exists`` reports the table is missing:

    * ``adclick_clean_vectors_split``  -- y / indep_vars split of the dense table
    * ``adclick_clean_indepvars_long`` -- one row per (row_num, col_num, val)
    * ``adclick_clean_y``              -- the response as a single column
    """
    suffix = sys.argv[1]
    cxn = SQLCxn(timeout=None, username='******', db='ubuntu')

    def is_missing(table_name):
        return not cxn.table_exists(table_name)

    if is_missing('adclick_clean_vectors_split'):
        # The upper bound of the array slice comes from the second entry of
        # the dense table's shape.
        dense_shape = cxn.get_shape('adclick_clean{}_dense'.format(suffix))
        upper = dense_shape[1]
        split_sql = """
        CREATE TABLE adclick_clean_vectors_split AS (
            SELECT row_num,
                   val[1]::INTEGER y,
                   val[2:{}]::NUMERIC[] indep_vars
              FROM adclick_clean{}_dense
        ) DISTRIBUTED BY (row_num)
        """.format(upper, suffix)
        cxn.execute(split_sql)

    if is_missing('adclick_clean_indepvars_long'):
        long_sql = """
        CREATE TABLE adclick_clean_indepvars_long AS (
            SELECT row_num, ix AS col_num, indep_vars[ix] AS val
              FROM (
                SELECT *, GENERATE_SUBSCRIPTS(indep_vars, 1) AS ix
                  FROM adclick_clean_vectors_split
              ) tmp
        ) DISTRIBUTED BY (row_num, col_num)
        """
        cxn.execute(long_sql)

    if is_missing('adclick_clean_y'):
        y_sql = """
        CREATE TABLE adclick_clean_y AS (
            SELECT row_num, 1 AS col_num, y AS val
              FROM adclick_clean_vectors_split
        ) DISTRIBUTED BY (row_num)
        """
        cxn.execute(y_sql)
def main(kwargs):
    """Time one operator on the adclick tables and append the timings to a CSV.

    Args:
        kwargs: mapping of options. Keys read here:
            ``opType``     -- 'logit', 'gnmf', 'reg' or 'robust'.
            ``nodes``      -- recorded in the output row; ``int(nodes)`` is
                              also embedded in the output file name.
            ``xTableName`` -- table whose shape is queried and which is
                              handed to the operator.
            ``yTableName`` -- response table (used by 'logit' and 'reg').
            ``savestub`` may be present but is not used.

    Raises:
        ValueError: if ``opType`` is not one of the supported operators.
            Checked before any database work, instead of failing later with
            a NameError on the unbound ``call``.

    Side effects:
        Appends one row to ``../output/madlib_adclick_<opType><nodes>.txt``;
        the header is written only when that file does not exist yet.
    """
    opType = kwargs.get('opType')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    # Single-argument print(...) is also valid as a Python 2 print statement.
    print('Evaluating: {}'.format(opType))

    supported = ('logit', 'gnmf', 'reg', 'robust')
    if opType not in supported:
        raise ValueError('Unsupported opType {!r}; expected one of {}'.format(
            opType, ', '.join(supported)))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)

    # NOTE(review): presumably utils.timeOp evaluates the `call` / `cleanup`
    # strings with these names in scope -- confirm against utils.timeOp.
    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_reg': do_reg,
        'do_gnmf': do_gnmf,
        'do_robust': do_robust,
        'shape': shape,
        'cxn': cxn
    }

    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        # Drops the intermediate tables named in the list between runs.
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    else:  # opType == 'robust' (validated above)
        # Rebuild R2 as a random column with one entry per row of X.
        cxn.execute('DROP TABLE IF EXISTS R2 CASCADE')
        cxn.execute(
            "SELECT MADLIB.matrix_random({},1,NULL,'uniform','R2',NULL)".
            format(shape[0]))
        cxn.execute('ALTER TABLE R2 RENAME COLUMN ROW TO ROW_NUM')
        call = 'do_robust(x_table_name, cxn)'

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))

    # .loc/.iloc replace the removed .ix indexer: label-based for the named
    # columns, positional for the time1..time5 columns.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print(res)
    runTimes.iloc[:, 3:] = res

    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def main(kwargs):
    """Time one operator on a tall matrix and append the timings to a CSV.

    Args:
        kwargs: mapping of options. Keys read here:
            ``opType``     -- 'logit', 'gnmf', 'reg' or 'robust'.
            ``nodes``      -- recorded in the output row; ``int(nodes)`` is
                              also embedded in the output file name.
            ``xTableName`` -- table whose shape is queried and which is
                              handed to the operator.
            ``yTableName`` -- response table (used by 'logit' and 'reg').
            ``savestub`` may be present but is not used.

    Raises:
        ValueError: if ``opType`` is not one of the supported operators.
            Checked before any database work, instead of failing later with
            a NameError on the unbound ``call``.

    Side effects:
        Appends one row to ``../output/madlib_tall_<opType><nodes>.txt``;
        the header is written only when that file does not exist yet.
    """
    opType = kwargs.get('opType')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    # Single-argument print(...) is also valid as a Python 2 print statement.
    print('Evaluating: {}'.format(opType))

    supported = ('logit', 'gnmf', 'reg', 'robust')
    if opType not in supported:
        raise ValueError('Unsupported opType {!r}; expected one of {}'.format(
            opType, ', '.join(supported)))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)

    colnames = ['nodes', 'rows', 'cols',
                'time1', 'time2', 'time3', 'time4', 'time5']
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)

    # NOTE(review): presumably utils.timeOp evaluates the `call` / `cleanup`
    # strings with these names in scope -- confirm against utils.timeOp.
    env = {'x_table_name': x_table_name,
           'y_table_name': y_table_name,
           'do_logit': do_logit,
           'do_gnmf': do_gnmf,
           'do_reg': do_reg,
           'do_robust': do_robust,
           'shape': shape,
           'cxn': cxn}

    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        # Drops the intermediate tables named in the list between runs.
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    else:  # opType == 'robust' (validated above)
        # We can just generate a vector of residuals on the fly rather than
        # computing them explicitly. (The earlier approach ran do_reg,
        # multiplied X by B into Y_HAT and stored the squared differences
        # from y in R2.)
        cxn.execute('DROP TABLE IF EXISTS R2')
        cxn.randomMatrix(shape[0], 1, 'R2')
        call = 'do_robust(x_table_name, cxn)'

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_tall_{}{}.txt'.format(opType, int(nodes))

    # .loc/.iloc replace the removed .ix indexer: label-based for the named
    # columns, positional for the time1..time5 columns.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    runTimes.iloc[:, 3:] = res

    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def main(kwargs):
    """Time one operator on the adclick tables and append the timings to a CSV.

    Args:
        kwargs: mapping of options. Keys read here:
            ``opType``     -- 'logit', 'gnmf', 'reg', 'robust', or 'pca'
                              (which only prints 'Not Implemented').
            ``nodes``      -- recorded in the output row; ``int(nodes)`` is
                              also embedded in the output file name.
            ``xTableName`` -- table whose shape is queried and which is
                              handed to the operator.
            ``yTableName`` -- response table.
            ``savestub`` may be present but is not used.

    Raises:
        ValueError: if ``opType`` is not one of the values listed above.
            Checked before any database work, instead of failing later with
            a NameError on the unbound ``call``.

    Side effects:
        Appends one row to ``../output/madlib_adclick_<opType><nodes>.txt``;
        the header is written only when that file does not exist yet.
    """
    opType = kwargs.get('opType')
    nodes = kwargs.get('nodes')
    x_table_name = kwargs.get('xTableName')
    y_table_name = kwargs.get('yTableName')

    # Single-argument print(...) is also valid as a Python 2 print statement.
    print('Evaluating: {}'.format(opType))

    supported = ('logit', 'gnmf', 'reg', 'robust', 'pca')
    if opType not in supported:
        raise ValueError('Unsupported opType {!r}; expected one of {}'.format(
            opType, ', '.join(supported)))

    cxn = SQLCxn(username='******', db='ubuntu', timeout=2000)

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    shape = cxn.get_shape(x_table_name)

    # NOTE(review): presumably utils.timeOp evaluates the `call` / `cleanup`
    # strings with these names in scope -- confirm against utils.timeOp.
    env = {
        'x_table_name': x_table_name,
        'y_table_name': y_table_name,
        'do_logit': do_logit,
        'do_reg': do_reg,
        'shape': shape,
        'cxn': cxn
    }

    cleanup = None
    if opType == 'logit':
        call = 'do_logit(x_table_name, y_table_name, shape, cxn)'
    elif opType == 'gnmf':
        # The call string names do_gnmf, so it has to be in env as well.
        # Added here rather than in the literal above so the other opTypes
        # do not depend on do_gnmf being defined.
        env['do_gnmf'] = do_gnmf
        call = 'do_gnmf(x_table_name, shape, 10, cxn)'
    elif opType == 'reg':
        call = 'do_reg(x_table_name, y_table_name, cxn)'
        # Drops the intermediate tables named in the list between runs.
        cleanup = ("map(lambda x: cxn.execute("
                   "'DROP TABLE {}'.format(x)), ['XTX','XTY','XTX_INV','B'])")
    elif opType == 'robust':
        # Fit the regression first; the SQL below multiplies X by table 'B'
        # into Y_HAT and stores the squared differences from y in R2.
        do_reg(x_table_name, y_table_name, cxn)
        preproc = """
        DROP TABLE IF EXISTS Y_HAT;
        SELECT madlib.matrix_mult('{X}',NULL,'B',NULL,'Y_HAT');
        CREATE TABLE R2 AS (
            SELECT {y}.row_num, ARRAY[POW({y}.val[1]-y_hat.val[1],2)] val
              FROM {y}
             INNER JOIN y_hat ON {y}.row_num = y_hat.row_num
        ) DISTRIBUTED BY (row_num)
        """.format(X=x_table_name, y=y_table_name)
        cxn.execute(preproc)
        # Same reasoning as for do_gnmf: the call string names do_robust.
        env['do_robust'] = do_robust
        call = 'do_robust(x_table_name, cxn)'
    else:  # opType == 'pca' (validated above)
        print('Not Implemented')
        return

    rows = shape[0]
    cols = shape[1]
    path = '../output/madlib_adclick_{}{}.txt'.format(opType, int(nodes))

    # .loc/.iloc replace the removed .ix indexer: label-based for the named
    # columns, positional for the time1..time5 columns.
    runTimes.loc[:, ['nodes', 'rows', 'cols']] = (nodes, rows, cols)
    res = utils.timeOp(call, env, cleanup)
    print(res)
    runTimes.iloc[:, 3:] = res

    writeHeader = not os.path.exists(path)
    runTimes.to_csv(path, index=False, header=writeHeader, mode='a')
def main(kwargs):
    """Time a MADlib routine on the sparse adclick data and write the timings.

    Args:
        kwargs: mapping of options. Keys read here:
            ``opType`` -- 'logit', 'reg' or 'pca'.
            ``nodes``  -- recorded in the output row; ``int(nodes)`` is also
                          embedded in the output file name.
            A ``stub`` entry may be present but is not used.

    Raises:
        KeyError: if ``opType`` or ``nodes`` is missing.
        ValueError: if ``opType`` is not one of the supported operators.
            Checked before any database work, instead of failing later with
            a NameError on the unbound ``call`` / ``cleanup``.

    Side effects:
        Creates ``adclick_clean_1_vectors_sparse`` when it is missing, drops
        the output tables of the selected routine, and writes the timing row
        to ``../output/madlib_<opType><nodes>_sparse.txt`` using to_csv's
        default write mode.
    """
    op_type = kwargs['opType']
    nodes = kwargs['nodes']

    supported = ('logit', 'reg', 'pca')
    if op_type not in supported:
        raise ValueError('Unsupported opType {!r}; expected one of {}'.format(
            op_type, ', '.join(supported)))

    colnames = [
        'nodes', 'rows', 'cols', 'time1', 'time2', 'time3', 'time4', 'time5'
    ]
    runTimes = pd.DataFrame(np.zeros((1, len(colnames))))
    runTimes.columns = colnames

    cxn = SQLCxn(username='******', db='ubuntu', timeout=10000)
    shape = cxn.get_shape('adclick_clean_1_sparse')

    if not cxn.table_exists('adclick_clean_1_vectors_sparse'):
        # One row per row_num: the (col_num, val) entries are aggregated
        # into a sparse vector and joined with the response from
        # adclick_clean_y.
        stmt = """
        CREATE TABLE adclick_clean_1_vectors_sparse AS (
            SELECT x.row_num,
                   madlib.svec_cast_positions_float8arr(
                       ARRAY_AGG(x.col_num), ARRAY_AGG(x.val), {}, 0.0
                   ) AS indep_vars,
                   y.val AS y
              FROM adclick_clean_1_sparse x
             INNER JOIN adclick_clean_y y ON x.row_num = y.row_num
             GROUP BY x.row_num, y.val
        ) DISTRIBUTED BY (row_num)
        """.format(shape[1])
        cxn.execute(stmt)

    if op_type == 'logit':
        cxn.execute('DROP TABLE IF EXISTS adclick_logit_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_logit')
        call = """
        SELECT madlib.logregr_train('adclick_clean_1_vectors_sparse',
                                    'adclick_logit', 'y', 'indep_vars',
                                    NULL, 3, 'igd', .000001)
        """
        cleanup = ['adclick_logit_summary', 'adclick_logit']
    elif op_type == 'reg':
        cxn.execute('DROP TABLE IF EXISTS adclick_reg_summary')
        cxn.execute('DROP TABLE IF EXISTS adclick_reg')
        call = """
        SELECT madlib.linregr_train('adclick_clean_1_vectors_sparse',
                                    'adclick_reg', 'y', 'indep_vars')
        """
        cleanup = ['adclick_reg_summary', 'adclick_reg']
    else:  # op_type == 'pca' (validated above)
        cxn.execute('DROP TABLE IF EXISTS result_table')
        cxn.execute('DROP TABLE IF EXISTS result_table_mean')
        cxn.execute('DROP TABLE IF EXISTS residual_table')
        cxn.execute('DROP TABLE IF EXISTS result_summary_table')
        # Fixed table name: this previously dropped the misspelled
        # 'adlick_prj', leaving a stale 'adclick_prj' (the projection output
        # named below) in place.
        cxn.execute('DROP TABLE IF EXISTS adclick_prj')
        call = """
        SELECT madlib.pca_sparse_train('adclick_clean_1_sparse',
                                       'result_table',
                                       'row_num', 'col_num', 'val',
                                       '{0}', '{1}', 5);
        SELECT madlib.pca_sparse_project('adclick_clean_1_sparse',
                                         'result_table',
                                         'adclick_prj',
                                         'row_num', 'col_num', 'val',
                                         '{0}', '{1}',
                                         'residual_table',
                                         'result_summary_table')
        """.format(*shape)
        cleanup = [
            'result_table', 'result_table_mean', 'residual_table',
            'result_summary_table', 'adclick_prj'
        ]

    # .loc/.iloc replace the removed .ix indexer: label-based for the named
    # columns, positional for the time1..time5 columns.
    runTimes.loc[:, ['rows', 'cols']] = shape
    path = '../output/madlib_{}{}_sparse.txt'.format(op_type, int(nodes))
    runTimes.loc[:, 'nodes'] = nodes
    # NOTE(review): presumably cxn.time runs `call` repeatedly and drops the
    # `cleanup` tables between runs -- confirm against SQLCxn.time.
    res = cxn.time(call, cleanup)
    runTimes.iloc[:, 3:] = res
    runTimes.to_csv(path, index=False)