import argparse
import logging
import logging.config
import os
import sys
import time

import ConfigParser  # Python 2 stdlib, matching the ConfigParser.RawConfigParser usage below
import numpy as np
from pyspark import SparkConf, SparkContext

# Project-local helpers (Projections, Sampling, RowMatrix, RandLeastSquares,
# pickle_load, pickle_write, print_params) are assumed to be importable from the
# package's own modules; their module paths are not shown in this excerpt.

logger = logging.getLogger(__name__)  # module-level logger used by comp_sketch (assumed; main() reconfigures logging via fileConfig)


def comp_sketch(matrix, objective, load_N=False, save_N=False, N_dir='../N_file/', **kwargs):
    """
    Given a matrix A, compute a sketch PA and perform further operations on PA.
    Return the desired quantity together with the total running time.

    Parameters:
        matrix: a RowMatrix object storing the matrix [A b]
        objective: either 'x' or 'N'
            'x': return the solution to the problem min_x || PA[:,:-1]x - PA[:,-1] ||_2
            'N': return a square matrix N such that PA[:,:-1]*inv(N) has orthonormal columns
        load_N: load precomputed N matrices if possible
            (this reduces the actual running time for sampling sketches)
        save_N: save the computed N matrices for future use
        sketch_type: either 'projection' or 'sampling'
        projection_type: cw, gaussian, rademacher or srdht
        c: projection size
        s: sampling size (for sampling sketches only)
        k: number of independent trials to run
    """
    sketch_type = kwargs.get('sketch_type')

    if not os.path.exists(N_dir):
        os.makedirs(N_dir)

    if objective == 'x':
        if sketch_type == 'projection':
            projection = Projections(**kwargs)
            t = time.time()
            x = projection.execute(matrix, 'x', save_N)
            t = time.time() - t
            if save_N:
                logger.info('Saving N matrices from projections!')
                N = [a[0] for a in x]
                x = [a[1] for a in x]
                # saving N
                filename = (N_dir + 'N_' + matrix.name + '_projection_' + kwargs.get('projection_type')
                            + '_c' + str(int(kwargs.get('c'))) + '_k' + str(int(kwargs.get('k'))) + '.dat')
                data = {'N': N, 'time': t}
                pickle_write(filename, data)

        elif sketch_type == 'sampling':
            s = kwargs.get('s')
            new_N_proj = 0
            N_proj_filename = (N_dir + 'N_' + matrix.name + '_projection_' + kwargs.get('projection_type')
                               + '_c' + str(int(kwargs.get('c'))) + '_k' + str(int(kwargs.get('k'))) + '.dat')
            if load_N and os.path.isfile(N_proj_filename):
                logger.info('Found N matrices from projections, loading them!')
                result = pickle_load(N_proj_filename)
                N_proj = result['N']
                t_proj = result['time']
            else:
                # otherwise, compute them
                t = time.time()
                projection = Projections(**kwargs)
                N_proj = projection.execute(matrix, 'N')
                t_proj = time.time() - t
                new_N_proj = 1

            sampling = Sampling(N=N_proj)
            t = time.time()
            x = sampling.execute(matrix, 'x', s, save_N)
            t = time.time() - t + t_proj

            if save_N and new_N_proj:
                logger.info('Saving N matrices from projections!')
                data = {'N': N_proj, 'time': t_proj}
                pickle_write(N_proj_filename, data)

            if save_N:
                logger.info('Saving N matrices from sampling!')
                N = [a[0] for a in x]
                x = [a[1] for a in x]
                filename = (N_dir + 'N_' + matrix.name + '_sampling_s' + str(int(kwargs.get('s')))
                            + '_' + kwargs.get('projection_type') + '_c' + str(int(kwargs.get('c')))
                            + '_k' + str(int(kwargs.get('k'))) + '.dat')
                data = {'N': N, 'time': t}
                pickle_write(filename, data)

        else:
            raise ValueError('Please enter a valid sketch type!')

        return x, t

    elif objective == 'N':
        if sketch_type == 'projection':
            N_proj_filename = (N_dir + 'N_' + matrix.name + '_projection_' + kwargs.get('projection_type')
                               + '_c' + str(int(kwargs.get('c'))) + '_k' + str(int(kwargs.get('k'))) + '.dat')
            if load_N and os.path.isfile(N_proj_filename):
                logger.info('Found N matrices from projections, loading them!')
                result = pickle_load(N_proj_filename)
                N = result['N']
                t = result['time']
            else:
                t = time.time()
                projection = Projections(**kwargs)
                N = projection.execute(matrix, 'N')
                t = time.time() - t
                if save_N:
                    logger.info('Saving N matrices from projections!')
                    data = {'N': N, 'time': t}
                    pickle_write(N_proj_filename, data)

        elif sketch_type == 'sampling':
            s = kwargs.get('s')
            new_N_proj = 0
            new_N_samp = 0
            N_samp_filename = (N_dir + 'N_' + matrix.name + '_sampling_s' + str(int(kwargs.get('s')))
                               + '_' + kwargs.get('projection_type') + '_c' + str(int(kwargs.get('c')))
                               + '_k' + str(int(kwargs.get('k'))) + '.dat')
            N_proj_filename = (N_dir + 'N_' + matrix.name + '_projection_' + kwargs.get('projection_type')
                               + '_c' + str(int(kwargs.get('c'))) + '_k' + str(int(kwargs.get('k'))) + '.dat')

            if load_N and os.path.isfile(N_samp_filename):
                logger.info('Found N matrices from sampling, loading them!')
                result = pickle_load(N_samp_filename)
                N = result['N']
                t = result['time']
            elif load_N and os.path.isfile(N_proj_filename):
                logger.info('Found N matrices from projections, loading them!')
                result = pickle_load(N_proj_filename)
                N_proj = result['N']
                t_proj = result['time']
                sampling = Sampling(N=N_proj)
                t = time.time()
                N = sampling.execute(matrix, 'N', s)
                t = time.time() - t + t_proj
                new_N_samp = 1
            else:
                t = time.time()
                projection = Projections(**kwargs)
                N_proj = projection.execute(matrix, 'N')
                t_proj = time.time() - t
                new_N_proj = 1
                t = time.time()
                sampling = Sampling(N=N_proj)
                N = sampling.execute(matrix, 'N', s)
                t = time.time() - t + t_proj
                new_N_samp = 1

            if save_N and new_N_proj:
                logger.info('Saving N matrices from projections!')
                data = {'N': N_proj, 'time': t_proj}
                pickle_write(N_proj_filename, data)

            if save_N and new_N_samp:
                logger.info('Saving N matrices from sampling!')
                data = {'N': N, 'time': t}
                pickle_write(N_samp_filename, data)

        else:
            raise ValueError('Please enter a valid sketch type!')

        return N, t

    else:
        raise ValueError('Please enter a valid objective!')
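# Usage sketch for comp_sketch (illustrative only): it assumes a live SparkContext
# `sc`, the project's RowMatrix class as constructed in main() below, and a
# hypothetical local file '../data/mydata_Ab.txt' holding rows of [A b].
# The dataset name, sizes and sketch parameters are placeholders, not values used
# elsewhere in this code. The function is never called here.
def _example_comp_sketch_usage(sc):
    A = np.loadtxt('../data/mydata_Ab.txt')          # rows of [A b]
    Ab_rdd = sc.parallelize(A.tolist(), 280)
    # name, row count and column count of the loaded [A b] block; cache=True
    matrix_Ab = RowMatrix(Ab_rdd, 'mydata', A.shape[0], A.shape[1], True)

    # Low-precision solutions from a Gaussian projection sketch of size c=500,
    # repeated over k=3 independent trials.
    x, t_x = comp_sketch(matrix_Ab, 'x',
                         sketch_type='projection', projection_type='gaussian',
                         c=500, k=3)

    # A matrix N such that PA[:, :-1]*inv(N) has orthonormal columns, from a
    # sampling sketch of size s=2000; as in the sampling branch above, a CW
    # projection is computed first to drive the sampling, and precomputed N
    # matrices are reused/saved under N_dir.
    N, t_N = comp_sketch(matrix_Ab, 'N', load_N=True, save_N=True,
                         sketch_type='sampling', projection_type='cw',
                         c=500, s=2000, k=3)
    return x, N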
def main(argv):
    # TO-DO: put these in a configuration file
    data_dire = '../data/'
    hdfs_dire = 'data/'
    logs_dire = 'file://' + os.path.dirname(os.path.abspath(__file__)) + '/../log/'

    logging.config.fileConfig('logging.conf', disable_existing_loggers=False)  # setting up the logger
    logger = logging.getLogger('')  # using root

    # setting up the parser
    parser = argparse.ArgumentParser(description='Getting parameters.', prog='run_ls.sh')
    parser.add_argument('dataset', type=str,
                        help='dataset_Ab.txt stores the input matrix to run the least-squares solver on; '
                             'dataset.txt stores the original matrix (only needed for -t)')
    parser.add_argument('--dims', metavar=('m', 'n'), type=int, nargs=2, required=True,
                        help='size of the input matrix')
    parser.add_argument('--nrepetitions', metavar='numRepetitions', default=1, type=int,
                        help='number of times to stack the matrix vertically in order to generate large matrices')
    parser.add_argument('--npartitions', metavar='numPartitions', default=280, type=int,
                        help='number of partitions in Spark')
    parser.add_argument('-c', '--cache', action='store_true', help='cache the dataset in Spark')
    parser.add_argument('--hdfs', dest='file_source', default='local', action='store_const', const='hdfs',
                        help='load dataset from HDFS (default: loading local files)')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--low-precision', dest='solver_type', default='low_precision', action='store_const',
                       const='low_precision', help='use low-precision solver')
    group.add_argument('--high_precision', dest='solver_type', default='low_precision', action='store_const',
                       const='high_precision', help='use high-precision solver')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--projection', dest='sketch_type', action='store_const', const='projection',
                       help='compute sketch by projection')
    group.add_argument('--sampling', dest='sketch_type', action='store_const', const='sampling',
                       help='compute sketch by sampling')

    parser.add_argument('-p', dest='projection_type', default='gaussian',
                        choices=('cw', 'gaussian', 'rademacher', 'srdht'),
                        help='underlying projection type')
    parser.add_argument('-r', metavar='projectionSize', required=True, type=int, help='sketch size')
    parser.add_argument('-s', metavar='samplingSize', type=int, help='sampling size (for sampling sketch only)')
    parser.add_argument('-q', '--niters', metavar='numIters', dest='q', type=int,
                        help='number of iterations in LSQR')
    parser.add_argument('-k', '--ntrials', metavar='numTrials', dest='k', default=1, type=int,
                        help='number of independent trials to run')
    parser.add_argument('-t', '--test', action='store_true', help='compute accuracies of the returned solutions')
    parser.add_argument('--save_logs', action='store_true', help='save Spark logs')
    parser.add_argument('--output_filename', default='ls.out',
                        help='filename of the output file (default: ls.out)')
    parser.add_argument('--load_N', action='store_true', help='load N')
    parser.add_argument('--save_N', action='store_true', help='save N')
    parser.add_argument('--debug', action='store_true', help='debug mode')

    if len(argv) > 0 and argv[0] == 'print_help':
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    (m, n) = args.dims

    # validating
    if m < n:
        raise ValueError('Number of rows ({0}) should be greater than number of columns ({1})'.format(m, n))
    if args.sketch_type == 'sampling' and args.s is None:
        raise ValueError('Please enter a sampling size!')
    if args.solver_type == 'high_precision' and args.q is None:
        raise ValueError('Please enter the number of iterations!')
    if args.solver_type == 'low_precision' and args.sketch_type is None:
        raise ValueError('Please specify a sketch method for the low-precision solver!')

    print_params(args, logger)  # print parameters

    # instantiating a Spark instance
    if args.save_logs:
        conf = SparkConf().set('spark.eventLog.enabled', 'true').set('spark.eventLog.dir', logs_dire)
    else:
        conf = SparkConf()
    sc = SparkContext(appName='ls_exp', conf=conf)

    if args.file_source == 'hdfs':
        Ab_rdd = sc.textFile(hdfs_dire + args.dataset, args.npartitions)  # loading dataset from HDFS
    else:
        A = np.loadtxt(data_dire + args.dataset + '.txt')  # loading dataset from local disk
        Ab_rdd = sc.parallelize(A.tolist(), args.npartitions)

    # creating a RowMatrix instance
    matrix_Ab = RowMatrix(Ab_rdd, args.dataset, m, n + 1, args.cache, repnum=args.nrepetitions)

    ls = RandLeastSquares(matrix_Ab, solver_type=args.solver_type, sketch_type=args.sketch_type,
                          projection_type=args.projection_type, c=args.r, s=args.s,
                          num_iters=args.q, k=args.k)
    ls.fit(args.load_N, args.save_N, args.debug)  # solving the problem

    result = {'time': ls.time, 'x': ls.x}
    pickle_write('../result/' + args.output_filename, result)  # saving results
    logger.info('Total time elapsed:{0}'.format(ls.time))

    if args.test:
        # only need to load these in the test mode
        if os.path.isfile(data_dire + args.dataset + '_x_opt.txt'):
            logger.info('Found precomputed optimal solutions!')
            x_opt = np.loadtxt(data_dire + args.dataset + '_x_opt.txt')
            f_opt = np.loadtxt(data_dire + args.dataset + '_f_opt.txt')
        else:
            logger.info('Computing optimal solutions!')
            Ab = np.array(matrix_Ab.rdd_original.values().collect())  # might not be accurate; needs to be checked
            A = Ab[:, :-1]
            b = Ab[:, -1]
            x_opt = np.linalg.lstsq(A, b)[0]
            f_opt = np.linalg.norm(np.dot(A, x_opt) - b)
        rx, rf = ls.comp_relerr(x_opt, f_opt)
        logger.info('Median of the relative error on solution vector is:{0}'.format(rx))
        logger.info('Median of the relative error on objective value is:{0}'.format(rf))
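# Hypothetical invocation of the main() above. Since main() takes an argv list,
# it can be driven programmatically with the same tokens that run_ls.sh would
# pass on the command line; the dataset name, dimensions and sketch parameters
# below are placeholders. The function is never called here.
def _example_main_invocation():
    # main(['print_help'])  # prints the usage message and exits
    main(['mydata',
          '--dims', '1000000', '50',
          '--npartitions', '280',
          '--cache',
          '--projection', '-p', 'gaussian', '-r', '500',
          '-k', '3',
          '--test'])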
def main(argv):
    parser = argparse.ArgumentParser(description='Getting parameters.', prog='run_ls.sh')
    parser.add_argument('dataset', type=str,
                        help='dataset_Ab.txt stores the input matrix to run the least-squares solver on; '
                             'dataset.txt stores the original matrix (only needed for -t)')
    parser.add_argument('--dims', metavar=('m', 'n'), type=int, nargs=2, required=True,
                        help='size of the input matrix')
    parser.add_argument('--nrepetitions', metavar='numRepetitions', default=1, type=int,
                        help='number of times to stack the matrix vertically in order to generate large matrices')
    parser.add_argument('--stack', metavar='stackType', dest='stack_type', type=int, default=1, help='stack type')
    parser.add_argument('--npartitions', metavar='numPartitions', default=280, type=int,
                        help='number of partitions in Spark')
    parser.add_argument('--setting_filename', metavar='settingConfFilename', default='conf/settings.cfg', type=str,
                        help='name of the configuration file storing the settings')
    parser.add_argument('--logging_filename', metavar='loggingConfFilename', default='conf/logging.cfg', type=str,
                        help='configuration file for Python logging')
    parser.add_argument('-c', '--cache', action='store_true', help='cache the dataset in Spark')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--hdfs', dest='file_source', default='local', action='store_const', const='hdfs',
                       help='load dataset from HDFS (default: loading local files)')
    group.add_argument('--s3', dest='file_source', default='local', action='store_const', const='s3',
                       help='load dataset from Amazon S3 (default: loading local files)')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--low-precision', dest='solver_type', default='low_precision', action='store_const',
                       const='low_precision', help='use low-precision solver')
    group.add_argument('--high_precision', dest='solver_type', default='low_precision', action='store_const',
                       const='high_precision', help='use high-precision solver')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--projection', dest='sketch_type', action='store_const', const='projection',
                       help='compute sketch by projection')
    group.add_argument('--sampling', dest='sketch_type', action='store_const', const='sampling',
                       help='compute sketch by sampling')

    parser.add_argument('-p', dest='projection_type', default='gaussian',
                        choices=('cw', 'gaussian', 'rademacher', 'srdht'),
                        help='underlying projection type')
    parser.add_argument('-r', metavar='projectionSize', type=int, help='sketch size')
    parser.add_argument('-s', metavar='samplingSize', type=int, help='sampling size (for sampling sketch only)')
    parser.add_argument('-q', '--niters', metavar='numIters', dest='q', type=int,
                        help='number of iterations in LSQR')
    parser.add_argument('-k', '--ntrials', metavar='numTrials', dest='k', default=1, type=int,
                        help='number of independent trials to run')
    parser.add_argument('-t', '--test', action='store_true', help='compute accuracies of the returned solutions')
    parser.add_argument('--save_logs', action='store_true', help='save Spark logs')
    parser.add_argument('--output_filename', metavar='outputFilename', default='ls.out',
                        help='filename of the output file (default: ls.out)')
    parser.add_argument('--load_N', action='store_true', help='load N')
    parser.add_argument('--save_N', action='store_true', help='save N')
    parser.add_argument('--debug', action='store_true', help='debug mode')

    if len(argv) > 0 and argv[0] == 'print_help':
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    (m, n) = args.dims

    # validating
    if m < n:
        raise ValueError('Number of rows ({0}) should be greater than number of columns ({1})'.format(m, n))
    if args.sketch_type == 'sampling' and args.s is None:
        raise ValueError('Please enter a sampling size!')
    if args.solver_type == 'high_precision' and args.q is None:
        raise ValueError('Please enter the number of iterations!')
    if args.solver_type == 'low_precision' and args.sketch_type is None:
        raise ValueError('Please specify a sketch method for the low-precision solver!')
    if args.sketch_type and args.r is None:
        raise ValueError('Please enter a projection size!')

    # loading configuration file
    config = ConfigParser.RawConfigParser()
    config.read(args.setting_filename)
    data_dir = config.get('local_directories', 'data_dir')
    spark_logs_dir = ('file://' + os.path.dirname(os.path.abspath(__file__)) + '/'
                      + config.get('local_directories', 'spark_logs_dir'))

    logging.config.fileConfig(args.logging_filename, disable_existing_loggers=False)  # setting up the logger
    logger = logging.getLogger('')  # using root

    print_params(args, logger)  # printing parameters

    # instantiating a Spark instance
    if args.save_logs:
        conf = (SparkConf()
                .set('spark.eventLog.enabled', 'true')
                .set('spark.eventLog.dir', spark_logs_dir)
                .set('spark.driver.maxResultSize', '20g'))
    else:
        conf = SparkConf().set('spark.driver.maxResultSize', '20g')
    sc = SparkContext(appName='ls_exp', conf=conf)

    if args.file_source == 'hdfs':
        hdfs_dir = config.get('hdfs', 'hdfs_dir')
        Ab_rdd = sc.textFile(hdfs_dir + args.dataset + '.txt', args.npartitions)  # loading dataset from HDFS
    elif args.file_source == 's3':
        s3_dir = config.get('s3', 's3_dir')
        key_id = config.get('s3', 'key_id')
        secret_key = config.get('s3', 'secret_key')
        Ab_rdd = sc.textFile('s3n://' + key_id + ':' + secret_key + '@' + s3_dir + args.dataset + '.txt',
                             args.npartitions)  # loading dataset from Amazon S3
    else:
        A = np.loadtxt(data_dir + args.dataset + '.txt')  # loading dataset from local disk
        Ab_rdd = sc.parallelize(A.tolist(), args.npartitions)

    # creating a RowMatrix instance
    matrix_Ab = RowMatrix(Ab_rdd, args.dataset, m, n, args.cache,
                          stack_type=args.stack_type, repnum=args.nrepetitions)

    ls = RandLeastSquares(matrix_Ab, solver_type=args.solver_type, sketch_type=args.sketch_type,
                          projection_type=args.projection_type, c=args.r, s=args.s,
                          num_iters=args.q, k=args.k)
    ls.fit(args.load_N, args.save_N, args.debug)  # solving the problem

    result = {'time': ls.time, 'x': ls.x}
    pickle_write('../result/' + args.output_filename, result)  # saving results
    logger.info('Total time elapsed:{0}'.format(ls.time))

    if args.test:
        # only need to load these in the test mode
        if os.path.isfile(data_dir + args.dataset + '_x_opt.txt'):
            logger.info('Found precomputed optimal solutions!')
            x_opt = np.loadtxt(data_dir + args.dataset + '_x_opt.txt')
            f_opt = np.loadtxt(data_dir + args.dataset + '_f_opt.txt')
        else:
            logger.info('Computing optimal solutions!')
            Ab = np.array(matrix_Ab.rdd_original.values().collect())  # might not be accurate; needs to be checked
            A = Ab[:, :-1]
            b = Ab[:, -1]
            x_opt = np.linalg.lstsq(A, b)[0]
            f_opt = np.linalg.norm(np.dot(A, x_opt) - b)
        rx, rf = ls.comp_relerr(x_opt, f_opt)
        logger.info('Median of the relative error on solution vector is:{0}'.format(rx))
        logger.info('Median of the relative error on objective value is:{0}'.format(rf))
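# The main() above reads its directories and S3 credentials from the file given
# by --setting_filename (default conf/settings.cfg). The sketch below writes a
# compatible settings file: only the section and option names are taken from the
# config.get(...) calls above; all values are placeholders to be replaced. The
# function is never called here.
def _example_write_settings(filename='conf/settings.cfg'):
    config = ConfigParser.RawConfigParser()
    config.add_section('local_directories')
    config.set('local_directories', 'data_dir', '../data/')
    config.set('local_directories', 'spark_logs_dir', '../log/')
    config.add_section('hdfs')
    config.set('hdfs', 'hdfs_dir', 'data/')
    config.add_section('s3')
    config.set('s3', 's3_dir', 'my-bucket/ls-data/')
    config.set('s3', 'key_id', 'YOUR_AWS_KEY_ID')
    config.set('s3', 'secret_key', 'YOUR_AWS_SECRET_KEY')
    with open(filename, 'w') as f:
        config.write(f)
    # With this file in place, main() can be invoked as before, e.g. adding
    # '--setting_filename', 'conf/settings.cfg' to the argv list.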