Example #1
class MatrixMultiplicationTestCase(unittest.TestCase):
    def setUp(self):
        self.matrix_A = RowMatrix(matrix_rdd, 'test_data', 1000, 100)
        self.matrix_A2 = RowMatrix(matrix_rdd2, 'test_data', 100, 1000)

    def test_mat_rtimes(self):
        mat = np.random.rand(100, 50)
        p = self.matrix_A.rtimes(mat)
        p_true = np.dot(A, mat)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_ltimes(self):
        mat = np.random.rand(100, 1000)
        p = self.matrix_A.ltimes(mat)
        p_true = np.dot(mat, A)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_atamat(self):
        mat = np.random.rand(100, 20)
        p = self.matrix_A.atamat(mat)
        p_true = np.dot(A.T, np.dot(A, mat))
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_rtimes2(self):
        mat = np.random.rand(1000, 50)
        p = self.matrix_A2.rtimes(mat)
        p_true = np.dot(A2, mat)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_ltimes2(self):
        mat = np.random.rand(50, 100)
        p = self.matrix_A2.ltimes(mat)
        p_true = np.dot(mat, A2)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_atamat2(self):
        mat = np.random.rand(1000, 20)
        p = self.matrix_A2.atamat(mat)
        p_true = np.dot(A2.T, np.dot(A2, mat))
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_rtimes_sub(self):
        mat = np.random.rand(99, 50)
        p = self.matrix_A.rtimes(mat, (0, 98))
        p_true = np.dot(A[:, :-1], mat)
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_ltimes_sub(self):
        mat = np.random.rand(100, 1000)
        p = self.matrix_A.ltimes(mat, (0, 98))
        p_true = np.dot(mat, A[:, :-1])
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_atamat_sub(self):
        mat = np.random.rand(99, 50)
        p = self.matrix_A.atamat(mat, (0, 98))
        p_true = np.dot(A[:, :-1].T, np.dot(A[:, :-1], mat))
        self.assertTrue(np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)
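The tests above rely on module-level fixtures that the snippet does not show: the NumPy arrays A and A2 and the RDDs matrix_rdd and matrix_rdd2 built from them. A minimal sketch of that setup, assuming an existing SparkContext named sc and that RowMatrix consumes an RDD of row lists (everything here is an assumption, not the original fixture code):

import unittest
import numpy as np

# Hypothetical fixtures; the real test module defines these before the tests run.
# Shapes follow the RowMatrix(..., 1000, 100) and RowMatrix(..., 100, 1000)
# declarations in setUp above.
A = np.random.rand(1000, 100)               # source matrix for matrix_A
A2 = np.random.rand(100, 1000)              # source matrix for matrix_A2
matrix_rdd = sc.parallelize(A.tolist())     # sc: an existing SparkContext
matrix_rdd2 = sc.parallelize(A2.tolist())

Each assertion then checks that the relative error ||p - p_true|| / ||p_true|| of the distributed product against a local np.dot stays below 1e-5.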
Example #2
class MatrixMultiplicationTestCase(unittest.TestCase):
    def setUp(self):
        self.matrix_Ab = RowMatrix(matrix_rdd, 'test_data', 1000, 10)

    def test_mat_rtimes(self):
        vec = np.random.rand(10)
        p = self.matrix_Ab.rtimes_vec(vec)
        p_true = np.dot(A, vec)
        self.assertTrue(np.linalg.norm(p - p_true) < 1e-5)

    def test_mat_ltimes(self):
        vec = np.random.rand(1000)
        p = self.matrix_Ab.ltimes_vec(vec)
        p_true = np.dot(vec, A)
        self.assertTrue(np.linalg.norm(p - p_true) < 1e-5)

    def test_get_b(self):
        b = self.matrix_Ab.get_b()
        self.assertTrue(np.linalg.norm(b - Ab[:, -1]) < 1e-5)
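This variant exercises the matrix-vector routines; the hidden fixtures here are Ab and matrix_rdd, where the last column of Ab is the right-hand side returned by get_b() and A holds the remaining columns. A minimal sketch under those assumptions (again with an existing SparkContext sc):

import numpy as np

# Hypothetical fixture: 1000 rows, the 10 columns of A plus b appended last.
Ab = np.random.rand(1000, 11)
A = Ab[:, :-1]
matrix_rdd = sc.parallelize(Ab.tolist())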
Example #3
class MatrixMultiplicationTestCase(unittest.TestCase):
    def setUp(self):
        self.matrix_Ab = RowMatrix(matrix_rdd, 'test_data', 1000, 10)

    def test_mat_rtimes(self):
        vec = np.random.rand(10)
        p = self.matrix_Ab.rtimes_vec(vec)
        p_true = np.dot(A, vec)

        self.assertTrue(np.linalg.norm(p - p_true) < 1e-5)

    def test_mat_ltimes(self):
        vec = np.random.rand(1000)
        p = self.matrix_Ab.ltimes_vec(vec)
        p_true = np.dot(vec, A)

        self.assertTrue(np.linalg.norm(p - p_true) < 1e-5)

    def test_get_b(self):
        b = self.matrix_Ab.get_b()

        self.assertTrue(np.linalg.norm(b - Ab[:, -1]) < 1e-5)
Example #4
 def setUp(self):
     self.matrix_Ab = RowMatrix(matrix_rdd, 'test_data', 1000, 10)
Example #5
 def setUp(self):
     self.matrix_A = RowMatrix(matrix_rdd, 'test_data', 1000, 100)
     self.matrix_A2 = RowMatrix(matrix_rdd2, 'test_data', 100, 1000)
Example #6
class MatrixMultiplicationTestCase(unittest.TestCase):
    def setUp(self):
        self.matrix_A = RowMatrix(matrix_rdd, 'test_data', 1000, 100)
        self.matrix_A2 = RowMatrix(matrix_rdd2, 'test_data', 100, 1000)

    def test_mat_rtimes(self):
        mat = np.random.rand(100, 50)
        p = self.matrix_A.rtimes(mat)
        p_true = np.dot(A, mat)
        self.assertTrue(
            np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_ltimes(self):
        mat = np.random.rand(100, 1000)
        p = self.matrix_A.ltimes(mat)
        p_true = np.dot(mat, A)
        self.assertTrue(
            np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_atamat(self):
        mat = np.random.rand(100, 20)
        p = self.matrix_A.atamat(mat)
        p_true = np.dot(A.T, np.dot(A, mat))
        self.assertTrue(
            np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_rtimes2(self):
        mat = np.random.rand(1000, 50)
        p = self.matrix_A2.rtimes(mat)
        p_true = np.dot(A2, mat)
        self.assertTrue(
            np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_ltimes2(self):
        mat = np.random.rand(50, 100)
        p = self.matrix_A2.ltimes(mat)
        p_true = np.dot(mat, A2)
        self.assertTrue(
            np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_atamat2(self):
        mat = np.random.rand(1000, 20)
        p = self.matrix_A2.atamat(mat)
        p_true = np.dot(A2.T, np.dot(A2, mat))
        self.assertTrue(
            np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_rtimes_sub(self):
        mat = np.random.rand(99, 50)
        p = self.matrix_A.rtimes(mat, (0, 98))
        p_true = np.dot(A[:, :-1], mat)
        self.assertTrue(
            np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_mat_ltimes_sub(self):
        mat = np.random.rand(100, 1000)
        p = self.matrix_A.ltimes(mat, (0, 98))
        p_true = np.dot(mat, A[:, :-1])
        self.assertTrue(
            np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)

    def test_atamat_sub(self):
        mat = np.random.rand(99, 50)
        p = self.matrix_A.atamat(mat, (0, 98))
        p_true = np.dot(A[:, :-1].T, np.dot(A[:, :-1], mat))
        self.assertTrue(
            np.linalg.norm(p - p_true) / np.linalg.norm(p_true) < 1e-5)
Example #7
 def setUp(self):
     self.matrix_A = RowMatrix(matrix_rdd, 'test_data', 1000, 100)
     self.matrix_A2 = RowMatrix(matrix_rdd2, 'test_data', 100, 1000)
Example #8
def main(argv):
    logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
    logger = logging.getLogger('')  # using the root logger

    parser = argparse.ArgumentParser(description='Getting parameters.',
                                     prog='run_cx.sh')

    parser.add_argument(
        'dataset',
        type=str,
        help='dataset.txt stores the input matrix to run CX on; \
           dataset_U.txt stores left-singular vectors of the input matrix (only needed for -t); \
           dataset_D.txt stores singular values of the input matrix (only needed for -t)'
    )
    parser.add_argument('--dims',
                        metavar=('m', 'n'),
                        type=int,
                        nargs=2,
                        required=True,
                        help='size of the input matrix')
    parser.add_argument('--sparse',
                        dest='sparse',
                        action='store_true',
                        help='whether the data is sparse')
    parser.add_argument('--hdfs',
                        dest='file_source',
                        default='local',
                        action='store_const',
                        const='hdfs',
                        help='load dataset from HDFS')
    parser.add_argument(
        '-k',
        '--rank',
        metavar='targetRank',
        dest='k',
        default=5,
        type=int,
        help=
        'target rank parameter in the definition of leverage scores; this value should not be greater than m or n'
    )
    parser.add_argument('-r',
                        metavar='numRowsToSelect',
                        default=20,
                        type=int,
                        help='number of rows to select in CX')
    parser.add_argument(
        '-q',
        '--niters',
        metavar='numIters',
        dest='q',
        default=2,
        type=int,
        help='number of iterations to run in approximation of leverage scores')
    parser.add_argument(
        '--deterministic',
        dest='scheme',
        default='randomized',
        action='store_const',
        const='deterministic',
        help=
        'use deterministic scheme instead of randomized when selecting rows')
    parser.add_argument('-c',
                        '--cache',
                        action='store_true',
                        help='cache the dataset in Spark')
    parser.add_argument('-t',
                        '--test',
                        action='store_true',
                        help='compute accuracies of the returned solutions')
    parser.add_argument('-s',
                        '--save_logs',
                        action='store_true',
                        help='save Spark logs')
    parser.add_argument(
        '--nrepetitions',
        metavar='numRepetitions',
        default=1,
        type=int,
        help=
        'number of times to stack matrix vertically in order to generate large matrices'
    )
    parser.add_argument('--npartitions',
                        metavar='numPartitions',
                        default=280,
                        type=int,
                        help='number of partitions in Spark')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--row',
                       dest='axis',
                       default=0,
                       action='store_const',
                       const=0,
                       help='compute row leverage scores')
    group.add_argument('--column',
                       dest='axis',
                       default=0,
                       action='store_const',
                       const=1,
                       help='compute column leverage scores')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--leverage-scores-only',
                       dest='stage',
                       default='full',
                       action='store_const',
                       const='leverage',
                       help='return approximate leverage scores only')
    group.add_argument(
        '--indices-only',
        dest='stage',
        default='full',
        action='store_const',
        const='indices',
        help='return approximate leverage scores and selected row indices only'
    )

    if len(argv) > 0 and argv[0] == 'print_help':
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    (m, n) = args.dims

    # validating
    if args.k > m or args.k > n:
        raise ValueError(
            'Rank parameter ({0}) should not be greater than m ({1}) or n ({2})'.
            format(args.k, m, n))

    if args.npartitions > m or args.npartitions > n:
        args.npartitions = min(m, n)

    if args.test and args.nrepetitions > 1:
        raise OptionError(
            'Do not use test mode (-t) on replicated data (numRepetitions > 1)!'
        )

    if args.axis == 0:
        raise OptionError('Need to implement transpose first!')

    if args.sparse and args.file_source == 'hdfs':
        raise OptionError('Loading sparse data from HDFS is not supported yet!')

    # print parameters
    print_params(args, logger)

    # TODO: move these into a configuration file
    dire = '../data/'
    hdfs_dire = 'data/'
    logs_dire = 'file:///home/jiyan/cx_logs'

    # instantiating a SparkContext
    if args.save_logs:
        conf = SparkConf().set('spark.eventLog.enabled',
                               'true').set('spark.eventLog.dir', logs_dire)
    else:
        conf = SparkConf()
    sc = SparkContext(appName="cx_exp", conf=conf)

    # loading data
    if args.file_source == 'hdfs':
        A_rdd = sc.textFile(hdfs_dire + args.dataset + '.txt',
                            args.npartitions)  # loading dataset from HDFS
    else:
        A = np.loadtxt(dire + args.dataset +
                       '.txt')  # loading dataset from local disk
        if args.sparse:
            sA = to_sparse(A)
            A_rdd = sc.parallelize(sA, args.npartitions)
        else:
            A_rdd = sc.parallelize(A.tolist(), args.npartitions)

    if args.axis == 0:
        pass  # TODO: build the RDD from the transpose of A (not implemented; see the check above)

    t = time.time()
    if args.sparse:
        matrix_A = SparseRowMatrix(
            A_rdd, args.dataset, m, n,
            args.cache)  # creating a SparseRowMatrix instance
    else:
        matrix_A = RowMatrix(
            A_rdd, args.dataset, m, n, args.cache,
            repnum=args.nrepetitions)  # creating a RowMatrix instance

    cx = CX(matrix_A)

    lev, p = cx.get_lev(
        args.k, q=args.q
    )  # approximate row leverage scores; the result has one entry per row

    if args.test:
        if args.file_source != 'local':
            A = np.loadtxt(dire + args.dataset + '.txt')
        U, D, V = np.linalg.svd(A, 0)

        if args.axis == 0:
            lev_exact = np.sum(U[:, :args.k]**2, axis=1)
        else:
            lev_exact = np.sum(V.T[:, :args.k]**2, axis=1)
        p_exact = lev_exact / args.k
        logger.info(
            'KL divergence between the estimated and the exact leverage scores is {0}'
            .format(scipy.stats.entropy(p_exact, p)))
    logger.info('finished stage 1')
    logger.info('----------------------------------------------')

    if args.stage == 'indices' or args.stage == 'full':
        idx = cx.comp_idx(args.scheme,
                          args.r)  # choosing rows based on the leverage scores
        # TODO: optionally store the selected indices to a file
        logger.info('finished stage 2')
        logger.info('----------------------------------------------')

    if args.stage == 'full':
        rows = cx.get_rows(
        )  # retrieving the rows selected by the idx computed above (rerunning the randomized steps may give different results)

        if args.test:
            diff = cx.comp_err()  # computing the relative error
            logger.info('relative error ||A-CX||/||A|| is {0}'.format(
                diff / np.linalg.norm(A, 'fro')))
            logger.info(
                'relative error of the best rank-{0} approximation is {1}'.
                format(args.k,
                       np.sqrt(np.sum(D[args.k:]**2)) / np.sqrt(np.sum(D**2))))
        logger.info('finished stage 3')

    rtime = time.time() - t
    logger.info('time elapsed: {0} seconds'.format(rtime))
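main is normally driven by run_cx.sh, but it also accepts an argv list directly. Note that with the validation above, the default axis 0 raises OptionError, so --column must be passed. An illustrative invocation (the dataset name is a placeholder for a file under ../data/):

# Hypothetical driver; 'my_dataset' stands in for a real dataset name.
if __name__ == '__main__':
    main(['my_dataset', '--dims', '1000', '100',
          '-k', '5', '-r', '20', '--column', '--test'])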
Example #9
def main(argv):
    parser = argparse.ArgumentParser(description='Getting parameters.',
                                     prog='run_ls.sh')

    parser.add_argument(
        'dataset',
        type=str,
        help='dataset_Ab.txt stores the augmented matrix [A b] of the least-squares problem; \
           dataset.txt stores the original matrix (only needed for -t);')
    parser.add_argument('--dims',
                        metavar=('m', 'n'),
                        type=int,
                        nargs=2,
                        required=True,
                        help='size of the input matrix')
    parser.add_argument(
        '--nrepetitions',
        metavar='numRepetitions',
        default=1,
        type=int,
        help=
        'number of times to stack matrix vertically in order to generate large matrices'
    )
    parser.add_argument('--stack',
                        metavar='stackType',
                        dest='stack_type',
                        type=int,
                        default=1,
                        help='stack type')
    parser.add_argument('--npartitions',
                        metavar='numPartitions',
                        default=280,
                        type=int,
                        help='number of partitions in Spark')
    parser.add_argument(
        '--setting_filename',
        metavar='settingConfFilename',
        default='conf/settings.cfg',
        type=str,
        help='name of the configuration file storing the settings')
    parser.add_argument('--logging_filename',
                        metavar='loggingConfFilename',
                        default='conf/logging.cfg',
                        type=str,
                        help='configuration file for Python logging')
    parser.add_argument('-c',
                        '--cache',
                        action='store_true',
                        help='cache the dataset in Spark')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--hdfs',
        dest='file_source',
        default='local',
        action='store_const',
        const='hdfs',
        help='load dataset from HDFS (default: loading files from local)')
    group.add_argument(
        '--s3',
        dest='file_source',
        default='local',
        action='store_const',
        const='s3',
        help='load dataset from Amazon S3 (default: loading files from local)')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--low-precision',
                       dest='solver_type',
                       default='low_precision',
                       action='store_const',
                       const='low_precision',
                       help='use low-precision solver')
    group.add_argument('--high_precision',
                       dest='solver_type',
                       default='low_precision',
                       action='store_const',
                       const='high_precision',
                       help='use high_precision solver')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--projection',
                       dest='sketch_type',
                       action='store_const',
                       const='projection',
                       help='compute sketch by projection')
    group.add_argument('--sampling',
                       dest='sketch_type',
                       action='store_const',
                       const='sampling',
                       help='compute sketch by sampling')
    parser.add_argument('-p',
                        dest='projection_type',
                        default='gaussian',
                        choices=('cw', 'gaussian', 'rademacher', 'srdht'),
                        help='underlying projection type')
    parser.add_argument('-r',
                        metavar='projectionSize',
                        type=int,
                        help='sketch size')
    parser.add_argument('-s',
                        metavar='samplingSize',
                        type=int,
                        help='sampling size (for sampling sketch only)')
    parser.add_argument('-q',
                        '--niters',
                        metavar='numIters',
                        dest='q',
                        type=int,
                        help='number of iterations in LSQR')
    parser.add_argument('-k',
                        '--ntrials',
                        metavar='numTrials',
                        dest='k',
                        default=1,
                        type=int,
                        help='number of independent trials to run')
    parser.add_argument('-t',
                        '--test',
                        action='store_true',
                        help='compute accuracies of the returned solutions')
    parser.add_argument('--save_logs',
                        action='store_true',
                        help='save Spark logs')
    parser.add_argument('--output_filename',
                        metavar='outputFilename',
                        default='ls.out',
                        help='filename of the output file (default: ls.out)')
    parser.add_argument('--load_N', action='store_true', help='load N')
    parser.add_argument('--save_N', action='store_true', help='save N')
    parser.add_argument('--debug', action='store_true', help='debug mode')

    if len(argv) > 0 and argv[0] == 'print_help':
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    (m, n) = args.dims

    # validating
    if m < n:
        raise ValueError(
            'Number of rows ({0}) should be greater than the number of columns ({1})'
            .format(m, n))

    if args.sketch_type == 'sampling' and args.s is None:
        raise ValueError('Please enter a sampling size!')

    if args.solver_type == 'high_precision' and args.q is None:
        raise ValueError('Please enter number of iterations!')

    if args.solver_type == 'low_precision' and args.sketch_type is None:
        raise ValueError(
            'Please specify a sketch method for the low-precision solver!')

    if args.sketch_type and args.r is None:
        raise ValueError('Please enter a projection size!')

    # loading configuration file
    config = ConfigParser.RawConfigParser()
    config.read(args.setting_filename)

    data_dir = config.get('local_directories', 'data_dir')
    spark_logs_dir = 'file://' + os.path.dirname(
        os.path.abspath(__file__)) + '/' + config.get('local_directories',
                                                      'spark_logs_dir')

    logging.config.fileConfig(
        args.logging_filename,
        disable_existing_loggers=False)  # setting up the logger
    logger = logging.getLogger('')  # using the root logger

    print_params(args, logger)  # printing parameters

    # instantiating a SparkContext
    if args.save_logs:
        conf = SparkConf().set('spark.eventLog.enabled', 'true').set(
            'spark.eventLog.dir',
            spark_logs_dir).set('spark.driver.maxResultSize', '20g')
    else:
        conf = SparkConf().set('spark.driver.maxResultSize', '20g')
    sc = SparkContext(appName="ls_exp", conf=conf)

    if args.file_source == 'hdfs':
        hdfs_dir = config.get('hdfs', 'hdfs_dir')
        Ab_rdd = sc.textFile(hdfs_dir + args.dataset + '.txt',
                             args.npartitions)  # loading dataset from HDFS
    elif args.file_source == 's3':
        s3_dir = config.get('s3', 's3_dir')
        key_id = config.get('s3', 'key_id')
        secret_key = config.get('s3', 'secret_key')
        Ab_rdd = sc.textFile(
            's3n://' + key_id + ':' + secret_key + '@' + s3_dir +
            args.dataset + '.txt', args.npartitions)
    else:
        A = np.loadtxt(data_dir + args.dataset +
                       '.txt')  # loading dataset from local disk
        Ab_rdd = sc.parallelize(A.tolist(), args.npartitions)

    matrix_Ab = RowMatrix(
        Ab_rdd,
        args.dataset,
        m,
        n,
        args.cache,
        stack_type=args.stack_type,
        repnum=args.nrepetitions)  # creating a RowMatrix instance

    ls = RandLeastSquares(matrix_Ab,
                          solver_type=args.solver_type,
                          sketch_type=args.sketch_type,
                          projection_type=args.projection_type,
                          c=args.r,
                          s=args.s,
                          num_iters=args.q,
                          k=args.k)

    ls.fit(args.load_N, args.save_N, args.debug)  # solving the problem

    result = {'time': ls.time, 'x': ls.x}
    pickle_write('../result/' + args.output_filename, result)  # saving results

    logger.info('Total time elapsed: {0}'.format(ls.time))

    if args.test:  # these files are only needed in test mode
        if os.path.isfile(data_dir + args.dataset + '_x_opt.txt'):
            logger.info('Found precomputed optimal solutions!')
            x_opt = np.loadtxt(data_dir + args.dataset + '_x_opt.txt')
            f_opt = np.loadtxt(data_dir + args.dataset + '_f_opt.txt')
        else:
            logger.info('Computing optimal solutions!')
            Ab = np.array(matrix_Ab.rdd_original.values().collect()
                          )  # might not be accurate; needs checking
            A = Ab[:, :-1]
            b = Ab[:, -1]
            x_opt = np.linalg.lstsq(A, b)[0]
            f_opt = np.linalg.norm(np.dot(A, x_opt) - b)

        rx, rf = ls.comp_relerr(x_opt, f_opt)

        logger.info(
            'Median of the relative error on solution vector is:{0}'.format(
                rx))
        logger.info(
            'Median of the relative error on objective value is:{0}'.format(
                rf))
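As with the CX driver, main takes an argv list; per the validation block, a sampling sketch needs both -s (sampling size) and -r (projection size). An illustrative call with placeholder names and sizes:

# Hypothetical driver for the low-precision solver with a sampling sketch.
if __name__ == '__main__':
    main(['my_lsdata', '--dims', '10000', '10',
          '--sampling', '-s', '500', '-r', '200', '--test'])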
Example #10
 def setUp(self):
     self.matrix_Ab = RowMatrix(matrix_rdd, 'test_data', 1000, 10)
     self.N_dire = 'N/'
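The extra fixture N_dire is presumably the directory used by the solver's --save_N/--load_N options (see Example #9) to persist the matrix N between runs. A minimal sketch of such a round trip, assuming plain NumPy files (the filename scheme is a guess, not the library's actual layout):

import os
import numpy as np

N_dire = 'N/'
if not os.path.exists(N_dire):
    os.makedirs(N_dire)
N = np.random.rand(10, 10)  # stand-in for the matrix the solver would save
np.save(os.path.join(N_dire, 'test_data_N.npy'), N)  # hypothetical filename
N_loaded = np.load(os.path.join(N_dire, 'test_data_N.npy'))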