Example #1
import argparse
import logging
import logging.config
import sys
import time

import numpy as np
import scipy.stats
from pyspark import SparkConf, SparkContext

# CX, RowMatrix, SparseRowMatrix, OptionError, to_sparse and print_params are
# assumed to be defined elsewhere in the project this snippet was taken from.

def main(argv):
    logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
    logger = logging.getLogger('')  # using the root logger

    parser = argparse.ArgumentParser(description='Getting parameters.',prog='run_cx.sh')

    parser.add_argument('dataset', type=str, help='dataset.txt stores the input matrix to run CX on; \
           dataset_U.txt stores left-singular vectors of the input matrix (only needed for -t); \
           dataset_D.txt stores singular values of the input matrix (only needed for -t)')
    parser.add_argument('--dims', metavar=('m','n'), type=int, nargs=2, required=True, help='size of the input matrix')
    parser.add_argument('--sparse', dest='sparse', action='store_true', help='whether the data is sparse')
    parser.add_argument('--hdfs', dest='file_source', default='local', action='store_const', const='hdfs', help='load dataset from HDFS')
    parser.add_argument('-k', '--rank', metavar='targetRank', dest='k', default=5, type=int, help='target rank parameter in the definition of leverage scores; this value should not be greater than m or n')
    parser.add_argument('-r', metavar='numRowsToSelect', default=20, type=int, help='number of rows to select in CX')
    parser.add_argument('-q', '--niters', metavar='numIters', dest='q', default=2, type=int, help='number of iterations to run in approximation of leverage scores')
    parser.add_argument('--deterministic', dest='scheme', default='randomized', action='store_const', const='deterministic', help='use deterministic scheme instead of randomized when selecting rows')
    parser.add_argument('-c', '--cache', action='store_true', help='cache the dataset in Spark')
    parser.add_argument('-t', '--test', action='store_true', help='compute accuracies of the returned solutions')
    parser.add_argument('-s', '--save_logs', action='store_true', help='save Spark logs')
    parser.add_argument('--nrepetitions', metavar='numRepetitions', default=1, type=int, help='number of times to stack matrix vertically in order to generate large matrices')
    parser.add_argument('--npartitions', metavar='numPartitions', default=280, type=int, help='number of partitions in Spark')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--row', dest='axis', default=0, action='store_const', const=0, help='compute row leverage scores')
    group.add_argument('--column', dest='axis', default=0, action='store_const', const=1, help='compute column leverage scores')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--leverage-scores-only', dest='stage', default='full', action='store_const', const='leverage', help='return approximate leverage scores only')
    group.add_argument('--indices-only', dest='stage', default='full', action='store_const', const='indices', help='return approximate leverage scores and selected row indices only')
    
    if len(argv)>0 and argv[0]=='print_help':
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    (m,n) = args.dims

    # validating
    if args.k > m or args.k > n:
        raise ValueError('Rank parameter({0}) should not be greater than m({1}) or n({2})'.format(args.k,m,n))

    if args.npartitions > m or args.npartitions > n:
        args.npartitions = min(m,n)

    if args.test and args.nrepetitions>1:
        raise OptionError('Do not use the test mode(-t) on replicated data(numRepetitions>1)!')

    if args.axis == 0:
        raise OptionError('Need to implement transpose first!')

    if args.sparse and args.file_source=='hdfs':
        raise OptionError('Loading sparse data from HDFS is not supported yet!')

    # print parameters
    print_params(args, logger)

    # TODO: move these into a configuration file
    dire = '../data/'
    hdfs_dire = 'data/'
    logs_dire = 'file:///home/jiyan/cx_logs'

    # instantiating a SparkContext
    if args.save_logs:
        conf = SparkConf().set('spark.eventLog.enabled','true').set('spark.eventLog.dir',logs_dire)
    else:
        conf = SparkConf()
    sc = SparkContext(appName="cx_exp",conf=conf)

    # loading data
    if args.file_source=='hdfs':
        A_rdd = sc.textFile(hdfs_dire+args.dataset+'.txt',args.npartitions) #loading dataset from HDFS
    else:
        A = np.loadtxt(dire+args.dataset+'.txt') #loading dataset from local disk
        if args.sparse:
            sA = to_sparse(A)
            A_rdd = sc.parallelize(sA,args.npartitions)
        else:
            A_rdd = sc.parallelize(A.tolist(),args.npartitions)

    if args.axis == 0:
        pass # get rdd from the transpose of A

    t = time.time()
    if args.sparse:
        matrix_A = SparseRowMatrix(A_rdd,args.dataset,m,n,args.cache) # creating a SparseRowMatrix instance
    else:
        matrix_A = RowMatrix(A_rdd,args.dataset,m,n,args.cache,repnum=args.nrepetitions) # creating a RowMatrix instance
        
    cx = CX(matrix_A)

    lev, p = cx.get_lev(args.k, q=args.q) # approximate row leverage scores; lev has one entry per row of the matrix

    if args.test:
        if args.file_source != 'local':
            A = np.loadtxt(dire+args.dataset+'.txt')
        U, D, V = np.linalg.svd(A,0)

        if args.axis == 0:
            lev_exact = np.sum(U[:,:args.k]**2,axis=1)
        else:
            lev_exact = np.sum(V.T[:,:args.k]**2,axis=1)
        p_exact = lev_exact/args.k
        logger.info('KL divergence between the estimation of leverage scores and the exact one is {0}'.format( scipy.stats.entropy(p_exact,p) ))
    logger.info('finished stage 1')
    logger.info('----------------------------------------------')

    if args.stage=='indices' or args.stage=='full':
        idx = cx.comp_idx(args.scheme,args.r) # choosing rows based on the leverage scores
        # optionally, write the indices to a file
        logger.info('finished stage 2')
        logger.info('----------------------------------------------')

    if args.stage=='full':
        rows = cx.get_rows() # fetching the rows selected by the idx computed above (rerunning the randomized selection may return different rows)

        if args.test:
            diff = cx.comp_err() # computing the relative error
            logger.info('relative error ||A-CX||/||A|| is {0}'.format( diff/np.linalg.norm(A,'fro') ))
            logger.info('relative error of the best rank-{0} approximation is {1}'.format( args.k, np.sqrt(np.sum(D[args.k:]**2))/np.sqrt(np.sum(D**2)) ))
        logger.info('finished stage 3')

    rtime = time.time() - t
    logger.info('time elapsed: {0} seconds'.format( rtime ))
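
Since main() takes the argument vector explicitly, the driver can be exercised with a direct call; a wrapper such as the run_cx.sh named in the parser presumably does the equivalent from the command line. The following invocation is hypothetical: the flag names come straight from the parser above, the dataset name is borrowed from Example #2's data files, and --column is used because the row path currently raises OptionError.

# hypothetical invocation; dataset name taken from Example #2's data files
main(['unif_bad_1000_100', '--dims', '1000', '100',
      '-k', '5', '-r', '20', '-q', '2', '--column', '--test'])
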
Example #2
import sys
import unittest

import numpy as np

# to_sparse, prepare_matrix and the unittest suite object are assumed to be
# defined elsewhere in the test module this snippet was taken from.

# The snippet begins mid-function; the enclosing signature below is a
# hypothetical reconstruction based on the variables the body uses.
def comp_lev(U, V, k, axis):
    if axis == 0:
        lev = np.sum(U[:,:k]**2,axis=1)
    else:
        lev = np.sum(V[:k,:]**2,axis=0)

    p = lev/k

    return lev, p

if __name__ == '__main__':
    from pyspark import SparkContext

    A = np.loadtxt('../data/unif_bad_1000_100.txt')
    A2 = np.loadtxt('../data/unif_bad_100_1000.txt')
    sA = to_sparse(A)
    sA2 = to_sparse(A2)

    sc = SparkContext(appName="cx_test_exp")

    matrix_rdd = sc.parallelize(A.tolist(),140)
    matrix_rdd2 = sc.parallelize(A2.tolist(),20)
    sparse_matrix_rdd = sc.parallelize(sA,140)  # sparse_matrix_rdd has records in (row,col,val) format
    sparse_matrix_rdd2 = sc.parallelize(sA2,50)
    sparse_matrix_rdd = prepare_matrix(sparse_matrix_rdd)
    sparse_matrix_rdd2 = prepare_matrix(sparse_matrix_rdd2)

    runner = unittest.TextTestRunner(stream=sys.stderr, descriptions=True, verbosity=1)
    runner.run(suite)
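
The leverage-score computation at the top of Example #2 is the exact counterpart of the approximation used in Example #1. As a self-contained illustration, here is a minimal NumPy sketch (no Spark; every name is an illustrative stand-in, not the project's API) that mirrors the three stages of the driver: exact scores, randomized row selection, and the reconstruction error.

import numpy as np

# minimal sketch, assuming a dense matrix small enough for a local SVD
rng = np.random.default_rng(0)
A = rng.standard_normal((1000, 100))
k, r = 5, 20

# stage 1: exact rank-k row leverage scores -- squared row norms of the
# top-k left singular vectors; they sum to k since U's columns are orthonormal
U, D, V = np.linalg.svd(A, full_matrices=False)
lev = np.sum(U[:, :k]**2, axis=1)
p = lev / k
assert np.isclose(lev.sum(), k)

# stage 2: the randomized scheme -- sample r row indices with probabilities
# proportional to the leverage scores
idx = rng.choice(A.shape[0], size=r, replace=False, p=p)

# stage 3: project A onto the row space of the selected rows and report the
# relative Frobenius error next to the best rank-k error, as the driver logs
R = A[idx, :]
relerr = np.linalg.norm(A - A @ np.linalg.pinv(R) @ R, 'fro') / np.linalg.norm(A, 'fro')
best_rank_k = np.sqrt(np.sum(D[k:]**2)) / np.sqrt(np.sum(D**2))
print(relerr, best_rank_k)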
