Example #1
def run_stage3(params_dict):
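    # Reads a coordinate-format (row, col, value) matrix from text, builds a
    # SparseRowMatrix, and computes approximate leverage scores and sampling
    # probabilities with CX. This excerpt assumes time, numpy (np),
    # SparseRowMatrix, and CX are imported earlier in the original file.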
    
    input_matrix = params_dict.get('inputmatrix')
    #leverage_scores_file = params_dict.get('leveragescores')
    #p_score_file = params_dict.get('pscores')
    on_rows = params_dict.get('on_rows', False)
    sc = params_dict.get('sc')
    if on_rows:  # swap the row and column indices
        rows_assigned = sc.textFile(input_matrix).map(lambda x:x.split(',')).map(lambda x:(int(x[1]), int(x[0]), float(x[2])))
        leverage_scores_file = params_dict.get('rowleveragescores')
        p_score_file = params_dict.get('rowpscores')
    else:
        rows_assigned = sc.textFile(input_matrix).map(lambda x:x.split(',')).map(lambda x:(int(x[0]), int(x[1]), float(x[2])))
        leverage_scores_file = params_dict.get('columnleveragescores')
        p_score_file = params_dict.get('columnpscores')

    row_shape = rows_assigned.map(lambda x:x[0]).max() + 1 
    column_shape = rows_assigned.map(lambda x:x[1]).max() + 1

    matrix_A = SparseRowMatrix(rows_assigned, 'output', row_shape, column_shape, True)
    start = time.time()
    cx = CX(matrix_A)
    k = 5
    q = 3
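    # k: target rank used in the definition of the leverage scores;
    # q: number of iterations used to approximate them (cf. --rank/--niters in Example #6).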
    lev, p = cx.get_lev(k, axis=0, q=q)
    end = time.time()
    np.savetxt(leverage_scores_file, np.array(lev))
    np.savetxt(p_score_file, np.array(p))
    print 'lev score ', lev, len(lev)
    print 'p is ', p, len(p)
    print 'time ', end-start
Example #2
# This excerpt assumes ast, numpy (np), and a SparkContext (sc) are imported or
# created earlier in the original file.
def parse(string):
    s = str(string)
    val = ast.literal_eval(s)
    return val[0], (np.array(val[1][0]), np.array(val[1][1]))


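# Each input line is the text form of (row_index, (column_indices, values)),
# which parse() rebuilds into numpy arrays.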
data = sc.textFile('/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/ncolumns_matrix').map(lambda x:parse(x))
#row_shape = 131048
#column_shape = 8258911
#131047 8258910
row_shape = 8258911
column_shape = 131048
#column_shape+=20

print data.take(1)

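# SparseRowMatrix(rdd, name, num_rows, num_cols, cache); the final flag is the
# cache option (cf. cache=False in Example #5).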
matrix_A = SparseRowMatrix(data,'output', row_shape,column_shape, False)
cx = CX(matrix_A)
k = 2
q = 2
lev, p = cx.get_lev(k, axis=0, q=q)
#end = time.time()
leverage_scores_file='/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/columns_row_leverage_scores_logged'
p_score_file='/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/columns_p_scores_logged'
np.savetxt(leverage_scores_file, np.array(lev))
np.savetxt(p_score_file, np.array(p))



"""
def parse_func(x):
    stringed  = str(x)
Example #3
File: ml.py Project: rustandruin/sc-2015
def _indexed(grouped_list):
    # Split a list of (column_index, value) pairs into parallel index and value arrays.
    indexed = []
    values = []
    for tup in grouped_list:
        indexed.append(tup[0])
        values.append(tup[1])
    return np.array(indexed), np.array(values)


filename = "/global/u2/m/msingh/sc_paper/new_version/sc-2015/cx_spark/data/movielens/ml-10M100K/ratings.dat"
#filename = '/global/u2/m/msingh/sc_paper/new_version/sc-2015/cx_spark/data/ml-100k/u.data'
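# parse (defined earlier in ml.py, not shown here) turns each ratings line into
# a (row, column, value) triple with integer indices.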
data = sc.textFile(filename).map(lambda x: parse(x))
row_shape = data.map(lambda x: x[0]).max() + 1
column_shape = data.map(lambda x: x[1]).max() + 1
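# Group each row's (column, value) pairs and pack them into index/value arrays
# via _indexed, yielding one sparse row per record.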
drdd = data.map(lambda x: (x[0], (x[1], x[2]))).groupByKey().map(
    lambda x: (x[0], list(x[1]))).map(lambda x: (x[0], _indexed(x[1])))
print drdd.take(1)
#prep_rdd = prepare_matrix(data)
matrix_A = SparseRowMatrix(drdd, 'output', row_shape, column_shape, True)
cx = CX(matrix_A)
k = 2
q = 2
lev, p = cx.get_lev(k, axis=0, q=q)
#end = time.time()
leverage_scores_file = '/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/movielens_leverage_scores_full1'
p_score_file = '/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/movielens_p_scores_full1'
np.savetxt(leverage_scores_file, np.array(lev))
np.savetxt(p_score_file, np.array(p))

"""
rows_rdd = data.map(lambda x:str(x)).map(lambda x:x.split(',')).map(lambda x:(int(x[0]), int(x[1]), float(x[2])))
sorted_Rdd = prepare_matrix(rows_rdd)
sorted_Rdd.saveAsTextFile('/scratch1/scratchdirs/msingh/sc_paper/experiments/striped_data/rows_matrix')
Example #4
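# Excerpt from a unittest TestCase; sparse_matrix_rdd and sparse_matrix_rdd2 are
# RDD fixtures created elsewhere in the test module.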
    def setUp(self):
        self.matrix_A = SparseRowMatrix(sparse_matrix_rdd, 'test_data', 1000, 100)
        self.matrix_A2 = SparseRowMatrix(sparse_matrix_rdd2, 'test_data', 100, 1000)
Example #5
from pyspark import SparkContext

from utils import prepare_matrix
import os
import logging.config
# MSIMatrix, SparseRowMatrix, and CX come from the project's own modules
# (their imports are not shown in this excerpt).
logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
logger = logging.getLogger(__name__)

sc = SparkContext()
logger.info("job_cx starting with appId=" + sc._jsc.sc().applicationId())
prefix = 'hdfs:///sc-2015/'
name = 'Lewis_Dalisay_Peltatum_20131115_hexandrum_1_1-masked'
logger.info("job_cx loading RDD from %s" % name)
#dataset = MSIDataset.load(sc, 'meta/' + name, prefix + name).cache()
#msimat = MSIMatrix.from_dataset(sc, dataset)
#msimat.save(prefix, 'meta', name)
msimat = MSIMatrix.load(sc, prefix, 'meta', name)
logger.info("shape: %s" % (msimat.shape, ))
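# prepare_matrix (from utils) reshapes the nonzero (row, col, value) entries
# into the per-row layout that SparseRowMatrix consumes.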
mat = prepare_matrix(msimat.nonzeros).cache()
mat = SparseRowMatrix(mat,
                      "msimat",
                      msimat.shape[0],
                      msimat.shape[1],
                      cache=False)
cx = CX(mat)
k = 32
q = 5
lev, p = cx.get_lev(k, axis=0, q=q)
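# Persist the approximate leverage scores and sampling probabilities to a pickle file.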
with open('dump.pkl', 'wb') as outf:
    import cPickle as pickle
    data = {'lev': lev, 'p': p}
    pickle.dump(data, outf)
Example #6
def main(argv):
    logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
    logger = logging.getLogger('')  #using root

    parser = argparse.ArgumentParser(description='Getting parameters.',
                                     prog='run_cx.sh')

    parser.add_argument(
        'dataset',
        type=str,
        help='dataset.txt stores the input matrix to run CX on; \
           dataset_U.txt stores left-singular vectors of the input matrix (only needed for -t); \
           dataset_D.txt stores singular values of the input matrix (only needed for -t)'
    )
    parser.add_argument('--dims',
                        metavar=('m', 'n'),
                        type=int,
                        nargs=2,
                        required=True,
                        help='size of the input matrix')
    parser.add_argument('--sparse',
                        dest='sparse',
                        action='store_true',
                        help='whether the data is sparse')
    parser.add_argument('--hdfs',
                        dest='file_source',
                        default='local',
                        action='store_const',
                        const='hdfs',
                        help='load dataset from HDFS')
    parser.add_argument(
        '-k',
        '--rank',
        metavar='targetRank',
        dest='k',
        default=5,
        type=int,
        help=
        'target rank parameter in the definition of leverage scores; this value should not be greater than m or n'
    )
    parser.add_argument('-r',
                        metavar='numRowsToSelect',
                        default=20,
                        type=int,
                        help='number of rows to select in CX')
    parser.add_argument(
        '-q',
        '--niters',
        metavar='numIters',
        dest='q',
        default=2,
        type=int,
        help='number of iterations to run in approximation of leverage scores')
    parser.add_argument(
        '--deterministic',
        dest='scheme',
        default='randomized',
        action='store_const',
        const='deterministic',
        help=
        'use deterministic scheme instead of randomized when selecting rows')
    parser.add_argument('-c',
                        '--cache',
                        action='store_true',
                        help='cache the dataset in Spark')
    parser.add_argument('-t',
                        '--test',
                        action='store_true',
                        help='compute accuracies of the returned solutions')
    parser.add_argument('-s',
                        '--save_logs',
                        action='store_true',
                        help='save Spark logs')
    parser.add_argument(
        '--nrepetitions',
        metavar='numRepetitions',
        default=1,
        type=int,
        help=
        'number of times to stack matrix vertically in order to generate large matrices'
    )
    parser.add_argument('--npartitions',
                        metavar='numPartitions',
                        default=280,
                        type=int,
                        help='number of partitions in Spark')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--row',
                       dest='axis',
                       default=0,
                       action='store_const',
                       const=0,
                       help='compute row leverage scores')
    group.add_argument('--column',
                       dest='axis',
                       default=0,
                       action='store_const',
                       const=1,
                       help='compute column leverage scores')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--leverage-scores-only',
                       dest='stage',
                       default='full',
                       action='store_const',
                       const='leverage',
                       help='return approximate leverage scores only')
    group.add_argument(
        '--indices-only',
        dest='stage',
        default='full',
        action='store_const',
        const='indices',
        help='return approximate leverage scores and selected row indices only'
    )

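    # Example invocation (hypothetical dataset name and dimensions):
    #   run_cx.sh mydataset --dims 1000 100 -k 5 -r 20 -q 2 --column --cache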
    if len(argv) > 0 and argv[0] == 'print_help':
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args(argv)
    (m, n) = args.dims

    # validating
    if args.k > m or args.k > n:
        raise ValueError(
            'Rank parameter({0}) should not be greater than m({1}) or n({2})'.
            format(args.k, m, n))

    if args.npartitions > m or args.npartitions > n:
        args.npartitions = min(m, n)

    if args.test and args.nrepetitions > 1:
        raise OptionError(
            'Do not use the test mode (-t) on replicated data (numRepetitions > 1)!'
        )

    if args.axis == 0:
        raise OptionError('Need to implement transpose first!')

    if args.sparse and args.file_source == 'hdfs':
        raise OptionError('Not yet!')

    # print parameters
    print_params(args, logger)

    # TODO: move these into a configuration file
    dire = '../data/'
    hdfs_dire = 'data/'
    logs_dire = 'file:///home/jiyan/cx_logs'

    # instantiating a SparkContext
    if args.save_logs:
        conf = SparkConf().set('spark.eventLog.enabled',
                               'true').set('spark.eventLog.dir', logs_dire)
    else:
        conf = SparkConf()
    sc = SparkContext(appName="cx_exp", conf=conf)

    # loading data
    if args.file_source == 'hdfs':
        A_rdd = sc.textFile(hdfs_dire + args.dataset + '.txt',
                            args.npartitions)  #loading dataset from HDFS
    else:
        A = np.loadtxt(dire + args.dataset +
                       '.txt')  # loading dataset from local disk
        if args.sparse:
            sA = to_sparse(A)
            A_rdd = sc.parallelize(sA, args.npartitions)
        else:
            A_rdd = sc.parallelize(A.tolist(), args.npartitions)

    if args.axis == 0:
        pass  # get rdd from the transpose of A

    t = time.time()
    if args.sparse:
        matrix_A = SparseRowMatrix(
            A_rdd, args.dataset, m, n,
            args.cache)  # creating a SparseRowMatrix instance
    else:
        matrix_A = RowMatrix(
            A_rdd, args.dataset, m, n, args.cache,
            repnum=args.nrepetitions)  # creating a RowMatrix instance

    cx = CX(matrix_A)

    lev, p = cx.get_lev(
        args.k, q=args.q
    )  # approximate row leverage scores; lev has one entry per row of the matrix

    if args.test:
        if args.file_source != 'local':
            A = np.loadtxt(dire + args.dataset + '.txt')
        U, D, V = np.linalg.svd(A, 0)

        if args.axis == 0:
            lev_exact = np.sum(U[:, :args.k]**2, axis=1)
        else:
            lev_exact = np.sum(V.T[:, :args.k]**2, axis=1)
        p_exact = lev_exact / args.k
        logger.info(
            'KL divergence between the estimation of leverage scores and the exact one is {0}'
            .format(scipy.stats.entropy(p_exact, p)))
    logger.info('finished stage 1')
    logger.info('----------------------------------------------')

    if args.stage == 'indices' or args.stage == 'full':
        idx = cx.comp_idx(args.scheme,
                          args.r)  # choosing rows based on the leverage scores
        # optionally store the indices to a file
        logger.info('finished stage 2')
        logger.info('----------------------------------------------')

    if args.stage == 'full':
        rows = cx.get_rows(
        )  # getting back the selected rows based on the idx computed above (this might give you different results if you rerun the above)

        if args.test:
            diff = cx.comp_err()  # computing the relative error
            logger.info('relative error ||A-CX||/||A|| is {0}'.format(
                diff / np.linalg.norm(A, 'fro')))
            logger.info(
                'relative error of the best rank-{0} approximation is {1}'.
                format(args.k,
                       np.sqrt(np.sum(D[args.k:]**2)) / np.sqrt(np.sum(D**2))))
        logger.info('finished stage 3')

    rtime = time.time() - t
    logger.info('time elapsed: {0} second'.format(rtime))