Exemplo n.º 1
0
def second_mapper(data):
    """Mix incoming rows in batches and re-emit them under random keys.

    Values are buffered until ``maxlocal`` of them are held (``maxlocal`` is
    read from the environment). Every full buffer, and the final partial one,
    is handed to ``localQoutput``; each row it produces is yielded under a
    freshly drawn random integer key.

    Args:
        data: iterable of ``(key, value)`` pairs. Incoming keys are ignored;
            every ``value`` must have exactly ``ncols`` entries, where
            ``ncols`` is read from ``gopts``.

    Yields:
        ``(key, row)`` pairs where ``key`` is
        ``random.randint(0, 4000000000)`` and ``row`` is an item produced by
        ``localQoutput``.

    Raises:
        ValueError: if a value does not have exactly ``ncols`` entries.
    """
    n = gopts.getintkey('ncols')
    maxlocal = int(os.getenv('maxlocal'))
    acquiring = 'acquiring data with ncols=%i' % (n)

    def batches():
        # Group the input into lists of at most `maxlocal` rows. The status
        # is refreshed each time accumulation (re)starts, i.e. before the
        # first read and again after the consumer has finished a full batch.
        rows = []
        util.setstatus(acquiring)
        for _, value in data:
            # Explicit check rather than `assert`, so it survives `python -O`.
            if len(value) != n:
                raise ValueError(
                    'expected a row with %i columns, got %i' % (n, len(value)))
            rows.append(value)
            if len(rows) >= maxlocal:
                yield rows
                rows = []
                util.setstatus(acquiring)
        if rows:
            # Trailing partial batch.
            yield rows

    for batch in batches():
        # Count every batch, including the trailing partial one, so the
        # counter reflects all rows that were acquired.
        dumbo.util.incrcounter('Program', 'rows acquired', len(batch))
        for row in localQoutput(batch):
            # Draw the key before yielding; the random key decides where the
            # row goes in the next stage.
            key = random.randint(0, 4000000000)
            yield key, row
Exemplo n.º 2
0
def first_mapper(data):
    """Discard the input and emit rows of randomly generated matrices.

    The mapper builds an ``ncols``-by-``ncols`` upper-triangular matrix of
    ones scaled by ``1/sqrt(nrows/ncols)``. It then repeats ``maprows/ncols``
    times: take the Q factor of a random Gaussian matrix, multiply it by that
    triangular matrix, and yield every row of the product under a random
    integer key.

    ``ncols`` comes from ``gopts``; ``nrows`` and ``maprows`` are read from
    the environment.
    """
    print >>sys.stderr, os.uname()[1], "is a mapper"

    # Hadoop complains when a mapper leaves its input unread, so drain it.
    for _key, _val in data:
        pass

    ncols = gopts.getintkey('ncols')
    nrows = int(os.getenv('nrows'))
    nsteps = int(os.getenv('maprows'))/ncols
    scale = float(nrows)/float(ncols)
    util.setstatus(
        "generating %i-by-%i R matrix with scale factor %i/%i=%s"%(
        ncols, ncols, nrows, ncols, scale))

    upper = numpy.triu(numpy.ones((ncols, ncols)))/math.sqrt(scale)

    # `step` is 1-based so it can be used directly in the progress messages.
    for step in xrange(1, nsteps + 1):
        util.setstatus(
            'step %i/%i: generating local %i-by-%i Q matrix'%(
            step, nsteps, ncols, ncols))
        # Only the Q factor of the factorization is needed.
        orth, _ = numpy.linalg.qr(numpy.random.randn(ncols, ncols))

        util.setstatus('step %i/%i: multiplying local matrix'%(step, nsteps))
        product = orth.dot(upper)

        util.setstatus(
            'step %i/%i: outputting %i rows'%(step, nsteps, product.shape[0]))
        for line in product:
            yield random.randint(0, 4000000000), util.array2list(line)
Exemplo n.º 3
0
def localQoutput(rows):
    """Left-multiply a batch of rows by a random Q factor and yield the rows.

    ``rows`` is converted to a numpy array with ``localm`` rows. A square
    ``localm``-by-``localm`` Q factor is taken from the QR factorization of a
    random Gaussian matrix, and each row of ``Q * A`` is yielded after
    conversion with ``util.array2list``.
    """
    util.setstatus('converting to numpy array')
    block = numpy.array(rows)
    nlocal = block.shape[0]

    util.setstatus('generating local Q of size %i-by-%i'%(nlocal, nlocal))
    # Keep only the Q factor; the R factor is not used.
    orth, _ = numpy.linalg.qr(numpy.random.randn(nlocal, nlocal))

    util.setstatus(
        'multiplying %i-by-%i A by %i-by-%i Q'%(
        nlocal, block.shape[1], nlocal, nlocal))
    mixed = orth.dot(block)

    util.setstatus('outputting')
    for line in mixed:
        yield util.array2list(line)
Exemplo n.º 4
0
 def setstatus(self, msg):
     """Forward ``msg`` unchanged to the non-method ``setstatus`` function.

     The bare name ``setstatus`` in the body resolves outside the class
     (a method body never sees class attributes unqualified), so this does
     not call itself. Nothing is returned.
     """
     # NOTE(review): presumably the target is the job's status-reporting
     # helper; confirm where the global `setstatus` is defined or imported.
     setstatus(msg)