Example #1
def estimate(master, input, center, k, iterations, map_reader=chain_reader):
    """
    Optimize k-clustering for `iterations` iterations with cluster
    center definitions as given in `center`.
    """
    job = master.new_job(name='k-clustering_init',
                         input=input,
                         map_reader=map_reader,
                         map_init=map_init,
                         map=random_init_map,
                         combiner=estimate_combiner,
                         reduce=estimate_reduce,
                         params=Params(k=k, seed=None, **center),
                         nr_reduces=k)

    centers = [(i, c) for i, c in result_iterator(job.wait())]
    job.purge()

    for j in range(iterations):
        job = master.new_job(name='k-clustering_iteration_%s' % (j, ),
                             input=input,
                             map_reader=map_reader,
                             map=estimate_map,
                             combiner=estimate_combiner,
                             reduce=estimate_reduce,
                             params=Params(centers=centers, **center),
                             nr_reduces=k)

        centers = [(i, c) for i, c in result_iterator(job.wait())]
        job.purge()

    return centers
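
The function above only defines the driver loop; a minimal calling sketch is shown below. The master address, the input tag and, in particular, the keys inside `center` are illustrative assumptions: the real `center` dict has to match whatever random_init_map, estimate_map and estimate_reduce (not shown here) read from params.

from disco.core import Disco

master = Disco('disco://localhost')                 # assumed master address
center = {                                          # hypothetical center callbacks
    'create': lambda point, weight: (point, weight),
    'update': lambda c, point: c,
    'dist': lambda c, point: 0.0,
}
centers = estimate(master, ['tag://kcluster:points'], center, k=8, iterations=10)
for cluster_id, c in centers:
    print cluster_id, c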
Example #2
 def _assert_csv_reader(self, fields, values, expected):
     stream = StringIO.StringIO(values)
     params = Params()
     params.csv_fields = fields
     params.csv_dialect = csv.excel_tab
     actual = csv_reader(stream, None, None, params)
     ok_(isinstance(actual, types.GeneratorType))
     eq_(list(actual), expected)
Example #3
def load_one_dim(master, input, config_path, nr_maps=1, nr_reduces=1,\
                 load_method=offdimetlmr, dimnames=repr([]), \
                 go_live=1, profile=False):
	dim_job = master.new_job(
		name = 'dim',
		input = input,
		map_init = load_method.dim_map_init,
		map_reader = load_method.map_reader,
		map = load_method.dim_map_func,
		partition = load_method.dim_partition_func,
		combiner = load_method.dim_combiner_func,
		reduce = load_method.dim_reduce_func,
		scheduler = {'max_cores': nr_maps},
		nr_reduces = nr_reduces,
		required_modules=[('config', config_path)],
		profile = profile,
		status_interval = 1000000,
		params = Params(count=0, dimnames=dimnames, \
	                        nr_maps=nr_maps, nr_reduces=nr_reduces)
	)
	results = dim_job.wait()
	shelvedb_paths = []
	if results is not None:
		for key, value in result_iterator(results):
			shelvedb_paths.append(key)
		if go_live == 1:
			load_method.golive(config, shelvedb_paths)
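
A hedged sketch of invoking load_one_dim: the master address, input tag and config path are placeholders, and offdimetlmr (the default load_method) is assumed to come from the surrounding ETL project.

from disco.core import Disco

master = Disco('disco://localhost')                 # assumed master address
load_one_dim(master, ['tag://etl:dim_rows'], '/srv/etl/config.py',
             nr_maps=4, nr_reduces=4, go_live=1)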
Example #4
class InitTestCase(DiscoJobTestFixture, DiscoTestCase):
    inputs = range(10)
    params = Params(x=10)
    sort = False

    def getdata(self, path):
        return 'skipthis\n' + ('%s\n' % path) * 10

    @staticmethod
    def map_init(input_iter, params):
        input_iter.next()
        params.x += 100

    @staticmethod
    def map(e, params):
        return [(e, int(e) + params.x)]

    @staticmethod
    def reduce_init(input_iter, params):
        params.y = 1000

    @staticmethod
    def reduce(iter, out, params):
        for k, v in iter:
            out.add(k, int(v) + params.y)

    def runTest(self):
        results = list(self.results)
        for k, v in results:
            self.assertEquals(int(k) + 1110, int(v))
        self.assertEquals(len(results), 100)
Example #5
File: job.py  Project: 0scarLi/inferno
    def __init__(self, rule, settings, urls=None):
        self.job_options = JobOptions(rule, settings)
        self.rule = rule
        self.settings = settings
        rule_params = dict(rule.params.__dict__)
        self.disco, self.ddfs = get_disco_handle(
            rule_params.get('server', settings.get('server')))
        rule_params.update(settings)
        self.params = Params(**rule_params)
        self.urls = urls

        try:
            # attempt to allow for an overridden worker class from the settings file or rule
            if rule.worker:
                worker = rule.worker
            else:
                worker_mod, dot, worker_class = settings.get(
                    'worker').rpartition('.')
                mod = __import__(worker_mod, {}, {}, worker_mod)
                worker = getattr(mod, worker_class)()
            self.job = Job(name=rule.name,
                           master=self.disco.master,
                           worker=worker)
        except Exception as e:
            log.warn(
                "Error instantiating worker: %s %s - loading default worker" %
                (settings.get('worker'), e))
            self.job = Job(name=rule.name, master=self.disco.master)
        self.full_job_id = None
        self.jobinfo = None
        self._notify(JOB_START)
Example #6
def run(program, jobclass, *inputs):
    """Usage: jobclass [-n name] [--save] [--sort] [--profile] [--partitions P] [--sched_max_cores C] [--status_interval I] [input ...]

    Create an instance of jobclass and run it.
    Input urls are specified as arguments or read from stdin.
    """
    from disco.core import Params
    from disco.util import reify

    def maybe_list(seq):
        return seq[0] if len(seq) == 1 else seq

    name = program.options.name or jobclass.split('.')[-1]
    input = inputs or [
        maybe_list(line.split()) for line in fileinput.input(inputs)
    ]
    job = reify(jobclass)(program.disco, name)

    try:
        params = job.params
    except AttributeError:
        params = Params()
    params.__dict__.update(**dict(program.options.params))

    job.run(input=input, **program.option_parser.jobdict)
    print job.name
Example #7
 def _assert_reduce(self, data, expected, **kwargs):
     # turn disco_debug on for more code coverage
     if kwargs is None:
         kwargs = dict()
     kwargs['disco_debug'] = True
     params = Params(**kwargs)
     actual = keyset_reduce(data, params)
     ok_(isinstance(actual, types.GeneratorType))
     eq_(list(actual), expected)
Example #8
class Grep(Job):
    map = nop_map
    params = Params(pattern=None)

    def map_reader(fd, size, url, params):
        import re
        if params.pattern:
            pattern = re.compile(params.pattern)
            for line in fd:
                if pattern.match(line):
                    yield url, line
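
A hedged sketch of running the Grep job defined above; the input tag and pattern are placeholders, and Params/result_iterator are assumed to come from disco.core as in the other examples.

from disco.core import Params, result_iterator

job = Grep()
job.run(input=['tag://example:logs'],
        params=Params(pattern=r'ERROR'))
for url, line in result_iterator(job.wait()):
    print url, line.rstrip()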
Example #9
def predict(input, loglikelihoods, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])
    job = Job(name='naive_bayes_predict')
    job.run(input=input,
            map_reader=map_reader,
            map=predict_map,
            params=Params(loglikelihoods=loglikelihoods,
                          ys=ys,
                          splitter=splitter),
            clean=False)
    return job.wait()
Example #10
 def test_keyset_multiplier(self):
     params = Params()
     params.keysets = {
         'last_name_keyset':
         dict(
             key_parts=['_keyset', 'last_name'],
             value_parts=['count'],
         ),
         'first_name_keyset':
         dict(
             key_parts=['_keyset', 'first_name'],
             value_parts=['count'],
         )
     }
     data = [{
         'first_name': 'Willow',
         'last_name': 'Harvey'
     }, {
         'first_name': 'Noam',
         'last_name': 'Clarke'
     }]
     expected = [{
         'first_name': 'Willow',
         'last_name': 'Harvey',
         '_keyset': 'first_name_keyset'
     }, {
         'first_name': 'Willow',
         'last_name': 'Harvey',
         '_keyset': 'last_name_keyset'
     }, {
         'first_name': 'Noam',
         'last_name': 'Clarke',
         '_keyset': 'first_name_keyset'
     }, {
         'first_name': 'Noam',
         'last_name': 'Clarke',
         '_keyset': 'last_name_keyset'
     }]
     actual = keyset_multiplier(data, None, None, params)
     ok_(isinstance(actual, types.GeneratorType))
     eq_(list(actual), expected)
Example #11
class LineChunker(Job):
    params = Params(ddfs_master=None, tag=None)

    def _map_input_stream(fd, size, url, params):
        from disco.ddfs import DDFS
        tag = params.tag or 'disco:chunks:%s' % Task.jobname
        master = params.ddfs_master or Task.master
        yield url, DDFS(master).chunk(tag, [url])
    map_input_stream = [_map_input_stream]

    def map(entry, params):
        yield entry
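
One possible way to drive LineChunker (a sketch): the input url and tag are placeholders, and the Params passed to run() replaces the class-level default, so both ddfs_master and tag are supplied again.

from disco.core import Params, result_iterator

job = LineChunker()
job.run(input=['http://example.com/big.txt'],
        params=Params(ddfs_master=None, tag='disco:chunks:mydata'))
for url, chunked in result_iterator(job.wait()):
    print url, chunked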
Example #12
    def __init__(self, master, name, index, method, arg, streams, reduce,
                 **kwargs):
        super(DiscoDBIterator, self).__init__(name=name, master=master)
        self.input = [[
            '%s!%s/%s' % (url, method, arg) if method else url for url in urls
        ] for urls in index.ichunks]
        self.map_input_stream = [scheme_discodb.input_stream] + streams
        self.params = Params(**kwargs)

        if reduce:
            self.partitions = len(self.master.nodeinfo())
            self.reduce = reduce
Example #13
def predict(master, input, center, centers, map_reader=chain_reader):
    """
    Predict the closest clusters for the datapoints in input.
    """
    job = master.new_job(name='kcluster_predict',
                         input=input,
                         map_reader=map_reader,
                         map=predict_map,
                         params=Params(centers=centers, **center),
                         nr_reduces=0)

    return job.wait()
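
A sketch of the prediction step, assuming master, center and centers were produced exactly as in the estimate() sketch under Example #1; what predict_map emits per data point is not shown above, so the key/value names below are placeholders.

from disco.core import result_iterator

results = predict(master, ['tag://kcluster:new_points'], center, centers)
for point, cluster in result_iterator(results):
    print point, cluster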
Example #14
 def setUp(self):
     sys.stdout = self.capture_stdout = cStringIO.StringIO()
     self.params = Params()
     self.params.keysets = {
         'last_name_keyset':
         dict(
             key_parts=['_keyset', 'last_name'],
             value_parts=['count'],
         ),
         'first_name_keyset':
         dict(
             key_parts=['_keyset', 'first_name'],
             value_parts=['count'],
         )
     }
Example #15
    def __init__(self, master, name, dataset):
        super(Indexer, self).__init__(name=name, master=master)
        self.input = dataset.input
        self.map_input_stream = dataset.stream
        self.map_reader = dataset.parser
        self.map = dataset.demuxer
        self.partition = dataset.balancer
        self.profile = dataset.profile
        self.partitions = dataset.nr_ichunks
        self.required_files = dataset.required_files
        self.params = Params(n=0, unique_items=dataset.unique_items)

        if self.partitions:
            self.reduce = nop_reduce
            self.reduce_output_stream = [reduce_output_stream, discodb_output]
        else:
            self.map_output_stream = [map_output_stream, discodb_output]
Example #16
class ParamsTestCase(DiscoJobTestFixture, DiscoTestCase):
    inputs = range(10)
    params = Params(x=5, f1=fun1, f2=fun2, now=datetime.now())
    sort = False

    def getdata(self, path):
        return '\n'.join([path] * 10)

    @staticmethod
    def map(e, params):
        return [(e, params.f1(int(e), params.x))]

    @staticmethod
    def reduce(iter, out, params):
        for k, v in iter:
            out.add(k, params.f2(int(v)))

    def runTest(self):
        for k, v in self.results:
            self.assertEquals(fun2(int(k) + 5), int(v))
Example #17
class PartialTestCase(DiscoJobTestFixture, DiscoTestCase):
    @property
    def inputs(self):
        return [str(x) for x in range(self.num_workers)]

    def getdata(self, path):
        return '1 _ 0 \n'

    map = partial(map, extra='a')
    combiner = partial(combiner, extra='b')
    reduce = partial(reduce, extra='c')
    map_init = partial(init, extra='d')
    reduce_init = partial(init, extra='e')
    map_reader = partial(reader, extra='f')
    map_writer = partial(writer, extra='g')
    reduce_reader = partial(reader, extra='h')
    reduce_writer = partial(writer, extra='i')
    params = Params(foo=partial(foo, extra='z'))

    def runTest(self):
        for k, v in self.results:
            self.assertEquals(k, '_fazbghczi')
Example #18
def load_fact(master, input, config_path, nr_maps=1, nr_reduces=1, \
              load_method=offdimetlmr, profile=False):
	#disco = Disco("disco://"+host)
	fact_starttime = time.time()
	fact_job = master.new_job(
		name = 'fact',
		input = input,
		map_init = load_method.fact_map_init,
		map_reader = load_method.map_reader,
		map = load_method.fact_map_func,
		combiner = load_method.fact_combiner_func,
		scheduler = {'max_cores': nr_maps},
		nr_reduces = nr_reduces,
		required_modules=[('config', config_path),],
		status_interval = 1000000,
		profile = profile,
		params = Params(totalcopytime=0, nr_maps=nr_maps, \
	                        nr_reduces=nr_reduces)
	)
	results = fact_job.wait()
	#results = fact_job.wait(show=True, poll_interval = 100, timeout = 10*3600)
	fact_endtime = time.time()
	print "Time of loading facts: %f seconds" % (fact_endtime-fact_starttime)
Example #19
File: admm.py  Project: marchon/phd
class ADMM(Job):
    # Sketch of one ADMM round: fi, argmin, dot, RecordIter, pnorm and the
    # previous iterates params.x / params.y are supplied by the surrounding
    # project and are not defined in this snippet.
    def map_reader(fd, url, size, params):
        i = Task.id
        z = params.z
        yi = params.y[i] + params.rho * (params.x[i] - z)
        for A, b in RecordIter(fd):   # the original iterated a bare `iter`
            xi = argmin(fi(x) + dot(yi, x - z) + (params.rho / 2.) * dot(x - z, x - z))
            yield str(i), (xi, yi)

    def reduce(iter, params):
        # average the local estimates into the new consensus variable
        zhat = 0.
        for n, (i, (xi, yi)) in enumerate(iter, 1):
            zhat += xi + yi / float(params.rho)
        yield zhat / n

# first run a job to put records into (A, b) format
# and also calculate a first z

if __name__ == '__main__':
    # eta_conv, eta_feas and n (the problem size) must come from the setup job
    # above; the objective was left blank in the original source.
    params = Params(rho=1., z=0., objective=None)

    while True:
        job = ADMM()          # the run()/input step is omitted in the original
        results = job.wait()
        z = params.z          # keep the previous iterate for the residual test
        params.z = list(RecordIter(job.results()))[0]
        # stop once both the dual and the primal residuals are small enough
        if params.rho * sqrt(n) * pnorm(z - params.z, p=2) <= eta_conv:
            if sum(dot(xi - params.z, xi - params.z)
                   for xi, yi in RecordIter(results)) <= (eta_feas ** 2):
                break
Example #20
File: rule.py  Project: 0scarLi/inferno
    def __init__(
            self,
            # name, on/off
            name='_unnamed_',
            run=True,

            # throttle
            min_blobs=1,
            max_blobs=sys.maxint,
            partitions=200,
            partition_function=crc_partition,
            scheduler=None,
            worker=None,
            time_delta=None,
            newest_first=True,

            # archive
            archive=False,
            archive_tag_prefix='processed',

            # nuke
            nuke=False,

            # map
            map_init_function=lambda x, y: x,
            map_function=keyset_map,
            map_input_stream=chunk_csv_stream,
            map_output_stream=(map_output_stream, disco_output_stream),

            # combine
            combiner_function=None,

            # reduce
            reduce_function=keyset_reduce,
            reduce_output_stream=(reduce_output_stream, disco_output_stream),

            # result
            # result_iterator_override -->
            #   see inferno.lib.disco_ext.sorted_iterator for signature
            result_iterator_override=None,
            result_processor=keyset_result,
            result_tag=None,
            result_tag_suffix=True,
            save=False,
            sort=True,
            sort_buffer_size='10%',
            sorted_results=True,

            # keysets
            keysets=None,
            key_parts=None,
            value_parts=None,
            column_mappings=None,
            table=None,
            keyset_parts_preprocess=None,
            parts_postprocess=None,

            # input
            day_range=0,
            day_offset=0,
            day_start=None,
            source_tags=None,
            source_urls=None,

            # other
            rule_init_function=None,
            rule_cleanup=None,
            parts_preprocess=None,
            field_transforms=None,
            required_files=None,
            required_modules=None,

            # notifications --> notify_addresses must be list of addresses
            notify_on_fail=False,
            notify_on_success=False,
            notify_addresses=None,
            **kwargs):

        self.qualified_name = name
        if kwargs:
            self.params = Params(**kwargs)
        else:
            self.params = Params()

        if not scheduler:
            scheduler = {'force_local': False, 'max_cores': 200}

        # name, on/off
        self.run = run
        self.name = name

        # throttle
        self.min_blobs = min_blobs
        self.max_blobs = max_blobs
        self.partitions = partitions
        self.partition_function = partition_function
        self.scheduler = scheduler
        self.time_delta = time_delta
        if self.time_delta is None:
            self.time_delta = {'minutes': 5}
        self.newest_first = newest_first
        self.worker = worker

        # archive
        self.archive = archive
        self.archive_tag_prefix = archive_tag_prefix

        # nuke
        self.nuke = nuke

        # map
        self.map_init_function = map_init_function
        self.map_function = map_function
        self.map_input_stream = map_input_stream
        self.map_output_stream = map_output_stream
        self.combiner_function = combiner_function

        # reduce
        self.reduce_function = reduce_function
        self.reduce_output_stream = reduce_output_stream

        # result
        self.result_processor = result_processor
        self.result_tag = result_tag
        self.result_tag_suffix = result_tag_suffix
        self.save = save
        self.sort = sort
        self.sort_buffer_size = sort_buffer_size
        if result_iterator_override:
            self.result_iterator = result_iterator_override
        elif self.sort and sorted_results:
            self.result_iterator = sorted_iterator
        else:
            self.result_iterator = result_iterator

        # input
        if isinstance(source_tags, basestring):
            source_tags = [source_tags]
        self.day_range = day_range
        self.day_offset = day_offset
        self.day_start = day_start
        self.source_tags = source_tags or []

        # keysets
        keyset_dict = {}
        if keysets:
            for keyset_name, keyset_obj in keysets.items():
                keyset_dict[keyset_name] = keyset_obj.as_dict()
        else:
            keyset_dict['_default'] = Keyset(key_parts, value_parts,
                                             column_mappings, table,
                                             keyset_parts_preprocess,
                                             parts_postprocess).as_dict()
        self.params.keysets = keyset_dict

        self.params.parts_preprocess = parts_preprocess or []
        self.params.field_transforms = field_transforms or dict()

        # other
        self.rule_init_function = rule_init_function
        self.rule_cleanup = rule_cleanup
        self.required_modules = required_modules or []
        self.required_files = required_files or []
        self.notify_on_fail = notify_on_fail
        self.notify_on_success = notify_on_success
        self.notify_addresses = notify_addresses or []
        self.source_urls = source_urls
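
For reference, a hedged sketch of constructing one of these rules with a single implicit keyset; the class name InfernoRule, the source tag and the field names are assumptions based on the inferno project this __init__ comes from.

rule = InfernoRule(
    name='name_counts',
    source_tags=['incoming:name_data'],
    key_parts=['first_name', 'last_name'],
    value_parts=['count'],
    partitions=8,
)
print rule.params.keysets.keys()    # ['_default'], built from key/value parts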
Example #21
 def params_2(self):
     return Params(job=self.job_1.name)
Example #22
def estimate(input, ys, splitter=' ', map_reader=chain_reader):
    ys = dict([(id, 1) for id in ys])

    job = Job(name='naive_bayes_estimate')

    job.run(input=input,
            map_reader=map_reader,
            map=estimate_map,
            combiner=estimate_combiner,
            reduce=estimate_reduce,
            params=Params(ys=ys, splitter=splitter),
            clean=False)
    results = job.wait()

    total = 0
    # will include the items for which we'll be classifying,
    # for example if the dataset includes males and females,
    # this dict will include the keys male and female and the
    # number of times these have been observed in the train set
    items = {}

    # the number of times the classes have been observed.  For
    # example,  if the feature is something like tall or short, then the dict
    # will contain the total number of times we have seen tall and short.
    classes = {}

    # the number of times we have seen a class with a feature.
    pairs = {}

    for key, value in result_iterator(results):
        l = key.split(splitter)
        value = int(value)
        if len(l) == 1:
            if l[0] == '':
                total = value
            elif ys.has_key(l[0]):
                classes[l[0]] = value
            else:
                items[l[0]] = value
        else:
            pairs[key] = value

    # counts[key] = [[c,i], [not c, i], [c, not i], [not c, not i]]
    counts = {}
    for i in items:
        for y in ys:
            key = y + splitter + i
            counts[key] = [0, 0, 0, 0]
            if pairs.has_key(key):
                counts[key][0] = pairs[key]
            counts[key][1] = items[i] - counts[key][0]
            if not classes.has_key(y):
                counts[key][2] = 0
            else:
                counts[key][2] = classes[y] - counts[key][0]
            counts[key][3] = total - sum(counts[key][:3])

            # add pseudocounts
            counts[key] = map(lambda x: x + 1, counts[key])
    total += 4

    import math
    loglikelihoods = {}
    for key, value in counts.iteritems():
        l = key.split(splitter)
        if not loglikelihoods.has_key(l[0]):
            loglikelihoods[l[0]] = 0.0
        loglikelihoods[l[0]] += math.log(value[0] +
                                         value[2]) - math.log(value[1] +
                                                              value[3])
        loglikelihoods[key] = math.log(value[0]) - math.log(value[1])

    return loglikelihoods
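
A sketch of tying the two naive Bayes jobs together: estimate() above produces the log-likelihoods that the predict() job from Example #9 consumes. The input tags and class labels are placeholders.

from disco.core import result_iterator

ys = ['male', 'female']
loglikelihoods = estimate(['tag://nb:train'], ys)
results = predict(['tag://nb:test'], loglikelihoods, ys)
for key, value in result_iterator(results):
    print key, value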
Example #23
File: blas.py  Project: davin/disco
def dgemm(disco, transA, transB, m, n, k, alpha, A, B, beta, C, maxTotalBlocks=128):
	"""
	Compute general matrix multiplication alpha*op(A)*op(B) + beta*C in double precision where op(X) = X or transpose(X).
	@param transA A boolean value for transposing matrix A or not.
	@param transB A boolean value for transposing matrix B or not.
	@param m Number of rows of matrix op(A) and C.
	@param n Number of columns of matrix op(B) and C.
	@param k Number of columns of matrix op(A) and rows of matrix op(B).
	@param alpha Scalar multiplier for the matrix product A*B.
	@param beta Scalar multiplier for matrix C.
	@param A MatrixWrapper object encapsulating matrix A.
	@param B MatrixWrapper object encapsulating matrix B.
	@param C MatrixWrapper object encapsulating matrix C. If there is no C term, then pass in an empty wrapper, MatrixWrapper(), as a placeholder.
	@param disco A Disco instance.
	@param maxTotalBlocks Suggested number of matrix blocks to use for carrying out the multiplication. Ideally, this should equal the number of cores available in the cluster. The actual number of blocks is selected based on the size of the matrix.
	@return MatrixWrapper object encapsulating the resulting matrix.
	"""
	def _mapRowBlocks(e, params):
		from math import ceil
		from numpy import float64
		if type(e) == tuple:
			e = e[0]
		output = []
		elems = e.split(";")
		for elem in elems:
			i, j, val = map(float64, elem.split(","))
			if params.transA:
				i, j = j, i
			assert i < params.m, "row index %d exceeds matrix dimensions" % int(i)
			assert j < params.k, "col index %d exceeds matrix dimensions" % int(j)
			blockX = int(j / params.blockWidth)
			blockY = int(i / params.blockHeight)
			offsetY = ceil(params.blockHeight * blockY)
			val = params.alpha * val
			if val != 0.0:
				output += [(blockY*params.blocksPerRow+x, "%s,%d,%d,%.14f" % (params.matrixId, int(i-offsetY), int(j), val)) for x in range(0, params.blocksPerRow)]
		return output
		
	def _mapColBlocks(e, params):
		from math import ceil
		from numpy import float64
		if type(e) == tuple:
			e = e[0]
		output = []
		elems = e.split(";")
		for elem in elems:
			i, j, val = map(float64, elem.split(","))
			if params.transB:
				i, j = j, i
			assert i < params.k, "row index %d exceeds matrix dimensions" % int(i)
			assert j < params.n, "col index %d exceeds matrix dimensions" % int(j)
			blockX = int(j / params.blockWidth)
			offsetX = ceil(params.blockWidth * blockX)
			if val != 0.0:
				output += [(y*params.blocksPerRow+blockX, "%s,%d,%d,%.14f" % (params.matrixId, int(i), int(j-offsetX), val)) for y in range(0, params.blocksPerCol)]
		return output
		
	def _mapBlocks(e, params):
		from math import ceil
		from numpy import float64
		if type(e) == tuple:
			e = e[0]
		output = []
		elems = e.split(";")
		for elem in elems:
			i, j, val = map(float64, elem.split(","))
			assert i < params.m, "row index %d exceeds matrix dimensions" % int(i)
			assert j < params.n, "col index %d exceeds matrix dimensions" % int(j)
			blockX = int(j / params.blockWidth)
			blockY = int(i / params.blockHeight)
			offsetX = ceil(params.blockWidth * blockX)
			offsetY = ceil(params.blockHeight * blockY)
			val = params.beta*val
			if val != 0.0:
				output += [(blockY*params.blocksPerRow+blockX, "%s,%d,%d,%.14f" % (params.matrixId, int(i-offsetY), int(j-offsetX), val))]
		return output

	def nop_map(e, params):
		return [e]

	def _reduceMultiplyAndAdd(iter, out, params):
		from numpy import float64
		rows = {}
		cols = {}
		vals = {}
		maxColIdx = {}
		maxRowIdx = {}
		for blockId, s in iter:
			blockId = int(blockId)
			matrixId, rowIdx, colIdx, val = s.split(",")
			rowIdx = int(rowIdx)
			colIdx = int(colIdx)
			val = float64(val)
			if not rows.has_key(blockId):
				rows[blockId] = {}
				cols[blockId] = {}
				vals[blockId] = {}
				maxColIdx[blockId] = {}
				maxRowIdx[blockId] = {}
			if not rows[blockId].has_key(matrixId):
				rows[blockId][matrixId] = []
				cols[blockId][matrixId] = []
				vals[blockId][matrixId] = []
				maxColIdx[blockId][matrixId] = 0
				maxRowIdx[blockId][matrixId] = 0
			rows[blockId][matrixId].append(rowIdx)
			cols[blockId][matrixId].append(colIdx)
			vals[blockId][matrixId].append(val)
			maxColIdx[blockId][matrixId] = max(maxColIdx[blockId][matrixId], cols[blockId][matrixId][-1])
			maxRowIdx[blockId][matrixId] = max(maxRowIdx[blockId][matrixId], rows[blockId][matrixId][-1])
		# initialize sparse matrices
		from math import ceil
		from scipy.sparse import coo_matrix
		for blockId in rows.keys():
			# compute the index offset in the original matrix
			blockY = blockId / params.blocksPerRow
			blockX = blockId % params.blocksPerRow
			offsetY = ceil(params.blockHeight * blockY)
			offsetX = ceil(params.blockWidth * blockX)
			# compute matrix product
			if not vals[blockId].has_key('A') or not vals[blockId].has_key('B'):
				# skip multiplication since either block A or B is empty
				if vals[blockId].has_key('C'):
					# return beta*C
					P = coo_matrix((vals[blockId]['C'],(rows[blockId]['C'],cols[blockId]['C'])), dtype=float64, dims=(maxRowIdx[blockId]['C']+1, maxColIdx[blockId]['C']+1))
				else:
					P = None
			else:
				if vals[blockId].has_key('C'):
					m = max(maxRowIdx[blockId]['A'], maxRowIdx[blockId]['C']) + 1
					n = max(maxColIdx[blockId]['B'], maxColIdx[blockId]['C']) + 1
					C = coo_matrix((vals[blockId]['C'],(rows[blockId]['C'],cols[blockId]['C'])), dtype=float64, dims=(m,n))
				else:
					m = maxRowIdx[blockId]['A'] + 1
					n = maxColIdx[blockId]['B'] + 1
					C = coo_matrix(([],([],[])), dtype=float64, dims=(m,n))
				A = coo_matrix((vals[blockId]['A'],(rows[blockId]['A'],cols[blockId]['A'])), dtype=float64, dims=(m,max(maxColIdx[blockId]['A'], maxRowIdx[blockId]['B'])+1))
				B = coo_matrix((vals[blockId]['B'],(rows[blockId]['B'],cols[blockId]['B'])), dtype=float64, dims=(max(maxColIdx[blockId]['A'], maxRowIdx[blockId]['B'])+1, n))
				P = (A * B + C).tocoo()
			# map block indices into original indices
			if P is not None:
				start = 0
				while start < len(P.row):
					end = min(start+params.elemsPerLine, len(P.row))
					out.add(";".join(["%d,%d,%.14f" % (P.row[i]+offsetY, P.col[i]+offsetX, P.data[i]) for i in range(start,end)]), "")
					start = end

	# find the best way to partition matrix into blocks
	blocksPerRow, blocksPerCol = _partition(m, n, maxTotalBlocks)
	blockHeight = float(m) / blocksPerCol
	blockWidth = float(n) / blocksPerRow
	totalBlocks = blocksPerRow * blocksPerCol
	#print "%dx%d blocks used with block dimension %fx%f" % (blocksPerCol, blocksPerRow, blockHeight, blockWidth)
	params = Params(blocksPerRow=blocksPerRow, blocksPerCol=blocksPerCol, blockHeight=blockHeight, blockWidth=blockWidth, alpha=alpha, beta=beta, transA=transA, transB=transB, m=m, k=k, n=n)
	params.elemsPerLine = 1000
	# map matrix A into row blocks
	params.matrixId = 'A'
	jobMapA = disco.new_job(input=A.urls, name="dgemm_mapA", map_reader=A.mapReader, map=_mapRowBlocks, params=params, nr_reduces=totalBlocks)
	resA = jobMapA.wait(clean=False, poll_interval=2)
	# map matrix B into col blocks
	params.matrixId = 'B'
	jobMapB = disco.new_job(input=B.urls, name="dgemm_mapB", map_reader=B.mapReader, map=_mapColBlocks, params=params, nr_reduces=totalBlocks)
	resB = jobMapB.wait(clean=False, poll_interval=2)
	# map matrix C into blocks
	if len(C.urls) == 0: # quick fix for disco bug
		resC = []
	else:
		params.matrixId = 'C'
		jobMapC = disco.new_job(input=C.urls, name="dgemm_mapC", map_reader=C.mapReader, map=_mapBlocks, params=params, nr_reduces=totalBlocks)
		resC = jobMapC.wait(clean=False, poll_interval=2)
	# multiply the blocks
	res = disco.new_job(input=resA+resB+resC, name="dgemm_reduce", map_reader=chain_reader, map=nop_map, nr_reduces=totalBlocks, reduce=_reduceMultiplyAndAdd, params=params).wait(clean=False, poll_interval=2)
	# clean up
	jobMapA.purge()
	jobMapB.purge()
	if len(C.urls) > 0: # quick fix for disco bug
		jobMapC.purge()
	return MatrixWrapper(res, chain_reader)
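
A hedged usage sketch for dgemm, computing 2.0*A*B + 1.0*C: the master address and matrix tags are placeholders, and the MatrixWrapper(urls, reader) construction mirrors the return statement above (MatrixWrapper and chain_reader live in the surrounding blas module).

from disco.core import Disco

disco = Disco('disco://localhost')                   # assumed master address
A = MatrixWrapper(['tag://matrix:A'], chain_reader)
B = MatrixWrapper(['tag://matrix:B'], chain_reader)
C = MatrixWrapper(['tag://matrix:C'], chain_reader)  # or MatrixWrapper() if there is no C term
result = dgemm(disco, False, False, 1000, 1000, 1000, 2.0, A, B, 1.0, C)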
Example #24
File: blas.py  Project: davin/disco
def dgema(disco, transA, transB, m, n, alpha, A, B, beta, maxTotalBlocks=128):
	"""
	Compute general matrix addition alpha*op(A) + beta*op(B) in double precision where op(X) = X or transpose(X).
	@param transA A boolean value for transposing matrix A or not.
	@param transB A boolean value for transposing matrix B or not.
	@param m Number of rows of matrix op(A).
	@param n Number of columns of matrix op(B).
	@param alpha Scalar multiplier for matrix A.
	@param beta Scalar multiplier for matrix B.
	@param A MatrixWrapper object encapsulating matrix A.
	@param B MatrixWrapper object encapsulating matrix B.
	@param disco A Disco instance.
	@param maxTotalBlocks Suggested number of matrix blocks to use for carrying out the addition. Ideally, this should equal the number of cores available in the cluster. The actual number of blocks is selected based on the size of the matrix.
	@return MatrixWrapper object encapsulating the resulting matrix.
	"""
	def _mapBlocks(e, params):
		from math import ceil
		from numpy import float64
		if type(e) == tuple:
			e = e[0]
		output = []
		elems = e.split(";")
		for elem in elems:
			i, j, val = map(float64, elem.split(","))
			if params.transpose:
				i, j = j, i
			assert i < params.m, "row index %d exceeds matrix dimensions" % int(i)
			assert j < params.n, "col index %d exceeds matrix dimensions" % int(j)
			blockX = int(j / params.blockWidth)
			blockY = int(i / params.blockHeight)
			offsetX = ceil(params.blockWidth * blockX)
			offsetY = ceil(params.blockHeight * blockY)
			val = params.scaling * val
			if val != 0.0:
				output += [(blockY*params.blocksPerRow+blockX, "%d,%d,%.14f" % (int(i-offsetY), int(j-offsetX), val))]
		return output

	def nop_map(e, params):
		return [e]

	def _reduceAddBlocks(iter, out, params):
		from numpy import float64
		s = {}
		# add matrices
		for blockId, t in iter:
			blockId = int(blockId)
			rowIdx, colIdx, val = t.split(",")
			rowIdx = int(rowIdx)
			colIdx = int(colIdx)
			if not s.has_key(blockId):
				s[blockId] = {}
			if not s[blockId].has_key(rowIdx):
				s[blockId][rowIdx] = {}
			s[blockId][rowIdx][colIdx] = s[blockId][rowIdx].get(colIdx, 0) + float64(val)
		# output results
		from math import ceil
		from scipy.sparse import coo_matrix
		for blockId in s.keys():
			# compute the index offset in the original matrix
			offsetY = ceil(params.blockHeight * (blockId / params.blocksPerRow))
			offsetX = ceil(params.blockWidth * (blockId % params.blocksPerRow))
			# map block indices into original indices
			for rowIdx in s[blockId].keys():
				for colIdx in s[blockId][rowIdx].keys():
					out.add("%d,%d,%.14f" % (rowIdx+offsetY, colIdx+offsetX, s[blockId][rowIdx][colIdx]), "")

	# find the best way to partition matrix to blocks
	blocksPerRow, blocksPerCol = _partition(m, n, maxTotalBlocks)
	blockHeight = float(m) / blocksPerCol
	blockWidth = float(n) / blocksPerRow
	totalBlocks = blocksPerRow * blocksPerCol
	# map and scale matrices
	params = Params(blocksPerRow=blocksPerRow, blocksPerCol=blocksPerCol, blockHeight=blockHeight, blockWidth=blockWidth)
	params.transpose = transA
	params.scaling = alpha
	params.m = m
	params.n = n
	jobMapA = disco.new_job(input=A.urls, name="dgema_mapA", map_reader=A.mapReader, map=_mapBlocks, params=params, nr_reduces=totalBlocks)
	resA = jobMapA.wait(clean=False, poll_interval=2)
	params.transpose = transB
	params.scaling = beta
	jobMapB = disco.new_job(input=B.urls, name="dgema_mapB", map_reader=B.mapReader, map=_mapBlocks, params=params, nr_reduces=totalBlocks)
	resB = jobMapB.wait(clean=False, poll_interval=2)
	# add matrices
	res = disco.new_job(input=resA+resB, name="dgema_reduce", map_reader=chain_reader, map=nop_map, params=params, reduce=_reduceAddBlocks, nr_reduces=totalBlocks).wait(clean=False, poll_interval=2)
	# clean up
	jobMapA.purge()
	jobMapB.purge()
	return MatrixWrapper(res, chain_reader)
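
A matching sketch for dgema, here computing A - transpose(B) by setting alpha=1.0 and beta=-1.0; again the master address and matrix tags are placeholders, and MatrixWrapper/chain_reader come from the surrounding blas module.

from disco.core import Disco

disco = Disco('disco://localhost')                   # assumed master address
A = MatrixWrapper(['tag://matrix:A'], chain_reader)
B = MatrixWrapper(['tag://matrix:B'], chain_reader)
result = dgema(disco, False, True, 500, 500, 1.0, A, B, -1.0)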