def set_config(self, config):
    response = json.loads(
        self.request('/disco/ctrl/save_config_table', json.dumps(config)))
    if response != 'table saved!':
        raise DiscoError(response)
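# Hedged usage sketch for set_config(): round-trip the master's config
# table. This assumes a companion get_config() wrapping
# /disco/ctrl/load_config_table and a `master` client instance; the row
# format shown is illustrative, not taken from the source above.
config = master.get_config()
config.append(['node[01:02]', '8'])    # hypothetical [host spec, workers] row
master.set_config(config)              # raises DiscoError unless the master
                                       # answers 'table saved!'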
def __init__(self, *args, **kwargs):
    super(JobDict, self).__init__(*args, **kwargs)

    # -- backwards compatibility --
    if 'reduce_writer' in kwargs or 'map_writer' in kwargs:
        warn("Writers are deprecated - use output_stream.add() instead",
             DeprecationWarning)

    # -- required modules and files --
    if self['required_modules'] is None:
        functions = util.flatten(
            util.iterify(self[f])
            for f in chain(self.functions, self.stacks))
        self['required_modules'] = find_modules(
            [f for f in functions if callable(f)])

    # -- external flags --
    if isinstance(self['map'], dict):
        self['ext_map'] = True
    if isinstance(self['reduce'], dict):
        self['ext_reduce'] = True

    # -- input --
    ddfs = self.pop('ddfs', None)
    self['input'] = [list(util.iterify(url))
                     for i in self['input']
                     for url in util.urllist(i, listdirs=bool(self['map']),
                                             ddfs=ddfs)]

    # partitions must be an integer internally
    self['partitions'] = self['partitions'] or 0

    # set nr_reduces: ignored if there is not actually a reduce specified
    if self['map']:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        self['nr_reduces'] = self['partitions'] or 1
    elif self.input_is_partitioned:
        # Only reduce, with partitions: len(dir://) specifies nr_reduces
        self['nr_reduces'] = 1 + max(id for dir in self['input']
                                     for id, url in util.read_index(dir[0]))
    else:
        # Only reduce, without partitions can only have 1 reduce
        self['nr_reduces'] = 1

    # merge_partitions iff the inputs to reduce are partitioned
    if self['merge_partitions']:
        if self['partitions'] or self.input_is_partitioned:
            self['nr_reduces'] = 1
        else:
            raise DiscoError("Can't merge partitions without partitions")

    # -- scheduler --
    scheduler = self.__class__.defaults['scheduler'].copy()
    scheduler.update(self['scheduler'])
    if int(scheduler['max_cores']) < 1:
        raise DiscoError("max_cores must be >= 1")
    self['scheduler'] = scheduler

    # -- sanity checks --
    for key in self:
        if key not in self.defaults:
            raise DiscoError("Unknown job argument: %s" % key)
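# A minimal standalone restatement of the nr_reduces rules encoded in
# JobDict.__init__ above, for illustration only; the function name and
# parameters are hypothetical and not part of Disco's API.
def nr_reduces(has_map, partitions, input_is_partitioned, max_partition_id=0):
    if has_map:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        return partitions or 1
    if input_is_partitioned:
        # reduce-only job over dir:// inputs: one reduce per partition id
        return 1 + max_partition_id
    # reduce-only job without partitions can only have 1 reduce
    return 1

assert nr_reduces(True, 8, False) == 8
assert nr_reduces(True, 0, False) == 1
assert nr_reduces(False, 0, True, max_partition_id=3) == 4
assert nr_reduces(False, 0, False) == 1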
def submit(self, jobpack):
    status, body = json.loads(self.request('/disco/job/new', jobpack))
    if status != 'ok':
        raise DiscoError("Failed to submit jobpack: %s" % body)
    return body
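# Hedged usage sketch for submit() (names illustrative): send a
# serialized jobpack read from disk. The return value is whatever the
# master puts in the 'ok' response body; the filename and the `master`
# client instance are assumptions, not part of the source above.
with open('wordcount.jobpack', 'rb') as f:
    jobname = master.submit(f.read())
print('submitted job:', jobname)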
def jobdict(self, job, **jobargs):
    """
    Creates :ref:`jobdict` for the :class:`Worker`.

    Makes use of the following parameters, in addition to those
    defined by the :class:`Worker` itself:

    :type  input: list of urls or list of list of urls
    :param input: used to set :attr:`jobdict.input`.
            Disco natively handles the following url schemes:

            * ``http://...`` - any HTTP address
            * ``file://...`` or no scheme - a local file.
              The file must exist on all nodes where the tasks are run.
              Due to these restrictions, this form has only limited use.
            * ``tag://...`` - a tag stored in :ref:`DDFS`
            * ``raw://...`` - pseudo-address: use the address itself as data.
            * ``dir://...`` - used by Disco internally.
            * ``disco://...`` - used by Disco internally.

            .. seealso:: :mod:`disco.schemes`.

    :type  scheduler: dict
    :param scheduler: directly sets :attr:`jobdict.scheduler`.

    Uses :meth:`getitem` to resolve the values of parameters.

    :return: the :term:`job dict`.
    """
    from disco.util import isiterable, inputlist, ispartitioned, read_index
    from disco.error import DiscoError

    def get(key, default=None):
        return self.getitem(key, job, jobargs, default)

    has_map = bool(get('map'))
    has_reduce = bool(get('reduce'))
    reduce_shuffle = bool(get('reduce_shuffle'))
    job_input = get('input', [])
    has_save_results = get('save', False) or get('save_results', False)
    if not isiterable(job_input):
        raise DiscoError("Job 'input' is not a list of input locations, "
                         "or a list of such lists: {0}".format(job_input))
    input = inputlist(job_input,
                      label=None if has_map else False,
                      settings=job.settings)

    # -- nr_reduces --
    # ignored if there is not actually a reduce specified
    # XXX: master should always handle this
    if has_map:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        nr_reduces = get('partitions') or 1
    elif ispartitioned(input):
        # no map, with partitions: len(dir://) specifies nr_reduces
        nr_reduces = 1 + max(int(id)
                             for dir in input
                             for id, url, size in read_index(dir))
    else:
        # no map, without partitions can only have 1 reduce
        nr_reduces = 1

    jobdict = super(Worker, self).jobdict(job, **jobargs)
    jobdict.update({'input': input,
                    'worker': self.bin,
                    'map?': has_map,
                    'reduce?': has_reduce,
                    'reduce_shuffle?': reduce_shuffle,
                    'nr_reduces': nr_reduces,
                    'save_results': has_save_results})
    return jobdict
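# Illustrative 'input' values for the url schemes documented in the
# docstring above; all addresses and tags are made up. A flat list gives
# one url per input:
inputs = ['http://example.com/part-0.txt',
          'tag://data:words',
          'raw://inline-payload']
# A list of lists marks redundant replicas of the same input, any one of
# which may be read:
replicated = [['disco://node01/data/part-0', 'disco://node02/data/part-0']]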