from disco.util import isiterable


def canonizetag(tag):
    # InvalidTag is the exception type defined alongside this helper
    # (in Disco it lives in disco.ddfs).
    if tag:
        if isiterable(tag):
            # Canonize the first tag of an iterable of tags.
            for tag in tag:
                return canonizetag(tag)
        elif tag.startswith('tag://'):
            return tag
        elif '://' not in tag and '/' not in tag:
            return 'tag://{0}'.format(tag)
    raise InvalidTag("Invalid tag: {0}".format(tag))
# Later variant: additionally rejects the bare tag name "-".
def canonizetag(tag):
    if tag:
        if isiterable(tag):
            for tag in tag:
                return canonizetag(tag)
        elif tag.startswith("tag://"):
            return tag
        elif "://" not in tag and "/" not in tag and tag != "-":
            return "tag://{0}".format(tag)
    raise InvalidTag("Invalid tag: {0}".format(tag))
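# Usage sketch (illustrative only, not part of the module): canonizetag
# normalizes a bare tag name into a tag:// URL, passes already-canonical
# tags through, takes the first tag of an iterable, and rejects anything else.
if __name__ == '__main__':
    assert canonizetag('data:day1') == 'tag://data:day1'        # bare name gains the scheme
    assert canonizetag('tag://data:day1') == 'tag://data:day1'  # already canonical
    assert canonizetag(['data:day1']) == 'tag://data:day1'      # first tag of an iterable
    try:
        canonizetag('http://example.com/x')                     # non-tag URL
    except InvalidTag:
        pass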
# Pipeline worker's jobdict: builds the stage pipeline in addition to inputs.
def jobdict(self, job, **jobargs):
    """
    Creates :ref:`jobdict` for the :class:`Worker`.

    Makes use of the *pipeline* and *input* parameters,
    in addition to those defined by the :class:`Worker` itself.

    Uses :meth:`getitem` to resolve the values of parameters.

    :return: the :term:`job dict`.
    """
    from disco.error import DiscoError

    def get(key, default=None):
        return self.getitem(key, job, jobargs, default)
    stages, pipeline = set(), []
    for stage in get('pipeline', []):
        if len(stage) == 2:
            g, s = stage
            concurrent = False
        elif len(stage) == 3:
            g, s, concurrent = stage
        else:
            raise DiscoError("Bad Stage {0}".format(stage))
        if g not in self.group_ops:
            raise DiscoError("Unknown grouping {0}".format(g))
        if s.name in stages:
            raise DiscoError("Repeated stage {0}".format(s.name))
        stages.add(s.name)
        pipeline.append((s.name, g, concurrent))

    from disco.util import isiterable, inputlist
    job_input = get('input', [])
    if not isiterable(job_input):
        raise DiscoError("Job 'input' is not a list of input locations, "
                         "or a list of such lists: {0}".format(job_input))
    input = inputlist(job_input, label=None, settings=job.settings)
    pipe_input = [[0, 0, inp] for inp in input]
    jobdict = super(Worker, self).jobdict(job, **jobargs)
    jobdict.update({'worker': self.bin,
                    'pipeline': pipeline,
                    'inputs': pipe_input})
    return jobdict
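# Shape sketch (FakeStage is a hypothetical stand-in, not Disco's Stage class):
# each 'pipeline' entry is (grouping, stage) or (grouping, stage, concurrent),
# and jobdict() normalizes it to (stage.name, grouping, concurrent).
from collections import namedtuple

FakeStage = namedtuple('FakeStage', 'name')

pipeline_param = [
    ('split', FakeStage('map')),                 # 2-tuple: concurrent defaults to False
    ('group_label', FakeStage('reduce'), True),  # 3-tuple: concurrent set explicitly
]
# After the normalization loop above, the job dict carries:
#   'pipeline': [('map', 'split', False), ('reduce', 'group_label', True)]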
# Earlier revision of the classic worker's jobdict ('name', 'owner' and
# 'scheduler' still honored; compare the later revision below).
def jobdict(self, job, **jobargs):
    """
    Creates :ref:`jobdict` for the :class:`Worker`.

    Makes use of the following parameters,
    in addition to those defined by the :class:`Worker` itself:

    :type  input: list of urls or list of list of urls
    :param input: used to set :attr:`jobdict.input`.
            Disco natively handles the following url schemes:

            * ``http://...`` - any HTTP address
            * ``file://...`` or no scheme - a local file.
              The file must exist on all nodes where the tasks are run.
              Due to these restrictions, this form has only limited use.
            * ``tag://...`` - a tag stored in :ref:`DDFS`
            * ``raw://...`` - pseudo-address: use the address itself as data.
            * ``dir://...`` - used by Disco internally.
            * ``disco://...`` - used by Disco internally.

            .. seealso:: :mod:`disco.schemes`.

    :type  name: string
    :param name: directly sets :attr:`jobdict.prefix`.

    :type  owner: string
    :param owner: directly sets :attr:`jobdict.owner`.
                  If not specified, uses :envvar:`DISCO_JOB_OWNER`.

    :type  scheduler: dict
    :param scheduler: directly sets :attr:`jobdict.scheduler`.

    Uses :meth:`getitem` to resolve the values of parameters.

    :return: the :term:`job dict`.
    """
    from disco.util import isiterable, inputlist, ispartitioned, read_index
    from disco.error import DiscoError

    def get(key, default=None):
        return self.getitem(key, job, jobargs, default)
    has_map = bool(get('map'))
    has_reduce = bool(get('reduce'))
    job_input = get('input', [])
    if not isiterable(job_input):
        raise DiscoError("Job 'input' is not a list of input locations, "
                         "or a list of such lists: {0}".format(job_input))
    input = inputlist(job_input,
                      partition=None if has_map else False,
                      settings=job.settings)

    # -- nr_reduces --
    # ignored if there is not actually a reduce specified
    # XXX: master should always handle this
    if has_map:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        nr_reduces = get('partitions') or 1
    elif ispartitioned(input):
        # no map, with partitions: len(dir://) specifies nr_reduces
        nr_reduces = 1 + max(int(id)
                             for dir in input
                             for id, url in read_index(dir))
    else:
        # no map, without partitions can only have 1 reduce
        nr_reduces = 1

    if get('merge_partitions'):
        nr_reduces = 1

    return {'input': input,
            'worker': self.bin,
            'map?': has_map,
            'reduce?': has_reduce,
            'nr_reduces': nr_reduces,
            'prefix': get('name'),
            'scheduler': get('scheduler', {}),
            'owner': get('owner', job.settings['DISCO_JOB_OWNER'])}
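# Sketch of the nr_reduces rules above, extracted as a standalone function
# for illustration (pure Python, no Disco needed; names are hypothetical):
def nr_reduces_for(has_map, partitions, input_is_partitioned,
                   max_partition_id=0, merge_partitions=False):
    if has_map:
        n = partitions or 1          # partitioned map: N reduces; else 1
    elif input_is_partitioned:
        n = 1 + max_partition_id     # reduce-only: widest dir:// index wins
    else:
        n = 1                        # reduce-only, unpartitioned input
    return 1 if merge_partitions else n

assert nr_reduces_for(True, 8, False) == 8
assert nr_reduces_for(False, None, True, max_partition_id=3) == 4
assert nr_reduces_for(True, 8, False, merge_partitions=True) == 1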
# Later revision of the classic jobdict: 'scheduler' is deprecated, a
# save_results flag is propagated, and read_index entries carry a size field.
def jobdict(self, job, **jobargs):
    """
    Creates :ref:`jobdict` for the :class:`Worker`.

    Makes use of the following parameters,
    in addition to those defined by the :class:`Worker` itself:

    :type  input: list of urls or list of list of urls
    :param input: used to set :attr:`jobdict.input`.
            Disco natively handles the following url schemes:

            * ``http://...`` - any HTTP address
            * ``file://...`` or no scheme - a local file.
              The file must exist on all nodes where the tasks are run.
              Due to these restrictions, this form has only limited use.
            * ``tag://...`` - a tag stored in :ref:`DDFS`
            * ``raw://...`` - pseudo-address: use the address itself as data.
            * ``dir://...`` - used by Disco internally.
            * ``disco://...`` - used by Disco internally.

            .. seealso:: :mod:`disco.schemes`.

    :type  scheduler: dict
    :param scheduler: directly sets :attr:`jobdict.scheduler`.

    .. deprecated:: 0.5
       *scheduler* params are now ignored.

    Uses :meth:`getitem` to resolve the values of parameters.

    :return: the :term:`job dict`.
    """
    from disco.util import isiterable, inputlist, ispartitioned, read_index
    from disco.error import DiscoError

    def get(key, default=None):
        return self.getitem(key, job, jobargs, default)
    has_map = bool(get('map'))
    has_reduce = bool(get('reduce'))
    job_input = get('input', [])
    has_save_results = get('save', False) or get('save_results', False)
    if not isiterable(job_input):
        raise DiscoError("Job 'input' is not a list of input locations, "
                         "or a list of such lists: {0}".format(job_input))
    input = inputlist(job_input,
                      label=None if has_map else False,
                      settings=job.settings)

    # -- nr_reduces --
    # ignored if there is not actually a reduce specified
    # XXX: master should always handle this
    if has_map:
        # partitioned map has N reduces; non-partitioned map has 1 reduce
        nr_reduces = get('partitions') or 1
    elif ispartitioned(input):
        # no map, with partitions: len(dir://) specifies nr_reduces
        nr_reduces = 1 + max(int(id)
                             for dir in input
                             for id, url, size in read_index(dir))
    else:
        # no map, without partitions can only have 1 reduce
        nr_reduces = 1

    if get('merge_partitions'):
        nr_reduces = 1

    jobdict = super(Worker, self).jobdict(job, **jobargs)
    jobdict.update({'input': input,
                    'worker': self.bin,
                    'map?': has_map,
                    'reduce?': has_reduce,
                    'nr_reduces': nr_reduces,
                    'save_results': has_save_results,
                    'scheduler': get('scheduler', {})})
    return jobdict
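# Usage sketch (hypothetical input URL; assumes a running Disco cluster):
# the keyword arguments passed to Job().run() are the jobargs that jobdict()
# resolves via getitem(), so partitions=4 here ends up as nr_reduces == 4
# in the resulting job dict.
from disco.core import Job

def fun_map(line, params):
    for word in line.split():
        yield word, 1

def fun_reduce(iter, params):
    from disco.util import kvgroup
    for word, counts in kvgroup(sorted(iter)):
        yield word, sum(counts)

job = Job().run(input=['http://example.com/words.txt'],
                map=fun_map,
                reduce=fun_reduce,
                partitions=4)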