def _validate_top_keys(self, spec, path):
    """Check that the top-level keys of a spec are acceptable

    A spec must contain either a "tasks" key or a "location" key. When
    "location" is present it has to be the only key; otherwise every key
    must belong to the set of known top-level sections.

    Parameters
    ----------
    spec : dict
        The spec whose top-level keys are checked
    path
        Not used by this check

    Raises
    ------
    DAGSpecInitializationError
        If both "tasks" and "location" are missing, or if "location" comes
        with other keys
    """
    uses_location = 'location' in spec

    if not uses_location and 'tasks' not in spec:
        raise DAGSpecInitializationError(
            'Failed to initialize spec. Missing "tasks" key')

    if uses_location:
        # "location" is exclusive: nothing else may sit next to it
        if len(spec) > 1:
            raise DAGSpecInitializationError(
                'Failed to initialize spec. If '
                'using the "location" key there should not '
                'be other keys')
        return

    allowed = {
        'meta',
        'config',
        'clients',
        'tasks',
        'serializer',
        'unserializer',
        'executor',
        'on_finish',
        'on_render',
        'on_failure',
    }
    validate.keys(allowed, spec.keys(), name='dag spec')
def validate(self):
    """Validate the task data against the schema implied by ``self.meta``

    Sets ``self.data['upstream']`` to None when the key is absent, checks
    the required keys ("source", plus "product" unless
    ``meta['extract_product']`` is on) and rejects keys that clash with an
    enabled extraction flag.

    Raises
    ------
    DAGSpecInitializationError
        If "upstream" (or "product") holds a truthy value while
        ``meta['extract_upstream']`` (or ``meta['extract_product']``) is
        enabled
    """
    if 'upstream' not in self.data:
        self.data['upstream'] = None

    # "product" is only mandatory when it is not extracted elsewhere
    required = ({'source'} if self.meta['extract_product'] else
                {'product', 'source'})

    validate.keys(valid=None,
                  passed=self.data,
                  required=required,
                  name=repr(self))

    # (meta flag, key that must not be set while the flag is on, message)
    conflicts = (
        ('extract_upstream', 'upstream',
         'Error validating task "{}", if '
         'meta.extract_upstream is set to True, tasks '
         'should not have an "upstream" key'),
        ('extract_product', 'product',
         'Error validating task "{}", if '
         'meta.extract_product is set to True, tasks '
         'should not have a "product" key'),
    )

    for flag, key, template in conflicts:
        if self.meta[flag] and self.data.get(key):
            raise DAGSpecInitializationError(template.format(self.data))
def _validate_top_keys(self, spec, path):
    """Check that the top-level keys of a spec are acceptable

    A spec must contain either a "tasks" key or a "location" key. When
    "location" is present it has to be the only key; otherwise every key
    must belong to the set of known top-level sections.

    Parameters
    ----------
    spec : dict
        The spec whose top-level keys are checked
    path
        Shown in the "missing tasks" error message, but only when
        ``self._parent_path`` is truthy

    Raises
    ------
    KeyError
        If both "tasks" and "location" are missing, or if "location" comes
        with other keys
    """
    location_given = 'location' in spec

    if not location_given and 'tasks' not in spec:
        if self._parent_path:
            path_ = f'(file: "{path}")'
        else:
            path_ = ''

        raise KeyError('Invalid data to initialize DAGSpec, missing '
                       f'key "tasks" {path_}')

    if location_given:
        # "location" is exclusive: nothing else may sit next to it
        if len(spec) > 1:
            raise KeyError('If specifying dag through a "location" key '
                           'it must be the unique key in the spec')
        return

    allowed = {
        'meta', 'config', 'clients', 'tasks', 'serializer', 'unserializer'
    }
    validate.keys(allowed, spec.keys(), name='dag spec')
def default_meta(cls, meta=None):
    """Fill missing values in a meta dictionary

    Parameters
    ----------
    meta : dict, optional
        Meta section to complete. It is modified in place (including a
        user-supplied "product_default_class" mapping). A new dictionary
        is created when None

    Returns
    -------
    dict
        ``meta`` with every missing key set to its default value. If
        "source_loader" was passed, it is replaced by a ``SourceLoader``
        built from it

    Raises
    ------
    Exception
        Any error raised while building the ``SourceLoader`` or while
        validating the names in "product_default_class" is re-raised with
        a message that states where it happened
    """
    if meta is None:
        meta = {}

    validate.keys(cls.VALID, meta, name='dag spec')

    # keys whose default is a plain value, in the order they are inserted
    plain_defaults = (
        ('extract_upstream', True),
        ('extract_product', False),
        ('product_relative_to_source', False),
        ('jupyter_hot_reload', False),
        ('jupyter_functions_as_notebooks', False),
        ('import_tasks_from', None),
    )

    for key, value in plain_defaults:
        if key not in meta:
            meta[key] = value

    if 'source_loader' not in meta:
        meta['source_loader'] = None
    else:
        try:
            meta['source_loader'] = SourceLoader(**meta['source_loader'])
        except Exception as e:
            # exceptions raised without arguments have an empty ``args``,
            # indexing it blindly would hide the real error behind an
            # IndexError
            detail = e.args[0] if e.args else type(e).__name__
            msg = ('Error initializing SourceLoader with '
                   f'{meta["source_loader"]}. Error message: {detail}')
            e.args = (msg, )
            raise

    defaults = {
        'SQLDump': 'File',
        'NotebookRunner': 'File',
        'SQLScript': 'SQLRelation',
        'PythonCallable': 'File',
        'ShellScript': 'File',
    }

    if 'product_default_class' not in meta:
        meta['product_default_class'] = defaults
    else:
        # keep the user's choices, only add the classes they did not set
        for class_, prod in defaults.items():
            if class_ not in meta['product_default_class']:
                meta['product_default_class'][class_] = prod

    # validate keys and values in product_default_class
    for task_name, product_name in meta['product_default_class'].items():
        try:
            validate_task_class_name(task_name)
            validate_product_class_name(product_name)
        except Exception as e:
            # same guard as above: ``args`` may be empty
            detail = e.args[0] if e.args else type(e).__name__
            msg = f'Error validating product_default_class: {detail}'
            e.args = (msg, )
            raise

    return meta
def build(self, input_data, copy=False):
    """Run the DAG

    Parameters
    ----------
    input_data : dict
        A dictionary mapping root tasks (names) to dict params. Root tasks
        are tasks in the DAG that do not have upstream dependencies,
        the corresponding dictionary is passed to the respective task
        source function as keyword arguments

    copy : bool or callable
        Whether to copy the output of an upstream task before passing it
        to the task being processed. It is recommended to turn this off
        for memory efficiency but if the tasks are not pure functions
        (i.e. mutate their inputs) this might lead to bugs, in such
        case, the best way to fix it would be to make all your tasks
        pure functions but you can enable this option if memory
        consumption is not a problem. If True it uses the ``copy.copy``
        function before passing the upstream products, if you pass a
        callable instead, such function is used (for example, you may
        pass ``copy.deepcopy``)

    Returns
    -------
    dict
        A dictionary mapping task names to their respective outputs

    Raises
    ------
    ValueError
        If the (post-processed) output of any task is None
    """
    root_names = set(self.root_nodes)

    # FIXME: for this particular case, the error here should be TypeError,
    # not KeyError (the former is the one used when calling functions with
    # invalid arguments) - maybe add an argument to validate.keys to choose
    # which error to raise?
    validate.keys(valid=root_names,
                  passed=set(input_data),
                  required=root_names,
                  name='input_data')

    # copy=True -> shallow copy, callable -> use it, anything else -> no-op
    if copy is True:
        copying_function = copy_module.copy
    else:
        copying_function = copy if callable(copy) else _do_nothing

    results = {}

    for name in self.dag:
        task = self.dag[name]
        kwargs = task.params.to_dict()

        if name in self.root_nodes:
            kwargs = {**kwargs, 'input_data': input_data[name]}

        # replace params with the returned value from upstream tasks
        if 'upstream' in kwargs:
            kwargs['upstream'] = {
                upstream_name: copying_function(results[upstream_name])
                for upstream_name, _ in kwargs['upstream'].items()
            }

        # the source function is never called with a "product" argument
        kwargs.pop('product', None)

        returned = task.source.primitive(**kwargs)
        result = self.return_postprocessor(returned)

        if result is None:
            raise ValueError(
                'All callables in a {} must return a value. '
                'Callable "{}", from task "{}" returned None'.format(
                    type(self).__name__, task.source.name, name))

        results[name] = result

    return results