def test_hive_task_query_generation():
    """Two hive queries should yield two plugin jobs whose formatted query
    text embeds the fake S3 URIs recorded in the output references."""
    with _common_utils.AutoDeletingTempDir("user_dir") as tmp_dir:
        ctx = _common_engine.EngineContext(
            execution_id=WorkflowExecutionIdentifier(
                project="unit_test", domain="unit_test", name="unit_test"),
            execution_date=_datetime.utcnow(),
            stats=None,  # TODO: A mock stats object that we can read later.
            logging=_logging,  # TODO: A mock logging object that we can read later.
            tmp_dir=tmp_dir,
        )

        # Build an empty, writable output reference per declared output.
        references = {}
        for out_name, out_var in _six.iteritems(two_queries.interface.outputs):
            references[out_name] = _task_output.OutputReference(
                _type_helpers.get_sdk_type_from_literal_type(out_var.type))

        jobs = two_queries._generate_plugin_objects(ctx, references)
        assert len(jobs) == 2

        # deprecated, collection is only here for backwards compatibility
        assert len(jobs[0].query_collection.queries) == 1
        assert len(jobs[1].query_collection.queries) == 1

        # The output references should now have the same fake S3 path as the
        # formatted queries
        results = references["hive_results"].value
        assert results[0].uri != ""
        assert results[1].uri != ""
        assert results[0].uri in jobs[0].query.query
        assert results[1].uri in jobs[1].query.query
def execute(self, context, inputs):
    """Run the user's task code and package its outputs for the engine.

    :param flytekit.engines.common.EngineContext context:
    :param flytekit.models.literals.LiteralMap inputs:
    :rtype: dict[Text, flytekit.models.common.FlyteIdlEntity]
    :returns: This function must return a dictionary mapping 'filenames' to
        Flyte Interface Entities.  These entities will be used by the engine
        to pass data from node to node, populate metadata, etc. etc..  Each
        engine will have different behavior.  For instance, the Flyte engine
        will upload the entities to a remote working directory (with the
        names provided), which will in turn allow Flyte Propeller to push
        along the workflow.  Where as local engine will merely feed the
        outputs directly into the next node.
    """
    # Convert the literal inputs into native Python values keyed by name.
    input_sdk_types = {
        in_name: _type_helpers.get_sdk_type_from_literal_type(in_var.type)
        for in_name, in_var in _six.iteritems(self.interface.inputs)
    }
    user_args = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs, input_sdk_types)

    # Writable references the user code will fill in for each declared output.
    output_refs = {}
    for out_name, out_var in _six.iteritems(self.interface.outputs):
        output_refs[out_name] = _task_output.OutputReference(
            _type_helpers.get_sdk_type_from_literal_type(out_var.type))

    # Outputs ride alongside inputs so the user code can assign to them.
    user_args.update(output_refs)
    self._execute_user_code(context, user_args)

    literals = {
        out_name: ref.sdk_value
        for out_name, ref in _six.iteritems(output_refs)
    }
    return {
        _constants.OUTPUT_FILE_NAME:
            _literal_models.LiteralMap(literals=literals)
    }
def _produce_dynamic_job_spec(self, context, inputs):
    """
    Runs user code and produces future task nodes to run sub-tasks.
    :param context:
    :param flytekit.models.literals.LiteralMap inputs:
    :rtype: flytekit.models.dynamic_job.DynamicJobSpec
    """
    # Convert the literal inputs into native Python values keyed by name.
    inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs,
        {
            k: _type_helpers.get_sdk_type_from_literal_type(v.type)
            for k, v in _six.iteritems(self.interface.inputs)
        },
    )
    # Writable references the user code fills in for each declared output.
    outputs_dict = {
        name: _task_output.OutputReference(
            _type_helpers.get_sdk_type_from_literal_type(variable.type))
        for name, variable in _six.iteritems(self.interface.outputs)
    }

    # Add outputs to inputs
    inputs_dict.update(outputs_dict)

    nodes = []
    tasks = []
    # One node per query
    generated_queries = self._generate_plugin_objects(context, inputs_dict)

    # Create output bindings always - this has to happen after user code has run
    output_bindings = [
        _literal_models.Binding(
            var=name,
            binding=_interface.BindingData.from_python_std(
                b.sdk_type.to_flyte_literal_type(), b.value),
        ) for name, b in _six.iteritems(outputs_dict)
    ]

    # enumerate replaces the manual `i = 0 ... i += 1` counter.
    for i, qubole_hive_job in enumerate(generated_queries):
        hive_job_node = _create_hive_job_node("HiveQuery_{}".format(i),
                                              qubole_hive_job.to_flyte_idl(),
                                              self.metadata)
        nodes.append(hive_job_node)
        tasks.append(hive_job_node.executable_sdk_object)

    dynamic_job_spec = _dynamic_job.DynamicJobSpec(
        min_successes=len(nodes),  # all query nodes must succeed
        tasks=tasks,
        nodes=nodes,
        outputs=output_bindings,
        subworkflows=[],
    )

    return dynamic_job_spec
def test_sdk_output_references_construction():
    """Freshly-built output references carry no value yet and expose the
    SDK types declared on the task interface."""
    references = {}
    for out_name, out_var in _six.iteritems(two_queries.interface.outputs):
        references[out_name] = _task_output.OutputReference(
            _type_helpers.get_sdk_type_from_literal_type(out_var.type))

    hive_results = references['hive_results']

    # Before user code is run, the outputs passed to the user code should not have values
    assert hive_results.sdk_value == _base_sdk_types.Void()

    # Should be a list of schemas
    assert isinstance(hive_results.sdk_type, _containers.TypedCollectionType)
    assert isinstance(hive_results.sdk_type.sub_type, _schema.SchemaInstantiator)
def _produce_dynamic_job_spec(self, context, inputs):
    """Run user code and produce future task nodes to run sub-tasks.

    :param context:
    :param flytekit.models.literals.LiteralMap inputs:
    :rtype: flytekit.models.dynamic_job.DynamicJobSpec
    """
    # Convert the literal inputs into native Python values keyed by name.
    input_sdk_types = {
        in_name: _type_helpers.get_sdk_type_from_literal_type(in_var.type)
        for in_name, in_var in _six.iteritems(self.interface.inputs)
    }
    user_args = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs, input_sdk_types)

    # Writable references the user code fills in for each declared output.
    output_refs = {}
    for out_name, out_var in _six.iteritems(self.interface.outputs):
        output_refs[out_name] = _task_output.OutputReference(
            _type_helpers.get_sdk_type_from_literal_type(out_var.type))

    # Add outputs to inputs
    user_args.update(output_refs)

    # Note: Today a hive task corresponds to a dynamic job spec with one node,
    # which contains multiple queries.  We may change this in future.
    generated_queries = self._generate_hive_queries(context, user_args)

    # Create output bindings always - this has to happen after user code has run
    output_bindings = []
    for out_name, ref in _six.iteritems(output_refs):
        output_bindings.append(
            _literal_models.Binding(
                var=out_name,
                binding=_interface.BindingData.from_python_std(
                    ref.sdk_type.to_flyte_literal_type(), ref.value)))

    nodes = []
    tasks = []
    if generated_queries.query_collection.queries:
        hive_job_node = _create_hive_job_node(
            "HiveQueries", generated_queries.to_flyte_idl(), self.metadata)
        nodes.append(hive_job_node)
        tasks.append(hive_job_node.executable_sdk_object)

    return _dynamic_job.DynamicJobSpec(
        # At most we only have one node for now, see above comment
        min_successes=len(nodes),
        tasks=tasks,
        nodes=nodes,
        outputs=output_bindings,
        subworkflows=[])
def execute(self, context, inputs):
    """Execute the task's notebook via Papermill and collect its outputs.

    :param flytekit.engines.common.EngineContext context:
    :param flytekit.models.literals.LiteralMap inputs:
    :rtype: dict[Text, flytekit.models.common.FlyteIdlEntity]
    :returns: This function must return a dictionary mapping 'filenames' to
        Flyte Interface Entities.  These entities will be used by the engine
        to pass data from node to node, populate metadata, etc. etc..  Each
        engine will have different behavior.  For instance, the Flyte engine
        will upload the entities to a remote working directory (with the
        names provided), which will in turn allow Flyte Propeller to push
        along the workflow.  Where as local engine will merely feed the
        outputs directly into the next node.
    """
    inputs_dict = _type_helpers.unpack_literal_map_to_sdk_python_std(
        inputs,
        {
            k: _type_helpers.get_sdk_type_from_literal_type(v.type)
            for k, v in _six.iteritems(self.interface.inputs)
        },
    )

    input_notebook_path = self._notebook_path
    # Execute Notebook via Papermill.
    output_notebook_path = input_notebook_path.split(
        ".ipynb")[0] + "-out.ipynb"
    _pm.execute_notebook(input_notebook_path,
                         output_notebook_path,
                         parameters=inputs_dict)

    # Parse Outputs from Notebook: find the cell tagged "outputs" and join
    # its first output's text/plain payload (a text-format LiteralMap proto).
    outputs = None
    with open(output_notebook_path) as json_file:
        data = _json.load(json_file)
        for p in data["cells"]:
            meta = p["metadata"]
            # .get() guards against cells that carry no "tags" key at all
            # (the original indexing raised KeyError on such cells).
            if "outputs" in meta.get("tags", []):
                outputs = " ".join(p["outputs"][0]["data"]["text/plain"])

    # Build the output literal map proto unconditionally.  Previously the
    # proto was bound to the name `dict` (shadowing the builtin) and only
    # inside the `if` branch, so a notebook without a tagged output cell
    # raised NameError below; now it falls back to an empty LiteralMap.
    literal_map_pb2 = _literal_models._literals_pb2.LiteralMap()
    if outputs is not None:
        _text_format.Parse(outputs, literal_map_pb2)

    # Add output_notebook as an output to the task.
    output_notebook = _task_output.OutputReference(
        _type_helpers.get_sdk_type_from_literal_type(
            _Types.Blob.to_flyte_literal_type()))
    output_notebook.set(output_notebook_path)

    output_literal_map = _literal_models.LiteralMap.from_flyte_idl(
        literal_map_pb2)
    output_literal_map.literals[OUTPUT_NOTEBOOK] = output_notebook.sdk_value

    return {_constants.OUTPUT_FILE_NAME: output_literal_map}