def verify_purge(fset, targets):
    """Assert the given targets exist on disk, purge them, then assert both the
    files and the matching feature-set status entries are gone."""
    fset.reload(update_spec=False)
    orig_status_targets = list(fset.status.targets.keys())
    target_names = [t.name for t in targets]

    for target in targets:
        driver = get_target_driver(target_spec=target, resource=fset)
        filesystem = driver._get_store().get_filesystem(False)
        assert filesystem.exists(driver._target_path)

    fset.purge_targets(target_names=target_names)

    for target in targets:
        driver = get_target_driver(target_spec=target, resource=fset)
        filesystem = driver._get_store().get_filesystem(False)
        assert not filesystem.exists(driver._target_path)

    fset.reload(update_spec=False)
    assert set(fset.status.targets.keys()) == set(orig_status_targets) - set(
        target_names
    )
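# Illustrative usage of verify_purge() -- a minimal sketch, not part of the original
# tests. It assumes a pandas DataFrame `quotes_df` with a "ticker" column; the
# feature-set name and target choice are arbitrary. FeatureSet, Entity, ingest,
# CsvTarget and ParquetTarget are standard mlrun feature-store objects.
def _example_verify_purge_flow(quotes_df):
    import mlrun.feature_store as fstore
    from mlrun.datastore.targets import CsvTarget, ParquetTarget

    targets = [ParquetTarget(), CsvTarget()]
    fset = fstore.FeatureSet("quotes", entities=[fstore.Entity("ticker")])

    # ingestion materializes the targets, so the first exists() loop in
    # verify_purge() passes; purge_targets() then removes the files
    fstore.ingest(fset, quotes_df, targets=targets)
    verify_purge(fset, targets)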
def init_featureset_graph(
    source, featureset, namespace, targets=None, return_df=True, verbose=False
):
    """create storey ingestion graph/DAG from feature set object"""
    cache = ResourceCache()
    graph = featureset.spec.graph.copy()

    # init targets (and table)
    targets = targets or []
    server = create_graph_server(graph=graph, parameters={}, verbose=verbose)
    server.init_states(context=None, namespace=namespace, resource_cache=cache)

    if graph.engine != "sync":
        _add_data_steps(
            graph,
            cache,
            featureset,
            targets=targets,
            source=source,
            return_df=return_df,
            context=server.context,
        )

    server.init_object(namespace)

    if graph.engine != "sync":
        return graph.wait_for_completion()

    if hasattr(source, "to_dataframe"):
        source = source.to_dataframe()
    elif not hasattr(source, "to_csv"):
        raise mlrun.errors.MLRunInvalidArgumentError("illegal source")

    event = MockEvent(body=source)
    data = server.run(event, get_body=True)
    for target in targets:
        target = get_target_driver(target, featureset)
        size = target.write_dataframe(data)
        target_status = target.update_resource_status("ready", size=size)
        if verbose:
            logger.info(f"wrote target: {target_status}")

    return data
def context_to_ingestion_params(context):
    """extract the ingestion task params from job/serving context"""
    featureset_uri = context.get_param("featureset")
    featureset = context.get_store_resource(featureset_uri)
    infer_options = context.get_param("infer_options", InferOptions.Null)

    source = context.get_param("source")
    if source:
        source = get_source_from_dict(source)
    elif featureset.spec.source.to_dict():
        source = get_source_from_dict(featureset.spec.source.to_dict())

    targets = context.get_param("targets", None)
    if not targets:
        targets = featureset.spec.targets
    targets = [get_target_driver(target, featureset) for target in targets]
    return featureset, source, targets, infer_options
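# Illustrative usage of context_to_ingestion_params() -- a minimal sketch, not part
# of the original module. It shows the shape of a hypothetical job handler that
# consumes the extracted parameters; the downstream ingestion call is elided since
# it is performed by the surrounding mlrun pipeline.
def _example_ingestion_handler(context):
    featureset, source, targets, infer_options = context_to_ingestion_params(context)
    if not source:
        raise mlrun.errors.MLRunInvalidArgumentError(
            "data source was not specified in the task or the feature set"
        )
    context.logger.info(f"ingesting feature set {featureset.uri}")
    # ... run the actual ingestion with (featureset, source, targets, infer_options)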
def init_featureset_graph(
    source,
    featureset,
    namespace,
    targets=None,
    return_df=True,
    verbose=False,
    rows_limit=None,
):
    """create storey ingestion graph/DAG from feature set object"""
    cache = ResourceCache()
    graph = featureset.spec.graph.copy()

    # init targets (and table)
    targets = targets or []
    server = create_graph_server(graph=graph, parameters={}, verbose=verbose)
    server.init_states(context=None, namespace=namespace, resource_cache=cache)

    if graph.engine != "sync":
        # todo: support rows_limit in storey sources
        _add_data_steps(
            graph,
            cache,
            featureset,
            targets=targets,
            source=source,
            return_df=return_df,
            context=server.context,
        )
        server.init_object(namespace)
        return graph.wait_for_completion()

    server.init_object(namespace)

    # if the source is a dataframe iterator we load/write it in chunks
    chunk_id = 0
    if hasattr(source, "to_dataframe"):
        if source.is_iterator():
            chunk_id = 1
            chunks = source.to_dataframe()
        else:
            chunks = [source.to_dataframe()]
    elif not hasattr(source, "to_csv"):
        raise mlrun.errors.MLRunInvalidArgumentError("illegal source")
    else:
        chunks = [source]

    entity_columns = list(featureset.spec.entities.keys())
    key_fields = entity_columns if entity_columns else None

    sizes = [0] * len(targets)
    data_result = None
    total_rows = 0
    targets = [get_target_driver(target, featureset) for target in targets]
    for chunk in chunks:
        event = MockEvent(body=chunk)
        data = server.run(event, get_body=True)
        if data is not None:
            for i, target in enumerate(targets):
                size = target.write_dataframe(
                    data,
                    key_column=key_fields,
                    timestamp_key=featureset.spec.timestamp_key,
                    chunk_id=chunk_id,
                )
                if size:
                    sizes[i] += size
            chunk_id += 1
            if data_result is None:
                # in case of multiple chunks only return the first chunk (last may be too small)
                data_result = data
            total_rows += data.shape[0]
            if rows_limit and total_rows >= rows_limit:
                break

    # todo: fire termination event if iterator

    for i, target in enumerate(targets):
        target_status = target.update_resource_status("ready", size=sizes[i])
        if verbose:
            logger.info(f"wrote target: {target_status}")

    return data_result
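# Illustrative direct call to init_featureset_graph() -- a minimal sketch, not part
# of the original module, covering the "sync" engine path only. DataFrameSource and
# ParquetTarget are mlrun classes; the feature set, DataFrame, rows_limit value and
# the use of globals() as the step-resolution namespace are assumptions.
def _example_local_ingest(featureset, quotes_df):
    from mlrun.datastore.sources import DataFrameSource
    from mlrun.datastore.targets import ParquetTarget

    source = DataFrameSource(df=quotes_df)
    return init_featureset_graph(
        source,
        featureset,
        namespace=globals(),  # used to resolve graph step classes by name
        targets=[ParquetTarget()],
        return_df=True,
        rows_limit=10_000,  # stop after ~10k rows when the source yields chunks
    )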