def check_results(self, flow, checker):
    """Check that the launch-time tags are visible on every client object.

    For each expected tag the namespace is switched to that tag, the run is
    re-fetched from its parent flow, and the run, its steps, their tasks and
    the tasks' data artifacts are all asserted to carry every expected tag.
    Tag filters on ``runs()``, ``steps()`` and ``tasks()`` are asserted to
    keep everything for known tags and nothing for an unknown tag.

    Returns immediately when ``checker.get_run()`` yields ``None``.
    """
    import os

    from metaflow import namespace

    run = checker.get_run()
    if run is None:
        # CliChecker does not return a run object, that's ok
        return

    flow_obj = run.parent

    # test crazy unicode and spaces in tags
    # these tags must be set with --tag option in contexts.json
    tags = (
        u"project:basic_tag",
        u"project_branch:user.tester",
        u"user:%s" % os.environ.get("METAFLOW_USER"),
        u"刺身 means sashimi",
        u"multiple tags should be ok",
    )
    every_tag_present = [True] * len(tags)

    def presence(obj):
        # One boolean per expected tag, in the order of `tags`.
        return [t in obj.tags for t in tags]

    def short_name(step):
        # Last component of the step's id.
        return step.id.rsplit("/", 1)[-1]

    for tag in tags:
        # test different namespaces: one is a system-tag,
        # another is a user tag
        namespace(tag)
        run = flow_obj[checker.run_id]

        # the flow object itself carries no tags
        assert_equals(frozenset(), frozenset(flow_obj.tags))
        # the run carries all of the expected tags
        assert_equals(every_tag_present, presence(run))

        # an unknown tag, alone or combined with a known one, matches no run
        assert_equals([], list(flow_obj.runs("not_a_tag")))
        assert_equals([], list(flow_obj.runs("not_a_tag", tag)))

        # filtering steps by one known tag, then by all known tags together,
        # must still return every step of the flow
        step_names = frozenset(step.name for step in flow)
        for tag_filter in ((tag,), tags):
            assert_equals(
                step_names,
                frozenset(short_name(step) for step in run.steps(*tag_filter)),
            )

        for step in run:
            # every step carries the tags
            assert_equals(every_tag_present, presence(step))
            # an unknown tag matches no task
            assert_equals([], list(step.tasks("not_a_tag")))
            # a known tag must not exclude any task
            assert_equals(
                [task.id for task in step],
                [task.id for task in step.tasks(tag)],
            )
            for task in step.tasks(tag):
                # every task carries the tags
                assert_equals(every_tag_present, presence(task))
                for data in task:
                    # every data artifact carries the tags
                    assert_equals(every_tag_present, presence(data))
def count_runs(flow_name=None):
    """Return run counts as ``{"data": [{"time": ..., "count": ...}, ...]}``.

    Args:
        flow_name: When ``None`` the counts come from ``MetaflowWrapper``,
            otherwise from ``FlowWrapper(flow_name=flow_name)``.

    The wrapper's ``get_count`` receives the categories produced by
    ``get_week_times()``, a callback that maps a run to its formatted
    creation time, and a callback that is true for runs created more than
    seven days before the moment it is called.
    """
    namespace(None)
    count_categories = get_week_times()

    def bucket_for(run):
        created = datetime.fromtimestamp(run.created_at)
        return get_formatted_time(created)

    def older_than_a_week(run):
        cutoff = datetime.now() - timedelta(days=7)
        created = datetime.fromtimestamp(run.created_at)
        return created.timestamp() < cutoff.timestamp()

    if flow_name is None:
        source = MetaflowWrapper()
    else:
        source = FlowWrapper(flow_name=flow_name)
    counts = source.get_count(count_categories, bucket_for, older_than_a_week)

    return {
        "data": [
            {"time": bucket, "count": total} for bucket, total in counts.items()
        ]
    }
def get_last_n(flow_name, n=5):
    """Return ``{"data": runs}`` for the flow's ``n`` most recent runs.

    Args:
        flow_name: Name passed to ``FlowWrapper``.
        n: Value forwarded to ``FlowWrapper.get_most_recent_runs``
            (defaults to 5).
    """
    # Clear the namespace filter before querying.
    namespace(None)
    recent_runs = FlowWrapper(flow_name).get_most_recent_runs(n)
    return {"data": recent_runs}
def route_flows():
    """Return ``{"data": flows}`` using ``MetaflowWrapper.get_formatted_flows``."""
    # Clear the namespace filter before querying.
    namespace(None)
    formatted_flows = MetaflowWrapper().get_formatted_flows()
    return {"data": formatted_flows}
def get_run_artifacts(flow_name=None, run_id=None):
    """Return ``{"data": output}`` with the output data of one run.

    Args:
        flow_name: Flow name used for the run lookup.
        run_id: Run id used for the run lookup.

    Raises:
        MetaflowException: ``(400, "Invalid parameters")`` when either
            argument is ``None``.
    """
    argument_missing = flow_name is None or run_id is None
    if argument_missing:
        raise MetaflowException(400, "Invalid parameters")

    # Clear the namespace filter before the lookup.
    namespace(None)
    located_run = RunWrapper.create_from_lookup(flow_name, run_id)
    output = located_run.get_run_output_data()
    return {"data": output}
def get_run_data(flow=None, run_id=None):
    """Return ``{"data": steps}`` with the formatted steps of one run.

    Args:
        flow: Flow name; first half of the ``flow/run_id`` pathspec.
        run_id: Run id; second half of the pathspec.

    Raises:
        MetaflowException: ``(400, "Invalid parameters")`` when either
            argument is ``None``.
    """
    argument_missing = flow is None or run_id is None
    if argument_missing:
        raise MetaflowException(400, "Invalid parameters")

    # Clear the namespace filter before building the wrapper.
    namespace(None)
    run_pathspec = f"{flow}/{run_id}"
    formatted_steps = RunWrapper(run_pathspec).get_formatted_steps()
    return {"data": formatted_steps}
def wrapper(*args):
    """Rebuild the wrapped object's ``_run`` when its ``cache`` flag is set, then call ``func``.

    Only ``args[0]`` is used; any further positional arguments are ignored
    and are not forwarded to ``func``.
    """
    # NOTE(review): `func` is a free variable, so this is presumably the inner
    # function of a decorator whose header is outside this view.
    run = args[0]
    if run.cache:
        # Clear the namespace filter before re-creating the client object.
        namespace(None)
        run._run = Run(run.pathspec)
        # NOTE(review): the flag is cleared after the rebuild, so it presumably
        # means "needs refresh" - confirm against the owning class.
        run.cache = False
    return func(run)
def resolve_task_from_pathspec(flow_name, pathspec):
    """
    Resolve the task object that a CLI PATHSPEC argument points at.

    Args:
        flow_name : (str) : name of flow
        pathspec (str) : one of `stepname` / `runid/stepname` /
            `runid/stepname/taskid`

    Returns:
        metaflow.Task : the resolved task.

    Raises:
        CommandException : the pathspec has more than three components.
        TaskNotFoundException : no task could be resolved for the pathspec.
    """
    from metaflow import Flow, Step, Task
    from metaflow.exception import MetaflowNotFound

    depth = len(pathspec.split("/"))
    if depth not in (1, 2, 3):
        # Anything deeper than runid/stepname/taskid is not a valid query.
        raise CommandException(
            msg="The PATHSPEC argument should be of the form 'stepname' Or '<runid>/<stepname>' Or '<runid>/<stepname>/<taskid>'"
        )

    task = None
    run_id = None

    if depth == 1:
        # Bare step name: look it up in the flow's latest run. The namespace
        # is left untouched on this path.
        resolving_from = "stepname"
        latest_run = Flow(flow_name).latest_run
        if latest_run is not None:
            run_id = latest_run.pathspec
            try:
                task = latest_run[pathspec].task
            except KeyError:
                pass
    elif depth == 2:
        # runid/stepname
        namespace(None)
        resolving_from = "step_pathspec"
        step_pathspec = "/".join([flow_name, pathspec])
        try:
            task = Step(step_pathspec).task
        except MetaflowNotFound:
            pass
    else:
        # runid/stepname/taskid
        namespace(None)
        resolving_from = "task_pathspec"
        task_pathspec = "/".join([flow_name, pathspec])
        try:
            task = Task(task_pathspec)
        except MetaflowNotFound:
            pass

    if task is None:
        # Nothing matched the query.
        raise TaskNotFoundException(pathspec, resolving_from, run_id=run_id)
    return task
def get_most_recent(flow_name):
    """Return ``{"data": summary}`` for the flow's last successful run.

    Args:
        flow_name: Name passed to ``FlowWrapper``.

    Returns:
        ``{"data": ...}`` holding the result of ``json_with_steps()`` on the
        run returned by ``get_last_successful_run()``, or ``{}`` when there
        is no such run or no summary.
    """
    # Clear the namespace filter before querying.
    namespace(None)
    flow = FlowWrapper(flow_name)

    # Check for a missing run *before* calling a method on it. Previously
    # the None test ran only after `.json_with_steps()` had been invoked,
    # so a missing run raised AttributeError instead of returning {}.
    last_successful = flow.get_last_successful_run()
    if last_successful is None:
        return {}

    last_run = last_successful.json_with_steps()
    if last_run is None:
        return {}
    return {"data": last_run}
def _get_client_run_obj(obj, run_id, user_namespace):
    """Return the client ``Run`` for ``run_id`` of the flow in ``obj``.

    Client-side lookup failures are translated into ``CommandException``
    with a user-facing message.

    Args:
        obj: CLI state object; only ``obj.flow.name`` is read.
        run_id: Bare run id (no ``/``), or ``None``.
        user_namespace: Namespace to look in; passed to ``namespace()``.

    Raises:
        CommandException: the flow or run is not found, is outside the
            namespace, ``run_id`` is ``None`` (the message then names the
            latest run id), or ``run_id`` contains a ``/``.
    """
    flow_name = obj.flow.name

    # First make sure the flow itself is reachable. Two user mistakes are
    # reported here:
    # 1. tagging a flow that has never been run
    # 2. a typo in --namespace
    try:
        namespace(user_namespace)
        Flow(pathspec=flow_name)
    except MetaflowNotFound:
        not_run_msg = (
            "No run found for *%s*. Please run the flow before tagging." % flow_name
        )
        raise CommandException(not_run_msg)
    except MetaflowNamespaceMismatch:
        wrong_ns_msg = (
            "No run found for *%s* in namespace *%s*. You can switch the namespace using --namespace"
            % (flow_name, user_namespace)
        )
        raise CommandException(wrong_ns_msg)

    # Without a run id we can only point the user at the latest run.
    if run_id is None:
        latest_run_id = Flow(pathspec=flow_name).latest_run.id
        raise CommandException(
            "Please specify a run-id using --run-id.\n"
            "*%s*'s latest run in namespace *%s* has id *%s*."
            % (flow_name, user_namespace, latest_run_id)
        )

    # A run id is a single path component.
    if len(run_id.split("/")) != 1:
        raise CommandException("Run-id *%s* is not a valid run-id" % run_id)
    path_spec = "%s/%s" % (flow_name, run_id)

    # Finally fetch the run. Two user mistakes are reported here:
    # 1. a typo in --run-id
    # 2. a --run-id that does not exist in the default/specified namespace
    try:
        namespace(user_namespace)
        run = Run(pathspec=path_spec)
    except MetaflowNotFound:
        raise CommandException(
            "No run *%s* found for flow *%s*" % (path_spec, flow_name)
        )
    except MetaflowNamespaceMismatch:
        mismatch_msg = "Run *%s* for flow *%s* does not belong to namespace *%s*\n" % (
            path_spec,
            flow_name,
            user_namespace,
        )
        raise CommandException(mismatch_msg)
    return run
def get_cards(task, type=None, follow_resumed=True):
    """
    Get cards related to a Metaflow `Task`

    Args:
        task (str or `Task`): A metaflow `Task` object or pathspec (flowname/runid/stepname/taskid)
        type (str, optional): The type of card to retrieve. Defaults to None.
        follow_resumed (bool, optional): If a Task has been resumed and cloned, then setting this flag will resolve the card for the origin task. Defaults to True.

    Returns:
        `CardContainer` : A `list` like object that holds `Card` objects.

    Raises:
        IncorrectPathspecException: `task` is a string that does not have
            exactly four `/`-separated components.
        IncorrectArguementException: `task` is neither a `str` nor a `Task`.
    """
    from metaflow.client import Task
    from metaflow import namespace

    if isinstance(task, str):
        task_str = task
        if len(task_str.split("/")) != 4:
            # Exception that pathspec is not of correct form
            raise IncorrectPathspecException(task_str)
        # set namespace as None so that we don't face namespace mismatch error.
        namespace(None)
        task = Task(task_str)
    elif not isinstance(task, Task):
        # Exception that the task argument should of form `Task` or `str`
        raise IncorrectArguementException(_TYPE(task))

    # Must be bound on every path: it is read unconditionally when the
    # container is built below. Previously it was only assigned inside the
    # `follow_resumed` branch, so `follow_resumed=False` raised NameError.
    origin_taskpathspec = None
    if follow_resumed:
        origin_taskpathspec = resumed_info(task)
        if origin_taskpathspec:
            # Serve the cards of the task this one was resumed from.
            task = Task(origin_taskpathspec)

    card_paths, card_ds = resolve_paths_from_task(
        _get_flow_datastore(task),
        pathspec=task.pathspec,
        type=type,
    )
    return CardContainer(
        card_paths,
        card_ds,
        from_resumed=origin_taskpathspec is not None,
        origin_pathspec=origin_taskpathspec,
    )
def test_init_options(s3root, pathspecs, expected):
    """Exercise the different ways of constructing an ``S3`` client.

    Five construction modes are covered in order: ``s3root`` as a prefix, a
    full URL as ``s3root``, no root with full URLs, a flow object combined
    with the ``current`` environment, and (only when ``DO_TEST_RUN`` is set)
    a client ``Run`` object.

    ``pathspecs`` must contain exactly one ``flow_name/run_id`` string and
    ``expected`` maps full URLs to the values ``assert_results`` checks
    against.
    """
    # Exactly one pathspec is expected; unpacking fails otherwise.
    [pathspec] = pathspecs
    flow_name, run_id = pathspec.split("/")
    # Length of the root, used to strip it off full URLs below.
    plen = len(s3root)

    # option 1) s3root as prefix
    with S3(s3root=s3root) as s3:
        for url, exp in expected.items():
            # s3root should work as a prefix
            s3obj = s3.get(url[plen:])
            assert s3obj.key == url[plen:]
            assert_results([s3obj], {url: exp})
        # this s3:// URL is rejected while an s3root is set
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://some/fake/address")

    # option 2) full url as s3root
    for url, exp in expected.items():
        with S3(s3root=url) as s3:
            # with the whole URL as root, get() takes no key
            s3obj = s3.get()
            assert_results([s3obj], {url: exp})

    # option 3) full urls
    with S3() as s3:
        for url, exp in expected.items():
            # without an s3root the full URL is the key
            s3obj = s3.get(url)
            assert s3obj.key == url
            assert_results([s3obj], {url: exp})
        # without an s3root, relative keys and root-dependent calls must fail
        with pytest.raises(MetaflowS3URLException):
            s3.get("suffix")
        with pytest.raises(MetaflowS3URLException):
            s3.get("s3://nopath")
        with pytest.raises(MetaflowS3URLException):
            s3.get_many(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_recursive(["suffixes"])
        with pytest.raises(MetaflowS3URLException):
            s3.get_all()

    # option 4) 'current' environment (fake a running flow)
    flow = FakeFlow(use_cli=False)

    parsed = urlparse(s3root)
    with pytest.raises(MetaflowS3URLException):
        # current not set yet, so this should fail
        with S3(run=flow):
            pass

    # Populate the global `current` object so S3(run=flow) can be created.
    current._set_env(
        FakeFlow(name=flow_name),
        run_id,
        "no_step",
        "no_task",
        "no_origin_run_id",
        "no_ns",
        "no_user",
    )

    with S3(bucket=parsed.netloc, prefix=parsed.path, run=flow) as s3:
        for url, exp in expected.items():
            # only the last path component is used as the key here
            name = url.split("/")[-1]
            s3obj = s3.get(name)
            assert s3obj.key == name
            assert_results([s3obj], {url: exp})
        names = [url.split("/")[-1] for url in expected]
        s3objs = s3.get_many(names)
        assert {e.key for e in s3objs} == set(names)
        assert_results(s3objs, expected)
        assert_results(s3.get_all(), expected, info_should_be_empty=True)

    # option 5) run object
    if DO_TEST_RUN:
        # Only works if a metadata service exists with the run in question.
        namespace(None)
        with S3(bucket=parsed.netloc, prefix=parsed.path, run=Run(pathspec)) as s3:
            names = [url.split("/")[-1] for url in expected]
            assert_results(s3.get_many(names), expected)
# Smoke check: the Metaflow client must use the service metadata provider
# and must be able to see at least one flow.
import metaflow as mf

# Clear the namespace filter before listing flows.
mf.namespace(None)
m = mf.Metaflow()
# On failure the assertion message shows the provider actually in use.
assert m.metadata == mf.plugins.metadata.service.ServiceMetadataProvider, m.metadata
# At least one flow must be visible; on failure the flow list is shown.
assert len(m.flows) > 0, m.flows
def tag_list(
    obj,
    run_id,
    hide_system_tags,
    list_all,
    my_runs,
    group_by_tag,
    group_by_run,
    flat,
    arg_run_id,
):
    """List the tags of one run, or of the runs of the flow in ``obj``.

    Tags are collected into two dicts, one for system tags and one for all
    tags. They are keyed by run pathspec (``group_by_run``), by tag
    (``group_by_tag``), or by a single comma-joined string of all visited
    pathspecs (no grouping). With ``flat`` every tag of the single group is
    echoed on its own; otherwise the dicts are handed to
    ``_print_tags_for_runs_by_groups``.

    Raises:
        CommandException: mutually exclusive options were combined, or the
            flow is not found when listing its runs.
        MetaflowInternalError: ``flat`` was requested but the collected tags
            do not form exactly one group.
    """
    _set_current(obj)
    if run_id is None and arg_run_id is None and not list_all and not my_runs:
        # Assume list_all by default
        list_all = True

    if list_all and my_runs:
        raise CommandException(
            "Option --all cannot be used together with --my-runs.")

    if run_id is not None and arg_run_id is not None:
        raise CommandException(
            "Specify a run either using --run-id or as an argument but not both"
        )

    # The positional run id stands in for --run-id from here on.
    if arg_run_id is not None:
        run_id = arg_run_id

    if group_by_run and group_by_tag:
        raise CommandException(
            "Option --group-by-tag cannot be used with --group-by-run")

    if flat and (group_by_run or group_by_tag):
        raise CommandException(
            "Option --flat cannot be used with any --group-by-* option")

    # Both dicts are filled by the closure below; the meaning of their keys
    # depends on the grouping mode.
    system_tags_by_some_grouping = dict()
    all_tags_by_some_grouping = dict()

    def _populate_tag_groups_from_run(_run):
        # Record the tags of one run under the key for the active grouping.
        if group_by_run:
            # key: run pathspec -> that run's tags
            if hide_system_tags:
                all_tags_by_some_grouping[
                    _run.pathspec] = _run.tags - _run.system_tags
            else:
                system_tags_by_some_grouping[_run.pathspec] = _run.system_tags
                all_tags_by_some_grouping[_run.pathspec] = _run.tags
        elif group_by_tag:
            # key: tag -> list of pathspecs of the runs carrying it
            for t in _run.tags - _run.system_tags:
                all_tags_by_some_grouping.setdefault(t, []).append(_run.pathspec)
            if not hide_system_tags:
                for t in _run.system_tags:
                    system_tags_by_some_grouping.setdefault(t, []).append(
                        _run.pathspec)
        else:
            # No grouping: accumulate under the placeholder key "_", which is
            # replaced by the joined pathspecs once all runs are visited.
            if hide_system_tags:
                all_tags_by_some_grouping.setdefault("_", set()).update(
                    _run.tags.difference(_run.system_tags))
            else:
                system_tags_by_some_grouping.setdefault("_", set()).update(
                    _run.system_tags)
                all_tags_by_some_grouping.setdefault("_", set()).update(_run.tags)

    pathspecs = []
    if list_all or my_runs:
        # --my-runs restricts the namespace to the resolved identity; --all
        # clears the namespace filter.
        user_namespace = resolve_identity() if my_runs else None
        namespace(user_namespace)
        try:
            flow = Flow(pathspec=obj.flow.name)
        except MetaflowNotFound:
            raise CommandException(
                "Cannot list tags because the flow %s has never been run."
                % (obj.flow.name, ))
        for run in flow.runs():
            _populate_tag_groups_from_run(run)
            pathspecs.append(run.pathspec)
    else:
        # Single run; it is looked up with a None namespace.
        run = _get_client_run_obj(obj, run_id, None)
        _populate_tag_groups_from_run(run)
        pathspecs.append(run.pathspec)

    if not group_by_run and not group_by_tag:
        # We list all the runs that match to print them out if needed.
        system_tags_by_some_grouping[",".join(
            pathspecs)] = system_tags_by_some_grouping.get("_", set())
        all_tags_by_some_grouping[",".join(
            pathspecs)] = all_tags_by_some_grouping.get("_", set())
        # Drop the placeholder now that the real key is in place.
        if "_" in system_tags_by_some_grouping:
            del system_tags_by_some_grouping["_"]
        if "_" in all_tags_by_some_grouping:
            del all_tags_by_some_grouping["_"]

    if flat:
        # Flat output needs exactly one group of tags.
        if len(all_tags_by_some_grouping) != 1:
            raise MetaflowInternalError("Failed to flatten tag set")
        for v in all_tags_by_some_grouping.values():
            for tag in v:
                obj.echo(tag)
            # NOTE(review): the source layout is collapsed; this bare return
            # is read as ending the flat path after its only group - confirm.
            return

    _print_tags_for_runs_by_groups(obj, system_tags_by_some_grouping,
                                   all_tags_by_some_grouping, group_by_tag)
def __init__(self):
    """Initialise the parent class and store a ``Metaflow`` client on ``self._metaflow``."""
    super().__init__()
    # Clear the namespace filter before the client object is created.
    namespace(None)
    client = Metaflow()
    self._metaflow = client
import hashlib import json from .client import CacheAction from .utils import (cacheable_artifact_value, cacheable_exception_value, progress_event_msg, artifact_cache_id, unpack_pathspec_with_attempt_id, streamed_errors) from services.ui_backend_service.data import unpack_processed_value from services.ui_backend_service.api.utils import operators_to_filters from metaflow import DataArtifact, namespace namespace(None) # Always use global namespace by default class SearchArtifacts(CacheAction): """ Fetches artifacts by pathspecs and performs a search against the object contents. Caches artifacts based on pathspec, and search results based on a combination of query&artifacts searched Parameters ---------- pathspecs : List[str] A list of artifact pathspecs (with attempt id as last component) to fetch and match the search term against: ["FlowId/RunNumber/StepName/TaskId/ArtifactName/0"] searchterm : str A searchterm to match against the fetched S3 artifacts contents. Returns ------- Dict or None example:
from metaflow import Flow, Metaflow, namespace import boto3 # EVENTS_RECORD_STORE = os.environ['EVENTS_RECORD_STORE'] dynamodb = boto3.resource('dynamodb') def parse_tags(self, tags, key_string): for tag in tags: if key_string in tag: return tag.split(":")[-1] return None data = [] namespace(None) for flow in Metaflow().flows: for run in flow.runs(): for step in run.steps(): print(step) run_id = run.path_components[1] flow_name = run.path_components[0] # time to complete # steps # artifact location? data.append({
def resolve_card(
    ctx,
    pathspec,
    follow_resumed=True,
    hash=None,
    type=None,
):
    """Resolve the stored card paths for a task of the current flow.

    Args:
        ctx: click context object; ``ctx.obj.flow.name``, ``ctx.obj.echo``
            and ``ctx.obj.flow_datastore`` are used.
        pathspec: task pathspec of the form ``<runid>/<stepname>/<taskid>``
            (without the flow name).
        follow_resumed (optional): when true and ``resumed_info`` reports an
            origin task, cards are resolved for that origin pathspec instead.
        hash (optional): forwarded to ``resolve_paths_from_task`` to select
            a card by hash.
        type (optional): forwarded to ``resolve_paths_from_task`` to select
            cards by type.

    Raises:
        CommandException: ``pathspec`` does not have exactly three components.
        CardNotPresentException: no card path was found.

    Returns:
        (card_paths, card_datastore, taskpathspec) : Tuple[List[str], CardDatastore, str]
    """
    components = pathspec.split("/")
    if len(components) != 3:
        raise CommandException(
            msg="Expecting pathspec of form <runid>/<stepname>/<taskid>"
        )

    flow_name = ctx.obj.flow.name
    run_id, step_name, _task_id = components
    pathspec = "/".join([flow_name, pathspec])

    # The namespace is cleared to avoid a namespace mismatch error.
    namespace(None)
    task = Task(pathspec)
    print_str = "Resolving card: %s" % pathspec

    # The origin task is only looked up when the caller asked for it.
    origin_taskpathspec = resumed_info(task) if follow_resumed else None
    if origin_taskpathspec:
        pathspec = origin_taskpathspec
        ctx.obj.echo(
            "Resolving card resumed from: %s" % origin_taskpathspec,
            fg="green",
        )
    else:
        ctx.obj.echo(print_str, fg="green")

    card_paths_found, card_datastore = resolve_paths_from_task(
        ctx.obj.flow_datastore,
        pathspec=pathspec,
        type=type,
        hash=hash,
    )

    if not len(card_paths_found):
        # Nothing is stored for this pathspec.
        raise CardNotPresentException(
            flow_name,
            run_id,
            step_name,
            card_hash=hash,
            card_type=type,
        )

    return card_paths_found, card_datastore, pathspec
def lambda_handler(event, context):
    """Record the state of a Metaflow run in DynamoDB for each S3 event record.

    For every entry of ``event['Records']`` the object key is split on ``/``
    into ``flow/run_id/step/...``. A Metaflow config pointing at the record's
    bucket is written to ``/tmp/config.json``, the run is loaded through the
    Metaflow client, and a summary item is put into the
    ``EVENTS_RECORD_STORE`` table.

    Args:
        event: Event dict; each record must provide
            ``['s3']['object']['key']`` and ``['s3']['bucket']['name']``.
        context: Unused.

    Returns:
        None.

    Raises:
        IndexError: an object key has fewer than three ``/``-separated parts.
    """
    # NOTE(review): the source layout was collapsed; the loop is read as
    # covering the whole per-record workflow with the return after it -
    # confirm against the deployed handler.

    def _epoch_seconds(stamp):
        # Timestamps look like '%Y-%m-%dT%H:%M:%S' optionally followed by
        # '.<fraction>'; the fraction is dropped before parsing.
        parsed = datetime.strptime(stamp.split(".")[0], '%Y-%m-%dT%H:%M:%S')
        return int(parsed.timestamp())

    print(event)
    for record in event['Records']:
        key = record['s3']['object']['key']
        bucket_name = record['s3']['bucket']['name']

        os.environ['METAFLOW_HOME'] = '/tmp'
        os.environ['USERNAME'] = "******"
        # NOTE(review): SECURITY - the service auth key and URL are hard-coded
        # in source. They are kept as-is to preserve behaviour, but should be
        # rotated and loaded from the environment or a secrets manager.
        obj = {
            'METAFLOW_DEFAULT_METADATA': 'service',
            'METAFLOW_DEFAULT_DATASTORE': 's3',
            'METAFLOW_DATASTORE_SYSROOT_S3': f"s3://{bucket_name}",
            'METAFLOW_SERVICE_AUTH_KEY':
            "yvhNDfEzcRa5fxKq2ZELda1zk8wNXxMs17Jt4OGs",
            'METAFLOW_SERVICE_URL':
            "https://5sqcgnuyte.execute-api.eu-west-1.amazonaws.com/api/"
        }
        with open('/tmp/config.json', 'w', encoding='utf-8') as f:
            json.dump(obj, f, ensure_ascii=False, indent=4)

        # Imported only after the config file and environment are in place.
        from metaflow import Run, get_metadata, namespace
        namespace(None)
        print(get_metadata())

        # Key layout: <flow>/<run_id>/<step>/...; split once, index as before.
        key_parts = key.split("/")
        step = key_parts[2]
        flow = key_parts[0]
        run_id = key_parts[1]
        run = Run(f"{flow}/{run_id}")

        # Identity test instead of `== None`, which can be overridden by a
        # custom __eq__.
        if run.finished_at is None:
            finished_at = 0
        else:
            finished_at = _epoch_seconds(run.finished_at)

        dynamo_object = {
            "created_at": _epoch_seconds(run.created_at),
            "flow_name": flow,
            "run_id": int(run_id),
            "success": run.successful,
            "finished": run.finished,
            "finished_at": finished_at,
            "current_step": step,
            "user": _parse_tags(run.tags, "user"),
            "tags": run.tags,
            "bucket": bucket_name
        }
        print(dynamo_object)
        table = dynamodb.Table(EVENTS_RECORD_STORE)
        table.put_item(Item=dynamo_object)
    return